18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-or-later 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * PCI Error Recovery Driver for RPA-compliant PPC64 platform. 48c2ecf20Sopenharmony_ci * Copyright IBM Corp. 2004 2005 58c2ecf20Sopenharmony_ci * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 68c2ecf20Sopenharmony_ci * 78c2ecf20Sopenharmony_ci * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> 88c2ecf20Sopenharmony_ci */ 98c2ecf20Sopenharmony_ci#include <linux/delay.h> 108c2ecf20Sopenharmony_ci#include <linux/interrupt.h> 118c2ecf20Sopenharmony_ci#include <linux/irq.h> 128c2ecf20Sopenharmony_ci#include <linux/module.h> 138c2ecf20Sopenharmony_ci#include <linux/pci.h> 148c2ecf20Sopenharmony_ci#include <linux/pci_hotplug.h> 158c2ecf20Sopenharmony_ci#include <asm/eeh.h> 168c2ecf20Sopenharmony_ci#include <asm/eeh_event.h> 178c2ecf20Sopenharmony_ci#include <asm/ppc-pci.h> 188c2ecf20Sopenharmony_ci#include <asm/pci-bridge.h> 198c2ecf20Sopenharmony_ci#include <asm/prom.h> 208c2ecf20Sopenharmony_ci#include <asm/rtas.h> 218c2ecf20Sopenharmony_ci 228c2ecf20Sopenharmony_cistruct eeh_rmv_data { 238c2ecf20Sopenharmony_ci struct list_head removed_vf_list; 248c2ecf20Sopenharmony_ci int removed_dev_count; 258c2ecf20Sopenharmony_ci}; 268c2ecf20Sopenharmony_ci 278c2ecf20Sopenharmony_cistatic int eeh_result_priority(enum pci_ers_result result) 288c2ecf20Sopenharmony_ci{ 298c2ecf20Sopenharmony_ci switch (result) { 308c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_NONE: 318c2ecf20Sopenharmony_ci return 1; 328c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_NO_AER_DRIVER: 338c2ecf20Sopenharmony_ci return 2; 348c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_RECOVERED: 358c2ecf20Sopenharmony_ci return 3; 368c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_CAN_RECOVER: 378c2ecf20Sopenharmony_ci return 4; 388c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_DISCONNECT: 398c2ecf20Sopenharmony_ci return 5; 408c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_NEED_RESET: 
418c2ecf20Sopenharmony_ci return 6; 428c2ecf20Sopenharmony_ci default: 438c2ecf20Sopenharmony_ci WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); 448c2ecf20Sopenharmony_ci return 0; 458c2ecf20Sopenharmony_ci } 468c2ecf20Sopenharmony_ci}; 478c2ecf20Sopenharmony_ci 488c2ecf20Sopenharmony_cistatic const char *pci_ers_result_name(enum pci_ers_result result) 498c2ecf20Sopenharmony_ci{ 508c2ecf20Sopenharmony_ci switch (result) { 518c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_NONE: 528c2ecf20Sopenharmony_ci return "none"; 538c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_CAN_RECOVER: 548c2ecf20Sopenharmony_ci return "can recover"; 558c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_NEED_RESET: 568c2ecf20Sopenharmony_ci return "need reset"; 578c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_DISCONNECT: 588c2ecf20Sopenharmony_ci return "disconnect"; 598c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_RECOVERED: 608c2ecf20Sopenharmony_ci return "recovered"; 618c2ecf20Sopenharmony_ci case PCI_ERS_RESULT_NO_AER_DRIVER: 628c2ecf20Sopenharmony_ci return "no AER driver"; 638c2ecf20Sopenharmony_ci default: 648c2ecf20Sopenharmony_ci WARN_ONCE(1, "Unknown result type: %d\n", (int)result); 658c2ecf20Sopenharmony_ci return "unknown"; 668c2ecf20Sopenharmony_ci } 678c2ecf20Sopenharmony_ci}; 688c2ecf20Sopenharmony_ci 698c2ecf20Sopenharmony_cistatic enum pci_ers_result pci_ers_merge_result(enum pci_ers_result old, 708c2ecf20Sopenharmony_ci enum pci_ers_result new) 718c2ecf20Sopenharmony_ci{ 728c2ecf20Sopenharmony_ci if (eeh_result_priority(new) > eeh_result_priority(old)) 738c2ecf20Sopenharmony_ci return new; 748c2ecf20Sopenharmony_ci return old; 758c2ecf20Sopenharmony_ci} 768c2ecf20Sopenharmony_ci 778c2ecf20Sopenharmony_cistatic bool eeh_dev_removed(struct eeh_dev *edev) 788c2ecf20Sopenharmony_ci{ 798c2ecf20Sopenharmony_ci return !edev || (edev->mode & EEH_DEV_REMOVED); 808c2ecf20Sopenharmony_ci} 818c2ecf20Sopenharmony_ci 828c2ecf20Sopenharmony_cistatic bool eeh_edev_actionable(struct eeh_dev 
*edev) 838c2ecf20Sopenharmony_ci{ 848c2ecf20Sopenharmony_ci if (!edev->pdev) 858c2ecf20Sopenharmony_ci return false; 868c2ecf20Sopenharmony_ci if (edev->pdev->error_state == pci_channel_io_perm_failure) 878c2ecf20Sopenharmony_ci return false; 888c2ecf20Sopenharmony_ci if (eeh_dev_removed(edev)) 898c2ecf20Sopenharmony_ci return false; 908c2ecf20Sopenharmony_ci if (eeh_pe_passed(edev->pe)) 918c2ecf20Sopenharmony_ci return false; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci return true; 948c2ecf20Sopenharmony_ci} 958c2ecf20Sopenharmony_ci 968c2ecf20Sopenharmony_ci/** 978c2ecf20Sopenharmony_ci * eeh_pcid_get - Get the PCI device driver 988c2ecf20Sopenharmony_ci * @pdev: PCI device 998c2ecf20Sopenharmony_ci * 1008c2ecf20Sopenharmony_ci * The function is used to retrieve the PCI device driver for 1018c2ecf20Sopenharmony_ci * the indicated PCI device. Besides, we will increase the reference 1028c2ecf20Sopenharmony_ci * of the PCI device driver to prevent that being unloaded on 1038c2ecf20Sopenharmony_ci * the fly. Otherwise, kernel crash would be seen. 1048c2ecf20Sopenharmony_ci */ 1058c2ecf20Sopenharmony_cistatic inline struct pci_driver *eeh_pcid_get(struct pci_dev *pdev) 1068c2ecf20Sopenharmony_ci{ 1078c2ecf20Sopenharmony_ci if (!pdev || !pdev->driver) 1088c2ecf20Sopenharmony_ci return NULL; 1098c2ecf20Sopenharmony_ci 1108c2ecf20Sopenharmony_ci if (!try_module_get(pdev->driver->driver.owner)) 1118c2ecf20Sopenharmony_ci return NULL; 1128c2ecf20Sopenharmony_ci 1138c2ecf20Sopenharmony_ci return pdev->driver; 1148c2ecf20Sopenharmony_ci} 1158c2ecf20Sopenharmony_ci 1168c2ecf20Sopenharmony_ci/** 1178c2ecf20Sopenharmony_ci * eeh_pcid_put - Dereference on the PCI device driver 1188c2ecf20Sopenharmony_ci * @pdev: PCI device 1198c2ecf20Sopenharmony_ci * 1208c2ecf20Sopenharmony_ci * The function is called to do dereference on the PCI device 1218c2ecf20Sopenharmony_ci * driver of the indicated PCI device. 
1228c2ecf20Sopenharmony_ci */ 1238c2ecf20Sopenharmony_cistatic inline void eeh_pcid_put(struct pci_dev *pdev) 1248c2ecf20Sopenharmony_ci{ 1258c2ecf20Sopenharmony_ci if (!pdev || !pdev->driver) 1268c2ecf20Sopenharmony_ci return; 1278c2ecf20Sopenharmony_ci 1288c2ecf20Sopenharmony_ci module_put(pdev->driver->driver.owner); 1298c2ecf20Sopenharmony_ci} 1308c2ecf20Sopenharmony_ci 1318c2ecf20Sopenharmony_ci/** 1328c2ecf20Sopenharmony_ci * eeh_disable_irq - Disable interrupt for the recovering device 1338c2ecf20Sopenharmony_ci * @dev: PCI device 1348c2ecf20Sopenharmony_ci * 1358c2ecf20Sopenharmony_ci * This routine must be called when reporting temporary or permanent 1368c2ecf20Sopenharmony_ci * error to the particular PCI device to disable interrupt of that 1378c2ecf20Sopenharmony_ci * device. If the device has enabled MSI or MSI-X interrupt, we needn't 1388c2ecf20Sopenharmony_ci * do real work because EEH should freeze DMA transfers for those PCI 1398c2ecf20Sopenharmony_ci * devices encountering EEH errors, which includes MSI or MSI-X. 1408c2ecf20Sopenharmony_ci */ 1418c2ecf20Sopenharmony_cistatic void eeh_disable_irq(struct eeh_dev *edev) 1428c2ecf20Sopenharmony_ci{ 1438c2ecf20Sopenharmony_ci /* Don't disable MSI and MSI-X interrupts. They are 1448c2ecf20Sopenharmony_ci * effectively disabled by the DMA Stopped state 1458c2ecf20Sopenharmony_ci * when an EEH error occurs. 
1468c2ecf20Sopenharmony_ci */ 1478c2ecf20Sopenharmony_ci if (edev->pdev->msi_enabled || edev->pdev->msix_enabled) 1488c2ecf20Sopenharmony_ci return; 1498c2ecf20Sopenharmony_ci 1508c2ecf20Sopenharmony_ci if (!irq_has_action(edev->pdev->irq)) 1518c2ecf20Sopenharmony_ci return; 1528c2ecf20Sopenharmony_ci 1538c2ecf20Sopenharmony_ci edev->mode |= EEH_DEV_IRQ_DISABLED; 1548c2ecf20Sopenharmony_ci disable_irq_nosync(edev->pdev->irq); 1558c2ecf20Sopenharmony_ci} 1568c2ecf20Sopenharmony_ci 1578c2ecf20Sopenharmony_ci/** 1588c2ecf20Sopenharmony_ci * eeh_enable_irq - Enable interrupt for the recovering device 1598c2ecf20Sopenharmony_ci * @dev: PCI device 1608c2ecf20Sopenharmony_ci * 1618c2ecf20Sopenharmony_ci * This routine must be called to enable interrupt while failed 1628c2ecf20Sopenharmony_ci * device could be resumed. 1638c2ecf20Sopenharmony_ci */ 1648c2ecf20Sopenharmony_cistatic void eeh_enable_irq(struct eeh_dev *edev) 1658c2ecf20Sopenharmony_ci{ 1668c2ecf20Sopenharmony_ci if ((edev->mode) & EEH_DEV_IRQ_DISABLED) { 1678c2ecf20Sopenharmony_ci edev->mode &= ~EEH_DEV_IRQ_DISABLED; 1688c2ecf20Sopenharmony_ci /* 1698c2ecf20Sopenharmony_ci * FIXME !!!!! 1708c2ecf20Sopenharmony_ci * 1718c2ecf20Sopenharmony_ci * This is just ass backwards. This maze has 1728c2ecf20Sopenharmony_ci * unbalanced irq_enable/disable calls. So instead of 1738c2ecf20Sopenharmony_ci * finding the root cause it works around the warning 1748c2ecf20Sopenharmony_ci * in the irq_enable code by conditionally calling 1758c2ecf20Sopenharmony_ci * into it. 1768c2ecf20Sopenharmony_ci * 1778c2ecf20Sopenharmony_ci * That's just wrong.The warning in the core code is 1788c2ecf20Sopenharmony_ci * there to tell people to fix their asymmetries in 1798c2ecf20Sopenharmony_ci * their own code, not by abusing the core information 1808c2ecf20Sopenharmony_ci * to avoid it. 
1818c2ecf20Sopenharmony_ci * 1828c2ecf20Sopenharmony_ci * I so wish that the assymetry would be the other way 1838c2ecf20Sopenharmony_ci * round and a few more irq_disable calls render that 1848c2ecf20Sopenharmony_ci * shit unusable forever. 1858c2ecf20Sopenharmony_ci * 1868c2ecf20Sopenharmony_ci * tglx 1878c2ecf20Sopenharmony_ci */ 1888c2ecf20Sopenharmony_ci if (irqd_irq_disabled(irq_get_irq_data(edev->pdev->irq))) 1898c2ecf20Sopenharmony_ci enable_irq(edev->pdev->irq); 1908c2ecf20Sopenharmony_ci } 1918c2ecf20Sopenharmony_ci} 1928c2ecf20Sopenharmony_ci 1938c2ecf20Sopenharmony_cistatic void eeh_dev_save_state(struct eeh_dev *edev, void *userdata) 1948c2ecf20Sopenharmony_ci{ 1958c2ecf20Sopenharmony_ci struct pci_dev *pdev; 1968c2ecf20Sopenharmony_ci 1978c2ecf20Sopenharmony_ci if (!edev) 1988c2ecf20Sopenharmony_ci return; 1998c2ecf20Sopenharmony_ci 2008c2ecf20Sopenharmony_ci /* 2018c2ecf20Sopenharmony_ci * We cannot access the config space on some adapters. 2028c2ecf20Sopenharmony_ci * Otherwise, it will cause fenced PHB. We don't save 2038c2ecf20Sopenharmony_ci * the content in their config space and will restore 2048c2ecf20Sopenharmony_ci * from the initial config space saved when the EEH 2058c2ecf20Sopenharmony_ci * device is created. 
2068c2ecf20Sopenharmony_ci */ 2078c2ecf20Sopenharmony_ci if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) 2088c2ecf20Sopenharmony_ci return; 2098c2ecf20Sopenharmony_ci 2108c2ecf20Sopenharmony_ci pdev = eeh_dev_to_pci_dev(edev); 2118c2ecf20Sopenharmony_ci if (!pdev) 2128c2ecf20Sopenharmony_ci return; 2138c2ecf20Sopenharmony_ci 2148c2ecf20Sopenharmony_ci pci_save_state(pdev); 2158c2ecf20Sopenharmony_ci} 2168c2ecf20Sopenharmony_ci 2178c2ecf20Sopenharmony_cistatic void eeh_set_channel_state(struct eeh_pe *root, pci_channel_state_t s) 2188c2ecf20Sopenharmony_ci{ 2198c2ecf20Sopenharmony_ci struct eeh_pe *pe; 2208c2ecf20Sopenharmony_ci struct eeh_dev *edev, *tmp; 2218c2ecf20Sopenharmony_ci 2228c2ecf20Sopenharmony_ci eeh_for_each_pe(root, pe) 2238c2ecf20Sopenharmony_ci eeh_pe_for_each_dev(pe, edev, tmp) 2248c2ecf20Sopenharmony_ci if (eeh_edev_actionable(edev)) 2258c2ecf20Sopenharmony_ci edev->pdev->error_state = s; 2268c2ecf20Sopenharmony_ci} 2278c2ecf20Sopenharmony_ci 2288c2ecf20Sopenharmony_cistatic void eeh_set_irq_state(struct eeh_pe *root, bool enable) 2298c2ecf20Sopenharmony_ci{ 2308c2ecf20Sopenharmony_ci struct eeh_pe *pe; 2318c2ecf20Sopenharmony_ci struct eeh_dev *edev, *tmp; 2328c2ecf20Sopenharmony_ci 2338c2ecf20Sopenharmony_ci eeh_for_each_pe(root, pe) { 2348c2ecf20Sopenharmony_ci eeh_pe_for_each_dev(pe, edev, tmp) { 2358c2ecf20Sopenharmony_ci if (!eeh_edev_actionable(edev)) 2368c2ecf20Sopenharmony_ci continue; 2378c2ecf20Sopenharmony_ci 2388c2ecf20Sopenharmony_ci if (!eeh_pcid_get(edev->pdev)) 2398c2ecf20Sopenharmony_ci continue; 2408c2ecf20Sopenharmony_ci 2418c2ecf20Sopenharmony_ci if (enable) 2428c2ecf20Sopenharmony_ci eeh_enable_irq(edev); 2438c2ecf20Sopenharmony_ci else 2448c2ecf20Sopenharmony_ci eeh_disable_irq(edev); 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci eeh_pcid_put(edev->pdev); 2478c2ecf20Sopenharmony_ci } 2488c2ecf20Sopenharmony_ci } 2498c2ecf20Sopenharmony_ci} 2508c2ecf20Sopenharmony_ci 2518c2ecf20Sopenharmony_citypedef enum 
pci_ers_result (*eeh_report_fn)(struct eeh_dev *, 2528c2ecf20Sopenharmony_ci struct pci_dev *, 2538c2ecf20Sopenharmony_ci struct pci_driver *); 2548c2ecf20Sopenharmony_cistatic void eeh_pe_report_edev(struct eeh_dev *edev, eeh_report_fn fn, 2558c2ecf20Sopenharmony_ci enum pci_ers_result *result) 2568c2ecf20Sopenharmony_ci{ 2578c2ecf20Sopenharmony_ci struct pci_dev *pdev; 2588c2ecf20Sopenharmony_ci struct pci_driver *driver; 2598c2ecf20Sopenharmony_ci enum pci_ers_result new_result; 2608c2ecf20Sopenharmony_ci 2618c2ecf20Sopenharmony_ci pci_lock_rescan_remove(); 2628c2ecf20Sopenharmony_ci pdev = edev->pdev; 2638c2ecf20Sopenharmony_ci if (pdev) 2648c2ecf20Sopenharmony_ci get_device(&pdev->dev); 2658c2ecf20Sopenharmony_ci pci_unlock_rescan_remove(); 2668c2ecf20Sopenharmony_ci if (!pdev) { 2678c2ecf20Sopenharmony_ci eeh_edev_info(edev, "no device"); 2688c2ecf20Sopenharmony_ci return; 2698c2ecf20Sopenharmony_ci } 2708c2ecf20Sopenharmony_ci device_lock(&pdev->dev); 2718c2ecf20Sopenharmony_ci if (eeh_edev_actionable(edev)) { 2728c2ecf20Sopenharmony_ci driver = eeh_pcid_get(pdev); 2738c2ecf20Sopenharmony_ci 2748c2ecf20Sopenharmony_ci if (!driver) 2758c2ecf20Sopenharmony_ci eeh_edev_info(edev, "no driver"); 2768c2ecf20Sopenharmony_ci else if (!driver->err_handler) 2778c2ecf20Sopenharmony_ci eeh_edev_info(edev, "driver not EEH aware"); 2788c2ecf20Sopenharmony_ci else if (edev->mode & EEH_DEV_NO_HANDLER) 2798c2ecf20Sopenharmony_ci eeh_edev_info(edev, "driver bound too late"); 2808c2ecf20Sopenharmony_ci else { 2818c2ecf20Sopenharmony_ci new_result = fn(edev, pdev, driver); 2828c2ecf20Sopenharmony_ci eeh_edev_info(edev, "%s driver reports: '%s'", 2838c2ecf20Sopenharmony_ci driver->name, 2848c2ecf20Sopenharmony_ci pci_ers_result_name(new_result)); 2858c2ecf20Sopenharmony_ci if (result) 2868c2ecf20Sopenharmony_ci *result = pci_ers_merge_result(*result, 2878c2ecf20Sopenharmony_ci new_result); 2888c2ecf20Sopenharmony_ci } 2898c2ecf20Sopenharmony_ci if (driver) 
2908c2ecf20Sopenharmony_ci eeh_pcid_put(pdev); 2918c2ecf20Sopenharmony_ci } else { 2928c2ecf20Sopenharmony_ci eeh_edev_info(edev, "not actionable (%d,%d,%d)", !!pdev, 2938c2ecf20Sopenharmony_ci !eeh_dev_removed(edev), !eeh_pe_passed(edev->pe)); 2948c2ecf20Sopenharmony_ci } 2958c2ecf20Sopenharmony_ci device_unlock(&pdev->dev); 2968c2ecf20Sopenharmony_ci if (edev->pdev != pdev) 2978c2ecf20Sopenharmony_ci eeh_edev_warn(edev, "Device changed during processing!\n"); 2988c2ecf20Sopenharmony_ci put_device(&pdev->dev); 2998c2ecf20Sopenharmony_ci} 3008c2ecf20Sopenharmony_ci 3018c2ecf20Sopenharmony_cistatic void eeh_pe_report(const char *name, struct eeh_pe *root, 3028c2ecf20Sopenharmony_ci eeh_report_fn fn, enum pci_ers_result *result) 3038c2ecf20Sopenharmony_ci{ 3048c2ecf20Sopenharmony_ci struct eeh_pe *pe; 3058c2ecf20Sopenharmony_ci struct eeh_dev *edev, *tmp; 3068c2ecf20Sopenharmony_ci 3078c2ecf20Sopenharmony_ci pr_info("EEH: Beginning: '%s'\n", name); 3088c2ecf20Sopenharmony_ci eeh_for_each_pe(root, pe) eeh_pe_for_each_dev(pe, edev, tmp) 3098c2ecf20Sopenharmony_ci eeh_pe_report_edev(edev, fn, result); 3108c2ecf20Sopenharmony_ci if (result) 3118c2ecf20Sopenharmony_ci pr_info("EEH: Finished:'%s' with aggregate recovery state:'%s'\n", 3128c2ecf20Sopenharmony_ci name, pci_ers_result_name(*result)); 3138c2ecf20Sopenharmony_ci else 3148c2ecf20Sopenharmony_ci pr_info("EEH: Finished:'%s'", name); 3158c2ecf20Sopenharmony_ci} 3168c2ecf20Sopenharmony_ci 3178c2ecf20Sopenharmony_ci/** 3188c2ecf20Sopenharmony_ci * eeh_report_error - Report pci error to each device driver 3198c2ecf20Sopenharmony_ci * @edev: eeh device 3208c2ecf20Sopenharmony_ci * @driver: device's PCI driver 3218c2ecf20Sopenharmony_ci * 3228c2ecf20Sopenharmony_ci * Report an EEH error to each device driver. 
3238c2ecf20Sopenharmony_ci */ 3248c2ecf20Sopenharmony_cistatic enum pci_ers_result eeh_report_error(struct eeh_dev *edev, 3258c2ecf20Sopenharmony_ci struct pci_dev *pdev, 3268c2ecf20Sopenharmony_ci struct pci_driver *driver) 3278c2ecf20Sopenharmony_ci{ 3288c2ecf20Sopenharmony_ci enum pci_ers_result rc; 3298c2ecf20Sopenharmony_ci 3308c2ecf20Sopenharmony_ci if (!driver->err_handler->error_detected) 3318c2ecf20Sopenharmony_ci return PCI_ERS_RESULT_NONE; 3328c2ecf20Sopenharmony_ci 3338c2ecf20Sopenharmony_ci eeh_edev_info(edev, "Invoking %s->error_detected(IO frozen)", 3348c2ecf20Sopenharmony_ci driver->name); 3358c2ecf20Sopenharmony_ci rc = driver->err_handler->error_detected(pdev, pci_channel_io_frozen); 3368c2ecf20Sopenharmony_ci 3378c2ecf20Sopenharmony_ci edev->in_error = true; 3388c2ecf20Sopenharmony_ci pci_uevent_ers(pdev, PCI_ERS_RESULT_NONE); 3398c2ecf20Sopenharmony_ci return rc; 3408c2ecf20Sopenharmony_ci} 3418c2ecf20Sopenharmony_ci 3428c2ecf20Sopenharmony_ci/** 3438c2ecf20Sopenharmony_ci * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled 3448c2ecf20Sopenharmony_ci * @edev: eeh device 3458c2ecf20Sopenharmony_ci * @driver: device's PCI driver 3468c2ecf20Sopenharmony_ci * 3478c2ecf20Sopenharmony_ci * Tells each device driver that IO ports, MMIO and config space I/O 3488c2ecf20Sopenharmony_ci * are now enabled. 
3498c2ecf20Sopenharmony_ci */ 3508c2ecf20Sopenharmony_cistatic enum pci_ers_result eeh_report_mmio_enabled(struct eeh_dev *edev, 3518c2ecf20Sopenharmony_ci struct pci_dev *pdev, 3528c2ecf20Sopenharmony_ci struct pci_driver *driver) 3538c2ecf20Sopenharmony_ci{ 3548c2ecf20Sopenharmony_ci if (!driver->err_handler->mmio_enabled) 3558c2ecf20Sopenharmony_ci return PCI_ERS_RESULT_NONE; 3568c2ecf20Sopenharmony_ci eeh_edev_info(edev, "Invoking %s->mmio_enabled()", driver->name); 3578c2ecf20Sopenharmony_ci return driver->err_handler->mmio_enabled(pdev); 3588c2ecf20Sopenharmony_ci} 3598c2ecf20Sopenharmony_ci 3608c2ecf20Sopenharmony_ci/** 3618c2ecf20Sopenharmony_ci * eeh_report_reset - Tell device that slot has been reset 3628c2ecf20Sopenharmony_ci * @edev: eeh device 3638c2ecf20Sopenharmony_ci * @driver: device's PCI driver 3648c2ecf20Sopenharmony_ci * 3658c2ecf20Sopenharmony_ci * This routine must be called while EEH tries to reset particular 3668c2ecf20Sopenharmony_ci * PCI device so that the associated PCI device driver could take 3678c2ecf20Sopenharmony_ci * some actions, usually to save data the driver needs so that the 3688c2ecf20Sopenharmony_ci * driver can work again while the device is recovered. 
3698c2ecf20Sopenharmony_ci */ 3708c2ecf20Sopenharmony_cistatic enum pci_ers_result eeh_report_reset(struct eeh_dev *edev, 3718c2ecf20Sopenharmony_ci struct pci_dev *pdev, 3728c2ecf20Sopenharmony_ci struct pci_driver *driver) 3738c2ecf20Sopenharmony_ci{ 3748c2ecf20Sopenharmony_ci if (!driver->err_handler->slot_reset || !edev->in_error) 3758c2ecf20Sopenharmony_ci return PCI_ERS_RESULT_NONE; 3768c2ecf20Sopenharmony_ci eeh_edev_info(edev, "Invoking %s->slot_reset()", driver->name); 3778c2ecf20Sopenharmony_ci return driver->err_handler->slot_reset(pdev); 3788c2ecf20Sopenharmony_ci} 3798c2ecf20Sopenharmony_ci 3808c2ecf20Sopenharmony_cistatic void eeh_dev_restore_state(struct eeh_dev *edev, void *userdata) 3818c2ecf20Sopenharmony_ci{ 3828c2ecf20Sopenharmony_ci struct pci_dev *pdev; 3838c2ecf20Sopenharmony_ci 3848c2ecf20Sopenharmony_ci if (!edev) 3858c2ecf20Sopenharmony_ci return; 3868c2ecf20Sopenharmony_ci 3878c2ecf20Sopenharmony_ci /* 3888c2ecf20Sopenharmony_ci * The content in the config space isn't saved because 3898c2ecf20Sopenharmony_ci * the blocked config space on some adapters. We have 3908c2ecf20Sopenharmony_ci * to restore the initial saved config space when the 3918c2ecf20Sopenharmony_ci * EEH device is created. 
3928c2ecf20Sopenharmony_ci */ 3938c2ecf20Sopenharmony_ci if (edev->pe && (edev->pe->state & EEH_PE_CFG_RESTRICTED)) { 3948c2ecf20Sopenharmony_ci if (list_is_last(&edev->entry, &edev->pe->edevs)) 3958c2ecf20Sopenharmony_ci eeh_pe_restore_bars(edev->pe); 3968c2ecf20Sopenharmony_ci 3978c2ecf20Sopenharmony_ci return; 3988c2ecf20Sopenharmony_ci } 3998c2ecf20Sopenharmony_ci 4008c2ecf20Sopenharmony_ci pdev = eeh_dev_to_pci_dev(edev); 4018c2ecf20Sopenharmony_ci if (!pdev) 4028c2ecf20Sopenharmony_ci return; 4038c2ecf20Sopenharmony_ci 4048c2ecf20Sopenharmony_ci pci_restore_state(pdev); 4058c2ecf20Sopenharmony_ci} 4068c2ecf20Sopenharmony_ci 4078c2ecf20Sopenharmony_ci/** 4088c2ecf20Sopenharmony_ci * eeh_report_resume - Tell device to resume normal operations 4098c2ecf20Sopenharmony_ci * @edev: eeh device 4108c2ecf20Sopenharmony_ci * @driver: device's PCI driver 4118c2ecf20Sopenharmony_ci * 4128c2ecf20Sopenharmony_ci * This routine must be called to notify the device driver that it 4138c2ecf20Sopenharmony_ci * could resume so that the device driver can do some initialization 4148c2ecf20Sopenharmony_ci * to make the recovered device work again. 
4158c2ecf20Sopenharmony_ci */ 4168c2ecf20Sopenharmony_cistatic enum pci_ers_result eeh_report_resume(struct eeh_dev *edev, 4178c2ecf20Sopenharmony_ci struct pci_dev *pdev, 4188c2ecf20Sopenharmony_ci struct pci_driver *driver) 4198c2ecf20Sopenharmony_ci{ 4208c2ecf20Sopenharmony_ci if (!driver->err_handler->resume || !edev->in_error) 4218c2ecf20Sopenharmony_ci return PCI_ERS_RESULT_NONE; 4228c2ecf20Sopenharmony_ci 4238c2ecf20Sopenharmony_ci eeh_edev_info(edev, "Invoking %s->resume()", driver->name); 4248c2ecf20Sopenharmony_ci driver->err_handler->resume(pdev); 4258c2ecf20Sopenharmony_ci 4268c2ecf20Sopenharmony_ci pci_uevent_ers(edev->pdev, PCI_ERS_RESULT_RECOVERED); 4278c2ecf20Sopenharmony_ci#ifdef CONFIG_PCI_IOV 4288c2ecf20Sopenharmony_ci if (eeh_ops->notify_resume) 4298c2ecf20Sopenharmony_ci eeh_ops->notify_resume(edev); 4308c2ecf20Sopenharmony_ci#endif 4318c2ecf20Sopenharmony_ci return PCI_ERS_RESULT_NONE; 4328c2ecf20Sopenharmony_ci} 4338c2ecf20Sopenharmony_ci 4348c2ecf20Sopenharmony_ci/** 4358c2ecf20Sopenharmony_ci * eeh_report_failure - Tell device driver that device is dead. 4368c2ecf20Sopenharmony_ci * @edev: eeh device 4378c2ecf20Sopenharmony_ci * @driver: device's PCI driver 4388c2ecf20Sopenharmony_ci * 4398c2ecf20Sopenharmony_ci * This informs the device driver that the device is permanently 4408c2ecf20Sopenharmony_ci * dead, and that no further recovery attempts will be made on it. 
4418c2ecf20Sopenharmony_ci */ 4428c2ecf20Sopenharmony_cistatic enum pci_ers_result eeh_report_failure(struct eeh_dev *edev, 4438c2ecf20Sopenharmony_ci struct pci_dev *pdev, 4448c2ecf20Sopenharmony_ci struct pci_driver *driver) 4458c2ecf20Sopenharmony_ci{ 4468c2ecf20Sopenharmony_ci enum pci_ers_result rc; 4478c2ecf20Sopenharmony_ci 4488c2ecf20Sopenharmony_ci if (!driver->err_handler->error_detected) 4498c2ecf20Sopenharmony_ci return PCI_ERS_RESULT_NONE; 4508c2ecf20Sopenharmony_ci 4518c2ecf20Sopenharmony_ci eeh_edev_info(edev, "Invoking %s->error_detected(permanent failure)", 4528c2ecf20Sopenharmony_ci driver->name); 4538c2ecf20Sopenharmony_ci rc = driver->err_handler->error_detected(pdev, 4548c2ecf20Sopenharmony_ci pci_channel_io_perm_failure); 4558c2ecf20Sopenharmony_ci 4568c2ecf20Sopenharmony_ci pci_uevent_ers(pdev, PCI_ERS_RESULT_DISCONNECT); 4578c2ecf20Sopenharmony_ci return rc; 4588c2ecf20Sopenharmony_ci} 4598c2ecf20Sopenharmony_ci 4608c2ecf20Sopenharmony_cistatic void *eeh_add_virt_device(struct eeh_dev *edev) 4618c2ecf20Sopenharmony_ci{ 4628c2ecf20Sopenharmony_ci struct pci_driver *driver; 4638c2ecf20Sopenharmony_ci struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 4648c2ecf20Sopenharmony_ci 4658c2ecf20Sopenharmony_ci if (!(edev->physfn)) { 4668c2ecf20Sopenharmony_ci eeh_edev_warn(edev, "Not for VF\n"); 4678c2ecf20Sopenharmony_ci return NULL; 4688c2ecf20Sopenharmony_ci } 4698c2ecf20Sopenharmony_ci 4708c2ecf20Sopenharmony_ci driver = eeh_pcid_get(dev); 4718c2ecf20Sopenharmony_ci if (driver) { 4728c2ecf20Sopenharmony_ci if (driver->err_handler) { 4738c2ecf20Sopenharmony_ci eeh_pcid_put(dev); 4748c2ecf20Sopenharmony_ci return NULL; 4758c2ecf20Sopenharmony_ci } 4768c2ecf20Sopenharmony_ci eeh_pcid_put(dev); 4778c2ecf20Sopenharmony_ci } 4788c2ecf20Sopenharmony_ci 4798c2ecf20Sopenharmony_ci#ifdef CONFIG_PCI_IOV 4808c2ecf20Sopenharmony_ci pci_iov_add_virtfn(edev->physfn, edev->vf_index); 4818c2ecf20Sopenharmony_ci#endif 4828c2ecf20Sopenharmony_ci return NULL; 
4838c2ecf20Sopenharmony_ci} 4848c2ecf20Sopenharmony_ci 4858c2ecf20Sopenharmony_cistatic void eeh_rmv_device(struct eeh_dev *edev, void *userdata) 4868c2ecf20Sopenharmony_ci{ 4878c2ecf20Sopenharmony_ci struct pci_driver *driver; 4888c2ecf20Sopenharmony_ci struct pci_dev *dev = eeh_dev_to_pci_dev(edev); 4898c2ecf20Sopenharmony_ci struct eeh_rmv_data *rmv_data = (struct eeh_rmv_data *)userdata; 4908c2ecf20Sopenharmony_ci 4918c2ecf20Sopenharmony_ci /* 4928c2ecf20Sopenharmony_ci * Actually, we should remove the PCI bridges as well. 4938c2ecf20Sopenharmony_ci * However, that's lots of complexity to do that, 4948c2ecf20Sopenharmony_ci * particularly some of devices under the bridge might 4958c2ecf20Sopenharmony_ci * support EEH. So we just care about PCI devices for 4968c2ecf20Sopenharmony_ci * simplicity here. 4978c2ecf20Sopenharmony_ci */ 4988c2ecf20Sopenharmony_ci if (!eeh_edev_actionable(edev) || 4998c2ecf20Sopenharmony_ci (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE)) 5008c2ecf20Sopenharmony_ci return; 5018c2ecf20Sopenharmony_ci 5028c2ecf20Sopenharmony_ci if (rmv_data) { 5038c2ecf20Sopenharmony_ci driver = eeh_pcid_get(dev); 5048c2ecf20Sopenharmony_ci if (driver) { 5058c2ecf20Sopenharmony_ci if (driver->err_handler && 5068c2ecf20Sopenharmony_ci driver->err_handler->error_detected && 5078c2ecf20Sopenharmony_ci driver->err_handler->slot_reset) { 5088c2ecf20Sopenharmony_ci eeh_pcid_put(dev); 5098c2ecf20Sopenharmony_ci return; 5108c2ecf20Sopenharmony_ci } 5118c2ecf20Sopenharmony_ci eeh_pcid_put(dev); 5128c2ecf20Sopenharmony_ci } 5138c2ecf20Sopenharmony_ci } 5148c2ecf20Sopenharmony_ci 5158c2ecf20Sopenharmony_ci /* Remove it from PCI subsystem */ 5168c2ecf20Sopenharmony_ci pr_info("EEH: Removing %s without EEH sensitive driver\n", 5178c2ecf20Sopenharmony_ci pci_name(dev)); 5188c2ecf20Sopenharmony_ci edev->mode |= EEH_DEV_DISCONNECTED; 5198c2ecf20Sopenharmony_ci if (rmv_data) 5208c2ecf20Sopenharmony_ci rmv_data->removed_dev_count++; 5218c2ecf20Sopenharmony_ci 
5228c2ecf20Sopenharmony_ci if (edev->physfn) { 5238c2ecf20Sopenharmony_ci#ifdef CONFIG_PCI_IOV 5248c2ecf20Sopenharmony_ci pci_iov_remove_virtfn(edev->physfn, edev->vf_index); 5258c2ecf20Sopenharmony_ci edev->pdev = NULL; 5268c2ecf20Sopenharmony_ci#endif 5278c2ecf20Sopenharmony_ci if (rmv_data) 5288c2ecf20Sopenharmony_ci list_add(&edev->rmv_entry, &rmv_data->removed_vf_list); 5298c2ecf20Sopenharmony_ci } else { 5308c2ecf20Sopenharmony_ci pci_lock_rescan_remove(); 5318c2ecf20Sopenharmony_ci pci_stop_and_remove_bus_device(dev); 5328c2ecf20Sopenharmony_ci pci_unlock_rescan_remove(); 5338c2ecf20Sopenharmony_ci } 5348c2ecf20Sopenharmony_ci} 5358c2ecf20Sopenharmony_ci 5368c2ecf20Sopenharmony_cistatic void *eeh_pe_detach_dev(struct eeh_pe *pe, void *userdata) 5378c2ecf20Sopenharmony_ci{ 5388c2ecf20Sopenharmony_ci struct eeh_dev *edev, *tmp; 5398c2ecf20Sopenharmony_ci 5408c2ecf20Sopenharmony_ci eeh_pe_for_each_dev(pe, edev, tmp) { 5418c2ecf20Sopenharmony_ci if (!(edev->mode & EEH_DEV_DISCONNECTED)) 5428c2ecf20Sopenharmony_ci continue; 5438c2ecf20Sopenharmony_ci 5448c2ecf20Sopenharmony_ci edev->mode &= ~(EEH_DEV_DISCONNECTED | EEH_DEV_IRQ_DISABLED); 5458c2ecf20Sopenharmony_ci eeh_pe_tree_remove(edev); 5468c2ecf20Sopenharmony_ci } 5478c2ecf20Sopenharmony_ci 5488c2ecf20Sopenharmony_ci return NULL; 5498c2ecf20Sopenharmony_ci} 5508c2ecf20Sopenharmony_ci 5518c2ecf20Sopenharmony_ci/* 5528c2ecf20Sopenharmony_ci * Explicitly clear PE's frozen state for PowerNV where 5538c2ecf20Sopenharmony_ci * we have frozen PE until BAR restore is completed. It's 5548c2ecf20Sopenharmony_ci * harmless to clear it for pSeries. To be consistent with 5558c2ecf20Sopenharmony_ci * PE reset (for 3 times), we try to clear the frozen state 5568c2ecf20Sopenharmony_ci * for 3 times as well. 
5578c2ecf20Sopenharmony_ci */ 5588c2ecf20Sopenharmony_cistatic int eeh_clear_pe_frozen_state(struct eeh_pe *root, bool include_passed) 5598c2ecf20Sopenharmony_ci{ 5608c2ecf20Sopenharmony_ci struct eeh_pe *pe; 5618c2ecf20Sopenharmony_ci int i; 5628c2ecf20Sopenharmony_ci 5638c2ecf20Sopenharmony_ci eeh_for_each_pe(root, pe) { 5648c2ecf20Sopenharmony_ci if (include_passed || !eeh_pe_passed(pe)) { 5658c2ecf20Sopenharmony_ci for (i = 0; i < 3; i++) 5668c2ecf20Sopenharmony_ci if (!eeh_unfreeze_pe(pe)) 5678c2ecf20Sopenharmony_ci break; 5688c2ecf20Sopenharmony_ci if (i >= 3) 5698c2ecf20Sopenharmony_ci return -EIO; 5708c2ecf20Sopenharmony_ci } 5718c2ecf20Sopenharmony_ci } 5728c2ecf20Sopenharmony_ci eeh_pe_state_clear(root, EEH_PE_ISOLATED, include_passed); 5738c2ecf20Sopenharmony_ci return 0; 5748c2ecf20Sopenharmony_ci} 5758c2ecf20Sopenharmony_ci 5768c2ecf20Sopenharmony_ciint eeh_pe_reset_and_recover(struct eeh_pe *pe) 5778c2ecf20Sopenharmony_ci{ 5788c2ecf20Sopenharmony_ci int ret; 5798c2ecf20Sopenharmony_ci 5808c2ecf20Sopenharmony_ci /* Bail if the PE is being recovered */ 5818c2ecf20Sopenharmony_ci if (pe->state & EEH_PE_RECOVERING) 5828c2ecf20Sopenharmony_ci return 0; 5838c2ecf20Sopenharmony_ci 5848c2ecf20Sopenharmony_ci /* Put the PE into recovery mode */ 5858c2ecf20Sopenharmony_ci eeh_pe_state_mark(pe, EEH_PE_RECOVERING); 5868c2ecf20Sopenharmony_ci 5878c2ecf20Sopenharmony_ci /* Save states */ 5888c2ecf20Sopenharmony_ci eeh_pe_dev_traverse(pe, eeh_dev_save_state, NULL); 5898c2ecf20Sopenharmony_ci 5908c2ecf20Sopenharmony_ci /* Issue reset */ 5918c2ecf20Sopenharmony_ci ret = eeh_pe_reset_full(pe, true); 5928c2ecf20Sopenharmony_ci if (ret) { 5938c2ecf20Sopenharmony_ci eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 5948c2ecf20Sopenharmony_ci return ret; 5958c2ecf20Sopenharmony_ci } 5968c2ecf20Sopenharmony_ci 5978c2ecf20Sopenharmony_ci /* Unfreeze the PE */ 5988c2ecf20Sopenharmony_ci ret = eeh_clear_pe_frozen_state(pe, true); 5998c2ecf20Sopenharmony_ci if (ret) { 
6008c2ecf20Sopenharmony_ci eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 6018c2ecf20Sopenharmony_ci return ret; 6028c2ecf20Sopenharmony_ci } 6038c2ecf20Sopenharmony_ci 6048c2ecf20Sopenharmony_ci /* Restore device state */ 6058c2ecf20Sopenharmony_ci eeh_pe_dev_traverse(pe, eeh_dev_restore_state, NULL); 6068c2ecf20Sopenharmony_ci 6078c2ecf20Sopenharmony_ci /* Clear recovery mode */ 6088c2ecf20Sopenharmony_ci eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true); 6098c2ecf20Sopenharmony_ci 6108c2ecf20Sopenharmony_ci return 0; 6118c2ecf20Sopenharmony_ci} 6128c2ecf20Sopenharmony_ci

/**
 * eeh_reset_device - Perform actual reset of a pci slot
 * @pe: EEH PE
 * @bus: PCI bus corresponding to the isolated slot
 * @rmv_data: Optional, list to record removed devices
 * @driver_eeh_aware: Does the device's driver provide EEH support?
 *
 * This routine must be called to do reset on the indicated PE.
 * During the reset, udev might be invoked because those affected
 * PCI devices will be removed and then added.
 *
 * Return: 0 on success, otherwise the non-zero value returned by
 * eeh_pe_reset_full() or eeh_clear_pe_frozen_state().
 */
static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus,
			    struct eeh_rmv_data *rmv_data,
			    bool driver_eeh_aware)
{
	time64_t tstamp;
	int cnt, rc;
	struct eeh_dev *edev;
	struct eeh_pe *tmp_pe;
	bool any_passed = false;

	eeh_for_each_pe(pe, tmp_pe)
		any_passed |= eeh_pe_passed(tmp_pe);

	/* pcibios will clear the counter; save the value */
	cnt = pe->freeze_count;
	tstamp = pe->tstamp;

	/*
	 * We don't remove the corresponding PE instances because
	 * we need the information afterwards. The attached EEH
	 * devices are expected to be attached soon when calling
	 * into pci_hp_add_devices().
	 */
	eeh_pe_state_mark(pe, EEH_PE_KEEP);
	if (any_passed || driver_eeh_aware || (pe->type & EEH_PE_VF)) {
		eeh_pe_dev_traverse(pe, eeh_rmv_device, rmv_data);
	} else {
		pci_lock_rescan_remove();
		pci_hp_remove_devices(bus);
		pci_unlock_rescan_remove();
	}

	/*
	 * Reset the pci controller. (Asserts RST#; resets config space).
	 * Reconfigure bridges and devices. Don't try to bring the system
	 * up if the reset failed for some reason.
	 *
	 * During the reset, it's very dangerous to have uncontrolled PCI
	 * config accesses. So we prefer to block them. However, controlled
	 * PCI config accesses initiated from EEH itself are allowed.
	 */
	rc = eeh_pe_reset_full(pe, false);
	if (rc)
		return rc;

	pci_lock_rescan_remove();

	/* Restore PE */
	eeh_ops->configure_bridge(pe);
	eeh_pe_restore_bars(pe);

	/* Clear frozen state */
	rc = eeh_clear_pe_frozen_state(pe, false);
	if (rc) {
		pci_unlock_rescan_remove();
		return rc;
	}

	/* Give the system 5 seconds to finish running the user-space
	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
	 * this is a hack, but if we don't do this, and try to bring
	 * the device up before the scripts have taken it down,
	 * potentially weird things happen.
	 *
	 * @rmv_data is optional, so only consult its removed device
	 * count when a list was actually supplied.
	 */
	if (!driver_eeh_aware || (rmv_data && rmv_data->removed_dev_count)) {
		pr_info("EEH: Sleep 5s ahead of %s hotplug\n",
			(driver_eeh_aware ? "partial" : "complete"));
		ssleep(5);

		/*
		 * The EEH device is still connected with its parent
		 * PE. We should disconnect it so the binding can be
		 * rebuilt when adding PCI devices.
		 */
		edev = list_first_entry(&pe->edevs, struct eeh_dev, entry);
		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
		if (pe->type & EEH_PE_VF) {
			eeh_add_virt_device(edev);
		} else {
			if (!driver_eeh_aware)
				eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
			pci_hp_add_devices(bus);
		}
	}
	eeh_pe_state_clear(pe, EEH_PE_KEEP, true);

	/* Put back the values saved before the devices were removed */
	pe->tstamp = tstamp;
	pe->freeze_count = cnt;

	pci_unlock_rescan_remove();
	return 0;
}

/* The longest amount of time to wait for a pci device
 * to come back on line, in seconds.
 */
#define MAX_WAIT_FOR_RECOVERY 300


/* Walks the PE tree after processing an event to remove any stale PEs.
 *
 * NB: This is recursive so that the leaf PEs get removed before their
 * parents do. It would be possible to do this iteratively, but we don't
 * since the recursive form is easier to read and we need to guarantee
 * the leaf nodes will be handled first.
 */
static void eeh_pe_cleanup(struct eeh_pe *pe)
{
	struct eeh_pe *child_pe, *tmp;

	/* Children first; the _safe iterator tolerates a child being freed */
	list_for_each_entry_safe(child_pe, tmp, &pe->child_list, child)
		eeh_pe_cleanup(child_pe);

	if (pe->state & EEH_PE_KEEP)
		return;

	if (!(pe->state & EEH_PE_INVALID))
		return;

	/* Only free a PE that has neither devices nor children left */
	if (list_empty(&pe->edevs) && list_empty(&pe->child_list)) {
		list_del(&pe->child);
		kfree(pe);
	}
}

/**
 * eeh_slot_presence_check - Check if a device is still present in a slot
 * @pdev: pci_dev to check
 *
 * This function may return a false positive if we can't determine the slot's
 * presence state. This might happen for PCIe slots if the PE containing
 * the upstream bridge is also frozen, or the bridge is part of the same PE
 * as the device.
 *
 * This shouldn't happen often, but you might see it if you hotplug a PCIe
 * switch.
 *
 * As a side effect, the slot's attention indicator is set when the slot
 * provides a set_attention_status() hotplug op.
 *
 * Return: false if @pdev is NULL or has permanently failed; true if the
 * adapter status cannot be queried; otherwise whether the hotplug driver
 * reports a non-zero adapter status.
 */
static bool eeh_slot_presence_check(struct pci_dev *pdev)
{
	const struct hotplug_slot_ops *ops;
	struct pci_slot *slot;
	u8 state;
	int rc;

	if (!pdev)
		return false;

	if (pdev->error_state == pci_channel_io_perm_failure)
		return false;

	slot = pdev->slot;
	if (!slot || !slot->hotplug)
		return true;

	ops = slot->hotplug->ops;
	if (!ops || !ops->get_adapter_status)
		return true;

	/* set the attention indicator while we've got the slot ops */
	if (ops->set_attention_status)
		ops->set_attention_status(slot->hotplug, 1);

	rc = ops->get_adapter_status(slot->hotplug, &state);
	if (rc)
		return true;

	return !!state;
}

/*
 * Turn the slot attention indicator back off for @pdev's slot. Does nothing
 * if @pdev is NULL, has permanently failed, or its slot has no hotplug
 * set_attention_status() op.
 */
static void eeh_clear_slot_attention(struct pci_dev *pdev)
{
	const struct hotplug_slot_ops *ops;
	struct pci_slot *slot;

	if (!pdev)
		return;

	if (pdev->error_state == pci_channel_io_perm_failure)
		return;

	slot = pdev->slot;
	if (!slot || !slot->hotplug)
		return;

	ops = slot->hotplug->ops;
	if (!ops || !ops->set_attention_status)
		return;

	ops->set_attention_status(slot->hotplug, 0);
}

/**
 * eeh_handle_normal_event - Handle EEH events on a specific PE
 * @pe: EEH PE - which should not be used after we return, as it may
 * have been invalidated.
 *
 * Attempts to recover the given PE.  If recovery fails or the PE has failed
 * too many times, remove the PE.
 *
 * While PHB detects address or data parity errors on particular PCI
 * slot, the associated PE will be frozen. Besides, DMA's occurring
 * to wild addresses (which usually happen due to bugs in device
 * drivers or in PCI adapter firmware) can cause EEH error. #SERR,
 * #PERR or other misc PCI-related errors also can trigger EEH errors.
 *
 * Recovery process consists of unplugging the device driver (which
 * generated hotplug events to userspace), then issuing a PCI #RST to
 * the device, then reconfiguring the PCI config space for all bridges
 * & devices under this slot, and then finally restarting the device
 * drivers (which cause a second set of hotplug events to go out to
 * userspace).
 */
void eeh_handle_normal_event(struct eeh_pe *pe)
{
	struct pci_bus *bus;
	struct eeh_dev *edev, *tmp;
	struct eeh_pe *tmp_pe;
	int rc = 0;
	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
	struct eeh_rmv_data rmv_data =
		{LIST_HEAD_INIT(rmv_data.removed_vf_list), 0};
	int devices = 0;

	bus = eeh_pe_bus_get(pe);
	if (!bus) {
		pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
			__func__, pe->phb->global_number, pe->addr);
		return;
	}

	/*
	 * When devices are hot-removed we might get an EEH due to
	 * a driver attempting to touch the MMIO space of a removed
	 * device. In this case we don't have a device to recover
	 * so suppress the event if we can't find any present devices.
	 *
	 * The hotplug driver should take care of tearing down the
	 * device itself.
	 */
	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			if (eeh_slot_presence_check(edev->pdev))
				devices++;

	if (!devices) {
		pr_debug("EEH: Frozen PHB#%x-PE#%x is empty!\n",
			pe->phb->global_number, pe->addr);
		goto out; /* nothing to recover */
	}

	/* Log the event */
	if (pe->type & EEH_PE_PHB) {
		pr_err("EEH: Recovering PHB#%x, location: %s\n",
			pe->phb->global_number, eeh_pe_loc_get(pe));
	} else {
		struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);

		pr_err("EEH: Recovering PHB#%x-PE#%x\n",
		       pe->phb->global_number, pe->addr);
		pr_err("EEH: PE location: %s, PHB location: %s\n",
		       eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
	}

#ifdef CONFIG_STACKTRACE
	/*
	 * Print the saved stack trace now that we've verified there's
	 * something to recover.
	 */
	if (pe->trace_entries) {
		void **ptrs = (void **) pe->stack_trace;
		int i;

		pr_err("EEH: Frozen PHB#%x-PE#%x detected\n",
		       pe->phb->global_number, pe->addr);

		/* FIXME: Use the same format as dump_stack() */
		pr_err("EEH: Call Trace:\n");
		for (i = 0; i < pe->trace_entries; i++)
			pr_err("EEH: [%pK] %pS\n", ptrs[i], ptrs[i]);

		pe->trace_entries = 0;
	}
#endif /* CONFIG_STACKTRACE */

	eeh_pe_update_time_stamp(pe);
	pe->freeze_count++;
	if (pe->freeze_count > eeh_max_freezes) {
		pr_err("EEH: PHB#%x-PE#%x has failed %d times in the last hour and has been permanently disabled.\n",
		       pe->phb->global_number, pe->addr,
		       pe->freeze_count);
		result = PCI_ERS_RESULT_DISCONNECT;
	}

	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			edev->mode &= ~EEH_DEV_NO_HANDLER;

	/* Walk the various device drivers attached to this slot through
	 * a reset sequence, giving each an opportunity to do what it needs
	 * to accomplish the reset.  Each child gets a report of the
	 * status ... if any child can't handle the reset, then the entire
	 * slot is dlpar removed and added.
	 *
	 * When the PHB is fenced, we have to issue a reset to recover from
	 * the error. Override the result if necessary to have a partial
	 * hotplug for this case.
	 */
	if (result != PCI_ERS_RESULT_DISCONNECT) {
		pr_warn("EEH: This PCI device has failed %d times in the last hour and will be permanently disabled after %d failures.\n",
			pe->freeze_count, eeh_max_freezes);
		pr_info("EEH: Notify device drivers to shutdown\n");
		eeh_set_channel_state(pe, pci_channel_io_frozen);
		eeh_set_irq_state(pe, false);
		eeh_pe_report("error_detected(IO frozen)", pe,
			      eeh_report_error, &result);
		if ((pe->type & EEH_PE_PHB) &&
		    result != PCI_ERS_RESULT_NONE &&
		    result != PCI_ERS_RESULT_NEED_RESET)
			result = PCI_ERS_RESULT_NEED_RESET;
	}

	/* Get the current PCI slot state. This can take a long time,
	 * sometimes over 300 seconds for certain systems.
	 */
	if (result != PCI_ERS_RESULT_DISCONNECT) {
		rc = eeh_wait_state(pe, MAX_WAIT_FOR_RECOVERY*1000);
		if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
			pr_warn("EEH: Permanent failure\n");
			result = PCI_ERS_RESULT_DISCONNECT;
		}
	}

	/* Since rtas may enable MMIO when posting the error log,
	 * don't post the error log until after all dev drivers
	 * have been informed.
	 */
	if (result != PCI_ERS_RESULT_DISCONNECT) {
		pr_info("EEH: Collect temporary log\n");
		eeh_slot_error_detail(pe, EEH_LOG_TEMP);
	}

	/* If all device drivers were EEH-unaware, then shut
	 * down all of the device drivers, and hope they
	 * go down willingly, without panicking the system.
	 */
	if (result == PCI_ERS_RESULT_NONE) {
		pr_info("EEH: Reset with hotplug activity\n");
		rc = eeh_reset_device(pe, bus, NULL, false);
		if (rc) {
			pr_warn("%s: Unable to reset, err=%d\n",
				__func__, rc);
			result = PCI_ERS_RESULT_DISCONNECT;
		}
	}

	/* If all devices reported they can proceed, then re-enable MMIO */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enable I/O for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);

		if (rc < 0) {
			result = PCI_ERS_RESULT_DISCONNECT;
		} else if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			pr_info("EEH: Notify device drivers to resume I/O\n");
			eeh_pe_report("mmio_enabled", pe,
				      eeh_report_mmio_enabled, &result);
		}
	}

	/* If all devices reported they can proceed, then re-enable DMA */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		pr_info("EEH: Enabled DMA for affected devices\n");
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);

		if (rc < 0) {
			result = PCI_ERS_RESULT_DISCONNECT;
		} else if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			/*
			 * We didn't do PE reset for the case. The PE
			 * is still in frozen state. Clear it before
			 * resuming the PE.
			 */
			eeh_pe_state_clear(pe, EEH_PE_ISOLATED, true);
			result = PCI_ERS_RESULT_RECOVERED;
		}
	}

	/* If any device called out for a reset, then reset the slot */
	if (result == PCI_ERS_RESULT_NEED_RESET) {
		pr_info("EEH: Reset without hotplug activity\n");
		rc = eeh_reset_device(pe, bus, &rmv_data, true);
		if (rc) {
			pr_warn("%s: Cannot reset, err=%d\n",
				__func__, rc);
			result = PCI_ERS_RESULT_DISCONNECT;
		} else {
			result = PCI_ERS_RESULT_NONE;
			eeh_set_channel_state(pe, pci_channel_io_normal);
			eeh_set_irq_state(pe, true);
			eeh_pe_report("slot_reset", pe, eeh_report_reset,
				      &result);
		}
	}

	if ((result == PCI_ERS_RESULT_RECOVERED) ||
	    (result == PCI_ERS_RESULT_NONE)) {
		/*
		 * For those hot removed VFs, we should add them back after
		 * the PF has been recovered properly.
		 */
		list_for_each_entry_safe(edev, tmp, &rmv_data.removed_vf_list,
					 rmv_entry) {
			eeh_add_virt_device(edev);
			list_del(&edev->rmv_entry);
		}

		/* Tell all device drivers that they can resume operations */
		pr_info("EEH: Notify device driver to resume\n");
		eeh_set_channel_state(pe, pci_channel_io_normal);
		eeh_set_irq_state(pe, true);
		eeh_pe_report("resume", pe, eeh_report_resume, NULL);
		eeh_for_each_pe(pe, tmp_pe) {
			eeh_pe_for_each_dev(tmp_pe, edev, tmp) {
				edev->mode &= ~EEH_DEV_NO_HANDLER;
				edev->in_error = false;
			}
		}

		pr_info("EEH: Recovery successful.\n");
		goto out;
	}

	/*
	 * About 90% of all real-life EEH failures in the field
	 * are due to poorly seated PCI cards. Only 10% or so are
	 * due to actual, failed cards.
	 */
	pr_err("EEH: Unable to recover from failure from PHB#%x-PE#%x.\n"
		"Please try reseating or replacing it\n",
		pe->phb->global_number, pe->addr);

	eeh_slot_error_detail(pe, EEH_LOG_PERM);

	/* Notify all devices that they're about to go down. */
	eeh_set_irq_state(pe, false);
	eeh_pe_report("error_detected(permanent failure)", pe,
		      eeh_report_failure, NULL);
	eeh_set_channel_state(pe, pci_channel_io_perm_failure);

	/* Mark the PE to be removed permanently */
	eeh_pe_state_mark(pe, EEH_PE_REMOVED);

	/*
	 * Shut down the device drivers for good. We mark
	 * all removed devices correctly to avoid accessing
	 * their PCI config any more.
	 */
	if (pe->type & EEH_PE_VF) {
		eeh_pe_dev_traverse(pe, eeh_rmv_device, NULL);
		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
	} else {
		eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
		eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);

		pci_lock_rescan_remove();
		pci_hp_remove_devices(bus);
		pci_unlock_rescan_remove();
		/* The passed PE should no longer be used */
		return;
	}

out:
	/*
	 * Clean up any PEs without devices. While marked as EEH_PE_RECOVERING
	 * we don't want to modify the PE tree structure so we do it here.
	 */
	eeh_pe_cleanup(pe);

	/* clear the slot attention LED for all recovered devices */
	eeh_for_each_pe(pe, tmp_pe)
		eeh_pe_for_each_dev(tmp_pe, edev, tmp)
			eeh_clear_slot_attention(edev->pdev);

	eeh_pe_state_clear(pe, EEH_PE_RECOVERING, true);
}

/**
 * eeh_handle_special_event - Handle EEH events without a specific failing PE
 *
 * Called when an EEH event is detected but can't be narrowed down to a
 * specific PE.  Iterates through possible failures and handles them as
 * necessary.
 */
void eeh_handle_special_event(void)
{
	struct eeh_pe *pe, *phb_pe, *tmp_pe;
	struct eeh_dev *edev, *tmp_edev;
	struct pci_bus *bus;
	struct pci_controller *hose;
	unsigned long flags;
	int rc;


	do {
		rc = eeh_ops->next_error(&pe);

		switch (rc) {
		case EEH_NEXT_ERR_DEAD_IOC:
			/* Mark all PHBs in dead state */
			eeh_serialize_lock(&flags);

			/* Purge all events */
			eeh_remove_event(NULL, true);

			list_for_each_entry(hose, &hose_list, list_node) {
				phb_pe = eeh_phb_pe_get(hose);
				if (!phb_pe)
					continue;

				eeh_pe_mark_isolated(phb_pe);
			}

			eeh_serialize_unlock(flags);

			break;
		case EEH_NEXT_ERR_FROZEN_PE:
		case EEH_NEXT_ERR_FENCED_PHB:
		case EEH_NEXT_ERR_DEAD_PHB:
			/* Mark the PE in fenced state */
			eeh_serialize_lock(&flags);

			/* Purge all events of the PHB */
			eeh_remove_event(pe, true);

			if (rc != EEH_NEXT_ERR_DEAD_PHB)
				eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
			eeh_pe_mark_isolated(pe);

			eeh_serialize_unlock(flags);

			break;
		case EEH_NEXT_ERR_NONE:
			return;
		default:
			pr_warn("%s: Invalid value %d from next_error()\n",
				__func__, rc);
			return;
		}

		/*
		 * For fenced PHB and frozen PE, it's handled as normal
		 * event. We have to remove the affected PHBs for dead
		 * PHB and IOC
		 */
		if (rc == EEH_NEXT_ERR_FROZEN_PE ||
		    rc == EEH_NEXT_ERR_FENCED_PHB) {
			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
			eeh_handle_normal_event(pe);
		} else {
			/*
			 * NOTE(review): this path is also taken for
			 * EEH_NEXT_ERR_DEAD_IOC and walks @pe; confirm that
			 * next_error() always sets @pe in that case.
			 */
			eeh_for_each_pe(pe, tmp_pe)
				eeh_pe_for_each_dev(tmp_pe, edev, tmp_edev)
					edev->mode &= ~EEH_DEV_NO_HANDLER;

			/* Notify all devices to be down */
			eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
			eeh_pe_report(
				"error_detected(permanent failure)", pe,
				eeh_report_failure, NULL);
			eeh_set_channel_state(pe, pci_channel_io_perm_failure);

			pci_lock_rescan_remove();
			list_for_each_entry(hose, &hose_list, list_node) {
				/* Only isolated PHBs that aren't recovering */
				phb_pe = eeh_phb_pe_get(hose);
				if (!phb_pe ||
				    !(phb_pe->state & EEH_PE_ISOLATED) ||
				    (phb_pe->state & EEH_PE_RECOVERING))
					continue;

				bus = eeh_pe_bus_get(phb_pe);
				if (!bus) {
					/* Report the PE whose bus lookup failed */
					pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
					       __func__,
					       phb_pe->phb->global_number,
					       phb_pe->addr);
					break;
				}
				pci_hp_remove_devices(bus);
			}
			pci_unlock_rescan_remove();
		}

		/*
		 * If we have detected dead IOC, we needn't proceed
		 * any more since all PHBs would have been removed
		 */
		if (rc == EEH_NEXT_ERR_DEAD_IOC)
			break;
	} while (rc != EEH_NEXT_ERR_NONE);
}