#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Helper functions shared by the powerpc EEH (Enhanced Error Handling)
# selftests. This file only defines functions; nothing runs when it is
# sourced apart from exporting KSELFTESTS_SKIP.

# Exit code used to tell the kselftest harness that a test was skipped.
export KSELFTESTS_SKIP=4

# log MESSAGE...
#
# Print the arguments, joined by single spaces, as one line on stderr so
# that diagnostics never mix with the lists some helpers emit on stdout.
log() {
	printf '%s\n' "$*" >&2
}

# pe_ok DEV
#
# Check the EEH state of the PE that contains PCI device DEV (a sysfs
# device name).
#
# Returns: 0 if the PE is healthy, 1 if the state cannot be read or the PE
#          is isolated, recovering, or lacks active MMIO/DMA.
pe_ok() {
	local dev="$1"
	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
	local eeh_state fw_state sw_state

	# If a driver doesn't support the error handling callbacks then the
	# device is recovered by removing and re-probing it. This causes the
	# sysfs directory to disappear, so read the PE state exactly once and
	# squash any potential error messages.
	eeh_state="$(cat "$path" 2>/dev/null)"
	if [ -z "$eeh_state" ] ; then
		return 1
	fi

	# The file holds two space separated words: the firmware state
	# followed by the software state.
	fw_state="${eeh_state%% *}"
	sw_state="${eeh_state#* }"
	sw_state="${sw_state%% *}"

	# If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an
	# error state or being recovered. Either way, not ok.
	if [ "$((sw_state & 0x3))" -ne 0 ] ; then
		return 1
	fi

	# A functioning PE should have the EEH_STATE_MMIO_ACTIVE and
	# EEH_STATE_DMA_ACTIVE flags set. The platform backends also set
	# these while the PE is in reset, but the RECOVERING check above
	# should stop any false positives.
	if [ "$((fw_state & 0x18))" -ne "$((0x18))" ] ; then
		return 1
	fi

	return 0
}

# eeh_supported
#
# Returns: 0 if /proc/powerpc/eeh exists and reports the EEH subsystem as
#          enabled, non-zero otherwise.
eeh_supported() {
	[ -e /proc/powerpc/eeh ] || return 1
	grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
}

# eeh_test_prep
#
# Common test setup. Exits the calling script with $KSELFTESTS_SKIP when
# EEH is unavailable or the debugfs error injection files are missing,
# then raises the maximum freeze count.
eeh_test_prep() {
	if ! eeh_supported ; then
		echo "EEH not supported on this system, skipping"
		exit $KSELFTESTS_SKIP
	fi

	# Both files are written to by eeh_one_dev(), so the tests cannot
	# run if either of them is missing.
	if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] || \
	   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
		log "debugfs EEH testing files are missing. Is debugfs mounted?"
		exit $KSELFTESTS_SKIP
	fi

	# Bump the max freeze count to something absurd so we don't
	# trip over it while breaking things.
	echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
}

# eeh_can_break DEV
#
# Decide whether it is safe to inject an error into PCI device DEV.
# For compatibility with callers that relied on it, the caller's $dev
# variable is used when no argument is given.
#
# Returns: 0 if the device may be broken, 1 (with the reason logged) if it
#          should be skipped.
eeh_can_break() {
	# $dev in the default below is expanded before the local is
	# created, so it still refers to the caller's variable.
	local dev="${1:-$dev}"
	local sysfs="/sys/bus/pci/devices/$dev"

	# skip bridges since we can't recover them (yet...)
	if [ -e "$sysfs/pci_bus" ] ; then
		log "$dev, Skipped: bridge"
		return 1
	fi

	# The ahci driver doesn't support error recovery. If the ahci device
	# happens to be hosting the root filesystem, and then we go and break
	# it the system will generally go down. We should probably fix that
	# at some point.
	if [ -e "$sysfs/driver" ] &&
	   [ "ahci" = "$(basename "$(realpath "$sysfs/driver")")" ] ; then
		log "$dev, Skipped: ahci doesn't support recovery"
		return 1
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		log "$dev, Skipped: Bad initial PE state"
		return 1
	fi

	return 0
}

# eeh_one_dev DEV
#
# Inject an EEH error into PCI device DEV and wait for its PE to become
# healthy again. The wait is bounded by $EEH_MAX_WAIT polls, one second
# apart (default 60).
#
# Returns: 0 if the device recovered, 1 if DEV is invalid or recovery
#          did not complete in time.
eeh_one_dev() {
	local dev="$1"
	local max_wait i

	# Using this function from the command line is sometimes useful for
	# testing so check that the argument is a well-formed sysfs device
	# name. An empty name must be rejected explicitly, otherwise the
	# path below would resolve to the devices directory itself.
	if [ -z "$dev" ] || [ ! -e "/sys/bus/pci/devices/$dev/" ] ; then
		log "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
		return 1
	fi

	# Break it
	echo "$dev" >/sys/kernel/debug/powerpc/eeh_dev_break

	# Force an EEH device check. If the kernel has already
	# noticed the EEH (due to a driver poll or whatever), this
	# is a no-op.
	echo "$dev" >/sys/kernel/debug/powerpc/eeh_dev_check

	# Default to a 60s timeout when waiting for a device to recover. This
	# is an arbitrary default which can be overridden by setting the
	# EEH_MAX_WAIT environmental variable when required.

	# The current record holder for longest recovery time is:
	#  "Adaptec Series 8 12G SAS/PCIe 3" at 39 seconds
	max_wait=${EEH_MAX_WAIT:=60}

	# $i counts the one second sleeps taken so far.
	i=0
	while [ "$i" -le "$max_wait" ] ; do
		if pe_ok "$dev" ; then
			break
		fi
		log "$dev, waited $i/${max_wait}"
		sleep 1
		i=$((i + 1))
	done

	if ! pe_ok "$dev" ; then
		log "$dev, Failed to recover!"
		return 1
	fi

	log "$dev, Recovered after $i seconds"
	return 0
}

# eeh_has_driver DEV
#
# Returns: 0 if PCI device DEV is bound to a driver, non-zero otherwise.
eeh_has_driver() {
	test -e "/sys/bus/pci/devices/$1/driver"
}

# eeh_can_recover DEV
#
# Ask the kernel whether the driver bound to DEV supports error recovery.
#
# Returns: 0 if the write to eeh_dev_can_recover succeeds, non-zero
#          otherwise.
eeh_can_recover() {
	# we'll get an IO error if the device's current driver doesn't support
	# error recovery
	echo "$1" > '/sys/kernel/debug/powerpc/eeh_dev_can_recover' 2>/dev/null
}

# eeh_find_all_pfs
#
# Find the SR-IOV physical functions that can be used for VF tests.
#
# stdout: space separated list of PF device names
# Returns: 0 if at least one PF was found, 1 otherwise.
eeh_find_all_pfs() {
	local devices="" is_pseries="" sysfs dev

	# SR-IOV on pseries requires hypervisor support, so check for that
	if grep -q pSeries /proc/cpuinfo ; then
		if [ ! -f /proc/device-tree/rtas/ibm,open-sriov-allow-unfreeze ] ||
		   [ ! -f /proc/device-tree/rtas/ibm,open-sriov-map-pe-number ] ; then
			return 1
		fi

		is_pseries="true"
	fi

	for sysfs in /sys/bus/pci/devices/* ; do
		# Only devices exposing sriov_numvfs are SR-IOV PFs. This also
		# discards the literal pattern when the glob matches nothing.
		if [ ! -e "$sysfs/sriov_numvfs" ] ; then
			continue
		fi
		dev="${sysfs##*/}"

		# skip unsupported PFs on pseries. The open-sriov properties
		# are only looked for when running on pseries; applying this
		# filter on other platforms would discard their PFs.
		if [ -n "$is_pseries" ] &&
		   [ ! -f "$sysfs/of_node/ibm,is-open-sriov-pf" ] &&
		   [ ! -f "$sysfs/of_node/ibm,open-sriov-vf-bar-info" ] ; then
			continue
		fi

		# no driver, no vfs
		if ! eeh_has_driver "$dev" ; then
			continue
		fi

		devices="$devices $dev"
	done

	if [ -z "$devices" ] ; then
		return 1
	fi

	# Deliberately unquoted: collapses the leading separator.
	echo $devices
	return 0
}

# eeh_enable_vfs
#
# attempts to enable one VF on each PF so we can do VF specific tests.
# stdout: list of enabled VFs, one per line
# return code: 0 if vfs are found, 1 otherwise
eeh_enable_vfs() {
	local pf_list pf_sysfs dev vf
	local vfs=0

	pf_list="$(eeh_find_all_pfs)"

	for dev in $pf_list ; do
		pf_sysfs="/sys/bus/pci/devices/$dev"

		# make sure we have a single VF
		echo 0 > "$pf_sysfs/sriov_numvfs"
		if ! echo 1 > "$pf_sysfs/sriov_numvfs" ; then
			log "Unable to enable VFs on $dev, skipping"
			continue
		fi

		# virtfn0 is a symlink to the newly created VF
		if [ ! -e "$pf_sysfs/virtfn0" ] ; then
			log "unable to find enabled vf on $dev"
			echo 0 > "$pf_sysfs/sriov_numvfs"
			continue
		fi
		vf="$(basename "$(realpath "$pf_sysfs/virtfn0")")"

		if ! eeh_can_break "$vf" ; then
			log "skipping $vf"

			echo 0 > "$pf_sysfs/sriov_numvfs"
			continue
		fi

		vfs="$((vfs + 1))"
		echo "$vf"
	done

	[ "$vfs" != 0 ]
}

# eeh_disable_vfs
#
# Remove all VFs from every PF reported by eeh_find_all_pfs.
#
# Returns: 0 if PFs were found, 1 if there were none.
eeh_disable_vfs() {
	local pf_list dev

	pf_list="$(eeh_find_all_pfs)"
	if [ -z "$pf_list" ] ; then
		return 1
	fi

	for dev in $pf_list ; do
		echo 0 > "/sys/bus/pci/devices/$dev/sriov_numvfs"
	done

	return 0
}