#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
362306a36Sopenharmony_ci
# Exit status used by this file when a test has to be skipped. It is
# exported so that child processes can see it as well.
KSELFTESTS_SKIP=4
export KSELFTESTS_SKIP
562306a36Sopenharmony_ci
# Print a diagnostic message on stderr.
# Arguments: the words of the message; they are joined with the first
#            character of IFS (a space by default).
# Outputs:   the message followed by a newline, on stderr
log() {
	# Write to the already-open stderr descriptor instead of re-opening
	# /dev/stderr: re-opening truncates stderr when it has been redirected
	# to a regular file, and fails where /dev/stderr is unavailable.
	# Quoting "$*" keeps glob characters and repeated blanks in the
	# message intact.
	printf '%s\n' "$*" >&2
}
962306a36Sopenharmony_ci
# Check whether the EEH PE of a PCI device is currently healthy.
# Arguments: $1 - sysfs PCI device name
# Returns:   0 when the PE state could be read and looks functional,
#            1 when it could not be read or indicates an error/recovery
pe_ok() {
	local dev="$1"
	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"

	# Drivers without error handling callbacks are recovered by removing
	# and re-probing the device, which makes the sysfs directory go away.
	# Read the state exactly once and throw away any error output.
	local eeh_state="$(cat $path 2>/dev/null)"
	[ -n "$eeh_state" ] || return 1

	# Collapse the whitespace, then peel off the first word (treated as
	# the firmware state) and the second word (the software state).
	local flat="$(echo $eeh_state)"
	local fw_state="${flat%% *}"
	local rest="${flat#* }"
	local sw_state="${rest%% *}"

	# Mask 0x3 covers EEH_PE_ISOLATED and EEH_PE_RECOVERING. With either
	# one set the PE is in an error state or being recovered: not ok.
	[ "$((sw_state & 0x3))" -eq 0 ] || return 1

	# A working PE has both EEH_STATE_MMIO_ACTIVE and EEH_STATE_DMA_ACTIVE
	# (mask 0x18) set. The platform backends also set these while the PE
	# is in reset, but the RECOVERING test above should filter out those
	# false positives.
	[ "$((fw_state & 0x18))" -eq "$((0x18))" ] || return 1

	return 0
}
4262306a36Sopenharmony_ci
# Report whether the EEH subsystem is enabled.
# Returns: 0 when /proc/powerpc/eeh exists and contains the "enabled"
#          banner, non-zero otherwise
eeh_supported() {
	if [ ! -e /proc/powerpc/eeh ] ; then
		return 1
	fi

	grep -q 'EEH Subsystem is enabled' /proc/powerpc/eeh
}
4762306a36Sopenharmony_ci
# Verify that EEH error injection can be tested here and prepare for it.
# Exits the calling script with $KSELFTESTS_SKIP (4 if that is unset) when
# EEH is not enabled or the debugfs test files are not available.
# Globals: KSELFTESTS_SKIP (read)
# Outputs: a skip reason on stdout or stderr before exiting
eeh_test_prep() {
	if ! eeh_supported ; then
		echo "EEH not supported on this system, skipping"
		exit "${KSELFTESTS_SKIP:-4}"
	fi

	# Breaking a device writes to both eeh_dev_break and eeh_dev_check,
	# so skip when either one of them is missing, not only when both are.
	if [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_check" ] || \
	   [ ! -e "/sys/kernel/debug/powerpc/eeh_dev_break" ] ; then
		log "debugfs EEH testing files are missing. Is debugfs mounted?"
		exit "${KSELFTESTS_SKIP:-4}"
	fi

	# Bump the max freeze count to something absurd so we don't
	# trip over it while breaking things.
	echo 5000 > /sys/kernel/debug/powerpc/eeh_max_freezes
}
6462306a36Sopenharmony_ci
# Decide whether it is safe to inject an EEH error into a device.
# Arguments: $1 - sysfs PCI device name. When omitted, the caller's $dev is
#                 used, for callers that set $dev instead of passing it.
# Outputs:   the reason for skipping a device, via log
# Returns:   0 when the device may be broken, 1 when it must be skipped
eeh_can_break() {
	# Take the device from the argument so that the device named by the
	# caller is the one examined, even if the caller's $dev holds another.
	local dev="${1:-$dev}"
	local driver_link="/sys/bus/pci/devices/$dev/driver"

	# skip bridges since we can't recover them (yet...)
	if [ -e "/sys/bus/pci/devices/$dev/pci_bus" ] ; then
		log "$dev, Skipped: bridge"
		return 1
	fi

	# The ahci driver doesn't support error recovery. If the ahci device
	# happens to be hosting the root filesystem, and then we go and break
	# it the system will generally go down. We should probably fix that
	# at some point.
	# Only resolve the link when a driver is bound, so basename is never
	# run without an operand.
	if [ -e "$driver_link" ] &&
	   [ "ahci" = "$(basename "$(realpath "$driver_link")")" ] ; then
		log "$dev, Skipped: ahci doesn't support recovery"
		return 1
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		log "$dev, Skipped: Bad initial PE state"
		return 1
	fi

	return 0
}
9362306a36Sopenharmony_ci
# Inject an EEH error into one device and wait for it to recover.
# Globals:   EEH_MAX_WAIT (read; set to 60 when unset or empty)
# Arguments: $1 - sysfs PCI device name (DDDD:BB:DD.F)
# Outputs:   progress and result messages, via log
# Returns:   0 when the PE is ok again within the timeout, 1 otherwise
eeh_one_dev() {
	local dev="$1"
	# Loop state is kept local so that sourcing scripts are not affected.
	local max_wait i

	# Using this function from the command line is sometimes useful for
	# testing so check that the argument is a well-formed sysfs device
	# name. An empty name, "." / ".." or anything containing a slash would
	# still resolve to an existing directory, so reject those explicitly.
	case "$dev" in
	''|.|..|*/*)
		log "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
		return 1
		;;
	esac
	if ! test -e "/sys/bus/pci/devices/$dev/" ; then
		log "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
		return 1
	fi

	# Break it
	echo "$dev" >/sys/kernel/debug/powerpc/eeh_dev_break

	# Force an EEH device check. If the kernel has already
	# noticed the EEH (due to a driver poll or whatever), this
	# is a no-op.
	echo "$dev" >/sys/kernel/debug/powerpc/eeh_dev_check

	# Default to a 60s timeout when waiting for a device to recover. This
	# is an arbitrary default which can be overridden by setting the
	# EEH_MAX_WAIT environmental variable when required.

	# The current record holder for longest recovery time is:
	#  "Adaptec Series 8 12G SAS/PCIe 3" at 39 seconds
	max_wait=${EEH_MAX_WAIT:=60}

	for i in $(seq 0 "${max_wait}") ; do
		if pe_ok "$dev" ; then
			break
		fi
		log "$dev, waited $i/${max_wait}"
		sleep 1
	done

	# Check once more: the loop may have ended on the timeout.
	if ! pe_ok "$dev" ; then
		log "$dev, Failed to recover!"
		return 1
	fi

	log "$dev, Recovered after $i seconds"
	return 0
}
13762306a36Sopenharmony_ci
# Check whether a PCI device has a "driver" entry in sysfs.
# Arguments: $1 - sysfs PCI device name
# Returns:   0 when /sys/bus/pci/devices/$1/driver exists, 1 otherwise
eeh_has_driver() {
	# The path is quoted so that an odd argument cannot be split into
	# several words or glob-expanded before the test sees it.
	[ -e "/sys/bus/pci/devices/$1/driver" ]
}
14262306a36Sopenharmony_ci
# Ask the kernel whether a device's current driver supports EEH recovery.
# Arguments: $1 - sysfs PCI device name
# Returns:   0 when the write to eeh_dev_can_recover succeeds, non-zero
#            otherwise
eeh_can_recover() {
	# we'll get an IO error if the device's current driver doesn't support
	# error recovery; that write error is deliberately discarded and only
	# the exit status is reported. The argument is quoted so it is written
	# as given, without word splitting or glob expansion.
	echo "$1" > '/sys/kernel/debug/powerpc/eeh_dev_can_recover' 2>/dev/null
}
15062306a36Sopenharmony_ci
# List the SR-IOV physical functions that can be used for VF tests.
# Outputs: the matching sysfs device names on one line, space separated
# Returns: 0 when at least one PF was found, 1 otherwise (including a
#          pseries system without the required RTAS properties)
eeh_find_all_pfs() {
	# Everything is local so that callers' variables (for example a loop
	# variable named "dev") are left alone.
	local devices="" is_pseries="" sysfs dev

	# SR-IOV on pseries requires hypervisor support, so check for that
	if grep -q pSeries /proc/cpuinfo ; then
		if [ ! -f /proc/device-tree/rtas/ibm,open-sriov-allow-unfreeze ] ||
		   [ ! -f /proc/device-tree/rtas/ibm,open-sriov-map-pe-number ] ; then
			return 1
		fi

		is_pseries="true"
	fi

	# Let the shell expand the directory instead of parsing ls output. If
	# nothing matches, the literal pattern fails the first test below.
	for sysfs in /sys/bus/pci/devices/* ; do
		if [ ! -e "$sysfs/sriov_numvfs" ] ; then
			continue
		fi
		dev="${sysfs##*/}"

		# skip unsupported PFs on pseries. This filter only applies
		# when running on pseries, hence -n rather than -z.
		if [ -n "$is_pseries" ] &&
		   [ ! -f "$sysfs/of_node/ibm,is-open-sriov-pf" ] &&
		   [ ! -f "$sysfs/of_node/ibm,open-sriov-vf-bar-info" ] ; then
			continue
		fi

		# no driver, no vfs
		if ! eeh_has_driver "$dev" ; then
			continue
		fi

		devices="${devices:+$devices }$dev"
	done

	if [ -z "$devices" ] ; then
		return 1
	fi

	echo "$devices"
	return 0
}
19362306a36Sopenharmony_ci
# Attempts to enable one VF on each PF so we can do VF specific tests.
# PFs on which a VF cannot be enabled, found or safely broken are reset to
# zero VFs (where possible) and skipped.
# stdout: list of enabled VFs, one per line
# stderr: the reason for every skipped PF, via log
# return code: 0 if vfs are found, 1 otherwise
eeh_enable_vfs() {
	# All working variables are local so a direct (non-subshell) call
	# does not disturb the caller's variables.
	local pf_list pf pf_sysfs vf dev
	local vfs=0

	pf_list="$(eeh_find_all_pfs)"

	for pf in $pf_list ; do
		pf_sysfs="/sys/bus/pci/devices/$pf"

		# make sure we have a single VF
		echo 0 > "$pf_sysfs/sriov_numvfs"
		if ! echo 1 > "$pf_sysfs/sriov_numvfs" ; then
			log "Unable to enable VFs on $pf, skipping"
			continue
		fi

		# Test for the VF link itself; the exit status of a
		# basename/realpath pipeline says nothing about its existence.
		if [ ! -e "$pf_sysfs/virtfn0" ] ; then
			log "unable to find enabled vf on $pf"
			echo 0 > "$pf_sysfs/sriov_numvfs"
			continue
		fi
		vf="$(basename "$(realpath "$pf_sysfs/virtfn0")")"

		# eeh_can_break may take the device from $dev rather than from
		# its argument, so provide the VF both ways.
		dev="$vf"
		if ! eeh_can_break "$vf" ; then
			log "skipping $vf"

			echo 0 > "$pf_sysfs/sriov_numvfs"
			continue
		fi

		vfs="$((vfs + 1))"
		echo "$vf"
	done

	test "$vfs" != 0
}
23362306a36Sopenharmony_ci
# Disable the VFs on every PF reported by eeh_find_all_pfs.
# Returns: 1 when no PF was found, 0 otherwise (failed writes to
#          sriov_numvfs are not reflected in the return code)
eeh_disable_vfs() {
	# Local so that a caller's own "dev" / "pf_list" are not overwritten
	# when this is invoked directly rather than in a subshell.
	local pf_list dev

	pf_list="$(eeh_find_all_pfs)"
	if [ -z "$pf_list" ] ; then
		return 1
	fi

	for dev in $pf_list ; do
		echo 0 > "/sys/bus/pci/devices/$dev/sriov_numvfs"
	done

	return 0
}
246