18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
28c2ecf20Sopenharmony_ci#include <string.h>
38c2ecf20Sopenharmony_ci#include "debug.h"
48c2ecf20Sopenharmony_ci
58c2ecf20Sopenharmony_ci#include "demangle-rust.h"
68c2ecf20Sopenharmony_ci
78c2ecf20Sopenharmony_ci/*
88c2ecf20Sopenharmony_ci * Mangled Rust symbols look like this:
98c2ecf20Sopenharmony_ci *
108c2ecf20Sopenharmony_ci *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
118c2ecf20Sopenharmony_ci *
128c2ecf20Sopenharmony_ci * The original symbol is:
138c2ecf20Sopenharmony_ci *
148c2ecf20Sopenharmony_ci *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
158c2ecf20Sopenharmony_ci *
168c2ecf20Sopenharmony_ci * The last component of the path is a 64-bit hash in lowercase hex, prefixed
178c2ecf20Sopenharmony_ci * with "h". Rust does not have a global namespace between crates, an illusion
188c2ecf20Sopenharmony_ci * which Rust maintains by using the hash to distinguish things that would
198c2ecf20Sopenharmony_ci * otherwise have the same symbol.
208c2ecf20Sopenharmony_ci *
218c2ecf20Sopenharmony_ci * Any path component not starting with a XID_Start character is prefixed with
228c2ecf20Sopenharmony_ci * "_".
238c2ecf20Sopenharmony_ci *
248c2ecf20Sopenharmony_ci * The following escape sequences are used:
258c2ecf20Sopenharmony_ci *
268c2ecf20Sopenharmony_ci *     ","  =>  $C$
278c2ecf20Sopenharmony_ci *     "@"  =>  $SP$
288c2ecf20Sopenharmony_ci *     "*"  =>  $BP$
298c2ecf20Sopenharmony_ci *     "&"  =>  $RF$
308c2ecf20Sopenharmony_ci *     "<"  =>  $LT$
318c2ecf20Sopenharmony_ci *     ">"  =>  $GT$
328c2ecf20Sopenharmony_ci *     "("  =>  $LP$
338c2ecf20Sopenharmony_ci *     ")"  =>  $RP$
348c2ecf20Sopenharmony_ci *     " "  =>  $u20$
358c2ecf20Sopenharmony_ci *     "'"  =>  $u27$
368c2ecf20Sopenharmony_ci *     "["  =>  $u5b$
378c2ecf20Sopenharmony_ci *     "]"  =>  $u5d$
388c2ecf20Sopenharmony_ci *     "~"  =>  $u7e$
398c2ecf20Sopenharmony_ci *
408c2ecf20Sopenharmony_ci * A double ".." means "::" and a single "." means "-".
418c2ecf20Sopenharmony_ci *
428c2ecf20Sopenharmony_ci * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
438c2ecf20Sopenharmony_ci */
448c2ecf20Sopenharmony_ci
458c2ecf20Sopenharmony_cistatic const char *hash_prefix = "::h";
468c2ecf20Sopenharmony_cistatic const size_t hash_prefix_len = 3;
478c2ecf20Sopenharmony_cistatic const size_t hash_len = 16;
488c2ecf20Sopenharmony_ci
498c2ecf20Sopenharmony_cistatic bool is_prefixed_hash(const char *start);
508c2ecf20Sopenharmony_cistatic bool looks_like_rust(const char *sym, size_t len);
518c2ecf20Sopenharmony_cistatic bool unescape(const char **in, char **out, const char *seq, char value);
528c2ecf20Sopenharmony_ci
538c2ecf20Sopenharmony_ci/*
548c2ecf20Sopenharmony_ci * INPUT:
558c2ecf20Sopenharmony_ci *     sym: symbol that has been through BFD-demangling
568c2ecf20Sopenharmony_ci *
578c2ecf20Sopenharmony_ci * This function looks for the following indicators:
588c2ecf20Sopenharmony_ci *
598c2ecf20Sopenharmony_ci *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
608c2ecf20Sopenharmony_ci *
618c2ecf20Sopenharmony_ci *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
628c2ecf20Sopenharmony_ci *     hex digits. This is true of 99.9998% of hashes so once in your life you
638c2ecf20Sopenharmony_ci *     may see a false negative. The point is to notice path components that
648c2ecf20Sopenharmony_ci *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
658c2ecf20Sopenharmony_ci *     this case a false positive (non-Rust symbol has an important path
668c2ecf20Sopenharmony_ci *     component removed because it looks like a Rust hash) is worse than a
678c2ecf20Sopenharmony_ci *     false negative (the rare Rust symbol is not demangled) so this sets the
688c2ecf20Sopenharmony_ci *     balance in favor of false negatives.
698c2ecf20Sopenharmony_ci *
708c2ecf20Sopenharmony_ci *  3. There must be no characters other than a-zA-Z0-9 and _.:$
718c2ecf20Sopenharmony_ci *
728c2ecf20Sopenharmony_ci *  4. There must be no unrecognized $-sign sequences.
738c2ecf20Sopenharmony_ci *
748c2ecf20Sopenharmony_ci *  5. There must be no sequence of three or more dots in a row ("...").
758c2ecf20Sopenharmony_ci */
768c2ecf20Sopenharmony_cibool
778c2ecf20Sopenharmony_cirust_is_mangled(const char *sym)
788c2ecf20Sopenharmony_ci{
798c2ecf20Sopenharmony_ci	size_t len, len_without_hash;
808c2ecf20Sopenharmony_ci
818c2ecf20Sopenharmony_ci	if (!sym)
828c2ecf20Sopenharmony_ci		return false;
838c2ecf20Sopenharmony_ci
848c2ecf20Sopenharmony_ci	len = strlen(sym);
858c2ecf20Sopenharmony_ci	if (len <= hash_prefix_len + hash_len)
868c2ecf20Sopenharmony_ci		/* Not long enough to contain "::h" + hash + something else */
878c2ecf20Sopenharmony_ci		return false;
888c2ecf20Sopenharmony_ci
898c2ecf20Sopenharmony_ci	len_without_hash = len - (hash_prefix_len + hash_len);
908c2ecf20Sopenharmony_ci	if (!is_prefixed_hash(sym + len_without_hash))
918c2ecf20Sopenharmony_ci		return false;
928c2ecf20Sopenharmony_ci
938c2ecf20Sopenharmony_ci	return looks_like_rust(sym, len_without_hash);
948c2ecf20Sopenharmony_ci}
958c2ecf20Sopenharmony_ci
968c2ecf20Sopenharmony_ci/*
978c2ecf20Sopenharmony_ci * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
988c2ecf20Sopenharmony_ci * digits must comprise between 5 and 15 (inclusive) distinct digits.
998c2ecf20Sopenharmony_ci */
1008c2ecf20Sopenharmony_cistatic bool is_prefixed_hash(const char *str)
1018c2ecf20Sopenharmony_ci{
1028c2ecf20Sopenharmony_ci	const char *end;
1038c2ecf20Sopenharmony_ci	bool seen[16];
1048c2ecf20Sopenharmony_ci	size_t i;
1058c2ecf20Sopenharmony_ci	int count;
1068c2ecf20Sopenharmony_ci
1078c2ecf20Sopenharmony_ci	if (strncmp(str, hash_prefix, hash_prefix_len))
1088c2ecf20Sopenharmony_ci		return false;
1098c2ecf20Sopenharmony_ci	str += hash_prefix_len;
1108c2ecf20Sopenharmony_ci
1118c2ecf20Sopenharmony_ci	memset(seen, false, sizeof(seen));
1128c2ecf20Sopenharmony_ci	for (end = str + hash_len; str < end; str++)
1138c2ecf20Sopenharmony_ci		if (*str >= '0' && *str <= '9')
1148c2ecf20Sopenharmony_ci			seen[*str - '0'] = true;
1158c2ecf20Sopenharmony_ci		else if (*str >= 'a' && *str <= 'f')
1168c2ecf20Sopenharmony_ci			seen[*str - 'a' + 10] = true;
1178c2ecf20Sopenharmony_ci		else
1188c2ecf20Sopenharmony_ci			return false;
1198c2ecf20Sopenharmony_ci
1208c2ecf20Sopenharmony_ci	/* Count how many distinct digits seen */
1218c2ecf20Sopenharmony_ci	count = 0;
1228c2ecf20Sopenharmony_ci	for (i = 0; i < 16; i++)
1238c2ecf20Sopenharmony_ci		if (seen[i])
1248c2ecf20Sopenharmony_ci			count++;
1258c2ecf20Sopenharmony_ci
1268c2ecf20Sopenharmony_ci	return count >= 5 && count <= 15;
1278c2ecf20Sopenharmony_ci}
1288c2ecf20Sopenharmony_ci
1298c2ecf20Sopenharmony_cistatic bool looks_like_rust(const char *str, size_t len)
1308c2ecf20Sopenharmony_ci{
1318c2ecf20Sopenharmony_ci	const char *end = str + len;
1328c2ecf20Sopenharmony_ci
1338c2ecf20Sopenharmony_ci	while (str < end)
1348c2ecf20Sopenharmony_ci		switch (*str) {
1358c2ecf20Sopenharmony_ci		case '$':
1368c2ecf20Sopenharmony_ci			if (!strncmp(str, "$C$", 3))
1378c2ecf20Sopenharmony_ci				str += 3;
1388c2ecf20Sopenharmony_ci			else if (!strncmp(str, "$SP$", 4)
1398c2ecf20Sopenharmony_ci					|| !strncmp(str, "$BP$", 4)
1408c2ecf20Sopenharmony_ci					|| !strncmp(str, "$RF$", 4)
1418c2ecf20Sopenharmony_ci					|| !strncmp(str, "$LT$", 4)
1428c2ecf20Sopenharmony_ci					|| !strncmp(str, "$GT$", 4)
1438c2ecf20Sopenharmony_ci					|| !strncmp(str, "$LP$", 4)
1448c2ecf20Sopenharmony_ci					|| !strncmp(str, "$RP$", 4))
1458c2ecf20Sopenharmony_ci				str += 4;
1468c2ecf20Sopenharmony_ci			else if (!strncmp(str, "$u20$", 5)
1478c2ecf20Sopenharmony_ci					|| !strncmp(str, "$u27$", 5)
1488c2ecf20Sopenharmony_ci					|| !strncmp(str, "$u5b$", 5)
1498c2ecf20Sopenharmony_ci					|| !strncmp(str, "$u5d$", 5)
1508c2ecf20Sopenharmony_ci					|| !strncmp(str, "$u7e$", 5))
1518c2ecf20Sopenharmony_ci				str += 5;
1528c2ecf20Sopenharmony_ci			else
1538c2ecf20Sopenharmony_ci				return false;
1548c2ecf20Sopenharmony_ci			break;
1558c2ecf20Sopenharmony_ci		case '.':
1568c2ecf20Sopenharmony_ci			/* Do not allow three or more consecutive dots */
1578c2ecf20Sopenharmony_ci			if (!strncmp(str, "...", 3))
1588c2ecf20Sopenharmony_ci				return false;
1598c2ecf20Sopenharmony_ci			/* Fall through */
1608c2ecf20Sopenharmony_ci		case 'a' ... 'z':
1618c2ecf20Sopenharmony_ci		case 'A' ... 'Z':
1628c2ecf20Sopenharmony_ci		case '0' ... '9':
1638c2ecf20Sopenharmony_ci		case '_':
1648c2ecf20Sopenharmony_ci		case ':':
1658c2ecf20Sopenharmony_ci			str++;
1668c2ecf20Sopenharmony_ci			break;
1678c2ecf20Sopenharmony_ci		default:
1688c2ecf20Sopenharmony_ci			return false;
1698c2ecf20Sopenharmony_ci		}
1708c2ecf20Sopenharmony_ci
1718c2ecf20Sopenharmony_ci	return true;
1728c2ecf20Sopenharmony_ci}
1738c2ecf20Sopenharmony_ci
1748c2ecf20Sopenharmony_ci/*
1758c2ecf20Sopenharmony_ci * INPUT:
1768c2ecf20Sopenharmony_ci *     sym: symbol for which rust_is_mangled(sym) returns true
1778c2ecf20Sopenharmony_ci *
1788c2ecf20Sopenharmony_ci * The input is demangled in-place because the mangled name is always longer
1798c2ecf20Sopenharmony_ci * than the demangled one.
1808c2ecf20Sopenharmony_ci */
1818c2ecf20Sopenharmony_civoid
1828c2ecf20Sopenharmony_cirust_demangle_sym(char *sym)
1838c2ecf20Sopenharmony_ci{
1848c2ecf20Sopenharmony_ci	const char *in;
1858c2ecf20Sopenharmony_ci	char *out;
1868c2ecf20Sopenharmony_ci	const char *end;
1878c2ecf20Sopenharmony_ci
1888c2ecf20Sopenharmony_ci	if (!sym)
1898c2ecf20Sopenharmony_ci		return;
1908c2ecf20Sopenharmony_ci
1918c2ecf20Sopenharmony_ci	in = sym;
1928c2ecf20Sopenharmony_ci	out = sym;
1938c2ecf20Sopenharmony_ci	end = sym + strlen(sym) - (hash_prefix_len + hash_len);
1948c2ecf20Sopenharmony_ci
1958c2ecf20Sopenharmony_ci	while (in < end)
1968c2ecf20Sopenharmony_ci		switch (*in) {
1978c2ecf20Sopenharmony_ci		case '$':
1988c2ecf20Sopenharmony_ci			if (!(unescape(&in, &out, "$C$", ',')
1998c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$SP$", '@')
2008c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$BP$", '*')
2018c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$RF$", '&')
2028c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$LT$", '<')
2038c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$GT$", '>')
2048c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$LP$", '(')
2058c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$RP$", ')')
2068c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$u20$", ' ')
2078c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$u27$", '\'')
2088c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$u5b$", '[')
2098c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$u5d$", ']')
2108c2ecf20Sopenharmony_ci					|| unescape(&in, &out, "$u7e$", '~'))) {
2118c2ecf20Sopenharmony_ci				pr_err("demangle-rust: unexpected escape sequence");
2128c2ecf20Sopenharmony_ci				goto done;
2138c2ecf20Sopenharmony_ci			}
2148c2ecf20Sopenharmony_ci			break;
2158c2ecf20Sopenharmony_ci		case '_':
2168c2ecf20Sopenharmony_ci			/*
2178c2ecf20Sopenharmony_ci			 * If this is the start of a path component and the next
2188c2ecf20Sopenharmony_ci			 * character is an escape sequence, ignore the
2198c2ecf20Sopenharmony_ci			 * underscore. The mangler inserts an underscore to make
2208c2ecf20Sopenharmony_ci			 * sure the path component begins with a XID_Start
2218c2ecf20Sopenharmony_ci			 * character.
2228c2ecf20Sopenharmony_ci			 */
2238c2ecf20Sopenharmony_ci			if ((in == sym || in[-1] == ':') && in[1] == '$')
2248c2ecf20Sopenharmony_ci				in++;
2258c2ecf20Sopenharmony_ci			else
2268c2ecf20Sopenharmony_ci				*out++ = *in++;
2278c2ecf20Sopenharmony_ci			break;
2288c2ecf20Sopenharmony_ci		case '.':
2298c2ecf20Sopenharmony_ci			if (in[1] == '.') {
2308c2ecf20Sopenharmony_ci				/* ".." becomes "::" */
2318c2ecf20Sopenharmony_ci				*out++ = ':';
2328c2ecf20Sopenharmony_ci				*out++ = ':';
2338c2ecf20Sopenharmony_ci				in += 2;
2348c2ecf20Sopenharmony_ci			} else {
2358c2ecf20Sopenharmony_ci				/* "." becomes "-" */
2368c2ecf20Sopenharmony_ci				*out++ = '-';
2378c2ecf20Sopenharmony_ci				in++;
2388c2ecf20Sopenharmony_ci			}
2398c2ecf20Sopenharmony_ci			break;
2408c2ecf20Sopenharmony_ci		case 'a' ... 'z':
2418c2ecf20Sopenharmony_ci		case 'A' ... 'Z':
2428c2ecf20Sopenharmony_ci		case '0' ... '9':
2438c2ecf20Sopenharmony_ci		case ':':
2448c2ecf20Sopenharmony_ci			*out++ = *in++;
2458c2ecf20Sopenharmony_ci			break;
2468c2ecf20Sopenharmony_ci		default:
2478c2ecf20Sopenharmony_ci			pr_err("demangle-rust: unexpected character '%c' in symbol\n",
2488c2ecf20Sopenharmony_ci				*in);
2498c2ecf20Sopenharmony_ci			goto done;
2508c2ecf20Sopenharmony_ci		}
2518c2ecf20Sopenharmony_ci
2528c2ecf20Sopenharmony_cidone:
2538c2ecf20Sopenharmony_ci	*out = '\0';
2548c2ecf20Sopenharmony_ci}
2558c2ecf20Sopenharmony_ci
2568c2ecf20Sopenharmony_cistatic bool unescape(const char **in, char **out, const char *seq, char value)
2578c2ecf20Sopenharmony_ci{
2588c2ecf20Sopenharmony_ci	size_t len = strlen(seq);
2598c2ecf20Sopenharmony_ci
2608c2ecf20Sopenharmony_ci	if (strncmp(*in, seq, len))
2618c2ecf20Sopenharmony_ci		return false;
2628c2ecf20Sopenharmony_ci
2638c2ecf20Sopenharmony_ci	**out = value;
2648c2ecf20Sopenharmony_ci
2658c2ecf20Sopenharmony_ci	*in += len;
2668c2ecf20Sopenharmony_ci	*out += 1;
2678c2ecf20Sopenharmony_ci
2688c2ecf20Sopenharmony_ci	return true;
2698c2ecf20Sopenharmony_ci}
270