162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0
262306a36Sopenharmony_ci#include <string.h>
362306a36Sopenharmony_ci#include "debug.h"
462306a36Sopenharmony_ci
562306a36Sopenharmony_ci#include "demangle-rust.h"
662306a36Sopenharmony_ci
762306a36Sopenharmony_ci/*
862306a36Sopenharmony_ci * Mangled Rust symbols look like this:
962306a36Sopenharmony_ci *
1062306a36Sopenharmony_ci *     _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a
1162306a36Sopenharmony_ci *
1262306a36Sopenharmony_ci * The original symbol is:
1362306a36Sopenharmony_ci *
1462306a36Sopenharmony_ci *     <std::sys::fd::FileDesc as core::ops::Drop>::drop
1562306a36Sopenharmony_ci *
1662306a36Sopenharmony_ci * The last component of the path is a 64-bit hash in lowercase hex, prefixed
1762306a36Sopenharmony_ci * with "h". Rust does not have a global namespace between crates, an illusion
1862306a36Sopenharmony_ci * which Rust maintains by using the hash to distinguish things that would
1962306a36Sopenharmony_ci * otherwise have the same symbol.
2062306a36Sopenharmony_ci *
2162306a36Sopenharmony_ci * Any path component not starting with a XID_Start character is prefixed with
2262306a36Sopenharmony_ci * "_".
2362306a36Sopenharmony_ci *
2462306a36Sopenharmony_ci * The following escape sequences are used:
2562306a36Sopenharmony_ci *
2662306a36Sopenharmony_ci *     ","  =>  $C$
2762306a36Sopenharmony_ci *     "@"  =>  $SP$
2862306a36Sopenharmony_ci *     "*"  =>  $BP$
2962306a36Sopenharmony_ci *     "&"  =>  $RF$
3062306a36Sopenharmony_ci *     "<"  =>  $LT$
3162306a36Sopenharmony_ci *     ">"  =>  $GT$
3262306a36Sopenharmony_ci *     "("  =>  $LP$
3362306a36Sopenharmony_ci *     ")"  =>  $RP$
3462306a36Sopenharmony_ci *     " "  =>  $u20$
3562306a36Sopenharmony_ci *     "'"  =>  $u27$
3662306a36Sopenharmony_ci *     "["  =>  $u5b$
3762306a36Sopenharmony_ci *     "]"  =>  $u5d$
3862306a36Sopenharmony_ci *     "~"  =>  $u7e$
3962306a36Sopenharmony_ci *
4062306a36Sopenharmony_ci * A double ".." means "::" and a single "." means "-".
4162306a36Sopenharmony_ci *
4262306a36Sopenharmony_ci * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$
4362306a36Sopenharmony_ci */
4462306a36Sopenharmony_ci
/*
 * Layout of the hash suffix that ends every mangled symbol handled here:
 * the literal prefix "::h" followed by hash_len lowercase hex digits.
 *
 * hash_prefix is an array rather than a pointer so that its length can be
 * derived from the literal itself; the two can then never drift apart.
 */
static const char hash_prefix[] = "::h";
static const size_t hash_prefix_len = sizeof(hash_prefix) - 1;
static const size_t hash_len = 16;

/* Helpers defined further down; parameter names match the definitions. */
static bool is_prefixed_hash(const char *str);
static bool looks_like_rust(const char *str, size_t len);
static bool unescape(const char **in, char **out, const char *seq, char value);
5262306a36Sopenharmony_ci
5362306a36Sopenharmony_ci/*
5462306a36Sopenharmony_ci * INPUT:
5562306a36Sopenharmony_ci *     sym: symbol that has been through BFD-demangling
5662306a36Sopenharmony_ci *
5762306a36Sopenharmony_ci * This function looks for the following indicators:
5862306a36Sopenharmony_ci *
5962306a36Sopenharmony_ci *  1. The hash must consist of "h" followed by 16 lowercase hex digits.
6062306a36Sopenharmony_ci *
6162306a36Sopenharmony_ci *  2. As a sanity check, the hash must use between 5 and 15 of the 16 possible
6262306a36Sopenharmony_ci *     hex digits. This is true of 99.9998% of hashes so once in your life you
6362306a36Sopenharmony_ci *     may see a false negative. The point is to notice path components that
6462306a36Sopenharmony_ci *     could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In
6562306a36Sopenharmony_ci *     this case a false positive (non-Rust symbol has an important path
6662306a36Sopenharmony_ci *     component removed because it looks like a Rust hash) is worse than a
6762306a36Sopenharmony_ci *     false negative (the rare Rust symbol is not demangled) so this sets the
6862306a36Sopenharmony_ci *     balance in favor of false negatives.
6962306a36Sopenharmony_ci *
7062306a36Sopenharmony_ci *  3. There must be no characters other than a-zA-Z0-9 and _.:$
7162306a36Sopenharmony_ci *
7262306a36Sopenharmony_ci *  4. There must be no unrecognized $-sign sequences.
7362306a36Sopenharmony_ci *
7462306a36Sopenharmony_ci *  5. There must be no sequence of three or more dots in a row ("...").
7562306a36Sopenharmony_ci */
7662306a36Sopenharmony_cibool
7762306a36Sopenharmony_cirust_is_mangled(const char *sym)
7862306a36Sopenharmony_ci{
7962306a36Sopenharmony_ci	size_t len, len_without_hash;
8062306a36Sopenharmony_ci
8162306a36Sopenharmony_ci	if (!sym)
8262306a36Sopenharmony_ci		return false;
8362306a36Sopenharmony_ci
8462306a36Sopenharmony_ci	len = strlen(sym);
8562306a36Sopenharmony_ci	if (len <= hash_prefix_len + hash_len)
8662306a36Sopenharmony_ci		/* Not long enough to contain "::h" + hash + something else */
8762306a36Sopenharmony_ci		return false;
8862306a36Sopenharmony_ci
8962306a36Sopenharmony_ci	len_without_hash = len - (hash_prefix_len + hash_len);
9062306a36Sopenharmony_ci	if (!is_prefixed_hash(sym + len_without_hash))
9162306a36Sopenharmony_ci		return false;
9262306a36Sopenharmony_ci
9362306a36Sopenharmony_ci	return looks_like_rust(sym, len_without_hash);
9462306a36Sopenharmony_ci}
9562306a36Sopenharmony_ci
9662306a36Sopenharmony_ci/*
9762306a36Sopenharmony_ci * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex
9862306a36Sopenharmony_ci * digits must comprise between 5 and 15 (inclusive) distinct digits.
9962306a36Sopenharmony_ci */
10062306a36Sopenharmony_cistatic bool is_prefixed_hash(const char *str)
10162306a36Sopenharmony_ci{
10262306a36Sopenharmony_ci	const char *end;
10362306a36Sopenharmony_ci	bool seen[16];
10462306a36Sopenharmony_ci	size_t i;
10562306a36Sopenharmony_ci	int count;
10662306a36Sopenharmony_ci
10762306a36Sopenharmony_ci	if (strncmp(str, hash_prefix, hash_prefix_len))
10862306a36Sopenharmony_ci		return false;
10962306a36Sopenharmony_ci	str += hash_prefix_len;
11062306a36Sopenharmony_ci
11162306a36Sopenharmony_ci	memset(seen, false, sizeof(seen));
11262306a36Sopenharmony_ci	for (end = str + hash_len; str < end; str++)
11362306a36Sopenharmony_ci		if (*str >= '0' && *str <= '9')
11462306a36Sopenharmony_ci			seen[*str - '0'] = true;
11562306a36Sopenharmony_ci		else if (*str >= 'a' && *str <= 'f')
11662306a36Sopenharmony_ci			seen[*str - 'a' + 10] = true;
11762306a36Sopenharmony_ci		else
11862306a36Sopenharmony_ci			return false;
11962306a36Sopenharmony_ci
12062306a36Sopenharmony_ci	/* Count how many distinct digits seen */
12162306a36Sopenharmony_ci	count = 0;
12262306a36Sopenharmony_ci	for (i = 0; i < 16; i++)
12362306a36Sopenharmony_ci		if (seen[i])
12462306a36Sopenharmony_ci			count++;
12562306a36Sopenharmony_ci
12662306a36Sopenharmony_ci	return count >= 5 && count <= 15;
12762306a36Sopenharmony_ci}
12862306a36Sopenharmony_ci
/*
 * Scan the first len bytes of str and report whether they could be the path
 * part of a mangled Rust symbol: only a-zA-Z0-9 and _.:$ may appear, every
 * '$' must open one of the known escape sequences, and no three dots may
 * appear in a row.
 */
static bool looks_like_rust(const char *str, size_t len)
{
	/* No entry is a prefix of another, so the lookup order is irrelevant */
	static const char * const escapes[] = {
		"$C$",
		"$SP$", "$BP$", "$RF$", "$LT$", "$GT$", "$LP$", "$RP$",
		"$u20$", "$u27$", "$u5b$", "$u5d$", "$u7e$",
	};
	const size_t nr_escapes = sizeof(escapes) / sizeof(escapes[0]);
	const char *cur = str;
	const char *limit = str + len;

	while (cur < limit) {
		const char c = *cur;
		size_t esc_len = 0;
		size_t i;

		if (c == '$') {
			for (i = 0; i < nr_escapes; i++) {
				esc_len = strlen(escapes[i]);
				if (strncmp(cur, escapes[i], esc_len) == 0)
					break;
			}
			if (i == nr_escapes)
				return false;

			/* Step over the whole escape sequence */
			cur += esc_len;
			continue;
		}

		if (c == '.') {
			/* Do not allow three or more consecutive dots */
			if (cur[1] == '.' && cur[2] == '.')
				return false;
		} else if (!((c >= 'a' && c <= 'z') ||
			     (c >= 'A' && c <= 'Z') ||
			     (c >= '0' && c <= '9') ||
			     c == '_' || c == ':')) {
			return false;
		}

		cur++;
	}

	return true;
}
17362306a36Sopenharmony_ci
17462306a36Sopenharmony_ci/*
17562306a36Sopenharmony_ci * INPUT:
17662306a36Sopenharmony_ci *     sym: symbol for which rust_is_mangled(sym) returns true
17762306a36Sopenharmony_ci *
17862306a36Sopenharmony_ci * The input is demangled in-place because the mangled name is always longer
17962306a36Sopenharmony_ci * than the demangled one.
18062306a36Sopenharmony_ci */
18162306a36Sopenharmony_civoid
18262306a36Sopenharmony_cirust_demangle_sym(char *sym)
18362306a36Sopenharmony_ci{
18462306a36Sopenharmony_ci	const char *in;
18562306a36Sopenharmony_ci	char *out;
18662306a36Sopenharmony_ci	const char *end;
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ci	if (!sym)
18962306a36Sopenharmony_ci		return;
19062306a36Sopenharmony_ci
19162306a36Sopenharmony_ci	in = sym;
19262306a36Sopenharmony_ci	out = sym;
19362306a36Sopenharmony_ci	end = sym + strlen(sym) - (hash_prefix_len + hash_len);
19462306a36Sopenharmony_ci
19562306a36Sopenharmony_ci	while (in < end)
19662306a36Sopenharmony_ci		switch (*in) {
19762306a36Sopenharmony_ci		case '$':
19862306a36Sopenharmony_ci			if (!(unescape(&in, &out, "$C$", ',')
19962306a36Sopenharmony_ci					|| unescape(&in, &out, "$SP$", '@')
20062306a36Sopenharmony_ci					|| unescape(&in, &out, "$BP$", '*')
20162306a36Sopenharmony_ci					|| unescape(&in, &out, "$RF$", '&')
20262306a36Sopenharmony_ci					|| unescape(&in, &out, "$LT$", '<')
20362306a36Sopenharmony_ci					|| unescape(&in, &out, "$GT$", '>')
20462306a36Sopenharmony_ci					|| unescape(&in, &out, "$LP$", '(')
20562306a36Sopenharmony_ci					|| unescape(&in, &out, "$RP$", ')')
20662306a36Sopenharmony_ci					|| unescape(&in, &out, "$u20$", ' ')
20762306a36Sopenharmony_ci					|| unescape(&in, &out, "$u27$", '\'')
20862306a36Sopenharmony_ci					|| unescape(&in, &out, "$u5b$", '[')
20962306a36Sopenharmony_ci					|| unescape(&in, &out, "$u5d$", ']')
21062306a36Sopenharmony_ci					|| unescape(&in, &out, "$u7e$", '~'))) {
21162306a36Sopenharmony_ci				pr_err("demangle-rust: unexpected escape sequence");
21262306a36Sopenharmony_ci				goto done;
21362306a36Sopenharmony_ci			}
21462306a36Sopenharmony_ci			break;
21562306a36Sopenharmony_ci		case '_':
21662306a36Sopenharmony_ci			/*
21762306a36Sopenharmony_ci			 * If this is the start of a path component and the next
21862306a36Sopenharmony_ci			 * character is an escape sequence, ignore the
21962306a36Sopenharmony_ci			 * underscore. The mangler inserts an underscore to make
22062306a36Sopenharmony_ci			 * sure the path component begins with a XID_Start
22162306a36Sopenharmony_ci			 * character.
22262306a36Sopenharmony_ci			 */
22362306a36Sopenharmony_ci			if ((in == sym || in[-1] == ':') && in[1] == '$')
22462306a36Sopenharmony_ci				in++;
22562306a36Sopenharmony_ci			else
22662306a36Sopenharmony_ci				*out++ = *in++;
22762306a36Sopenharmony_ci			break;
22862306a36Sopenharmony_ci		case '.':
22962306a36Sopenharmony_ci			if (in[1] == '.') {
23062306a36Sopenharmony_ci				/* ".." becomes "::" */
23162306a36Sopenharmony_ci				*out++ = ':';
23262306a36Sopenharmony_ci				*out++ = ':';
23362306a36Sopenharmony_ci				in += 2;
23462306a36Sopenharmony_ci			} else {
23562306a36Sopenharmony_ci				/* "." becomes "-" */
23662306a36Sopenharmony_ci				*out++ = '-';
23762306a36Sopenharmony_ci				in++;
23862306a36Sopenharmony_ci			}
23962306a36Sopenharmony_ci			break;
24062306a36Sopenharmony_ci		case 'a' ... 'z':
24162306a36Sopenharmony_ci		case 'A' ... 'Z':
24262306a36Sopenharmony_ci		case '0' ... '9':
24362306a36Sopenharmony_ci		case ':':
24462306a36Sopenharmony_ci			*out++ = *in++;
24562306a36Sopenharmony_ci			break;
24662306a36Sopenharmony_ci		default:
24762306a36Sopenharmony_ci			pr_err("demangle-rust: unexpected character '%c' in symbol\n",
24862306a36Sopenharmony_ci				*in);
24962306a36Sopenharmony_ci			goto done;
25062306a36Sopenharmony_ci		}
25162306a36Sopenharmony_ci
25262306a36Sopenharmony_cidone:
25362306a36Sopenharmony_ci	*out = '\0';
25462306a36Sopenharmony_ci}
25562306a36Sopenharmony_ci
/*
 * Try to consume the escape sequence seq at the read cursor *in.
 *
 * On a match, value is stored at the write cursor *out, *in is advanced past
 * the sequence, *out is advanced by one character, and true is returned.
 * Otherwise neither cursor moves, nothing is written and false is returned.
 */
static bool unescape(const char **in, char **out, const char *seq, char value)
{
	const size_t seq_len = strlen(seq);
	const bool matched = strncmp(*in, seq, seq_len) == 0;

	if (matched) {
		*(*out)++ = value;
		*in += seq_len;
	}

	return matched;
}
270