162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 262306a36Sopenharmony_ci#include <string.h> 362306a36Sopenharmony_ci#include "debug.h" 462306a36Sopenharmony_ci 562306a36Sopenharmony_ci#include "demangle-rust.h" 662306a36Sopenharmony_ci 762306a36Sopenharmony_ci/* 862306a36Sopenharmony_ci * Mangled Rust symbols look like this: 962306a36Sopenharmony_ci * 1062306a36Sopenharmony_ci * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a 1162306a36Sopenharmony_ci * 1262306a36Sopenharmony_ci * The original symbol is: 1362306a36Sopenharmony_ci * 1462306a36Sopenharmony_ci * <std::sys::fd::FileDesc as core::ops::Drop>::drop 1562306a36Sopenharmony_ci * 1662306a36Sopenharmony_ci * The last component of the path is a 64-bit hash in lowercase hex, prefixed 1762306a36Sopenharmony_ci * with "h". Rust does not have a global namespace between crates, an illusion 1862306a36Sopenharmony_ci * which Rust maintains by using the hash to distinguish things that would 1962306a36Sopenharmony_ci * otherwise have the same symbol. 2062306a36Sopenharmony_ci * 2162306a36Sopenharmony_ci * Any path component not starting with a XID_Start character is prefixed with 2262306a36Sopenharmony_ci * "_". 2362306a36Sopenharmony_ci * 2462306a36Sopenharmony_ci * The following escape sequences are used: 2562306a36Sopenharmony_ci * 2662306a36Sopenharmony_ci * "," => $C$ 2762306a36Sopenharmony_ci * "@" => $SP$ 2862306a36Sopenharmony_ci * "*" => $BP$ 2962306a36Sopenharmony_ci * "&" => $RF$ 3062306a36Sopenharmony_ci * "<" => $LT$ 3162306a36Sopenharmony_ci * ">" => $GT$ 3262306a36Sopenharmony_ci * "(" => $LP$ 3362306a36Sopenharmony_ci * ")" => $RP$ 3462306a36Sopenharmony_ci * " " => $u20$ 3562306a36Sopenharmony_ci * "'" => $u27$ 3662306a36Sopenharmony_ci * "[" => $u5b$ 3762306a36Sopenharmony_ci * "]" => $u5d$ 3862306a36Sopenharmony_ci * "~" => $u7e$ 3962306a36Sopenharmony_ci * 4062306a36Sopenharmony_ci * A double ".." means "::" and a single "." means "-". 4162306a36Sopenharmony_ci * 4262306a36Sopenharmony_ci * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ 4362306a36Sopenharmony_ci */ 4462306a36Sopenharmony_ci 4562306a36Sopenharmony_cistatic const char *hash_prefix = "::h"; 4662306a36Sopenharmony_cistatic const size_t hash_prefix_len = 3; 4762306a36Sopenharmony_cistatic const size_t hash_len = 16; 4862306a36Sopenharmony_ci 4962306a36Sopenharmony_cistatic bool is_prefixed_hash(const char *start); 5062306a36Sopenharmony_cistatic bool looks_like_rust(const char *sym, size_t len); 5162306a36Sopenharmony_cistatic bool unescape(const char **in, char **out, const char *seq, char value); 5262306a36Sopenharmony_ci 5362306a36Sopenharmony_ci/* 5462306a36Sopenharmony_ci * INPUT: 5562306a36Sopenharmony_ci * sym: symbol that has been through BFD-demangling 5662306a36Sopenharmony_ci * 5762306a36Sopenharmony_ci * This function looks for the following indicators: 5862306a36Sopenharmony_ci * 5962306a36Sopenharmony_ci * 1. The hash must consist of "h" followed by 16 lowercase hex digits. 6062306a36Sopenharmony_ci * 6162306a36Sopenharmony_ci * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible 6262306a36Sopenharmony_ci * hex digits. This is true of 99.9998% of hashes so once in your life you 6362306a36Sopenharmony_ci * may see a false negative. The point is to notice path components that 6462306a36Sopenharmony_ci * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In 6562306a36Sopenharmony_ci * this case a false positive (non-Rust symbol has an important path 6662306a36Sopenharmony_ci * component removed because it looks like a Rust hash) is worse than a 6762306a36Sopenharmony_ci * false negative (the rare Rust symbol is not demangled) so this sets the 6862306a36Sopenharmony_ci * balance in favor of false negatives. 6962306a36Sopenharmony_ci * 7062306a36Sopenharmony_ci * 3. There must be no characters other than a-zA-Z0-9 and _.:$ 7162306a36Sopenharmony_ci * 7262306a36Sopenharmony_ci * 4. There must be no unrecognized $-sign sequences. 7362306a36Sopenharmony_ci * 7462306a36Sopenharmony_ci * 5. There must be no sequence of three or more dots in a row ("..."). 7562306a36Sopenharmony_ci */ 7662306a36Sopenharmony_cibool 7762306a36Sopenharmony_cirust_is_mangled(const char *sym) 7862306a36Sopenharmony_ci{ 7962306a36Sopenharmony_ci size_t len, len_without_hash; 8062306a36Sopenharmony_ci 8162306a36Sopenharmony_ci if (!sym) 8262306a36Sopenharmony_ci return false; 8362306a36Sopenharmony_ci 8462306a36Sopenharmony_ci len = strlen(sym); 8562306a36Sopenharmony_ci if (len <= hash_prefix_len + hash_len) 8662306a36Sopenharmony_ci /* Not long enough to contain "::h" + hash + something else */ 8762306a36Sopenharmony_ci return false; 8862306a36Sopenharmony_ci 8962306a36Sopenharmony_ci len_without_hash = len - (hash_prefix_len + hash_len); 9062306a36Sopenharmony_ci if (!is_prefixed_hash(sym + len_without_hash)) 9162306a36Sopenharmony_ci return false; 9262306a36Sopenharmony_ci 9362306a36Sopenharmony_ci return looks_like_rust(sym, len_without_hash); 9462306a36Sopenharmony_ci} 9562306a36Sopenharmony_ci 9662306a36Sopenharmony_ci/* 9762306a36Sopenharmony_ci * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex 9862306a36Sopenharmony_ci * digits must comprise between 5 and 15 (inclusive) distinct digits. 9962306a36Sopenharmony_ci */ 10062306a36Sopenharmony_cistatic bool is_prefixed_hash(const char *str) 10162306a36Sopenharmony_ci{ 10262306a36Sopenharmony_ci const char *end; 10362306a36Sopenharmony_ci bool seen[16]; 10462306a36Sopenharmony_ci size_t i; 10562306a36Sopenharmony_ci int count; 10662306a36Sopenharmony_ci 10762306a36Sopenharmony_ci if (strncmp(str, hash_prefix, hash_prefix_len)) 10862306a36Sopenharmony_ci return false; 10962306a36Sopenharmony_ci str += hash_prefix_len; 11062306a36Sopenharmony_ci 11162306a36Sopenharmony_ci memset(seen, false, sizeof(seen)); 11262306a36Sopenharmony_ci for (end = str + hash_len; str < end; str++) 11362306a36Sopenharmony_ci if (*str >= '0' && *str <= '9') 11462306a36Sopenharmony_ci seen[*str - '0'] = true; 11562306a36Sopenharmony_ci else if (*str >= 'a' && *str <= 'f') 11662306a36Sopenharmony_ci seen[*str - 'a' + 10] = true; 11762306a36Sopenharmony_ci else 11862306a36Sopenharmony_ci return false; 11962306a36Sopenharmony_ci 12062306a36Sopenharmony_ci /* Count how many distinct digits seen */ 12162306a36Sopenharmony_ci count = 0; 12262306a36Sopenharmony_ci for (i = 0; i < 16; i++) 12362306a36Sopenharmony_ci if (seen[i]) 12462306a36Sopenharmony_ci count++; 12562306a36Sopenharmony_ci 12662306a36Sopenharmony_ci return count >= 5 && count <= 15; 12762306a36Sopenharmony_ci} 12862306a36Sopenharmony_ci 12962306a36Sopenharmony_cistatic bool looks_like_rust(const char *str, size_t len) 13062306a36Sopenharmony_ci{ 13162306a36Sopenharmony_ci const char *end = str + len; 13262306a36Sopenharmony_ci 13362306a36Sopenharmony_ci while (str < end) 13462306a36Sopenharmony_ci switch (*str) { 13562306a36Sopenharmony_ci case '$': 13662306a36Sopenharmony_ci if (!strncmp(str, "$C$", 3)) 13762306a36Sopenharmony_ci str += 3; 13862306a36Sopenharmony_ci else if (!strncmp(str, "$SP$", 4) 13962306a36Sopenharmony_ci || !strncmp(str, "$BP$", 4) 14062306a36Sopenharmony_ci || !strncmp(str, "$RF$", 4) 14162306a36Sopenharmony_ci || !strncmp(str, "$LT$", 4) 14262306a36Sopenharmony_ci || !strncmp(str, "$GT$", 4) 14362306a36Sopenharmony_ci || !strncmp(str, "$LP$", 4) 14462306a36Sopenharmony_ci || !strncmp(str, "$RP$", 4)) 14562306a36Sopenharmony_ci str += 4; 14662306a36Sopenharmony_ci else if (!strncmp(str, "$u20$", 5) 14762306a36Sopenharmony_ci || !strncmp(str, "$u27$", 5) 14862306a36Sopenharmony_ci || !strncmp(str, "$u5b$", 5) 14962306a36Sopenharmony_ci || !strncmp(str, "$u5d$", 5) 15062306a36Sopenharmony_ci || !strncmp(str, "$u7e$", 5)) 15162306a36Sopenharmony_ci str += 5; 15262306a36Sopenharmony_ci else 15362306a36Sopenharmony_ci return false; 15462306a36Sopenharmony_ci break; 15562306a36Sopenharmony_ci case '.': 15662306a36Sopenharmony_ci /* Do not allow three or more consecutive dots */ 15762306a36Sopenharmony_ci if (!strncmp(str, "...", 3)) 15862306a36Sopenharmony_ci return false; 15962306a36Sopenharmony_ci /* Fall through */ 16062306a36Sopenharmony_ci case 'a' ... 'z': 16162306a36Sopenharmony_ci case 'A' ... 'Z': 16262306a36Sopenharmony_ci case '0' ... '9': 16362306a36Sopenharmony_ci case '_': 16462306a36Sopenharmony_ci case ':': 16562306a36Sopenharmony_ci str++; 16662306a36Sopenharmony_ci break; 16762306a36Sopenharmony_ci default: 16862306a36Sopenharmony_ci return false; 16962306a36Sopenharmony_ci } 17062306a36Sopenharmony_ci 17162306a36Sopenharmony_ci return true; 17262306a36Sopenharmony_ci} 17362306a36Sopenharmony_ci 17462306a36Sopenharmony_ci/* 17562306a36Sopenharmony_ci * INPUT: 17662306a36Sopenharmony_ci * sym: symbol for which rust_is_mangled(sym) returns true 17762306a36Sopenharmony_ci * 17862306a36Sopenharmony_ci * The input is demangled in-place because the mangled name is always longer 17962306a36Sopenharmony_ci * than the demangled one. 18062306a36Sopenharmony_ci */ 18162306a36Sopenharmony_civoid 18262306a36Sopenharmony_cirust_demangle_sym(char *sym) 18362306a36Sopenharmony_ci{ 18462306a36Sopenharmony_ci const char *in; 18562306a36Sopenharmony_ci char *out; 18662306a36Sopenharmony_ci const char *end; 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci if (!sym) 18962306a36Sopenharmony_ci return; 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_ci in = sym; 19262306a36Sopenharmony_ci out = sym; 19362306a36Sopenharmony_ci end = sym + strlen(sym) - (hash_prefix_len + hash_len); 19462306a36Sopenharmony_ci 19562306a36Sopenharmony_ci while (in < end) 19662306a36Sopenharmony_ci switch (*in) { 19762306a36Sopenharmony_ci case '$': 19862306a36Sopenharmony_ci if (!(unescape(&in, &out, "$C$", ',') 19962306a36Sopenharmony_ci || unescape(&in, &out, "$SP$", '@') 20062306a36Sopenharmony_ci || unescape(&in, &out, "$BP$", '*') 20162306a36Sopenharmony_ci || unescape(&in, &out, "$RF$", '&') 20262306a36Sopenharmony_ci || unescape(&in, &out, "$LT$", '<') 20362306a36Sopenharmony_ci || unescape(&in, &out, "$GT$", '>') 20462306a36Sopenharmony_ci || unescape(&in, &out, "$LP$", '(') 20562306a36Sopenharmony_ci || unescape(&in, &out, "$RP$", ')') 20662306a36Sopenharmony_ci || unescape(&in, &out, "$u20$", ' ') 20762306a36Sopenharmony_ci || unescape(&in, &out, "$u27$", '\'') 20862306a36Sopenharmony_ci || unescape(&in, &out, "$u5b$", '[') 20962306a36Sopenharmony_ci || unescape(&in, &out, "$u5d$", ']') 21062306a36Sopenharmony_ci || unescape(&in, &out, "$u7e$", '~'))) { 21162306a36Sopenharmony_ci pr_err("demangle-rust: unexpected escape sequence"); 21262306a36Sopenharmony_ci goto done; 21362306a36Sopenharmony_ci } 21462306a36Sopenharmony_ci break; 21562306a36Sopenharmony_ci case '_': 21662306a36Sopenharmony_ci /* 21762306a36Sopenharmony_ci * If this is the start of a path component and the next 21862306a36Sopenharmony_ci * character is an escape sequence, ignore the 21962306a36Sopenharmony_ci * underscore. The mangler inserts an underscore to make 22062306a36Sopenharmony_ci * sure the path component begins with a XID_Start 22162306a36Sopenharmony_ci * character. 22262306a36Sopenharmony_ci */ 22362306a36Sopenharmony_ci if ((in == sym || in[-1] == ':') && in[1] == '$') 22462306a36Sopenharmony_ci in++; 22562306a36Sopenharmony_ci else 22662306a36Sopenharmony_ci *out++ = *in++; 22762306a36Sopenharmony_ci break; 22862306a36Sopenharmony_ci case '.': 22962306a36Sopenharmony_ci if (in[1] == '.') { 23062306a36Sopenharmony_ci /* ".." becomes "::" */ 23162306a36Sopenharmony_ci *out++ = ':'; 23262306a36Sopenharmony_ci *out++ = ':'; 23362306a36Sopenharmony_ci in += 2; 23462306a36Sopenharmony_ci } else { 23562306a36Sopenharmony_ci /* "." becomes "-" */ 23662306a36Sopenharmony_ci *out++ = '-'; 23762306a36Sopenharmony_ci in++; 23862306a36Sopenharmony_ci } 23962306a36Sopenharmony_ci break; 24062306a36Sopenharmony_ci case 'a' ... 'z': 24162306a36Sopenharmony_ci case 'A' ... 'Z': 24262306a36Sopenharmony_ci case '0' ... '9': 24362306a36Sopenharmony_ci case ':': 24462306a36Sopenharmony_ci *out++ = *in++; 24562306a36Sopenharmony_ci break; 24662306a36Sopenharmony_ci default: 24762306a36Sopenharmony_ci pr_err("demangle-rust: unexpected character '%c' in symbol\n", 24862306a36Sopenharmony_ci *in); 24962306a36Sopenharmony_ci goto done; 25062306a36Sopenharmony_ci } 25162306a36Sopenharmony_ci 25262306a36Sopenharmony_cidone: 25362306a36Sopenharmony_ci *out = '\0'; 25462306a36Sopenharmony_ci} 25562306a36Sopenharmony_ci 25662306a36Sopenharmony_cistatic bool unescape(const char **in, char **out, const char *seq, char value) 25762306a36Sopenharmony_ci{ 25862306a36Sopenharmony_ci size_t len = strlen(seq); 25962306a36Sopenharmony_ci 26062306a36Sopenharmony_ci if (strncmp(*in, seq, len)) 26162306a36Sopenharmony_ci return false; 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci **out = value; 26462306a36Sopenharmony_ci 26562306a36Sopenharmony_ci *in += len; 26662306a36Sopenharmony_ci *out += 1; 26762306a36Sopenharmony_ci 26862306a36Sopenharmony_ci return true; 26962306a36Sopenharmony_ci} 270