18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0 28c2ecf20Sopenharmony_ci#include <string.h> 38c2ecf20Sopenharmony_ci#include "debug.h" 48c2ecf20Sopenharmony_ci 58c2ecf20Sopenharmony_ci#include "demangle-rust.h" 68c2ecf20Sopenharmony_ci 78c2ecf20Sopenharmony_ci/* 88c2ecf20Sopenharmony_ci * Mangled Rust symbols look like this: 98c2ecf20Sopenharmony_ci * 108c2ecf20Sopenharmony_ci * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a 118c2ecf20Sopenharmony_ci * 128c2ecf20Sopenharmony_ci * The original symbol is: 138c2ecf20Sopenharmony_ci * 148c2ecf20Sopenharmony_ci * <std::sys::fd::FileDesc as core::ops::Drop>::drop 158c2ecf20Sopenharmony_ci * 168c2ecf20Sopenharmony_ci * The last component of the path is a 64-bit hash in lowercase hex, prefixed 178c2ecf20Sopenharmony_ci * with "h". Rust does not have a global namespace between crates, an illusion 188c2ecf20Sopenharmony_ci * which Rust maintains by using the hash to distinguish things that would 198c2ecf20Sopenharmony_ci * otherwise have the same symbol. 208c2ecf20Sopenharmony_ci * 218c2ecf20Sopenharmony_ci * Any path component not starting with a XID_Start character is prefixed with 228c2ecf20Sopenharmony_ci * "_". 238c2ecf20Sopenharmony_ci * 248c2ecf20Sopenharmony_ci * The following escape sequences are used: 258c2ecf20Sopenharmony_ci * 268c2ecf20Sopenharmony_ci * "," => $C$ 278c2ecf20Sopenharmony_ci * "@" => $SP$ 288c2ecf20Sopenharmony_ci * "*" => $BP$ 298c2ecf20Sopenharmony_ci * "&" => $RF$ 308c2ecf20Sopenharmony_ci * "<" => $LT$ 318c2ecf20Sopenharmony_ci * ">" => $GT$ 328c2ecf20Sopenharmony_ci * "(" => $LP$ 338c2ecf20Sopenharmony_ci * ")" => $RP$ 348c2ecf20Sopenharmony_ci * " " => $u20$ 358c2ecf20Sopenharmony_ci * "'" => $u27$ 368c2ecf20Sopenharmony_ci * "[" => $u5b$ 378c2ecf20Sopenharmony_ci * "]" => $u5d$ 388c2ecf20Sopenharmony_ci * "~" => $u7e$ 398c2ecf20Sopenharmony_ci * 408c2ecf20Sopenharmony_ci * A double ".." means "::" and a single "." means "-". 418c2ecf20Sopenharmony_ci * 428c2ecf20Sopenharmony_ci * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ 438c2ecf20Sopenharmony_ci */ 448c2ecf20Sopenharmony_ci 458c2ecf20Sopenharmony_cistatic const char *hash_prefix = "::h"; 468c2ecf20Sopenharmony_cistatic const size_t hash_prefix_len = 3; 478c2ecf20Sopenharmony_cistatic const size_t hash_len = 16; 488c2ecf20Sopenharmony_ci 498c2ecf20Sopenharmony_cistatic bool is_prefixed_hash(const char *start); 508c2ecf20Sopenharmony_cistatic bool looks_like_rust(const char *sym, size_t len); 518c2ecf20Sopenharmony_cistatic bool unescape(const char **in, char **out, const char *seq, char value); 528c2ecf20Sopenharmony_ci 538c2ecf20Sopenharmony_ci/* 548c2ecf20Sopenharmony_ci * INPUT: 558c2ecf20Sopenharmony_ci * sym: symbol that has been through BFD-demangling 568c2ecf20Sopenharmony_ci * 578c2ecf20Sopenharmony_ci * This function looks for the following indicators: 588c2ecf20Sopenharmony_ci * 598c2ecf20Sopenharmony_ci * 1. The hash must consist of "h" followed by 16 lowercase hex digits. 608c2ecf20Sopenharmony_ci * 618c2ecf20Sopenharmony_ci * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible 628c2ecf20Sopenharmony_ci * hex digits. This is true of 99.9998% of hashes so once in your life you 638c2ecf20Sopenharmony_ci * may see a false negative. The point is to notice path components that 648c2ecf20Sopenharmony_ci * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In 658c2ecf20Sopenharmony_ci * this case a false positive (non-Rust symbol has an important path 668c2ecf20Sopenharmony_ci * component removed because it looks like a Rust hash) is worse than a 678c2ecf20Sopenharmony_ci * false negative (the rare Rust symbol is not demangled) so this sets the 688c2ecf20Sopenharmony_ci * balance in favor of false negatives. 698c2ecf20Sopenharmony_ci * 708c2ecf20Sopenharmony_ci * 3. There must be no characters other than a-zA-Z0-9 and _.:$ 718c2ecf20Sopenharmony_ci * 728c2ecf20Sopenharmony_ci * 4. There must be no unrecognized $-sign sequences. 738c2ecf20Sopenharmony_ci * 748c2ecf20Sopenharmony_ci * 5. There must be no sequence of three or more dots in a row ("..."). 758c2ecf20Sopenharmony_ci */ 768c2ecf20Sopenharmony_cibool 778c2ecf20Sopenharmony_cirust_is_mangled(const char *sym) 788c2ecf20Sopenharmony_ci{ 798c2ecf20Sopenharmony_ci size_t len, len_without_hash; 808c2ecf20Sopenharmony_ci 818c2ecf20Sopenharmony_ci if (!sym) 828c2ecf20Sopenharmony_ci return false; 838c2ecf20Sopenharmony_ci 848c2ecf20Sopenharmony_ci len = strlen(sym); 858c2ecf20Sopenharmony_ci if (len <= hash_prefix_len + hash_len) 868c2ecf20Sopenharmony_ci /* Not long enough to contain "::h" + hash + something else */ 878c2ecf20Sopenharmony_ci return false; 888c2ecf20Sopenharmony_ci 898c2ecf20Sopenharmony_ci len_without_hash = len - (hash_prefix_len + hash_len); 908c2ecf20Sopenharmony_ci if (!is_prefixed_hash(sym + len_without_hash)) 918c2ecf20Sopenharmony_ci return false; 928c2ecf20Sopenharmony_ci 938c2ecf20Sopenharmony_ci return looks_like_rust(sym, len_without_hash); 948c2ecf20Sopenharmony_ci} 958c2ecf20Sopenharmony_ci 968c2ecf20Sopenharmony_ci/* 978c2ecf20Sopenharmony_ci * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex 988c2ecf20Sopenharmony_ci * digits must comprise between 5 and 15 (inclusive) distinct digits. 998c2ecf20Sopenharmony_ci */ 1008c2ecf20Sopenharmony_cistatic bool is_prefixed_hash(const char *str) 1018c2ecf20Sopenharmony_ci{ 1028c2ecf20Sopenharmony_ci const char *end; 1038c2ecf20Sopenharmony_ci bool seen[16]; 1048c2ecf20Sopenharmony_ci size_t i; 1058c2ecf20Sopenharmony_ci int count; 1068c2ecf20Sopenharmony_ci 1078c2ecf20Sopenharmony_ci if (strncmp(str, hash_prefix, hash_prefix_len)) 1088c2ecf20Sopenharmony_ci return false; 1098c2ecf20Sopenharmony_ci str += hash_prefix_len; 1108c2ecf20Sopenharmony_ci 1118c2ecf20Sopenharmony_ci memset(seen, false, sizeof(seen)); 1128c2ecf20Sopenharmony_ci for (end = str + hash_len; str < end; str++) 1138c2ecf20Sopenharmony_ci if (*str >= '0' && *str <= '9') 1148c2ecf20Sopenharmony_ci seen[*str - '0'] = true; 1158c2ecf20Sopenharmony_ci else if (*str >= 'a' && *str <= 'f') 1168c2ecf20Sopenharmony_ci seen[*str - 'a' + 10] = true; 1178c2ecf20Sopenharmony_ci else 1188c2ecf20Sopenharmony_ci return false; 1198c2ecf20Sopenharmony_ci 1208c2ecf20Sopenharmony_ci /* Count how many distinct digits seen */ 1218c2ecf20Sopenharmony_ci count = 0; 1228c2ecf20Sopenharmony_ci for (i = 0; i < 16; i++) 1238c2ecf20Sopenharmony_ci if (seen[i]) 1248c2ecf20Sopenharmony_ci count++; 1258c2ecf20Sopenharmony_ci 1268c2ecf20Sopenharmony_ci return count >= 5 && count <= 15; 1278c2ecf20Sopenharmony_ci} 1288c2ecf20Sopenharmony_ci 1298c2ecf20Sopenharmony_cistatic bool looks_like_rust(const char *str, size_t len) 1308c2ecf20Sopenharmony_ci{ 1318c2ecf20Sopenharmony_ci const char *end = str + len; 1328c2ecf20Sopenharmony_ci 1338c2ecf20Sopenharmony_ci while (str < end) 1348c2ecf20Sopenharmony_ci switch (*str) { 1358c2ecf20Sopenharmony_ci case '$': 1368c2ecf20Sopenharmony_ci if (!strncmp(str, "$C$", 3)) 1378c2ecf20Sopenharmony_ci str += 3; 1388c2ecf20Sopenharmony_ci else if (!strncmp(str, "$SP$", 4) 1398c2ecf20Sopenharmony_ci || !strncmp(str, "$BP$", 4) 1408c2ecf20Sopenharmony_ci || !strncmp(str, "$RF$", 4) 1418c2ecf20Sopenharmony_ci || !strncmp(str, "$LT$", 4) 1428c2ecf20Sopenharmony_ci || !strncmp(str, "$GT$", 4) 1438c2ecf20Sopenharmony_ci || !strncmp(str, "$LP$", 4) 1448c2ecf20Sopenharmony_ci || !strncmp(str, "$RP$", 4)) 1458c2ecf20Sopenharmony_ci str += 4; 1468c2ecf20Sopenharmony_ci else if (!strncmp(str, "$u20$", 5) 1478c2ecf20Sopenharmony_ci || !strncmp(str, "$u27$", 5) 1488c2ecf20Sopenharmony_ci || !strncmp(str, "$u5b$", 5) 1498c2ecf20Sopenharmony_ci || !strncmp(str, "$u5d$", 5) 1508c2ecf20Sopenharmony_ci || !strncmp(str, "$u7e$", 5)) 1518c2ecf20Sopenharmony_ci str += 5; 1528c2ecf20Sopenharmony_ci else 1538c2ecf20Sopenharmony_ci return false; 1548c2ecf20Sopenharmony_ci break; 1558c2ecf20Sopenharmony_ci case '.': 1568c2ecf20Sopenharmony_ci /* Do not allow three or more consecutive dots */ 1578c2ecf20Sopenharmony_ci if (!strncmp(str, "...", 3)) 1588c2ecf20Sopenharmony_ci return false; 1598c2ecf20Sopenharmony_ci /* Fall through */ 1608c2ecf20Sopenharmony_ci case 'a' ... 'z': 1618c2ecf20Sopenharmony_ci case 'A' ... 'Z': 1628c2ecf20Sopenharmony_ci case '0' ... '9': 1638c2ecf20Sopenharmony_ci case '_': 1648c2ecf20Sopenharmony_ci case ':': 1658c2ecf20Sopenharmony_ci str++; 1668c2ecf20Sopenharmony_ci break; 1678c2ecf20Sopenharmony_ci default: 1688c2ecf20Sopenharmony_ci return false; 1698c2ecf20Sopenharmony_ci } 1708c2ecf20Sopenharmony_ci 1718c2ecf20Sopenharmony_ci return true; 1728c2ecf20Sopenharmony_ci} 1738c2ecf20Sopenharmony_ci 1748c2ecf20Sopenharmony_ci/* 1758c2ecf20Sopenharmony_ci * INPUT: 1768c2ecf20Sopenharmony_ci * sym: symbol for which rust_is_mangled(sym) returns true 1778c2ecf20Sopenharmony_ci * 1788c2ecf20Sopenharmony_ci * The input is demangled in-place because the mangled name is always longer 1798c2ecf20Sopenharmony_ci * than the demangled one. 1808c2ecf20Sopenharmony_ci */ 1818c2ecf20Sopenharmony_civoid 1828c2ecf20Sopenharmony_cirust_demangle_sym(char *sym) 1838c2ecf20Sopenharmony_ci{ 1848c2ecf20Sopenharmony_ci const char *in; 1858c2ecf20Sopenharmony_ci char *out; 1868c2ecf20Sopenharmony_ci const char *end; 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci if (!sym) 1898c2ecf20Sopenharmony_ci return; 1908c2ecf20Sopenharmony_ci 1918c2ecf20Sopenharmony_ci in = sym; 1928c2ecf20Sopenharmony_ci out = sym; 1938c2ecf20Sopenharmony_ci end = sym + strlen(sym) - (hash_prefix_len + hash_len); 1948c2ecf20Sopenharmony_ci 1958c2ecf20Sopenharmony_ci while (in < end) 1968c2ecf20Sopenharmony_ci switch (*in) { 1978c2ecf20Sopenharmony_ci case '$': 1988c2ecf20Sopenharmony_ci if (!(unescape(&in, &out, "$C$", ',') 1998c2ecf20Sopenharmony_ci || unescape(&in, &out, "$SP$", '@') 2008c2ecf20Sopenharmony_ci || unescape(&in, &out, "$BP$", '*') 2018c2ecf20Sopenharmony_ci || unescape(&in, &out, "$RF$", '&') 2028c2ecf20Sopenharmony_ci || unescape(&in, &out, "$LT$", '<') 2038c2ecf20Sopenharmony_ci || unescape(&in, &out, "$GT$", '>') 2048c2ecf20Sopenharmony_ci || unescape(&in, &out, "$LP$", '(') 2058c2ecf20Sopenharmony_ci || unescape(&in, &out, "$RP$", ')') 2068c2ecf20Sopenharmony_ci || unescape(&in, &out, "$u20$", ' ') 2078c2ecf20Sopenharmony_ci || unescape(&in, &out, "$u27$", '\'') 2088c2ecf20Sopenharmony_ci || unescape(&in, &out, "$u5b$", '[') 2098c2ecf20Sopenharmony_ci || unescape(&in, &out, "$u5d$", ']') 2108c2ecf20Sopenharmony_ci || unescape(&in, &out, "$u7e$", '~'))) { 2118c2ecf20Sopenharmony_ci pr_err("demangle-rust: unexpected escape sequence"); 2128c2ecf20Sopenharmony_ci goto done; 2138c2ecf20Sopenharmony_ci } 2148c2ecf20Sopenharmony_ci break; 2158c2ecf20Sopenharmony_ci case '_': 2168c2ecf20Sopenharmony_ci /* 2178c2ecf20Sopenharmony_ci * If this is the start of a path component and the next 2188c2ecf20Sopenharmony_ci * character is an escape sequence, ignore the 2198c2ecf20Sopenharmony_ci * underscore. The mangler inserts an underscore to make 2208c2ecf20Sopenharmony_ci * sure the path component begins with a XID_Start 2218c2ecf20Sopenharmony_ci * character. 2228c2ecf20Sopenharmony_ci */ 2238c2ecf20Sopenharmony_ci if ((in == sym || in[-1] == ':') && in[1] == '$') 2248c2ecf20Sopenharmony_ci in++; 2258c2ecf20Sopenharmony_ci else 2268c2ecf20Sopenharmony_ci *out++ = *in++; 2278c2ecf20Sopenharmony_ci break; 2288c2ecf20Sopenharmony_ci case '.': 2298c2ecf20Sopenharmony_ci if (in[1] == '.') { 2308c2ecf20Sopenharmony_ci /* ".." becomes "::" */ 2318c2ecf20Sopenharmony_ci *out++ = ':'; 2328c2ecf20Sopenharmony_ci *out++ = ':'; 2338c2ecf20Sopenharmony_ci in += 2; 2348c2ecf20Sopenharmony_ci } else { 2358c2ecf20Sopenharmony_ci /* "." becomes "-" */ 2368c2ecf20Sopenharmony_ci *out++ = '-'; 2378c2ecf20Sopenharmony_ci in++; 2388c2ecf20Sopenharmony_ci } 2398c2ecf20Sopenharmony_ci break; 2408c2ecf20Sopenharmony_ci case 'a' ... 'z': 2418c2ecf20Sopenharmony_ci case 'A' ... 'Z': 2428c2ecf20Sopenharmony_ci case '0' ... '9': 2438c2ecf20Sopenharmony_ci case ':': 2448c2ecf20Sopenharmony_ci *out++ = *in++; 2458c2ecf20Sopenharmony_ci break; 2468c2ecf20Sopenharmony_ci default: 2478c2ecf20Sopenharmony_ci pr_err("demangle-rust: unexpected character '%c' in symbol\n", 2488c2ecf20Sopenharmony_ci *in); 2498c2ecf20Sopenharmony_ci goto done; 2508c2ecf20Sopenharmony_ci } 2518c2ecf20Sopenharmony_ci 2528c2ecf20Sopenharmony_cidone: 2538c2ecf20Sopenharmony_ci *out = '\0'; 2548c2ecf20Sopenharmony_ci} 2558c2ecf20Sopenharmony_ci 2568c2ecf20Sopenharmony_cistatic bool unescape(const char **in, char **out, const char *seq, char value) 2578c2ecf20Sopenharmony_ci{ 2588c2ecf20Sopenharmony_ci size_t len = strlen(seq); 2598c2ecf20Sopenharmony_ci 2608c2ecf20Sopenharmony_ci if (strncmp(*in, seq, len)) 2618c2ecf20Sopenharmony_ci return false; 2628c2ecf20Sopenharmony_ci 2638c2ecf20Sopenharmony_ci **out = value; 2648c2ecf20Sopenharmony_ci 2658c2ecf20Sopenharmony_ci *in += len; 2668c2ecf20Sopenharmony_ci *out += 1; 2678c2ecf20Sopenharmony_ci 2688c2ecf20Sopenharmony_ci return true; 2698c2ecf20Sopenharmony_ci} 270