18c2ecf20Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 28c2ecf20Sopenharmony_ci/* 38c2ecf20Sopenharmony_ci * Kernel module for testing utf-8 support. 48c2ecf20Sopenharmony_ci * 58c2ecf20Sopenharmony_ci * Copyright 2017 Collabora Ltd. 68c2ecf20Sopenharmony_ci */ 78c2ecf20Sopenharmony_ci 88c2ecf20Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 98c2ecf20Sopenharmony_ci 108c2ecf20Sopenharmony_ci#include <linux/module.h> 118c2ecf20Sopenharmony_ci#include <linux/printk.h> 128c2ecf20Sopenharmony_ci#include <linux/unicode.h> 138c2ecf20Sopenharmony_ci#include <linux/dcache.h> 148c2ecf20Sopenharmony_ci 158c2ecf20Sopenharmony_ci#include "utf8n.h" 168c2ecf20Sopenharmony_ci 178c2ecf20Sopenharmony_ciunsigned int failed_tests; 188c2ecf20Sopenharmony_ciunsigned int total_tests; 198c2ecf20Sopenharmony_ci 208c2ecf20Sopenharmony_ci/* Tests will be based on this version. */ 218c2ecf20Sopenharmony_ci#define latest_maj 12 228c2ecf20Sopenharmony_ci#define latest_min 1 238c2ecf20Sopenharmony_ci#define latest_rev 0 248c2ecf20Sopenharmony_ci 258c2ecf20Sopenharmony_ci#define _test(cond, func, line, fmt, ...) do { \ 268c2ecf20Sopenharmony_ci total_tests++; \ 278c2ecf20Sopenharmony_ci if (!cond) { \ 288c2ecf20Sopenharmony_ci failed_tests++; \ 298c2ecf20Sopenharmony_ci pr_err("test %s:%d Failed: %s%s", \ 308c2ecf20Sopenharmony_ci func, line, #cond, (fmt?":":".")); \ 318c2ecf20Sopenharmony_ci if (fmt) \ 328c2ecf20Sopenharmony_ci pr_err(fmt, ##__VA_ARGS__); \ 338c2ecf20Sopenharmony_ci } \ 348c2ecf20Sopenharmony_ci } while (0) 358c2ecf20Sopenharmony_ci#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) 368c2ecf20Sopenharmony_ci#define test(cond) _test(cond, __func__, __LINE__, "") 378c2ecf20Sopenharmony_ci 388c2ecf20Sopenharmony_cistatic const struct { 398c2ecf20Sopenharmony_ci /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 408c2ecf20Sopenharmony_ci unsigned char str[10]; 418c2ecf20Sopenharmony_ci unsigned char dec[10]; 428c2ecf20Sopenharmony_ci} nfdi_test_data[] = { 438c2ecf20Sopenharmony_ci /* Trivial sequence */ 448c2ecf20Sopenharmony_ci { 458c2ecf20Sopenharmony_ci /* "ABba" decomposes to itself */ 468c2ecf20Sopenharmony_ci .str = "aBba", 478c2ecf20Sopenharmony_ci .dec = "aBba", 488c2ecf20Sopenharmony_ci }, 498c2ecf20Sopenharmony_ci /* Simple equivalent sequences */ 508c2ecf20Sopenharmony_ci { 518c2ecf20Sopenharmony_ci /* 'VULGAR FRACTION ONE QUARTER' cannot decompose to 528c2ecf20Sopenharmony_ci 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on 538c2ecf20Sopenharmony_ci canonical decomposition */ 548c2ecf20Sopenharmony_ci .str = {0xc2, 0xbc, 0x00}, 558c2ecf20Sopenharmony_ci .dec = {0xc2, 0xbc, 0x00}, 568c2ecf20Sopenharmony_ci }, 578c2ecf20Sopenharmony_ci { 588c2ecf20Sopenharmony_ci /* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to 598c2ecf20Sopenharmony_ci 'LETTER A' + 'COMBINING DIAERESIS' */ 608c2ecf20Sopenharmony_ci .str = {0xc3, 0xa4, 0x00}, 618c2ecf20Sopenharmony_ci .dec = {0x61, 0xcc, 0x88, 0x00}, 628c2ecf20Sopenharmony_ci }, 638c2ecf20Sopenharmony_ci { 648c2ecf20Sopenharmony_ci /* 'LATIN SMALL LETTER LJ' can't decompose to 658c2ecf20Sopenharmony_ci 'LETTER L' + 'LETTER J' on canonical decomposition */ 668c2ecf20Sopenharmony_ci .str = {0xC7, 0x89, 0x00}, 678c2ecf20Sopenharmony_ci .dec = {0xC7, 0x89, 0x00}, 688c2ecf20Sopenharmony_ci }, 698c2ecf20Sopenharmony_ci { 708c2ecf20Sopenharmony_ci /* GREEK ANO TELEIA decomposes to MIDDLE DOT */ 718c2ecf20Sopenharmony_ci .str = {0xCE, 0x87, 0x00}, 728c2ecf20Sopenharmony_ci .dec = {0xC2, 0xB7, 0x00} 738c2ecf20Sopenharmony_ci }, 748c2ecf20Sopenharmony_ci /* Canonical ordering */ 758c2ecf20Sopenharmony_ci { 768c2ecf20Sopenharmony_ci /* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes 778c2ecf20Sopenharmony_ci to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */ 788c2ecf20Sopenharmony_ci .str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0}, 798c2ecf20Sopenharmony_ci .dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0}, 808c2ecf20Sopenharmony_ci }, 818c2ecf20Sopenharmony_ci { 828c2ecf20Sopenharmony_ci /* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK' 838c2ecf20Sopenharmony_ci decomposes to 848c2ecf20Sopenharmony_ci 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */ 858c2ecf20Sopenharmony_ci .str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00}, 868c2ecf20Sopenharmony_ci 878c2ecf20Sopenharmony_ci .dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00}, 888c2ecf20Sopenharmony_ci }, 898c2ecf20Sopenharmony_ci 908c2ecf20Sopenharmony_ci}; 918c2ecf20Sopenharmony_ci 928c2ecf20Sopenharmony_cistatic const struct { 938c2ecf20Sopenharmony_ci /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 948c2ecf20Sopenharmony_ci unsigned char str[30]; 958c2ecf20Sopenharmony_ci unsigned char ncf[30]; 968c2ecf20Sopenharmony_ci} nfdicf_test_data[] = { 978c2ecf20Sopenharmony_ci /* Trivial sequences */ 988c2ecf20Sopenharmony_ci { 998c2ecf20Sopenharmony_ci /* "ABba" folds to lowercase */ 1008c2ecf20Sopenharmony_ci .str = {0x41, 0x42, 0x62, 0x61, 0x00}, 1018c2ecf20Sopenharmony_ci .ncf = {0x61, 0x62, 0x62, 0x61, 0x00}, 1028c2ecf20Sopenharmony_ci }, 1038c2ecf20Sopenharmony_ci { 1048c2ecf20Sopenharmony_ci /* All ASCII folds to lower-case */ 1058c2ecf20Sopenharmony_ci .str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1", 1068c2ecf20Sopenharmony_ci .ncf = "abcdefghijklmnopqrstuvwxyz0.1", 1078c2ecf20Sopenharmony_ci }, 1088c2ecf20Sopenharmony_ci { 1098c2ecf20Sopenharmony_ci /* LATIN SMALL LETTER SHARP S folds to 1108c2ecf20Sopenharmony_ci LATIN SMALL LETTER S + LATIN SMALL LETTER S */ 1118c2ecf20Sopenharmony_ci .str = {0xc3, 0x9f, 0x00}, 1128c2ecf20Sopenharmony_ci .ncf = {0x73, 0x73, 0x00}, 1138c2ecf20Sopenharmony_ci }, 1148c2ecf20Sopenharmony_ci { 1158c2ecf20Sopenharmony_ci /* LATIN CAPITAL LETTER A WITH RING ABOVE folds to 1168c2ecf20Sopenharmony_ci LATIN SMALL LETTER A + COMBINING RING ABOVE */ 1178c2ecf20Sopenharmony_ci .str = {0xC3, 0x85, 0x00}, 1188c2ecf20Sopenharmony_ci .ncf = {0x61, 0xcc, 0x8a, 0x00}, 1198c2ecf20Sopenharmony_ci }, 1208c2ecf20Sopenharmony_ci /* Introduced by UTF-8.0.0. */ 1218c2ecf20Sopenharmony_ci /* Cherokee letters are interesting test-cases because they fold 1228c2ecf20Sopenharmony_ci to upper-case. Before 8.0.0, Cherokee lowercase were 1238c2ecf20Sopenharmony_ci undefined, thus, the folding from LC is not stable between 1248c2ecf20Sopenharmony_ci 7.0.0 -> 8.0.0, but it is from UC. */ 1258c2ecf20Sopenharmony_ci { 1268c2ecf20Sopenharmony_ci /* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */ 1278c2ecf20Sopenharmony_ci .str = {0xea, 0xad, 0xb0, 0x00}, 1288c2ecf20Sopenharmony_ci .ncf = {0xe1, 0x8e, 0xa0, 0x00}, 1298c2ecf20Sopenharmony_ci }, 1308c2ecf20Sopenharmony_ci { 1318c2ecf20Sopenharmony_ci /* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */ 1328c2ecf20Sopenharmony_ci .str = {0xe1, 0x8f, 0xb8, 0x00}, 1338c2ecf20Sopenharmony_ci .ncf = {0xe1, 0x8f, 0xb0, 0x00}, 1348c2ecf20Sopenharmony_ci }, 1358c2ecf20Sopenharmony_ci { 1368c2ecf20Sopenharmony_ci /* OLD HUNGARIAN CAPITAL LETTER AMB folds to 1378c2ecf20Sopenharmony_ci OLD HUNGARIAN SMALL LETTER AMB */ 1388c2ecf20Sopenharmony_ci .str = {0xf0, 0x90, 0xb2, 0x83, 0x00}, 1398c2ecf20Sopenharmony_ci .ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00}, 1408c2ecf20Sopenharmony_ci }, 1418c2ecf20Sopenharmony_ci /* Introduced by UTF-9.0.0. */ 1428c2ecf20Sopenharmony_ci { 1438c2ecf20Sopenharmony_ci /* OSAGE CAPITAL LETTER CHA folds to 1448c2ecf20Sopenharmony_ci OSAGE SMALL LETTER CHA */ 1458c2ecf20Sopenharmony_ci .str = {0xf0, 0x90, 0x92, 0xb5, 0x00}, 1468c2ecf20Sopenharmony_ci .ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00}, 1478c2ecf20Sopenharmony_ci }, 1488c2ecf20Sopenharmony_ci { 1498c2ecf20Sopenharmony_ci /* LATIN CAPITAL LETTER SMALL CAPITAL I folds to 1508c2ecf20Sopenharmony_ci LATIN LETTER SMALL CAPITAL I */ 1518c2ecf20Sopenharmony_ci .str = {0xea, 0x9e, 0xae, 0x00}, 1528c2ecf20Sopenharmony_ci .ncf = {0xc9, 0xaa, 0x00}, 1538c2ecf20Sopenharmony_ci }, 1548c2ecf20Sopenharmony_ci /* Introduced by UTF-11.0.0. */ 1558c2ecf20Sopenharmony_ci { 1568c2ecf20Sopenharmony_ci /* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI 1578c2ecf20Sopenharmony_ci CAPITAL LETTER AN */ 1588c2ecf20Sopenharmony_ci .str = {0xe1, 0xb2, 0x90, 0x00}, 1598c2ecf20Sopenharmony_ci .ncf = {0xe1, 0x83, 0x90, 0x00}, 1608c2ecf20Sopenharmony_ci } 1618c2ecf20Sopenharmony_ci}; 1628c2ecf20Sopenharmony_ci 1638c2ecf20Sopenharmony_cistatic void check_utf8_nfdi(void) 1648c2ecf20Sopenharmony_ci{ 1658c2ecf20Sopenharmony_ci int i; 1668c2ecf20Sopenharmony_ci struct utf8cursor u8c; 1678c2ecf20Sopenharmony_ci const struct utf8data *data; 1688c2ecf20Sopenharmony_ci 1698c2ecf20Sopenharmony_ci data = utf8nfdi(UNICODE_AGE(latest_maj, latest_min, latest_rev)); 1708c2ecf20Sopenharmony_ci if (!data) { 1718c2ecf20Sopenharmony_ci pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", 1728c2ecf20Sopenharmony_ci __func__, latest_maj, latest_min, latest_rev); 1738c2ecf20Sopenharmony_ci return; 1748c2ecf20Sopenharmony_ci } 1758c2ecf20Sopenharmony_ci 1768c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 1778c2ecf20Sopenharmony_ci int len = strlen(nfdi_test_data[i].str); 1788c2ecf20Sopenharmony_ci int nlen = strlen(nfdi_test_data[i].dec); 1798c2ecf20Sopenharmony_ci int j = 0; 1808c2ecf20Sopenharmony_ci unsigned char c; 1818c2ecf20Sopenharmony_ci 1828c2ecf20Sopenharmony_ci test((utf8len(data, nfdi_test_data[i].str) == nlen)); 1838c2ecf20Sopenharmony_ci test((utf8nlen(data, nfdi_test_data[i].str, len) == nlen)); 1848c2ecf20Sopenharmony_ci 1858c2ecf20Sopenharmony_ci if (utf8cursor(&u8c, data, nfdi_test_data[i].str) < 0) 1868c2ecf20Sopenharmony_ci pr_err("can't create cursor\n"); 1878c2ecf20Sopenharmony_ci 1888c2ecf20Sopenharmony_ci while ((c = utf8byte(&u8c)) > 0) { 1898c2ecf20Sopenharmony_ci test_f((c == nfdi_test_data[i].dec[j]), 1908c2ecf20Sopenharmony_ci "Unexpected byte 0x%x should be 0x%x\n", 1918c2ecf20Sopenharmony_ci c, nfdi_test_data[i].dec[j]); 1928c2ecf20Sopenharmony_ci j++; 1938c2ecf20Sopenharmony_ci } 1948c2ecf20Sopenharmony_ci 1958c2ecf20Sopenharmony_ci test((j == nlen)); 1968c2ecf20Sopenharmony_ci } 1978c2ecf20Sopenharmony_ci} 1988c2ecf20Sopenharmony_ci 1998c2ecf20Sopenharmony_cistatic void check_utf8_nfdicf(void) 2008c2ecf20Sopenharmony_ci{ 2018c2ecf20Sopenharmony_ci int i; 2028c2ecf20Sopenharmony_ci struct utf8cursor u8c; 2038c2ecf20Sopenharmony_ci const struct utf8data *data; 2048c2ecf20Sopenharmony_ci 2058c2ecf20Sopenharmony_ci data = utf8nfdicf(UNICODE_AGE(latest_maj, latest_min, latest_rev)); 2068c2ecf20Sopenharmony_ci if (!data) { 2078c2ecf20Sopenharmony_ci pr_err("%s: Unable to load utf8-%d.%d.%d. Skipping.\n", 2088c2ecf20Sopenharmony_ci __func__, latest_maj, latest_min, latest_rev); 2098c2ecf20Sopenharmony_ci return; 2108c2ecf20Sopenharmony_ci } 2118c2ecf20Sopenharmony_ci 2128c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 2138c2ecf20Sopenharmony_ci int len = strlen(nfdicf_test_data[i].str); 2148c2ecf20Sopenharmony_ci int nlen = strlen(nfdicf_test_data[i].ncf); 2158c2ecf20Sopenharmony_ci int j = 0; 2168c2ecf20Sopenharmony_ci unsigned char c; 2178c2ecf20Sopenharmony_ci 2188c2ecf20Sopenharmony_ci test((utf8len(data, nfdicf_test_data[i].str) == nlen)); 2198c2ecf20Sopenharmony_ci test((utf8nlen(data, nfdicf_test_data[i].str, len) == nlen)); 2208c2ecf20Sopenharmony_ci 2218c2ecf20Sopenharmony_ci if (utf8cursor(&u8c, data, nfdicf_test_data[i].str) < 0) 2228c2ecf20Sopenharmony_ci pr_err("can't create cursor\n"); 2238c2ecf20Sopenharmony_ci 2248c2ecf20Sopenharmony_ci while ((c = utf8byte(&u8c)) > 0) { 2258c2ecf20Sopenharmony_ci test_f((c == nfdicf_test_data[i].ncf[j]), 2268c2ecf20Sopenharmony_ci "Unexpected byte 0x%x should be 0x%x\n", 2278c2ecf20Sopenharmony_ci c, nfdicf_test_data[i].ncf[j]); 2288c2ecf20Sopenharmony_ci j++; 2298c2ecf20Sopenharmony_ci } 2308c2ecf20Sopenharmony_ci 2318c2ecf20Sopenharmony_ci test((j == nlen)); 2328c2ecf20Sopenharmony_ci } 2338c2ecf20Sopenharmony_ci} 2348c2ecf20Sopenharmony_ci 2358c2ecf20Sopenharmony_cistatic void check_utf8_comparisons(void) 2368c2ecf20Sopenharmony_ci{ 2378c2ecf20Sopenharmony_ci int i; 2388c2ecf20Sopenharmony_ci struct unicode_map *table = utf8_load("12.1.0"); 2398c2ecf20Sopenharmony_ci 2408c2ecf20Sopenharmony_ci if (IS_ERR(table)) { 2418c2ecf20Sopenharmony_ci pr_err("%s: Unable to load utf8 %d.%d.%d. Skipping.\n", 2428c2ecf20Sopenharmony_ci __func__, latest_maj, latest_min, latest_rev); 2438c2ecf20Sopenharmony_ci return; 2448c2ecf20Sopenharmony_ci } 2458c2ecf20Sopenharmony_ci 2468c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 2478c2ecf20Sopenharmony_ci const struct qstr s1 = {.name = nfdi_test_data[i].str, 2488c2ecf20Sopenharmony_ci .len = sizeof(nfdi_test_data[i].str)}; 2498c2ecf20Sopenharmony_ci const struct qstr s2 = {.name = nfdi_test_data[i].dec, 2508c2ecf20Sopenharmony_ci .len = sizeof(nfdi_test_data[i].dec)}; 2518c2ecf20Sopenharmony_ci 2528c2ecf20Sopenharmony_ci test_f(!utf8_strncmp(table, &s1, &s2), 2538c2ecf20Sopenharmony_ci "%s %s comparison mismatch\n", s1.name, s2.name); 2548c2ecf20Sopenharmony_ci } 2558c2ecf20Sopenharmony_ci 2568c2ecf20Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 2578c2ecf20Sopenharmony_ci const struct qstr s1 = {.name = nfdicf_test_data[i].str, 2588c2ecf20Sopenharmony_ci .len = sizeof(nfdicf_test_data[i].str)}; 2598c2ecf20Sopenharmony_ci const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, 2608c2ecf20Sopenharmony_ci .len = sizeof(nfdicf_test_data[i].ncf)}; 2618c2ecf20Sopenharmony_ci 2628c2ecf20Sopenharmony_ci test_f(!utf8_strncasecmp(table, &s1, &s2), 2638c2ecf20Sopenharmony_ci "%s %s comparison mismatch\n", s1.name, s2.name); 2648c2ecf20Sopenharmony_ci } 2658c2ecf20Sopenharmony_ci 2668c2ecf20Sopenharmony_ci utf8_unload(table); 2678c2ecf20Sopenharmony_ci} 2688c2ecf20Sopenharmony_ci 2698c2ecf20Sopenharmony_cistatic void check_supported_versions(void) 2708c2ecf20Sopenharmony_ci{ 2718c2ecf20Sopenharmony_ci /* Unicode 7.0.0 should be supported. */ 2728c2ecf20Sopenharmony_ci test(utf8version_is_supported(7, 0, 0)); 2738c2ecf20Sopenharmony_ci 2748c2ecf20Sopenharmony_ci /* Unicode 9.0.0 should be supported. */ 2758c2ecf20Sopenharmony_ci test(utf8version_is_supported(9, 0, 0)); 2768c2ecf20Sopenharmony_ci 2778c2ecf20Sopenharmony_ci /* Unicode 1x.0.0 (the latest version) should be supported. */ 2788c2ecf20Sopenharmony_ci test(utf8version_is_supported(latest_maj, latest_min, latest_rev)); 2798c2ecf20Sopenharmony_ci 2808c2ecf20Sopenharmony_ci /* Next versions don't exist. */ 2818c2ecf20Sopenharmony_ci test(!utf8version_is_supported(13, 0, 0)); 2828c2ecf20Sopenharmony_ci test(!utf8version_is_supported(0, 0, 0)); 2838c2ecf20Sopenharmony_ci test(!utf8version_is_supported(-1, -1, -1)); 2848c2ecf20Sopenharmony_ci} 2858c2ecf20Sopenharmony_ci 2868c2ecf20Sopenharmony_cistatic int __init init_test_ucd(void) 2878c2ecf20Sopenharmony_ci{ 2888c2ecf20Sopenharmony_ci failed_tests = 0; 2898c2ecf20Sopenharmony_ci total_tests = 0; 2908c2ecf20Sopenharmony_ci 2918c2ecf20Sopenharmony_ci check_supported_versions(); 2928c2ecf20Sopenharmony_ci check_utf8_nfdi(); 2938c2ecf20Sopenharmony_ci check_utf8_nfdicf(); 2948c2ecf20Sopenharmony_ci check_utf8_comparisons(); 2958c2ecf20Sopenharmony_ci 2968c2ecf20Sopenharmony_ci if (!failed_tests) 2978c2ecf20Sopenharmony_ci pr_info("All %u tests passed\n", total_tests); 2988c2ecf20Sopenharmony_ci else 2998c2ecf20Sopenharmony_ci pr_err("%u out of %u tests failed\n", failed_tests, 3008c2ecf20Sopenharmony_ci total_tests); 3018c2ecf20Sopenharmony_ci return 0; 3028c2ecf20Sopenharmony_ci} 3038c2ecf20Sopenharmony_ci 3048c2ecf20Sopenharmony_cistatic void __exit exit_test_ucd(void) 3058c2ecf20Sopenharmony_ci{ 3068c2ecf20Sopenharmony_ci} 3078c2ecf20Sopenharmony_ci 3088c2ecf20Sopenharmony_cimodule_init(init_test_ucd); 3098c2ecf20Sopenharmony_cimodule_exit(exit_test_ucd); 3108c2ecf20Sopenharmony_ci 3118c2ecf20Sopenharmony_ciMODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); 3128c2ecf20Sopenharmony_ciMODULE_LICENSE("GPL"); 313