162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only 262306a36Sopenharmony_ci/* 362306a36Sopenharmony_ci * Kernel module for testing utf-8 support. 462306a36Sopenharmony_ci * 562306a36Sopenharmony_ci * Copyright 2017 Collabora Ltd. 662306a36Sopenharmony_ci */ 762306a36Sopenharmony_ci 862306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 962306a36Sopenharmony_ci 1062306a36Sopenharmony_ci#include <linux/module.h> 1162306a36Sopenharmony_ci#include <linux/printk.h> 1262306a36Sopenharmony_ci#include <linux/unicode.h> 1362306a36Sopenharmony_ci#include <linux/dcache.h> 1462306a36Sopenharmony_ci 1562306a36Sopenharmony_ci#include "utf8n.h" 1662306a36Sopenharmony_ci 1762306a36Sopenharmony_ciunsigned int failed_tests; 1862306a36Sopenharmony_ciunsigned int total_tests; 1962306a36Sopenharmony_ci 2062306a36Sopenharmony_ci/* Tests will be based on this version. */ 2162306a36Sopenharmony_ci#define UTF8_LATEST UNICODE_AGE(12, 1, 0) 2262306a36Sopenharmony_ci 2362306a36Sopenharmony_ci#define _test(cond, func, line, fmt, ...) do { \ 2462306a36Sopenharmony_ci total_tests++; \ 2562306a36Sopenharmony_ci if (!cond) { \ 2662306a36Sopenharmony_ci failed_tests++; \ 2762306a36Sopenharmony_ci pr_err("test %s:%d Failed: %s%s", \ 2862306a36Sopenharmony_ci func, line, #cond, (fmt?":":".")); \ 2962306a36Sopenharmony_ci if (fmt) \ 3062306a36Sopenharmony_ci pr_err(fmt, ##__VA_ARGS__); \ 3162306a36Sopenharmony_ci } \ 3262306a36Sopenharmony_ci } while (0) 3362306a36Sopenharmony_ci#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) 3462306a36Sopenharmony_ci#define test(cond) _test(cond, __func__, __LINE__, "") 3562306a36Sopenharmony_ci 3662306a36Sopenharmony_cistatic const struct { 3762306a36Sopenharmony_ci /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 3862306a36Sopenharmony_ci unsigned char str[10]; 3962306a36Sopenharmony_ci unsigned char dec[10]; 4062306a36Sopenharmony_ci} nfdi_test_data[] = { 4162306a36Sopenharmony_ci /* Trivial sequence */ 4262306a36Sopenharmony_ci { 4362306a36Sopenharmony_ci /* "ABba" decomposes to itself */ 4462306a36Sopenharmony_ci .str = "aBba", 4562306a36Sopenharmony_ci .dec = "aBba", 4662306a36Sopenharmony_ci }, 4762306a36Sopenharmony_ci /* Simple equivalent sequences */ 4862306a36Sopenharmony_ci { 4962306a36Sopenharmony_ci /* 'VULGAR FRACTION ONE QUARTER' cannot decompose to 5062306a36Sopenharmony_ci 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on 5162306a36Sopenharmony_ci canonical decomposition */ 5262306a36Sopenharmony_ci .str = {0xc2, 0xbc, 0x00}, 5362306a36Sopenharmony_ci .dec = {0xc2, 0xbc, 0x00}, 5462306a36Sopenharmony_ci }, 5562306a36Sopenharmony_ci { 5662306a36Sopenharmony_ci /* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to 5762306a36Sopenharmony_ci 'LETTER A' + 'COMBINING DIAERESIS' */ 5862306a36Sopenharmony_ci .str = {0xc3, 0xa4, 0x00}, 5962306a36Sopenharmony_ci .dec = {0x61, 0xcc, 0x88, 0x00}, 6062306a36Sopenharmony_ci }, 6162306a36Sopenharmony_ci { 6262306a36Sopenharmony_ci /* 'LATIN SMALL LETTER LJ' can't decompose to 6362306a36Sopenharmony_ci 'LETTER L' + 'LETTER J' on canonical decomposition */ 6462306a36Sopenharmony_ci .str = {0xC7, 0x89, 0x00}, 6562306a36Sopenharmony_ci .dec = {0xC7, 0x89, 0x00}, 6662306a36Sopenharmony_ci }, 6762306a36Sopenharmony_ci { 6862306a36Sopenharmony_ci /* GREEK ANO TELEIA decomposes to MIDDLE DOT */ 6962306a36Sopenharmony_ci .str = {0xCE, 0x87, 0x00}, 7062306a36Sopenharmony_ci .dec = {0xC2, 0xB7, 0x00} 7162306a36Sopenharmony_ci }, 7262306a36Sopenharmony_ci /* Canonical ordering */ 7362306a36Sopenharmony_ci { 7462306a36Sopenharmony_ci /* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes 7562306a36Sopenharmony_ci to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */ 7662306a36Sopenharmony_ci .str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0}, 7762306a36Sopenharmony_ci .dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0}, 7862306a36Sopenharmony_ci }, 7962306a36Sopenharmony_ci { 8062306a36Sopenharmony_ci /* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK' 8162306a36Sopenharmony_ci decomposes to 8262306a36Sopenharmony_ci 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */ 8362306a36Sopenharmony_ci .str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00}, 8462306a36Sopenharmony_ci 8562306a36Sopenharmony_ci .dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00}, 8662306a36Sopenharmony_ci }, 8762306a36Sopenharmony_ci 8862306a36Sopenharmony_ci}; 8962306a36Sopenharmony_ci 9062306a36Sopenharmony_cistatic const struct { 9162306a36Sopenharmony_ci /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 9262306a36Sopenharmony_ci unsigned char str[30]; 9362306a36Sopenharmony_ci unsigned char ncf[30]; 9462306a36Sopenharmony_ci} nfdicf_test_data[] = { 9562306a36Sopenharmony_ci /* Trivial sequences */ 9662306a36Sopenharmony_ci { 9762306a36Sopenharmony_ci /* "ABba" folds to lowercase */ 9862306a36Sopenharmony_ci .str = {0x41, 0x42, 0x62, 0x61, 0x00}, 9962306a36Sopenharmony_ci .ncf = {0x61, 0x62, 0x62, 0x61, 0x00}, 10062306a36Sopenharmony_ci }, 10162306a36Sopenharmony_ci { 10262306a36Sopenharmony_ci /* All ASCII folds to lower-case */ 10362306a36Sopenharmony_ci .str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1", 10462306a36Sopenharmony_ci .ncf = "abcdefghijklmnopqrstuvwxyz0.1", 10562306a36Sopenharmony_ci }, 10662306a36Sopenharmony_ci { 10762306a36Sopenharmony_ci /* LATIN SMALL LETTER SHARP S folds to 10862306a36Sopenharmony_ci LATIN SMALL LETTER S + LATIN SMALL LETTER S */ 10962306a36Sopenharmony_ci .str = {0xc3, 0x9f, 0x00}, 11062306a36Sopenharmony_ci .ncf = {0x73, 0x73, 0x00}, 11162306a36Sopenharmony_ci }, 11262306a36Sopenharmony_ci { 11362306a36Sopenharmony_ci /* LATIN CAPITAL LETTER A WITH RING ABOVE folds to 11462306a36Sopenharmony_ci LATIN SMALL LETTER A + COMBINING RING ABOVE */ 11562306a36Sopenharmony_ci .str = {0xC3, 0x85, 0x00}, 11662306a36Sopenharmony_ci .ncf = {0x61, 0xcc, 0x8a, 0x00}, 11762306a36Sopenharmony_ci }, 11862306a36Sopenharmony_ci /* Introduced by UTF-8.0.0. */ 11962306a36Sopenharmony_ci /* Cherokee letters are interesting test-cases because they fold 12062306a36Sopenharmony_ci to upper-case. Before 8.0.0, Cherokee lowercase were 12162306a36Sopenharmony_ci undefined, thus, the folding from LC is not stable between 12262306a36Sopenharmony_ci 7.0.0 -> 8.0.0, but it is from UC. */ 12362306a36Sopenharmony_ci { 12462306a36Sopenharmony_ci /* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */ 12562306a36Sopenharmony_ci .str = {0xea, 0xad, 0xb0, 0x00}, 12662306a36Sopenharmony_ci .ncf = {0xe1, 0x8e, 0xa0, 0x00}, 12762306a36Sopenharmony_ci }, 12862306a36Sopenharmony_ci { 12962306a36Sopenharmony_ci /* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */ 13062306a36Sopenharmony_ci .str = {0xe1, 0x8f, 0xb8, 0x00}, 13162306a36Sopenharmony_ci .ncf = {0xe1, 0x8f, 0xb0, 0x00}, 13262306a36Sopenharmony_ci }, 13362306a36Sopenharmony_ci { 13462306a36Sopenharmony_ci /* OLD HUNGARIAN CAPITAL LETTER AMB folds to 13562306a36Sopenharmony_ci OLD HUNGARIAN SMALL LETTER AMB */ 13662306a36Sopenharmony_ci .str = {0xf0, 0x90, 0xb2, 0x83, 0x00}, 13762306a36Sopenharmony_ci .ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00}, 13862306a36Sopenharmony_ci }, 13962306a36Sopenharmony_ci /* Introduced by UTF-9.0.0. */ 14062306a36Sopenharmony_ci { 14162306a36Sopenharmony_ci /* OSAGE CAPITAL LETTER CHA folds to 14262306a36Sopenharmony_ci OSAGE SMALL LETTER CHA */ 14362306a36Sopenharmony_ci .str = {0xf0, 0x90, 0x92, 0xb5, 0x00}, 14462306a36Sopenharmony_ci .ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00}, 14562306a36Sopenharmony_ci }, 14662306a36Sopenharmony_ci { 14762306a36Sopenharmony_ci /* LATIN CAPITAL LETTER SMALL CAPITAL I folds to 14862306a36Sopenharmony_ci LATIN LETTER SMALL CAPITAL I */ 14962306a36Sopenharmony_ci .str = {0xea, 0x9e, 0xae, 0x00}, 15062306a36Sopenharmony_ci .ncf = {0xc9, 0xaa, 0x00}, 15162306a36Sopenharmony_ci }, 15262306a36Sopenharmony_ci /* Introduced by UTF-11.0.0. */ 15362306a36Sopenharmony_ci { 15462306a36Sopenharmony_ci /* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI 15562306a36Sopenharmony_ci CAPITAL LETTER AN */ 15662306a36Sopenharmony_ci .str = {0xe1, 0xb2, 0x90, 0x00}, 15762306a36Sopenharmony_ci .ncf = {0xe1, 0x83, 0x90, 0x00}, 15862306a36Sopenharmony_ci } 15962306a36Sopenharmony_ci}; 16062306a36Sopenharmony_ci 16162306a36Sopenharmony_cistatic ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n, 16262306a36Sopenharmony_ci const char *s) 16362306a36Sopenharmony_ci{ 16462306a36Sopenharmony_ci return utf8nlen(um, n, s, (size_t)-1); 16562306a36Sopenharmony_ci} 16662306a36Sopenharmony_ci 16762306a36Sopenharmony_cistatic int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, 16862306a36Sopenharmony_ci enum utf8_normalization n, const char *s) 16962306a36Sopenharmony_ci{ 17062306a36Sopenharmony_ci return utf8ncursor(u8c, um, n, s, (unsigned int)-1); 17162306a36Sopenharmony_ci} 17262306a36Sopenharmony_ci 17362306a36Sopenharmony_cistatic void check_utf8_nfdi(struct unicode_map *um) 17462306a36Sopenharmony_ci{ 17562306a36Sopenharmony_ci int i; 17662306a36Sopenharmony_ci struct utf8cursor u8c; 17762306a36Sopenharmony_ci 17862306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 17962306a36Sopenharmony_ci int len = strlen(nfdi_test_data[i].str); 18062306a36Sopenharmony_ci int nlen = strlen(nfdi_test_data[i].dec); 18162306a36Sopenharmony_ci int j = 0; 18262306a36Sopenharmony_ci unsigned char c; 18362306a36Sopenharmony_ci 18462306a36Sopenharmony_ci test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); 18562306a36Sopenharmony_ci test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == 18662306a36Sopenharmony_ci nlen)); 18762306a36Sopenharmony_ci 18862306a36Sopenharmony_ci if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) 18962306a36Sopenharmony_ci pr_err("can't create cursor\n"); 19062306a36Sopenharmony_ci 19162306a36Sopenharmony_ci while ((c = utf8byte(&u8c)) > 0) { 19262306a36Sopenharmony_ci test_f((c == nfdi_test_data[i].dec[j]), 19362306a36Sopenharmony_ci "Unexpected byte 0x%x should be 0x%x\n", 19462306a36Sopenharmony_ci c, nfdi_test_data[i].dec[j]); 19562306a36Sopenharmony_ci j++; 19662306a36Sopenharmony_ci } 19762306a36Sopenharmony_ci 19862306a36Sopenharmony_ci test((j == nlen)); 19962306a36Sopenharmony_ci } 20062306a36Sopenharmony_ci} 20162306a36Sopenharmony_ci 20262306a36Sopenharmony_cistatic void check_utf8_nfdicf(struct unicode_map *um) 20362306a36Sopenharmony_ci{ 20462306a36Sopenharmony_ci int i; 20562306a36Sopenharmony_ci struct utf8cursor u8c; 20662306a36Sopenharmony_ci 20762306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 20862306a36Sopenharmony_ci int len = strlen(nfdicf_test_data[i].str); 20962306a36Sopenharmony_ci int nlen = strlen(nfdicf_test_data[i].ncf); 21062306a36Sopenharmony_ci int j = 0; 21162306a36Sopenharmony_ci unsigned char c; 21262306a36Sopenharmony_ci 21362306a36Sopenharmony_ci test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == 21462306a36Sopenharmony_ci nlen)); 21562306a36Sopenharmony_ci test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == 21662306a36Sopenharmony_ci nlen)); 21762306a36Sopenharmony_ci 21862306a36Sopenharmony_ci if (utf8cursor(&u8c, um, UTF8_NFDICF, 21962306a36Sopenharmony_ci nfdicf_test_data[i].str) < 0) 22062306a36Sopenharmony_ci pr_err("can't create cursor\n"); 22162306a36Sopenharmony_ci 22262306a36Sopenharmony_ci while ((c = utf8byte(&u8c)) > 0) { 22362306a36Sopenharmony_ci test_f((c == nfdicf_test_data[i].ncf[j]), 22462306a36Sopenharmony_ci "Unexpected byte 0x%x should be 0x%x\n", 22562306a36Sopenharmony_ci c, nfdicf_test_data[i].ncf[j]); 22662306a36Sopenharmony_ci j++; 22762306a36Sopenharmony_ci } 22862306a36Sopenharmony_ci 22962306a36Sopenharmony_ci test((j == nlen)); 23062306a36Sopenharmony_ci } 23162306a36Sopenharmony_ci} 23262306a36Sopenharmony_ci 23362306a36Sopenharmony_cistatic void check_utf8_comparisons(struct unicode_map *table) 23462306a36Sopenharmony_ci{ 23562306a36Sopenharmony_ci int i; 23662306a36Sopenharmony_ci 23762306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 23862306a36Sopenharmony_ci const struct qstr s1 = {.name = nfdi_test_data[i].str, 23962306a36Sopenharmony_ci .len = sizeof(nfdi_test_data[i].str)}; 24062306a36Sopenharmony_ci const struct qstr s2 = {.name = nfdi_test_data[i].dec, 24162306a36Sopenharmony_ci .len = sizeof(nfdi_test_data[i].dec)}; 24262306a36Sopenharmony_ci 24362306a36Sopenharmony_ci test_f(!utf8_strncmp(table, &s1, &s2), 24462306a36Sopenharmony_ci "%s %s comparison mismatch\n", s1.name, s2.name); 24562306a36Sopenharmony_ci } 24662306a36Sopenharmony_ci 24762306a36Sopenharmony_ci for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 24862306a36Sopenharmony_ci const struct qstr s1 = {.name = nfdicf_test_data[i].str, 24962306a36Sopenharmony_ci .len = sizeof(nfdicf_test_data[i].str)}; 25062306a36Sopenharmony_ci const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, 25162306a36Sopenharmony_ci .len = sizeof(nfdicf_test_data[i].ncf)}; 25262306a36Sopenharmony_ci 25362306a36Sopenharmony_ci test_f(!utf8_strncasecmp(table, &s1, &s2), 25462306a36Sopenharmony_ci "%s %s comparison mismatch\n", s1.name, s2.name); 25562306a36Sopenharmony_ci } 25662306a36Sopenharmony_ci} 25762306a36Sopenharmony_ci 25862306a36Sopenharmony_cistatic void check_supported_versions(struct unicode_map *um) 25962306a36Sopenharmony_ci{ 26062306a36Sopenharmony_ci /* Unicode 7.0.0 should be supported. */ 26162306a36Sopenharmony_ci test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); 26262306a36Sopenharmony_ci 26362306a36Sopenharmony_ci /* Unicode 9.0.0 should be supported. */ 26462306a36Sopenharmony_ci test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); 26562306a36Sopenharmony_ci 26662306a36Sopenharmony_ci /* Unicode 1x.0.0 (the latest version) should be supported. */ 26762306a36Sopenharmony_ci test(utf8version_is_supported(um, UTF8_LATEST)); 26862306a36Sopenharmony_ci 26962306a36Sopenharmony_ci /* Next versions don't exist. */ 27062306a36Sopenharmony_ci test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); 27162306a36Sopenharmony_ci test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); 27262306a36Sopenharmony_ci test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); 27362306a36Sopenharmony_ci} 27462306a36Sopenharmony_ci 27562306a36Sopenharmony_cistatic int __init init_test_ucd(void) 27662306a36Sopenharmony_ci{ 27762306a36Sopenharmony_ci struct unicode_map *um; 27862306a36Sopenharmony_ci 27962306a36Sopenharmony_ci failed_tests = 0; 28062306a36Sopenharmony_ci total_tests = 0; 28162306a36Sopenharmony_ci 28262306a36Sopenharmony_ci um = utf8_load(UTF8_LATEST); 28362306a36Sopenharmony_ci if (IS_ERR(um)) { 28462306a36Sopenharmony_ci pr_err("%s: Unable to load utf8 table.\n", __func__); 28562306a36Sopenharmony_ci return PTR_ERR(um); 28662306a36Sopenharmony_ci } 28762306a36Sopenharmony_ci 28862306a36Sopenharmony_ci check_supported_versions(um); 28962306a36Sopenharmony_ci check_utf8_nfdi(um); 29062306a36Sopenharmony_ci check_utf8_nfdicf(um); 29162306a36Sopenharmony_ci check_utf8_comparisons(um); 29262306a36Sopenharmony_ci 29362306a36Sopenharmony_ci if (!failed_tests) 29462306a36Sopenharmony_ci pr_info("All %u tests passed\n", total_tests); 29562306a36Sopenharmony_ci else 29662306a36Sopenharmony_ci pr_err("%u out of %u tests failed\n", failed_tests, 29762306a36Sopenharmony_ci total_tests); 29862306a36Sopenharmony_ci utf8_unload(um); 29962306a36Sopenharmony_ci return 0; 30062306a36Sopenharmony_ci} 30162306a36Sopenharmony_ci 30262306a36Sopenharmony_cistatic void __exit exit_test_ucd(void) 30362306a36Sopenharmony_ci{ 30462306a36Sopenharmony_ci} 30562306a36Sopenharmony_ci 30662306a36Sopenharmony_cimodule_init(init_test_ucd); 30762306a36Sopenharmony_cimodule_exit(exit_test_ucd); 30862306a36Sopenharmony_ci 30962306a36Sopenharmony_ciMODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); 31062306a36Sopenharmony_ciMODULE_LICENSE("GPL"); 311