162306a36Sopenharmony_ci// SPDX-License-Identifier: GPL-2.0-only
262306a36Sopenharmony_ci/*
362306a36Sopenharmony_ci * Kernel module for testing utf-8 support.
462306a36Sopenharmony_ci *
562306a36Sopenharmony_ci * Copyright 2017 Collabora Ltd.
662306a36Sopenharmony_ci */
762306a36Sopenharmony_ci
862306a36Sopenharmony_ci#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
962306a36Sopenharmony_ci
1062306a36Sopenharmony_ci#include <linux/module.h>
1162306a36Sopenharmony_ci#include <linux/printk.h>
1262306a36Sopenharmony_ci#include <linux/unicode.h>
1362306a36Sopenharmony_ci#include <linux/dcache.h>
1462306a36Sopenharmony_ci
1562306a36Sopenharmony_ci#include "utf8n.h"
1662306a36Sopenharmony_ci
/*
 * Running totals maintained by the test macros in this file: every check
 * that is evaluated bumps total_tests, every check that fails also bumps
 * failed_tests.  Nothing outside this file uses them, so keep them static
 * instead of exporting two generically named symbols.
 */
static unsigned int failed_tests;
static unsigned int total_tests;

/* Unicode version the test vectors and the loaded table are based on. */
#define UTF8_LATEST	UNICODE_AGE(12, 1, 0)
2262306a36Sopenharmony_ci
/*
 * _test() - record the outcome of a single check.
 * @cond: expression that is expected to be true
 * @func: name of the calling function, used in the failure report
 * @line: source line of the check, used in the failure report
 * @fmt:  string literal with extra printf-style detail, "" for none
 *
 * Every invocation bumps total_tests.  When @cond is false, failed_tests is
 * bumped as well and the stringified condition is logged, followed by the
 * formatted detail when @fmt is not empty.
 *
 * @cond is parenthesized before it is negated so that a compound expression
 * such as "a == b" is tested as a whole rather than as "(!a) == b".  Whether
 * there is detail to print is decided by the first character of @fmt: a
 * string literal is never a NULL pointer, so testing the pointer itself
 * would always take the "detail follows" path.
 */
#define _test(cond, func, line, fmt, ...) do {				\
		total_tests++;						\
		if (!(cond)) {						\
			failed_tests++;					\
			pr_err("test %s:%d Failed: %s%s",		\
			       func, line, #cond,			\
			       ((fmt)[0] ? ":" : "."));			\
			if ((fmt)[0])					\
				pr_err(fmt, ##__VA_ARGS__);		\
		}							\
	} while (0)
/* Check @cond; on failure also log the printf-style message @fmt. */
#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__)
/* Check @cond with no extra detail on failure. */
#define test(cond) _test(cond, __func__, __LINE__, "")
3562306a36Sopenharmony_ci
/*
 * Vectors for the UTF8_NFDI checks: .str is the input, .dec is the byte
 * sequence the normalization is expected to produce for it.
 */
static const struct {
	/* Both members are measured with strlen(): keep them NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence: "aBba" decomposes to itself. */
	{ .str = "aBba", .dec = "aBba" },

	/* Simple equivalent sequences */

	/*
	 * VULGAR FRACTION ONE QUARTER is left untouched: canonical
	 * decomposition does not split it into
	 * DIGIT ONE + FRACTION SLASH + DIGIT FOUR.
	 */
	{
		.str = { 0xc2, 0xbc, 0x00 },
		.dec = { 0xc2, 0xbc, 0x00 },
	},
	/*
	 * LATIN SMALL LETTER A WITH DIAERESIS decomposes to
	 * LATIN SMALL LETTER A + COMBINING DIAERESIS.
	 */
	{
		.str = { 0xc3, 0xa4, 0x00 },
		.dec = { 0x61, 0xcc, 0x88, 0x00 },
	},
	/*
	 * LATIN SMALL LETTER LJ is left untouched: canonical decomposition
	 * does not split it into LETTER L + LETTER J.
	 */
	{
		.str = { 0xc7, 0x89, 0x00 },
		.dec = { 0xc7, 0x89, 0x00 },
	},
	/* GREEK ANO TELEIA decomposes to MIDDLE DOT. */
	{
		.str = { 0xce, 0x87, 0x00 },
		.dec = { 0xc2, 0xb7, 0x00 },
	},

	/* Canonical ordering */

	/*
	 * A + COMBINING ACUTE ACCENT + COMBINING OGONEK is reordered to
	 * A + COMBINING OGONEK + COMBINING ACUTE ACCENT.
	 */
	{
		.str = { 0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x00 },
		.dec = { 0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x00 },
	},
	/*
	 * LATIN SMALL LETTER A WITH DIAERESIS + COMBINING OGONEK decomposes
	 * to LATIN SMALL LETTER A + COMBINING OGONEK + COMBINING DIAERESIS.
	 */
	{
		.str = { 0xc3, 0xa4, 0xcc, 0xa8, 0x00 },
		.dec = { 0x61, 0xcc, 0xa8, 0xcc, 0x88, 0x00 },
	},
};
8962306a36Sopenharmony_ci
/*
 * Vectors for the UTF8_NFDICF checks: .str is the input, .ncf is the byte
 * sequence the normalization plus case folding is expected to produce.
 */
static const struct {
	/* Both members are measured with strlen(): keep them NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences */

	/* "ABba" folds to lower-case. */
	{ .str = "ABba", .ncf = "abba" },
	/* All ASCII folds to lower-case (29 characters + NUL fill the array). */
	{
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	/*
	 * LATIN SMALL LETTER SHARP S folds to
	 * LATIN SMALL LETTER S + LATIN SMALL LETTER S.
	 */
	{
		.str = { 0xc3, 0x9f, 0x00 },
		.ncf = { 0x73, 0x73, 0x00 },
	},
	/*
	 * LATIN CAPITAL LETTER A WITH RING ABOVE folds to
	 * LATIN SMALL LETTER A + COMBINING RING ABOVE.
	 */
	{
		.str = { 0xc3, 0x85, 0x00 },
		.ncf = { 0x61, 0xcc, 0x8a, 0x00 },
	},

	/*
	 * Introduced by Unicode 8.0.0.
	 *
	 * Cherokee letters are interesting test cases because they fold to
	 * upper-case.  Before 8.0.0 the Cherokee lower-case letters were
	 * undefined, so folding from lower-case is not stable between
	 * 7.0.0 and 8.0.0, while folding from upper-case is.
	 */

	/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A. */
	{
		.str = { 0xea, 0xad, 0xb0, 0x00 },
		.ncf = { 0xe1, 0x8e, 0xa0, 0x00 },
	},
	/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE. */
	{
		.str = { 0xe1, 0x8f, 0xb8, 0x00 },
		.ncf = { 0xe1, 0x8f, 0xb0, 0x00 },
	},
	/*
	 * OLD HUNGARIAN CAPITAL LETTER AMB folds to
	 * OLD HUNGARIAN SMALL LETTER AMB.
	 */
	{
		.str = { 0xf0, 0x90, 0xb2, 0x83, 0x00 },
		.ncf = { 0xf0, 0x90, 0xb3, 0x83, 0x00 },
	},

	/* Introduced by Unicode 9.0.0. */

	/* OSAGE CAPITAL LETTER CHA folds to OSAGE SMALL LETTER CHA. */
	{
		.str = { 0xf0, 0x90, 0x92, 0xb5, 0x00 },
		.ncf = { 0xf0, 0x90, 0x93, 0x9d, 0x00 },
	},
	/*
	 * LATIN CAPITAL LETTER SMALL CAPITAL I folds to
	 * LATIN LETTER SMALL CAPITAL I.
	 */
	{
		.str = { 0xea, 0x9e, 0xae, 0x00 },
		.ncf = { 0xc9, 0xaa, 0x00 },
	},

	/* Introduced by Unicode 11.0.0. */

	/*
	 * GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
	 * GEORGIAN LETTER AN (U+10D0).
	 */
	{
		.str = { 0xe1, 0xb2, 0x90, 0x00 },
		.ncf = { 0xe1, 0x83, 0x90, 0x00 },
	},
};
16062306a36Sopenharmony_ci
16162306a36Sopenharmony_cistatic ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n,
16262306a36Sopenharmony_ci		const char *s)
16362306a36Sopenharmony_ci{
16462306a36Sopenharmony_ci	return utf8nlen(um, n, s, (size_t)-1);
16562306a36Sopenharmony_ci}
16662306a36Sopenharmony_ci
16762306a36Sopenharmony_cistatic int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
16862306a36Sopenharmony_ci		enum utf8_normalization n, const char *s)
16962306a36Sopenharmony_ci{
17062306a36Sopenharmony_ci	return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
17162306a36Sopenharmony_ci}
17262306a36Sopenharmony_ci
17362306a36Sopenharmony_cistatic void check_utf8_nfdi(struct unicode_map *um)
17462306a36Sopenharmony_ci{
17562306a36Sopenharmony_ci	int i;
17662306a36Sopenharmony_ci	struct utf8cursor u8c;
17762306a36Sopenharmony_ci
17862306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
17962306a36Sopenharmony_ci		int len = strlen(nfdi_test_data[i].str);
18062306a36Sopenharmony_ci		int nlen = strlen(nfdi_test_data[i].dec);
18162306a36Sopenharmony_ci		int j = 0;
18262306a36Sopenharmony_ci		unsigned char c;
18362306a36Sopenharmony_ci
18462306a36Sopenharmony_ci		test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen));
18562306a36Sopenharmony_ci		test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) ==
18662306a36Sopenharmony_ci			nlen));
18762306a36Sopenharmony_ci
18862306a36Sopenharmony_ci		if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0)
18962306a36Sopenharmony_ci			pr_err("can't create cursor\n");
19062306a36Sopenharmony_ci
19162306a36Sopenharmony_ci		while ((c = utf8byte(&u8c)) > 0) {
19262306a36Sopenharmony_ci			test_f((c == nfdi_test_data[i].dec[j]),
19362306a36Sopenharmony_ci			       "Unexpected byte 0x%x should be 0x%x\n",
19462306a36Sopenharmony_ci			       c, nfdi_test_data[i].dec[j]);
19562306a36Sopenharmony_ci			j++;
19662306a36Sopenharmony_ci		}
19762306a36Sopenharmony_ci
19862306a36Sopenharmony_ci		test((j == nlen));
19962306a36Sopenharmony_ci	}
20062306a36Sopenharmony_ci}
20162306a36Sopenharmony_ci
20262306a36Sopenharmony_cistatic void check_utf8_nfdicf(struct unicode_map *um)
20362306a36Sopenharmony_ci{
20462306a36Sopenharmony_ci	int i;
20562306a36Sopenharmony_ci	struct utf8cursor u8c;
20662306a36Sopenharmony_ci
20762306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
20862306a36Sopenharmony_ci		int len = strlen(nfdicf_test_data[i].str);
20962306a36Sopenharmony_ci		int nlen = strlen(nfdicf_test_data[i].ncf);
21062306a36Sopenharmony_ci		int j = 0;
21162306a36Sopenharmony_ci		unsigned char c;
21262306a36Sopenharmony_ci
21362306a36Sopenharmony_ci		test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) ==
21462306a36Sopenharmony_ci				nlen));
21562306a36Sopenharmony_ci		test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) ==
21662306a36Sopenharmony_ci				nlen));
21762306a36Sopenharmony_ci
21862306a36Sopenharmony_ci		if (utf8cursor(&u8c, um, UTF8_NFDICF,
21962306a36Sopenharmony_ci				nfdicf_test_data[i].str) < 0)
22062306a36Sopenharmony_ci			pr_err("can't create cursor\n");
22162306a36Sopenharmony_ci
22262306a36Sopenharmony_ci		while ((c = utf8byte(&u8c)) > 0) {
22362306a36Sopenharmony_ci			test_f((c == nfdicf_test_data[i].ncf[j]),
22462306a36Sopenharmony_ci			       "Unexpected byte 0x%x should be 0x%x\n",
22562306a36Sopenharmony_ci			       c, nfdicf_test_data[i].ncf[j]);
22662306a36Sopenharmony_ci			j++;
22762306a36Sopenharmony_ci		}
22862306a36Sopenharmony_ci
22962306a36Sopenharmony_ci		test((j == nlen));
23062306a36Sopenharmony_ci	}
23162306a36Sopenharmony_ci}
23262306a36Sopenharmony_ci
23362306a36Sopenharmony_cistatic void check_utf8_comparisons(struct unicode_map *table)
23462306a36Sopenharmony_ci{
23562306a36Sopenharmony_ci	int i;
23662306a36Sopenharmony_ci
23762306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
23862306a36Sopenharmony_ci		const struct qstr s1 = {.name = nfdi_test_data[i].str,
23962306a36Sopenharmony_ci					.len = sizeof(nfdi_test_data[i].str)};
24062306a36Sopenharmony_ci		const struct qstr s2 = {.name = nfdi_test_data[i].dec,
24162306a36Sopenharmony_ci					.len = sizeof(nfdi_test_data[i].dec)};
24262306a36Sopenharmony_ci
24362306a36Sopenharmony_ci		test_f(!utf8_strncmp(table, &s1, &s2),
24462306a36Sopenharmony_ci		       "%s %s comparison mismatch\n", s1.name, s2.name);
24562306a36Sopenharmony_ci	}
24662306a36Sopenharmony_ci
24762306a36Sopenharmony_ci	for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
24862306a36Sopenharmony_ci		const struct qstr s1 = {.name = nfdicf_test_data[i].str,
24962306a36Sopenharmony_ci					.len = sizeof(nfdicf_test_data[i].str)};
25062306a36Sopenharmony_ci		const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
25162306a36Sopenharmony_ci					.len = sizeof(nfdicf_test_data[i].ncf)};
25262306a36Sopenharmony_ci
25362306a36Sopenharmony_ci		test_f(!utf8_strncasecmp(table, &s1, &s2),
25462306a36Sopenharmony_ci		       "%s %s comparison mismatch\n", s1.name, s2.name);
25562306a36Sopenharmony_ci	}
25662306a36Sopenharmony_ci}
25762306a36Sopenharmony_ci
25862306a36Sopenharmony_cistatic void check_supported_versions(struct unicode_map *um)
25962306a36Sopenharmony_ci{
26062306a36Sopenharmony_ci	/* Unicode 7.0.0 should be supported. */
26162306a36Sopenharmony_ci	test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
26262306a36Sopenharmony_ci
26362306a36Sopenharmony_ci	/* Unicode 9.0.0 should be supported. */
26462306a36Sopenharmony_ci	test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
26562306a36Sopenharmony_ci
26662306a36Sopenharmony_ci	/* Unicode 1x.0.0 (the latest version) should be supported. */
26762306a36Sopenharmony_ci	test(utf8version_is_supported(um, UTF8_LATEST));
26862306a36Sopenharmony_ci
26962306a36Sopenharmony_ci	/* Next versions don't exist. */
27062306a36Sopenharmony_ci	test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
27162306a36Sopenharmony_ci	test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
27262306a36Sopenharmony_ci	test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
27362306a36Sopenharmony_ci}
27462306a36Sopenharmony_ci
27562306a36Sopenharmony_cistatic int __init init_test_ucd(void)
27662306a36Sopenharmony_ci{
27762306a36Sopenharmony_ci	struct unicode_map *um;
27862306a36Sopenharmony_ci
27962306a36Sopenharmony_ci	failed_tests = 0;
28062306a36Sopenharmony_ci	total_tests = 0;
28162306a36Sopenharmony_ci
28262306a36Sopenharmony_ci	um = utf8_load(UTF8_LATEST);
28362306a36Sopenharmony_ci	if (IS_ERR(um)) {
28462306a36Sopenharmony_ci		pr_err("%s: Unable to load utf8 table.\n", __func__);
28562306a36Sopenharmony_ci		return PTR_ERR(um);
28662306a36Sopenharmony_ci	}
28762306a36Sopenharmony_ci
28862306a36Sopenharmony_ci	check_supported_versions(um);
28962306a36Sopenharmony_ci	check_utf8_nfdi(um);
29062306a36Sopenharmony_ci	check_utf8_nfdicf(um);
29162306a36Sopenharmony_ci	check_utf8_comparisons(um);
29262306a36Sopenharmony_ci
29362306a36Sopenharmony_ci	if (!failed_tests)
29462306a36Sopenharmony_ci		pr_info("All %u tests passed\n", total_tests);
29562306a36Sopenharmony_ci	else
29662306a36Sopenharmony_ci		pr_err("%u out of %u tests failed\n", failed_tests,
29762306a36Sopenharmony_ci		       total_tests);
29862306a36Sopenharmony_ci	utf8_unload(um);
29962306a36Sopenharmony_ci	return 0;
30062306a36Sopenharmony_ci}
30162306a36Sopenharmony_ci
30262306a36Sopenharmony_cistatic void __exit exit_test_ucd(void)
30362306a36Sopenharmony_ci{
30462306a36Sopenharmony_ci}
30562306a36Sopenharmony_ci
30662306a36Sopenharmony_cimodule_init(init_test_ucd);
30762306a36Sopenharmony_cimodule_exit(exit_test_ucd);
30862306a36Sopenharmony_ci
30962306a36Sopenharmony_ciMODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
31062306a36Sopenharmony_ciMODULE_LICENSE("GPL");
311