1 // To regenerate tables, run the following in the repo root:
2 //
3 // $ cargo install ucd-generate
4 // $ curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip
5 // $ unzip UCD.zip -d UCD
6 // $ ucd-generate property-bool UCD --include XID_Start,XID_Continue > tests/table/tables.rs
7 // $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --fst-dir tests/fst
8 // $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --trie-set > tests/trie/trie.rs
9 // $ cargo run --manifest-path generate/Cargo.toml
10
11 #![allow(
12 clippy::cast_lossless,
13 clippy::cast_possible_truncation, // https://github.com/rust-lang/rust-clippy/issues/9613
14 clippy::let_underscore_untyped,
15 clippy::match_wild_err_arm,
16 clippy::module_name_repetitions,
17 clippy::too_many_lines,
18 clippy::uninlined_format_args
19 )]
20
21 mod output;
22 mod parse;
23 mod write;
24
25 use crate::parse::parse_xid_properties;
26 use std::collections::{BTreeMap as Map, VecDeque};
27 use std::convert::TryFrom;
28 use std::fs;
29 use std::io::{self, Write};
30 use std::path::Path;
31 use std::process;
32
// Size in bytes of one bitset chunk. `main` stores one bit per code point,
// so each chunk describes `CHUNK * 8` consecutive code points.
33 const CHUNK: usize = 64;
// Directory name, joined onto the parent of this crate's manifest directory,
// that is passed to `parse_xid_properties` as the UCD location.
34 const UCD: &str = "UCD";
// Path, joined onto the same parent directory, that `main` writes the
// generated output to.
35 const TABLES: &str = "src/tables.rs";
36
mainnull37 fn main() {
38 let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
39 let unicode_ident_dir = manifest_dir.parent().unwrap();
40 let ucd_dir = unicode_ident_dir.join(UCD);
41 let properties = parse_xid_properties(&ucd_dir);
42
43 let mut chunkmap = Map::<[u8; CHUNK], u8>::new();
44 let mut dense = Vec::<[u8; CHUNK]>::new();
45 let mut new_chunk = |chunk| {
46 if let Some(prev) = chunkmap.get(&chunk) {
47 *prev
48 } else {
49 dense.push(chunk);
50 let Ok(new) = u8::try_from(chunkmap.len()) else {
51 panic!("exceeded 256 unique chunks");
52 };
53 chunkmap.insert(chunk, new);
54 new
55 }
56 };
57
58 let empty_chunk = [0u8; CHUNK];
59 new_chunk(empty_chunk);
60
61 let mut index_start = Vec::<u8>::new();
62 let mut index_continue = Vec::<u8>::new();
63 for i in 0..(u32::from(char::MAX) + 1) / CHUNK as u32 / 8 {
64 let mut start_bits = empty_chunk;
65 let mut continue_bits = empty_chunk;
66 for j in 0..CHUNK as u32 {
67 let this_start = &mut start_bits[j as usize];
68 let this_continue = &mut continue_bits[j as usize];
69 for k in 0..8u32 {
70 let code = (i * CHUNK as u32 + j) * 8 + k;
71 if code >= 0x80 {
72 if let Some(ch) = char::from_u32(code) {
73 *this_start |= (properties.is_xid_start(ch) as u8) << k;
74 *this_continue |= (properties.is_xid_continue(ch) as u8) << k;
75 }
76 }
77 }
78 }
79 index_start.push(new_chunk(start_bits));
80 index_continue.push(new_chunk(continue_bits));
81 }
82
83 while let Some(0) = index_start.last() {
84 index_start.pop();
85 }
86 while let Some(0) = index_continue.last() {
87 index_continue.pop();
88 }
89
90 let mut halfchunkmap = Map::new();
91 for chunk in &dense {
92 let mut front = [0u8; CHUNK / 2];
93 let mut back = [0u8; CHUNK / 2];
94 front.copy_from_slice(&chunk[..CHUNK / 2]);
95 back.copy_from_slice(&chunk[CHUNK / 2..]);
96 halfchunkmap
97 .entry(front)
98 .or_insert_with(VecDeque::new)
99 .push_back(back);
100 }
101
102 let mut halfdense = Vec::<u8>::new();
103 let mut dense_to_halfdense = Map::<u8, u8>::new();
104 for chunk in &dense {
105 let original_pos = chunkmap[chunk];
106 if dense_to_halfdense.contains_key(&original_pos) {
107 continue;
108 }
109 let mut front = [0u8; CHUNK / 2];
110 let mut back = [0u8; CHUNK / 2];
111 front.copy_from_slice(&chunk[..CHUNK / 2]);
112 back.copy_from_slice(&chunk[CHUNK / 2..]);
113 dense_to_halfdense.insert(
114 original_pos,
115 match u8::try_from(halfdense.len() / (CHUNK / 2)) {
116 Ok(byte) => byte,
117 Err(_) => panic!("exceeded 256 half-chunks"),
118 },
119 );
120 halfdense.extend_from_slice(&front);
121 halfdense.extend_from_slice(&back);
122 while let Some(next) = halfchunkmap.get_mut(&back).and_then(VecDeque::pop_front) {
123 let mut concat = empty_chunk;
124 concat[..CHUNK / 2].copy_from_slice(&back);
125 concat[CHUNK / 2..].copy_from_slice(&next);
126 let original_pos = chunkmap[&concat];
127 if dense_to_halfdense.contains_key(&original_pos) {
128 continue;
129 }
130 dense_to_halfdense.insert(
131 original_pos,
132 match u8::try_from(halfdense.len() / (CHUNK / 2) - 1) {
133 Ok(byte) => byte,
134 Err(_) => panic!("exceeded 256 half-chunks"),
135 },
136 );
137 halfdense.extend_from_slice(&next);
138 back = next;
139 }
140 }
141
142 for index in &mut index_start {
143 *index = dense_to_halfdense[index];
144 }
145 for index in &mut index_continue {
146 *index = dense_to_halfdense[index];
147 }
148
149 let out = write::output(&properties, &index_start, &index_continue, &halfdense);
150 let path = unicode_ident_dir.join(TABLES);
151 if let Err(err) = fs::write(&path, out) {
152 let _ = writeln!(io::stderr(), "{}: {err}", path.display());
153 process::exit(1);
154 }
155 }
156