1 // To regenerate tables, run the following in the repo root:
2 //
3 // $ cargo install ucd-generate
4 // $ curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip
5 // $ unzip UCD.zip -d UCD
6 // $ ucd-generate property-bool UCD --include XID_Start,XID_Continue > tests/table/tables.rs
7 // $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --fst-dir tests/fst
8 // $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --trie-set > tests/trie/trie.rs
9 // $ cargo run --manifest-path generate/Cargo.toml
10 
11 #![allow(
12     clippy::cast_lossless,
13     clippy::cast_possible_truncation, // https://github.com/rust-lang/rust-clippy/issues/9613
14     clippy::let_underscore_untyped,
15     clippy::match_wild_err_arm,
16     clippy::module_name_repetitions,
17     clippy::too_many_lines,
18     clippy::uninlined_format_args
19 )]
20 
21 mod output;
22 mod parse;
23 mod write;
24 
25 use crate::parse::parse_xid_properties;
26 use std::collections::{BTreeMap as Map, VecDeque};
27 use std::convert::TryFrom;
28 use std::fs;
29 use std::io::{self, Write};
30 use std::path::Path;
31 use std::process;
32 
33 const CHUNK: usize = 64;
34 const UCD: &str = "UCD";
35 const TABLES: &str = "src/tables.rs";
36 
mainnull37 fn main() {
38     let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
39     let unicode_ident_dir = manifest_dir.parent().unwrap();
40     let ucd_dir = unicode_ident_dir.join(UCD);
41     let properties = parse_xid_properties(&ucd_dir);
42 
43     let mut chunkmap = Map::<[u8; CHUNK], u8>::new();
44     let mut dense = Vec::<[u8; CHUNK]>::new();
45     let mut new_chunk = |chunk| {
46         if let Some(prev) = chunkmap.get(&chunk) {
47             *prev
48         } else {
49             dense.push(chunk);
50             let Ok(new) = u8::try_from(chunkmap.len()) else {
51                 panic!("exceeded 256 unique chunks");
52             };
53             chunkmap.insert(chunk, new);
54             new
55         }
56     };
57 
58     let empty_chunk = [0u8; CHUNK];
59     new_chunk(empty_chunk);
60 
61     let mut index_start = Vec::<u8>::new();
62     let mut index_continue = Vec::<u8>::new();
63     for i in 0..(u32::from(char::MAX) + 1) / CHUNK as u32 / 8 {
64         let mut start_bits = empty_chunk;
65         let mut continue_bits = empty_chunk;
66         for j in 0..CHUNK as u32 {
67             let this_start = &mut start_bits[j as usize];
68             let this_continue = &mut continue_bits[j as usize];
69             for k in 0..8u32 {
70                 let code = (i * CHUNK as u32 + j) * 8 + k;
71                 if code >= 0x80 {
72                     if let Some(ch) = char::from_u32(code) {
73                         *this_start |= (properties.is_xid_start(ch) as u8) << k;
74                         *this_continue |= (properties.is_xid_continue(ch) as u8) << k;
75                     }
76                 }
77             }
78         }
79         index_start.push(new_chunk(start_bits));
80         index_continue.push(new_chunk(continue_bits));
81     }
82 
83     while let Some(0) = index_start.last() {
84         index_start.pop();
85     }
86     while let Some(0) = index_continue.last() {
87         index_continue.pop();
88     }
89 
90     let mut halfchunkmap = Map::new();
91     for chunk in &dense {
92         let mut front = [0u8; CHUNK / 2];
93         let mut back = [0u8; CHUNK / 2];
94         front.copy_from_slice(&chunk[..CHUNK / 2]);
95         back.copy_from_slice(&chunk[CHUNK / 2..]);
96         halfchunkmap
97             .entry(front)
98             .or_insert_with(VecDeque::new)
99             .push_back(back);
100     }
101 
102     let mut halfdense = Vec::<u8>::new();
103     let mut dense_to_halfdense = Map::<u8, u8>::new();
104     for chunk in &dense {
105         let original_pos = chunkmap[chunk];
106         if dense_to_halfdense.contains_key(&original_pos) {
107             continue;
108         }
109         let mut front = [0u8; CHUNK / 2];
110         let mut back = [0u8; CHUNK / 2];
111         front.copy_from_slice(&chunk[..CHUNK / 2]);
112         back.copy_from_slice(&chunk[CHUNK / 2..]);
113         dense_to_halfdense.insert(
114             original_pos,
115             match u8::try_from(halfdense.len() / (CHUNK / 2)) {
116                 Ok(byte) => byte,
117                 Err(_) => panic!("exceeded 256 half-chunks"),
118             },
119         );
120         halfdense.extend_from_slice(&front);
121         halfdense.extend_from_slice(&back);
122         while let Some(next) = halfchunkmap.get_mut(&back).and_then(VecDeque::pop_front) {
123             let mut concat = empty_chunk;
124             concat[..CHUNK / 2].copy_from_slice(&back);
125             concat[CHUNK / 2..].copy_from_slice(&next);
126             let original_pos = chunkmap[&concat];
127             if dense_to_halfdense.contains_key(&original_pos) {
128                 continue;
129             }
130             dense_to_halfdense.insert(
131                 original_pos,
132                 match u8::try_from(halfdense.len() / (CHUNK / 2) - 1) {
133                     Ok(byte) => byte,
134                     Err(_) => panic!("exceeded 256 half-chunks"),
135                 },
136             );
137             halfdense.extend_from_slice(&next);
138             back = next;
139         }
140     }
141 
142     for index in &mut index_start {
143         *index = dense_to_halfdense[index];
144     }
145     for index in &mut index_continue {
146         *index = dense_to_halfdense[index];
147     }
148 
149     let out = write::output(&properties, &index_start, &index_continue, &halfdense);
150     let path = unicode_ident_dir.join(TABLES);
151     if let Err(err) = fs::write(&path, out) {
152         let _ = writeln!(io::stderr(), "{}: {err}", path.display());
153         process::exit(1);
154     }
155 }
156