// To regenerate tables, run the following in the repo root:
//
// $ cargo install ucd-generate
// $ curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip
// $ unzip UCD.zip -d UCD
// $ ucd-generate property-bool UCD --include XID_Start,XID_Continue > tests/table/tables.rs
// $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --fst-dir tests/fst
// $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --trie-set > tests/trie/trie.rs
// $ cargo run --manifest-path generate/Cargo.toml
10
11#![allow(
12    clippy::cast_lossless,
13    clippy::cast_possible_truncation, // https://github.com/rust-lang/rust-clippy/issues/9613
14    clippy::let_underscore_untyped,
15    clippy::match_wild_err_arm,
16    clippy::module_name_repetitions,
17    clippy::too_many_lines,
18    clippy::uninlined_format_args
19)]
20
21mod output;
22mod parse;
23mod write;
24
25use crate::parse::parse_xid_properties;
26use std::collections::{BTreeMap as Map, VecDeque};
27use std::convert::TryFrom;
28use std::fs;
29use std::io::{self, Write};
30use std::path::Path;
31use std::process;
32
/// Number of bytes in one bitmap chunk. `main` stores one bit per code point,
/// so a single chunk describes `CHUNK * 8` consecutive code points.
const CHUNK: usize = 64;
/// Name of the directory, looked up next to this crate's manifest directory,
/// that `main` hands to `parse_xid_properties`.
const UCD: &str = "UCD";
/// Path of the generated file, relative to the parent of this crate's
/// manifest directory.
const TABLES: &str = "src/tables.rs";
36
37fn main() {
38    let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
39    let unicode_ident_dir = manifest_dir.parent().unwrap();
40    let ucd_dir = unicode_ident_dir.join(UCD);
41    let properties = parse_xid_properties(&ucd_dir);
42
43    let mut chunkmap = Map::<[u8; CHUNK], u8>::new();
44    let mut dense = Vec::<[u8; CHUNK]>::new();
45    let mut new_chunk = |chunk| {
46        if let Some(prev) = chunkmap.get(&chunk) {
47            *prev
48        } else {
49            dense.push(chunk);
50            let Ok(new) = u8::try_from(chunkmap.len()) else {
51                panic!("exceeded 256 unique chunks");
52            };
53            chunkmap.insert(chunk, new);
54            new
55        }
56    };
57
58    let empty_chunk = [0u8; CHUNK];
59    new_chunk(empty_chunk);
60
61    let mut index_start = Vec::<u8>::new();
62    let mut index_continue = Vec::<u8>::new();
63    for i in 0..(u32::from(char::MAX) + 1) / CHUNK as u32 / 8 {
64        let mut start_bits = empty_chunk;
65        let mut continue_bits = empty_chunk;
66        for j in 0..CHUNK as u32 {
67            let this_start = &mut start_bits[j as usize];
68            let this_continue = &mut continue_bits[j as usize];
69            for k in 0..8u32 {
70                let code = (i * CHUNK as u32 + j) * 8 + k;
71                if code >= 0x80 {
72                    if let Some(ch) = char::from_u32(code) {
73                        *this_start |= (properties.is_xid_start(ch) as u8) << k;
74                        *this_continue |= (properties.is_xid_continue(ch) as u8) << k;
75                    }
76                }
77            }
78        }
79        index_start.push(new_chunk(start_bits));
80        index_continue.push(new_chunk(continue_bits));
81    }
82
83    while let Some(0) = index_start.last() {
84        index_start.pop();
85    }
86    while let Some(0) = index_continue.last() {
87        index_continue.pop();
88    }
89
90    let mut halfchunkmap = Map::new();
91    for chunk in &dense {
92        let mut front = [0u8; CHUNK / 2];
93        let mut back = [0u8; CHUNK / 2];
94        front.copy_from_slice(&chunk[..CHUNK / 2]);
95        back.copy_from_slice(&chunk[CHUNK / 2..]);
96        halfchunkmap
97            .entry(front)
98            .or_insert_with(VecDeque::new)
99            .push_back(back);
100    }
101
102    let mut halfdense = Vec::<u8>::new();
103    let mut dense_to_halfdense = Map::<u8, u8>::new();
104    for chunk in &dense {
105        let original_pos = chunkmap[chunk];
106        if dense_to_halfdense.contains_key(&original_pos) {
107            continue;
108        }
109        let mut front = [0u8; CHUNK / 2];
110        let mut back = [0u8; CHUNK / 2];
111        front.copy_from_slice(&chunk[..CHUNK / 2]);
112        back.copy_from_slice(&chunk[CHUNK / 2..]);
113        dense_to_halfdense.insert(
114            original_pos,
115            match u8::try_from(halfdense.len() / (CHUNK / 2)) {
116                Ok(byte) => byte,
117                Err(_) => panic!("exceeded 256 half-chunks"),
118            },
119        );
120        halfdense.extend_from_slice(&front);
121        halfdense.extend_from_slice(&back);
122        while let Some(next) = halfchunkmap.get_mut(&back).and_then(VecDeque::pop_front) {
123            let mut concat = empty_chunk;
124            concat[..CHUNK / 2].copy_from_slice(&back);
125            concat[CHUNK / 2..].copy_from_slice(&next);
126            let original_pos = chunkmap[&concat];
127            if dense_to_halfdense.contains_key(&original_pos) {
128                continue;
129            }
130            dense_to_halfdense.insert(
131                original_pos,
132                match u8::try_from(halfdense.len() / (CHUNK / 2) - 1) {
133                    Ok(byte) => byte,
134                    Err(_) => panic!("exceeded 256 half-chunks"),
135                },
136            );
137            halfdense.extend_from_slice(&next);
138            back = next;
139        }
140    }
141
142    for index in &mut index_start {
143        *index = dense_to_halfdense[index];
144    }
145    for index in &mut index_continue {
146        *index = dense_to_halfdense[index];
147    }
148
149    let out = write::output(&properties, &index_start, &index_continue, &halfdense);
150    let path = unicode_ident_dir.join(TABLES);
151    if let Err(err) = fs::write(&path, out) {
152        let _ = writeln!(io::stderr(), "{}: {err}", path.display());
153        process::exit(1);
154    }
155}
156