1// To regenerate tables, run the following in the repo root: 2// 3// $ cargo install ucd-generate 4// $ curl -LO https://www.unicode.org/Public/zipped/15.0.0/UCD.zip 5// $ unzip UCD.zip -d UCD 6// $ ucd-generate property-bool UCD --include XID_Start,XID_Continue > tests/table/tables.rs 7// $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --fst-dir tests/fst 8// $ ucd-generate property-bool UCD --include XID_Start,XID_Continue --trie-set > tests/trie/trie.rs 9// $ cargo run --manifest-path generate/Cargo.toml 10 11#![allow( 12 clippy::cast_lossless, 13 clippy::cast_possible_truncation, // https://github.com/rust-lang/rust-clippy/issues/9613 14 clippy::let_underscore_untyped, 15 clippy::match_wild_err_arm, 16 clippy::module_name_repetitions, 17 clippy::too_many_lines, 18 clippy::uninlined_format_args 19)] 20 21mod output; 22mod parse; 23mod write; 24 25use crate::parse::parse_xid_properties; 26use std::collections::{BTreeMap as Map, VecDeque}; 27use std::convert::TryFrom; 28use std::fs; 29use std::io::{self, Write}; 30use std::path::Path; 31use std::process; 32 33const CHUNK: usize = 64; 34const UCD: &str = "UCD"; 35const TABLES: &str = "src/tables.rs"; 36 37fn main() { 38 let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR")); 39 let unicode_ident_dir = manifest_dir.parent().unwrap(); 40 let ucd_dir = unicode_ident_dir.join(UCD); 41 let properties = parse_xid_properties(&ucd_dir); 42 43 let mut chunkmap = Map::<[u8; CHUNK], u8>::new(); 44 let mut dense = Vec::<[u8; CHUNK]>::new(); 45 let mut new_chunk = |chunk| { 46 if let Some(prev) = chunkmap.get(&chunk) { 47 *prev 48 } else { 49 dense.push(chunk); 50 let Ok(new) = u8::try_from(chunkmap.len()) else { 51 panic!("exceeded 256 unique chunks"); 52 }; 53 chunkmap.insert(chunk, new); 54 new 55 } 56 }; 57 58 let empty_chunk = [0u8; CHUNK]; 59 new_chunk(empty_chunk); 60 61 let mut index_start = Vec::<u8>::new(); 62 let mut index_continue = Vec::<u8>::new(); 63 for i in 0..(u32::from(char::MAX) + 1) / CHUNK as u32 / 8 { 64 let mut start_bits = empty_chunk; 65 let mut continue_bits = empty_chunk; 66 for j in 0..CHUNK as u32 { 67 let this_start = &mut start_bits[j as usize]; 68 let this_continue = &mut continue_bits[j as usize]; 69 for k in 0..8u32 { 70 let code = (i * CHUNK as u32 + j) * 8 + k; 71 if code >= 0x80 { 72 if let Some(ch) = char::from_u32(code) { 73 *this_start |= (properties.is_xid_start(ch) as u8) << k; 74 *this_continue |= (properties.is_xid_continue(ch) as u8) << k; 75 } 76 } 77 } 78 } 79 index_start.push(new_chunk(start_bits)); 80 index_continue.push(new_chunk(continue_bits)); 81 } 82 83 while let Some(0) = index_start.last() { 84 index_start.pop(); 85 } 86 while let Some(0) = index_continue.last() { 87 index_continue.pop(); 88 } 89 90 let mut halfchunkmap = Map::new(); 91 for chunk in &dense { 92 let mut front = [0u8; CHUNK / 2]; 93 let mut back = [0u8; CHUNK / 2]; 94 front.copy_from_slice(&chunk[..CHUNK / 2]); 95 back.copy_from_slice(&chunk[CHUNK / 2..]); 96 halfchunkmap 97 .entry(front) 98 .or_insert_with(VecDeque::new) 99 .push_back(back); 100 } 101 102 let mut halfdense = Vec::<u8>::new(); 103 let mut dense_to_halfdense = Map::<u8, u8>::new(); 104 for chunk in &dense { 105 let original_pos = chunkmap[chunk]; 106 if dense_to_halfdense.contains_key(&original_pos) { 107 continue; 108 } 109 let mut front = [0u8; CHUNK / 2]; 110 let mut back = [0u8; CHUNK / 2]; 111 front.copy_from_slice(&chunk[..CHUNK / 2]); 112 back.copy_from_slice(&chunk[CHUNK / 2..]); 113 dense_to_halfdense.insert( 114 original_pos, 115 match u8::try_from(halfdense.len() / (CHUNK / 2)) { 116 Ok(byte) => byte, 117 Err(_) => panic!("exceeded 256 half-chunks"), 118 }, 119 ); 120 halfdense.extend_from_slice(&front); 121 halfdense.extend_from_slice(&back); 122 while let Some(next) = halfchunkmap.get_mut(&back).and_then(VecDeque::pop_front) { 123 let mut concat = empty_chunk; 124 concat[..CHUNK / 2].copy_from_slice(&back); 125 concat[CHUNK / 2..].copy_from_slice(&next); 126 let original_pos = chunkmap[&concat]; 127 if dense_to_halfdense.contains_key(&original_pos) { 128 continue; 129 } 130 dense_to_halfdense.insert( 131 original_pos, 132 match u8::try_from(halfdense.len() / (CHUNK / 2) - 1) { 133 Ok(byte) => byte, 134 Err(_) => panic!("exceeded 256 half-chunks"), 135 }, 136 ); 137 halfdense.extend_from_slice(&next); 138 back = next; 139 } 140 } 141 142 for index in &mut index_start { 143 *index = dense_to_halfdense[index]; 144 } 145 for index in &mut index_continue { 146 *index = dense_to_halfdense[index]; 147 } 148 149 let out = write::output(&properties, &index_start, &index_continue, &halfdense); 150 let path = unicode_ident_dir.join(TABLES); 151 if let Err(err) = fs::write(&path, out) { 152 let _ = writeln!(io::stderr(), "{}: {err}", path.display()); 153 process::exit(1); 154 } 155} 156