106f54294Sopenharmony_ciuse std::fmt; 206f54294Sopenharmony_ci 306f54294Sopenharmony_ci/// A representation of byte oriented equivalence classes. 406f54294Sopenharmony_ci/// 506f54294Sopenharmony_ci/// This is used in an FSM to reduce the size of the transition table. This can 606f54294Sopenharmony_ci/// have a particularly large impact not only on the total size of an FSM, but 706f54294Sopenharmony_ci/// also on compile times. 806f54294Sopenharmony_ci#[derive(Clone, Copy)] 906f54294Sopenharmony_cipub struct ByteClasses([u8; 256]); 1006f54294Sopenharmony_ci 1106f54294Sopenharmony_ciimpl ByteClasses { 1206f54294Sopenharmony_ci /// Creates a new set of equivalence classes where all bytes are mapped to 1306f54294Sopenharmony_ci /// the same class. 1406f54294Sopenharmony_ci pub fn empty() -> ByteClasses { 1506f54294Sopenharmony_ci ByteClasses([0; 256]) 1606f54294Sopenharmony_ci } 1706f54294Sopenharmony_ci 1806f54294Sopenharmony_ci /// Creates a new set of equivalence classes where each byte belongs to 1906f54294Sopenharmony_ci /// its own equivalence class. 2006f54294Sopenharmony_ci pub fn singletons() -> ByteClasses { 2106f54294Sopenharmony_ci let mut classes = ByteClasses::empty(); 2206f54294Sopenharmony_ci for i in 0..256 { 2306f54294Sopenharmony_ci classes.set(i as u8, i as u8); 2406f54294Sopenharmony_ci } 2506f54294Sopenharmony_ci classes 2606f54294Sopenharmony_ci } 2706f54294Sopenharmony_ci 2806f54294Sopenharmony_ci /// Set the equivalence class for the given byte. 2906f54294Sopenharmony_ci #[inline] 3006f54294Sopenharmony_ci pub fn set(&mut self, byte: u8, class: u8) { 3106f54294Sopenharmony_ci self.0[byte as usize] = class; 3206f54294Sopenharmony_ci } 3306f54294Sopenharmony_ci 3406f54294Sopenharmony_ci /// Get the equivalence class for the given byte. 3506f54294Sopenharmony_ci #[inline] 3606f54294Sopenharmony_ci pub fn get(&self, byte: u8) -> u8 { 3706f54294Sopenharmony_ci // SAFETY: This is safe because all dense transitions have 3806f54294Sopenharmony_ci // exactly 256 elements, so all u8 values are valid indices. 3906f54294Sopenharmony_ci self.0[byte as usize] 4006f54294Sopenharmony_ci } 4106f54294Sopenharmony_ci 4206f54294Sopenharmony_ci /// Return the total number of elements in the alphabet represented by 4306f54294Sopenharmony_ci /// these equivalence classes. Equivalently, this returns the total number 4406f54294Sopenharmony_ci /// of equivalence classes. 4506f54294Sopenharmony_ci #[inline] 4606f54294Sopenharmony_ci pub fn alphabet_len(&self) -> usize { 4706f54294Sopenharmony_ci self.0[255] as usize + 1 4806f54294Sopenharmony_ci } 4906f54294Sopenharmony_ci 5006f54294Sopenharmony_ci /// Returns true if and only if every byte in this class maps to its own 5106f54294Sopenharmony_ci /// equivalence class. Equivalently, there are 256 equivalence classes 5206f54294Sopenharmony_ci /// and each class contains exactly one byte. 5306f54294Sopenharmony_ci #[inline] 5406f54294Sopenharmony_ci pub fn is_singleton(&self) -> bool { 5506f54294Sopenharmony_ci self.alphabet_len() == 256 5606f54294Sopenharmony_ci } 5706f54294Sopenharmony_ci 5806f54294Sopenharmony_ci /// Returns an iterator over a sequence of representative bytes from each 5906f54294Sopenharmony_ci /// equivalence class. Namely, this yields exactly N items, where N is 6006f54294Sopenharmony_ci /// equivalent to the number of equivalence classes. Each item is an 6106f54294Sopenharmony_ci /// arbitrary byte drawn from each equivalence class. 6206f54294Sopenharmony_ci /// 6306f54294Sopenharmony_ci /// This is useful when one is determinizing an NFA and the NFA's alphabet 6406f54294Sopenharmony_ci /// hasn't been converted to equivalence classes yet. Picking an arbitrary 6506f54294Sopenharmony_ci /// byte from each equivalence class then permits a full exploration of 6606f54294Sopenharmony_ci /// the NFA instead of using every possible byte value. 6706f54294Sopenharmony_ci pub fn representatives(&self) -> ByteClassRepresentatives<'_> { 6806f54294Sopenharmony_ci ByteClassRepresentatives { classes: self, byte: 0, last_class: None } 6906f54294Sopenharmony_ci } 7006f54294Sopenharmony_ci 7106f54294Sopenharmony_ci /// Returns all of the bytes in the given equivalence class. 7206f54294Sopenharmony_ci /// 7306f54294Sopenharmony_ci /// The second element in the tuple indicates the number of elements in 7406f54294Sopenharmony_ci /// the array. 7506f54294Sopenharmony_ci fn elements(&self, equiv: u8) -> ([u8; 256], usize) { 7606f54294Sopenharmony_ci let (mut array, mut len) = ([0; 256], 0); 7706f54294Sopenharmony_ci for b in 0..256 { 7806f54294Sopenharmony_ci if self.get(b as u8) == equiv { 7906f54294Sopenharmony_ci array[len] = b as u8; 8006f54294Sopenharmony_ci len += 1; 8106f54294Sopenharmony_ci } 8206f54294Sopenharmony_ci } 8306f54294Sopenharmony_ci (array, len) 8406f54294Sopenharmony_ci } 8506f54294Sopenharmony_ci} 8606f54294Sopenharmony_ci 8706f54294Sopenharmony_ciimpl fmt::Debug for ByteClasses { 8806f54294Sopenharmony_ci fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 8906f54294Sopenharmony_ci if self.is_singleton() { 9006f54294Sopenharmony_ci write!(f, "ByteClasses({{singletons}})") 9106f54294Sopenharmony_ci } else { 9206f54294Sopenharmony_ci write!(f, "ByteClasses(")?; 9306f54294Sopenharmony_ci for equiv in 0..self.alphabet_len() { 9406f54294Sopenharmony_ci let (members, len) = self.elements(equiv as u8); 9506f54294Sopenharmony_ci write!(f, " {} => {:?}", equiv, &members[..len])?; 9606f54294Sopenharmony_ci } 9706f54294Sopenharmony_ci write!(f, ")") 9806f54294Sopenharmony_ci } 9906f54294Sopenharmony_ci } 10006f54294Sopenharmony_ci} 10106f54294Sopenharmony_ci 10206f54294Sopenharmony_ci/// An iterator over representative bytes from each equivalence class. 10306f54294Sopenharmony_ci#[derive(Debug)] 10406f54294Sopenharmony_cipub struct ByteClassRepresentatives<'a> { 10506f54294Sopenharmony_ci classes: &'a ByteClasses, 10606f54294Sopenharmony_ci byte: usize, 10706f54294Sopenharmony_ci last_class: Option<u8>, 10806f54294Sopenharmony_ci} 10906f54294Sopenharmony_ci 11006f54294Sopenharmony_ciimpl<'a> Iterator for ByteClassRepresentatives<'a> { 11106f54294Sopenharmony_ci type Item = u8; 11206f54294Sopenharmony_ci 11306f54294Sopenharmony_ci fn next(&mut self) -> Option<u8> { 11406f54294Sopenharmony_ci while self.byte < 256 { 11506f54294Sopenharmony_ci let byte = self.byte as u8; 11606f54294Sopenharmony_ci let class = self.classes.get(byte); 11706f54294Sopenharmony_ci self.byte += 1; 11806f54294Sopenharmony_ci 11906f54294Sopenharmony_ci if self.last_class != Some(class) { 12006f54294Sopenharmony_ci self.last_class = Some(class); 12106f54294Sopenharmony_ci return Some(byte); 12206f54294Sopenharmony_ci } 12306f54294Sopenharmony_ci } 12406f54294Sopenharmony_ci None 12506f54294Sopenharmony_ci } 12606f54294Sopenharmony_ci} 12706f54294Sopenharmony_ci 12806f54294Sopenharmony_ci/// A byte class builder keeps track of an *approximation* of equivalence 12906f54294Sopenharmony_ci/// classes of bytes during NFA construction. That is, every byte in an 13006f54294Sopenharmony_ci/// equivalence class cannot discriminate between a match and a non-match. 13106f54294Sopenharmony_ci/// 13206f54294Sopenharmony_ci/// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w] 13306f54294Sopenharmony_ci/// and [{-\xFF] never discriminate between a match and a non-match, precisely 13406f54294Sopenharmony_ci/// because they never occur in the literals anywhere. 13506f54294Sopenharmony_ci/// 13606f54294Sopenharmony_ci/// Note though that this does not necessarily compute the minimal set of 13706f54294Sopenharmony_ci/// equivalence classes. For example, in the literals above, the byte ranges 13806f54294Sopenharmony_ci/// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence 13906f54294Sopenharmony_ci/// classes even though they could be treated a single class. The reason for 14006f54294Sopenharmony_ci/// this is implementation complexity. In the future, we should endeavor to 14106f54294Sopenharmony_ci/// compute the minimal equivalence classes since they can have a rather large 14206f54294Sopenharmony_ci/// impact on the size of the DFA. 14306f54294Sopenharmony_ci/// 14406f54294Sopenharmony_ci/// The representation here is 256 booleans, all initially set to false. Each 14506f54294Sopenharmony_ci/// boolean maps to its corresponding byte based on position. A `true` value 14606f54294Sopenharmony_ci/// indicates the end of an equivalence class, where its corresponding byte 14706f54294Sopenharmony_ci/// and all of the bytes corresponding to all previous contiguous `false` 14806f54294Sopenharmony_ci/// values are in the same equivalence class. 14906f54294Sopenharmony_ci/// 15006f54294Sopenharmony_ci/// This particular representation only permits contiguous ranges of bytes to 15106f54294Sopenharmony_ci/// be in the same equivalence class, which means that we can never discover 15206f54294Sopenharmony_ci/// the true minimal set of equivalence classes. 15306f54294Sopenharmony_ci#[derive(Debug)] 15406f54294Sopenharmony_cipub struct ByteClassBuilder(Vec<bool>); 15506f54294Sopenharmony_ci 15606f54294Sopenharmony_ciimpl ByteClassBuilder { 15706f54294Sopenharmony_ci /// Create a new builder of byte classes where all bytes are part of the 15806f54294Sopenharmony_ci /// same equivalence class. 15906f54294Sopenharmony_ci pub fn new() -> ByteClassBuilder { 16006f54294Sopenharmony_ci ByteClassBuilder(vec![false; 256]) 16106f54294Sopenharmony_ci } 16206f54294Sopenharmony_ci 16306f54294Sopenharmony_ci /// Indicate the the range of byte given (inclusive) can discriminate a 16406f54294Sopenharmony_ci /// match between it and all other bytes outside of the range. 16506f54294Sopenharmony_ci pub fn set_range(&mut self, start: u8, end: u8) { 16606f54294Sopenharmony_ci debug_assert!(start <= end); 16706f54294Sopenharmony_ci if start > 0 { 16806f54294Sopenharmony_ci self.0[start as usize - 1] = true; 16906f54294Sopenharmony_ci } 17006f54294Sopenharmony_ci self.0[end as usize] = true; 17106f54294Sopenharmony_ci } 17206f54294Sopenharmony_ci 17306f54294Sopenharmony_ci /// Build byte classes that map all byte values to their corresponding 17406f54294Sopenharmony_ci /// equivalence class. The last mapping indicates the largest equivalence 17506f54294Sopenharmony_ci /// class identifier (which is never bigger than 255). 17606f54294Sopenharmony_ci pub fn build(&self) -> ByteClasses { 17706f54294Sopenharmony_ci let mut classes = ByteClasses::empty(); 17806f54294Sopenharmony_ci let mut class = 0u8; 17906f54294Sopenharmony_ci let mut i = 0; 18006f54294Sopenharmony_ci loop { 18106f54294Sopenharmony_ci classes.set(i as u8, class as u8); 18206f54294Sopenharmony_ci if i >= 255 { 18306f54294Sopenharmony_ci break; 18406f54294Sopenharmony_ci } 18506f54294Sopenharmony_ci if self.0[i] { 18606f54294Sopenharmony_ci class = class.checked_add(1).unwrap(); 18706f54294Sopenharmony_ci } 18806f54294Sopenharmony_ci i += 1; 18906f54294Sopenharmony_ci } 19006f54294Sopenharmony_ci classes 19106f54294Sopenharmony_ci } 19206f54294Sopenharmony_ci} 19306f54294Sopenharmony_ci 19406f54294Sopenharmony_ci#[cfg(test)] 19506f54294Sopenharmony_cimod tests { 19606f54294Sopenharmony_ci use super::*; 19706f54294Sopenharmony_ci 19806f54294Sopenharmony_ci #[test] 19906f54294Sopenharmony_ci fn byte_classes() { 20006f54294Sopenharmony_ci let mut set = ByteClassBuilder::new(); 20106f54294Sopenharmony_ci set.set_range(b'a', b'z'); 20206f54294Sopenharmony_ci 20306f54294Sopenharmony_ci let classes = set.build(); 20406f54294Sopenharmony_ci assert_eq!(classes.get(0), 0); 20506f54294Sopenharmony_ci assert_eq!(classes.get(1), 0); 20606f54294Sopenharmony_ci assert_eq!(classes.get(2), 0); 20706f54294Sopenharmony_ci assert_eq!(classes.get(b'a' - 1), 0); 20806f54294Sopenharmony_ci assert_eq!(classes.get(b'a'), 1); 20906f54294Sopenharmony_ci assert_eq!(classes.get(b'm'), 1); 21006f54294Sopenharmony_ci assert_eq!(classes.get(b'z'), 1); 21106f54294Sopenharmony_ci assert_eq!(classes.get(b'z' + 1), 2); 21206f54294Sopenharmony_ci assert_eq!(classes.get(254), 2); 21306f54294Sopenharmony_ci assert_eq!(classes.get(255), 2); 21406f54294Sopenharmony_ci 21506f54294Sopenharmony_ci let mut set = ByteClassBuilder::new(); 21606f54294Sopenharmony_ci set.set_range(0, 2); 21706f54294Sopenharmony_ci set.set_range(4, 6); 21806f54294Sopenharmony_ci let classes = set.build(); 21906f54294Sopenharmony_ci assert_eq!(classes.get(0), 0); 22006f54294Sopenharmony_ci assert_eq!(classes.get(1), 0); 22106f54294Sopenharmony_ci assert_eq!(classes.get(2), 0); 22206f54294Sopenharmony_ci assert_eq!(classes.get(3), 1); 22306f54294Sopenharmony_ci assert_eq!(classes.get(4), 2); 22406f54294Sopenharmony_ci assert_eq!(classes.get(5), 2); 22506f54294Sopenharmony_ci assert_eq!(classes.get(6), 2); 22606f54294Sopenharmony_ci assert_eq!(classes.get(7), 3); 22706f54294Sopenharmony_ci assert_eq!(classes.get(255), 3); 22806f54294Sopenharmony_ci } 22906f54294Sopenharmony_ci 23006f54294Sopenharmony_ci #[test] 23106f54294Sopenharmony_ci fn full_byte_classes() { 23206f54294Sopenharmony_ci let mut set = ByteClassBuilder::new(); 23306f54294Sopenharmony_ci for i in 0..256u16 { 23406f54294Sopenharmony_ci set.set_range(i as u8, i as u8); 23506f54294Sopenharmony_ci } 23606f54294Sopenharmony_ci assert_eq!(set.build().alphabet_len(), 256); 23706f54294Sopenharmony_ci } 23806f54294Sopenharmony_ci} 239