106f54294Sopenharmony_ciuse std::fmt;
206f54294Sopenharmony_ci
306f54294Sopenharmony_ci/// A representation of byte oriented equivalence classes.
406f54294Sopenharmony_ci///
506f54294Sopenharmony_ci/// This is used in an FSM to reduce the size of the transition table. This can
606f54294Sopenharmony_ci/// have a particularly large impact not only on the total size of an FSM, but
706f54294Sopenharmony_ci/// also on compile times.
806f54294Sopenharmony_ci#[derive(Clone, Copy)]
906f54294Sopenharmony_cipub struct ByteClasses([u8; 256]);
1006f54294Sopenharmony_ci
1106f54294Sopenharmony_ciimpl ByteClasses {
1206f54294Sopenharmony_ci    /// Creates a new set of equivalence classes where all bytes are mapped to
1306f54294Sopenharmony_ci    /// the same class.
1406f54294Sopenharmony_ci    pub fn empty() -> ByteClasses {
1506f54294Sopenharmony_ci        ByteClasses([0; 256])
1606f54294Sopenharmony_ci    }
1706f54294Sopenharmony_ci
1806f54294Sopenharmony_ci    /// Creates a new set of equivalence classes where each byte belongs to
1906f54294Sopenharmony_ci    /// its own equivalence class.
2006f54294Sopenharmony_ci    pub fn singletons() -> ByteClasses {
2106f54294Sopenharmony_ci        let mut classes = ByteClasses::empty();
2206f54294Sopenharmony_ci        for i in 0..256 {
2306f54294Sopenharmony_ci            classes.set(i as u8, i as u8);
2406f54294Sopenharmony_ci        }
2506f54294Sopenharmony_ci        classes
2606f54294Sopenharmony_ci    }
2706f54294Sopenharmony_ci
2806f54294Sopenharmony_ci    /// Set the equivalence class for the given byte.
2906f54294Sopenharmony_ci    #[inline]
3006f54294Sopenharmony_ci    pub fn set(&mut self, byte: u8, class: u8) {
3106f54294Sopenharmony_ci        self.0[byte as usize] = class;
3206f54294Sopenharmony_ci    }
3306f54294Sopenharmony_ci
3406f54294Sopenharmony_ci    /// Get the equivalence class for the given byte.
3506f54294Sopenharmony_ci    #[inline]
3606f54294Sopenharmony_ci    pub fn get(&self, byte: u8) -> u8 {
3706f54294Sopenharmony_ci        // SAFETY: This is safe because all dense transitions have
3806f54294Sopenharmony_ci        // exactly 256 elements, so all u8 values are valid indices.
3906f54294Sopenharmony_ci        self.0[byte as usize]
4006f54294Sopenharmony_ci    }
4106f54294Sopenharmony_ci
4206f54294Sopenharmony_ci    /// Return the total number of elements in the alphabet represented by
4306f54294Sopenharmony_ci    /// these equivalence classes. Equivalently, this returns the total number
4406f54294Sopenharmony_ci    /// of equivalence classes.
4506f54294Sopenharmony_ci    #[inline]
4606f54294Sopenharmony_ci    pub fn alphabet_len(&self) -> usize {
4706f54294Sopenharmony_ci        self.0[255] as usize + 1
4806f54294Sopenharmony_ci    }
4906f54294Sopenharmony_ci
5006f54294Sopenharmony_ci    /// Returns true if and only if every byte in this class maps to its own
5106f54294Sopenharmony_ci    /// equivalence class. Equivalently, there are 256 equivalence classes
5206f54294Sopenharmony_ci    /// and each class contains exactly one byte.
5306f54294Sopenharmony_ci    #[inline]
5406f54294Sopenharmony_ci    pub fn is_singleton(&self) -> bool {
5506f54294Sopenharmony_ci        self.alphabet_len() == 256
5606f54294Sopenharmony_ci    }
5706f54294Sopenharmony_ci
5806f54294Sopenharmony_ci    /// Returns an iterator over a sequence of representative bytes from each
5906f54294Sopenharmony_ci    /// equivalence class. Namely, this yields exactly N items, where N is
6006f54294Sopenharmony_ci    /// equivalent to the number of equivalence classes. Each item is an
6106f54294Sopenharmony_ci    /// arbitrary byte drawn from each equivalence class.
6206f54294Sopenharmony_ci    ///
6306f54294Sopenharmony_ci    /// This is useful when one is determinizing an NFA and the NFA's alphabet
6406f54294Sopenharmony_ci    /// hasn't been converted to equivalence classes yet. Picking an arbitrary
6506f54294Sopenharmony_ci    /// byte from each equivalence class then permits a full exploration of
6606f54294Sopenharmony_ci    /// the NFA instead of using every possible byte value.
6706f54294Sopenharmony_ci    pub fn representatives(&self) -> ByteClassRepresentatives<'_> {
6806f54294Sopenharmony_ci        ByteClassRepresentatives { classes: self, byte: 0, last_class: None }
6906f54294Sopenharmony_ci    }
7006f54294Sopenharmony_ci
7106f54294Sopenharmony_ci    /// Returns all of the bytes in the given equivalence class.
7206f54294Sopenharmony_ci    ///
7306f54294Sopenharmony_ci    /// The second element in the tuple indicates the number of elements in
7406f54294Sopenharmony_ci    /// the array.
7506f54294Sopenharmony_ci    fn elements(&self, equiv: u8) -> ([u8; 256], usize) {
7606f54294Sopenharmony_ci        let (mut array, mut len) = ([0; 256], 0);
7706f54294Sopenharmony_ci        for b in 0..256 {
7806f54294Sopenharmony_ci            if self.get(b as u8) == equiv {
7906f54294Sopenharmony_ci                array[len] = b as u8;
8006f54294Sopenharmony_ci                len += 1;
8106f54294Sopenharmony_ci            }
8206f54294Sopenharmony_ci        }
8306f54294Sopenharmony_ci        (array, len)
8406f54294Sopenharmony_ci    }
8506f54294Sopenharmony_ci}
8606f54294Sopenharmony_ci
8706f54294Sopenharmony_ciimpl fmt::Debug for ByteClasses {
8806f54294Sopenharmony_ci    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
8906f54294Sopenharmony_ci        if self.is_singleton() {
9006f54294Sopenharmony_ci            write!(f, "ByteClasses({{singletons}})")
9106f54294Sopenharmony_ci        } else {
9206f54294Sopenharmony_ci            write!(f, "ByteClasses(")?;
9306f54294Sopenharmony_ci            for equiv in 0..self.alphabet_len() {
9406f54294Sopenharmony_ci                let (members, len) = self.elements(equiv as u8);
9506f54294Sopenharmony_ci                write!(f, " {} => {:?}", equiv, &members[..len])?;
9606f54294Sopenharmony_ci            }
9706f54294Sopenharmony_ci            write!(f, ")")
9806f54294Sopenharmony_ci        }
9906f54294Sopenharmony_ci    }
10006f54294Sopenharmony_ci}
10106f54294Sopenharmony_ci
10206f54294Sopenharmony_ci/// An iterator over representative bytes from each equivalence class.
10306f54294Sopenharmony_ci#[derive(Debug)]
10406f54294Sopenharmony_cipub struct ByteClassRepresentatives<'a> {
10506f54294Sopenharmony_ci    classes: &'a ByteClasses,
10606f54294Sopenharmony_ci    byte: usize,
10706f54294Sopenharmony_ci    last_class: Option<u8>,
10806f54294Sopenharmony_ci}
10906f54294Sopenharmony_ci
11006f54294Sopenharmony_ciimpl<'a> Iterator for ByteClassRepresentatives<'a> {
11106f54294Sopenharmony_ci    type Item = u8;
11206f54294Sopenharmony_ci
11306f54294Sopenharmony_ci    fn next(&mut self) -> Option<u8> {
11406f54294Sopenharmony_ci        while self.byte < 256 {
11506f54294Sopenharmony_ci            let byte = self.byte as u8;
11606f54294Sopenharmony_ci            let class = self.classes.get(byte);
11706f54294Sopenharmony_ci            self.byte += 1;
11806f54294Sopenharmony_ci
11906f54294Sopenharmony_ci            if self.last_class != Some(class) {
12006f54294Sopenharmony_ci                self.last_class = Some(class);
12106f54294Sopenharmony_ci                return Some(byte);
12206f54294Sopenharmony_ci            }
12306f54294Sopenharmony_ci        }
12406f54294Sopenharmony_ci        None
12506f54294Sopenharmony_ci    }
12606f54294Sopenharmony_ci}
12706f54294Sopenharmony_ci
12806f54294Sopenharmony_ci/// A byte class builder keeps track of an *approximation* of equivalence
12906f54294Sopenharmony_ci/// classes of bytes during NFA construction. That is, every byte in an
13006f54294Sopenharmony_ci/// equivalence class cannot discriminate between a match and a non-match.
13106f54294Sopenharmony_ci///
13206f54294Sopenharmony_ci/// For example, in the literals `abc` and `xyz`, the bytes [\x00-`], [d-w]
13306f54294Sopenharmony_ci/// and [{-\xFF] never discriminate between a match and a non-match, precisely
13406f54294Sopenharmony_ci/// because they never occur in the literals anywhere.
13506f54294Sopenharmony_ci///
13606f54294Sopenharmony_ci/// Note though that this does not necessarily compute the minimal set of
13706f54294Sopenharmony_ci/// equivalence classes. For example, in the literals above, the byte ranges
13806f54294Sopenharmony_ci/// [\x00-`], [d-w] and [{-\xFF] are all treated as distinct equivalence
13906f54294Sopenharmony_ci/// classes even though they could be treated a single class. The reason for
14006f54294Sopenharmony_ci/// this is implementation complexity. In the future, we should endeavor to
14106f54294Sopenharmony_ci/// compute the minimal equivalence classes since they can have a rather large
14206f54294Sopenharmony_ci/// impact on the size of the DFA.
14306f54294Sopenharmony_ci///
14406f54294Sopenharmony_ci/// The representation here is 256 booleans, all initially set to false. Each
14506f54294Sopenharmony_ci/// boolean maps to its corresponding byte based on position. A `true` value
14606f54294Sopenharmony_ci/// indicates the end of an equivalence class, where its corresponding byte
14706f54294Sopenharmony_ci/// and all of the bytes corresponding to all previous contiguous `false`
14806f54294Sopenharmony_ci/// values are in the same equivalence class.
14906f54294Sopenharmony_ci///
15006f54294Sopenharmony_ci/// This particular representation only permits contiguous ranges of bytes to
15106f54294Sopenharmony_ci/// be in the same equivalence class, which means that we can never discover
15206f54294Sopenharmony_ci/// the true minimal set of equivalence classes.
15306f54294Sopenharmony_ci#[derive(Debug)]
15406f54294Sopenharmony_cipub struct ByteClassBuilder(Vec<bool>);
15506f54294Sopenharmony_ci
15606f54294Sopenharmony_ciimpl ByteClassBuilder {
15706f54294Sopenharmony_ci    /// Create a new builder of byte classes where all bytes are part of the
15806f54294Sopenharmony_ci    /// same equivalence class.
15906f54294Sopenharmony_ci    pub fn new() -> ByteClassBuilder {
16006f54294Sopenharmony_ci        ByteClassBuilder(vec![false; 256])
16106f54294Sopenharmony_ci    }
16206f54294Sopenharmony_ci
16306f54294Sopenharmony_ci    /// Indicate the the range of byte given (inclusive) can discriminate a
16406f54294Sopenharmony_ci    /// match between it and all other bytes outside of the range.
16506f54294Sopenharmony_ci    pub fn set_range(&mut self, start: u8, end: u8) {
16606f54294Sopenharmony_ci        debug_assert!(start <= end);
16706f54294Sopenharmony_ci        if start > 0 {
16806f54294Sopenharmony_ci            self.0[start as usize - 1] = true;
16906f54294Sopenharmony_ci        }
17006f54294Sopenharmony_ci        self.0[end as usize] = true;
17106f54294Sopenharmony_ci    }
17206f54294Sopenharmony_ci
17306f54294Sopenharmony_ci    /// Build byte classes that map all byte values to their corresponding
17406f54294Sopenharmony_ci    /// equivalence class. The last mapping indicates the largest equivalence
17506f54294Sopenharmony_ci    /// class identifier (which is never bigger than 255).
17606f54294Sopenharmony_ci    pub fn build(&self) -> ByteClasses {
17706f54294Sopenharmony_ci        let mut classes = ByteClasses::empty();
17806f54294Sopenharmony_ci        let mut class = 0u8;
17906f54294Sopenharmony_ci        let mut i = 0;
18006f54294Sopenharmony_ci        loop {
18106f54294Sopenharmony_ci            classes.set(i as u8, class as u8);
18206f54294Sopenharmony_ci            if i >= 255 {
18306f54294Sopenharmony_ci                break;
18406f54294Sopenharmony_ci            }
18506f54294Sopenharmony_ci            if self.0[i] {
18606f54294Sopenharmony_ci                class = class.checked_add(1).unwrap();
18706f54294Sopenharmony_ci            }
18806f54294Sopenharmony_ci            i += 1;
18906f54294Sopenharmony_ci        }
19006f54294Sopenharmony_ci        classes
19106f54294Sopenharmony_ci    }
19206f54294Sopenharmony_ci}
19306f54294Sopenharmony_ci
19406f54294Sopenharmony_ci#[cfg(test)]
19506f54294Sopenharmony_cimod tests {
19606f54294Sopenharmony_ci    use super::*;
19706f54294Sopenharmony_ci
19806f54294Sopenharmony_ci    #[test]
19906f54294Sopenharmony_ci    fn byte_classes() {
20006f54294Sopenharmony_ci        let mut set = ByteClassBuilder::new();
20106f54294Sopenharmony_ci        set.set_range(b'a', b'z');
20206f54294Sopenharmony_ci
20306f54294Sopenharmony_ci        let classes = set.build();
20406f54294Sopenharmony_ci        assert_eq!(classes.get(0), 0);
20506f54294Sopenharmony_ci        assert_eq!(classes.get(1), 0);
20606f54294Sopenharmony_ci        assert_eq!(classes.get(2), 0);
20706f54294Sopenharmony_ci        assert_eq!(classes.get(b'a' - 1), 0);
20806f54294Sopenharmony_ci        assert_eq!(classes.get(b'a'), 1);
20906f54294Sopenharmony_ci        assert_eq!(classes.get(b'm'), 1);
21006f54294Sopenharmony_ci        assert_eq!(classes.get(b'z'), 1);
21106f54294Sopenharmony_ci        assert_eq!(classes.get(b'z' + 1), 2);
21206f54294Sopenharmony_ci        assert_eq!(classes.get(254), 2);
21306f54294Sopenharmony_ci        assert_eq!(classes.get(255), 2);
21406f54294Sopenharmony_ci
21506f54294Sopenharmony_ci        let mut set = ByteClassBuilder::new();
21606f54294Sopenharmony_ci        set.set_range(0, 2);
21706f54294Sopenharmony_ci        set.set_range(4, 6);
21806f54294Sopenharmony_ci        let classes = set.build();
21906f54294Sopenharmony_ci        assert_eq!(classes.get(0), 0);
22006f54294Sopenharmony_ci        assert_eq!(classes.get(1), 0);
22106f54294Sopenharmony_ci        assert_eq!(classes.get(2), 0);
22206f54294Sopenharmony_ci        assert_eq!(classes.get(3), 1);
22306f54294Sopenharmony_ci        assert_eq!(classes.get(4), 2);
22406f54294Sopenharmony_ci        assert_eq!(classes.get(5), 2);
22506f54294Sopenharmony_ci        assert_eq!(classes.get(6), 2);
22606f54294Sopenharmony_ci        assert_eq!(classes.get(7), 3);
22706f54294Sopenharmony_ci        assert_eq!(classes.get(255), 3);
22806f54294Sopenharmony_ci    }
22906f54294Sopenharmony_ci
23006f54294Sopenharmony_ci    #[test]
23106f54294Sopenharmony_ci    fn full_byte_classes() {
23206f54294Sopenharmony_ci        let mut set = ByteClassBuilder::new();
23306f54294Sopenharmony_ci        for i in 0..256u16 {
23406f54294Sopenharmony_ci            set.set_range(i as u8, i as u8);
23506f54294Sopenharmony_ci        }
23606f54294Sopenharmony_ci        assert_eq!(set.build().alphabet_len(), 256);
23706f54294Sopenharmony_ci    }
23806f54294Sopenharmony_ci}
239