1c67d6573Sopenharmony_ci/// A few elementary UTF-8 encoding and decoding functions used by the matching
2c67d6573Sopenharmony_ci/// engines.
3c67d6573Sopenharmony_ci///
4c67d6573Sopenharmony_ci/// In an ideal world, the matching engines operate on `&str` and we can just
5c67d6573Sopenharmony_ci/// lean on the standard library for all our UTF-8 needs. However, to support
6c67d6573Sopenharmony_ci/// byte based regexes (that can match on arbitrary bytes which may contain
7c67d6573Sopenharmony_ci/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`.
8c67d6573Sopenharmony_ci/// The standard library doesn't really recognize this use case, so we have
9c67d6573Sopenharmony_ci/// to build it out ourselves.
10c67d6573Sopenharmony_ci///
11c67d6573Sopenharmony_ci/// Should this be factored out into a separate crate? It seems independently
12c67d6573Sopenharmony_ci/// useful. There are other crates that already exist (e.g., `utf-8`) that have
13c67d6573Sopenharmony_ci/// overlapping use cases. Not sure what to do.
14c67d6573Sopenharmony_ciuse std::char;
15c67d6573Sopenharmony_ci
16c67d6573Sopenharmony_ciconst TAG_CONT: u8 = 0b1000_0000;
17c67d6573Sopenharmony_ciconst TAG_TWO: u8 = 0b1100_0000;
18c67d6573Sopenharmony_ciconst TAG_THREE: u8 = 0b1110_0000;
19c67d6573Sopenharmony_ciconst TAG_FOUR: u8 = 0b1111_0000;
20c67d6573Sopenharmony_ci
21c67d6573Sopenharmony_ci/// Returns the smallest possible index of the next valid UTF-8 sequence
22c67d6573Sopenharmony_ci/// starting after `i`.
23c67d6573Sopenharmony_cipub fn next_utf8(text: &[u8], i: usize) -> usize {
24c67d6573Sopenharmony_ci    let b = match text.get(i) {
25c67d6573Sopenharmony_ci        None => return i + 1,
26c67d6573Sopenharmony_ci        Some(&b) => b,
27c67d6573Sopenharmony_ci    };
28c67d6573Sopenharmony_ci    let inc = if b <= 0x7F {
29c67d6573Sopenharmony_ci        1
30c67d6573Sopenharmony_ci    } else if b <= 0b110_11111 {
31c67d6573Sopenharmony_ci        2
32c67d6573Sopenharmony_ci    } else if b <= 0b1110_1111 {
33c67d6573Sopenharmony_ci        3
34c67d6573Sopenharmony_ci    } else {
35c67d6573Sopenharmony_ci        4
36c67d6573Sopenharmony_ci    };
37c67d6573Sopenharmony_ci    i + inc
38c67d6573Sopenharmony_ci}
39c67d6573Sopenharmony_ci
40c67d6573Sopenharmony_ci/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`.
41c67d6573Sopenharmony_ci///
42c67d6573Sopenharmony_ci/// If no valid UTF-8 sequence could be found, then `None` is returned.
43c67d6573Sopenharmony_ci/// Otherwise, the decoded codepoint and the number of bytes read is returned.
44c67d6573Sopenharmony_ci/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be
45c67d6573Sopenharmony_ci/// 1, 2, 3 or 4.
46c67d6573Sopenharmony_ci///
47c67d6573Sopenharmony_ci/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a
48c67d6573Sopenharmony_ci/// codepoint that is out of range (surrogate codepoints are out of range) or
49c67d6573Sopenharmony_ci/// is not the shortest possible UTF-8 sequence for that codepoint.
50c67d6573Sopenharmony_ci#[inline]
51c67d6573Sopenharmony_cipub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> {
52c67d6573Sopenharmony_ci    let b0 = match src.get(0) {
53c67d6573Sopenharmony_ci        None => return None,
54c67d6573Sopenharmony_ci        Some(&b) if b <= 0x7F => return Some((b as char, 1)),
55c67d6573Sopenharmony_ci        Some(&b) => b,
56c67d6573Sopenharmony_ci    };
57c67d6573Sopenharmony_ci    match b0 {
58c67d6573Sopenharmony_ci        0b110_00000..=0b110_11111 => {
59c67d6573Sopenharmony_ci            if src.len() < 2 {
60c67d6573Sopenharmony_ci                return None;
61c67d6573Sopenharmony_ci            }
62c67d6573Sopenharmony_ci            let b1 = src[1];
63c67d6573Sopenharmony_ci            if 0b11_000000 & b1 != TAG_CONT {
64c67d6573Sopenharmony_ci                return None;
65c67d6573Sopenharmony_ci            }
66c67d6573Sopenharmony_ci            let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32);
67c67d6573Sopenharmony_ci            match cp {
68c67d6573Sopenharmony_ci                0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)),
69c67d6573Sopenharmony_ci                _ => None,
70c67d6573Sopenharmony_ci            }
71c67d6573Sopenharmony_ci        }
72c67d6573Sopenharmony_ci        0b1110_0000..=0b1110_1111 => {
73c67d6573Sopenharmony_ci            if src.len() < 3 {
74c67d6573Sopenharmony_ci                return None;
75c67d6573Sopenharmony_ci            }
76c67d6573Sopenharmony_ci            let (b1, b2) = (src[1], src[2]);
77c67d6573Sopenharmony_ci            if 0b11_000000 & b1 != TAG_CONT {
78c67d6573Sopenharmony_ci                return None;
79c67d6573Sopenharmony_ci            }
80c67d6573Sopenharmony_ci            if 0b11_000000 & b2 != TAG_CONT {
81c67d6573Sopenharmony_ci                return None;
82c67d6573Sopenharmony_ci            }
83c67d6573Sopenharmony_ci            let cp = ((b0 & !TAG_THREE) as u32) << 12
84c67d6573Sopenharmony_ci                | ((b1 & !TAG_CONT) as u32) << 6
85c67d6573Sopenharmony_ci                | ((b2 & !TAG_CONT) as u32);
86c67d6573Sopenharmony_ci            match cp {
87c67d6573Sopenharmony_ci                // char::from_u32 will disallow surrogate codepoints.
88c67d6573Sopenharmony_ci                0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)),
89c67d6573Sopenharmony_ci                _ => None,
90c67d6573Sopenharmony_ci            }
91c67d6573Sopenharmony_ci        }
92c67d6573Sopenharmony_ci        0b11110_000..=0b11110_111 => {
93c67d6573Sopenharmony_ci            if src.len() < 4 {
94c67d6573Sopenharmony_ci                return None;
95c67d6573Sopenharmony_ci            }
96c67d6573Sopenharmony_ci            let (b1, b2, b3) = (src[1], src[2], src[3]);
97c67d6573Sopenharmony_ci            if 0b11_000000 & b1 != TAG_CONT {
98c67d6573Sopenharmony_ci                return None;
99c67d6573Sopenharmony_ci            }
100c67d6573Sopenharmony_ci            if 0b11_000000 & b2 != TAG_CONT {
101c67d6573Sopenharmony_ci                return None;
102c67d6573Sopenharmony_ci            }
103c67d6573Sopenharmony_ci            if 0b11_000000 & b3 != TAG_CONT {
104c67d6573Sopenharmony_ci                return None;
105c67d6573Sopenharmony_ci            }
106c67d6573Sopenharmony_ci            let cp = ((b0 & !TAG_FOUR) as u32) << 18
107c67d6573Sopenharmony_ci                | ((b1 & !TAG_CONT) as u32) << 12
108c67d6573Sopenharmony_ci                | ((b2 & !TAG_CONT) as u32) << 6
109c67d6573Sopenharmony_ci                | ((b3 & !TAG_CONT) as u32);
110c67d6573Sopenharmony_ci            match cp {
111c67d6573Sopenharmony_ci                0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)),
112c67d6573Sopenharmony_ci                _ => None,
113c67d6573Sopenharmony_ci            }
114c67d6573Sopenharmony_ci        }
115c67d6573Sopenharmony_ci        _ => None,
116c67d6573Sopenharmony_ci    }
117c67d6573Sopenharmony_ci}
118c67d6573Sopenharmony_ci
119c67d6573Sopenharmony_ci/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead
120c67d6573Sopenharmony_ci/// of the first.
121c67d6573Sopenharmony_cipub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> {
122c67d6573Sopenharmony_ci    if src.is_empty() {
123c67d6573Sopenharmony_ci        return None;
124c67d6573Sopenharmony_ci    }
125c67d6573Sopenharmony_ci    let mut start = src.len() - 1;
126c67d6573Sopenharmony_ci    if src[start] <= 0x7F {
127c67d6573Sopenharmony_ci        return Some((src[start] as char, 1));
128c67d6573Sopenharmony_ci    }
129c67d6573Sopenharmony_ci    while start > src.len().saturating_sub(4) {
130c67d6573Sopenharmony_ci        start -= 1;
131c67d6573Sopenharmony_ci        if is_start_byte(src[start]) {
132c67d6573Sopenharmony_ci            break;
133c67d6573Sopenharmony_ci        }
134c67d6573Sopenharmony_ci    }
135c67d6573Sopenharmony_ci    match decode_utf8(&src[start..]) {
136c67d6573Sopenharmony_ci        None => None,
137c67d6573Sopenharmony_ci        Some((_, n)) if n < src.len() - start => None,
138c67d6573Sopenharmony_ci        Some((cp, n)) => Some((cp, n)),
139c67d6573Sopenharmony_ci    }
140c67d6573Sopenharmony_ci}
141c67d6573Sopenharmony_ci
142c67d6573Sopenharmony_cifn is_start_byte(b: u8) -> bool {
143c67d6573Sopenharmony_ci    b & 0b11_000000 != 0b1_0000000
144c67d6573Sopenharmony_ci}
145c67d6573Sopenharmony_ci
146c67d6573Sopenharmony_ci#[cfg(test)]
147c67d6573Sopenharmony_cimod tests {
148c67d6573Sopenharmony_ci    use std::str;
149c67d6573Sopenharmony_ci
150c67d6573Sopenharmony_ci    use quickcheck::quickcheck;
151c67d6573Sopenharmony_ci
152c67d6573Sopenharmony_ci    use super::{
153c67d6573Sopenharmony_ci        decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO,
154c67d6573Sopenharmony_ci    };
155c67d6573Sopenharmony_ci
156c67d6573Sopenharmony_ci    #[test]
157c67d6573Sopenharmony_ci    fn prop_roundtrip() {
158c67d6573Sopenharmony_ci        fn p(given_cp: char) -> bool {
159c67d6573Sopenharmony_ci            let mut tmp = [0; 4];
160c67d6573Sopenharmony_ci            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
161c67d6573Sopenharmony_ci            let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap();
162c67d6573Sopenharmony_ci            encoded_len == got_len && given_cp == got_cp
163c67d6573Sopenharmony_ci        }
164c67d6573Sopenharmony_ci        quickcheck(p as fn(char) -> bool)
165c67d6573Sopenharmony_ci    }
166c67d6573Sopenharmony_ci
167c67d6573Sopenharmony_ci    #[test]
168c67d6573Sopenharmony_ci    fn prop_roundtrip_last() {
169c67d6573Sopenharmony_ci        fn p(given_cp: char) -> bool {
170c67d6573Sopenharmony_ci            let mut tmp = [0; 4];
171c67d6573Sopenharmony_ci            let encoded_len = given_cp.encode_utf8(&mut tmp).len();
172c67d6573Sopenharmony_ci            let (got_cp, got_len) =
173c67d6573Sopenharmony_ci                decode_last_utf8(&tmp[..encoded_len]).unwrap();
174c67d6573Sopenharmony_ci            encoded_len == got_len && given_cp == got_cp
175c67d6573Sopenharmony_ci        }
176c67d6573Sopenharmony_ci        quickcheck(p as fn(char) -> bool)
177c67d6573Sopenharmony_ci    }
178c67d6573Sopenharmony_ci
179c67d6573Sopenharmony_ci    #[test]
180c67d6573Sopenharmony_ci    fn prop_encode_matches_std() {
181c67d6573Sopenharmony_ci        fn p(cp: char) -> bool {
182c67d6573Sopenharmony_ci            let mut got = [0; 4];
183c67d6573Sopenharmony_ci            let n = cp.encode_utf8(&mut got).len();
184c67d6573Sopenharmony_ci            let expected = cp.to_string();
185c67d6573Sopenharmony_ci            &got[..n] == expected.as_bytes()
186c67d6573Sopenharmony_ci        }
187c67d6573Sopenharmony_ci        quickcheck(p as fn(char) -> bool)
188c67d6573Sopenharmony_ci    }
189c67d6573Sopenharmony_ci
190c67d6573Sopenharmony_ci    #[test]
191c67d6573Sopenharmony_ci    fn prop_decode_matches_std() {
192c67d6573Sopenharmony_ci        fn p(given_cp: char) -> bool {
193c67d6573Sopenharmony_ci            let mut tmp = [0; 4];
194c67d6573Sopenharmony_ci            let n = given_cp.encode_utf8(&mut tmp).len();
195c67d6573Sopenharmony_ci            let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap();
196c67d6573Sopenharmony_ci            let expected_cp =
197c67d6573Sopenharmony_ci                str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap();
198c67d6573Sopenharmony_ci            got_cp == expected_cp
199c67d6573Sopenharmony_ci        }
200c67d6573Sopenharmony_ci        quickcheck(p as fn(char) -> bool)
201c67d6573Sopenharmony_ci    }
202c67d6573Sopenharmony_ci
203c67d6573Sopenharmony_ci    #[test]
204c67d6573Sopenharmony_ci    fn prop_decode_last_matches_std() {
205c67d6573Sopenharmony_ci        fn p(given_cp: char) -> bool {
206c67d6573Sopenharmony_ci            let mut tmp = [0; 4];
207c67d6573Sopenharmony_ci            let n = given_cp.encode_utf8(&mut tmp).len();
208c67d6573Sopenharmony_ci            let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap();
209c67d6573Sopenharmony_ci            let expected_cp = str::from_utf8(&tmp[..n])
210c67d6573Sopenharmony_ci                .unwrap()
211c67d6573Sopenharmony_ci                .chars()
212c67d6573Sopenharmony_ci                .rev()
213c67d6573Sopenharmony_ci                .next()
214c67d6573Sopenharmony_ci                .unwrap();
215c67d6573Sopenharmony_ci            got_cp == expected_cp
216c67d6573Sopenharmony_ci        }
217c67d6573Sopenharmony_ci        quickcheck(p as fn(char) -> bool)
218c67d6573Sopenharmony_ci    }
219c67d6573Sopenharmony_ci
220c67d6573Sopenharmony_ci    #[test]
221c67d6573Sopenharmony_ci    fn reject_invalid() {
222c67d6573Sopenharmony_ci        // Invalid start byte
223c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[0xFF]), None);
224c67d6573Sopenharmony_ci        // Surrogate pair
225c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None);
226c67d6573Sopenharmony_ci        // Invalid continuation byte.
227c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[0xD4, 0xC2]), None);
228c67d6573Sopenharmony_ci        // Bad lengths
229c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes
230c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes
231c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
232c67d6573Sopenharmony_ci                                                            // Not a minimal UTF-8 sequence
233c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
234c67d6573Sopenharmony_ci        assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None);
235c67d6573Sopenharmony_ci        assert_eq!(
236c67d6573Sopenharmony_ci            decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]),
237c67d6573Sopenharmony_ci            None
238c67d6573Sopenharmony_ci        );
239c67d6573Sopenharmony_ci    }
240c67d6573Sopenharmony_ci
241c67d6573Sopenharmony_ci    #[test]
242c67d6573Sopenharmony_ci    fn reject_invalid_last() {
243c67d6573Sopenharmony_ci        // Invalid start byte
244c67d6573Sopenharmony_ci        assert_eq!(decode_last_utf8(&[0xFF]), None);
245c67d6573Sopenharmony_ci        // Surrogate pair
246c67d6573Sopenharmony_ci        assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None);
247c67d6573Sopenharmony_ci        // Bad lengths
248c67d6573Sopenharmony_ci        assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes
249c67d6573Sopenharmony_ci        assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes
250c67d6573Sopenharmony_ci        assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes
251c67d6573Sopenharmony_ci                                                                 // Not a minimal UTF-8 sequence
252c67d6573Sopenharmony_ci        assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None);
253c67d6573Sopenharmony_ci        assert_eq!(
254c67d6573Sopenharmony_ci            decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]),
255c67d6573Sopenharmony_ci            None
256c67d6573Sopenharmony_ci        );
257c67d6573Sopenharmony_ci        assert_eq!(
258c67d6573Sopenharmony_ci            decode_last_utf8(
259c67d6573Sopenharmony_ci                &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]
260c67d6573Sopenharmony_ci            ),
261c67d6573Sopenharmony_ci            None
262c67d6573Sopenharmony_ci        );
263c67d6573Sopenharmony_ci    }
264c67d6573Sopenharmony_ci}
265