1c67d6573Sopenharmony_ci/// A few elementary UTF-8 encoding and decoding functions used by the matching 2c67d6573Sopenharmony_ci/// engines. 3c67d6573Sopenharmony_ci/// 4c67d6573Sopenharmony_ci/// In an ideal world, the matching engines operate on `&str` and we can just 5c67d6573Sopenharmony_ci/// lean on the standard library for all our UTF-8 needs. However, to support 6c67d6573Sopenharmony_ci/// byte based regexes (that can match on arbitrary bytes which may contain 7c67d6573Sopenharmony_ci/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. 8c67d6573Sopenharmony_ci/// The standard library doesn't really recognize this use case, so we have 9c67d6573Sopenharmony_ci/// to build it out ourselves. 10c67d6573Sopenharmony_ci/// 11c67d6573Sopenharmony_ci/// Should this be factored out into a separate crate? It seems independently 12c67d6573Sopenharmony_ci/// useful. There are other crates that already exist (e.g., `utf-8`) that have 13c67d6573Sopenharmony_ci/// overlapping use cases. Not sure what to do. 14c67d6573Sopenharmony_ciuse std::char; 15c67d6573Sopenharmony_ci 16c67d6573Sopenharmony_ciconst TAG_CONT: u8 = 0b1000_0000; 17c67d6573Sopenharmony_ciconst TAG_TWO: u8 = 0b1100_0000; 18c67d6573Sopenharmony_ciconst TAG_THREE: u8 = 0b1110_0000; 19c67d6573Sopenharmony_ciconst TAG_FOUR: u8 = 0b1111_0000; 20c67d6573Sopenharmony_ci 21c67d6573Sopenharmony_ci/// Returns the smallest possible index of the next valid UTF-8 sequence 22c67d6573Sopenharmony_ci/// starting after `i`. 23c67d6573Sopenharmony_cipub fn next_utf8(text: &[u8], i: usize) -> usize { 24c67d6573Sopenharmony_ci let b = match text.get(i) { 25c67d6573Sopenharmony_ci None => return i + 1, 26c67d6573Sopenharmony_ci Some(&b) => b, 27c67d6573Sopenharmony_ci }; 28c67d6573Sopenharmony_ci let inc = if b <= 0x7F { 29c67d6573Sopenharmony_ci 1 30c67d6573Sopenharmony_ci } else if b <= 0b110_11111 { 31c67d6573Sopenharmony_ci 2 32c67d6573Sopenharmony_ci } else if b <= 0b1110_1111 { 33c67d6573Sopenharmony_ci 3 34c67d6573Sopenharmony_ci } else { 35c67d6573Sopenharmony_ci 4 36c67d6573Sopenharmony_ci }; 37c67d6573Sopenharmony_ci i + inc 38c67d6573Sopenharmony_ci} 39c67d6573Sopenharmony_ci 40c67d6573Sopenharmony_ci/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. 41c67d6573Sopenharmony_ci/// 42c67d6573Sopenharmony_ci/// If no valid UTF-8 sequence could be found, then `None` is returned. 43c67d6573Sopenharmony_ci/// Otherwise, the decoded codepoint and the number of bytes read is returned. 44c67d6573Sopenharmony_ci/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be 45c67d6573Sopenharmony_ci/// 1, 2, 3 or 4. 46c67d6573Sopenharmony_ci/// 47c67d6573Sopenharmony_ci/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a 48c67d6573Sopenharmony_ci/// codepoint that is out of range (surrogate codepoints are out of range) or 49c67d6573Sopenharmony_ci/// is not the shortest possible UTF-8 sequence for that codepoint. 50c67d6573Sopenharmony_ci#[inline] 51c67d6573Sopenharmony_cipub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { 52c67d6573Sopenharmony_ci let b0 = match src.get(0) { 53c67d6573Sopenharmony_ci None => return None, 54c67d6573Sopenharmony_ci Some(&b) if b <= 0x7F => return Some((b as char, 1)), 55c67d6573Sopenharmony_ci Some(&b) => b, 56c67d6573Sopenharmony_ci }; 57c67d6573Sopenharmony_ci match b0 { 58c67d6573Sopenharmony_ci 0b110_00000..=0b110_11111 => { 59c67d6573Sopenharmony_ci if src.len() < 2 { 60c67d6573Sopenharmony_ci return None; 61c67d6573Sopenharmony_ci } 62c67d6573Sopenharmony_ci let b1 = src[1]; 63c67d6573Sopenharmony_ci if 0b11_000000 & b1 != TAG_CONT { 64c67d6573Sopenharmony_ci return None; 65c67d6573Sopenharmony_ci } 66c67d6573Sopenharmony_ci let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); 67c67d6573Sopenharmony_ci match cp { 68c67d6573Sopenharmony_ci 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), 69c67d6573Sopenharmony_ci _ => None, 70c67d6573Sopenharmony_ci } 71c67d6573Sopenharmony_ci } 72c67d6573Sopenharmony_ci 0b1110_0000..=0b1110_1111 => { 73c67d6573Sopenharmony_ci if src.len() < 3 { 74c67d6573Sopenharmony_ci return None; 75c67d6573Sopenharmony_ci } 76c67d6573Sopenharmony_ci let (b1, b2) = (src[1], src[2]); 77c67d6573Sopenharmony_ci if 0b11_000000 & b1 != TAG_CONT { 78c67d6573Sopenharmony_ci return None; 79c67d6573Sopenharmony_ci } 80c67d6573Sopenharmony_ci if 0b11_000000 & b2 != TAG_CONT { 81c67d6573Sopenharmony_ci return None; 82c67d6573Sopenharmony_ci } 83c67d6573Sopenharmony_ci let cp = ((b0 & !TAG_THREE) as u32) << 12 84c67d6573Sopenharmony_ci | ((b1 & !TAG_CONT) as u32) << 6 85c67d6573Sopenharmony_ci | ((b2 & !TAG_CONT) as u32); 86c67d6573Sopenharmony_ci match cp { 87c67d6573Sopenharmony_ci // char::from_u32 will disallow surrogate codepoints. 88c67d6573Sopenharmony_ci 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), 89c67d6573Sopenharmony_ci _ => None, 90c67d6573Sopenharmony_ci } 91c67d6573Sopenharmony_ci } 92c67d6573Sopenharmony_ci 0b11110_000..=0b11110_111 => { 93c67d6573Sopenharmony_ci if src.len() < 4 { 94c67d6573Sopenharmony_ci return None; 95c67d6573Sopenharmony_ci } 96c67d6573Sopenharmony_ci let (b1, b2, b3) = (src[1], src[2], src[3]); 97c67d6573Sopenharmony_ci if 0b11_000000 & b1 != TAG_CONT { 98c67d6573Sopenharmony_ci return None; 99c67d6573Sopenharmony_ci } 100c67d6573Sopenharmony_ci if 0b11_000000 & b2 != TAG_CONT { 101c67d6573Sopenharmony_ci return None; 102c67d6573Sopenharmony_ci } 103c67d6573Sopenharmony_ci if 0b11_000000 & b3 != TAG_CONT { 104c67d6573Sopenharmony_ci return None; 105c67d6573Sopenharmony_ci } 106c67d6573Sopenharmony_ci let cp = ((b0 & !TAG_FOUR) as u32) << 18 107c67d6573Sopenharmony_ci | ((b1 & !TAG_CONT) as u32) << 12 108c67d6573Sopenharmony_ci | ((b2 & !TAG_CONT) as u32) << 6 109c67d6573Sopenharmony_ci | ((b3 & !TAG_CONT) as u32); 110c67d6573Sopenharmony_ci match cp { 111c67d6573Sopenharmony_ci 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)), 112c67d6573Sopenharmony_ci _ => None, 113c67d6573Sopenharmony_ci } 114c67d6573Sopenharmony_ci } 115c67d6573Sopenharmony_ci _ => None, 116c67d6573Sopenharmony_ci } 117c67d6573Sopenharmony_ci} 118c67d6573Sopenharmony_ci 119c67d6573Sopenharmony_ci/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead 120c67d6573Sopenharmony_ci/// of the first. 121c67d6573Sopenharmony_cipub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { 122c67d6573Sopenharmony_ci if src.is_empty() { 123c67d6573Sopenharmony_ci return None; 124c67d6573Sopenharmony_ci } 125c67d6573Sopenharmony_ci let mut start = src.len() - 1; 126c67d6573Sopenharmony_ci if src[start] <= 0x7F { 127c67d6573Sopenharmony_ci return Some((src[start] as char, 1)); 128c67d6573Sopenharmony_ci } 129c67d6573Sopenharmony_ci while start > src.len().saturating_sub(4) { 130c67d6573Sopenharmony_ci start -= 1; 131c67d6573Sopenharmony_ci if is_start_byte(src[start]) { 132c67d6573Sopenharmony_ci break; 133c67d6573Sopenharmony_ci } 134c67d6573Sopenharmony_ci } 135c67d6573Sopenharmony_ci match decode_utf8(&src[start..]) { 136c67d6573Sopenharmony_ci None => None, 137c67d6573Sopenharmony_ci Some((_, n)) if n < src.len() - start => None, 138c67d6573Sopenharmony_ci Some((cp, n)) => Some((cp, n)), 139c67d6573Sopenharmony_ci } 140c67d6573Sopenharmony_ci} 141c67d6573Sopenharmony_ci 142c67d6573Sopenharmony_cifn is_start_byte(b: u8) -> bool { 143c67d6573Sopenharmony_ci b & 0b11_000000 != 0b1_0000000 144c67d6573Sopenharmony_ci} 145c67d6573Sopenharmony_ci 146c67d6573Sopenharmony_ci#[cfg(test)] 147c67d6573Sopenharmony_cimod tests { 148c67d6573Sopenharmony_ci use std::str; 149c67d6573Sopenharmony_ci 150c67d6573Sopenharmony_ci use quickcheck::quickcheck; 151c67d6573Sopenharmony_ci 152c67d6573Sopenharmony_ci use super::{ 153c67d6573Sopenharmony_ci decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO, 154c67d6573Sopenharmony_ci }; 155c67d6573Sopenharmony_ci 156c67d6573Sopenharmony_ci #[test] 157c67d6573Sopenharmony_ci fn prop_roundtrip() { 158c67d6573Sopenharmony_ci fn p(given_cp: char) -> bool { 159c67d6573Sopenharmony_ci let mut tmp = [0; 4]; 160c67d6573Sopenharmony_ci let encoded_len = given_cp.encode_utf8(&mut tmp).len(); 161c67d6573Sopenharmony_ci let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); 162c67d6573Sopenharmony_ci encoded_len == got_len && given_cp == got_cp 163c67d6573Sopenharmony_ci } 164c67d6573Sopenharmony_ci quickcheck(p as fn(char) -> bool) 165c67d6573Sopenharmony_ci } 166c67d6573Sopenharmony_ci 167c67d6573Sopenharmony_ci #[test] 168c67d6573Sopenharmony_ci fn prop_roundtrip_last() { 169c67d6573Sopenharmony_ci fn p(given_cp: char) -> bool { 170c67d6573Sopenharmony_ci let mut tmp = [0; 4]; 171c67d6573Sopenharmony_ci let encoded_len = given_cp.encode_utf8(&mut tmp).len(); 172c67d6573Sopenharmony_ci let (got_cp, got_len) = 173c67d6573Sopenharmony_ci decode_last_utf8(&tmp[..encoded_len]).unwrap(); 174c67d6573Sopenharmony_ci encoded_len == got_len && given_cp == got_cp 175c67d6573Sopenharmony_ci } 176c67d6573Sopenharmony_ci quickcheck(p as fn(char) -> bool) 177c67d6573Sopenharmony_ci } 178c67d6573Sopenharmony_ci 179c67d6573Sopenharmony_ci #[test] 180c67d6573Sopenharmony_ci fn prop_encode_matches_std() { 181c67d6573Sopenharmony_ci fn p(cp: char) -> bool { 182c67d6573Sopenharmony_ci let mut got = [0; 4]; 183c67d6573Sopenharmony_ci let n = cp.encode_utf8(&mut got).len(); 184c67d6573Sopenharmony_ci let expected = cp.to_string(); 185c67d6573Sopenharmony_ci &got[..n] == expected.as_bytes() 186c67d6573Sopenharmony_ci } 187c67d6573Sopenharmony_ci quickcheck(p as fn(char) -> bool) 188c67d6573Sopenharmony_ci } 189c67d6573Sopenharmony_ci 190c67d6573Sopenharmony_ci #[test] 191c67d6573Sopenharmony_ci fn prop_decode_matches_std() { 192c67d6573Sopenharmony_ci fn p(given_cp: char) -> bool { 193c67d6573Sopenharmony_ci let mut tmp = [0; 4]; 194c67d6573Sopenharmony_ci let n = given_cp.encode_utf8(&mut tmp).len(); 195c67d6573Sopenharmony_ci let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); 196c67d6573Sopenharmony_ci let expected_cp = 197c67d6573Sopenharmony_ci str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); 198c67d6573Sopenharmony_ci got_cp == expected_cp 199c67d6573Sopenharmony_ci } 200c67d6573Sopenharmony_ci quickcheck(p as fn(char) -> bool) 201c67d6573Sopenharmony_ci } 202c67d6573Sopenharmony_ci 203c67d6573Sopenharmony_ci #[test] 204c67d6573Sopenharmony_ci fn prop_decode_last_matches_std() { 205c67d6573Sopenharmony_ci fn p(given_cp: char) -> bool { 206c67d6573Sopenharmony_ci let mut tmp = [0; 4]; 207c67d6573Sopenharmony_ci let n = given_cp.encode_utf8(&mut tmp).len(); 208c67d6573Sopenharmony_ci let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); 209c67d6573Sopenharmony_ci let expected_cp = str::from_utf8(&tmp[..n]) 210c67d6573Sopenharmony_ci .unwrap() 211c67d6573Sopenharmony_ci .chars() 212c67d6573Sopenharmony_ci .rev() 213c67d6573Sopenharmony_ci .next() 214c67d6573Sopenharmony_ci .unwrap(); 215c67d6573Sopenharmony_ci got_cp == expected_cp 216c67d6573Sopenharmony_ci } 217c67d6573Sopenharmony_ci quickcheck(p as fn(char) -> bool) 218c67d6573Sopenharmony_ci } 219c67d6573Sopenharmony_ci 220c67d6573Sopenharmony_ci #[test] 221c67d6573Sopenharmony_ci fn reject_invalid() { 222c67d6573Sopenharmony_ci // Invalid start byte 223c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[0xFF]), None); 224c67d6573Sopenharmony_ci // Surrogate pair 225c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); 226c67d6573Sopenharmony_ci // Invalid continuation byte. 227c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); 228c67d6573Sopenharmony_ci // Bad lengths 229c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes 230c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes 231c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes 232c67d6573Sopenharmony_ci // Not a minimal UTF-8 sequence 233c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); 234c67d6573Sopenharmony_ci assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None); 235c67d6573Sopenharmony_ci assert_eq!( 236c67d6573Sopenharmony_ci decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]), 237c67d6573Sopenharmony_ci None 238c67d6573Sopenharmony_ci ); 239c67d6573Sopenharmony_ci } 240c67d6573Sopenharmony_ci 241c67d6573Sopenharmony_ci #[test] 242c67d6573Sopenharmony_ci fn reject_invalid_last() { 243c67d6573Sopenharmony_ci // Invalid start byte 244c67d6573Sopenharmony_ci assert_eq!(decode_last_utf8(&[0xFF]), None); 245c67d6573Sopenharmony_ci // Surrogate pair 246c67d6573Sopenharmony_ci assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); 247c67d6573Sopenharmony_ci // Bad lengths 248c67d6573Sopenharmony_ci assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes 249c67d6573Sopenharmony_ci assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes 250c67d6573Sopenharmony_ci assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes 251c67d6573Sopenharmony_ci // Not a minimal UTF-8 sequence 252c67d6573Sopenharmony_ci assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); 253c67d6573Sopenharmony_ci assert_eq!( 254c67d6573Sopenharmony_ci decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]), 255c67d6573Sopenharmony_ci None 256c67d6573Sopenharmony_ci ); 257c67d6573Sopenharmony_ci assert_eq!( 258c67d6573Sopenharmony_ci decode_last_utf8( 259c67d6573Sopenharmony_ci &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',] 260c67d6573Sopenharmony_ci ), 261c67d6573Sopenharmony_ci None 262c67d6573Sopenharmony_ci ); 263c67d6573Sopenharmony_ci } 264c67d6573Sopenharmony_ci} 265