1/// A few elementary UTF-8 encoding and decoding functions used by the matching 2/// engines. 3/// 4/// In an ideal world, the matching engines operate on `&str` and we can just 5/// lean on the standard library for all our UTF-8 needs. However, to support 6/// byte based regexes (that can match on arbitrary bytes which may contain 7/// UTF-8), we need to be capable of searching and decoding UTF-8 on a `&[u8]`. 8/// The standard library doesn't really recognize this use case, so we have 9/// to build it out ourselves. 10/// 11/// Should this be factored out into a separate crate? It seems independently 12/// useful. There are other crates that already exist (e.g., `utf-8`) that have 13/// overlapping use cases. Not sure what to do. 14use std::char; 15 16const TAG_CONT: u8 = 0b1000_0000; 17const TAG_TWO: u8 = 0b1100_0000; 18const TAG_THREE: u8 = 0b1110_0000; 19const TAG_FOUR: u8 = 0b1111_0000; 20 21/// Returns the smallest possible index of the next valid UTF-8 sequence 22/// starting after `i`. 23pub fn next_utf8(text: &[u8], i: usize) -> usize { 24 let b = match text.get(i) { 25 None => return i + 1, 26 Some(&b) => b, 27 }; 28 let inc = if b <= 0x7F { 29 1 30 } else if b <= 0b110_11111 { 31 2 32 } else if b <= 0b1110_1111 { 33 3 34 } else { 35 4 36 }; 37 i + inc 38} 39 40/// Decode a single UTF-8 sequence into a single Unicode codepoint from `src`. 41/// 42/// If no valid UTF-8 sequence could be found, then `None` is returned. 43/// Otherwise, the decoded codepoint and the number of bytes read is returned. 44/// The number of bytes read (for a valid UTF-8 sequence) is guaranteed to be 45/// 1, 2, 3 or 4. 46/// 47/// Note that a UTF-8 sequence is invalid if it is incorrect UTF-8, encodes a 48/// codepoint that is out of range (surrogate codepoints are out of range) or 49/// is not the shortest possible UTF-8 sequence for that codepoint. 50#[inline] 51pub fn decode_utf8(src: &[u8]) -> Option<(char, usize)> { 52 let b0 = match src.get(0) { 53 None => return None, 54 Some(&b) if b <= 0x7F => return Some((b as char, 1)), 55 Some(&b) => b, 56 }; 57 match b0 { 58 0b110_00000..=0b110_11111 => { 59 if src.len() < 2 { 60 return None; 61 } 62 let b1 = src[1]; 63 if 0b11_000000 & b1 != TAG_CONT { 64 return None; 65 } 66 let cp = ((b0 & !TAG_TWO) as u32) << 6 | ((b1 & !TAG_CONT) as u32); 67 match cp { 68 0x80..=0x7FF => char::from_u32(cp).map(|cp| (cp, 2)), 69 _ => None, 70 } 71 } 72 0b1110_0000..=0b1110_1111 => { 73 if src.len() < 3 { 74 return None; 75 } 76 let (b1, b2) = (src[1], src[2]); 77 if 0b11_000000 & b1 != TAG_CONT { 78 return None; 79 } 80 if 0b11_000000 & b2 != TAG_CONT { 81 return None; 82 } 83 let cp = ((b0 & !TAG_THREE) as u32) << 12 84 | ((b1 & !TAG_CONT) as u32) << 6 85 | ((b2 & !TAG_CONT) as u32); 86 match cp { 87 // char::from_u32 will disallow surrogate codepoints. 88 0x800..=0xFFFF => char::from_u32(cp).map(|cp| (cp, 3)), 89 _ => None, 90 } 91 } 92 0b11110_000..=0b11110_111 => { 93 if src.len() < 4 { 94 return None; 95 } 96 let (b1, b2, b3) = (src[1], src[2], src[3]); 97 if 0b11_000000 & b1 != TAG_CONT { 98 return None; 99 } 100 if 0b11_000000 & b2 != TAG_CONT { 101 return None; 102 } 103 if 0b11_000000 & b3 != TAG_CONT { 104 return None; 105 } 106 let cp = ((b0 & !TAG_FOUR) as u32) << 18 107 | ((b1 & !TAG_CONT) as u32) << 12 108 | ((b2 & !TAG_CONT) as u32) << 6 109 | ((b3 & !TAG_CONT) as u32); 110 match cp { 111 0x10000..=0x0010_FFFF => char::from_u32(cp).map(|cp| (cp, 4)), 112 _ => None, 113 } 114 } 115 _ => None, 116 } 117} 118 119/// Like `decode_utf8`, but decodes the last UTF-8 sequence in `src` instead 120/// of the first. 121pub fn decode_last_utf8(src: &[u8]) -> Option<(char, usize)> { 122 if src.is_empty() { 123 return None; 124 } 125 let mut start = src.len() - 1; 126 if src[start] <= 0x7F { 127 return Some((src[start] as char, 1)); 128 } 129 while start > src.len().saturating_sub(4) { 130 start -= 1; 131 if is_start_byte(src[start]) { 132 break; 133 } 134 } 135 match decode_utf8(&src[start..]) { 136 None => None, 137 Some((_, n)) if n < src.len() - start => None, 138 Some((cp, n)) => Some((cp, n)), 139 } 140} 141 142fn is_start_byte(b: u8) -> bool { 143 b & 0b11_000000 != 0b1_0000000 144} 145 146#[cfg(test)] 147mod tests { 148 use std::str; 149 150 use quickcheck::quickcheck; 151 152 use super::{ 153 decode_last_utf8, decode_utf8, TAG_CONT, TAG_FOUR, TAG_THREE, TAG_TWO, 154 }; 155 156 #[test] 157 fn prop_roundtrip() { 158 fn p(given_cp: char) -> bool { 159 let mut tmp = [0; 4]; 160 let encoded_len = given_cp.encode_utf8(&mut tmp).len(); 161 let (got_cp, got_len) = decode_utf8(&tmp[..encoded_len]).unwrap(); 162 encoded_len == got_len && given_cp == got_cp 163 } 164 quickcheck(p as fn(char) -> bool) 165 } 166 167 #[test] 168 fn prop_roundtrip_last() { 169 fn p(given_cp: char) -> bool { 170 let mut tmp = [0; 4]; 171 let encoded_len = given_cp.encode_utf8(&mut tmp).len(); 172 let (got_cp, got_len) = 173 decode_last_utf8(&tmp[..encoded_len]).unwrap(); 174 encoded_len == got_len && given_cp == got_cp 175 } 176 quickcheck(p as fn(char) -> bool) 177 } 178 179 #[test] 180 fn prop_encode_matches_std() { 181 fn p(cp: char) -> bool { 182 let mut got = [0; 4]; 183 let n = cp.encode_utf8(&mut got).len(); 184 let expected = cp.to_string(); 185 &got[..n] == expected.as_bytes() 186 } 187 quickcheck(p as fn(char) -> bool) 188 } 189 190 #[test] 191 fn prop_decode_matches_std() { 192 fn p(given_cp: char) -> bool { 193 let mut tmp = [0; 4]; 194 let n = given_cp.encode_utf8(&mut tmp).len(); 195 let (got_cp, _) = decode_utf8(&tmp[..n]).unwrap(); 196 let expected_cp = 197 str::from_utf8(&tmp[..n]).unwrap().chars().next().unwrap(); 198 got_cp == expected_cp 199 } 200 quickcheck(p as fn(char) -> bool) 201 } 202 203 #[test] 204 fn prop_decode_last_matches_std() { 205 fn p(given_cp: char) -> bool { 206 let mut tmp = [0; 4]; 207 let n = given_cp.encode_utf8(&mut tmp).len(); 208 let (got_cp, _) = decode_last_utf8(&tmp[..n]).unwrap(); 209 let expected_cp = str::from_utf8(&tmp[..n]) 210 .unwrap() 211 .chars() 212 .rev() 213 .next() 214 .unwrap(); 215 got_cp == expected_cp 216 } 217 quickcheck(p as fn(char) -> bool) 218 } 219 220 #[test] 221 fn reject_invalid() { 222 // Invalid start byte 223 assert_eq!(decode_utf8(&[0xFF]), None); 224 // Surrogate pair 225 assert_eq!(decode_utf8(&[0xED, 0xA0, 0x81]), None); 226 // Invalid continuation byte. 227 assert_eq!(decode_utf8(&[0xD4, 0xC2]), None); 228 // Bad lengths 229 assert_eq!(decode_utf8(&[0xC3]), None); // 2 bytes 230 assert_eq!(decode_utf8(&[0xEF, 0xBF]), None); // 3 bytes 231 assert_eq!(decode_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes 232 // Not a minimal UTF-8 sequence 233 assert_eq!(decode_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); 234 assert_eq!(decode_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a']), None); 235 assert_eq!( 236 decode_utf8(&[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',]), 237 None 238 ); 239 } 240 241 #[test] 242 fn reject_invalid_last() { 243 // Invalid start byte 244 assert_eq!(decode_last_utf8(&[0xFF]), None); 245 // Surrogate pair 246 assert_eq!(decode_last_utf8(&[0xED, 0xA0, 0x81]), None); 247 // Bad lengths 248 assert_eq!(decode_last_utf8(&[0xC3]), None); // 2 bytes 249 assert_eq!(decode_last_utf8(&[0xEF, 0xBF]), None); // 3 bytes 250 assert_eq!(decode_last_utf8(&[0xF4, 0x8F, 0xBF]), None); // 4 bytes 251 // Not a minimal UTF-8 sequence 252 assert_eq!(decode_last_utf8(&[TAG_TWO, TAG_CONT | b'a']), None); 253 assert_eq!( 254 decode_last_utf8(&[TAG_THREE, TAG_CONT, TAG_CONT | b'a',]), 255 None 256 ); 257 assert_eq!( 258 decode_last_utf8( 259 &[TAG_FOUR, TAG_CONT, TAG_CONT, TAG_CONT | b'a',] 260 ), 261 None 262 ); 263 } 264} 265