1use std::char; 2use std::cmp::Ordering; 3use std::fmt; 4use std::ops; 5use std::u32; 6 7use crate::literal::LiteralSearcher; 8use crate::prog::InstEmptyLook; 9use crate::utf8::{decode_last_utf8, decode_utf8}; 10 11/// Represents a location in the input. 12#[derive(Clone, Copy, Debug)] 13pub struct InputAt { 14 pos: usize, 15 c: Char, 16 byte: Option<u8>, 17 len: usize, 18} 19 20impl InputAt { 21 /// Returns true iff this position is at the beginning of the input. 22 pub fn is_start(&self) -> bool { 23 self.pos == 0 24 } 25 26 /// Returns true iff this position is past the end of the input. 27 pub fn is_end(&self) -> bool { 28 self.c.is_none() && self.byte.is_none() 29 } 30 31 /// Returns the character at this position. 32 /// 33 /// If this position is just before or after the input, then an absent 34 /// character is returned. 35 pub fn char(&self) -> Char { 36 self.c 37 } 38 39 /// Returns the byte at this position. 40 pub fn byte(&self) -> Option<u8> { 41 self.byte 42 } 43 44 /// Returns the UTF-8 width of the character at this position. 45 pub fn len(&self) -> usize { 46 self.len 47 } 48 49 /// Returns whether the UTF-8 width of the character at this position 50 /// is zero. 51 pub fn is_empty(&self) -> bool { 52 self.len == 0 53 } 54 55 /// Returns the byte offset of this position. 56 pub fn pos(&self) -> usize { 57 self.pos 58 } 59 60 /// Returns the byte offset of the next position in the input. 61 pub fn next_pos(&self) -> usize { 62 self.pos + self.len 63 } 64} 65 66/// An abstraction over input used in the matching engines. 67pub trait Input: fmt::Debug { 68 /// Return an encoding of the position at byte offset `i`. 69 fn at(&self, i: usize) -> InputAt; 70 71 /// Return the Unicode character occurring next to `at`. 72 /// 73 /// If no such character could be decoded, then `Char` is absent. 74 fn next_char(&self, at: InputAt) -> Char; 75 76 /// Return the Unicode character occurring previous to `at`. 77 /// 78 /// If no such character could be decoded, then `Char` is absent. 79 fn previous_char(&self, at: InputAt) -> Char; 80 81 /// Return true if the given empty width instruction matches at the 82 /// input position given. 83 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool; 84 85 /// Scan the input for a matching prefix. 86 fn prefix_at( 87 &self, 88 prefixes: &LiteralSearcher, 89 at: InputAt, 90 ) -> Option<InputAt>; 91 92 /// The number of bytes in the input. 93 fn len(&self) -> usize; 94 95 /// Whether the input is empty. 96 fn is_empty(&self) -> bool { 97 self.len() == 0 98 } 99 100 /// Return the given input as a sequence of bytes. 101 fn as_bytes(&self) -> &[u8]; 102} 103 104impl<'a, T: Input> Input for &'a T { 105 fn at(&self, i: usize) -> InputAt { 106 (**self).at(i) 107 } 108 109 fn next_char(&self, at: InputAt) -> Char { 110 (**self).next_char(at) 111 } 112 113 fn previous_char(&self, at: InputAt) -> Char { 114 (**self).previous_char(at) 115 } 116 117 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 118 (**self).is_empty_match(at, empty) 119 } 120 121 fn prefix_at( 122 &self, 123 prefixes: &LiteralSearcher, 124 at: InputAt, 125 ) -> Option<InputAt> { 126 (**self).prefix_at(prefixes, at) 127 } 128 129 fn len(&self) -> usize { 130 (**self).len() 131 } 132 133 fn as_bytes(&self) -> &[u8] { 134 (**self).as_bytes() 135 } 136} 137 138/// An input reader over characters. 139#[derive(Clone, Copy, Debug)] 140pub struct CharInput<'t>(&'t [u8]); 141 142impl<'t> CharInput<'t> { 143 /// Return a new character input reader for the given string. 144 pub fn new(s: &'t [u8]) -> CharInput<'t> { 145 CharInput(s) 146 } 147} 148 149impl<'t> ops::Deref for CharInput<'t> { 150 type Target = [u8]; 151 152 fn deref(&self) -> &[u8] { 153 self.0 154 } 155} 156 157impl<'t> Input for CharInput<'t> { 158 fn at(&self, i: usize) -> InputAt { 159 if i >= self.len() { 160 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } 161 } else { 162 let c = decode_utf8(&self[i..]).map(|(c, _)| c).into(); 163 InputAt { pos: i, c, byte: None, len: c.len_utf8() } 164 } 165 } 166 167 fn next_char(&self, at: InputAt) -> Char { 168 at.char() 169 } 170 171 fn previous_char(&self, at: InputAt) -> Char { 172 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() 173 } 174 175 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 176 use crate::prog::EmptyLook::*; 177 match empty.look { 178 StartLine => { 179 let c = self.previous_char(at); 180 at.pos() == 0 || c == '\n' 181 } 182 EndLine => { 183 let c = self.next_char(at); 184 at.pos() == self.len() || c == '\n' 185 } 186 StartText => at.pos() == 0, 187 EndText => at.pos() == self.len(), 188 WordBoundary => { 189 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 190 c1.is_word_char() != c2.is_word_char() 191 } 192 NotWordBoundary => { 193 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 194 c1.is_word_char() == c2.is_word_char() 195 } 196 WordBoundaryAscii => { 197 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 198 c1.is_word_byte() != c2.is_word_byte() 199 } 200 NotWordBoundaryAscii => { 201 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 202 c1.is_word_byte() == c2.is_word_byte() 203 } 204 } 205 } 206 207 fn prefix_at( 208 &self, 209 prefixes: &LiteralSearcher, 210 at: InputAt, 211 ) -> Option<InputAt> { 212 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) 213 } 214 215 fn len(&self) -> usize { 216 self.0.len() 217 } 218 219 fn as_bytes(&self) -> &[u8] { 220 self.0 221 } 222} 223 224/// An input reader over bytes. 225#[derive(Clone, Copy, Debug)] 226pub struct ByteInput<'t> { 227 text: &'t [u8], 228 only_utf8: bool, 229} 230 231impl<'t> ByteInput<'t> { 232 /// Return a new byte-based input reader for the given string. 233 pub fn new(text: &'t [u8], only_utf8: bool) -> ByteInput<'t> { 234 ByteInput { text, only_utf8 } 235 } 236} 237 238impl<'t> ops::Deref for ByteInput<'t> { 239 type Target = [u8]; 240 241 fn deref(&self) -> &[u8] { 242 self.text 243 } 244} 245 246impl<'t> Input for ByteInput<'t> { 247 fn at(&self, i: usize) -> InputAt { 248 if i >= self.len() { 249 InputAt { pos: self.len(), c: None.into(), byte: None, len: 0 } 250 } else { 251 InputAt { 252 pos: i, 253 c: None.into(), 254 byte: self.get(i).cloned(), 255 len: 1, 256 } 257 } 258 } 259 260 fn next_char(&self, at: InputAt) -> Char { 261 decode_utf8(&self[at.pos()..]).map(|(c, _)| c).into() 262 } 263 264 fn previous_char(&self, at: InputAt) -> Char { 265 decode_last_utf8(&self[..at.pos()]).map(|(c, _)| c).into() 266 } 267 268 fn is_empty_match(&self, at: InputAt, empty: &InstEmptyLook) -> bool { 269 use crate::prog::EmptyLook::*; 270 match empty.look { 271 StartLine => { 272 let c = self.previous_char(at); 273 at.pos() == 0 || c == '\n' 274 } 275 EndLine => { 276 let c = self.next_char(at); 277 at.pos() == self.len() || c == '\n' 278 } 279 StartText => at.pos() == 0, 280 EndText => at.pos() == self.len(), 281 WordBoundary => { 282 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 283 c1.is_word_char() != c2.is_word_char() 284 } 285 NotWordBoundary => { 286 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 287 c1.is_word_char() == c2.is_word_char() 288 } 289 WordBoundaryAscii => { 290 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 291 if self.only_utf8 { 292 // If we must match UTF-8, then we can't match word 293 // boundaries at invalid UTF-8. 294 if c1.is_none() && !at.is_start() { 295 return false; 296 } 297 if c2.is_none() && !at.is_end() { 298 return false; 299 } 300 } 301 c1.is_word_byte() != c2.is_word_byte() 302 } 303 NotWordBoundaryAscii => { 304 let (c1, c2) = (self.previous_char(at), self.next_char(at)); 305 if self.only_utf8 { 306 // If we must match UTF-8, then we can't match word 307 // boundaries at invalid UTF-8. 308 if c1.is_none() && !at.is_start() { 309 return false; 310 } 311 if c2.is_none() && !at.is_end() { 312 return false; 313 } 314 } 315 c1.is_word_byte() == c2.is_word_byte() 316 } 317 } 318 } 319 320 fn prefix_at( 321 &self, 322 prefixes: &LiteralSearcher, 323 at: InputAt, 324 ) -> Option<InputAt> { 325 prefixes.find(&self[at.pos()..]).map(|(s, _)| self.at(at.pos() + s)) 326 } 327 328 fn len(&self) -> usize { 329 self.text.len() 330 } 331 332 fn as_bytes(&self) -> &[u8] { 333 self.text 334 } 335} 336 337/// An inline representation of `Option<char>`. 338/// 339/// This eliminates the need to do case analysis on `Option<char>` to determine 340/// ordinality with other characters. 341/// 342/// (The `Option<char>` is not related to encoding. Instead, it is used in the 343/// matching engines to represent the beginning and ending boundaries of the 344/// search text.) 345#[derive(Clone, Copy, Hash, PartialEq, Eq, PartialOrd, Ord)] 346pub struct Char(u32); 347 348impl fmt::Debug for Char { 349 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 350 match char::from_u32(self.0) { 351 None => write!(f, "Empty"), 352 Some(c) => write!(f, "{:?}", c), 353 } 354 } 355} 356 357impl Char { 358 /// Returns true iff the character is absent. 359 #[inline] 360 pub fn is_none(self) -> bool { 361 self.0 == u32::MAX 362 } 363 364 /// Returns the length of the character's UTF-8 encoding. 365 /// 366 /// If the character is absent, then `1` is returned. 367 #[inline] 368 pub fn len_utf8(self) -> usize { 369 char::from_u32(self.0).map_or(1, |c| c.len_utf8()) 370 } 371 372 /// Returns true iff the character is a word character. 373 /// 374 /// If the character is absent, then false is returned. 375 pub fn is_word_char(self) -> bool { 376 // is_word_character can panic if the Unicode data for \w isn't 377 // available. However, our compiler ensures that if a Unicode word 378 // boundary is used, then the data must also be available. If it isn't, 379 // then the compiler returns an error. 380 char::from_u32(self.0).map_or(false, regex_syntax::is_word_character) 381 } 382 383 /// Returns true iff the byte is a word byte. 384 /// 385 /// If the byte is absent, then false is returned. 386 pub fn is_word_byte(self) -> bool { 387 match char::from_u32(self.0) { 388 Some(c) if c <= '\u{7F}' => regex_syntax::is_word_byte(c as u8), 389 None | Some(_) => false, 390 } 391 } 392} 393 394impl From<char> for Char { 395 fn from(c: char) -> Char { 396 Char(c as u32) 397 } 398} 399 400impl From<Option<char>> for Char { 401 fn from(c: Option<char>) -> Char { 402 c.map_or(Char(u32::MAX), |c| c.into()) 403 } 404} 405 406impl PartialEq<char> for Char { 407 #[inline] 408 fn eq(&self, other: &char) -> bool { 409 self.0 == *other as u32 410 } 411} 412 413impl PartialEq<Char> for char { 414 #[inline] 415 fn eq(&self, other: &Char) -> bool { 416 *self as u32 == other.0 417 } 418} 419 420impl PartialOrd<char> for Char { 421 #[inline] 422 fn partial_cmp(&self, other: &char) -> Option<Ordering> { 423 self.0.partial_cmp(&(*other as u32)) 424 } 425} 426 427impl PartialOrd<Char> for char { 428 #[inline] 429 fn partial_cmp(&self, other: &Char) -> Option<Ordering> { 430 (*self as u32).partial_cmp(&other.0) 431 } 432} 433