1/*! 2Defines a translator that converts an `Ast` to an `Hir`. 3*/ 4 5use std::cell::{Cell, RefCell}; 6use std::result; 7 8use crate::ast::{self, Ast, Span, Visitor}; 9use crate::hir::{self, Error, ErrorKind, Hir}; 10use crate::unicode::{self, ClassQuery}; 11 12type Result<T> = result::Result<T, Error>; 13 14/// A builder for constructing an AST->HIR translator. 15#[derive(Clone, Debug)] 16pub struct TranslatorBuilder { 17 allow_invalid_utf8: bool, 18 flags: Flags, 19} 20 21impl Default for TranslatorBuilder { 22 fn default() -> TranslatorBuilder { 23 TranslatorBuilder::new() 24 } 25} 26 27impl TranslatorBuilder { 28 /// Create a new translator builder with a default c onfiguration. 29 pub fn new() -> TranslatorBuilder { 30 TranslatorBuilder { 31 allow_invalid_utf8: false, 32 flags: Flags::default(), 33 } 34 } 35 36 /// Build a translator using the current configuration. 37 pub fn build(&self) -> Translator { 38 Translator { 39 stack: RefCell::new(vec![]), 40 flags: Cell::new(self.flags), 41 allow_invalid_utf8: self.allow_invalid_utf8, 42 } 43 } 44 45 /// When enabled, translation will permit the construction of a regular 46 /// expression that may match invalid UTF-8. 47 /// 48 /// When disabled (the default), the translator is guaranteed to produce 49 /// an expression that will only ever match valid UTF-8 (otherwise, the 50 /// translator will return an error). 51 /// 52 /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII 53 /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause 54 /// the parser to return an error. Namely, a negated ASCII word boundary 55 /// can result in matching positions that aren't valid UTF-8 boundaries. 56 pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { 57 self.allow_invalid_utf8 = yes; 58 self 59 } 60 61 /// Enable or disable the case insensitive flag (`i`) by default. 
62 pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { 63 self.flags.case_insensitive = if yes { Some(true) } else { None }; 64 self 65 } 66 67 /// Enable or disable the multi-line matching flag (`m`) by default. 68 pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { 69 self.flags.multi_line = if yes { Some(true) } else { None }; 70 self 71 } 72 73 /// Enable or disable the "dot matches any character" flag (`s`) by 74 /// default. 75 pub fn dot_matches_new_line( 76 &mut self, 77 yes: bool, 78 ) -> &mut TranslatorBuilder { 79 self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; 80 self 81 } 82 83 /// Enable or disable the "swap greed" flag (`U`) by default. 84 pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { 85 self.flags.swap_greed = if yes { Some(true) } else { None }; 86 self 87 } 88 89 /// Enable or disable the Unicode flag (`u`) by default. 90 pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { 91 self.flags.unicode = if yes { None } else { Some(false) }; 92 self 93 } 94} 95 96/// A translator maps abstract syntax to a high level intermediate 97/// representation. 98/// 99/// A translator may be benefit from reuse. That is, a translator can translate 100/// many abstract syntax trees. 101/// 102/// A `Translator` can be configured in more detail via a 103/// [`TranslatorBuilder`](struct.TranslatorBuilder.html). 104#[derive(Clone, Debug)] 105pub struct Translator { 106 /// Our call stack, but on the heap. 107 stack: RefCell<Vec<HirFrame>>, 108 /// The current flag settings. 109 flags: Cell<Flags>, 110 /// Whether we're allowed to produce HIR that can match arbitrary bytes. 111 allow_invalid_utf8: bool, 112} 113 114impl Translator { 115 /// Create a new translator using the default configuration. 
116 pub fn new() -> Translator { 117 TranslatorBuilder::new().build() 118 } 119 120 /// Translate the given abstract syntax tree (AST) into a high level 121 /// intermediate representation (HIR). 122 /// 123 /// If there was a problem doing the translation, then an HIR-specific 124 /// error is returned. 125 /// 126 /// The original pattern string used to produce the `Ast` *must* also be 127 /// provided. The translator does not use the pattern string during any 128 /// correct translation, but is used for error reporting. 129 pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> { 130 ast::visit(ast, TranslatorI::new(self, pattern)) 131 } 132} 133 134/// An HirFrame is a single stack frame, represented explicitly, which is 135/// created for each item in the Ast that we traverse. 136/// 137/// Note that technically, this type doesn't represent our entire stack 138/// frame. In particular, the Ast visitor represents any state associated with 139/// traversing the Ast itself. 140#[derive(Clone, Debug)] 141enum HirFrame { 142 /// An arbitrary HIR expression. These get pushed whenever we hit a base 143 /// case in the Ast. They get popped after an inductive (i.e., recursive) 144 /// step is complete. 145 Expr(Hir), 146 /// A Unicode character class. This frame is mutated as we descend into 147 /// the Ast of a character class (which is itself its own mini recursive 148 /// structure). 149 ClassUnicode(hir::ClassUnicode), 150 /// A byte-oriented character class. This frame is mutated as we descend 151 /// into the Ast of a character class (which is itself its own mini 152 /// recursive structure). 153 /// 154 /// Byte character classes are created when Unicode mode (`u`) is disabled. 155 /// If `allow_invalid_utf8` is disabled (the default), then a byte 156 /// character is only permitted to match ASCII text. 
157 ClassBytes(hir::ClassBytes), 158 /// This is pushed on to the stack upon first seeing any kind of group, 159 /// indicated by parentheses (including non-capturing groups). It is popped 160 /// upon leaving a group. 161 Group { 162 /// The old active flags when this group was opened. 163 /// 164 /// If this group sets flags, then the new active flags are set to the 165 /// result of merging the old flags with the flags introduced by this 166 /// group. If the group doesn't set any flags, then this is simply 167 /// equivalent to whatever flags were set when the group was opened. 168 /// 169 /// When this group is popped, the active flags should be restored to 170 /// the flags set here. 171 /// 172 /// The "active" flags correspond to whatever flags are set in the 173 /// Translator. 174 old_flags: Flags, 175 }, 176 /// This is pushed whenever a concatenation is observed. After visiting 177 /// every sub-expression in the concatenation, the translator's stack is 178 /// popped until it sees a Concat frame. 179 Concat, 180 /// This is pushed whenever an alternation is observed. After visiting 181 /// every sub-expression in the alternation, the translator's stack is 182 /// popped until it sees an Alternation frame. 183 Alternation, 184} 185 186impl HirFrame { 187 /// Assert that the current stack frame is an Hir expression and return it. 188 fn unwrap_expr(self) -> Hir { 189 match self { 190 HirFrame::Expr(expr) => expr, 191 _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), 192 } 193 } 194 195 /// Assert that the current stack frame is a Unicode class expression and 196 /// return it. 197 fn unwrap_class_unicode(self) -> hir::ClassUnicode { 198 match self { 199 HirFrame::ClassUnicode(cls) => cls, 200 _ => panic!( 201 "tried to unwrap Unicode class \ 202 from HirFrame, got: {:?}", 203 self 204 ), 205 } 206 } 207 208 /// Assert that the current stack frame is a byte class expression and 209 /// return it. 
210 fn unwrap_class_bytes(self) -> hir::ClassBytes { 211 match self { 212 HirFrame::ClassBytes(cls) => cls, 213 _ => panic!( 214 "tried to unwrap byte class \ 215 from HirFrame, got: {:?}", 216 self 217 ), 218 } 219 } 220 221 /// Assert that the current stack frame is a group indicator and return 222 /// its corresponding flags (the flags that were active at the time the 223 /// group was entered). 224 fn unwrap_group(self) -> Flags { 225 match self { 226 HirFrame::Group { old_flags } => old_flags, 227 _ => { 228 panic!("tried to unwrap group from HirFrame, got: {:?}", self) 229 } 230 } 231 } 232} 233 234impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { 235 type Output = Hir; 236 type Err = Error; 237 238 fn finish(self) -> Result<Hir> { 239 // ... otherwise, we should have exactly one HIR on the stack. 240 assert_eq!(self.trans().stack.borrow().len(), 1); 241 Ok(self.pop().unwrap().unwrap_expr()) 242 } 243 244 fn visit_pre(&mut self, ast: &Ast) -> Result<()> { 245 match *ast { 246 Ast::Class(ast::Class::Bracketed(_)) => { 247 if self.flags().unicode() { 248 let cls = hir::ClassUnicode::empty(); 249 self.push(HirFrame::ClassUnicode(cls)); 250 } else { 251 let cls = hir::ClassBytes::empty(); 252 self.push(HirFrame::ClassBytes(cls)); 253 } 254 } 255 Ast::Group(ref x) => { 256 let old_flags = x 257 .flags() 258 .map(|ast| self.set_flags(ast)) 259 .unwrap_or_else(|| self.flags()); 260 self.push(HirFrame::Group { old_flags }); 261 } 262 Ast::Concat(ref x) if x.asts.is_empty() => {} 263 Ast::Concat(_) => { 264 self.push(HirFrame::Concat); 265 } 266 Ast::Alternation(ref x) if x.asts.is_empty() => {} 267 Ast::Alternation(_) => { 268 self.push(HirFrame::Alternation); 269 } 270 _ => {} 271 } 272 Ok(()) 273 } 274 275 fn visit_post(&mut self, ast: &Ast) -> Result<()> { 276 match *ast { 277 Ast::Empty(_) => { 278 self.push(HirFrame::Expr(Hir::empty())); 279 } 280 Ast::Flags(ref x) => { 281 self.set_flags(&x.flags); 282 // Flags in the AST are generally considered directives and 
283 // not actual sub-expressions. However, they can be used in 284 // the concrete syntax like `((?i))`, and we need some kind of 285 // indication of an expression there, and Empty is the correct 286 // choice. 287 // 288 // There can also be things like `(?i)+`, but we rule those out 289 // in the parser. In the future, we might allow them for 290 // consistency sake. 291 self.push(HirFrame::Expr(Hir::empty())); 292 } 293 Ast::Literal(ref x) => { 294 self.push(HirFrame::Expr(self.hir_literal(x)?)); 295 } 296 Ast::Dot(span) => { 297 self.push(HirFrame::Expr(self.hir_dot(span)?)); 298 } 299 Ast::Assertion(ref x) => { 300 self.push(HirFrame::Expr(self.hir_assertion(x)?)); 301 } 302 Ast::Class(ast::Class::Perl(ref x)) => { 303 if self.flags().unicode() { 304 let cls = self.hir_perl_unicode_class(x)?; 305 let hcls = hir::Class::Unicode(cls); 306 self.push(HirFrame::Expr(Hir::class(hcls))); 307 } else { 308 let cls = self.hir_perl_byte_class(x); 309 let hcls = hir::Class::Bytes(cls); 310 self.push(HirFrame::Expr(Hir::class(hcls))); 311 } 312 } 313 Ast::Class(ast::Class::Unicode(ref x)) => { 314 let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); 315 self.push(HirFrame::Expr(Hir::class(cls))); 316 } 317 Ast::Class(ast::Class::Bracketed(ref ast)) => { 318 if self.flags().unicode() { 319 let mut cls = self.pop().unwrap().unwrap_class_unicode(); 320 self.unicode_fold_and_negate( 321 &ast.span, 322 ast.negated, 323 &mut cls, 324 )?; 325 if cls.ranges().is_empty() { 326 return Err(self.error( 327 ast.span, 328 ErrorKind::EmptyClassNotAllowed, 329 )); 330 } 331 let expr = Hir::class(hir::Class::Unicode(cls)); 332 self.push(HirFrame::Expr(expr)); 333 } else { 334 let mut cls = self.pop().unwrap().unwrap_class_bytes(); 335 self.bytes_fold_and_negate( 336 &ast.span, 337 ast.negated, 338 &mut cls, 339 )?; 340 if cls.ranges().is_empty() { 341 return Err(self.error( 342 ast.span, 343 ErrorKind::EmptyClassNotAllowed, 344 )); 345 } 346 347 let expr = 
Hir::class(hir::Class::Bytes(cls)); 348 self.push(HirFrame::Expr(expr)); 349 } 350 } 351 Ast::Repetition(ref x) => { 352 let expr = self.pop().unwrap().unwrap_expr(); 353 self.push(HirFrame::Expr(self.hir_repetition(x, expr))); 354 } 355 Ast::Group(ref x) => { 356 let expr = self.pop().unwrap().unwrap_expr(); 357 let old_flags = self.pop().unwrap().unwrap_group(); 358 self.trans().flags.set(old_flags); 359 self.push(HirFrame::Expr(self.hir_group(x, expr))); 360 } 361 Ast::Concat(_) => { 362 let mut exprs = vec![]; 363 while let Some(HirFrame::Expr(expr)) = self.pop() { 364 if !expr.kind().is_empty() { 365 exprs.push(expr); 366 } 367 } 368 exprs.reverse(); 369 self.push(HirFrame::Expr(Hir::concat(exprs))); 370 } 371 Ast::Alternation(_) => { 372 let mut exprs = vec![]; 373 while let Some(HirFrame::Expr(expr)) = self.pop() { 374 exprs.push(expr); 375 } 376 exprs.reverse(); 377 self.push(HirFrame::Expr(Hir::alternation(exprs))); 378 } 379 } 380 Ok(()) 381 } 382 383 fn visit_class_set_item_pre( 384 &mut self, 385 ast: &ast::ClassSetItem, 386 ) -> Result<()> { 387 match *ast { 388 ast::ClassSetItem::Bracketed(_) => { 389 if self.flags().unicode() { 390 let cls = hir::ClassUnicode::empty(); 391 self.push(HirFrame::ClassUnicode(cls)); 392 } else { 393 let cls = hir::ClassBytes::empty(); 394 self.push(HirFrame::ClassBytes(cls)); 395 } 396 } 397 // We needn't handle the Union case here since the visitor will 398 // do it for us. 
399 _ => {} 400 } 401 Ok(()) 402 } 403 404 fn visit_class_set_item_post( 405 &mut self, 406 ast: &ast::ClassSetItem, 407 ) -> Result<()> { 408 match *ast { 409 ast::ClassSetItem::Empty(_) => {} 410 ast::ClassSetItem::Literal(ref x) => { 411 if self.flags().unicode() { 412 let mut cls = self.pop().unwrap().unwrap_class_unicode(); 413 cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); 414 self.push(HirFrame::ClassUnicode(cls)); 415 } else { 416 let mut cls = self.pop().unwrap().unwrap_class_bytes(); 417 let byte = self.class_literal_byte(x)?; 418 cls.push(hir::ClassBytesRange::new(byte, byte)); 419 self.push(HirFrame::ClassBytes(cls)); 420 } 421 } 422 ast::ClassSetItem::Range(ref x) => { 423 if self.flags().unicode() { 424 let mut cls = self.pop().unwrap().unwrap_class_unicode(); 425 cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); 426 self.push(HirFrame::ClassUnicode(cls)); 427 } else { 428 let mut cls = self.pop().unwrap().unwrap_class_bytes(); 429 let start = self.class_literal_byte(&x.start)?; 430 let end = self.class_literal_byte(&x.end)?; 431 cls.push(hir::ClassBytesRange::new(start, end)); 432 self.push(HirFrame::ClassBytes(cls)); 433 } 434 } 435 ast::ClassSetItem::Ascii(ref x) => { 436 if self.flags().unicode() { 437 let xcls = self.hir_ascii_unicode_class(x)?; 438 let mut cls = self.pop().unwrap().unwrap_class_unicode(); 439 cls.union(&xcls); 440 self.push(HirFrame::ClassUnicode(cls)); 441 } else { 442 let xcls = self.hir_ascii_byte_class(x)?; 443 let mut cls = self.pop().unwrap().unwrap_class_bytes(); 444 cls.union(&xcls); 445 self.push(HirFrame::ClassBytes(cls)); 446 } 447 } 448 ast::ClassSetItem::Unicode(ref x) => { 449 let xcls = self.hir_unicode_class(x)?; 450 let mut cls = self.pop().unwrap().unwrap_class_unicode(); 451 cls.union(&xcls); 452 self.push(HirFrame::ClassUnicode(cls)); 453 } 454 ast::ClassSetItem::Perl(ref x) => { 455 if self.flags().unicode() { 456 let xcls = self.hir_perl_unicode_class(x)?; 457 let mut cls = 
self.pop().unwrap().unwrap_class_unicode(); 458 cls.union(&xcls); 459 self.push(HirFrame::ClassUnicode(cls)); 460 } else { 461 let xcls = self.hir_perl_byte_class(x); 462 let mut cls = self.pop().unwrap().unwrap_class_bytes(); 463 cls.union(&xcls); 464 self.push(HirFrame::ClassBytes(cls)); 465 } 466 } 467 ast::ClassSetItem::Bracketed(ref ast) => { 468 if self.flags().unicode() { 469 let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); 470 self.unicode_fold_and_negate( 471 &ast.span, 472 ast.negated, 473 &mut cls1, 474 )?; 475 476 let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); 477 cls2.union(&cls1); 478 self.push(HirFrame::ClassUnicode(cls2)); 479 } else { 480 let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); 481 self.bytes_fold_and_negate( 482 &ast.span, 483 ast.negated, 484 &mut cls1, 485 )?; 486 487 let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); 488 cls2.union(&cls1); 489 self.push(HirFrame::ClassBytes(cls2)); 490 } 491 } 492 // This is handled automatically by the visitor. 
493 ast::ClassSetItem::Union(_) => {} 494 } 495 Ok(()) 496 } 497 498 fn visit_class_set_binary_op_pre( 499 &mut self, 500 _op: &ast::ClassSetBinaryOp, 501 ) -> Result<()> { 502 if self.flags().unicode() { 503 let cls = hir::ClassUnicode::empty(); 504 self.push(HirFrame::ClassUnicode(cls)); 505 } else { 506 let cls = hir::ClassBytes::empty(); 507 self.push(HirFrame::ClassBytes(cls)); 508 } 509 Ok(()) 510 } 511 512 fn visit_class_set_binary_op_in( 513 &mut self, 514 _op: &ast::ClassSetBinaryOp, 515 ) -> Result<()> { 516 if self.flags().unicode() { 517 let cls = hir::ClassUnicode::empty(); 518 self.push(HirFrame::ClassUnicode(cls)); 519 } else { 520 let cls = hir::ClassBytes::empty(); 521 self.push(HirFrame::ClassBytes(cls)); 522 } 523 Ok(()) 524 } 525 526 fn visit_class_set_binary_op_post( 527 &mut self, 528 op: &ast::ClassSetBinaryOp, 529 ) -> Result<()> { 530 use crate::ast::ClassSetBinaryOpKind::*; 531 532 if self.flags().unicode() { 533 let mut rhs = self.pop().unwrap().unwrap_class_unicode(); 534 let mut lhs = self.pop().unwrap().unwrap_class_unicode(); 535 let mut cls = self.pop().unwrap().unwrap_class_unicode(); 536 if self.flags().case_insensitive() { 537 rhs.try_case_fold_simple().map_err(|_| { 538 self.error( 539 op.rhs.span().clone(), 540 ErrorKind::UnicodeCaseUnavailable, 541 ) 542 })?; 543 lhs.try_case_fold_simple().map_err(|_| { 544 self.error( 545 op.lhs.span().clone(), 546 ErrorKind::UnicodeCaseUnavailable, 547 ) 548 })?; 549 } 550 match op.kind { 551 Intersection => lhs.intersect(&rhs), 552 Difference => lhs.difference(&rhs), 553 SymmetricDifference => lhs.symmetric_difference(&rhs), 554 } 555 cls.union(&lhs); 556 self.push(HirFrame::ClassUnicode(cls)); 557 } else { 558 let mut rhs = self.pop().unwrap().unwrap_class_bytes(); 559 let mut lhs = self.pop().unwrap().unwrap_class_bytes(); 560 let mut cls = self.pop().unwrap().unwrap_class_bytes(); 561 if self.flags().case_insensitive() { 562 rhs.case_fold_simple(); 563 lhs.case_fold_simple(); 564 } 565 
match op.kind { 566 Intersection => lhs.intersect(&rhs), 567 Difference => lhs.difference(&rhs), 568 SymmetricDifference => lhs.symmetric_difference(&rhs), 569 } 570 cls.union(&lhs); 571 self.push(HirFrame::ClassBytes(cls)); 572 } 573 Ok(()) 574 } 575} 576 577/// The internal implementation of a translator. 578/// 579/// This type is responsible for carrying around the original pattern string, 580/// which is not tied to the internal state of a translator. 581/// 582/// A TranslatorI exists for the time it takes to translate a single Ast. 583#[derive(Clone, Debug)] 584struct TranslatorI<'t, 'p> { 585 trans: &'t Translator, 586 pattern: &'p str, 587} 588 589impl<'t, 'p> TranslatorI<'t, 'p> { 590 /// Build a new internal translator. 591 fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { 592 TranslatorI { trans, pattern } 593 } 594 595 /// Return a reference to the underlying translator. 596 fn trans(&self) -> &Translator { 597 &self.trans 598 } 599 600 /// Push the given frame on to the call stack. 601 fn push(&self, frame: HirFrame) { 602 self.trans().stack.borrow_mut().push(frame); 603 } 604 605 /// Pop the top of the call stack. If the call stack is empty, return None. 606 fn pop(&self) -> Option<HirFrame> { 607 self.trans().stack.borrow_mut().pop() 608 } 609 610 /// Create a new error with the given span and error type. 611 fn error(&self, span: Span, kind: ErrorKind) -> Error { 612 Error { kind, pattern: self.pattern.to_string(), span } 613 } 614 615 /// Return a copy of the active flags. 616 fn flags(&self) -> Flags { 617 self.trans().flags.get() 618 } 619 620 /// Set the flags of this translator from the flags set in the given AST. 621 /// Then, return the old flags. 
622 fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { 623 let old_flags = self.flags(); 624 let mut new_flags = Flags::from_ast(ast_flags); 625 new_flags.merge(&old_flags); 626 self.trans().flags.set(new_flags); 627 old_flags 628 } 629 630 fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> { 631 let ch = match self.literal_to_char(lit)? { 632 byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)), 633 hir::Literal::Unicode(ch) => ch, 634 }; 635 if self.flags().case_insensitive() { 636 self.hir_from_char_case_insensitive(lit.span, ch) 637 } else { 638 self.hir_from_char(lit.span, ch) 639 } 640 } 641 642 /// Convert an Ast literal to its scalar representation. 643 /// 644 /// When Unicode mode is enabled, then this always succeeds and returns a 645 /// `char` (Unicode scalar value). 646 /// 647 /// When Unicode mode is disabled, then a raw byte is returned. If that 648 /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns 649 /// an error. 650 fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> { 651 if self.flags().unicode() { 652 return Ok(hir::Literal::Unicode(lit.c)); 653 } 654 let byte = match lit.byte() { 655 None => return Ok(hir::Literal::Unicode(lit.c)), 656 Some(byte) => byte, 657 }; 658 if byte <= 0x7F { 659 return Ok(hir::Literal::Unicode(byte as char)); 660 } 661 if !self.trans().allow_invalid_utf8 { 662 return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); 663 } 664 Ok(hir::Literal::Byte(byte)) 665 } 666 667 fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> { 668 if !self.flags().unicode() && c.len_utf8() > 1 { 669 return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); 670 } 671 Ok(Hir::literal(hir::Literal::Unicode(c))) 672 } 673 674 fn hir_from_char_case_insensitive( 675 &self, 676 span: Span, 677 c: char, 678 ) -> Result<Hir> { 679 if self.flags().unicode() { 680 // If case folding won't do anything, then don't bother trying. 
681 let map = 682 unicode::contains_simple_case_mapping(c, c).map_err(|_| { 683 self.error(span, ErrorKind::UnicodeCaseUnavailable) 684 })?; 685 if !map { 686 return self.hir_from_char(span, c); 687 } 688 let mut cls = 689 hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( 690 c, c, 691 )]); 692 cls.try_case_fold_simple().map_err(|_| { 693 self.error(span, ErrorKind::UnicodeCaseUnavailable) 694 })?; 695 Ok(Hir::class(hir::Class::Unicode(cls))) 696 } else { 697 if c.len_utf8() > 1 { 698 return Err(self.error(span, ErrorKind::UnicodeNotAllowed)); 699 } 700 // If case folding won't do anything, then don't bother trying. 701 match c { 702 'A'..='Z' | 'a'..='z' => {} 703 _ => return self.hir_from_char(span, c), 704 } 705 let mut cls = 706 hir::ClassBytes::new(vec![hir::ClassBytesRange::new( 707 c as u8, c as u8, 708 )]); 709 cls.case_fold_simple(); 710 Ok(Hir::class(hir::Class::Bytes(cls))) 711 } 712 } 713 714 fn hir_dot(&self, span: Span) -> Result<Hir> { 715 let unicode = self.flags().unicode(); 716 if !unicode && !self.trans().allow_invalid_utf8 { 717 return Err(self.error(span, ErrorKind::InvalidUtf8)); 718 } 719 Ok(if self.flags().dot_matches_new_line() { 720 Hir::any(!unicode) 721 } else { 722 Hir::dot(!unicode) 723 }) 724 } 725 726 fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> { 727 let unicode = self.flags().unicode(); 728 let multi_line = self.flags().multi_line(); 729 Ok(match asst.kind { 730 ast::AssertionKind::StartLine => Hir::anchor(if multi_line { 731 hir::Anchor::StartLine 732 } else { 733 hir::Anchor::StartText 734 }), 735 ast::AssertionKind::EndLine => Hir::anchor(if multi_line { 736 hir::Anchor::EndLine 737 } else { 738 hir::Anchor::EndText 739 }), 740 ast::AssertionKind::StartText => { 741 Hir::anchor(hir::Anchor::StartText) 742 } 743 ast::AssertionKind::EndText => Hir::anchor(hir::Anchor::EndText), 744 ast::AssertionKind::WordBoundary => { 745 Hir::word_boundary(if unicode { 746 hir::WordBoundary::Unicode 747 } else { 748 
hir::WordBoundary::Ascii 749 }) 750 } 751 ast::AssertionKind::NotWordBoundary => { 752 Hir::word_boundary(if unicode { 753 hir::WordBoundary::UnicodeNegate 754 } else { 755 // It is possible for negated ASCII word boundaries to 756 // match at invalid UTF-8 boundaries, even when searching 757 // valid UTF-8. 758 if !self.trans().allow_invalid_utf8 { 759 return Err( 760 self.error(asst.span, ErrorKind::InvalidUtf8) 761 ); 762 } 763 hir::WordBoundary::AsciiNegate 764 }) 765 } 766 }) 767 } 768 769 fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir { 770 let kind = match group.kind { 771 ast::GroupKind::CaptureIndex(idx) => { 772 hir::GroupKind::CaptureIndex(idx) 773 } 774 ast::GroupKind::CaptureName(ref capname) => { 775 hir::GroupKind::CaptureName { 776 name: capname.name.clone(), 777 index: capname.index, 778 } 779 } 780 ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing, 781 }; 782 Hir::group(hir::Group { kind, hir: Box::new(expr) }) 783 } 784 785 fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { 786 let kind = match rep.op.kind { 787 ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne, 788 ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore, 789 ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore, 790 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { 791 hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m)) 792 } 793 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { 794 hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m)) 795 } 796 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( 797 m, 798 n, 799 )) => { 800 hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n)) 801 } 802 }; 803 let greedy = 804 if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; 805 Hir::repetition(hir::Repetition { kind, greedy, hir: Box::new(expr) }) 806 } 807 808 fn hir_unicode_class( 809 &self, 810 ast_class: &ast::ClassUnicode, 811 ) 
-> Result<hir::ClassUnicode> { 812 use crate::ast::ClassUnicodeKind::*; 813 814 if !self.flags().unicode() { 815 return Err( 816 self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) 817 ); 818 } 819 let query = match ast_class.kind { 820 OneLetter(name) => ClassQuery::OneLetter(name), 821 Named(ref name) => ClassQuery::Binary(name), 822 NamedValue { ref name, ref value, .. } => ClassQuery::ByValue { 823 property_name: name, 824 property_value: value, 825 }, 826 }; 827 let mut result = self.convert_unicode_class_error( 828 &ast_class.span, 829 unicode::class(query), 830 ); 831 if let Ok(ref mut class) = result { 832 self.unicode_fold_and_negate( 833 &ast_class.span, 834 ast_class.negated, 835 class, 836 )?; 837 if class.ranges().is_empty() { 838 let err = self 839 .error(ast_class.span, ErrorKind::EmptyClassNotAllowed); 840 return Err(err); 841 } 842 } 843 result 844 } 845 846 fn hir_ascii_unicode_class( 847 &self, 848 ast: &ast::ClassAscii, 849 ) -> Result<hir::ClassUnicode> { 850 let mut cls = hir::ClassUnicode::new( 851 ascii_class(&ast.kind) 852 .iter() 853 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)), 854 ); 855 self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; 856 Ok(cls) 857 } 858 859 fn hir_ascii_byte_class( 860 &self, 861 ast: &ast::ClassAscii, 862 ) -> Result<hir::ClassBytes> { 863 let mut cls = hir::ClassBytes::new( 864 ascii_class(&ast.kind) 865 .iter() 866 .map(|&(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)), 867 ); 868 self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; 869 Ok(cls) 870 } 871 872 fn hir_perl_unicode_class( 873 &self, 874 ast_class: &ast::ClassPerl, 875 ) -> Result<hir::ClassUnicode> { 876 use crate::ast::ClassPerlKind::*; 877 878 assert!(self.flags().unicode()); 879 let result = match ast_class.kind { 880 Digit => unicode::perl_digit(), 881 Space => unicode::perl_space(), 882 Word => unicode::perl_word(), 883 }; 884 let mut class = 885 self.convert_unicode_class_error(&ast_class.span, result)?; 
886 // We needn't apply case folding here because the Perl Unicode classes 887 // are already closed under Unicode simple case folding. 888 if ast_class.negated { 889 class.negate(); 890 } 891 Ok(class) 892 } 893 894 fn hir_perl_byte_class( 895 &self, 896 ast_class: &ast::ClassPerl, 897 ) -> hir::ClassBytes { 898 use crate::ast::ClassPerlKind::*; 899 900 assert!(!self.flags().unicode()); 901 let mut class = match ast_class.kind { 902 Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), 903 Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), 904 Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), 905 }; 906 // We needn't apply case folding here because the Perl ASCII classes 907 // are already closed (under ASCII case folding). 908 if ast_class.negated { 909 class.negate(); 910 } 911 class 912 } 913 914 /// Converts the given Unicode specific error to an HIR translation error. 915 /// 916 /// The span given should approximate the position at which an error would 917 /// occur. 918 fn convert_unicode_class_error( 919 &self, 920 span: &Span, 921 result: unicode::Result<hir::ClassUnicode>, 922 ) -> Result<hir::ClassUnicode> { 923 result.map_err(|err| { 924 let sp = span.clone(); 925 match err { 926 unicode::Error::PropertyNotFound => { 927 self.error(sp, ErrorKind::UnicodePropertyNotFound) 928 } 929 unicode::Error::PropertyValueNotFound => { 930 self.error(sp, ErrorKind::UnicodePropertyValueNotFound) 931 } 932 unicode::Error::PerlClassNotFound => { 933 self.error(sp, ErrorKind::UnicodePerlClassNotFound) 934 } 935 } 936 }) 937 } 938 939 fn unicode_fold_and_negate( 940 &self, 941 span: &Span, 942 negated: bool, 943 class: &mut hir::ClassUnicode, 944 ) -> Result<()> { 945 // Note that we must apply case folding before negation! 946 // Consider `(?i)[^x]`. If we applied negation field, then 947 // the result would be the character class that matched any 948 // Unicode scalar value. 
949 if self.flags().case_insensitive() { 950 class.try_case_fold_simple().map_err(|_| { 951 self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) 952 })?; 953 } 954 if negated { 955 class.negate(); 956 } 957 Ok(()) 958 } 959 960 fn bytes_fold_and_negate( 961 &self, 962 span: &Span, 963 negated: bool, 964 class: &mut hir::ClassBytes, 965 ) -> Result<()> { 966 // Note that we must apply case folding before negation! 967 // Consider `(?i)[^x]`. If we applied negation first, then 968 // the result would be the character class that matched any 969 // Unicode scalar value. 970 if self.flags().case_insensitive() { 971 class.case_fold_simple(); 972 } 973 if negated { 974 class.negate(); 975 } 976 if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() { 977 return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); 978 } 979 Ok(()) 980 } 981 982 /// Return a scalar byte value suitable for use as a literal in a byte 983 /// character class. 984 fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> { 985 match self.literal_to_char(ast)? { 986 hir::Literal::Byte(byte) => Ok(byte), 987 hir::Literal::Unicode(ch) => { 988 if ch <= 0x7F as char { 989 Ok(ch as u8) 990 } else { 991 // We can't feasibly support Unicode in 992 // byte oriented classes. Byte classes don't 993 // do Unicode case folding. 994 Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) 995 } 996 } 997 } 998 } 999} 1000 1001/// A translator's representation of a regular expression's flags at any given 1002/// moment in time. 1003/// 1004/// Each flag can be in one of three states: absent, present but disabled or 1005/// present but enabled. 1006#[derive(Clone, Copy, Debug, Default)] 1007struct Flags { 1008 case_insensitive: Option<bool>, 1009 multi_line: Option<bool>, 1010 dot_matches_new_line: Option<bool>, 1011 swap_greed: Option<bool>, 1012 unicode: Option<bool>, 1013 // Note that `ignore_whitespace` is omitted here because it is handled 1014 // entirely in the parser. 
1015} 1016 1017impl Flags { 1018 fn from_ast(ast: &ast::Flags) -> Flags { 1019 let mut flags = Flags::default(); 1020 let mut enable = true; 1021 for item in &ast.items { 1022 match item.kind { 1023 ast::FlagsItemKind::Negation => { 1024 enable = false; 1025 } 1026 ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { 1027 flags.case_insensitive = Some(enable); 1028 } 1029 ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { 1030 flags.multi_line = Some(enable); 1031 } 1032 ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { 1033 flags.dot_matches_new_line = Some(enable); 1034 } 1035 ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { 1036 flags.swap_greed = Some(enable); 1037 } 1038 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { 1039 flags.unicode = Some(enable); 1040 } 1041 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} 1042 } 1043 } 1044 flags 1045 } 1046 1047 fn merge(&mut self, previous: &Flags) { 1048 if self.case_insensitive.is_none() { 1049 self.case_insensitive = previous.case_insensitive; 1050 } 1051 if self.multi_line.is_none() { 1052 self.multi_line = previous.multi_line; 1053 } 1054 if self.dot_matches_new_line.is_none() { 1055 self.dot_matches_new_line = previous.dot_matches_new_line; 1056 } 1057 if self.swap_greed.is_none() { 1058 self.swap_greed = previous.swap_greed; 1059 } 1060 if self.unicode.is_none() { 1061 self.unicode = previous.unicode; 1062 } 1063 } 1064 1065 fn case_insensitive(&self) -> bool { 1066 self.case_insensitive.unwrap_or(false) 1067 } 1068 1069 fn multi_line(&self) -> bool { 1070 self.multi_line.unwrap_or(false) 1071 } 1072 1073 fn dot_matches_new_line(&self) -> bool { 1074 self.dot_matches_new_line.unwrap_or(false) 1075 } 1076 1077 fn swap_greed(&self) -> bool { 1078 self.swap_greed.unwrap_or(false) 1079 } 1080 1081 fn unicode(&self) -> bool { 1082 self.unicode.unwrap_or(true) 1083 } 1084} 1085 1086fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { 1087 let ranges: Vec<_> = 
ascii_class(kind) 1088 .iter() 1089 .cloned() 1090 .map(|(s, e)| hir::ClassBytesRange::new(s as u8, e as u8)) 1091 .collect(); 1092 hir::ClassBytes::new(ranges) 1093} 1094 1095fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] { 1096 use crate::ast::ClassAsciiKind::*; 1097 match *kind { 1098 Alnum => &[('0', '9'), ('A', 'Z'), ('a', 'z')], 1099 Alpha => &[('A', 'Z'), ('a', 'z')], 1100 Ascii => &[('\x00', '\x7F')], 1101 Blank => &[('\t', '\t'), (' ', ' ')], 1102 Cntrl => &[('\x00', '\x1F'), ('\x7F', '\x7F')], 1103 Digit => &[('0', '9')], 1104 Graph => &[('!', '~')], 1105 Lower => &[('a', 'z')], 1106 Print => &[(' ', '~')], 1107 Punct => &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')], 1108 Space => &[ 1109 ('\t', '\t'), 1110 ('\n', '\n'), 1111 ('\x0B', '\x0B'), 1112 ('\x0C', '\x0C'), 1113 ('\r', '\r'), 1114 (' ', ' '), 1115 ], 1116 Upper => &[('A', 'Z')], 1117 Word => &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')], 1118 Xdigit => &[('0', '9'), ('A', 'F'), ('a', 'f')], 1119 } 1120} 1121 1122#[cfg(test)] 1123mod tests { 1124 use crate::ast::parse::ParserBuilder; 1125 use crate::ast::{self, Ast, Position, Span}; 1126 use crate::hir::{self, Hir, HirKind}; 1127 use crate::unicode::{self, ClassQuery}; 1128 1129 use super::{ascii_class, TranslatorBuilder}; 1130 1131 // We create these errors to compare with real hir::Errors in the tests. 1132 // We define equality between TestError and hir::Error to disregard the 1133 // pattern string in hir::Error, which is annoying to provide in tests. 
1134 #[derive(Clone, Debug)] 1135 struct TestError { 1136 span: Span, 1137 kind: hir::ErrorKind, 1138 } 1139 1140 impl PartialEq<hir::Error> for TestError { 1141 fn eq(&self, other: &hir::Error) -> bool { 1142 self.span == other.span && self.kind == other.kind 1143 } 1144 } 1145 1146 impl PartialEq<TestError> for hir::Error { 1147 fn eq(&self, other: &TestError) -> bool { 1148 self.span == other.span && self.kind == other.kind 1149 } 1150 } 1151 1152 fn parse(pattern: &str) -> Ast { 1153 ParserBuilder::new().octal(true).build().parse(pattern).unwrap() 1154 } 1155 1156 fn t(pattern: &str) -> Hir { 1157 TranslatorBuilder::new() 1158 .allow_invalid_utf8(false) 1159 .build() 1160 .translate(pattern, &parse(pattern)) 1161 .unwrap() 1162 } 1163 1164 fn t_err(pattern: &str) -> hir::Error { 1165 TranslatorBuilder::new() 1166 .allow_invalid_utf8(false) 1167 .build() 1168 .translate(pattern, &parse(pattern)) 1169 .unwrap_err() 1170 } 1171 1172 fn t_bytes(pattern: &str) -> Hir { 1173 TranslatorBuilder::new() 1174 .allow_invalid_utf8(true) 1175 .build() 1176 .translate(pattern, &parse(pattern)) 1177 .unwrap() 1178 } 1179 1180 fn hir_lit(s: &str) -> Hir { 1181 match s.len() { 1182 0 => Hir::empty(), 1183 _ => { 1184 let lits = s 1185 .chars() 1186 .map(hir::Literal::Unicode) 1187 .map(Hir::literal) 1188 .collect(); 1189 Hir::concat(lits) 1190 } 1191 } 1192 } 1193 1194 fn hir_blit(s: &[u8]) -> Hir { 1195 match s.len() { 1196 0 => Hir::empty(), 1197 1 => Hir::literal(hir::Literal::Byte(s[0])), 1198 _ => { 1199 let lits = s 1200 .iter() 1201 .cloned() 1202 .map(hir::Literal::Byte) 1203 .map(Hir::literal) 1204 .collect(); 1205 Hir::concat(lits) 1206 } 1207 } 1208 } 1209 1210 fn hir_group(i: u32, expr: Hir) -> Hir { 1211 Hir::group(hir::Group { 1212 kind: hir::GroupKind::CaptureIndex(i), 1213 hir: Box::new(expr), 1214 }) 1215 } 1216 1217 fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir { 1218 Hir::group(hir::Group { 1219 kind: hir::GroupKind::CaptureName { 1220 name: 
name.to_string(), 1221 index: i, 1222 }, 1223 hir: Box::new(expr), 1224 }) 1225 } 1226 1227 fn hir_group_nocap(expr: Hir) -> Hir { 1228 Hir::group(hir::Group { 1229 kind: hir::GroupKind::NonCapturing, 1230 hir: Box::new(expr), 1231 }) 1232 } 1233 1234 fn hir_quest(greedy: bool, expr: Hir) -> Hir { 1235 Hir::repetition(hir::Repetition { 1236 kind: hir::RepetitionKind::ZeroOrOne, 1237 greedy, 1238 hir: Box::new(expr), 1239 }) 1240 } 1241 1242 fn hir_star(greedy: bool, expr: Hir) -> Hir { 1243 Hir::repetition(hir::Repetition { 1244 kind: hir::RepetitionKind::ZeroOrMore, 1245 greedy, 1246 hir: Box::new(expr), 1247 }) 1248 } 1249 1250 fn hir_plus(greedy: bool, expr: Hir) -> Hir { 1251 Hir::repetition(hir::Repetition { 1252 kind: hir::RepetitionKind::OneOrMore, 1253 greedy, 1254 hir: Box::new(expr), 1255 }) 1256 } 1257 1258 fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir { 1259 Hir::repetition(hir::Repetition { 1260 kind: hir::RepetitionKind::Range(range), 1261 greedy, 1262 hir: Box::new(expr), 1263 }) 1264 } 1265 1266 fn hir_alt(alts: Vec<Hir>) -> Hir { 1267 Hir::alternation(alts) 1268 } 1269 1270 fn hir_cat(exprs: Vec<Hir>) -> Hir { 1271 Hir::concat(exprs) 1272 } 1273 1274 #[allow(dead_code)] 1275 fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { 1276 Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) 1277 } 1278 1279 #[allow(dead_code)] 1280 fn hir_uclass_perl_word() -> Hir { 1281 Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) 1282 } 1283 1284 fn hir_uclass(ranges: &[(char, char)]) -> Hir { 1285 let ranges: Vec<hir::ClassUnicodeRange> = ranges 1286 .iter() 1287 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) 1288 .collect(); 1289 Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges))) 1290 } 1291 1292 fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { 1293 let ranges: Vec<hir::ClassBytesRange> = ranges 1294 .iter() 1295 .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) 1296 .collect(); 1297 
Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) 1298 } 1299 1300 fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir { 1301 let ranges: Vec<hir::ClassBytesRange> = ranges 1302 .iter() 1303 .map(|&(s, e)| { 1304 assert!(s as u32 <= 0x7F); 1305 assert!(e as u32 <= 0x7F); 1306 hir::ClassBytesRange::new(s as u8, e as u8) 1307 }) 1308 .collect(); 1309 Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges))) 1310 } 1311 1312 fn hir_case_fold(expr: Hir) -> Hir { 1313 match expr.into_kind() { 1314 HirKind::Class(mut cls) => { 1315 cls.case_fold_simple(); 1316 Hir::class(cls) 1317 } 1318 _ => panic!("cannot case fold non-class Hir expr"), 1319 } 1320 } 1321 1322 fn hir_negate(expr: Hir) -> Hir { 1323 match expr.into_kind() { 1324 HirKind::Class(mut cls) => { 1325 cls.negate(); 1326 Hir::class(cls) 1327 } 1328 _ => panic!("cannot negate non-class Hir expr"), 1329 } 1330 } 1331 1332 #[allow(dead_code)] 1333 fn hir_union(expr1: Hir, expr2: Hir) -> Hir { 1334 use crate::hir::Class::{Bytes, Unicode}; 1335 1336 match (expr1.into_kind(), expr2.into_kind()) { 1337 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { 1338 c1.union(&c2); 1339 Hir::class(hir::Class::Unicode(c1)) 1340 } 1341 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { 1342 c1.union(&c2); 1343 Hir::class(hir::Class::Bytes(c1)) 1344 } 1345 _ => panic!("cannot union non-class Hir exprs"), 1346 } 1347 } 1348 1349 #[allow(dead_code)] 1350 fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { 1351 use crate::hir::Class::{Bytes, Unicode}; 1352 1353 match (expr1.into_kind(), expr2.into_kind()) { 1354 (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { 1355 c1.difference(&c2); 1356 Hir::class(hir::Class::Unicode(c1)) 1357 } 1358 (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { 1359 c1.difference(&c2); 1360 Hir::class(hir::Class::Bytes(c1)) 1361 } 1362 _ => panic!("cannot difference non-class Hir exprs"), 1363 } 1364 } 1365 1366 fn 
#[test]
fn literal() {
    // (pattern, expected literal text) with invalid UTF-8 disallowed.
    let utf8_cases = [
        ("a", "a"),
        ("(?-u)a", "a"),
        ("☃", "☃"),
        ("abcd", "abcd"),
    ];
    for &(pattern, expected) in utf8_cases.iter() {
        assert_eq!(t(pattern), hir_lit(expected));
    }

    // Same idea with invalid UTF-8 allowed. The second entry is a Rust
    // escape (the pattern contains a plain `a`); the third is a raw string,
    // so the regex parser sees the `\x61` escape itself.
    let byte_cases = [
        ("(?-u)a", "a"),
        ("(?-u)\x61", "a"),
        (r"(?-u)\x61", "a"),
    ];
    for &(pattern, expected) in byte_cases.iter() {
        assert_eq!(t_bytes(pattern), hir_lit(expected));
    }
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    // A non-ASCII literal is rejected once Unicode mode is off.
    let snowman_rejected = TestError {
        kind: hir::ErrorKind::UnicodeNotAllowed,
        span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 7)),
    };
    assert_eq!(t_err("(?-u)☃"), snowman_rejected);

    // A raw 0xFF byte is rejected while invalid UTF-8 is disallowed.
    let high_byte_rejected = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)),
    };
    assert_eq!(t_err(r"(?-u)\xFF"), high_byte_rejected);
}
/// Checks how literals are translated under the case-insensitive flag.
///
/// Only the assertions that expect a Unicode class are gated on the
/// `unicode-case` feature. Assertions that run with Unicode mode off expect
/// byte classes and are checked unconditionally.
#[test]
fn literal_case_insensitive() {
    // Unicode mode: a cased literal becomes a Unicode class.
    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i:a)"),
        hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')],))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)ab@c"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_uclass(&[('B', 'B'), ('b', 'b')]),
            hir_lit("@"),
            hir_uclass(&[('C', 'C'), ('c', 'c')]),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)β"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );

    // Unicode mode off: a cased literal becomes a byte class.
    assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]));
    // This assertion was previously gated on `unicode-case` even though it
    // only expects byte classes, like its ungated neighbours.
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![
            hir_lit("a"),
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("a"),
        ])
    );
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
            hir_lit("@"),
            hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
        ])
    );

    // Same, with invalid UTF-8 allowed.
    assert_eq!(
        t_bytes("(?i-u)a"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(
        t_bytes("(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(
        t_bytes(r"(?i-u)\x61"),
        hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])
    );
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    // A non-ASCII literal is rejected once Unicode mode is off.
    assert_eq!(
        t_err("(?i-u)β"),
        TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(8, 1, 8),
            ),
        }
    );
}
#[test]
fn dot() {
    // Unicode mode: `.` excludes only `\n` unless the `s` flag is set.
    let any_but_newline = hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}'),]);
    assert_eq!(t("."), any_but_newline);
    let any_scalar = hir_uclass(&[('\0', '\u{10FFFF}'),]);
    assert_eq!(t("(?s)."), any_scalar);

    // Byte mode, with invalid UTF-8 allowed.
    let any_byte_but_newline =
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF'),]);
    assert_eq!(t_bytes("(?-u)."), any_byte_but_newline);
    let any_byte = hir_bclass(&[(b'\0', b'\xFF'),]);
    assert_eq!(t_bytes("(?s-u)."), any_byte);

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    let plain_rejected = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(5, 1, 6), Position::new(6, 1, 7)),
    };
    assert_eq!(t_err("(?-u)."), plain_rejected);
    let dotall_rejected = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(6, 1, 7), Position::new(7, 1, 8)),
    };
    assert_eq!(t_err("(?s-u)."), dotall_rejected);
}
hir_lit("a"))); 1597 assert_eq!( 1598 t("(a)(b)"), 1599 hir_cat(vec![ 1600 hir_group(1, hir_lit("a")), 1601 hir_group(2, hir_lit("b")), 1602 ]) 1603 ); 1604 assert_eq!( 1605 t("(a)|(b)"), 1606 hir_alt(vec![ 1607 hir_group(1, hir_lit("a")), 1608 hir_group(2, hir_lit("b")), 1609 ]) 1610 ); 1611 assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty())); 1612 assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a"))); 1613 assert_eq!( 1614 t("(?P<foo>a)(?P<bar>b)"), 1615 hir_cat(vec![ 1616 hir_group_name(1, "foo", hir_lit("a")), 1617 hir_group_name(2, "bar", hir_lit("b")), 1618 ]) 1619 ); 1620 assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty())); 1621 assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a"))); 1622 assert_eq!( 1623 t("(?:a)(b)"), 1624 hir_cat(vec![ 1625 hir_group_nocap(hir_lit("a")), 1626 hir_group(1, hir_lit("b")), 1627 ]) 1628 ); 1629 assert_eq!( 1630 t("(a)(?:b)(c)"), 1631 hir_cat(vec![ 1632 hir_group(1, hir_lit("a")), 1633 hir_group_nocap(hir_lit("b")), 1634 hir_group(2, hir_lit("c")), 1635 ]) 1636 ); 1637 assert_eq!( 1638 t("(a)(?P<foo>b)(c)"), 1639 hir_cat(vec![ 1640 hir_group(1, hir_lit("a")), 1641 hir_group_name(2, "foo", hir_lit("b")), 1642 hir_group(3, hir_lit("c")), 1643 ]) 1644 ); 1645 assert_eq!(t("()"), hir_group(1, Hir::empty())); 1646 assert_eq!(t("((?i))"), hir_group(1, Hir::empty())); 1647 assert_eq!(t("((?x))"), hir_group(1, Hir::empty())); 1648 assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty()))); 1649 } 1650 1651 #[test] 1652 fn flags() { 1653 #[cfg(feature = "unicode-case")] 1654 assert_eq!( 1655 t("(?i:a)a"), 1656 hir_cat(vec![ 1657 hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])), 1658 hir_lit("a"), 1659 ]) 1660 ); 1661 assert_eq!( 1662 t("(?i-u:a)β"), 1663 hir_cat(vec![ 1664 hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), 1665 hir_lit("β"), 1666 ]) 1667 ); 1668 assert_eq!( 1669 t("(?:(?i-u)a)b"), 1670 hir_cat(vec![ 1671 hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])), 
#[test]
fn escape() {
    // Each escaped meta character must translate to the bare character.
    let pattern = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
    let unescaped = r"\.+*?()|[]{}^$#";
    assert_eq!(t(pattern), hir_lit(unescaped));
}
hir_quest(false, hir_lit("a"))); 1755 assert_eq!(t("a*?"), hir_star(false, hir_lit("a"))); 1756 assert_eq!(t("a+?"), hir_plus(false, hir_lit("a"))); 1757 1758 assert_eq!( 1759 t("a{1}"), 1760 hir_range(true, hir::RepetitionRange::Exactly(1), hir_lit("a"),) 1761 ); 1762 assert_eq!( 1763 t("a{1,}"), 1764 hir_range(true, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) 1765 ); 1766 assert_eq!( 1767 t("a{1,2}"), 1768 hir_range(true, hir::RepetitionRange::Bounded(1, 2), hir_lit("a"),) 1769 ); 1770 assert_eq!( 1771 t("a{1}?"), 1772 hir_range(false, hir::RepetitionRange::Exactly(1), hir_lit("a"),) 1773 ); 1774 assert_eq!( 1775 t("a{1,}?"), 1776 hir_range(false, hir::RepetitionRange::AtLeast(1), hir_lit("a"),) 1777 ); 1778 assert_eq!( 1779 t("a{1,2}?"), 1780 hir_range( 1781 false, 1782 hir::RepetitionRange::Bounded(1, 2), 1783 hir_lit("a"), 1784 ) 1785 ); 1786 1787 assert_eq!( 1788 t("ab?"), 1789 hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) 1790 ); 1791 assert_eq!( 1792 t("(ab)?"), 1793 hir_quest( 1794 true, 1795 hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) 1796 ) 1797 ); 1798 assert_eq!( 1799 t("a|b?"), 1800 hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),]) 1801 ); 1802 } 1803 1804 #[test] 1805 fn cat_alt() { 1806 assert_eq!( 1807 t("(ab)"), 1808 hir_group(1, hir_cat(vec![hir_lit("a"), hir_lit("b"),])) 1809 ); 1810 assert_eq!(t("a|b"), hir_alt(vec![hir_lit("a"), hir_lit("b"),])); 1811 assert_eq!( 1812 t("a|b|c"), 1813 hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) 1814 ); 1815 assert_eq!( 1816 t("ab|bc|cd"), 1817 hir_alt(vec![hir_lit("ab"), hir_lit("bc"), hir_lit("cd"),]) 1818 ); 1819 assert_eq!( 1820 t("(a|b)"), 1821 hir_group(1, hir_alt(vec![hir_lit("a"), hir_lit("b"),])) 1822 ); 1823 assert_eq!( 1824 t("(a|b|c)"), 1825 hir_group( 1826 1, 1827 hir_alt(vec![hir_lit("a"), hir_lit("b"), hir_lit("c"),]) 1828 ) 1829 ); 1830 assert_eq!( 1831 t("(ab|bc|cd)"), 1832 hir_group( 1833 1, 1834 hir_alt(vec![hir_lit("ab"), hir_lit("bc"), 
hir_lit("cd"),]) 1835 ) 1836 ); 1837 assert_eq!( 1838 t("(ab|(bc|(cd)))"), 1839 hir_group( 1840 1, 1841 hir_alt(vec![ 1842 hir_lit("ab"), 1843 hir_group( 1844 2, 1845 hir_alt(vec![ 1846 hir_lit("bc"), 1847 hir_group(3, hir_lit("cd")), 1848 ]) 1849 ), 1850 ]) 1851 ) 1852 ); 1853 } 1854 1855 #[test] 1856 fn class_ascii() { 1857 assert_eq!( 1858 t("[[:alnum:]]"), 1859 hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)) 1860 ); 1861 assert_eq!( 1862 t("[[:alpha:]]"), 1863 hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)) 1864 ); 1865 assert_eq!( 1866 t("[[:ascii:]]"), 1867 hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)) 1868 ); 1869 assert_eq!( 1870 t("[[:blank:]]"), 1871 hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)) 1872 ); 1873 assert_eq!( 1874 t("[[:cntrl:]]"), 1875 hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)) 1876 ); 1877 assert_eq!( 1878 t("[[:digit:]]"), 1879 hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)) 1880 ); 1881 assert_eq!( 1882 t("[[:graph:]]"), 1883 hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)) 1884 ); 1885 assert_eq!( 1886 t("[[:lower:]]"), 1887 hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)) 1888 ); 1889 assert_eq!( 1890 t("[[:print:]]"), 1891 hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)) 1892 ); 1893 assert_eq!( 1894 t("[[:punct:]]"), 1895 hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)) 1896 ); 1897 assert_eq!( 1898 t("[[:space:]]"), 1899 hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)) 1900 ); 1901 assert_eq!( 1902 t("[[:upper:]]"), 1903 hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)) 1904 ); 1905 assert_eq!( 1906 t("[[:word:]]"), 1907 hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)) 1908 ); 1909 assert_eq!( 1910 t("[[:xdigit:]]"), 1911 hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)) 1912 ); 1913 1914 assert_eq!( 1915 t("[[:^lower:]]"), 1916 hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))) 1917 ); 1918 #[cfg(feature = "unicode-case")] 1919 assert_eq!( 1920 
#[test]
fn class_ascii_multiple() {
    // See: https://github.com/rust-lang/regex/issues/680
    let alnum = ascii_class(&ast::ClassAsciiKind::Alnum);

    // Unicode mode: alnum plus everything outside ASCII.
    let expected_unicode = hir_union(
        hir_uclass(alnum),
        hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
    );
    assert_eq!(t("[[:alnum:][:^ascii:]]"), expected_unicode);

    // Byte mode: alnum plus every byte above 0x7F.
    let expected_bytes = hir_union(
        hir_bclass_from_char(alnum),
        hir_bclass(&[(0x80, 0xFF)]),
    );
    assert_eq!(t_bytes("(?-u)[[:alnum:][:^ascii:]]"), expected_bytes);
}
hir_uclass_perl_word()); 2000 2001 // Unicode, negated 2002 assert_eq!( 2003 t(r"\D"), 2004 hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) 2005 ); 2006 assert_eq!( 2007 t(r"\S"), 2008 hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) 2009 ); 2010 assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word())); 2011 #[cfg(feature = "unicode-case")] 2012 assert_eq!( 2013 t(r"(?i)\D"), 2014 hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) 2015 ); 2016 #[cfg(feature = "unicode-case")] 2017 assert_eq!( 2018 t(r"(?i)\S"), 2019 hir_negate(hir_uclass_query(ClassQuery::Binary("space"))) 2020 ); 2021 #[cfg(feature = "unicode-case")] 2022 assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word())); 2023 2024 // ASCII only 2025 assert_eq!( 2026 t(r"(?-u)\d"), 2027 hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) 2028 ); 2029 assert_eq!( 2030 t(r"(?-u)\s"), 2031 hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) 2032 ); 2033 assert_eq!( 2034 t(r"(?-u)\w"), 2035 hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) 2036 ); 2037 assert_eq!( 2038 t(r"(?i-u)\d"), 2039 hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) 2040 ); 2041 assert_eq!( 2042 t(r"(?i-u)\s"), 2043 hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)) 2044 ); 2045 assert_eq!( 2046 t(r"(?i-u)\w"), 2047 hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)) 2048 ); 2049 2050 // ASCII only, negated 2051 assert_eq!( 2052 t(r"(?-u)\D"), 2053 hir_negate(hir_bclass_from_char(ascii_class( 2054 &ast::ClassAsciiKind::Digit 2055 ))) 2056 ); 2057 assert_eq!( 2058 t(r"(?-u)\S"), 2059 hir_negate(hir_bclass_from_char(ascii_class( 2060 &ast::ClassAsciiKind::Space 2061 ))) 2062 ); 2063 assert_eq!( 2064 t(r"(?-u)\W"), 2065 hir_negate(hir_bclass_from_char(ascii_class( 2066 &ast::ClassAsciiKind::Word 2067 ))) 2068 ); 2069 assert_eq!( 2070 t(r"(?i-u)\D"), 2071 hir_negate(hir_bclass_from_char(ascii_class( 2072 &ast::ClassAsciiKind::Digit 2073 ))) 2074 
#[test]
#[cfg(not(feature = "unicode-perl"))]
fn class_perl_word_disabled() {
    // Without the `unicode-perl` feature, `\w` must be reported as missing,
    // with a span covering the whole two-character escape.
    let whole_escape = Span::new(Position::new(0, 1, 1), Position::new(2, 1, 3));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePerlClassNotFound,
        span: whole_escape,
    };
    assert_eq!(t_err(r"\w"), expected);
}
hir_uclass_query(ClassQuery::Binary("Other")) 2161 ); 2162 assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other"))); 2163 2164 assert_eq!( 2165 t(r"\PZ"), 2166 hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) 2167 ); 2168 assert_eq!( 2169 t(r"\P{separator}"), 2170 hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) 2171 ); 2172 assert_eq!( 2173 t(r"\P{gc!=separator}"), 2174 hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))) 2175 ); 2176 2177 assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any"))); 2178 assert_eq!( 2179 t(r"\p{assigned}"), 2180 hir_uclass_query(ClassQuery::Binary("Assigned")) 2181 ); 2182 assert_eq!( 2183 t(r"\p{ascii}"), 2184 hir_uclass_query(ClassQuery::Binary("ASCII")) 2185 ); 2186 assert_eq!( 2187 t(r"\p{gc:any}"), 2188 hir_uclass_query(ClassQuery::Binary("Any")) 2189 ); 2190 assert_eq!( 2191 t(r"\p{gc:assigned}"), 2192 hir_uclass_query(ClassQuery::Binary("Assigned")) 2193 ); 2194 assert_eq!( 2195 t(r"\p{gc:ascii}"), 2196 hir_uclass_query(ClassQuery::Binary("ASCII")) 2197 ); 2198 2199 assert_eq!( 2200 t_err(r"(?-u)\pZ"), 2201 TestError { 2202 kind: hir::ErrorKind::UnicodeNotAllowed, 2203 span: Span::new( 2204 Position::new(5, 1, 6), 2205 Position::new(8, 1, 9) 2206 ), 2207 } 2208 ); 2209 assert_eq!( 2210 t_err(r"(?-u)\p{Separator}"), 2211 TestError { 2212 kind: hir::ErrorKind::UnicodeNotAllowed, 2213 span: Span::new( 2214 Position::new(5, 1, 6), 2215 Position::new(18, 1, 19) 2216 ), 2217 } 2218 ); 2219 assert_eq!( 2220 t_err(r"\pE"), 2221 TestError { 2222 kind: hir::ErrorKind::UnicodePropertyNotFound, 2223 span: Span::new( 2224 Position::new(0, 1, 1), 2225 Position::new(3, 1, 4) 2226 ), 2227 } 2228 ); 2229 assert_eq!( 2230 t_err(r"\p{Foo}"), 2231 TestError { 2232 kind: hir::ErrorKind::UnicodePropertyNotFound, 2233 span: Span::new( 2234 Position::new(0, 1, 1), 2235 Position::new(7, 1, 8) 2236 ), 2237 } 2238 ); 2239 assert_eq!( 2240 t_err(r"\p{gc:Foo}"), 2241 TestError { 2242 kind: 
#[test]
#[cfg(not(feature = "unicode-gencat"))]
fn class_unicode_gencat_disabled() {
    // Both errors start at the beginning of the pattern; only the end of
    // the span differs.
    let not_found = |end: Position| TestError {
        kind: hir::ErrorKind::UnicodePropertyNotFound,
        span: Span::new(Position::new(0, 1, 1), end),
    };

    assert_eq!(t_err(r"\p{Separator}"), not_found(Position::new(13, 1, 14)));

    assert_eq!(t_err(r"\p{Any}"), not_found(Position::new(7, 1, 8)));
}
#[test]
#[cfg(feature = "unicode-age")]
fn class_unicode_age() {
    // An unknown value for the `age` property is reported against the
    // whole `\p{...}` expression.
    let whole_class = Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12));
    let expected = TestError {
        kind: hir::ErrorKind::UnicodePropertyValueNotFound,
        span: whole_class,
    };
    assert_eq!(t_err(r"\p{age:Foo}"), expected);
}
#[cfg(feature = "unicode-gencat")] 2408 assert_eq!( 2409 t(r"[\p{separator}]"), 2410 hir_uclass_query(ClassQuery::Binary("separator")) 2411 ); 2412 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] 2413 assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit"))); 2414 #[cfg(feature = "unicode-gencat")] 2415 assert_eq!( 2416 t(r"[^\PZ]"), 2417 hir_uclass_query(ClassQuery::Binary("separator")) 2418 ); 2419 #[cfg(feature = "unicode-gencat")] 2420 assert_eq!( 2421 t(r"[^\P{separator}]"), 2422 hir_uclass_query(ClassQuery::Binary("separator")) 2423 ); 2424 #[cfg(all( 2425 feature = "unicode-case", 2426 any(feature = "unicode-perl", feature = "unicode-gencat") 2427 ))] 2428 assert_eq!( 2429 t(r"(?i)[^\D]"), 2430 hir_uclass_query(ClassQuery::Binary("digit")) 2431 ); 2432 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] 2433 assert_eq!( 2434 t(r"(?i)[^\P{greek}]"), 2435 hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))) 2436 ); 2437 2438 assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')])); 2439 assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')])); 2440 assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')])); 2441 2442 #[cfg(feature = "unicode-case")] 2443 assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')])); 2444 #[cfg(feature = "unicode-case")] 2445 assert_eq!( 2446 t("(?i)[k]"), 2447 hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),]) 2448 ); 2449 #[cfg(feature = "unicode-case")] 2450 assert_eq!( 2451 t("(?i)[β]"), 2452 hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) 2453 ); 2454 assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),])); 2455 2456 assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')]))); 2457 assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')]))); 2458 assert_eq!( 2459 t_bytes("(?-u)[^a]"), 2460 hir_negate(hir_bclass(&[(b'a', b'a')])) 2461 ); 2462 #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))] 2463 
assert_eq!( 2464 t(r"[^\d]"), 2465 hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) 2466 ); 2467 #[cfg(feature = "unicode-gencat")] 2468 assert_eq!( 2469 t(r"[^\pZ]"), 2470 hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) 2471 ); 2472 #[cfg(feature = "unicode-gencat")] 2473 assert_eq!( 2474 t(r"[^\p{separator}]"), 2475 hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))) 2476 ); 2477 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] 2478 assert_eq!( 2479 t(r"(?i)[^\p{greek}]"), 2480 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( 2481 "greek" 2482 )))) 2483 ); 2484 #[cfg(all(feature = "unicode-case", feature = "unicode-script"))] 2485 assert_eq!( 2486 t(r"(?i)[\P{greek}]"), 2487 hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary( 2488 "greek" 2489 )))) 2490 ); 2491 2492 // Test some weird cases. 2493 assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')])); 2494 2495 assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')])); 2496 assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')])); 2497 assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')])); 2498 assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')])); 2499 assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')])); 2500 2501 assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')])); 2502 assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')])); 2503 assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')])); 2504 assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')])); 2505 assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')])); 2506 2507 assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')])); 2508 assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')])); 2509 assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')])); 2510 assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')])); 2511 assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')])); 2512 2513 assert_eq!( 2514 t_err("(?-u)[^a]"), 2515 TestError { 2516 kind: hir::ErrorKind::InvalidUtf8, 2517 span: Span::new( 2518 Position::new(5, 1, 6), 2519 
Position::new(9, 1, 10) 2520 ), 2521 } 2522 ); 2523 #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] 2524 assert_eq!( 2525 t_err(r"[^\s\S]"), 2526 TestError { 2527 kind: hir::ErrorKind::EmptyClassNotAllowed, 2528 span: Span::new( 2529 Position::new(0, 1, 1), 2530 Position::new(7, 1, 8) 2531 ), 2532 } 2533 ); 2534 #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))] 2535 assert_eq!( 2536 t_err(r"(?-u)[^\s\S]"), 2537 TestError { 2538 kind: hir::ErrorKind::EmptyClassNotAllowed, 2539 span: Span::new( 2540 Position::new(5, 1, 6), 2541 Position::new(12, 1, 13) 2542 ), 2543 } 2544 ); 2545 } 2546 2547 #[test] 2548 fn class_bracketed_union() { 2549 assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')])); 2550 #[cfg(feature = "unicode-gencat")] 2551 assert_eq!( 2552 t(r"[a\pZb]"), 2553 hir_union( 2554 hir_uclass(&[('a', 'b')]), 2555 hir_uclass_query(ClassQuery::Binary("separator")) 2556 ) 2557 ); 2558 #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))] 2559 assert_eq!( 2560 t(r"[\pZ\p{Greek}]"), 2561 hir_union( 2562 hir_uclass_query(ClassQuery::Binary("greek")), 2563 hir_uclass_query(ClassQuery::Binary("separator")) 2564 ) 2565 ); 2566 #[cfg(all( 2567 feature = "unicode-age", 2568 feature = "unicode-gencat", 2569 feature = "unicode-script" 2570 ))] 2571 assert_eq!( 2572 t(r"[\p{age:3.0}\pZ\p{Greek}]"), 2573 hir_union( 2574 hir_uclass_query(ClassQuery::ByValue { 2575 property_name: "age", 2576 property_value: "3.0", 2577 }), 2578 hir_union( 2579 hir_uclass_query(ClassQuery::Binary("greek")), 2580 hir_uclass_query(ClassQuery::Binary("separator")) 2581 ) 2582 ) 2583 ); 2584 #[cfg(all( 2585 feature = "unicode-age", 2586 feature = "unicode-gencat", 2587 feature = "unicode-script" 2588 ))] 2589 assert_eq!( 2590 t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"), 2591 hir_union( 2592 hir_uclass_query(ClassQuery::ByValue { 2593 property_name: "age", 2594 property_value: "3.0", 2595 }), 2596 hir_union( 2597 
hir_uclass_query(ClassQuery::Binary("cyrillic")), 2598 hir_union( 2599 hir_uclass_query(ClassQuery::Binary("greek")), 2600 hir_uclass_query(ClassQuery::Binary("separator")) 2601 ) 2602 ) 2603 ) 2604 ); 2605 2606 #[cfg(all( 2607 feature = "unicode-age", 2608 feature = "unicode-case", 2609 feature = "unicode-gencat", 2610 feature = "unicode-script" 2611 ))] 2612 assert_eq!( 2613 t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"), 2614 hir_case_fold(hir_union( 2615 hir_uclass_query(ClassQuery::ByValue { 2616 property_name: "age", 2617 property_value: "3.0", 2618 }), 2619 hir_union( 2620 hir_uclass_query(ClassQuery::Binary("greek")), 2621 hir_uclass_query(ClassQuery::Binary("separator")) 2622 ) 2623 )) 2624 ); 2625 #[cfg(all( 2626 feature = "unicode-age", 2627 feature = "unicode-gencat", 2628 feature = "unicode-script" 2629 ))] 2630 assert_eq!( 2631 t(r"[^\p{age:3.0}\pZ\p{Greek}]"), 2632 hir_negate(hir_union( 2633 hir_uclass_query(ClassQuery::ByValue { 2634 property_name: "age", 2635 property_value: "3.0", 2636 }), 2637 hir_union( 2638 hir_uclass_query(ClassQuery::Binary("greek")), 2639 hir_uclass_query(ClassQuery::Binary("separator")) 2640 ) 2641 )) 2642 ); 2643 #[cfg(all( 2644 feature = "unicode-age", 2645 feature = "unicode-case", 2646 feature = "unicode-gencat", 2647 feature = "unicode-script" 2648 ))] 2649 assert_eq!( 2650 t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"), 2651 hir_negate(hir_case_fold(hir_union( 2652 hir_uclass_query(ClassQuery::ByValue { 2653 property_name: "age", 2654 property_value: "3.0", 2655 }), 2656 hir_union( 2657 hir_uclass_query(ClassQuery::Binary("greek")), 2658 hir_uclass_query(ClassQuery::Binary("separator")) 2659 ) 2660 ))) 2661 ); 2662 } 2663 2664 #[test] 2665 fn class_bracketed_nested() { 2666 assert_eq!(t(r"[a[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); 2667 assert_eq!(t(r"[a-b[^c]]"), hir_negate(hir_uclass(&[('c', 'c')]))); 2668 assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[]))); 2669 2670 assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')])); 
2671 assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')])); 2672 2673 #[cfg(feature = "unicode-case")] 2674 assert_eq!( 2675 t(r"(?i)[a[^c]]"), 2676 hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) 2677 ); 2678 #[cfg(feature = "unicode-case")] 2679 assert_eq!( 2680 t(r"(?i)[a-b[^c]]"), 2681 hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))) 2682 ); 2683 2684 #[cfg(feature = "unicode-case")] 2685 assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')])); 2686 #[cfg(feature = "unicode-case")] 2687 assert_eq!( 2688 t(r"(?i)[^a-b[^c]]"), 2689 hir_uclass(&[('C', 'C'), ('c', 'c')]) 2690 ); 2691 2692 assert_eq!( 2693 t_err(r"[^a-c[^c]]"), 2694 TestError { 2695 kind: hir::ErrorKind::EmptyClassNotAllowed, 2696 span: Span::new( 2697 Position::new(0, 1, 1), 2698 Position::new(10, 1, 11) 2699 ), 2700 } 2701 ); 2702 #[cfg(feature = "unicode-case")] 2703 assert_eq!( 2704 t_err(r"(?i)[^a-c[^c]]"), 2705 TestError { 2706 kind: hir::ErrorKind::EmptyClassNotAllowed, 2707 span: Span::new( 2708 Position::new(4, 1, 5), 2709 Position::new(14, 1, 15) 2710 ), 2711 } 2712 ); 2713 } 2714 2715 #[test] 2716 fn class_bracketed_intersect() { 2717 assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')])); 2718 assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')])); 2719 assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')])); 2720 assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')])); 2721 assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')])); 2722 assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')])); 2723 assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')])); 2724 assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')])); 2725 assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); 2726 2727 assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')])); 2728 assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); 2729 assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')])); 2730 assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), 
hir_bclass(&[(b'c', b'x')])); 2731 assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')])); 2732 assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')])); 2733 2734 #[cfg(feature = "unicode-case")] 2735 assert_eq!( 2736 t("(?i)[abc&&b-c]"), 2737 hir_case_fold(hir_uclass(&[('b', 'c')])) 2738 ); 2739 #[cfg(feature = "unicode-case")] 2740 assert_eq!( 2741 t("(?i)[abc&&[b-c]]"), 2742 hir_case_fold(hir_uclass(&[('b', 'c')])) 2743 ); 2744 #[cfg(feature = "unicode-case")] 2745 assert_eq!( 2746 t("(?i)[[abc]&&[b-c]]"), 2747 hir_case_fold(hir_uclass(&[('b', 'c')])) 2748 ); 2749 #[cfg(feature = "unicode-case")] 2750 assert_eq!( 2751 t("(?i)[a-z&&b-y&&c-x]"), 2752 hir_case_fold(hir_uclass(&[('c', 'x')])) 2753 ); 2754 #[cfg(feature = "unicode-case")] 2755 assert_eq!( 2756 t("(?i)[c-da-b&&a-d]"), 2757 hir_case_fold(hir_uclass(&[('a', 'd')])) 2758 ); 2759 #[cfg(feature = "unicode-case")] 2760 assert_eq!( 2761 t("(?i)[a-d&&c-da-b]"), 2762 hir_case_fold(hir_uclass(&[('a', 'd')])) 2763 ); 2764 2765 assert_eq!( 2766 t("(?i-u)[abc&&b-c]"), 2767 hir_case_fold(hir_bclass(&[(b'b', b'c')])) 2768 ); 2769 assert_eq!( 2770 t("(?i-u)[abc&&[b-c]]"), 2771 hir_case_fold(hir_bclass(&[(b'b', b'c')])) 2772 ); 2773 assert_eq!( 2774 t("(?i-u)[[abc]&&[b-c]]"), 2775 hir_case_fold(hir_bclass(&[(b'b', b'c')])) 2776 ); 2777 assert_eq!( 2778 t("(?i-u)[a-z&&b-y&&c-x]"), 2779 hir_case_fold(hir_bclass(&[(b'c', b'x')])) 2780 ); 2781 assert_eq!( 2782 t("(?i-u)[c-da-b&&a-d]"), 2783 hir_case_fold(hir_bclass(&[(b'a', b'd')])) 2784 ); 2785 assert_eq!( 2786 t("(?i-u)[a-d&&c-da-b]"), 2787 hir_case_fold(hir_bclass(&[(b'a', b'd')])) 2788 ); 2789 2790 // In `[a^]`, `^` does not need to be escaped, so it makes sense that 2791 // `^` is also allowed to be unescaped after `&&`. 2792 assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')])); 2793 // `]` needs to be escaped after `&&` since it's not at start of class. 
2794 assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')])); 2795 assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')])); 2796 assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')])); 2797 assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')])); 2798 // Test precedence. 2799 assert_eq!( 2800 t(r"[a-w&&[^c-g]z]"), 2801 hir_uclass(&[('a', 'b'), ('h', 'w')]) 2802 ); 2803 } 2804 2805 #[test] 2806 fn class_bracketed_intersect_negate() { 2807 #[cfg(feature = "unicode-perl")] 2808 assert_eq!( 2809 t(r"[^\w&&\d]"), 2810 hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) 2811 ); 2812 assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')]))); 2813 #[cfg(feature = "unicode-perl")] 2814 assert_eq!( 2815 t(r"[^[\w&&\d]]"), 2816 hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))) 2817 ); 2818 #[cfg(feature = "unicode-perl")] 2819 assert_eq!( 2820 t(r"[^[^\w&&\d]]"), 2821 hir_uclass_query(ClassQuery::Binary("digit")) 2822 ); 2823 #[cfg(feature = "unicode-perl")] 2824 assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word())); 2825 2826 #[cfg(feature = "unicode-perl")] 2827 assert_eq!( 2828 t_bytes(r"(?-u)[^\w&&\d]"), 2829 hir_negate(hir_bclass_from_char(ascii_class( 2830 &ast::ClassAsciiKind::Digit 2831 ))) 2832 ); 2833 assert_eq!( 2834 t_bytes(r"(?-u)[^[a-z&&a-c]]"), 2835 hir_negate(hir_bclass(&[(b'a', b'c')])) 2836 ); 2837 assert_eq!( 2838 t_bytes(r"(?-u)[^[\w&&\d]]"), 2839 hir_negate(hir_bclass_from_char(ascii_class( 2840 &ast::ClassAsciiKind::Digit 2841 ))) 2842 ); 2843 assert_eq!( 2844 t_bytes(r"(?-u)[^[^\w&&\d]]"), 2845 hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)) 2846 ); 2847 assert_eq!( 2848 t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"), 2849 hir_negate(hir_bclass_from_char(ascii_class( 2850 &ast::ClassAsciiKind::Word 2851 ))) 2852 ); 2853 } 2854 2855 #[test] 2856 fn class_bracketed_difference() { 2857 #[cfg(feature = "unicode-gencat")] 2858 assert_eq!( 2859 t(r"[\pL--[:ascii:]]"), 2860 hir_difference( 2861 
hir_uclass_query(ClassQuery::Binary("letter")), 2862 hir_uclass(&[('\0', '\x7F')]) 2863 ) 2864 ); 2865 2866 assert_eq!( 2867 t(r"(?-u)[[:alpha:]--[:lower:]]"), 2868 hir_bclass(&[(b'A', b'Z')]) 2869 ); 2870 } 2871 2872 #[test] 2873 fn class_bracketed_symmetric_difference() { 2874 #[cfg(feature = "unicode-script")] 2875 assert_eq!( 2876 t(r"[\p{sc:Greek}~~\p{scx:Greek}]"), 2877 hir_uclass(&[ 2878 ('\u{0342}', '\u{0342}'), 2879 ('\u{0345}', '\u{0345}'), 2880 ('\u{1DC0}', '\u{1DC1}'), 2881 ]) 2882 ); 2883 assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')])); 2884 2885 assert_eq!( 2886 t(r"(?-u)[a-g~~c-j]"), 2887 hir_bclass(&[(b'a', b'b'), (b'h', b'j')]) 2888 ); 2889 } 2890 2891 #[test] 2892 fn ignore_whitespace() { 2893 assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3")); 2894 assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S")); 2895 assert_eq!( 2896 t(r"(?x)\x # comment 2897{ # comment 2898 53 # comment 2899} #comment"), 2900 hir_lit("S") 2901 ); 2902 2903 assert_eq!(t(r"(?x)\x 53"), hir_lit("S")); 2904 assert_eq!( 2905 t(r"(?x)\x # comment 2906 53 # comment"), 2907 hir_lit("S") 2908 ); 2909 assert_eq!(t(r"(?x)\x5 3"), hir_lit("S")); 2910 2911 #[cfg(feature = "unicode-gencat")] 2912 assert_eq!( 2913 t(r"(?x)\p # comment 2914{ # comment 2915 Separator # comment 2916} # comment"), 2917 hir_uclass_query(ClassQuery::Binary("separator")) 2918 ); 2919 2920 assert_eq!( 2921 t(r"(?x)a # comment 2922{ # comment 2923 5 # comment 2924 , # comment 2925 10 # comment 2926} # comment"), 2927 hir_range( 2928 true, 2929 hir::RepetitionRange::Bounded(5, 10), 2930 hir_lit("a") 2931 ) 2932 ); 2933 2934 assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a ")); 2935 } 2936 2937 #[test] 2938 fn analysis_is_always_utf8() { 2939 // Positive examples. 
2940 assert!(t_bytes(r"a").is_always_utf8()); 2941 assert!(t_bytes(r"ab").is_always_utf8()); 2942 assert!(t_bytes(r"(?-u)a").is_always_utf8()); 2943 assert!(t_bytes(r"(?-u)ab").is_always_utf8()); 2944 assert!(t_bytes(r"\xFF").is_always_utf8()); 2945 assert!(t_bytes(r"\xFF\xFF").is_always_utf8()); 2946 assert!(t_bytes(r"[^a]").is_always_utf8()); 2947 assert!(t_bytes(r"[^a][^a]").is_always_utf8()); 2948 assert!(t_bytes(r"\b").is_always_utf8()); 2949 assert!(t_bytes(r"\B").is_always_utf8()); 2950 assert!(t_bytes(r"(?-u)\b").is_always_utf8()); 2951 2952 // Negative examples. 2953 assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8()); 2954 assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8()); 2955 assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8()); 2956 assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8()); 2957 assert!(!t_bytes(r"(?-u)\B").is_always_utf8()); 2958 } 2959 2960 #[test] 2961 fn analysis_is_all_assertions() { 2962 // Positive examples. 2963 assert!(t(r"\b").is_all_assertions()); 2964 assert!(t(r"\B").is_all_assertions()); 2965 assert!(t(r"^").is_all_assertions()); 2966 assert!(t(r"$").is_all_assertions()); 2967 assert!(t(r"\A").is_all_assertions()); 2968 assert!(t(r"\z").is_all_assertions()); 2969 assert!(t(r"$^\z\A\b\B").is_all_assertions()); 2970 assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions()); 2971 assert!(t(r"^$|$^").is_all_assertions()); 2972 assert!(t(r"((\b)+())*^").is_all_assertions()); 2973 2974 // Negative examples. 2975 assert!(!t(r"^a").is_all_assertions()); 2976 } 2977 2978 #[test] 2979 fn analysis_is_anchored() { 2980 // Positive examples. 
2981 assert!(t(r"^").is_anchored_start()); 2982 assert!(t(r"$").is_anchored_end()); 2983 assert!(t(r"^").is_line_anchored_start()); 2984 assert!(t(r"$").is_line_anchored_end()); 2985 2986 assert!(t(r"^^").is_anchored_start()); 2987 assert!(t(r"$$").is_anchored_end()); 2988 assert!(t(r"^^").is_line_anchored_start()); 2989 assert!(t(r"$$").is_line_anchored_end()); 2990 2991 assert!(t(r"^$").is_anchored_start()); 2992 assert!(t(r"^$").is_anchored_end()); 2993 assert!(t(r"^$").is_line_anchored_start()); 2994 assert!(t(r"^$").is_line_anchored_end()); 2995 2996 assert!(t(r"^foo").is_anchored_start()); 2997 assert!(t(r"foo$").is_anchored_end()); 2998 assert!(t(r"^foo").is_line_anchored_start()); 2999 assert!(t(r"foo$").is_line_anchored_end()); 3000 3001 assert!(t(r"^foo|^bar").is_anchored_start()); 3002 assert!(t(r"foo$|bar$").is_anchored_end()); 3003 assert!(t(r"^foo|^bar").is_line_anchored_start()); 3004 assert!(t(r"foo$|bar$").is_line_anchored_end()); 3005 3006 assert!(t(r"^(foo|bar)").is_anchored_start()); 3007 assert!(t(r"(foo|bar)$").is_anchored_end()); 3008 assert!(t(r"^(foo|bar)").is_line_anchored_start()); 3009 assert!(t(r"(foo|bar)$").is_line_anchored_end()); 3010 3011 assert!(t(r"^+").is_anchored_start()); 3012 assert!(t(r"$+").is_anchored_end()); 3013 assert!(t(r"^+").is_line_anchored_start()); 3014 assert!(t(r"$+").is_line_anchored_end()); 3015 assert!(t(r"^++").is_anchored_start()); 3016 assert!(t(r"$++").is_anchored_end()); 3017 assert!(t(r"^++").is_line_anchored_start()); 3018 assert!(t(r"$++").is_line_anchored_end()); 3019 assert!(t(r"(^)+").is_anchored_start()); 3020 assert!(t(r"($)+").is_anchored_end()); 3021 assert!(t(r"(^)+").is_line_anchored_start()); 3022 assert!(t(r"($)+").is_line_anchored_end()); 3023 3024 assert!(t(r"$^").is_anchored_start()); 3025 assert!(t(r"$^").is_anchored_start()); 3026 assert!(t(r"$^").is_line_anchored_end()); 3027 assert!(t(r"$^").is_line_anchored_end()); 3028 assert!(t(r"$^|^$").is_anchored_start()); 3029 
assert!(t(r"$^|^$").is_anchored_end()); 3030 assert!(t(r"$^|^$").is_line_anchored_start()); 3031 assert!(t(r"$^|^$").is_line_anchored_end()); 3032 3033 assert!(t(r"\b^").is_anchored_start()); 3034 assert!(t(r"$\b").is_anchored_end()); 3035 assert!(t(r"\b^").is_line_anchored_start()); 3036 assert!(t(r"$\b").is_line_anchored_end()); 3037 assert!(t(r"^(?m:^)").is_anchored_start()); 3038 assert!(t(r"(?m:$)$").is_anchored_end()); 3039 assert!(t(r"^(?m:^)").is_line_anchored_start()); 3040 assert!(t(r"(?m:$)$").is_line_anchored_end()); 3041 assert!(t(r"(?m:^)^").is_anchored_start()); 3042 assert!(t(r"$(?m:$)").is_anchored_end()); 3043 assert!(t(r"(?m:^)^").is_line_anchored_start()); 3044 assert!(t(r"$(?m:$)").is_line_anchored_end()); 3045 3046 // Negative examples. 3047 assert!(!t(r"(?m)^").is_anchored_start()); 3048 assert!(!t(r"(?m)$").is_anchored_end()); 3049 assert!(!t(r"(?m:^$)|$^").is_anchored_start()); 3050 assert!(!t(r"(?m:^$)|$^").is_anchored_end()); 3051 assert!(!t(r"$^|(?m:^$)").is_anchored_start()); 3052 assert!(!t(r"$^|(?m:^$)").is_anchored_end()); 3053 3054 assert!(!t(r"a^").is_anchored_start()); 3055 assert!(!t(r"$a").is_anchored_start()); 3056 assert!(!t(r"a^").is_line_anchored_start()); 3057 assert!(!t(r"$a").is_line_anchored_start()); 3058 3059 assert!(!t(r"a^").is_anchored_end()); 3060 assert!(!t(r"$a").is_anchored_end()); 3061 assert!(!t(r"a^").is_line_anchored_end()); 3062 assert!(!t(r"$a").is_line_anchored_end()); 3063 3064 assert!(!t(r"^foo|bar").is_anchored_start()); 3065 assert!(!t(r"foo|bar$").is_anchored_end()); 3066 assert!(!t(r"^foo|bar").is_line_anchored_start()); 3067 assert!(!t(r"foo|bar$").is_line_anchored_end()); 3068 3069 assert!(!t(r"^*").is_anchored_start()); 3070 assert!(!t(r"$*").is_anchored_end()); 3071 assert!(!t(r"^*").is_line_anchored_start()); 3072 assert!(!t(r"$*").is_line_anchored_end()); 3073 assert!(!t(r"^*+").is_anchored_start()); 3074 assert!(!t(r"$*+").is_anchored_end()); 3075 assert!(!t(r"^*+").is_line_anchored_start()); 
3076 assert!(!t(r"$*+").is_line_anchored_end()); 3077 assert!(!t(r"^+*").is_anchored_start()); 3078 assert!(!t(r"$+*").is_anchored_end()); 3079 assert!(!t(r"^+*").is_line_anchored_start()); 3080 assert!(!t(r"$+*").is_line_anchored_end()); 3081 assert!(!t(r"(^)*").is_anchored_start()); 3082 assert!(!t(r"($)*").is_anchored_end()); 3083 assert!(!t(r"(^)*").is_line_anchored_start()); 3084 assert!(!t(r"($)*").is_line_anchored_end()); 3085 } 3086 3087 #[test] 3088 fn analysis_is_line_anchored() { 3089 assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start()); 3090 assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end()); 3091 3092 assert!(t(r"(?m)^foo|^bar").is_line_anchored_start()); 3093 assert!(t(r"(?m)foo$|bar$").is_line_anchored_end()); 3094 3095 assert!(t(r"(?m)^").is_line_anchored_start()); 3096 assert!(t(r"(?m)$").is_line_anchored_end()); 3097 3098 assert!(t(r"(?m:^$)|$^").is_line_anchored_start()); 3099 assert!(t(r"(?m:^$)|$^").is_line_anchored_end()); 3100 3101 assert!(t(r"$^|(?m:^$)").is_line_anchored_start()); 3102 assert!(t(r"$^|(?m:^$)").is_line_anchored_end()); 3103 } 3104 3105 #[test] 3106 fn analysis_is_any_anchored() { 3107 // Positive examples. 3108 assert!(t(r"^").is_any_anchored_start()); 3109 assert!(t(r"$").is_any_anchored_end()); 3110 assert!(t(r"\A").is_any_anchored_start()); 3111 assert!(t(r"\z").is_any_anchored_end()); 3112 3113 // Negative examples. 3114 assert!(!t(r"(?m)^").is_any_anchored_start()); 3115 assert!(!t(r"(?m)$").is_any_anchored_end()); 3116 assert!(!t(r"$").is_any_anchored_start()); 3117 assert!(!t(r"^").is_any_anchored_end()); 3118 } 3119 3120 #[test] 3121 fn analysis_is_match_empty() { 3122 // Positive examples. 
3123 assert!(t(r"").is_match_empty()); 3124 assert!(t(r"()").is_match_empty()); 3125 assert!(t(r"()*").is_match_empty()); 3126 assert!(t(r"()+").is_match_empty()); 3127 assert!(t(r"()?").is_match_empty()); 3128 assert!(t(r"a*").is_match_empty()); 3129 assert!(t(r"a?").is_match_empty()); 3130 assert!(t(r"a{0}").is_match_empty()); 3131 assert!(t(r"a{0,}").is_match_empty()); 3132 assert!(t(r"a{0,1}").is_match_empty()); 3133 assert!(t(r"a{0,10}").is_match_empty()); 3134 #[cfg(feature = "unicode-gencat")] 3135 assert!(t(r"\pL*").is_match_empty()); 3136 assert!(t(r"a*|b").is_match_empty()); 3137 assert!(t(r"b|a*").is_match_empty()); 3138 assert!(t(r"a|").is_match_empty()); 3139 assert!(t(r"|a").is_match_empty()); 3140 assert!(t(r"a||b").is_match_empty()); 3141 assert!(t(r"a*a?(abcd)*").is_match_empty()); 3142 assert!(t(r"^").is_match_empty()); 3143 assert!(t(r"$").is_match_empty()); 3144 assert!(t(r"(?m)^").is_match_empty()); 3145 assert!(t(r"(?m)$").is_match_empty()); 3146 assert!(t(r"\A").is_match_empty()); 3147 assert!(t(r"\z").is_match_empty()); 3148 assert!(t(r"\B").is_match_empty()); 3149 assert!(t_bytes(r"(?-u)\B").is_match_empty()); 3150 assert!(t(r"\b").is_match_empty()); 3151 assert!(t(r"(?-u)\b").is_match_empty()); 3152 3153 // Negative examples. 3154 assert!(!t(r"a+").is_match_empty()); 3155 assert!(!t(r"a{1}").is_match_empty()); 3156 assert!(!t(r"a{1,}").is_match_empty()); 3157 assert!(!t(r"a{1,2}").is_match_empty()); 3158 assert!(!t(r"a{1,10}").is_match_empty()); 3159 assert!(!t(r"b|a").is_match_empty()); 3160 assert!(!t(r"a*a+(abcd)*").is_match_empty()); 3161 } 3162 3163 #[test] 3164 fn analysis_is_literal() { 3165 // Positive examples. 3166 assert!(t(r"a").is_literal()); 3167 assert!(t(r"ab").is_literal()); 3168 assert!(t(r"abc").is_literal()); 3169 assert!(t(r"(?m)abc").is_literal()); 3170 3171 // Negative examples. 
3172 assert!(!t(r"").is_literal()); 3173 assert!(!t(r"^").is_literal()); 3174 assert!(!t(r"a|b").is_literal()); 3175 assert!(!t(r"(a)").is_literal()); 3176 assert!(!t(r"a+").is_literal()); 3177 assert!(!t(r"foo(a)").is_literal()); 3178 assert!(!t(r"(a)foo").is_literal()); 3179 assert!(!t(r"[a]").is_literal()); 3180 } 3181 3182 #[test] 3183 fn analysis_is_alternation_literal() { 3184 // Positive examples. 3185 assert!(t(r"a").is_alternation_literal()); 3186 assert!(t(r"ab").is_alternation_literal()); 3187 assert!(t(r"abc").is_alternation_literal()); 3188 assert!(t(r"(?m)abc").is_alternation_literal()); 3189 assert!(t(r"a|b").is_alternation_literal()); 3190 assert!(t(r"a|b|c").is_alternation_literal()); 3191 assert!(t(r"foo|bar").is_alternation_literal()); 3192 assert!(t(r"foo|bar|baz").is_alternation_literal()); 3193 3194 // Negative examples. 3195 assert!(!t(r"").is_alternation_literal()); 3196 assert!(!t(r"^").is_alternation_literal()); 3197 assert!(!t(r"(a)").is_alternation_literal()); 3198 assert!(!t(r"a+").is_alternation_literal()); 3199 assert!(!t(r"foo(a)").is_alternation_literal()); 3200 assert!(!t(r"(a)foo").is_alternation_literal()); 3201 assert!(!t(r"[a]").is_alternation_literal()); 3202 assert!(!t(r"[a]|b").is_alternation_literal()); 3203 assert!(!t(r"a|[b]").is_alternation_literal()); 3204 assert!(!t(r"(a)|b").is_alternation_literal()); 3205 assert!(!t(r"a|(b)").is_alternation_literal()); 3206 } 3207} 3208