1// These are tests specifically crafted for regexes that can match arbitrary 2// bytes. 3 4// A silly wrapper to make it possible to write and match raw bytes. 5struct R<'a>(&'a [u8]); 6impl<'a> R<'a> { 7 fn as_bytes(&self) -> &'a [u8] { 8 self.0 9 } 10} 11 12mat!(word_boundary, r"(?-u) \b", " δ", None); 13#[cfg(feature = "unicode-perl")] 14mat!(word_boundary_unicode, r" \b", " δ", Some((0, 1))); 15mat!(word_not_boundary, r"(?-u) \B", " δ", Some((0, 1))); 16#[cfg(feature = "unicode-perl")] 17mat!(word_not_boundary_unicode, r" \B", " δ", None); 18 19mat!(perl_w_ascii, r"(?-u)\w+", "aδ", Some((0, 1))); 20#[cfg(feature = "unicode-perl")] 21mat!(perl_w_unicode, r"\w+", "aδ", Some((0, 3))); 22mat!(perl_d_ascii, r"(?-u)\d+", "1२३9", Some((0, 1))); 23#[cfg(feature = "unicode-perl")] 24mat!(perl_d_unicode, r"\d+", "1२३9", Some((0, 8))); 25mat!(perl_s_ascii, r"(?-u)\s+", " \u{1680}", Some((0, 1))); 26#[cfg(feature = "unicode-perl")] 27mat!(perl_s_unicode, r"\s+", " \u{1680}", Some((0, 4))); 28 29// The first `(.+)` matches two Unicode codepoints, but can't match the 5th 30// byte, which isn't valid UTF-8. The second (byte based) `(.+)` takes over and 31// matches. 32mat!( 33 mixed1, 34 r"(.+)(?-u)(.+)", 35 R(b"\xCE\x93\xCE\x94\xFF"), 36 Some((0, 5)), 37 Some((0, 4)), 38 Some((4, 5)) 39); 40 41mat!(case_ascii_one, r"(?i-u)a", "A", Some((0, 1))); 42mat!(case_ascii_class, r"(?i-u)[a-z]+", "AaAaA", Some((0, 5))); 43#[cfg(feature = "unicode-case")] 44mat!(case_unicode, r"(?i)[a-z]+", "aA\u{212A}aA", Some((0, 7))); 45mat!(case_not_unicode, r"(?i-u)[a-z]+", "aA\u{212A}aA", Some((0, 2))); 46 47mat!(negate_unicode, r"[^a]", "δ", Some((0, 2))); 48mat!(negate_not_unicode, r"(?-u)[^a]", "δ", Some((0, 1))); 49 50// This doesn't match in a normal Unicode regex because the implicit preceding 51// `.*?` is Unicode aware. 52mat!(dotstar_prefix_not_unicode1, r"(?-u)a", R(b"\xFFa"), Some((1, 2))); 53mat!(dotstar_prefix_not_unicode2, r"a", R(b"\xFFa"), Some((1, 2))); 54 55// Have fun with null bytes. 56mat!( 57 null_bytes, 58 r"(?-u)(?P<cstr>[^\x00]+)\x00", 59 R(b"foo\x00"), 60 Some((0, 4)), 61 Some((0, 3)) 62); 63 64// Test that lookahead operators work properly in the face of invalid UTF-8. 65// See: https://github.com/rust-lang/regex/issues/277 66matiter!( 67 invalidutf8_anchor1, 68 r"(?-u)\xcc?^", 69 R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), 70 (0, 0) 71); 72matiter!( 73 invalidutf8_anchor2, 74 r"(?-u)^\xf7|4\xff\d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########[] d\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a\x8a##########\[] #####\x80\S7|$", 75 R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), 76 (22, 22) 77); 78matiter!( 79 invalidutf8_anchor3, 80 r"(?-u)^|ddp\xff\xffdddddlQd@\x80", 81 R(b"\x8d#;\x1a\xa4s3\x05foobarX\\\x0f0t\xe4\x9b\xa4"), 82 (0, 0) 83); 84 85// See https://github.com/rust-lang/regex/issues/303 86#[test] 87fn negated_full_byte_range() { 88 assert!(::regex::bytes::Regex::new(r#"(?-u)[^\x00-\xff]"#).is_err()); 89} 90 91matiter!(word_boundary_ascii1, r"(?-u:\B)x(?-u:\B)", "áxβ"); 92matiter!( 93 word_boundary_ascii2, 94 r"(?-u:\B)", 95 "0\u{7EF5E}", 96 (2, 2), 97 (3, 3), 98 (4, 4), 99 (5, 5) 100); 101 102// See: https://github.com/rust-lang/regex/issues/264 103mat!(ascii_boundary_no_capture, r"(?-u)\B", "\u{28f3e}", Some((0, 0))); 104mat!(ascii_boundary_capture, r"(?-u)(\B)", "\u{28f3e}", Some((0, 0))); 105 106// See: https://github.com/rust-lang/regex/issues/271 107mat!(end_not_wb, r"$(?-u:\B)", "\u{5c124}\u{b576c}", Some((8, 8))); 108