use core::mem::size_of;

use crate::memmem::{util::memcmp, vector::Vector, NeedleInfo};

/// The minimum length of a needle required for this algorithm. The minimum
/// is 2 since a length of 1 should just use memchr and a length of 0 isn't
/// a case handled by this searcher.
pub(crate) const MIN_NEEDLE_LEN: usize = 2;

/// The maximum length of a needle accepted by this algorithm.
///
/// In reality, there is no hard max here. The code below can handle any
/// length needle. (Perhaps that suggests there are missing optimizations.)
/// Instead, this is a heuristic and a bound guaranteeing our linear time
/// complexity.
///
/// It is a heuristic because when a candidate match is found, memcmp is run.
/// For very large needles with lots of false positives, memcmp can make the
/// code run quite slow.
///
/// It is a bound because the worst case behavior with memcmp is
/// multiplicative in the size of the needle and haystack, and we want to
/// keep that additive. This bound ensures we still meet that bound
/// theoretically, since it's just a constant. We aren't acting in bad faith
/// here: memcmp on tiny needles is so fast that even in pathological cases
/// (see the pathological vector benchmarks), this is still just as fast or
/// faster in practice.
///
/// This specific number was chosen by tweaking a bit and running benchmarks.
/// The rare-medium-needle benchmark, for example, gets about 5% faster by
/// using this algorithm instead of a prefilter-accelerated Two-Way. There's
/// also a theoretical desire to keep this number reasonably low, to mitigate
/// the impact of pathological cases. I did try 64, and some benchmarks got a
/// little better, and others (particularly the pathological ones) got a lot
/// worse. So... 32 it is?
pub(crate) const MAX_NEEDLE_LEN: usize = 32;

/// The implementation of the forward vector accelerated substring search.
///
/// This is extremely similar to the prefilter vector module by the same name.
/// The key difference is that this is not a prefilter. Instead, it handles
/// confirming its own matches. The trade off is that this only works with
/// smaller needles. The speed up here is that an inlined memcmp on a tiny
/// needle is very quick, even on pathological inputs. This is much better
/// than combining a prefilter with Two-Way, where using Two-Way to confirm
/// the match has higher latency.
///
/// So why not use this for all needles? We could, and it would probably work
/// really well on most inputs. But its worst case is multiplicative and we
/// want to guarantee worst case additive time. Some of the benchmarks try to
/// justify this (see the pathological ones).
///
/// The prefilter variant of this has more comments. Also note that we only
/// implement this for forward searches for now. If you have a compelling use
/// case for accelerated reverse search, please file an issue.
#[derive(Clone, Copy, Debug)]
pub(crate) struct Forward {
    rare1i: u8,
    rare2i: u8,
}

impl Forward {
    /// Create a new "generic simd" forward searcher. If one could not be
    /// created from the given inputs, then None is returned.
    pub(crate) fn new(ninfo: &NeedleInfo, needle: &[u8]) -> Option<Forward> {
        let (rare1i, rare2i) = ninfo.rarebytes.as_rare_ordered_u8();
        // If the needle is too short or too long, give up. Also, give up
        // if the rare bytes detected are at the same position.
        // (It likely suggests a degenerate case, although it should
        // technically not be possible.)
        if needle.len() < MIN_NEEDLE_LEN
            || needle.len() > MAX_NEEDLE_LEN
            || rare1i == rare2i
        {
            return None;
        }
        Some(Forward { rare1i, rare2i })
    }

    /// Returns the minimum length of haystack that is needed for this
    /// searcher to work for a particular vector. Passing a haystack with a
    /// length smaller than this will cause `fwd_find` to panic.
    #[inline(always)]
    pub(crate) fn min_haystack_len<V: Vector>(&self) -> usize {
        self.rare2i as usize + size_of::<V>()
    }
}

/// Searches the given haystack for the given needle. The needle given should
/// be the same as the needle that this searcher was initialized with.
///
/// # Panics
///
/// When the given haystack has a length smaller than `min_haystack_len`.
///
/// # Safety
///
/// Since this is meant to be used with vector functions, callers need to
/// specialize this inside of a function with a `target_feature` attribute.
/// Therefore, callers must ensure that whatever target feature is being used
/// supports the vector functions that this function is specialized for. (For
/// the specific vector functions used, see the Vector trait implementations.)
#[inline(always)]
pub(crate) unsafe fn fwd_find<V: Vector>(
    fwd: &Forward,
    haystack: &[u8],
    needle: &[u8],
) -> Option<usize> {
    // It would be nice if we didn't have this check here, since the meta
    // searcher should handle it for us. But without this, I don't think we
    // guarantee that end_ptr.sub(needle.len()) won't result in UB. We could
    // put it as part of the safety contract, but it makes it more complicated
    // than necessary.
    if haystack.len() < needle.len() {
        return None;
    }
    let min_haystack_len = fwd.min_haystack_len::<V>();
    assert!(haystack.len() >= min_haystack_len, "haystack too small");
    debug_assert!(needle.len() <= haystack.len());
    debug_assert!(
        needle.len() >= MIN_NEEDLE_LEN,
        "needle must be at least {} bytes",
        MIN_NEEDLE_LEN,
    );
    debug_assert!(
        needle.len() <= MAX_NEEDLE_LEN,
        "needle must be at most {} bytes",
        MAX_NEEDLE_LEN,
    );

    let (rare1i, rare2i) = (fwd.rare1i as usize, fwd.rare2i as usize);
    let rare1chunk = V::splat(needle[rare1i]);
    let rare2chunk = V::splat(needle[rare2i]);

    let start_ptr = haystack.as_ptr();
    let end_ptr = start_ptr.add(haystack.len());
    let max_ptr = end_ptr.sub(min_haystack_len);
    let mut ptr = start_ptr;

    // N.B. I did experiment with unrolling the loop to deal with size(V)
    // bytes at a time and 2*size(V) bytes at a time. The double unroll was
    // marginally faster while the quadruple unroll was unambiguously slower.
    // In the end, I decided the complexity from unrolling wasn't worth it. I
    // used the memmem/krate/prebuilt/huge-en/ benchmarks to compare.
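    // Main loop: each iteration checks size_of::<V>() candidate starting
    // positions at once. The loop bound keeps every unaligned vector load in
    // fwd_find_in_chunk in bounds, since max_ptr = end_ptr - min_haystack_len
    // and min_haystack_len = rare2i + size_of::<V>() accounts for the
    // furthest load. Whatever tail remains is handled after the loop with one
    // final overlapping load at max_ptr, using a mask to skip positions the
    // loop has already checked.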
    while ptr <= max_ptr {
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, !0,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
        ptr = ptr.add(size_of::<V>());
    }
    if ptr < end_ptr {
        let remaining = diff(end_ptr, ptr);
        debug_assert!(
            remaining < min_haystack_len,
            "remaining bytes should be smaller than the minimum haystack \
             length of {}, but there are {} bytes remaining",
            min_haystack_len,
            remaining,
        );
        if remaining < needle.len() {
            return None;
        }
        debug_assert!(
            max_ptr < ptr,
            "after main loop, ptr should have exceeded max_ptr",
        );
        let overlap = diff(ptr, max_ptr);
        debug_assert!(
            overlap > 0,
            "overlap ({}) must always be non-zero",
            overlap,
        );
        debug_assert!(
            overlap < size_of::<V>(),
            "overlap ({}) cannot possibly be >= a vector ({})",
            overlap,
            size_of::<V>(),
        );
        // The mask has all of its bits set except for the first N least
        // significant bits, where N=overlap. This way, any matches that
        // occur in fwd_find_in_chunk within the overlap are automatically
        // ignored.
        let mask = !((1 << overlap) - 1);
        ptr = max_ptr;
        let m = fwd_find_in_chunk(
            fwd, needle, ptr, end_ptr, rare1chunk, rare2chunk, mask,
        );
        if let Some(chunki) = m {
            return Some(matched(start_ptr, ptr, chunki));
        }
    }
    None
}

/// Search for an occurrence of two rare bytes from the needle in the chunk
/// pointed to by ptr, with the end of the haystack pointed to by end_ptr.
/// When an occurrence is found, memcmp is run to check if a match occurs at
/// the corresponding position.
///
/// rare1chunk and rare2chunk correspond to vectors with the rare1 and rare2
/// bytes repeated in each 8-bit lane, respectively.
///
/// mask should have bits set corresponding to the positions in the chunk in
/// which matches are considered. This is only used for the last vector load
/// where the beginning of the vector might have overlapped with the last
/// load in the main loop. The mask lets us avoid visiting positions that
/// have already been discarded as matches.
///
/// # Safety
///
/// It must be safe to do an unaligned read of size(V) bytes starting at both
/// (ptr + rare1i) and (ptr + rare2i). It must also be safe to do unaligned
/// loads on ptr up to (end_ptr - needle.len()).
#[inline(always)]
unsafe fn fwd_find_in_chunk<V: Vector>(
    fwd: &Forward,
    needle: &[u8],
    ptr: *const u8,
    end_ptr: *const u8,
    rare1chunk: V,
    rare2chunk: V,
    mask: u32,
) -> Option<usize> {
    let chunk0 = V::load_unaligned(ptr.add(fwd.rare1i as usize));
    let chunk1 = V::load_unaligned(ptr.add(fwd.rare2i as usize));

    let eq0 = chunk0.cmpeq(rare1chunk);
    let eq1 = chunk1.cmpeq(rare2chunk);

    let mut match_offsets = eq0.and(eq1).movemask() & mask;
    while match_offsets != 0 {
        let offset = match_offsets.trailing_zeros() as usize;
        let ptr = ptr.add(offset);
        if end_ptr.sub(needle.len()) < ptr {
            return None;
        }
        let chunk = core::slice::from_raw_parts(ptr, needle.len());
        if memcmp(needle, chunk) {
            return Some(offset);
        }
        match_offsets &= match_offsets - 1;
    }
    None
}

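// To make the candidate check in fwd_find_in_chunk concrete, consider a
// hypothetical needle b"sherlock" for which the rare byte heuristic picked
// rare1i = 0 (b's') and rare2i = 5 (b'o'). Then rare1chunk is a vector with
// b's' in every lane and rare2chunk is a vector with b'o' in every lane, and
// bit i of match_offsets is set only when haystack[ptr + i] == b's' and
// haystack[ptr + i + 5] == b'o'. In other words, only candidate positions
// where both rare bytes line up are handed to memcmp. (The needle and the
// indices here are made up for illustration; the real ones come from the
// rare byte detection in NeedleInfo.)
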
/// Accepts a chunk-relative offset and returns a haystack relative offset.
///
/// See the function by the same name in the prefilter variant of this
/// algorithm to learn why it's tagged with inline(never). Even here, where
/// the function is simpler, inlining it leads to poorer codegen. (Although
/// it does improve some benchmarks, like prebuiltiter/huge-en/common-you.)
#[cold]
#[inline(never)]
fn matched(start_ptr: *const u8, ptr: *const u8, chunki: usize) -> usize {
    diff(ptr, start_ptr) + chunki
}

/// Subtract `b` from `a` and return the difference. `a` must be greater than
/// or equal to `b`.
fn diff(a: *const u8, b: *const u8) -> usize {
    debug_assert!(a >= b);
    (a as usize) - (b as usize)
}
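
// A couple of illustrative sanity checks for the non-vector helpers above.
// This is a minimal sketch: it only exercises `diff` and the overlap mask
// construction used in `fwd_find`, and it assumes the crate's usual
// `cfg(test)` unit test setup applies to this module.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn diff_computes_pointer_distance() {
        let buf = [0u8; 16];
        let start = buf.as_ptr();
        // SAFETY: `end` is one past the end of `buf`, which is permitted for
        // pointer arithmetic within the same allocation.
        let end = unsafe { start.add(buf.len()) };
        assert_eq!(diff(end, start), 16);
        assert_eq!(diff(start, start), 0);
    }

    #[test]
    fn overlap_mask_clears_low_bits() {
        // Mirrors the mask computed in `fwd_find` for the final overlapping
        // chunk: the `overlap` least significant bits are cleared so that
        // candidate positions already seen by the main loop are skipped.
        fn mask(overlap: usize) -> u32 {
            !((1 << overlap) - 1)
        }
        assert_eq!(mask(1), 0xFFFF_FFFE);
        assert_eq!(mask(3), 0xFFFF_FFF8);
        assert_eq!(mask(8), 0xFFFF_FF00);
    }
}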