// These routines are meant to be optimized specifically for low latency as
// compared to the equivalent routines offered by std. (Which may invoke the
// dynamic linker and call out to libc, which introduces a bit more latency
// than we'd like.)

/// Returns true if and only if needle is a prefix of haystack.
#[inline(always)]
pub(crate) fn is_prefix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len() && memcmp(&haystack[..needle.len()], needle)
}

/// Returns true if and only if needle is a suffix of haystack.
#[inline(always)]
pub(crate) fn is_suffix(haystack: &[u8], needle: &[u8]) -> bool {
    needle.len() <= haystack.len()
        && memcmp(&haystack[haystack.len() - needle.len()..], needle)
}

/// Returns true if and only if x.len() == y.len() && x[i] == y[i] for all
/// 0 <= i < x.len().
///
/// Why not just use actual memcmp for this? Well, memcmp requires calling
/// out to libc, and this routine is called in fairly hot code paths. Other
/// than just calling out to libc, it also seems to result in worse codegen.
/// Rolling our own memcmp in pure Rust appears to be friendlier to the
/// optimizer.
///
/// We mark this as inline always, although some callers may not want it
/// inlined for better codegen (like Rabin-Karp). In that case, callers are
/// advised to create a non-inlineable wrapper routine that calls memcmp.
#[inline(always)]
pub(crate) fn memcmp(x: &[u8], y: &[u8]) -> bool {
    if x.len() != y.len() {
        return false;
    }
    // If we don't have enough bytes to do 4-byte-at-a-time loads, then
    // fall back to the naive slow version.
    //
    // TODO: We could do a copy_nonoverlapping combined with a mask instead
    // of a loop. Benchmark it.
    if x.len() < 4 {
        for (&b1, &b2) in x.iter().zip(y) {
            if b1 != b2 {
                return false;
            }
        }
        return true;
    }
    // When we have 4 or more bytes to compare, then proceed in chunks of 4
    // at a time using unaligned loads.
    //
    // Also, why do 4 byte loads instead of, say, 8 byte loads? The reason is
    // that this particular version of memcmp is likely to be called with
    // tiny needles. That means that if we do 8 byte loads, then a higher
    // proportion of memcmp calls will use the slower variant above. With
    // that said, this is a hypothesis and is only loosely supported by
    // benchmarks. There's likely some improvement that could be made here.
    // The main thing here though is to optimize for latency, not throughput.

    // SAFETY: Via the conditional above, we know that `x` and `y` have the
    // same length, so `px < pxend` implies that `py < pyend`. Thus,
    // dereferencing both `px` and `py` in the loop below is safe.
    //
    // Moreover, we set `pxend` and `pyend` to be 4 bytes before the actual
    // end of `px` and `py`. Thus, the final dereference outside of the loop
    // is guaranteed to be valid. (The final comparison will overlap with
    // the last comparison done in the loop for lengths that aren't
    // multiples of four.)
    //
    // Finally, we needn't worry about alignment here, since we do unaligned
    // loads.
    unsafe {
        let (mut px, mut py) = (x.as_ptr(), y.as_ptr());
        let (pxend, pyend) = (px.add(x.len() - 4), py.add(y.len() - 4));
        while px < pxend {
            let vx = (px as *const u32).read_unaligned();
            let vy = (py as *const u32).read_unaligned();
            if vx != vy {
                return false;
            }
            px = px.add(4);
            py = py.add(4);
        }
        let vx = (pxend as *const u32).read_unaligned();
        let vy = (pyend as *const u32).read_unaligned();
        vx == vy
    }
}
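
// A hedged sketch of the kind of non-inlineable wrapper that the comment on
// `memcmp` advises callers (like Rabin-Karp) to write when they don't want
// the comparison inlined. The name `memcmp_noinline` is hypothetical, not an
// established routine in this crate.
#[allow(dead_code)]
#[inline(never)]
pub(crate) fn memcmp_noinline(x: &[u8], y: &[u8]) -> bool {
    memcmp(x, y)
}

// A minimal test sketch exercising the routines above, assuming it lives in
// the same file as those routines. It deliberately covers lengths that
// aren't multiples of four, which exercise the overlapping final load in
// `memcmp`.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn prefix_and_suffix() {
        assert!(is_prefix(b"foobar", b"foo"));
        assert!(!is_prefix(b"foobar", b"bar"));
        assert!(is_suffix(b"foobar", b"bar"));
        assert!(!is_suffix(b"foobar", b"foo"));
        // An empty needle is both a prefix and a suffix of any haystack.
        assert!(is_prefix(b"foobar", b""));
        assert!(is_suffix(b"foobar", b""));
        // A needle longer than the haystack is never a prefix or a suffix.
        assert!(!is_prefix(b"foo", b"foobar"));
        assert!(!is_suffix(b"foo", b"foobar"));
    }

    #[test]
    fn memcmp_various_lengths() {
        // Lengths below 4 take the byte-at-a-time path.
        assert!(memcmp(b"", b""));
        assert!(memcmp(b"abc", b"abc"));
        assert!(!memcmp(b"abc", b"abd"));
        // Lengths of 4 or more take the 4-bytes-at-a-time path; 7 is not a
        // multiple of 4, so the final overlapping load is exercised too.
        assert!(memcmp(b"abcdefg", b"abcdefg"));
        assert!(!memcmp(b"abcdefg", b"abcdefz"));
        // Slices of different lengths are never equal.
        assert!(!memcmp(b"abcd", b"abcde"));
    }
}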