1// To run: 2// 3// cargo criterion --features criterion/real_blackbox 4// 5// This benchmarks each of the different libraries at several ratios of ASCII to 6// non-ASCII content. There is one additional benchmark labeled "baseline" which 7// just iterates over characters in a string, converting UTF-8 to 32-bit chars. 8// 9// Criterion will show a time in milliseconds. The non-baseline bench functions 10// each make one million function calls (2 calls per character, 500K characters 11// in the strings created by gen_string). The "time per call" listed in our 12// readme is computed by subtracting this baseline from the other bench 13// functions' time, then dividing by one million (ms -> ns). 14 15#![allow(clippy::needless_pass_by_value)] 16 17#[path = "../tests/fst/mod.rs"] 18mod fst; 19#[path = "../tests/roaring/mod.rs"] 20mod roaring; 21#[path = "../tests/trie/mod.rs"] 22mod trie; 23 24use criterion::{black_box, criterion_group, criterion_main, Criterion}; 25use rand::distributions::{Bernoulli, Distribution, Uniform}; 26use rand::rngs::SmallRng; 27use rand::SeedableRng; 28use std::time::Duration; 29 30fn gen_string(p_nonascii: u32) -> String { 31 let mut rng = SmallRng::from_seed([b'!'; 32]); 32 let pick_nonascii = Bernoulli::from_ratio(p_nonascii, 100).unwrap(); 33 let ascii = Uniform::new_inclusive('\0', '\x7f'); 34 let nonascii = Uniform::new_inclusive(0x80 as char, char::MAX); 35 36 let mut string = String::new(); 37 for _ in 0..500_000 { 38 let distribution = if pick_nonascii.sample(&mut rng) { 39 nonascii 40 } else { 41 ascii 42 }; 43 string.push(distribution.sample(&mut rng)); 44 } 45 46 string 47} 48 49fn bench(c: &mut Criterion, group_name: &str, string: String) { 50 let mut group = c.benchmark_group(group_name); 51 group.measurement_time(Duration::from_secs(10)); 52 group.bench_function("baseline", |b| { 53 b.iter(|| { 54 for ch in string.chars() { 55 black_box(ch); 56 } 57 }); 58 }); 59 group.bench_function("unicode-ident", |b| { 60 b.iter(|| { 61 for ch in string.chars() { 62 black_box(unicode_ident::is_xid_start(ch)); 63 black_box(unicode_ident::is_xid_continue(ch)); 64 } 65 }); 66 }); 67 group.bench_function("unicode-xid", |b| { 68 b.iter(|| { 69 for ch in string.chars() { 70 black_box(unicode_xid::UnicodeXID::is_xid_start(ch)); 71 black_box(unicode_xid::UnicodeXID::is_xid_continue(ch)); 72 } 73 }); 74 }); 75 group.bench_function("ucd-trie", |b| { 76 b.iter(|| { 77 for ch in string.chars() { 78 black_box(trie::XID_START.contains_char(ch)); 79 black_box(trie::XID_CONTINUE.contains_char(ch)); 80 } 81 }); 82 }); 83 group.bench_function("fst", |b| { 84 let xid_start_fst = fst::xid_start_fst(); 85 let xid_continue_fst = fst::xid_continue_fst(); 86 b.iter(|| { 87 for ch in string.chars() { 88 let ch_bytes = (ch as u32).to_be_bytes(); 89 black_box(xid_start_fst.contains(ch_bytes)); 90 black_box(xid_continue_fst.contains(ch_bytes)); 91 } 92 }); 93 }); 94 group.bench_function("roaring", |b| { 95 let xid_start_bitmap = roaring::xid_start_bitmap(); 96 let xid_continue_bitmap = roaring::xid_continue_bitmap(); 97 b.iter(|| { 98 for ch in string.chars() { 99 black_box(xid_start_bitmap.contains(ch as u32)); 100 black_box(xid_continue_bitmap.contains(ch as u32)); 101 } 102 }); 103 }); 104 group.finish(); 105} 106 107fn bench0(c: &mut Criterion) { 108 bench(c, "0%-nonascii", gen_string(0)); 109} 110 111fn bench1(c: &mut Criterion) { 112 bench(c, "1%-nonascii", gen_string(1)); 113} 114 115fn bench10(c: &mut Criterion) { 116 bench(c, "10%-nonascii", gen_string(10)); 117} 118 119fn bench100(c: &mut Criterion) { 120 bench(c, "100%-nonascii", gen_string(100)); 121} 122 123criterion_group!(benches, bench0, bench1, bench10, bench100); 124criterion_main!(benches); 125