182e69de5Sopenharmony_ci//! This library implements string similarity metrics. 282e69de5Sopenharmony_ci 382e69de5Sopenharmony_ci#![forbid(unsafe_code)] 482e69de5Sopenharmony_ci 582e69de5Sopenharmony_ciuse std::char; 682e69de5Sopenharmony_ciuse std::cmp::{max, min}; 782e69de5Sopenharmony_ciuse std::collections::HashMap; 882e69de5Sopenharmony_ciuse std::error::Error; 982e69de5Sopenharmony_ciuse std::fmt::{self, Display, Formatter}; 1082e69de5Sopenharmony_ciuse std::hash::Hash; 1182e69de5Sopenharmony_ciuse std::str::Chars; 1282e69de5Sopenharmony_ci 1382e69de5Sopenharmony_ci#[derive(Debug, PartialEq)] 1482e69de5Sopenharmony_cipub enum StrSimError { 1582e69de5Sopenharmony_ci DifferentLengthArgs, 1682e69de5Sopenharmony_ci} 1782e69de5Sopenharmony_ci 1882e69de5Sopenharmony_ciimpl Display for StrSimError { 1982e69de5Sopenharmony_ci fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> { 2082e69de5Sopenharmony_ci let text = match self { 2182e69de5Sopenharmony_ci StrSimError::DifferentLengthArgs => "Differing length arguments provided", 2282e69de5Sopenharmony_ci }; 2382e69de5Sopenharmony_ci 2482e69de5Sopenharmony_ci write!(fmt, "{}", text) 2582e69de5Sopenharmony_ci } 2682e69de5Sopenharmony_ci} 2782e69de5Sopenharmony_ci 2882e69de5Sopenharmony_ciimpl Error for StrSimError {} 2982e69de5Sopenharmony_ci 3082e69de5Sopenharmony_cipub type HammingResult = Result<usize, StrSimError>; 3182e69de5Sopenharmony_ci 3282e69de5Sopenharmony_ci/// Calculates the number of positions in the two sequences where the elements 3382e69de5Sopenharmony_ci/// differ. Returns an error if the sequences have different lengths. 3482e69de5Sopenharmony_cipub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult 3582e69de5Sopenharmony_ci where Iter1: IntoIterator<Item=Elem1>, 3682e69de5Sopenharmony_ci Iter2: IntoIterator<Item=Elem2>, 3782e69de5Sopenharmony_ci Elem1: PartialEq<Elem2> { 3882e69de5Sopenharmony_ci let (mut ita, mut itb) = (a.into_iter(), b.into_iter()); 3982e69de5Sopenharmony_ci let mut count = 0; 4082e69de5Sopenharmony_ci loop { 4182e69de5Sopenharmony_ci match (ita.next(), itb.next()){ 4282e69de5Sopenharmony_ci (Some(x), Some(y)) => if x != y { count += 1 }, 4382e69de5Sopenharmony_ci (None, None) => return Ok(count), 4482e69de5Sopenharmony_ci _ => return Err(StrSimError::DifferentLengthArgs), 4582e69de5Sopenharmony_ci } 4682e69de5Sopenharmony_ci } 4782e69de5Sopenharmony_ci} 4882e69de5Sopenharmony_ci 4982e69de5Sopenharmony_ci/// Calculates the number of positions in the two strings where the characters 5082e69de5Sopenharmony_ci/// differ. Returns an error if the strings have different lengths. 5182e69de5Sopenharmony_ci/// 5282e69de5Sopenharmony_ci/// ``` 5382e69de5Sopenharmony_ci/// use strsim::{hamming, StrSimError::DifferentLengthArgs}; 5482e69de5Sopenharmony_ci/// 5582e69de5Sopenharmony_ci/// assert_eq!(Ok(3), hamming("hamming", "hammers")); 5682e69de5Sopenharmony_ci/// 5782e69de5Sopenharmony_ci/// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham")); 5882e69de5Sopenharmony_ci/// ``` 5982e69de5Sopenharmony_cipub fn hamming(a: &str, b: &str) -> HammingResult { 6082e69de5Sopenharmony_ci generic_hamming(a.chars(), b.chars()) 6182e69de5Sopenharmony_ci} 6282e69de5Sopenharmony_ci 6382e69de5Sopenharmony_ci/// Calculates the Jaro similarity between two sequences. The returned value 6482e69de5Sopenharmony_ci/// is between 0.0 and 1.0 (higher value means more similar). 6582e69de5Sopenharmony_cipub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 6682e69de5Sopenharmony_ci where &'a Iter1: IntoIterator<Item=Elem1>, 6782e69de5Sopenharmony_ci &'b Iter2: IntoIterator<Item=Elem2>, 6882e69de5Sopenharmony_ci Elem1: PartialEq<Elem2> { 6982e69de5Sopenharmony_ci let a_len = a.into_iter().count(); 7082e69de5Sopenharmony_ci let b_len = b.into_iter().count(); 7182e69de5Sopenharmony_ci 7282e69de5Sopenharmony_ci // The check for lengths of one here is to prevent integer overflow when 7382e69de5Sopenharmony_ci // calculating the search range. 7482e69de5Sopenharmony_ci if a_len == 0 && b_len == 0 { 7582e69de5Sopenharmony_ci return 1.0; 7682e69de5Sopenharmony_ci } else if a_len == 0 || b_len == 0 { 7782e69de5Sopenharmony_ci return 0.0; 7882e69de5Sopenharmony_ci } else if a_len == 1 && b_len == 1 { 7982e69de5Sopenharmony_ci return if a.into_iter().eq(b.into_iter()) { 1.0} else { 0.0 }; 8082e69de5Sopenharmony_ci } 8182e69de5Sopenharmony_ci 8282e69de5Sopenharmony_ci let search_range = (max(a_len, b_len) / 2) - 1; 8382e69de5Sopenharmony_ci 8482e69de5Sopenharmony_ci let mut b_consumed = Vec::with_capacity(b_len); 8582e69de5Sopenharmony_ci for _ in 0..b_len { 8682e69de5Sopenharmony_ci b_consumed.push(false); 8782e69de5Sopenharmony_ci } 8882e69de5Sopenharmony_ci let mut matches = 0.0; 8982e69de5Sopenharmony_ci 9082e69de5Sopenharmony_ci let mut transpositions = 0.0; 9182e69de5Sopenharmony_ci let mut b_match_index = 0; 9282e69de5Sopenharmony_ci 9382e69de5Sopenharmony_ci for (i, a_elem) in a.into_iter().enumerate() { 9482e69de5Sopenharmony_ci let min_bound = 9582e69de5Sopenharmony_ci // prevent integer wrapping 9682e69de5Sopenharmony_ci if i > search_range { 9782e69de5Sopenharmony_ci max(0, i - search_range) 9882e69de5Sopenharmony_ci } else { 9982e69de5Sopenharmony_ci 0 10082e69de5Sopenharmony_ci }; 10182e69de5Sopenharmony_ci 10282e69de5Sopenharmony_ci let max_bound = min(b_len - 1, i + search_range); 10382e69de5Sopenharmony_ci 10482e69de5Sopenharmony_ci if min_bound > max_bound { 10582e69de5Sopenharmony_ci continue; 10682e69de5Sopenharmony_ci } 10782e69de5Sopenharmony_ci 10882e69de5Sopenharmony_ci for (j, b_elem) in b.into_iter().enumerate() { 10982e69de5Sopenharmony_ci if min_bound <= j && j <= max_bound && a_elem == b_elem && 11082e69de5Sopenharmony_ci !b_consumed[j] { 11182e69de5Sopenharmony_ci b_consumed[j] = true; 11282e69de5Sopenharmony_ci matches += 1.0; 11382e69de5Sopenharmony_ci 11482e69de5Sopenharmony_ci if j < b_match_index { 11582e69de5Sopenharmony_ci transpositions += 1.0; 11682e69de5Sopenharmony_ci } 11782e69de5Sopenharmony_ci b_match_index = j; 11882e69de5Sopenharmony_ci 11982e69de5Sopenharmony_ci break; 12082e69de5Sopenharmony_ci } 12182e69de5Sopenharmony_ci } 12282e69de5Sopenharmony_ci } 12382e69de5Sopenharmony_ci 12482e69de5Sopenharmony_ci if matches == 0.0 { 12582e69de5Sopenharmony_ci 0.0 12682e69de5Sopenharmony_ci } else { 12782e69de5Sopenharmony_ci (1.0 / 3.0) * ((matches / a_len as f64) + 12882e69de5Sopenharmony_ci (matches / b_len as f64) + 12982e69de5Sopenharmony_ci ((matches - transpositions) / matches)) 13082e69de5Sopenharmony_ci } 13182e69de5Sopenharmony_ci} 13282e69de5Sopenharmony_ci 13382e69de5Sopenharmony_cistruct StringWrapper<'a>(&'a str); 13482e69de5Sopenharmony_ci 13582e69de5Sopenharmony_ciimpl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { 13682e69de5Sopenharmony_ci type Item = char; 13782e69de5Sopenharmony_ci type IntoIter = Chars<'b>; 13882e69de5Sopenharmony_ci 13982e69de5Sopenharmony_ci fn into_iter(self) -> Self::IntoIter { 14082e69de5Sopenharmony_ci self.0.chars() 14182e69de5Sopenharmony_ci } 14282e69de5Sopenharmony_ci} 14382e69de5Sopenharmony_ci 14482e69de5Sopenharmony_ci/// Calculates the Jaro similarity between two strings. The returned value 14582e69de5Sopenharmony_ci/// is between 0.0 and 1.0 (higher value means more similar). 14682e69de5Sopenharmony_ci/// 14782e69de5Sopenharmony_ci/// ``` 14882e69de5Sopenharmony_ci/// use strsim::jaro; 14982e69de5Sopenharmony_ci/// 15082e69de5Sopenharmony_ci/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() < 15182e69de5Sopenharmony_ci/// 0.001); 15282e69de5Sopenharmony_ci/// ``` 15382e69de5Sopenharmony_cipub fn jaro(a: &str, b: &str) -> f64 { 15482e69de5Sopenharmony_ci generic_jaro(&StringWrapper(a), &StringWrapper(b)) 15582e69de5Sopenharmony_ci} 15682e69de5Sopenharmony_ci 15782e69de5Sopenharmony_ci/// Like Jaro but gives a boost to sequences that have a common prefix. 15882e69de5Sopenharmony_cipub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 15982e69de5Sopenharmony_ci where &'a Iter1: IntoIterator<Item=Elem1>, 16082e69de5Sopenharmony_ci &'b Iter2: IntoIterator<Item=Elem2>, 16182e69de5Sopenharmony_ci Elem1: PartialEq<Elem2> { 16282e69de5Sopenharmony_ci let jaro_distance = generic_jaro(a, b); 16382e69de5Sopenharmony_ci 16482e69de5Sopenharmony_ci // Don't limit the length of the common prefix 16582e69de5Sopenharmony_ci let prefix_length = a.into_iter() 16682e69de5Sopenharmony_ci .zip(b.into_iter()) 16782e69de5Sopenharmony_ci .take_while(|&(ref a_elem, ref b_elem)| a_elem == b_elem) 16882e69de5Sopenharmony_ci .count(); 16982e69de5Sopenharmony_ci 17082e69de5Sopenharmony_ci let jaro_winkler_distance = 17182e69de5Sopenharmony_ci jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance)); 17282e69de5Sopenharmony_ci 17382e69de5Sopenharmony_ci if jaro_winkler_distance <= 1.0 { 17482e69de5Sopenharmony_ci jaro_winkler_distance 17582e69de5Sopenharmony_ci } else { 17682e69de5Sopenharmony_ci 1.0 17782e69de5Sopenharmony_ci } 17882e69de5Sopenharmony_ci} 17982e69de5Sopenharmony_ci 18082e69de5Sopenharmony_ci/// Like Jaro but gives a boost to strings that have a common prefix. 18182e69de5Sopenharmony_ci/// 18282e69de5Sopenharmony_ci/// ``` 18382e69de5Sopenharmony_ci/// use strsim::jaro_winkler; 18482e69de5Sopenharmony_ci/// 18582e69de5Sopenharmony_ci/// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() < 18682e69de5Sopenharmony_ci/// 0.001); 18782e69de5Sopenharmony_ci/// ``` 18882e69de5Sopenharmony_cipub fn jaro_winkler(a: &str, b: &str) -> f64 { 18982e69de5Sopenharmony_ci generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b)) 19082e69de5Sopenharmony_ci} 19182e69de5Sopenharmony_ci 19282e69de5Sopenharmony_ci/// Calculates the minimum number of insertions, deletions, and substitutions 19382e69de5Sopenharmony_ci/// required to change one sequence into the other. 19482e69de5Sopenharmony_ci/// 19582e69de5Sopenharmony_ci/// ``` 19682e69de5Sopenharmony_ci/// use strsim::generic_levenshtein; 19782e69de5Sopenharmony_ci/// 19882e69de5Sopenharmony_ci/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6])); 19982e69de5Sopenharmony_ci/// ``` 20082e69de5Sopenharmony_cipub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize 20182e69de5Sopenharmony_ci where &'a Iter1: IntoIterator<Item=Elem1>, 20282e69de5Sopenharmony_ci &'b Iter2: IntoIterator<Item=Elem2>, 20382e69de5Sopenharmony_ci Elem1: PartialEq<Elem2> { 20482e69de5Sopenharmony_ci let b_len = b.into_iter().count(); 20582e69de5Sopenharmony_ci 20682e69de5Sopenharmony_ci if a.into_iter().next().is_none() { return b_len; } 20782e69de5Sopenharmony_ci 20882e69de5Sopenharmony_ci let mut cache: Vec<usize> = (1..b_len+1).collect(); 20982e69de5Sopenharmony_ci 21082e69de5Sopenharmony_ci let mut result = 0; 21182e69de5Sopenharmony_ci 21282e69de5Sopenharmony_ci for (i, a_elem) in a.into_iter().enumerate() { 21382e69de5Sopenharmony_ci result = i + 1; 21482e69de5Sopenharmony_ci let mut distance_b = i; 21582e69de5Sopenharmony_ci 21682e69de5Sopenharmony_ci for (j, b_elem) in b.into_iter().enumerate() { 21782e69de5Sopenharmony_ci let cost = if a_elem == b_elem { 0usize } else { 1usize }; 21882e69de5Sopenharmony_ci let distance_a = distance_b + cost; 21982e69de5Sopenharmony_ci distance_b = cache[j]; 22082e69de5Sopenharmony_ci result = min(result + 1, min(distance_a, distance_b + 1)); 22182e69de5Sopenharmony_ci cache[j] = result; 22282e69de5Sopenharmony_ci } 22382e69de5Sopenharmony_ci } 22482e69de5Sopenharmony_ci 22582e69de5Sopenharmony_ci result 22682e69de5Sopenharmony_ci} 22782e69de5Sopenharmony_ci 22882e69de5Sopenharmony_ci/// Calculates the minimum number of insertions, deletions, and substitutions 22982e69de5Sopenharmony_ci/// required to change one string into the other. 23082e69de5Sopenharmony_ci/// 23182e69de5Sopenharmony_ci/// ``` 23282e69de5Sopenharmony_ci/// use strsim::levenshtein; 23382e69de5Sopenharmony_ci/// 23482e69de5Sopenharmony_ci/// assert_eq!(3, levenshtein("kitten", "sitting")); 23582e69de5Sopenharmony_ci/// ``` 23682e69de5Sopenharmony_cipub fn levenshtein(a: &str, b: &str) -> usize { 23782e69de5Sopenharmony_ci generic_levenshtein(&StringWrapper(a), &StringWrapper(b)) 23882e69de5Sopenharmony_ci} 23982e69de5Sopenharmony_ci 24082e69de5Sopenharmony_ci/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and 24182e69de5Sopenharmony_ci/// 1.0 (inclusive), where 1.0 means the strings are the same. 24282e69de5Sopenharmony_ci/// 24382e69de5Sopenharmony_ci/// ``` 24482e69de5Sopenharmony_ci/// use strsim::normalized_levenshtein; 24582e69de5Sopenharmony_ci/// 24682e69de5Sopenharmony_ci/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001); 24782e69de5Sopenharmony_ci/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001); 24882e69de5Sopenharmony_ci/// assert!(normalized_levenshtein("", "second").abs() < 0.00001); 24982e69de5Sopenharmony_ci/// assert!(normalized_levenshtein("first", "").abs() < 0.00001); 25082e69de5Sopenharmony_ci/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); 25182e69de5Sopenharmony_ci/// ``` 25282e69de5Sopenharmony_cipub fn normalized_levenshtein(a: &str, b: &str) -> f64 { 25382e69de5Sopenharmony_ci if a.is_empty() && b.is_empty() { 25482e69de5Sopenharmony_ci return 1.0; 25582e69de5Sopenharmony_ci } 25682e69de5Sopenharmony_ci 1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) 25782e69de5Sopenharmony_ci} 25882e69de5Sopenharmony_ci 25982e69de5Sopenharmony_ci/// Like Levenshtein but allows for adjacent transpositions. Each substring can 26082e69de5Sopenharmony_ci/// only be edited once. 26182e69de5Sopenharmony_ci/// 26282e69de5Sopenharmony_ci/// ``` 26382e69de5Sopenharmony_ci/// use strsim::osa_distance; 26482e69de5Sopenharmony_ci/// 26582e69de5Sopenharmony_ci/// assert_eq!(3, osa_distance("ab", "bca")); 26682e69de5Sopenharmony_ci/// ``` 26782e69de5Sopenharmony_cipub fn osa_distance(a: &str, b: &str) -> usize { 26882e69de5Sopenharmony_ci let a_len = a.chars().count(); 26982e69de5Sopenharmony_ci let b_len = b.chars().count(); 27082e69de5Sopenharmony_ci if a == b { return 0; } 27182e69de5Sopenharmony_ci else if a_len == 0 { return b_len; } 27282e69de5Sopenharmony_ci else if b_len == 0 { return a_len; } 27382e69de5Sopenharmony_ci 27482e69de5Sopenharmony_ci let mut prev_two_distances: Vec<usize> = Vec::with_capacity(b_len + 1); 27582e69de5Sopenharmony_ci let mut prev_distances: Vec<usize> = Vec::with_capacity(b_len + 1); 27682e69de5Sopenharmony_ci let mut curr_distances: Vec<usize> = Vec::with_capacity(b_len + 1); 27782e69de5Sopenharmony_ci 27882e69de5Sopenharmony_ci let mut prev_a_char = char::MAX; 27982e69de5Sopenharmony_ci let mut prev_b_char = char::MAX; 28082e69de5Sopenharmony_ci 28182e69de5Sopenharmony_ci for i in 0..(b_len + 1) { 28282e69de5Sopenharmony_ci prev_two_distances.push(i); 28382e69de5Sopenharmony_ci prev_distances.push(i); 28482e69de5Sopenharmony_ci curr_distances.push(0); 28582e69de5Sopenharmony_ci } 28682e69de5Sopenharmony_ci 28782e69de5Sopenharmony_ci for (i, a_char) in a.chars().enumerate() { 28882e69de5Sopenharmony_ci curr_distances[0] = i + 1; 28982e69de5Sopenharmony_ci 29082e69de5Sopenharmony_ci for (j, b_char) in b.chars().enumerate() { 29182e69de5Sopenharmony_ci let cost = if a_char == b_char { 0 } else { 1 }; 29282e69de5Sopenharmony_ci curr_distances[j + 1] = min(curr_distances[j] + 1, 29382e69de5Sopenharmony_ci min(prev_distances[j + 1] + 1, 29482e69de5Sopenharmony_ci prev_distances[j] + cost)); 29582e69de5Sopenharmony_ci if i > 0 && j > 0 && a_char != b_char && 29682e69de5Sopenharmony_ci a_char == prev_b_char && b_char == prev_a_char { 29782e69de5Sopenharmony_ci curr_distances[j + 1] = min(curr_distances[j + 1], 29882e69de5Sopenharmony_ci prev_two_distances[j - 1] + 1); 29982e69de5Sopenharmony_ci } 30082e69de5Sopenharmony_ci 30182e69de5Sopenharmony_ci prev_b_char = b_char; 30282e69de5Sopenharmony_ci } 30382e69de5Sopenharmony_ci 30482e69de5Sopenharmony_ci prev_two_distances.clone_from(&prev_distances); 30582e69de5Sopenharmony_ci prev_distances.clone_from(&curr_distances); 30682e69de5Sopenharmony_ci prev_a_char = a_char; 30782e69de5Sopenharmony_ci } 30882e69de5Sopenharmony_ci 30982e69de5Sopenharmony_ci curr_distances[b_len] 31082e69de5Sopenharmony_ci 31182e69de5Sopenharmony_ci} 31282e69de5Sopenharmony_ci 31382e69de5Sopenharmony_ci/* Returns the final index for a value in a single vector that represents a fixed 31482e69de5Sopenharmony_ci 2d grid */ 31582e69de5Sopenharmony_cifn flat_index(i: usize, j: usize, width: usize) -> usize { 31682e69de5Sopenharmony_ci j * width + i 31782e69de5Sopenharmony_ci} 31882e69de5Sopenharmony_ci 31982e69de5Sopenharmony_ci/// Like optimal string alignment, but substrings can be edited an unlimited 32082e69de5Sopenharmony_ci/// number of times, and the triangle inequality holds. 32182e69de5Sopenharmony_ci/// 32282e69de5Sopenharmony_ci/// ``` 32382e69de5Sopenharmony_ci/// use strsim::generic_damerau_levenshtein; 32482e69de5Sopenharmony_ci/// 32582e69de5Sopenharmony_ci/// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1])); 32682e69de5Sopenharmony_ci/// ``` 32782e69de5Sopenharmony_cipub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize 32882e69de5Sopenharmony_ci where Elem: Eq + Hash + Clone { 32982e69de5Sopenharmony_ci let a_len = a_elems.len(); 33082e69de5Sopenharmony_ci let b_len = b_elems.len(); 33182e69de5Sopenharmony_ci 33282e69de5Sopenharmony_ci if a_len == 0 { return b_len; } 33382e69de5Sopenharmony_ci if b_len == 0 { return a_len; } 33482e69de5Sopenharmony_ci 33582e69de5Sopenharmony_ci let width = a_len + 2; 33682e69de5Sopenharmony_ci let mut distances = vec![0; (a_len + 2) * (b_len + 2)]; 33782e69de5Sopenharmony_ci let max_distance = a_len + b_len; 33882e69de5Sopenharmony_ci distances[0] = max_distance; 33982e69de5Sopenharmony_ci 34082e69de5Sopenharmony_ci for i in 0..(a_len + 1) { 34182e69de5Sopenharmony_ci distances[flat_index(i + 1, 0, width)] = max_distance; 34282e69de5Sopenharmony_ci distances[flat_index(i + 1, 1, width)] = i; 34382e69de5Sopenharmony_ci } 34482e69de5Sopenharmony_ci 34582e69de5Sopenharmony_ci for j in 0..(b_len + 1) { 34682e69de5Sopenharmony_ci distances[flat_index(0, j + 1, width)] = max_distance; 34782e69de5Sopenharmony_ci distances[flat_index(1, j + 1, width)] = j; 34882e69de5Sopenharmony_ci } 34982e69de5Sopenharmony_ci 35082e69de5Sopenharmony_ci let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64); 35182e69de5Sopenharmony_ci 35282e69de5Sopenharmony_ci for i in 1..(a_len + 1) { 35382e69de5Sopenharmony_ci let mut db = 0; 35482e69de5Sopenharmony_ci 35582e69de5Sopenharmony_ci for j in 1..(b_len + 1) { 35682e69de5Sopenharmony_ci let k = match elems.get(&b_elems[j - 1]) { 35782e69de5Sopenharmony_ci Some(&value) => value, 35882e69de5Sopenharmony_ci None => 0 35982e69de5Sopenharmony_ci }; 36082e69de5Sopenharmony_ci 36182e69de5Sopenharmony_ci let insertion_cost = distances[flat_index(i, j + 1, width)] + 1; 36282e69de5Sopenharmony_ci let deletion_cost = distances[flat_index(i + 1, j, width)] + 1; 36382e69de5Sopenharmony_ci let transposition_cost = distances[flat_index(k, db, width)] + 36482e69de5Sopenharmony_ci (i - k - 1) + 1 + (j - db - 1); 36582e69de5Sopenharmony_ci 36682e69de5Sopenharmony_ci let mut substitution_cost = distances[flat_index(i, j, width)] + 1; 36782e69de5Sopenharmony_ci if a_elems[i - 1] == b_elems[j - 1] { 36882e69de5Sopenharmony_ci db = j; 36982e69de5Sopenharmony_ci substitution_cost -= 1; 37082e69de5Sopenharmony_ci } 37182e69de5Sopenharmony_ci 37282e69de5Sopenharmony_ci distances[flat_index(i + 1, j + 1, width)] = min(substitution_cost, 37382e69de5Sopenharmony_ci min(insertion_cost, min(deletion_cost, transposition_cost))); 37482e69de5Sopenharmony_ci } 37582e69de5Sopenharmony_ci 37682e69de5Sopenharmony_ci elems.insert(a_elems[i - 1].clone(), i); 37782e69de5Sopenharmony_ci } 37882e69de5Sopenharmony_ci 37982e69de5Sopenharmony_ci distances[flat_index(a_len + 1, b_len + 1, width)] 38082e69de5Sopenharmony_ci} 38182e69de5Sopenharmony_ci 38282e69de5Sopenharmony_ci/// Like optimal string alignment, but substrings can be edited an unlimited 38382e69de5Sopenharmony_ci/// number of times, and the triangle inequality holds. 38482e69de5Sopenharmony_ci/// 38582e69de5Sopenharmony_ci/// ``` 38682e69de5Sopenharmony_ci/// use strsim::damerau_levenshtein; 38782e69de5Sopenharmony_ci/// 38882e69de5Sopenharmony_ci/// assert_eq!(2, damerau_levenshtein("ab", "bca")); 38982e69de5Sopenharmony_ci/// ``` 39082e69de5Sopenharmony_cipub fn damerau_levenshtein(a: &str, b: &str) -> usize { 39182e69de5Sopenharmony_ci let (x, y): (Vec<_>, Vec<_>) = (a.chars().collect(), b.chars().collect()); 39282e69de5Sopenharmony_ci generic_damerau_levenshtein(x.as_slice(), y.as_slice()) 39382e69de5Sopenharmony_ci} 39482e69de5Sopenharmony_ci 39582e69de5Sopenharmony_ci/// Calculates a normalized score of the Damerau–Levenshtein algorithm between 39682e69de5Sopenharmony_ci/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same. 39782e69de5Sopenharmony_ci/// 39882e69de5Sopenharmony_ci/// ``` 39982e69de5Sopenharmony_ci/// use strsim::normalized_damerau_levenshtein; 40082e69de5Sopenharmony_ci/// 40182e69de5Sopenharmony_ci/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001); 40282e69de5Sopenharmony_ci/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001); 40382e69de5Sopenharmony_ci/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001); 40482e69de5Sopenharmony_ci/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001); 40582e69de5Sopenharmony_ci/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001); 40682e69de5Sopenharmony_ci/// ``` 40782e69de5Sopenharmony_cipub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 { 40882e69de5Sopenharmony_ci if a.is_empty() && b.is_empty() { 40982e69de5Sopenharmony_ci return 1.0; 41082e69de5Sopenharmony_ci } 41182e69de5Sopenharmony_ci 1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) 41282e69de5Sopenharmony_ci} 41382e69de5Sopenharmony_ci 41482e69de5Sopenharmony_ci/// Returns an Iterator of char tuples. 41582e69de5Sopenharmony_cifn bigrams(s: &str) -> impl Iterator<Item=(char, char)> + '_ { 41682e69de5Sopenharmony_ci s.chars().zip(s.chars().skip(1)) 41782e69de5Sopenharmony_ci} 41882e69de5Sopenharmony_ci 41982e69de5Sopenharmony_ci 42082e69de5Sopenharmony_ci/// Calculates a Sørensen-Dice similarity distance using bigrams. 42182e69de5Sopenharmony_ci/// See http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient. 42282e69de5Sopenharmony_ci/// 42382e69de5Sopenharmony_ci/// ``` 42482e69de5Sopenharmony_ci/// use strsim::sorensen_dice; 42582e69de5Sopenharmony_ci/// 42682e69de5Sopenharmony_ci/// assert_eq!(1.0, sorensen_dice("", "")); 42782e69de5Sopenharmony_ci/// assert_eq!(0.0, sorensen_dice("", "a")); 42882e69de5Sopenharmony_ci/// assert_eq!(0.0, sorensen_dice("french", "quebec")); 42982e69de5Sopenharmony_ci/// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); 43082e69de5Sopenharmony_ci/// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); 43182e69de5Sopenharmony_ci/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris")); 43282e69de5Sopenharmony_ci/// ``` 43382e69de5Sopenharmony_cipub fn sorensen_dice(a: &str, b: &str) -> f64 { 43482e69de5Sopenharmony_ci // implementation guided by 43582e69de5Sopenharmony_ci // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6 43682e69de5Sopenharmony_ci 43782e69de5Sopenharmony_ci let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect(); 43882e69de5Sopenharmony_ci let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect(); 43982e69de5Sopenharmony_ci 44082e69de5Sopenharmony_ci if a.len() == 0 && b.len() == 0 { 44182e69de5Sopenharmony_ci return 1.0; 44282e69de5Sopenharmony_ci } 44382e69de5Sopenharmony_ci 44482e69de5Sopenharmony_ci if a.len() == 0 || b.len() == 0 { 44582e69de5Sopenharmony_ci return 0.0; 44682e69de5Sopenharmony_ci } 44782e69de5Sopenharmony_ci 44882e69de5Sopenharmony_ci if a == b { 44982e69de5Sopenharmony_ci return 1.0; 45082e69de5Sopenharmony_ci } 45182e69de5Sopenharmony_ci 45282e69de5Sopenharmony_ci if a.len() == 1 && b.len() == 1 { 45382e69de5Sopenharmony_ci return 0.0; 45482e69de5Sopenharmony_ci } 45582e69de5Sopenharmony_ci 45682e69de5Sopenharmony_ci if a.len() < 2 || b.len() < 2 { 45782e69de5Sopenharmony_ci return 0.0; 45882e69de5Sopenharmony_ci } 45982e69de5Sopenharmony_ci 46082e69de5Sopenharmony_ci let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new(); 46182e69de5Sopenharmony_ci 46282e69de5Sopenharmony_ci for bigram in bigrams(&a) { 46382e69de5Sopenharmony_ci *a_bigrams.entry(bigram).or_insert(0) += 1; 46482e69de5Sopenharmony_ci } 46582e69de5Sopenharmony_ci 46682e69de5Sopenharmony_ci let mut intersection_size = 0; 46782e69de5Sopenharmony_ci 46882e69de5Sopenharmony_ci for bigram in bigrams(&b) { 46982e69de5Sopenharmony_ci a_bigrams.entry(bigram).and_modify(|bi| { 47082e69de5Sopenharmony_ci if *bi > 0 { 47182e69de5Sopenharmony_ci *bi -= 1; 47282e69de5Sopenharmony_ci intersection_size += 1; 47382e69de5Sopenharmony_ci } 47482e69de5Sopenharmony_ci }); 47582e69de5Sopenharmony_ci } 47682e69de5Sopenharmony_ci 47782e69de5Sopenharmony_ci (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64 47882e69de5Sopenharmony_ci} 47982e69de5Sopenharmony_ci 48082e69de5Sopenharmony_ci 48182e69de5Sopenharmony_ci#[cfg(test)] 48282e69de5Sopenharmony_cimod tests { 48382e69de5Sopenharmony_ci use super::*; 48482e69de5Sopenharmony_ci 48582e69de5Sopenharmony_ci #[test] 48682e69de5Sopenharmony_ci fn bigrams_iterator() { 48782e69de5Sopenharmony_ci let mut bi = bigrams("abcde"); 48882e69de5Sopenharmony_ci 48982e69de5Sopenharmony_ci assert_eq!(Some(('a', 'b')), bi.next()); 49082e69de5Sopenharmony_ci assert_eq!(Some(('b', 'c')), bi.next()); 49182e69de5Sopenharmony_ci assert_eq!(Some(('c', 'd')), bi.next()); 49282e69de5Sopenharmony_ci assert_eq!(Some(('d', 'e')), bi.next()); 49382e69de5Sopenharmony_ci assert_eq!(None, bi.next()); 49482e69de5Sopenharmony_ci } 49582e69de5Sopenharmony_ci 49682e69de5Sopenharmony_ci fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) { 49782e69de5Sopenharmony_ci assert_eq!(Ok(dist), hamming(str1, str2)); 49882e69de5Sopenharmony_ci } 49982e69de5Sopenharmony_ci 50082e69de5Sopenharmony_ci #[test] 50182e69de5Sopenharmony_ci fn hamming_empty() { 50282e69de5Sopenharmony_ci assert_hamming_dist(0, "", "") 50382e69de5Sopenharmony_ci } 50482e69de5Sopenharmony_ci 50582e69de5Sopenharmony_ci #[test] 50682e69de5Sopenharmony_ci fn hamming_same() { 50782e69de5Sopenharmony_ci assert_hamming_dist(0, "hamming", "hamming") 50882e69de5Sopenharmony_ci } 50982e69de5Sopenharmony_ci 51082e69de5Sopenharmony_ci #[test] 51182e69de5Sopenharmony_ci fn hamming_numbers() { 51282e69de5Sopenharmony_ci assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3])); 51382e69de5Sopenharmony_ci } 51482e69de5Sopenharmony_ci 51582e69de5Sopenharmony_ci #[test] 51682e69de5Sopenharmony_ci fn hamming_diff() { 51782e69de5Sopenharmony_ci assert_hamming_dist(3, "hamming", "hammers") 51882e69de5Sopenharmony_ci } 51982e69de5Sopenharmony_ci 52082e69de5Sopenharmony_ci #[test] 52182e69de5Sopenharmony_ci fn hamming_diff_multibyte() { 52282e69de5Sopenharmony_ci assert_hamming_dist(2, "hamming", "h香mmüng"); 52382e69de5Sopenharmony_ci } 52482e69de5Sopenharmony_ci 52582e69de5Sopenharmony_ci #[test] 52682e69de5Sopenharmony_ci fn hamming_unequal_length() { 52782e69de5Sopenharmony_ci assert_eq!( 52882e69de5Sopenharmony_ci Err(StrSimError::DifferentLengthArgs), 52982e69de5Sopenharmony_ci generic_hamming("ham".chars(), "hamming".chars()) 53082e69de5Sopenharmony_ci ); 53182e69de5Sopenharmony_ci } 53282e69de5Sopenharmony_ci 53382e69de5Sopenharmony_ci #[test] 53482e69de5Sopenharmony_ci fn hamming_names() { 53582e69de5Sopenharmony_ci assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre") 53682e69de5Sopenharmony_ci } 53782e69de5Sopenharmony_ci 53882e69de5Sopenharmony_ci #[test] 53982e69de5Sopenharmony_ci fn jaro_both_empty() { 54082e69de5Sopenharmony_ci assert_eq!(1.0, jaro("", "")); 54182e69de5Sopenharmony_ci } 54282e69de5Sopenharmony_ci 54382e69de5Sopenharmony_ci #[test] 54482e69de5Sopenharmony_ci fn jaro_first_empty() { 54582e69de5Sopenharmony_ci assert_eq!(0.0, jaro("", "jaro")); 54682e69de5Sopenharmony_ci } 54782e69de5Sopenharmony_ci 54882e69de5Sopenharmony_ci #[test] 54982e69de5Sopenharmony_ci fn jaro_second_empty() { 55082e69de5Sopenharmony_ci assert_eq!(0.0, jaro("distance", "")); 55182e69de5Sopenharmony_ci } 55282e69de5Sopenharmony_ci 55382e69de5Sopenharmony_ci #[test] 55482e69de5Sopenharmony_ci fn jaro_same() { 55582e69de5Sopenharmony_ci assert_eq!(1.0, jaro("jaro", "jaro")); 55682e69de5Sopenharmony_ci } 55782e69de5Sopenharmony_ci 55882e69de5Sopenharmony_ci #[test] 55982e69de5Sopenharmony_ci fn jaro_multibyte() { 56082e69de5Sopenharmony_ci assert!((0.818 - jaro("testabctest", "testöঙ香test")) < 0.001); 56182e69de5Sopenharmony_ci assert!((0.818 - jaro("testöঙ香test", "testabctest")) < 0.001); 56282e69de5Sopenharmony_ci } 56382e69de5Sopenharmony_ci 56482e69de5Sopenharmony_ci #[test] 56582e69de5Sopenharmony_ci fn jaro_diff_short() { 56682e69de5Sopenharmony_ci assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001); 56782e69de5Sopenharmony_ci } 56882e69de5Sopenharmony_ci 56982e69de5Sopenharmony_ci #[test] 57082e69de5Sopenharmony_ci fn jaro_diff_one_character() { 57182e69de5Sopenharmony_ci assert_eq!(0.0, jaro("a", "b")); 57282e69de5Sopenharmony_ci } 57382e69de5Sopenharmony_ci 57482e69de5Sopenharmony_ci #[test] 57582e69de5Sopenharmony_ci fn jaro_same_one_character() { 57682e69de5Sopenharmony_ci assert_eq!(1.0, jaro("a", "a")); 57782e69de5Sopenharmony_ci } 57882e69de5Sopenharmony_ci 57982e69de5Sopenharmony_ci #[test] 58082e69de5Sopenharmony_ci fn generic_jaro_diff() { 58182e69de5Sopenharmony_ci assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4])); 58282e69de5Sopenharmony_ci } 58382e69de5Sopenharmony_ci 58482e69de5Sopenharmony_ci #[test] 58582e69de5Sopenharmony_ci fn jaro_diff_one_and_two() { 58682e69de5Sopenharmony_ci assert!((0.83 - jaro("a", "ab")).abs() < 0.01); 58782e69de5Sopenharmony_ci } 58882e69de5Sopenharmony_ci 58982e69de5Sopenharmony_ci #[test] 59082e69de5Sopenharmony_ci fn jaro_diff_two_and_one() { 59182e69de5Sopenharmony_ci assert!((0.83 - jaro("ab", "a")).abs() < 0.01); 59282e69de5Sopenharmony_ci } 59382e69de5Sopenharmony_ci 59482e69de5Sopenharmony_ci #[test] 59582e69de5Sopenharmony_ci fn jaro_diff_no_transposition() { 59682e69de5Sopenharmony_ci assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001); 59782e69de5Sopenharmony_ci } 59882e69de5Sopenharmony_ci 59982e69de5Sopenharmony_ci #[test] 60082e69de5Sopenharmony_ci fn jaro_diff_with_transposition() { 60182e69de5Sopenharmony_ci assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001); 60282e69de5Sopenharmony_ci } 60382e69de5Sopenharmony_ci 60482e69de5Sopenharmony_ci #[test] 60582e69de5Sopenharmony_ci fn jaro_names() { 60682e69de5Sopenharmony_ci assert!((0.392 - jaro("Friedrich Nietzsche", 60782e69de5Sopenharmony_ci "Jean-Paul Sartre")).abs() < 0.001); 60882e69de5Sopenharmony_ci } 60982e69de5Sopenharmony_ci 61082e69de5Sopenharmony_ci #[test] 61182e69de5Sopenharmony_ci fn jaro_winkler_both_empty() { 61282e69de5Sopenharmony_ci assert_eq!(1.0, jaro_winkler("", "")); 61382e69de5Sopenharmony_ci } 61482e69de5Sopenharmony_ci 61582e69de5Sopenharmony_ci #[test] 61682e69de5Sopenharmony_ci fn jaro_winkler_first_empty() { 61782e69de5Sopenharmony_ci assert_eq!(0.0, jaro_winkler("", "jaro-winkler")); 61882e69de5Sopenharmony_ci } 61982e69de5Sopenharmony_ci 62082e69de5Sopenharmony_ci #[test] 62182e69de5Sopenharmony_ci fn jaro_winkler_second_empty() { 62282e69de5Sopenharmony_ci assert_eq!(0.0, jaro_winkler("distance", "")); 62382e69de5Sopenharmony_ci } 62482e69de5Sopenharmony_ci 62582e69de5Sopenharmony_ci #[test] 62682e69de5Sopenharmony_ci fn jaro_winkler_same() { 62782e69de5Sopenharmony_ci assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler")); 62882e69de5Sopenharmony_ci } 62982e69de5Sopenharmony_ci 63082e69de5Sopenharmony_ci #[test] 63182e69de5Sopenharmony_ci fn jaro_winkler_multibyte() { 63282e69de5Sopenharmony_ci assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() < 63382e69de5Sopenharmony_ci 0.001); 63482e69de5Sopenharmony_ci assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() < 63582e69de5Sopenharmony_ci 0.001); 63682e69de5Sopenharmony_ci } 63782e69de5Sopenharmony_ci 63882e69de5Sopenharmony_ci #[test] 63982e69de5Sopenharmony_ci fn jaro_winkler_diff_short() { 64082e69de5Sopenharmony_ci assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001); 64182e69de5Sopenharmony_ci assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001); 64282e69de5Sopenharmony_ci } 64382e69de5Sopenharmony_ci 64482e69de5Sopenharmony_ci #[test] 64582e69de5Sopenharmony_ci fn jaro_winkler_diff_one_character() { 64682e69de5Sopenharmony_ci assert_eq!(0.0, jaro_winkler("a", "b")); 64782e69de5Sopenharmony_ci } 64882e69de5Sopenharmony_ci 64982e69de5Sopenharmony_ci #[test] 65082e69de5Sopenharmony_ci fn jaro_winkler_same_one_character() { 65182e69de5Sopenharmony_ci assert_eq!(1.0, jaro_winkler("a", "a")); 65282e69de5Sopenharmony_ci } 65382e69de5Sopenharmony_ci 65482e69de5Sopenharmony_ci #[test] 65582e69de5Sopenharmony_ci fn jaro_winkler_diff_no_transposition() { 65682e69de5Sopenharmony_ci assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001); 65782e69de5Sopenharmony_ci } 65882e69de5Sopenharmony_ci 65982e69de5Sopenharmony_ci #[test] 66082e69de5Sopenharmony_ci fn jaro_winkler_diff_with_transposition() { 66182e69de5Sopenharmony_ci assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001); 66282e69de5Sopenharmony_ci } 66382e69de5Sopenharmony_ci 66482e69de5Sopenharmony_ci #[test] 66582e69de5Sopenharmony_ci fn jaro_winkler_names() { 66682e69de5Sopenharmony_ci assert!((0.562 - jaro_winkler("Friedrich Nietzsche", 66782e69de5Sopenharmony_ci "Fran-Paul Sartre")).abs() < 0.001); 66882e69de5Sopenharmony_ci } 66982e69de5Sopenharmony_ci 67082e69de5Sopenharmony_ci #[test] 67182e69de5Sopenharmony_ci fn jaro_winkler_long_prefix() { 67282e69de5Sopenharmony_ci assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() < 67382e69de5Sopenharmony_ci 0.001); 67482e69de5Sopenharmony_ci } 67582e69de5Sopenharmony_ci 67682e69de5Sopenharmony_ci #[test] 67782e69de5Sopenharmony_ci fn jaro_winkler_more_names() { 67882e69de5Sopenharmony_ci assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001); 67982e69de5Sopenharmony_ci } 68082e69de5Sopenharmony_ci 68182e69de5Sopenharmony_ci #[test] 68282e69de5Sopenharmony_ci fn jaro_winkler_length_of_one() { 68382e69de5Sopenharmony_ci assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001); 68482e69de5Sopenharmony_ci } 68582e69de5Sopenharmony_ci 68682e69de5Sopenharmony_ci #[test] 68782e69de5Sopenharmony_ci fn jaro_winkler_very_long_prefix() { 68882e69de5Sopenharmony_ci assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx", 68982e69de5Sopenharmony_ci "thequickbrownfoxjumpedovery")).abs() < 69082e69de5Sopenharmony_ci 0.001); 69182e69de5Sopenharmony_ci } 69282e69de5Sopenharmony_ci 69382e69de5Sopenharmony_ci #[test] 69482e69de5Sopenharmony_ci fn levenshtein_empty() { 69582e69de5Sopenharmony_ci assert_eq!(0, levenshtein("", "")); 69682e69de5Sopenharmony_ci } 69782e69de5Sopenharmony_ci 69882e69de5Sopenharmony_ci #[test] 69982e69de5Sopenharmony_ci fn levenshtein_same() { 70082e69de5Sopenharmony_ci assert_eq!(0, levenshtein("levenshtein", "levenshtein")); 70182e69de5Sopenharmony_ci } 70282e69de5Sopenharmony_ci 70382e69de5Sopenharmony_ci #[test] 70482e69de5Sopenharmony_ci fn levenshtein_diff_short() { 70582e69de5Sopenharmony_ci assert_eq!(3, levenshtein("kitten", "sitting")); 70682e69de5Sopenharmony_ci } 70782e69de5Sopenharmony_ci 70882e69de5Sopenharmony_ci #[test] 70982e69de5Sopenharmony_ci fn levenshtein_diff_with_space() { 71082e69de5Sopenharmony_ci assert_eq!(5, levenshtein("hello, world", "bye, world")); 71182e69de5Sopenharmony_ci } 71282e69de5Sopenharmony_ci 71382e69de5Sopenharmony_ci #[test] 71482e69de5Sopenharmony_ci fn levenshtein_diff_multibyte() { 71582e69de5Sopenharmony_ci assert_eq!(3, levenshtein("öঙ香", "abc")); 71682e69de5Sopenharmony_ci assert_eq!(3, levenshtein("abc", "öঙ香")); 71782e69de5Sopenharmony_ci } 71882e69de5Sopenharmony_ci 71982e69de5Sopenharmony_ci #[test] 72082e69de5Sopenharmony_ci fn levenshtein_diff_longer() { 72182e69de5Sopenharmony_ci let a = "The quick brown fox jumped over the angry dog."; 72282e69de5Sopenharmony_ci let b = "Lorem ipsum dolor sit amet, dicta latine an eam."; 72382e69de5Sopenharmony_ci assert_eq!(37, levenshtein(a, b)); 72482e69de5Sopenharmony_ci } 72582e69de5Sopenharmony_ci 72682e69de5Sopenharmony_ci #[test] 72782e69de5Sopenharmony_ci fn levenshtein_first_empty() { 72882e69de5Sopenharmony_ci assert_eq!(7, levenshtein("", "sitting")); 72982e69de5Sopenharmony_ci } 73082e69de5Sopenharmony_ci 73182e69de5Sopenharmony_ci #[test] 73282e69de5Sopenharmony_ci fn levenshtein_second_empty() { 73382e69de5Sopenharmony_ci assert_eq!(6, levenshtein("kitten", "")); 73482e69de5Sopenharmony_ci } 73582e69de5Sopenharmony_ci 73682e69de5Sopenharmony_ci #[test] 73782e69de5Sopenharmony_ci fn normalized_levenshtein_diff_short() { 73882e69de5Sopenharmony_ci assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001); 73982e69de5Sopenharmony_ci } 74082e69de5Sopenharmony_ci 74182e69de5Sopenharmony_ci #[test] 74282e69de5Sopenharmony_ci fn normalized_levenshtein_for_empty_strings() { 74382e69de5Sopenharmony_ci assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001); 74482e69de5Sopenharmony_ci } 74582e69de5Sopenharmony_ci 74682e69de5Sopenharmony_ci #[test] 74782e69de5Sopenharmony_ci fn normalized_levenshtein_first_empty() { 74882e69de5Sopenharmony_ci assert!(normalized_levenshtein("", "second").abs() < 0.00001); 74982e69de5Sopenharmony_ci } 75082e69de5Sopenharmony_ci 75182e69de5Sopenharmony_ci #[test] 75282e69de5Sopenharmony_ci fn normalized_levenshtein_second_empty() { 75382e69de5Sopenharmony_ci assert!(normalized_levenshtein("first", "").abs() < 0.00001); 75482e69de5Sopenharmony_ci } 75582e69de5Sopenharmony_ci 75682e69de5Sopenharmony_ci #[test] 75782e69de5Sopenharmony_ci fn normalized_levenshtein_identical_strings() { 75882e69de5Sopenharmony_ci assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001); 75982e69de5Sopenharmony_ci } 76082e69de5Sopenharmony_ci 76182e69de5Sopenharmony_ci #[test] 76282e69de5Sopenharmony_ci fn osa_distance_empty() { 76382e69de5Sopenharmony_ci assert_eq!(0, osa_distance("", "")); 76482e69de5Sopenharmony_ci } 76582e69de5Sopenharmony_ci 76682e69de5Sopenharmony_ci #[test] 76782e69de5Sopenharmony_ci fn osa_distance_same() { 76882e69de5Sopenharmony_ci assert_eq!(0, osa_distance("damerau", "damerau")); 76982e69de5Sopenharmony_ci } 77082e69de5Sopenharmony_ci 77182e69de5Sopenharmony_ci #[test] 77282e69de5Sopenharmony_ci fn osa_distance_first_empty() { 77382e69de5Sopenharmony_ci assert_eq!(7, osa_distance("", "damerau")); 77482e69de5Sopenharmony_ci } 77582e69de5Sopenharmony_ci 77682e69de5Sopenharmony_ci #[test] 77782e69de5Sopenharmony_ci fn osa_distance_second_empty() { 77882e69de5Sopenharmony_ci assert_eq!(7, osa_distance("damerau", "")); 77982e69de5Sopenharmony_ci } 78082e69de5Sopenharmony_ci 78182e69de5Sopenharmony_ci #[test] 78282e69de5Sopenharmony_ci fn osa_distance_diff() { 78382e69de5Sopenharmony_ci assert_eq!(3, osa_distance("ca", "abc")); 78482e69de5Sopenharmony_ci } 78582e69de5Sopenharmony_ci 78682e69de5Sopenharmony_ci #[test] 78782e69de5Sopenharmony_ci fn osa_distance_diff_short() { 78882e69de5Sopenharmony_ci assert_eq!(3, osa_distance("damerau", "aderua")); 78982e69de5Sopenharmony_ci } 79082e69de5Sopenharmony_ci 79182e69de5Sopenharmony_ci #[test] 79282e69de5Sopenharmony_ci fn osa_distance_diff_reversed() { 79382e69de5Sopenharmony_ci assert_eq!(3, osa_distance("aderua", "damerau")); 79482e69de5Sopenharmony_ci } 79582e69de5Sopenharmony_ci 79682e69de5Sopenharmony_ci #[test] 79782e69de5Sopenharmony_ci fn osa_distance_diff_multibyte() { 79882e69de5Sopenharmony_ci assert_eq!(3, osa_distance("öঙ香", "abc")); 79982e69de5Sopenharmony_ci assert_eq!(3, osa_distance("abc", "öঙ香")); 80082e69de5Sopenharmony_ci } 80182e69de5Sopenharmony_ci 80282e69de5Sopenharmony_ci #[test] 80382e69de5Sopenharmony_ci fn osa_distance_diff_unequal_length() { 80482e69de5Sopenharmony_ci assert_eq!(6, osa_distance("damerau", "aderuaxyz")); 80582e69de5Sopenharmony_ci } 80682e69de5Sopenharmony_ci 80782e69de5Sopenharmony_ci #[test] 80882e69de5Sopenharmony_ci fn osa_distance_diff_unequal_length_reversed() { 80982e69de5Sopenharmony_ci assert_eq!(6, osa_distance("aderuaxyz", "damerau")); 81082e69de5Sopenharmony_ci } 81182e69de5Sopenharmony_ci 81282e69de5Sopenharmony_ci #[test] 81382e69de5Sopenharmony_ci fn osa_distance_diff_comedians() { 81482e69de5Sopenharmony_ci assert_eq!(5, osa_distance("Stewart", "Colbert")); 81582e69de5Sopenharmony_ci } 81682e69de5Sopenharmony_ci 81782e69de5Sopenharmony_ci #[test] 81882e69de5Sopenharmony_ci fn osa_distance_many_transpositions() { 81982e69de5Sopenharmony_ci assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk")); 82082e69de5Sopenharmony_ci } 82182e69de5Sopenharmony_ci 82282e69de5Sopenharmony_ci #[test] 82382e69de5Sopenharmony_ci fn osa_distance_diff_longer() { 82482e69de5Sopenharmony_ci let a = "The quick brown fox jumped over the angry dog."; 82582e69de5Sopenharmony_ci let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; 82682e69de5Sopenharmony_ci assert_eq!(36, osa_distance(a, b)); 82782e69de5Sopenharmony_ci } 82882e69de5Sopenharmony_ci 82982e69de5Sopenharmony_ci #[test] 83082e69de5Sopenharmony_ci fn osa_distance_beginning_transposition() { 83182e69de5Sopenharmony_ci assert_eq!(1, osa_distance("foobar", "ofobar")); 83282e69de5Sopenharmony_ci } 83382e69de5Sopenharmony_ci 83482e69de5Sopenharmony_ci #[test] 83582e69de5Sopenharmony_ci fn osa_distance_end_transposition() { 83682e69de5Sopenharmony_ci assert_eq!(1, osa_distance("specter", "spectre")); 83782e69de5Sopenharmony_ci } 83882e69de5Sopenharmony_ci 83982e69de5Sopenharmony_ci #[test] 84082e69de5Sopenharmony_ci fn osa_distance_restricted_edit() { 84182e69de5Sopenharmony_ci assert_eq!(4, osa_distance("a cat", "an abct")); 84282e69de5Sopenharmony_ci } 84382e69de5Sopenharmony_ci 84482e69de5Sopenharmony_ci #[test] 84582e69de5Sopenharmony_ci fn damerau_levenshtein_empty() { 84682e69de5Sopenharmony_ci assert_eq!(0, damerau_levenshtein("", "")); 84782e69de5Sopenharmony_ci } 84882e69de5Sopenharmony_ci 84982e69de5Sopenharmony_ci #[test] 85082e69de5Sopenharmony_ci fn damerau_levenshtein_same() { 85182e69de5Sopenharmony_ci assert_eq!(0, damerau_levenshtein("damerau", "damerau")); 85282e69de5Sopenharmony_ci } 85382e69de5Sopenharmony_ci 85482e69de5Sopenharmony_ci #[test] 85582e69de5Sopenharmony_ci fn damerau_levenshtein_first_empty() { 85682e69de5Sopenharmony_ci assert_eq!(7, damerau_levenshtein("", "damerau")); 85782e69de5Sopenharmony_ci } 85882e69de5Sopenharmony_ci 85982e69de5Sopenharmony_ci #[test] 86082e69de5Sopenharmony_ci fn damerau_levenshtein_second_empty() { 86182e69de5Sopenharmony_ci assert_eq!(7, damerau_levenshtein("damerau", "")); 86282e69de5Sopenharmony_ci } 86382e69de5Sopenharmony_ci 86482e69de5Sopenharmony_ci #[test] 86582e69de5Sopenharmony_ci fn damerau_levenshtein_diff() { 86682e69de5Sopenharmony_ci assert_eq!(2, damerau_levenshtein("ca", "abc")); 86782e69de5Sopenharmony_ci } 86882e69de5Sopenharmony_ci 86982e69de5Sopenharmony_ci #[test] 87082e69de5Sopenharmony_ci fn damerau_levenshtein_diff_short() { 87182e69de5Sopenharmony_ci assert_eq!(3, damerau_levenshtein("damerau", "aderua")); 87282e69de5Sopenharmony_ci } 87382e69de5Sopenharmony_ci 87482e69de5Sopenharmony_ci #[test] 87582e69de5Sopenharmony_ci fn damerau_levenshtein_diff_reversed() { 87682e69de5Sopenharmony_ci assert_eq!(3, damerau_levenshtein("aderua", "damerau")); 87782e69de5Sopenharmony_ci } 87882e69de5Sopenharmony_ci 87982e69de5Sopenharmony_ci #[test] 88082e69de5Sopenharmony_ci fn damerau_levenshtein_diff_multibyte() { 88182e69de5Sopenharmony_ci assert_eq!(3, damerau_levenshtein("öঙ香", "abc")); 88282e69de5Sopenharmony_ci assert_eq!(3, damerau_levenshtein("abc", "öঙ香")); 88382e69de5Sopenharmony_ci } 88482e69de5Sopenharmony_ci 88582e69de5Sopenharmony_ci #[test] 88682e69de5Sopenharmony_ci fn damerau_levenshtein_diff_unequal_length() { 88782e69de5Sopenharmony_ci assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz")); 88882e69de5Sopenharmony_ci } 88982e69de5Sopenharmony_ci 89082e69de5Sopenharmony_ci #[test] 89182e69de5Sopenharmony_ci fn damerau_levenshtein_diff_unequal_length_reversed() { 89282e69de5Sopenharmony_ci assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau")); 89382e69de5Sopenharmony_ci } 89482e69de5Sopenharmony_ci 89582e69de5Sopenharmony_ci #[test] 89682e69de5Sopenharmony_ci fn damerau_levenshtein_diff_comedians() { 89782e69de5Sopenharmony_ci assert_eq!(5, damerau_levenshtein("Stewart", "Colbert")); 89882e69de5Sopenharmony_ci } 89982e69de5Sopenharmony_ci 90082e69de5Sopenharmony_ci #[test] 90182e69de5Sopenharmony_ci fn damerau_levenshtein_many_transpositions() { 90282e69de5Sopenharmony_ci assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk")); 90382e69de5Sopenharmony_ci } 90482e69de5Sopenharmony_ci 90582e69de5Sopenharmony_ci #[test] 90682e69de5Sopenharmony_ci fn damerau_levenshtein_diff_longer() { 90782e69de5Sopenharmony_ci let a = "The quick brown fox jumped over the angry dog."; 90882e69de5Sopenharmony_ci let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; 90982e69de5Sopenharmony_ci assert_eq!(36, damerau_levenshtein(a, b)); 91082e69de5Sopenharmony_ci } 91182e69de5Sopenharmony_ci 91282e69de5Sopenharmony_ci #[test] 91382e69de5Sopenharmony_ci fn damerau_levenshtein_beginning_transposition() { 91482e69de5Sopenharmony_ci assert_eq!(1, damerau_levenshtein("foobar", "ofobar")); 91582e69de5Sopenharmony_ci } 91682e69de5Sopenharmony_ci 91782e69de5Sopenharmony_ci #[test] 91882e69de5Sopenharmony_ci fn damerau_levenshtein_end_transposition() { 91982e69de5Sopenharmony_ci assert_eq!(1, damerau_levenshtein("specter", "spectre")); 92082e69de5Sopenharmony_ci } 92182e69de5Sopenharmony_ci 92282e69de5Sopenharmony_ci #[test] 92382e69de5Sopenharmony_ci fn damerau_levenshtein_unrestricted_edit() { 92482e69de5Sopenharmony_ci assert_eq!(3, damerau_levenshtein("a cat", "an abct")); 92582e69de5Sopenharmony_ci } 92682e69de5Sopenharmony_ci 92782e69de5Sopenharmony_ci #[test] 92882e69de5Sopenharmony_ci fn normalized_damerau_levenshtein_diff_short() { 92982e69de5Sopenharmony_ci assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001); 93082e69de5Sopenharmony_ci } 93182e69de5Sopenharmony_ci 93282e69de5Sopenharmony_ci #[test] 93382e69de5Sopenharmony_ci fn normalized_damerau_levenshtein_for_empty_strings() { 93482e69de5Sopenharmony_ci assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001); 93582e69de5Sopenharmony_ci } 93682e69de5Sopenharmony_ci 93782e69de5Sopenharmony_ci #[test] 93882e69de5Sopenharmony_ci fn normalized_damerau_levenshtein_first_empty() { 93982e69de5Sopenharmony_ci assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001); 94082e69de5Sopenharmony_ci } 94182e69de5Sopenharmony_ci 94282e69de5Sopenharmony_ci #[test] 94382e69de5Sopenharmony_ci fn normalized_damerau_levenshtein_second_empty() { 94482e69de5Sopenharmony_ci assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001); 94582e69de5Sopenharmony_ci } 94682e69de5Sopenharmony_ci 94782e69de5Sopenharmony_ci #[test] 94882e69de5Sopenharmony_ci fn normalized_damerau_levenshtein_identical_strings() { 94982e69de5Sopenharmony_ci assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001); 95082e69de5Sopenharmony_ci } 95182e69de5Sopenharmony_ci 95282e69de5Sopenharmony_ci #[test] 95382e69de5Sopenharmony_ci fn sorensen_dice_all() { 95482e69de5Sopenharmony_ci // test cases taken from 95582e69de5Sopenharmony_ci // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11 95682e69de5Sopenharmony_ci 95782e69de5Sopenharmony_ci assert_eq!(1.0, sorensen_dice("a", "a")); 95882e69de5Sopenharmony_ci assert_eq!(0.0, sorensen_dice("a", "b")); 95982e69de5Sopenharmony_ci assert_eq!(1.0, sorensen_dice("", "")); 96082e69de5Sopenharmony_ci assert_eq!(0.0, sorensen_dice("a", "")); 96182e69de5Sopenharmony_ci assert_eq!(0.0, sorensen_dice("", "a")); 96282e69de5Sopenharmony_ci assert_eq!(1.0, sorensen_dice("apple event", "apple event")); 96382e69de5Sopenharmony_ci assert_eq!(0.9090909090909091, sorensen_dice("iphone", "iphone x")); 96482e69de5Sopenharmony_ci assert_eq!(0.0, sorensen_dice("french", "quebec")); 96582e69de5Sopenharmony_ci assert_eq!(1.0, sorensen_dice("france", "france")); 96682e69de5Sopenharmony_ci assert_eq!(0.2, sorensen_dice("fRaNce", "france")); 96782e69de5Sopenharmony_ci assert_eq!(0.8, sorensen_dice("healed", "sealed")); 96882e69de5Sopenharmony_ci assert_eq!( 96982e69de5Sopenharmony_ci 0.7878787878787878, 97082e69de5Sopenharmony_ci sorensen_dice("web applications", "applications of the web") 97182e69de5Sopenharmony_ci ); 97282e69de5Sopenharmony_ci assert_eq!( 97382e69de5Sopenharmony_ci 0.92, 97482e69de5Sopenharmony_ci sorensen_dice( 97582e69de5Sopenharmony_ci "this will have a typo somewhere", 97682e69de5Sopenharmony_ci "this will huve a typo somewhere" 97782e69de5Sopenharmony_ci ) 97882e69de5Sopenharmony_ci ); 97982e69de5Sopenharmony_ci assert_eq!( 98082e69de5Sopenharmony_ci 0.6060606060606061, 98182e69de5Sopenharmony_ci sorensen_dice( 98282e69de5Sopenharmony_ci "Olive-green table for sale, in extremely good condition.", 98382e69de5Sopenharmony_ci "For sale: table in very good condition, olive green in colour." 98482e69de5Sopenharmony_ci ) 98582e69de5Sopenharmony_ci ); 98682e69de5Sopenharmony_ci assert_eq!( 98782e69de5Sopenharmony_ci 0.2558139534883721, 98882e69de5Sopenharmony_ci sorensen_dice( 98982e69de5Sopenharmony_ci "Olive-green table for sale, in extremely good condition.", 99082e69de5Sopenharmony_ci "For sale: green Subaru Impreza, 210,000 miles" 99182e69de5Sopenharmony_ci ) 99282e69de5Sopenharmony_ci ); 99382e69de5Sopenharmony_ci assert_eq!( 99482e69de5Sopenharmony_ci 0.1411764705882353, 99582e69de5Sopenharmony_ci sorensen_dice( 99682e69de5Sopenharmony_ci "Olive-green table for sale, in extremely good condition.", 99782e69de5Sopenharmony_ci "Wanted: mountain bike with at least 21 gears." 99882e69de5Sopenharmony_ci ) 99982e69de5Sopenharmony_ci ); 100082e69de5Sopenharmony_ci assert_eq!( 100182e69de5Sopenharmony_ci 0.7741935483870968, 100282e69de5Sopenharmony_ci sorensen_dice("this has one extra word", "this has one word") 100382e69de5Sopenharmony_ci ); 100482e69de5Sopenharmony_ci } 100582e69de5Sopenharmony_ci} 1006