1//! This library implements string similarity metrics. 2 3#![forbid(unsafe_code)] 4 5use std::char; 6use std::cmp::{max, min}; 7use std::collections::HashMap; 8use std::error::Error; 9use std::fmt::{self, Display, Formatter}; 10use std::hash::Hash; 11use std::str::Chars; 12 13#[derive(Debug, PartialEq)] 14pub enum StrSimError { 15 DifferentLengthArgs, 16} 17 18impl Display for StrSimError { 19 fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> { 20 let text = match self { 21 StrSimError::DifferentLengthArgs => "Differing length arguments provided", 22 }; 23 24 write!(fmt, "{}", text) 25 } 26} 27 28impl Error for StrSimError {} 29 30pub type HammingResult = Result<usize, StrSimError>; 31 32/// Calculates the number of positions in the two sequences where the elements 33/// differ. Returns an error if the sequences have different lengths. 34pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult 35 where Iter1: IntoIterator<Item=Elem1>, 36 Iter2: IntoIterator<Item=Elem2>, 37 Elem1: PartialEq<Elem2> { 38 let (mut ita, mut itb) = (a.into_iter(), b.into_iter()); 39 let mut count = 0; 40 loop { 41 match (ita.next(), itb.next()){ 42 (Some(x), Some(y)) => if x != y { count += 1 }, 43 (None, None) => return Ok(count), 44 _ => return Err(StrSimError::DifferentLengthArgs), 45 } 46 } 47} 48 49/// Calculates the number of positions in the two strings where the characters 50/// differ. Returns an error if the strings have different lengths. 51/// 52/// ``` 53/// use strsim::{hamming, StrSimError::DifferentLengthArgs}; 54/// 55/// assert_eq!(Ok(3), hamming("hamming", "hammers")); 56/// 57/// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham")); 58/// ``` 59pub fn hamming(a: &str, b: &str) -> HammingResult { 60 generic_hamming(a.chars(), b.chars()) 61} 62 63/// Calculates the Jaro similarity between two sequences. The returned value 64/// is between 0.0 and 1.0 (higher value means more similar). 
pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
    where &'a Iter1: IntoIterator<Item=Elem1>,
          &'b Iter2: IntoIterator<Item=Elem2>,
          Elem1: PartialEq<Elem2> {
    let a_len = a.into_iter().count();
    let b_len = b.into_iter().count();

    // The check for lengths of one here is to prevent integer overflow when
    // calculating the search range.
    if a_len == 0 && b_len == 0 {
        return 1.0;
    } else if a_len == 0 || b_len == 0 {
        return 0.0;
    } else if a_len == 1 && b_len == 1 {
        return if a.into_iter().eq(b.into_iter()) { 1.0 } else { 0.0 };
    }

    // Elements only "match" when they are at most `search_range` positions
    // apart; both lengths are >= 2 here, so the subtraction cannot underflow.
    let search_range = (max(a_len, b_len) / 2) - 1;

    // b_consumed[j] is set once b's j-th element has been matched, so the
    // same element cannot be matched by two different elements of `a`.
    let mut b_consumed = Vec::with_capacity(b_len);
    for _ in 0..b_len {
        b_consumed.push(false);
    }
    let mut matches = 0.0;

    let mut transpositions = 0.0;
    // Index in `b` of the most recent match; a later match landing at a
    // smaller index is counted as a transposition.
    let mut b_match_index = 0;

    for (i, a_elem) in a.into_iter().enumerate() {
        let min_bound =
            // prevent integer wrapping
            if i > search_range {
                max(0, i - search_range)
            } else {
                0
            };

        let max_bound = min(b_len - 1, i + search_range);

        // Search window is empty for this element of `a`.
        if min_bound > max_bound {
            continue;
        }

        // Scan `b` for the first unconsumed equal element inside the window.
        for (j, b_elem) in b.into_iter().enumerate() {
            if min_bound <= j && j <= max_bound && a_elem == b_elem &&
               !b_consumed[j] {
                b_consumed[j] = true;
                matches += 1.0;

                if j < b_match_index {
                    transpositions += 1.0;
                }
                b_match_index = j;

                break;
            }
        }
    }

    if matches == 0.0 {
        0.0
    } else {
        // Jaro formula: mean of the match ratio of each sequence and the
        // proportion of matches that were not transposed.
        (1.0 / 3.0) * ((matches / a_len as f64) +
                       (matches / b_len as f64) +
                       ((matches - transpositions) / matches))
    }
}

/// Wraps a `&str` so that `&StringWrapper` implements `IntoIterator` over
/// `char`, letting the generic metrics operate on string characters.
struct StringWrapper<'a>(&'a str);

impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> {
    type Item = char;
    type IntoIter = Chars<'b>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.chars()
    }
}

/// Calculates the Jaro similarity between two strings. The returned value
/// is between 0.0 and 1.0 (higher value means more similar).
///
/// ```
/// use strsim::jaro;
///
/// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() <
///         0.001);
/// ```
pub fn jaro(a: &str, b: &str) -> f64 {
    generic_jaro(&StringWrapper(a), &StringWrapper(b))
}

/// Like Jaro but gives a boost to sequences that have a common prefix.
pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
    where &'a Iter1: IntoIterator<Item=Elem1>,
          &'b Iter2: IntoIterator<Item=Elem2>,
          Elem1: PartialEq<Elem2> {
    let jaro_distance = generic_jaro(a, b);

    // Don't limit the length of the common prefix
    let prefix_length = a.into_iter()
        .zip(b.into_iter())
        .take_while(|&(ref a_elem, ref b_elem)| a_elem == b_elem)
        .count();

    // Winkler boost: each shared prefix element closes 10% of the remaining
    // gap between the Jaro score and a perfect 1.0.
    let jaro_winkler_distance =
        jaro_distance + (0.1 * prefix_length as f64 * (1.0 - jaro_distance));

    // Clamp so an unbounded prefix cannot push the score above 1.0.
    if jaro_winkler_distance <= 1.0 {
        jaro_winkler_distance
    } else {
        1.0
    }
}

/// Like Jaro but gives a boost to strings that have a common prefix.
///
/// ```
/// use strsim::jaro_winkler;
///
/// assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
///         0.001);
/// ```
pub fn jaro_winkler(a: &str, b: &str) -> f64 {
    generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b))
}

/// Calculates the minimum number of insertions, deletions, and substitutions
/// required to change one sequence into the other.
194/// 195/// ``` 196/// use strsim::generic_levenshtein; 197/// 198/// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6])); 199/// ``` 200pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize 201 where &'a Iter1: IntoIterator<Item=Elem1>, 202 &'b Iter2: IntoIterator<Item=Elem2>, 203 Elem1: PartialEq<Elem2> { 204 let b_len = b.into_iter().count(); 205 206 if a.into_iter().next().is_none() { return b_len; } 207 208 let mut cache: Vec<usize> = (1..b_len+1).collect(); 209 210 let mut result = 0; 211 212 for (i, a_elem) in a.into_iter().enumerate() { 213 result = i + 1; 214 let mut distance_b = i; 215 216 for (j, b_elem) in b.into_iter().enumerate() { 217 let cost = if a_elem == b_elem { 0usize } else { 1usize }; 218 let distance_a = distance_b + cost; 219 distance_b = cache[j]; 220 result = min(result + 1, min(distance_a, distance_b + 1)); 221 cache[j] = result; 222 } 223 } 224 225 result 226} 227 228/// Calculates the minimum number of insertions, deletions, and substitutions 229/// required to change one string into the other. 230/// 231/// ``` 232/// use strsim::levenshtein; 233/// 234/// assert_eq!(3, levenshtein("kitten", "sitting")); 235/// ``` 236pub fn levenshtein(a: &str, b: &str) -> usize { 237 generic_levenshtein(&StringWrapper(a), &StringWrapper(b)) 238} 239 240/// Calculates a normalized score of the Levenshtein algorithm between 0.0 and 241/// 1.0 (inclusive), where 1.0 means the strings are the same. 
///
/// ```
/// use strsim::normalized_levenshtein;
///
/// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
/// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_levenshtein("", "second").abs() < 0.00001);
/// assert!(normalized_levenshtein("first", "").abs() < 0.00001);
/// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    // The raw distance never exceeds the longer string's character count,
    // so the quotient — and the result — stays within [0.0, 1.0].
    1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}

/// Like Levenshtein but allows for adjacent transpositions. Each substring can
/// only be edited once.
///
/// ```
/// use strsim::osa_distance;
///
/// assert_eq!(3, osa_distance("ab", "bca"));
/// ```
pub fn osa_distance(a: &str, b: &str) -> usize {
    let a_len = a.chars().count();
    let b_len = b.chars().count();
    if a == b { return 0; }
    else if a_len == 0 { return b_len; }
    else if b_len == 0 { return a_len; }

    // Rolling DP rows: the row two back (needed for the transposition case),
    // the previous row, and the row currently being filled.
    let mut prev_two_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
    let mut prev_distances: Vec<usize> = Vec::with_capacity(b_len + 1);
    let mut curr_distances: Vec<usize> = Vec::with_capacity(b_len + 1);

    // Placeholder values; the `i > 0 && j > 0` guard below guarantees they
    // are overwritten with real characters before they are ever compared.
    let mut prev_a_char = char::MAX;
    let mut prev_b_char = char::MAX;

    // Base row: turning the empty prefix of `a` into the first `i` characters
    // of `b` costs `i` insertions.
    for i in 0..(b_len + 1) {
        prev_two_distances.push(i);
        prev_distances.push(i);
        curr_distances.push(0);
    }

    for (i, a_char) in a.chars().enumerate() {
        curr_distances[0] = i + 1;

        for (j, b_char) in b.chars().enumerate() {
            let cost = if a_char == b_char { 0 } else { 1 };
            // Minimum of insertion, deletion, and substitution.
            curr_distances[j + 1] = min(curr_distances[j] + 1,
                                        min(prev_distances[j + 1] + 1,
                                            prev_distances[j] + cost));
            // Adjacent transposition: the current pair of `a` equals the
            // current pair of `b` reversed, so one swap covers both.
            if i > 0 && j > 0 && a_char != b_char &&
               a_char == prev_b_char && b_char == prev_a_char {
                curr_distances[j + 1] = min(curr_distances[j + 1],
                                            prev_two_distances[j - 1] + 1);
            }

            prev_b_char = b_char;
        }

        // Rotate the rows; clone_from reuses the existing allocations.
        prev_two_distances.clone_from(&prev_distances);
        prev_distances.clone_from(&curr_distances);
        prev_a_char = a_char;
    }

    curr_distances[b_len]

}

/* Returns the final index for a value in a single vector that represents a fixed
   2d grid */
fn flat_index(i: usize, j: usize, width: usize) -> usize {
    j * width + i
}

/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::generic_damerau_levenshtein;
///
/// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1]));
/// ```
pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize
    where Elem: Eq + Hash + Clone {
    let a_len = a_elems.len();
    let b_len = b_elems.len();

    if a_len == 0 { return b_len; }
    if b_len == 0 { return a_len; }

    // (a_len + 2) x (b_len + 2) grid stored flat; the extra leading row and
    // column hold `max_distance` sentinels used by the transposition lookup.
    let width = a_len + 2;
    let mut distances = vec![0; (a_len + 2) * (b_len + 2)];
    // Upper bound on any edit distance between the two sequences.
    let max_distance = a_len + b_len;
    distances[0] = max_distance;

    for i in 0..(a_len + 1) {
        distances[flat_index(i + 1, 0, width)] = max_distance;
        distances[flat_index(i + 1, 1, width)] = i;
    }

    for j in 0..(b_len + 1) {
        distances[flat_index(0, j + 1, width)] = max_distance;
        distances[flat_index(1, j + 1, width)] = j;
    }

    // For each element value: the last (1-based) position in `a` at which it
    // was seen on a previous outer iteration.
    let mut elems: HashMap<Elem, usize> = HashMap::with_capacity(64);

    for i in 1..(a_len + 1) {
        // Last (1-based) position in `b` scanned so far on this row whose
        // element equals a_elems[i - 1].
        let mut db = 0;

        for j in 1..(b_len + 1) {
            // Row at which b_elems[j - 1] last appeared in `a` (0 if never).
            let k = match elems.get(&b_elems[j - 1]) {
                Some(&value) => value,
                None => 0
            };

            let insertion_cost = distances[flat_index(i, j + 1, width)] + 1;
            let deletion_cost = distances[flat_index(i + 1, j, width)] + 1;
            // Transpose the pair matched at (k, db): the distance before the
            // pair, plus one swap, plus deletions/insertions for everything
            // between the two matched positions.
            let transposition_cost = distances[flat_index(k, db, width)] +
                (i - k - 1) + 1 + (j - db - 1);

            let mut substitution_cost = distances[flat_index(i, j, width)] + 1;
            if a_elems[i - 1] == b_elems[j - 1] {
                db = j;
                // Equal elements: the "substitution" is free.
                substitution_cost -= 1;
            }

            distances[flat_index(i + 1, j + 1, width)] = min(substitution_cost,
                min(insertion_cost, min(deletion_cost, transposition_cost)));
        }

        elems.insert(a_elems[i - 1].clone(), i);
    }

    distances[flat_index(a_len + 1, b_len + 1, width)]
}

/// Like optimal string alignment, but substrings can be edited an unlimited
/// number of times, and the triangle inequality holds.
///
/// ```
/// use strsim::damerau_levenshtein;
///
/// assert_eq!(2, damerau_levenshtein("ab", "bca"));
/// ```
pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
    // Collect into slices so elements can be hashed and indexed by position.
    let (x, y): (Vec<_>, Vec<_>) = (a.chars().collect(), b.chars().collect());
    generic_damerau_levenshtein(x.as_slice(), y.as_slice())
}

/// Calculates a normalized score of the Damerau–Levenshtein algorithm between
/// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same.
///
/// ```
/// use strsim::normalized_damerau_levenshtein;
///
/// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
/// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
/// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
/// ```
pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
    if a.is_empty() && b.is_empty() {
        return 1.0;
    }
    // The distance is at most the longer string's character count, so the
    // quotient is in [0.0, 1.0].
    1.0 - (damerau_levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
}

/// Returns an Iterator of char tuples.
fn bigrams(s: &str) -> impl Iterator<Item=(char, char)> + '_ {
    // Pair each character with its successor; an n-char string yields
    // n - 1 bigrams (and 0 for strings shorter than two chars).
    s.chars().zip(s.chars().skip(1))
}


/// Calculates a Sørensen-Dice similarity distance using bigrams.
/// See http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient.
///
/// ```
/// use strsim::sorensen_dice;
///
/// assert_eq!(1.0, sorensen_dice("", ""));
/// assert_eq!(0.0, sorensen_dice("", "a"));
/// assert_eq!(0.0, sorensen_dice("french", "quebec"));
/// assert_eq!(1.0, sorensen_dice("ferris", "ferris"));
/// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris"));
/// ```
pub fn sorensen_dice(a: &str, b: &str) -> f64 {
    // implementation guided by
    // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6

    // Whitespace is ignored entirely.
    let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect();
    let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect();

    // Measure length in characters, not bytes: a string of n chars produces
    // n - 1 bigrams, so byte lengths would inflate the denominator for
    // multi-byte (non-ASCII) input. For ASCII the two counts are identical.
    let a_char_count = a.chars().count();
    let b_char_count = b.chars().count();

    if a_char_count == 0 && b_char_count == 0 {
        return 1.0;
    }

    if a_char_count == 0 || b_char_count == 0 {
        return 0.0;
    }

    if a == b {
        return 1.0;
    }

    // A string with fewer than two chars has no bigrams, so (since the
    // strings are unequal here) there is nothing to overlap.
    if a_char_count < 2 || b_char_count < 2 {
        return 0.0;
    }

    // Multiset of `a`'s bigrams; counts are decremented as bigrams from `b`
    // are matched so duplicates count only as often as they occur in both.
    let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new();

    for bigram in bigrams(&a) {
        *a_bigrams.entry(bigram).or_insert(0) += 1;
    }

    let mut intersection_size = 0;

    for bigram in bigrams(&b) {
        a_bigrams.entry(bigram).and_modify(|bi| {
            if *bi > 0 {
                *bi -= 1;
                intersection_size += 1;
            }
        });
    }

    // Dice coefficient: 2 * |intersection| / (|bigrams(a)| + |bigrams(b)|),
    // where the bigram totals are (chars - 1) each.
    (2 * intersection_size) as f64 / (a_char_count + b_char_count - 2) as f64
}


#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn bigrams_iterator() {
        let mut bi = bigrams("abcde");

        assert_eq!(Some(('a', 'b')), bi.next());
        assert_eq!(Some(('b', 'c')), bi.next());
        assert_eq!(Some(('c', 'd')), bi.next());
        assert_eq!(Some(('d', 'e')), bi.next());
        assert_eq!(None, bi.next());
    }

    fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
        assert_eq!(Ok(dist), hamming(str1, str2));
    }

    #[test]
    fn hamming_empty() {
        assert_hamming_dist(0, "", "")
    }

    #[test]
    fn hamming_same() {
        assert_hamming_dist(0, "hamming", "hamming")
    }

    #[test]
    fn hamming_numbers() {
        assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
    }

    #[test]
    fn hamming_diff() {
        assert_hamming_dist(3, "hamming", "hammers")
    }

    #[test]
    fn hamming_diff_multibyte() {
        assert_hamming_dist(2, "hamming", "h香mmüng");
    }

    #[test]
    fn hamming_unequal_length() {
        assert_eq!(
            Err(StrSimError::DifferentLengthArgs),
            generic_hamming("ham".chars(), "hamming".chars())
        );
    }

    #[test]
    fn hamming_names() {
        assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre")
    }

    #[test]
    fn jaro_both_empty() {
        assert_eq!(1.0, jaro("", ""));
    }

    #[test]
    fn jaro_first_empty() {
        assert_eq!(0.0, jaro("", "jaro"));
    }

    #[test]
    fn jaro_second_empty() {
        assert_eq!(0.0, jaro("distance", ""));
    }

    #[test]
    fn jaro_same() {
        assert_eq!(1.0, jaro("jaro", "jaro"));
    }

    #[test]
    fn jaro_multibyte() {
        assert!((0.818 - jaro("testabctest", "testöঙ香test")) < 0.001);
        assert!((0.818 - jaro("testöঙ香test", "testabctest")) < 0.001);
    }

    #[test]
    fn jaro_diff_short() {
        assert!((0.767 - jaro("dixon", "dicksonx")).abs() < 0.001);
    }

    #[test]
    fn jaro_diff_one_character() {
        assert_eq!(0.0, jaro("a", "b"));
    }

    #[test]
    fn jaro_same_one_character() {
        assert_eq!(1.0, jaro("a", "a"));
    }

    #[test]
    fn generic_jaro_diff() {
        assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
    }

    #[test]
    fn jaro_diff_one_and_two() {
        assert!((0.83 - jaro("a", "ab")).abs() < 0.01);
    }

    #[test]
    fn jaro_diff_two_and_one() {
        assert!((0.83 - jaro("ab", "a")).abs() < 0.01);
    }

    #[test]
    fn jaro_diff_no_transposition() {
        assert!((0.822 - jaro("dwayne", "duane")).abs() < 0.001);
    }

    #[test]
    fn jaro_diff_with_transposition() {
        assert!((0.944 - jaro("martha", "marhta")).abs() < 0.001);
    }

    #[test]
    fn jaro_names() {
        assert!((0.392 - jaro("Friedrich Nietzsche",
                              "Jean-Paul Sartre")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_both_empty() {
        assert_eq!(1.0, jaro_winkler("", ""));
    }

    #[test]
    fn jaro_winkler_first_empty() {
        assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
    }

    #[test]
    fn jaro_winkler_second_empty() {
        assert_eq!(0.0, jaro_winkler("distance", ""));
    }

    #[test]
    fn jaro_winkler_same() {
        assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
    }

    #[test]
    fn jaro_winkler_multibyte() {
        assert!((0.89 - jaro_winkler("testabctest", "testöঙ香test")).abs() <
                0.001);
        assert!((0.89 - jaro_winkler("testöঙ香test", "testabctest")).abs() <
                0.001);
    }

    #[test]
    fn jaro_winkler_diff_short() {
        assert!((0.813 - jaro_winkler("dixon", "dicksonx")).abs() < 0.001);
        assert!((0.813 - jaro_winkler("dicksonx", "dixon")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_diff_one_character() {
        assert_eq!(0.0, jaro_winkler("a", "b"));
    }

    #[test]
    fn jaro_winkler_same_one_character() {
        assert_eq!(1.0, jaro_winkler("a", "a"));
    }

    #[test]
    fn jaro_winkler_diff_no_transposition() {
        assert!((0.840 - jaro_winkler("dwayne", "duane")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_diff_with_transposition() {
        assert!((0.961 - jaro_winkler("martha", "marhta")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_names() {
        assert!((0.562 - jaro_winkler("Friedrich Nietzsche",
                                      "Fran-Paul Sartre")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_long_prefix() {
        assert!((0.911 - jaro_winkler("cheeseburger", "cheese fries")).abs() <
                0.001);
    }

    #[test]
    fn jaro_winkler_more_names() {
        assert!((0.868 - jaro_winkler("Thorkel", "Thorgier")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_length_of_one() {
        assert!((0.738 - jaro_winkler("Dinsdale", "D")).abs() < 0.001);
    }

    #[test]
    fn jaro_winkler_very_long_prefix() {
        assert!((1.0 - jaro_winkler("thequickbrownfoxjumpedoverx",
                                    "thequickbrownfoxjumpedovery")).abs() <
                0.001);
    }

    #[test]
    fn levenshtein_empty() {
        assert_eq!(0, levenshtein("", ""));
    }

    #[test]
    fn levenshtein_same() {
        assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
    }

    #[test]
    fn levenshtein_diff_short() {
        assert_eq!(3, levenshtein("kitten", "sitting"));
    }

    #[test]
    fn levenshtein_diff_with_space() {
        assert_eq!(5, levenshtein("hello, world", "bye, world"));
    }

    #[test]
    fn levenshtein_diff_multibyte() {
        assert_eq!(3, levenshtein("öঙ香", "abc"));
        assert_eq!(3, levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(37, levenshtein(a, b));
    }

    #[test]
    fn levenshtein_first_empty() {
        assert_eq!(7, levenshtein("", "sitting"));
    }

    #[test]
    fn levenshtein_second_empty() {
        assert_eq!(6, levenshtein("kitten", ""));
    }

    #[test]
    fn normalized_levenshtein_diff_short() {
        assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_for_empty_strings() {
        assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_first_empty() {
        assert!(normalized_levenshtein("", "second").abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_second_empty() {
        assert!(normalized_levenshtein("first", "").abs() < 0.00001);
    }

    #[test]
    fn normalized_levenshtein_identical_strings() {
        assert!((normalized_levenshtein("identical", "identical") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn osa_distance_empty() {
        assert_eq!(0, osa_distance("", ""));
    }

    #[test]
    fn osa_distance_same() {
        assert_eq!(0, osa_distance("damerau", "damerau"));
    }

    #[test]
    fn osa_distance_first_empty() {
        assert_eq!(7, osa_distance("", "damerau"));
    }

    #[test]
    fn osa_distance_second_empty() {
        assert_eq!(7, osa_distance("damerau", ""));
    }

    #[test]
    fn osa_distance_diff() {
        assert_eq!(3, osa_distance("ca", "abc"));
    }

    #[test]
    fn osa_distance_diff_short() {
        assert_eq!(3, osa_distance("damerau", "aderua"));
    }

    #[test]
    fn osa_distance_diff_reversed() {
        assert_eq!(3, osa_distance("aderua", "damerau"));
    }

    #[test]
    fn osa_distance_diff_multibyte() {
        assert_eq!(3, osa_distance("öঙ香", "abc"));
        assert_eq!(3, osa_distance("abc", "öঙ香"));
    }

    #[test]
    fn osa_distance_diff_unequal_length() {
        assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
    }

    #[test]
    fn osa_distance_diff_unequal_length_reversed() {
        assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
    }

    #[test]
    fn osa_distance_diff_comedians() {
        assert_eq!(5, osa_distance("Stewart", "Colbert"));
    }

    #[test]
    fn osa_distance_many_transpositions() {
        assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn osa_distance_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, osa_distance(a, b));
    }

    #[test]
    fn osa_distance_beginning_transposition() {
        assert_eq!(1, osa_distance("foobar", "ofobar"));
    }

    #[test]
    fn osa_distance_end_transposition() {
        assert_eq!(1, osa_distance("specter", "spectre"));
    }

    #[test]
    fn osa_distance_restricted_edit() {
        assert_eq!(4, osa_distance("a cat", "an abct"));
    }

    #[test]
    fn damerau_levenshtein_empty() {
        assert_eq!(0, damerau_levenshtein("", ""));
    }

    #[test]
    fn damerau_levenshtein_same() {
        assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_first_empty() {
        assert_eq!(7, damerau_levenshtein("", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_second_empty() {
        assert_eq!(7, damerau_levenshtein("damerau", ""));
    }

    #[test]
    fn damerau_levenshtein_diff() {
        assert_eq!(2, damerau_levenshtein("ca", "abc"));
    }

    #[test]
    fn damerau_levenshtein_diff_short() {
        assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
    }

    #[test]
    fn damerau_levenshtein_diff_reversed() {
        assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_multibyte() {
        assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
        assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length() {
        assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length_reversed() {
        assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_comedians() {
        assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
    }

    #[test]
    fn damerau_levenshtein_many_transpositions() {
        assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn damerau_levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, damerau_levenshtein(a, b));
    }

    #[test]
    fn damerau_levenshtein_beginning_transposition() {
        assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
    }

    #[test]
    fn damerau_levenshtein_end_transposition() {
        assert_eq!(1, damerau_levenshtein("specter", "spectre"));
    }

    #[test]
    fn damerau_levenshtein_unrestricted_edit() {
        assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
    }

    #[test]
    fn normalized_damerau_levenshtein_diff_short() {
        assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_for_empty_strings() {
        assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_first_empty() {
        assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_second_empty() {
        assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001);
    }

    #[test]
    fn normalized_damerau_levenshtein_identical_strings() {
        assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001);
    }

    #[test]
    fn sorensen_dice_all() {
        // test cases taken from
        // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11

        assert_eq!(1.0, sorensen_dice("a", "a"));
        assert_eq!(0.0, sorensen_dice("a", "b"));
        assert_eq!(1.0, sorensen_dice("", ""));
        assert_eq!(0.0, sorensen_dice("a", ""));
        assert_eq!(0.0, sorensen_dice("", "a"));
        assert_eq!(1.0, sorensen_dice("apple event", "apple event"));
        assert_eq!(0.9090909090909091, sorensen_dice("iphone", "iphone x"));
        assert_eq!(0.0, sorensen_dice("french", "quebec"));
        assert_eq!(1.0, sorensen_dice("france", "france"));
        assert_eq!(0.2, sorensen_dice("fRaNce", "france"));
        assert_eq!(0.8, sorensen_dice("healed", "sealed"));
        assert_eq!(
            0.7878787878787878,
            sorensen_dice("web applications", "applications of the web")
        );
        assert_eq!(
            0.92,
            sorensen_dice(
                "this will have a typo somewhere",
                "this will huve a typo somewhere"
            )
        );
        assert_eq!(
            0.6060606060606061,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: table in very good condition, olive green in colour."
            )
        );
        assert_eq!(
            0.2558139534883721,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: green Subaru Impreza, 210,000 miles"
            )
        );
        assert_eq!(
            0.1411764705882353,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "Wanted: mountain bike with at least 21 gears."
            )
        );
        assert_eq!(
            0.7741935483870968,
            sorensen_dice("this has one extra word", "this has one word")
        );
    }
}