Jump to content

Jaro-Winkler distance: Difference between revisions

Added Rust solution
mNo edit summary
(Added Rust solution)
Line 209:
witch | 0.1067
which | 0.1200
</pre>
 
=={{header|Rust}}==
{{trans|Python}}
<lang rust>use std::fs::File;
use std::io::{self, BufRead};
 
fn load_dictionary(filename: &str) -> std::io::Result<Vec<String>> {
let file = File::open(filename)?;
let mut dict = Vec::new();
for line in io::BufReader::new(file).lines() {
dict.push(line?);
}
Ok(dict)
}
 
fn jaro_winkler_distance(string1: &str, string2: &str) -> f64 {
let mut st1 = string1;
let mut st2 = string2;
if st1.len() < st2.len() {
std::mem::swap(&mut st1, &mut st2);
}
let len1 = st1.len();
let len2 = st2.len();
if len2 == 0 {
return 0.0;
}
let delta = std::cmp::max(0, len2 / 2 - 1);
let mut flag = vec![false; len2];
let mut ch1_match = vec![];
for (idx1, ch1) in st1.chars().enumerate() {
for (idx2, ch2) in st2.chars().enumerate() {
if idx2 <= idx1 + delta && idx2 + delta >= idx1 && ch1 == ch2 && !flag[idx2] {
flag[idx2] = true;
ch1_match.push(ch1);
break;
}
}
}
let matches = ch1_match.len();
if matches == 0 {
return 1.0;
}
let mut transpositions = 0;
let mut idx1 = 0;
for (idx2, ch2) in st2.chars().enumerate() {
if flag[idx2] {
transpositions += (ch2 != ch1_match[idx1]) as i32;
idx1 += 1;
}
}
let m = matches as f64;
let jaro =
(m / (len1 as f64) + m / (len2 as f64) + (m - (transpositions as f64) / 2.0) / m) / 3.0;
let mut commonprefix = 0;
for (c1, c2) in st1.chars().zip(st2.chars()).take(std::cmp::min(4, len2)) {
commonprefix += (c1 == c2) as i32;
}
1.0 - (jaro + commonprefix as f64 * 0.1 * (1.0 - jaro))
}
 
fn within_distance<'a>(
dict: &'a Vec<String>,
max_distance: f64,
stri: &str,
max_to_return: usize,
) -> Vec<(&'a String, f64)> {
let mut arr: Vec<(&String, f64)> = dict
.iter()
.map(|w| (w, jaro_winkler_distance(stri, w)))
.filter(|x| x.1 <= max_distance)
.collect();
// The trait std::cmp::Ord is not implemented for f64, otherwise
// we could just do this:
// arr.sort_by_key(|x| x.1);
let compare_distance = |d1, d2| {
use std::cmp::Ordering;
if d1 < d2 {
Ordering::Less
} else {
if d1 > d2 {
Ordering::Greater
} else {
Ordering::Equal
}
}
};
arr.sort_by(|x, y| compare_distance(x.1, y.1));
arr[0..std::cmp::min(max_to_return, arr.len())].to_vec()
}
 
fn main() {
match load_dictionary("linuxwords.txt") {
Ok(dict) => {
for word in &[
"accomodate",
"definately",
"goverment",
"occured",
"publically",
"recieve",
"seperate",
"untill",
"wich",
] {
println!("Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to '{}' are:", word);
println!(" Word | Distance");
for (w, dist) in within_distance(&dict, 0.15, word, 5) {
println!("{:>14} | {:6.4}", w, dist)
}
println!();
}
}
Err(error) => eprintln!("{}", error),
}
}</lang>
 
{{out}}
The output is slightly different from that of the Python example because I've removed
a trailing zero-width space character from some of the test strings.
<pre>
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'accomodate' are:
Word | Distance
accommodate | 0.0182
accommodated | 0.0333
accommodates | 0.0333
accommodating | 0.0815
accommodation | 0.0815
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'definately' are:
Word | Distance
definitely | 0.0400
defiantly | 0.0422
define | 0.0800
definite | 0.0850
definable | 0.0872
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'goverment' are:
Word | Distance
government | 0.0533
govern | 0.0667
governments | 0.0697
movement | 0.0810
governmental | 0.0833
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'occured' are:
Word | Distance
occurred | 0.0250
occur | 0.0571
occupied | 0.0786
occurs | 0.0905
accursed | 0.0917
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'publically' are:
Word | Distance
publicly | 0.0400
public | 0.0800
publicity | 0.1044
publication | 0.1327
biblically | 0.1400
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'recieve' are:
Word | Distance
receive | 0.0333
received | 0.0625
receiver | 0.0625
receives | 0.0625
relieve | 0.0667
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'seperate' are:
Word | Distance
desperate | 0.0708
separate | 0.0917
temperate | 0.1042
separated | 0.1144
separates | 0.1144
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'untill' are:
Word | Distance
until | 0.0333
untie | 0.1067
untimely | 0.1083
Antilles | 0.1264
untidy | 0.1333
 
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'wich' are:
Word | Distance
witch | 0.0533
which | 0.0600
witches | 0.1143
rich | 0.1167
wick | 0.1167
 
</pre>
1,777

edits

Cookies help us deliver our services. By using our services, you agree to our use of cookies.