Jaro similarity
The Jaro distance is a measure of similarity between two strings. The higher the Jaro distance for two strings is, the more similar the strings are. The score is normalized such that 0 equates to no similarity and 1 is an exact match.
- Definition
The Jaro distance of two given strings and is
Where:
- is the number of matching characters (see below);
- is half the number of transpositions (see below).
Two characters from and respectively, are considered matching only if they are the same and not farther than .
Each character of is compared with all its matching characters in . The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.
- Example
Given the strings DWAYNE and DUANE we find:
We find a Jaro score of:
- Task
Implement the Jaro distance algorithm and show the distance value for each of the following pairs:
- ("MARTHA", "MARHTA")
- ("DIXON", "DICKSONX")
- ("JELLYFISH", "SMELLYFISH")
- See also
- Jaro Winkler distance on Wikipedia.
C
<lang C>#include <stdlib.h>
- include <string.h>
- include <ctype.h>
- include <stdio.h>
- define TRUE 1
- define FALSE 0
- define max(a, b) ((a) > (b) ? (a) : (b))
- define min(a, b) ((a) < (b) ? (a) : (b))
double jaro(const char *str1, const char *str2) {
// length of the strings int str1_len = strlen(str1); int str2_len = strlen(str2);
// if both strings are empty return 1 // if only one of the strings is empty return 0 if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
// max distance between two chars to be considered matching // floor() is ommitted due to integer division rules int match_distance = (int) max(str1_len, str2_len)/2 - 1;
// arrays of bools that signify if that char in the matching string has a match int *str1_matches = calloc(str1_len, sizeof(int)); int *str2_matches = calloc(str2_len, sizeof(int));
// number of matches and transpositions double matches = 0.0; double transpositions = 0.0;
// find the matches for (int i = 0; i < str1_len; i++) { // start and end take into account the match distance int start = max(0, i - match_distance); int end = min(i + match_distance + 1, str2_len);
for (int k = start; k < end; k++) { // if str2 already has a match continue if (str2_matches[k]) continue; // if str1 and str2 are not if (str1[i] != str2[k]) continue; // otherwise assume there is a match str1_matches[i] = TRUE; str2_matches[k] = TRUE; matches++; break; } }
// if there are no matches return 0 if (matches == 0) { free(str1_matches); free(str2_matches); return 0.0; }
// count transpositions int k = 0; for (int i = 0; i < str1_len; i++) { // if there are no matches in str1 continue if (!str1_matches[i]) continue; // while there is no match in str2 increment k while (!str2_matches[k]) k++; // increment transpositions if (str1[i] != str2[k]) transpositions++; k++; }
// divide the number of transpositions by two as per the algorithm specs // this division is valid because the counted transpositions include both // instances of the transposed characters. transpositions /= 2.0;
// free the allocated memory free(str1_matches); free(str2_matches);
// return the Jaro distance return ((matches / str1_len) + (matches / str2_len) + ((matches - transpositions) / matches)) / 3.0;
}
int main() {
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
- Output:
0.944444 0.766667 0.896296
Perl
<lang perl>use List::Util qw(min max);
sub jaro {
my ($s, $t) = @_;
my $s_len = length($s); my $t_len = length($t);
if ($s_len == 0 and $t_len == 0) { return 1; }
my $match_distance = int(max($s_len, $t_len) / 2) - 1;
my @s_matches; my @t_matches;
my @s = split(//, $s); my @t = split(//, $t);
my $matches = 0; foreach my $i (0 .. $#s) {
my $start = max(0, $i - $match_distance); my $end = min($i + $match_distance + 1, $t_len);
foreach my $j ($start .. $end - 1) { $t_matches[$j] and next; $s[$i] eq $t[$j] or next; $s_matches[$i] = 1; $t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
foreach my $i (0 .. $#s) { $s_matches[$i] or next; while (not $t_matches[$k]) { ++$k } $s[$i] eq $t[$k] or ++$transpositions; ++$k; }
(($matches / $s_len) + ($matches / $t_len) + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Sidef
<lang ruby>func jaro(s, t) {
var s_len = s.len var t_len = t.len
var match_distance = (floor(max(s_len, t_len) / 2) - 1)
var s_matches = [] var t_matches = []
var matches = 0 var transpositions = 0
s_len.range.each { |i| var start = max(0, i-match_distance) var end = min(i+match_distance, t_len-1)
start ... end -> each { |k| t_matches[k] && next s[i] == t[k] || next s_matches[i] = true t_matches[k] = true matches++ break } }
return 0 if (matches == 0)
var k = 0 s_len.range.each { |i| s_matches[i] || next while (!t_matches[k]) { ++k } s[i] == t[k] || ++transpositions ++k }
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
}
say "%f"%jaro(%c"MARTHA", %c"MARHTA") say "%f"%jaro(%c"DIXON", %c"DICKSONX") say "%f"%jaro(%c"JELLYFISH", %c"SMELLYFISH")</lang>
- Output:
0.944444 0.766667 0.896296