Jaro similarity: Difference between revisions
(added java) |
(→{{header|REXX}}: added the REXX language.) |
||
Line 353: | Line 353: | ||
jaro('DIXON', 'DICKSONX') = 0.7666666667 |
jaro('DIXON', 'DICKSONX') = 0.7666666667 |
||
jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963 |
jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963 |
||
</pre> |
|||
=={{header|REXX}}== |
|||
<lang rexx>/*REXX pgm computes the Jaro distance between 2 strings (or a list of strings)*/ |
|||
@.= /*define a default for the @. array. */ |
|||
parse arg @.1 /*obtain an optional character string. */ |
|||
if @.1='' then do /*nothing specified? Use the defaults.*/ |
|||
@.1='MARTHA MARHTA' |
|||
@.2='DIXON DICKSONX' |
|||
@.3='JELLYFISH SMELLYFISH' |
|||
@.4='DWAYNE DUANE' |
|||
end |
|||
do j=1 while @.j\=='' /*process all the strings in the list. */ |
|||
d=jaroDist(@.j) |
|||
say 'Jaro distance is ' format(d,,5) " for strings: " @.j |
|||
end /*j*/ /* └──── digits past decimal point. */ |
|||
exit /*stick a fork in it, we're all done. */ |
|||
/*────────────────────────────────────────────────────────────────────────────*/ |
|||
jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0 |
|||
if L1==0 | L2==0 then return 0 /*check if any string is a null string.*/ |
|||
f=max(L1,L2)%2 - 1 /*calculate furthest distanced allowed.*/ |
|||
r.=0 /*the relative positions of the letters*/ |
|||
do k=1 for L1; _=substr(s.1,k,1) |
|||
p=pos(_,s.2, max(1,k-f)); r.k=p /*see if the character is near enough. */ |
|||
if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */ |
|||
else r.k=0 /* ··· otherwise, don't count it.*/ |
|||
end /*k*/ |
|||
t=0 |
|||
do o=1 for L1; v=o-1; _=substr(s.1, o, 1) |
|||
if pos(_, s.2)==0 | r.o==0 | r.v==0 then iterate |
|||
if r.o<r.v then t=t+1 |
|||
end /*o*/ /* [↑] count number of transpositions.*/ |
|||
if m==0 then return 0 |
|||
return (m/L1 + m/L2 + (m-t)/m) / 3</rexx> |
|||
'''output''' when using the default inputs: |
|||
<pre> |
|||
Jaro distance is 0.94444 for strings: MARTHA MARHTA |
|||
Jaro distance is 0.76667 for strings: DIXON DICKSONX |
|||
Jaro distance is 0.89630 for strings: JELLYFISH SMELLYFISH |
|||
Jaro distance is 0.82222 for strings: DWAYNE DUANE |
|||
</pre> |
</pre> |
||
Revision as of 00:49, 22 February 2016
You are encouraged to solve this task according to the task description, using any language you may know.
The Jaro distance is a measure of similarity between two strings. The higher the Jaro distance for two strings is, the more similar the strings are. The score is normalized such that 0 equates to no similarity and 1 is an exact match.
- Definition
The Jaro distance of two given strings and is
Where:
- is the number of matching characters;
- is half the number of transpositions.
Two characters from and respectively, are considered matching only if they are the same and not farther than .
Each character of is compared with all its matching characters in . The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.
- Example
Given the strings DWAYNE and DUANE we find:
We find a Jaro score of:
- Task
Implement the Jaro-distance algorithm and show the distances for each of the following pairs:
- ("MARTHA", "MARHTA")
- ("DIXON", "DICKSONX")
- ("JELLYFISH", "SMELLYFISH")
- See also
- Jaro–Winkler distance on Wikipedia.
C
<lang C>#include <stdlib.h>
- include <string.h>
- include <ctype.h>
- include <stdio.h>
- define TRUE 1
- define FALSE 0
- define max(a, b) ((a) > (b) ? (a) : (b))
- define min(a, b) ((a) < (b) ? (a) : (b))
double jaro(const char *str1, const char *str2) {
// length of the strings int str1_len = strlen(str1); int str2_len = strlen(str2);
// if both strings are empty return 1 // if only one of the strings is empty return 0 if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
// max distance between two chars to be considered matching // floor() is ommitted due to integer division rules int match_distance = (int) max(str1_len, str2_len)/2 - 1;
// arrays of bools that signify if that char in the matching string has a match int *str1_matches = calloc(str1_len, sizeof(int)); int *str2_matches = calloc(str2_len, sizeof(int));
// number of matches and transpositions double matches = 0.0; double transpositions = 0.0;
// find the matches for (int i = 0; i < str1_len; i++) { // start and end take into account the match distance int start = max(0, i - match_distance); int end = min(i + match_distance + 1, str2_len);
for (int k = start; k < end; k++) { // if str2 already has a match continue if (str2_matches[k]) continue; // if str1 and str2 are not if (str1[i] != str2[k]) continue; // otherwise assume there is a match str1_matches[i] = TRUE; str2_matches[k] = TRUE; matches++; break; } }
// if there are no matches return 0 if (matches == 0) { free(str1_matches); free(str2_matches); return 0.0; }
// count transpositions int k = 0; for (int i = 0; i < str1_len; i++) { // if there are no matches in str1 continue if (!str1_matches[i]) continue; // while there is no match in str2 increment k while (!str2_matches[k]) k++; // increment transpositions if (str1[i] != str2[k]) transpositions++; k++; }
// divide the number of transpositions by two as per the algorithm specs // this division is valid because the counted transpositions include both // instances of the transposed characters. transpositions /= 2.0;
// free the allocated memory free(str1_matches); free(str2_matches);
// return the Jaro distance return ((matches / str1_len) + (matches / str2_len) + ((matches - transpositions) / matches)) / 3.0;
}
int main() {
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
- Output:
0.944444 0.766667 0.896296
J
Implementation:
<lang J>jaro=: dyad define
d=. ((x >.&# y)%2)-1 e=. (x =/y) * d >: |x -/&(i.@#) y xm=. (+./"1 e)#x ym=. (+./"2 e)#y m=. xm <.&# ym t=. (+/xm ~:&(m&{.) ym)%2 s1=. #x s2=. #y ((m%s1)+(m%s2)+(m-t)%m)%3
)</lang>
Task examples:
<lang J> 'MARTHA' jaro 'MARHTA' 0.944444
'DIXON' jaro 'DICKSONX'
0.766667
'JELLYFISH' jaro 'SMELLYFISH'
0.896296</lang>
Java
<lang java>public class JaroDistance {
public static double jaro(String s, String t) { int s_len = s.length(); int t_len = t.length();
if (s_len == 0 && t_len == 0) return 1;
int match_distance = Integer.max(s_len, t_len) / 2 - 1;
boolean[] s_matches = new boolean[s_len]; boolean[] t_matches = new boolean[t_len];
int matches = 0; int transpositions = 0;
for (int i = 0; i < s_len; i++) { int start = Integer.max(0, i-match_distance); int end = Integer.min(i+match_distance+1, t_len);
for (int j = start; j < end; j++) { if (t_matches[j]) continue; if (s.charAt(i) != t.charAt(j)) continue; s_matches[i] = true; t_matches[j] = true; matches++; break; } }
if (matches == 0) return 0;
int k = 0; for (int i = 0; i < s_len; i++) { if (!s_matches[i]) continue; while (!t_matches[k]) k++; if (s.charAt(i) != t.charAt(k)) transpositions++; k++; }
return (((double)matches / s_len) + ((double)matches / t_len) + (((double)matches - transpositions/2.0) / matches)) / 3.0; }
public static void main(String[] args) { System.out.println(jaro( "MARTHA", "MARHTA")); System.out.println(jaro( "DIXON", "DICKSONX")); System.out.println(jaro("JELLYFISH", "SMELLYFISH")); }
}</lang>
- Output:
0.9444444444444445 0.7666666666666666 0.8962962962962964
Perl
<lang perl>use List::Util qw(min max);
sub jaro {
my ($s, $t) = @_;
my $s_len = length($s); my $t_len = length($t);
return 1 if $s_len == 0 and $t_len == 0;
my $match_distance = int(max($s_len, $t_len) / 2) - 1;
my @s_matches; my @t_matches;
my @s = split(//, $s); my @t = split(//, $t);
my $matches = 0; foreach my $i (0 .. $#s) {
my $start = max(0, $i - $match_distance); my $end = min($i + $match_distance + 1, $t_len);
foreach my $j ($start .. $end - 1) { $t_matches[$j] and next; $s[$i] eq $t[$j] or next; $s_matches[$i] = 1; $t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
foreach my $i (0 .. $#s) { $s_matches[$i] or next; until ($t_matches[$k]) { ++$k } $s[$i] eq $t[$k] or ++$transpositions; ++$k; }
(($matches / $s_len) + ($matches / $t_len) + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Python
<lang python>from __future__ import division
def jaro(s, t):
s_len = len(s) t_len = len(t)
if s_len == 0 and t_len == 0: return 1
match_distance = (max(s_len, t_len) // 2) - 1
s_matches = [False] * s_len t_matches = [False] * t_len
matches = 0 transpositions = 0
for i in range(s_len): start = max(0, i-match_distance) end = min(i+match_distance+1, t_len)
for j in range(start, end): if t_matches[j]: continue if s[i] != t[j]: continue s_matches[i] = True t_matches[j] = True matches += 1 break
if matches == 0: return 0
k = 0 for i in range(s_len): if not s_matches[i]: continue while not t_matches[k]: k += 1 if s[i] != t[k]: transpositions += 1 k += 1
return ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
for s,t in [( 'MARTHA', 'MARHTA'),
( 'DIXON', 'DICKSONX'), ('JELLYFISH', 'SMELLYFISH')]: print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
- Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444 jaro('DIXON', 'DICKSONX') = 0.7666666667 jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963
REXX
<lang rexx>/*REXX pgm computes the Jaro distance between 2 strings (or a list of strings)*/ @.= /*define a default for the @. array. */ parse arg @.1 /*obtain an optional character string. */ if @.1= then do /*nothing specified? Use the defaults.*/
@.1='MARTHA MARHTA' @.2='DIXON DICKSONX' @.3='JELLYFISH SMELLYFISH' @.4='DWAYNE DUANE' end
do j=1 while @.j\== /*process all the strings in the list. */ d=jaroDist(@.j) say 'Jaro distance is ' format(d,,5) " for strings: " @.j end /*j*/ /* └──── digits past decimal point. */
exit /*stick a fork in it, we're all done. */ /*────────────────────────────────────────────────────────────────────────────*/ jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0 if L1==0 | L2==0 then return 0 /*check if any string is a null string.*/ f=max(L1,L2)%2 - 1 /*calculate furthest distanced allowed.*/ r.=0 /*the relative positions of the letters*/
do k=1 for L1; _=substr(s.1,k,1) p=pos(_,s.2, max(1,k-f)); r.k=p /*see if the character is near enough. */ if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */ else r.k=0 /* ··· otherwise, don't count it.*/ end /*k*/
t=0
do o=1 for L1; v=o-1; _=substr(s.1, o, 1) if pos(_, s.2)==0 | r.o==0 | r.v==0 then iterate if r.o<r.v then t=t+1 end /*o*/ /* [↑] count number of transpositions.*/
if m==0 then return 0
return (m/L1 + m/L2 + (m-t)/m) / 3</rexx>
output when using the default inputs:
Jaro distance is 0.94444 for strings: MARTHA MARHTA Jaro distance is 0.76667 for strings: DIXON DICKSONX Jaro distance is 0.89630 for strings: JELLYFISH SMELLYFISH Jaro distance is 0.82222 for strings: DWAYNE DUANE
Ruby
<lang ruby>def jaro(s, t)
s_len = s.size t_len = t.size
return 1 if s_len == 0 and t_len == 0 match_distance = ([s_len, t_len].max / 2) - 1
s_matches = [] t_matches = []
matches = 0.0 transpositions = 0.0
(0...s_len).each do |i| j_start = [0, i-match_distance].max j_end = [i+match_distance, t_len-1].min
(j_start..j_end).each do |j| t_matches[j] && next s[i] == t[j] || next s_matches[i] = true t_matches[j] = true matches += 1.0 break end end
return 0.0 if matches == 0.0
k = 0 (0...s_len).each do |i| s_matches[i] || next until t_matches[k] k += 1 end s[i] == t[k] || (transpositions += 1.0) k += 1 end
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2.0) / matches)) / 3.0
end
for s,t in %w(
MARTHA MARHTA DIXON DICKSONX JELLYFISH SMELLYFISH
).each_slice(2)
puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"
end</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
Sidef
<lang ruby>func jaro(s, t) {
return 1 if (s.is_empty && t.is_empty)
var s_len = s.len var t_len = t.len
var match_distance = (floor(max(s_len, t_len) / 2) - 1)
var s_matches = [] var t_matches = []
var matches = 0 var transpositions = 0
s_len.range.each { |i| var start = max(0, i-match_distance) var end = min(i+match_distance, t_len-1)
start ... end -> each { |k| t_matches[k] && next s[i] == t[k] || next s_matches[i] = true t_matches[k] = true matches++ break } }
return 0 if (matches == 0)
var k = 0 s_len.range.each { |i| s_matches[i] || next while (!t_matches[k]) { ++k } s[i] == t[k] || ++transpositions ++k }
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
}
for pair in [
[%c"MARTHA", %c"MARHTA"], [%c"DIXON", %c"DICKSONX"], [%c"JELLYFISH", %c"SMELLYFISH"],
] {
say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"
}</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
zkl
<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){
cs:=Sink(String); foreach i,c in ([0..].zip(str1)){ str2.find(c,(0).max(i - matchDistance),i + matchDistance) : if(Void!=_) cs.write(c); } cs.close()
}
fcn jaro(str1,str2){
s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1; cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance);
matches:=cs12.len().toFloat(); if(not matches) return(0.0); transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2; ( matches/s1Len + matches/s2Len + ((matches - transpositions)/matches) ) / 3.0
}</lang> <lang zkl>foreach s,t in (T(
T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){ println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));
}</lang>
- Output:
jaro("MARTHA","MARHTA") = 0.9444444444 jaro("DIXON","DICKSONX") = 0.7666666667 jaro("JELLYFISH","SMELLYFISH") = 0.8962962963