Jaro similarity
The Jaro distance is a measure of similarity between two strings.
You are encouraged to solve this task according to the task description, using any language you may know.
The higher the Jaro distance for two strings is, the more similar the strings are.
The score is normalized such that 0 equates to no similarity and 1 is an exact match.
- Definition
The Jaro distance of two given strings and is
Where:
- is the number of matching characters;
- is half the number of transpositions.
Two characters from and respectively, are considered matching only if they are the same and not farther than .
Each character of is compared with all its matching characters in .
The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.
- Example
Given the strings DWAYNE and DUANE we find:
We find a Jaro score of:
- Task
Implement the Jaro-distance algorithm and show the distances for each of the following pairs:
- ("MARTHA", "MARHTA")
- ("DIXON", "DICKSONX")
- ("JELLYFISH", "SMELLYFISH")
- See also
- Jaro–Winkler distance on Wikipedia.
C
<lang C>#include <stdlib.h>
- include <string.h>
- include <ctype.h>
- include <stdio.h>
- define TRUE 1
- define FALSE 0
- define max(a, b) ((a) > (b) ? (a) : (b))
- define min(a, b) ((a) < (b) ? (a) : (b))
double jaro(const char *str1, const char *str2) {
// length of the strings int str1_len = strlen(str1); int str2_len = strlen(str2);
// if both strings are empty return 1 // if only one of the strings is empty return 0 if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
// max distance between two chars to be considered matching // floor() is ommitted due to integer division rules int match_distance = (int) max(str1_len, str2_len)/2 - 1;
// arrays of bools that signify if that char in the matching string has a match int *str1_matches = calloc(str1_len, sizeof(int)); int *str2_matches = calloc(str2_len, sizeof(int));
// number of matches and transpositions double matches = 0.0; double transpositions = 0.0;
// find the matches for (int i = 0; i < str1_len; i++) { // start and end take into account the match distance int start = max(0, i - match_distance); int end = min(i + match_distance + 1, str2_len);
for (int k = start; k < end; k++) { // if str2 already has a match continue if (str2_matches[k]) continue; // if str1 and str2 are not if (str1[i] != str2[k]) continue; // otherwise assume there is a match str1_matches[i] = TRUE; str2_matches[k] = TRUE; matches++; break; } }
// if there are no matches return 0 if (matches == 0) { free(str1_matches); free(str2_matches); return 0.0; }
// count transpositions int k = 0; for (int i = 0; i < str1_len; i++) { // if there are no matches in str1 continue if (!str1_matches[i]) continue; // while there is no match in str2 increment k while (!str2_matches[k]) k++; // increment transpositions if (str1[i] != str2[k]) transpositions++; k++; }
// divide the number of transpositions by two as per the algorithm specs // this division is valid because the counted transpositions include both // instances of the transposed characters. transpositions /= 2.0;
// free the allocated memory free(str1_matches); free(str2_matches);
// return the Jaro distance return ((matches / str1_len) + (matches / str2_len) + ((matches - transpositions) / matches)) / 3.0;
}
int main() {
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
- Output:
0.944444 0.766667 0.896296
Go
<lang go>package main
import "fmt"
func jaro(str1, str2 string) float64 {
if len(str1) == 0 && len(str2) == 0 { return 1 } if len(str1) == 0 || len(str2) == 0 { return 0 } match_distance := len(str1) if len(str2) > match_distance { match_distance = len(str2) } match_distance = match_distance/2 - 1 str1_matches := make([]bool, len(str1)) str2_matches := make([]bool, len(str2)) matches := 0. transpositions := 0. for i := range str1 { start := i - match_distance if start < 0 { start = 0 } end := i + match_distance + 1 if end > len(str2) { end = len(str2) } for k := start; k < end; k++ { if str2_matches[k] { continue } if str1[i] != str2[k] { continue } str1_matches[i] = true str2_matches[k] = true matches++ break } } if matches == 0 { return 0 } k := 0 for i := range str1 { if !str1_matches[i] { continue } for !str2_matches[k] { k++ } if str1[i] != str2[k] { transpositions++ } k++ } transpositions /= 2 return (matches/float64(len(str1)) + matches/float64(len(str2)) + (matches-transpositions)/matches) / 3
}
func main() {
fmt.Printf("%f\n", jaro("MARTHA", "MARHTA")) fmt.Printf("%f\n", jaro("DIXON", "DICKSONX")) fmt.Printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"))
}</lang>
- Output:
0.944444 0.766667 0.896296
J
Implementation:
<lang J>jaro=: dyad define
d=. ((x >.&# y)%2)-1 e=. (x =/y) * d >: |x -/&(i.@#) y xm=. (+./"1 e)#x ym=. (+./"2 e)#y m=. xm <.&# ym t=. (+/xm ~:&(m&{.) ym)%2 s1=. #x s2=. #y ((m%s1)+(m%s2)+(m-t)%m)%3
)</lang>
Task examples:
<lang J> 'MARTHA' jaro 'MARHTA' 0.944444
'DIXON' jaro 'DICKSONX'
0.766667
'JELLYFISH' jaro 'SMELLYFISH'
0.896296</lang>
Java
<lang java>public class JaroDistance {
public static double jaro(String s, String t) { int s_len = s.length(); int t_len = t.length();
if (s_len == 0 && t_len == 0) return 1;
int match_distance = Integer.max(s_len, t_len) / 2 - 1;
boolean[] s_matches = new boolean[s_len]; boolean[] t_matches = new boolean[t_len];
int matches = 0; int transpositions = 0;
for (int i = 0; i < s_len; i++) { int start = Integer.max(0, i-match_distance); int end = Integer.min(i+match_distance+1, t_len);
for (int j = start; j < end; j++) { if (t_matches[j]) continue; if (s.charAt(i) != t.charAt(j)) continue; s_matches[i] = true; t_matches[j] = true; matches++; break; } }
if (matches == 0) return 0;
int k = 0; for (int i = 0; i < s_len; i++) { if (!s_matches[i]) continue; while (!t_matches[k]) k++; if (s.charAt(i) != t.charAt(k)) transpositions++; k++; }
return (((double)matches / s_len) + ((double)matches / t_len) + (((double)matches - transpositions/2.0) / matches)) / 3.0; }
public static void main(String[] args) { System.out.println(jaro( "MARTHA", "MARHTA")); System.out.println(jaro( "DIXON", "DICKSONX")); System.out.println(jaro("JELLYFISH", "SMELLYFISH")); }
}</lang>
- Output:
0.9444444444444445 0.7666666666666666 0.8962962962962964
PARI/GP
This version was translated from Java and Perl.
<lang parigp> \\Jaro distance between 2 strings s1 and s2. \\ 4/12/16 aev jaroDist(s1,s2)={ my(vt1=Vecsmall(s1),vt2=Vecsmall(s2),n1=#s1,n2=#s2,d,
md=max(n1,n2)\2-1,cs,ce,mc=0,tr=0,k=1,ds, s1m=vector(n1,z,0),s2m=vector(n2,z,0));
if(!n1||!n2, return(0)); for(i=1,n1,
cs=max(1,i-md); ce=min(i+md+1,n2); for(j=cs,ce, if(s2m[j],next); if(vt1[i]!=vt2[j], next); mc++; s1m[i]=1; s2m[j]=1; break; );\\fend j
);\\fend i if(!mc, return(0)); for(i=1,n1,
if(!s1m[i], next); while(!s2m[k], k++); if(vt1[i]!=vt2[k], tr++); k++
);\\fend i d=(mc/n1+mc/n2+(mc-tr/2)/mc)/3.0; ds=Strprintf("%.5f",d); print(" *** Jaro distance is: ",ds," for strings: ",s1,", ",s2); return(d); }
{ \\ Testing: jaroDist("MARTHA","MARHTA"); jaroDist("DIXON","DICKSONX"); jaroDist("JELLYFISH","SMELLYFISH"); jaroDist("DWAYNE","DUANE"); } </lang>
- Output:
*** Jaro distance is: 0.94444 for strings: MARTHA, MARHTA *** Jaro distance is: 0.76667 for strings: DIXON, DICKSONX *** Jaro distance is: 0.89630 for strings: JELLYFISH, SMELLYFISH *** Jaro distance is: 0.82222 for strings: DWAYNE, DUANE
Perl
<lang perl>use List::Util qw(min max);
sub jaro {
my ($s, $t) = @_;
my $s_len = length($s); my $t_len = length($t);
return 1 if $s_len == 0 and $t_len == 0;
my $match_distance = int(max($s_len, $t_len) / 2) - 1;
my @s_matches; my @t_matches;
my @s = split(//, $s); my @t = split(//, $t);
my $matches = 0; foreach my $i (0 .. $#s) {
my $start = max(0, $i - $match_distance); my $end = min($i + $match_distance + 1, $t_len);
foreach my $j ($start .. $end - 1) { $t_matches[$j] and next; $s[$i] eq $t[$j] or next; $s_matches[$i] = 1; $t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
foreach my $i (0 .. $#s) { $s_matches[$i] or next; until ($t_matches[$k]) { ++$k } $s[$i] eq $t[$k] or ++$transpositions; ++$k; }
(($matches / $s_len) + ($matches / $t_len) + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Perl 6
<lang perl6>sub jaro ($s, $t) {
return 1 if $s eq $t;
my $s_len = + my @s = $s.comb; my $t_len = + my @t = $t.comb;
my $match_distance = ($s_len max $t_len) div 2 - 1;
my @s_matches; my @t_matches; my $matches = 0;
for ^@s -> $i {
my $start = 0 max $i - $match_distance; my $end = $i + $match_distance min $t_len;
for $start .. $end -> $j { @t_matches[$j] and next; @s[$i] eq @t[$j] or next; @s_matches[$i] = 1; @t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
for ^@s -> $i { @s_matches[$i] or next; until @t_matches[$k] { ++$k } @s[$i] eq @t[$k] or ++$transpositions; ++$k; }
($matches / $s_len + $matches / $t_len + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Python
<lang python>from __future__ import division
def jaro(s, t):
s_len = len(s) t_len = len(t)
if s_len == 0 and t_len == 0: return 1
match_distance = (max(s_len, t_len) // 2) - 1
s_matches = [False] * s_len t_matches = [False] * t_len
matches = 0 transpositions = 0
for i in range(s_len): start = max(0, i-match_distance) end = min(i+match_distance+1, t_len)
for j in range(start, end): if t_matches[j]: continue if s[i] != t[j]: continue s_matches[i] = True t_matches[j] = True matches += 1 break
if matches == 0: return 0
k = 0 for i in range(s_len): if not s_matches[i]: continue while not t_matches[k]: k += 1 if s[i] != t[k]: transpositions += 1 k += 1
return ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
for s,t in [( 'MARTHA', 'MARHTA'),
( 'DIXON', 'DICKSONX'), ('JELLYFISH', 'SMELLYFISH')]: print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
- Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444 jaro('DIXON', 'DICKSONX') = 0.7666666667 jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963
Racket
(kinda)
Returns an exact value for the Jaro distance.
<lang racket>#lang racket/base
(require data/bit-vector)
(define (jaro-distance str1 str2)
(define str1-len (string-length str1)) (define str2-len (string-length str2)) (cond [(and (zero? str1-len) (zero? str2-len)) 0] [(or (zero? str1-len) (zero? str2-len)) 1] [else ;; vectors of bools that signify if that char in the matching string has a match (define str1-matches (make-bit-vector str1-len)) (define str2-matches (make-bit-vector str2-len)) (define matches ;; max distance between two chars to be considered matching (let ((match-distance (sub1 (quotient (max str1-len str2-len) 2)))) (for/fold ((matches 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1))) (define start (max 0 (- i match-distance))) (define end (min (+ i match-distance 1) str2-len)) (for/fold ((matches matches)) ((k (in-range start end)) (c2 (in-string str2 start)) #:unless (bit-vector-ref str2-matches k) ; if str2 already has a match continue #:when (char=? c1 c2) ; if str1 and str2 are not #:final #t) ;; otherwise assume there is a match (bit-vector-set! str1-matches i #t) (bit-vector-set! str2-matches k #t) (add1 matches))))) (cond [(zero? matches) 0] [else (define-values (transpositions*2 k+) (for/fold ((transpositions 0) (k 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1)) (b1 (in-bit-vector str1-matches)) ;; if there are no matches in str1 continue #:when b1) (define k+ (for/first ((k+ (in-range k str2-len)) (b2 (in-bit-vector str2-matches k)) #:when b2) k+)) (values (+ transpositions (if (char=? c1 (string-ref str2 k+)) 0 1)) ; increment transpositions (add1 k+)))) ;; while there is no match in str2 increment k ;; divide the number of transpositions by two as per the algorithm specs ;; this division is valid because the counted transpositions include both ;; instances of the transposed characters. (define transpositions (quotient transpositions*2 2)) ;; return the Jaro distance (/ (+ (/ matches str1-len) (/ matches str2-len) (/ (- matches transpositions) matches)) 3)])]))
(module+ test
(jaro-distance "MARTHA" "MARHTA"); 0.944444 (exact->inexact (jaro-distance "MARTHA" "MARHTA")); 0.944444 (jaro-distance "DIXON" "DICKSONX"); 0.766667 (exact->inexact (jaro-distance "DIXON" "DICKSONX")); 0.766667 (jaro-distance "JELLYFISH" "SMELLYFISH"); 0.896296 (exact->inexact (jaro-distance "JELLYFISH" "SMELLYFISH"))); 0.896296</lang>
- Output:
The exact->inexact
calls in the tests give an inexact, floating point version of the rational values.
17/18 0.9444444444444444 23/30 0.7666666666666667 121/135 0.8962962962962963
REXX
<lang rexx>/*REXX program computes the Jaro distance between two strings (or a list of strings).*/ @.= /*define a default for the @. array. */ parse arg @.1 /*obtain an optional character string. */ if @.1= then do /*nothing specified? Use the defaults.*/
@.1= 'MARTHA MARHTA' @.2= 'DIXON DICKSONX' @.3= 'JELLYFISH SMELLYFISH' @.4= 'DWAYNE DUANE' end
do j=1 while @.j\== /*process all the strings in the list. */ d=jaroDist(@.j) say 'Jaro distance is ' format(d, , 5) " for strings: " @.j end /*j*/ /* └──── digits past the decimal point.*/
exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0
if L1==0 | L2==0 then return 0 /*check if any string is a null string.*/ f=max(L1,L2)%2 - 1 /*calculate furthest distanced allowed.*/ r.=0 /* [↓] see if the char is near enough.*/ do k=1 for L1; p=pos(substr(s.1, k, 1), s.2, max(1, k-f)); r.k=p if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */ else r.k=0 /* ··· otherwise, don't count it.*/ end /*k*/ t=0 do o=1 for L1; om=o-1 if pos( substr(s.1, o, 1), s.2)==0 | r.o==0 | r.om==0 then iterate if r.o<r.om then t=t+1 end /*o*/ /* [↑] count number of transpositions.*/
if m==0 then return 0 return (m/L1 + m/L2 + (m-t)/m) / 3</lang>
output when using the default inputs:
Jaro distance is 0.94444 for strings: MARTHA MARHTA Jaro distance is 0.76667 for strings: DIXON DICKSONX Jaro distance is 0.89630 for strings: JELLYFISH SMELLYFISH Jaro distance is 0.82222 for strings: DWAYNE DUANE
Ruby
<lang ruby>def jaro(s, t)
s_len = s.size t_len = t.size
return 1 if s_len == 0 and t_len == 0 match_distance = ([s_len, t_len].max / 2) - 1
s_matches = [] t_matches = []
matches = 0.0 transpositions = 0.0
(0...s_len).each do |i| j_start = [0, i-match_distance].max j_end = [i+match_distance, t_len-1].min
(j_start..j_end).each do |j| t_matches[j] && next s[i] == t[j] || next s_matches[i] = true t_matches[j] = true matches += 1.0 break end end
return 0.0 if matches == 0.0
k = 0 (0...s_len).each do |i| s_matches[i] || next until t_matches[k] k += 1 end s[i] == t[k] || (transpositions += 1.0) k += 1 end
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2.0) / matches)) / 3.0
end
for s,t in %w(
MARTHA MARHTA DIXON DICKSONX JELLYFISH SMELLYFISH
).each_slice(2)
puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"
end</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
Sidef
<lang ruby>func jaro(s, t) {
return 1 if (s == t)
var s_len = s.len var t_len = t.len
var match_distance = ((::max(s_len, t_len) // 2) - 1)
var s_matches = [] var t_matches = []
var matches = 0 var transpositions = 0
for i in ^s_len { var start = ::max(0, i-match_distance) var end = ::min(i+match_distance, t_len-1)
for k in (start .. end) { t_matches[k] && next s[i] == t[k] || next s_matches[i] = true t_matches[k] = true matches++ break } }
return 0 if (matches == 0)
var k = 0 for i in ^s_len { s_matches[i] || next while (!t_matches[k]) { ++k } s[i] == t[k] || ++transpositions ++k }
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
}
for pair in [
[%c"MARTHA", %c"MARHTA"], [%c"DIXON", %c"DICKSONX"], [%c"JELLYFISH", %c"SMELLYFISH"],
] {
say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"
}</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
zkl
<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){
cs:=Sink(String); foreach i,c in ([0..].zip(str1)){ str2.find(c,(0).max(i - matchDistance),i + matchDistance) : if(Void!=_) cs.write(c); } cs.close()
}
fcn jaro(str1,str2){
s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1; cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance);
matches:=cs12.len().toFloat(); if(not matches) return(0.0); transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2; ( matches/s1Len + matches/s2Len + ((matches - transpositions)/matches) ) / 3.0
}</lang> <lang zkl>foreach s,t in (T(
T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){ println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));
}</lang>
- Output:
jaro("MARTHA","MARHTA") = 0.9444444444 jaro("DIXON","DICKSONX") = 0.7666666667 jaro("JELLYFISH","SMELLYFISH") = 0.8962962963