Jaro similarity

From Rosetta Code
Jaro similarity is a draft programming task. It is not yet considered ready to be promoted as a complete task, for reasons that should be found in its talk page.

The Jaro distance is a measure of similarity between two strings. The higher the Jaro distance for two strings is, the more similar the strings are. The score is normalized such that 0 equates to no similarity and 1 is an exact match.


Definition

The Jaro distance of two given strings and is

Where:

  • is the number of matching characters (see below);
  • is half the number of transpositions (see below).

Two characters from and respectively, are considered matching only if they are the same and not farther than .

Each character of is compared with all its matching characters in . The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.


Example

Given the strings DWAYNE and DUANE we find:


We find a Jaro score of:


Task

Implement the Jaro-distance algorithm and show the distances for each of the following pairs:

  • ("MARTHA", "MARHTA")
  • ("DIXON", "DICKSONX")
  • ("JELLYFISH", "SMELLYFISH")


See also


C

<lang C>#include <stdlib.h>

  1. include <string.h>
  2. include <ctype.h>
  3. include <stdio.h>
  1. define TRUE 1
  2. define FALSE 0
  1. define max(a, b) ((a) > (b) ? (a) : (b))
  2. define min(a, b) ((a) < (b) ? (a) : (b))

double jaro(const char *str1, const char *str2) {

   // length of the strings
   int str1_len = strlen(str1);
   int str2_len = strlen(str2);
   // if both strings are empty return 1
   // if only one of the strings is empty return 0
   if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
   // max distance between two chars to be considered matching
   // floor() is ommitted due to integer division rules
   int match_distance = (int) max(str1_len, str2_len)/2 - 1;
   // arrays of bools that signify if that char in the matching string has a match
   int *str1_matches = calloc(str1_len, sizeof(int));
   int *str2_matches = calloc(str2_len, sizeof(int));
   // number of matches and transpositions
   double matches = 0.0;
   double transpositions = 0.0;
   // find the matches
   for (int i = 0; i < str1_len; i++) {
       // start and end take into account the match distance
       int start = max(0, i - match_distance);
       int end = min(i + match_distance + 1, str2_len);
       for (int k = start; k < end; k++) {
           // if str2 already has a match continue
           if (str2_matches[k]) continue;
           // if str1 and str2 are not
           if (str1[i] != str2[k]) continue;
           // otherwise assume there is a match
           str1_matches[i] = TRUE;
           str2_matches[k] = TRUE;
           matches++;
           break;
       }
   }
   // if there are no matches return 0
   if (matches == 0) {
       free(str1_matches);
       free(str2_matches);
       return 0.0;
   }
   // count transpositions
   int k = 0;
   for (int i = 0; i < str1_len; i++) {
       // if there are no matches in str1 continue
       if (!str1_matches[i]) continue;
       // while there is no match in str2 increment k
       while (!str2_matches[k]) k++;
       // increment transpositions
       if (str1[i] != str2[k]) transpositions++;
       k++;
   }
   // divide the number of transpositions by two as per the algorithm specs
   // this division is valid because the counted transpositions include both
   // instances of the transposed characters.
   transpositions /= 2.0;
   // free the allocated memory
   free(str1_matches);
   free(str2_matches);
   // return the Jaro distance
   return ((matches / str1_len) +
       (matches / str2_len) +
       ((matches - transpositions) / matches)) / 3.0;

}

int main() {

   printf("%f\n", jaro("MARTHA",    "MARHTA"));
   printf("%f\n", jaro("DIXON",     "DICKSONX"));
   printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));

}</lang>

Output:
0.944444
0.766667
0.896296

J

Implementation:

<lang J>jaro=: dyad define

 d=. ((x >.&# y)%2)-1
 e=. (x =/y) * d >: |x -/&(i.@#) y
 xm=. (+./"1 e)#x
 ym=. (+./"2 e)#y
 m=. xm <.&# ym
 t=. (+/xm ~:&(m&{.) ym)%2
 s1=. #x
 s2=. #y
 ((m%s1)+(m%s2)+(m-t)%m)%3

)</lang>

Task examples:

<lang J> 'MARTHA' jaro 'MARHTA' 0.944444

  'DIXON' jaro 'DICKSONX'

0.766667

  'JELLYFISH' jaro 'SMELLYFISH'

0.896296</lang>

Perl

<lang perl>use List::Util qw(min max);

sub jaro {

   my ($s, $t) = @_;
   my $s_len = length($s);
   my $t_len = length($t);
   return 1 if $s_len == 0 and $t_len == 0;
   my $match_distance = int(max($s_len, $t_len) / 2) - 1;
   my @s_matches;
   my @t_matches;
   my @s = split(//, $s);
   my @t = split(//, $t);
   my $matches = 0;
   foreach my $i (0 .. $#s) {
       my $start = max(0, $i - $match_distance);
       my $end = min($i + $match_distance + 1, $t_len);
       foreach my $j ($start .. $end - 1) {
           $t_matches[$j] and next;
           $s[$i] eq $t[$j] or next;
           $s_matches[$i] = 1;
           $t_matches[$j] = 1;
           $matches++;
           last;
       }
   }
   return 0 if $matches == 0;
   my $k              = 0;
   my $transpositions = 0;
   foreach my $i (0 .. $#s) {
       $s_matches[$i] or next;
       until ($t_matches[$k]) { ++$k }
       $s[$i] eq $t[$k] or ++$transpositions;
       ++$k;
   }
   (($matches / $s_len) + ($matches / $t_len) +
       (($matches - $transpositions / 2) / $matches)) / 3;

}

printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>

Output:
0.944444
0.766667
0.896296

Python

<lang python>from __future__ import division

def jaro(s, t):

   s_len = len(s)
   t_len = len(t)
   if s_len == 0 and t_len == 0:
       return 1
   match_distance = (max(s_len, t_len) // 2) - 1
   s_matches = [False] * s_len
   t_matches = [False] * t_len
   matches = 0
   transpositions = 0
   for i in range(s_len):
       start = max(0, i-match_distance)
       end = min(i+match_distance+1, t_len)
       for j in range(start, end):
           if t_matches[j]:
               continue
           if s[i] != t[j]:
               continue
           s_matches[i] = True
           t_matches[j] = True
           matches += 1
           break
   if matches == 0:
       return 0
   k = 0
   for i in range(s_len):
       if not s_matches[i]:
           continue
       while not t_matches[k]:
           k += 1
       if s[i] != t[k]:
           transpositions += 1
       k += 1
   return ((matches / s_len) +
           (matches / t_len) +
           ((matches - transpositions/2) / matches)) / 3

for s,t in [( 'MARTHA', 'MARHTA'),

           (    'DIXON',    'DICKSONX'),
           ('JELLYFISH',  'SMELLYFISH')]:
   print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444
jaro('DIXON', 'DICKSONX') = 0.7666666667
jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963

Ruby

<lang ruby>def jaro(s, t)

   s_len = s.size
   t_len = t.size
   return 1 if s_len == 0 and t_len == 0
   match_distance = ([s_len, t_len].max / 2) - 1
   s_matches = []
   t_matches = []
   matches = 0.0
   transpositions = 0.0
   (0...s_len).each do |i|
       j_start = [0, i-match_distance].max
       j_end = [i+match_distance, t_len-1].min
       (j_start..j_end).each do |j|
           t_matches[j] && next
           s[i] == t[j] || next
           s_matches[i] = true
           t_matches[j] = true
           matches += 1.0
           break
       end
   end
   return 0.0 if matches == 0.0
   k = 0
   (0...s_len).each do |i|
       s_matches[i] || next
       until t_matches[k]
           k += 1
       end
       s[i] == t[k] || (transpositions += 1.0)
       k += 1
   end
   ((matches / s_len) +
     (matches / t_len) +
       ((matches - transpositions/2.0) / matches)) / 3.0

end

for s,t in %w(

      MARTHA      MARHTA
       DIXON    DICKSONX
   JELLYFISH  SMELLYFISH

).each_slice(2)

   puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"

end</lang>

Output:
jaro("MARTHA", "MARHTA") = 0.9444444444
jaro("DIXON", "DICKSONX") = 0.7666666667
jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963

Sidef

<lang ruby>func jaro(s, t) {

   return 1 if (s.is_empty && t.is_empty)
   var s_len = s.len
   var t_len = t.len
   var match_distance = (floor(max(s_len, t_len) / 2) - 1)
   var s_matches = []
   var t_matches = []
   var matches = 0
   var transpositions = 0
   s_len.range.each { |i|
       var start = max(0, i-match_distance)
       var end = min(i+match_distance, t_len-1)
       start ... end -> each { |k|
           t_matches[k] && next
           s[i] == t[k] || next
           s_matches[i] = true
           t_matches[k] = true
           matches++
           break
       }
   }
   return 0 if (matches == 0)
   var k = 0
   s_len.range.each { |i|
       s_matches[i] || next
       while (!t_matches[k]) { ++k }
       s[i] == t[k] || ++transpositions
       ++k
   }
   ((matches / s_len) +
     (matches / t_len) +
       ((matches - transpositions/2) / matches)) / 3

}

for pair in [

   [%c"MARTHA",    %c"MARHTA"],
   [%c"DIXON",     %c"DICKSONX"],
   [%c"JELLYFISH", %c"SMELLYFISH"],

] {

   say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"

}</lang>

Output:
jaro("MARTHA", "MARHTA") = 0.9444444444
jaro("DIXON", "DICKSONX") = 0.7666666667
jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963

zkl

<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){

  cs:=Sink(String);
  foreach i,c in ([0..].zip(str1)){
     str2.find(c,(0).max(i - matchDistance),i + matchDistance) :
     if(Void!=_) cs.write(c);
  }
  cs.close()

}

fcn jaro(str1,str2){

  s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1;
  cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance); 
  matches:=cs12.len().toFloat();
  if(not matches) return(0.0);
  transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2;
  
  ( matches/s1Len + matches/s2Len +
     ((matches - transpositions)/matches) ) / 3.0

}</lang> <lang zkl>foreach s,t in (T(

    T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){
  println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));

}</lang>

Output:
jaro("MARTHA","MARHTA") = 0.9444444444
jaro("DIXON","DICKSONX") = 0.7666666667
jaro("JELLYFISH","SMELLYFISH") = 0.8962962963