Jaro similarity

From Rosetta Code
Revision as of 03:05, 27 May 2016 by rosettacode>Gerard Schildberger (→‎{{header|REXX}}: changed/added comments and whitespace, changed indentations.)
Task
Jaro similarity
You are encouraged to solve this task according to the task description, using any language you may know.

The Jaro distance is a measure of similarity between two strings. The higher the Jaro distance for two strings is, the more similar the strings are. The score is normalized such that 0 equates to no similarity and 1 is an exact match.


Definition

The Jaro distance of two given strings and is

Where:

  • is the number of matching characters;
  • is half the number of transpositions.


Two characters from and respectively, are considered matching only if they are the same and not farther than .

Each character of is compared with all its matching characters in . The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.


Example

Given the strings DWAYNE and DUANE we find:


We find a Jaro score of:


Task

Implement the Jaro-distance algorithm and show the distances for each of the following pairs:

  • ("MARTHA", "MARHTA")
  • ("DIXON", "DICKSONX")
  • ("JELLYFISH", "SMELLYFISH")


See also


C

<lang C>#include <stdlib.h>

  1. include <string.h>
  2. include <ctype.h>
  3. include <stdio.h>
  1. define TRUE 1
  2. define FALSE 0
  1. define max(a, b) ((a) > (b) ? (a) : (b))
  2. define min(a, b) ((a) < (b) ? (a) : (b))

double jaro(const char *str1, const char *str2) {

   // length of the strings
   int str1_len = strlen(str1);
   int str2_len = strlen(str2);
   // if both strings are empty return 1
   // if only one of the strings is empty return 0
   if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
   // max distance between two chars to be considered matching
   // floor() is ommitted due to integer division rules
   int match_distance = (int) max(str1_len, str2_len)/2 - 1;
   // arrays of bools that signify if that char in the matching string has a match
   int *str1_matches = calloc(str1_len, sizeof(int));
   int *str2_matches = calloc(str2_len, sizeof(int));
   // number of matches and transpositions
   double matches = 0.0;
   double transpositions = 0.0;
   // find the matches
   for (int i = 0; i < str1_len; i++) {
       // start and end take into account the match distance
       int start = max(0, i - match_distance);
       int end = min(i + match_distance + 1, str2_len);
       for (int k = start; k < end; k++) {
           // if str2 already has a match continue
           if (str2_matches[k]) continue;
           // if str1 and str2 are not
           if (str1[i] != str2[k]) continue;
           // otherwise assume there is a match
           str1_matches[i] = TRUE;
           str2_matches[k] = TRUE;
           matches++;
           break;
       }
   }
   // if there are no matches return 0
   if (matches == 0) {
       free(str1_matches);
       free(str2_matches);
       return 0.0;
   }
   // count transpositions
   int k = 0;
   for (int i = 0; i < str1_len; i++) {
       // if there are no matches in str1 continue
       if (!str1_matches[i]) continue;
       // while there is no match in str2 increment k
       while (!str2_matches[k]) k++;
       // increment transpositions
       if (str1[i] != str2[k]) transpositions++;
       k++;
   }
   // divide the number of transpositions by two as per the algorithm specs
   // this division is valid because the counted transpositions include both
   // instances of the transposed characters.
   transpositions /= 2.0;
   // free the allocated memory
   free(str1_matches);
   free(str2_matches);
   // return the Jaro distance
   return ((matches / str1_len) +
       (matches / str2_len) +
       ((matches - transpositions) / matches)) / 3.0;

}

int main() {

   printf("%f\n", jaro("MARTHA",    "MARHTA"));
   printf("%f\n", jaro("DIXON",     "DICKSONX"));
   printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));

}</lang>

Output:
0.944444
0.766667
0.896296

J

Implementation:

<lang J>jaro=: dyad define

 d=. ((x >.&# y)%2)-1
 e=. (x =/y) * d >: |x -/&(i.@#) y
 xm=. (+./"1 e)#x
 ym=. (+./"2 e)#y
 m=. xm <.&# ym
 t=. (+/xm ~:&(m&{.) ym)%2
 s1=. #x
 s2=. #y
 ((m%s1)+(m%s2)+(m-t)%m)%3

)</lang>

Task examples:

<lang J> 'MARTHA' jaro 'MARHTA' 0.944444

  'DIXON' jaro 'DICKSONX'

0.766667

  'JELLYFISH' jaro 'SMELLYFISH'

0.896296</lang>

Java

<lang java>public class JaroDistance {

   public static double jaro(String s, String t) {
       int s_len = s.length();
       int t_len = t.length();
       if (s_len == 0 && t_len == 0) return 1;
       int match_distance = Integer.max(s_len, t_len) / 2 - 1;
       boolean[] s_matches = new boolean[s_len];
       boolean[] t_matches = new boolean[t_len];
       int matches = 0;
       int transpositions = 0;
       for (int i = 0; i < s_len; i++) {
           int start = Integer.max(0, i-match_distance);
           int end = Integer.min(i+match_distance+1, t_len);
           for (int j = start; j < end; j++) {
               if (t_matches[j]) continue;
               if (s.charAt(i) != t.charAt(j)) continue;
               s_matches[i] = true;
               t_matches[j] = true;
               matches++;
               break;
           }
       }
       if (matches == 0) return 0;
       int k = 0;
       for (int i = 0; i < s_len; i++) {
           if (!s_matches[i]) continue;
           while (!t_matches[k]) k++;
           if (s.charAt(i) != t.charAt(k)) transpositions++;
           k++;
       }
       return (((double)matches / s_len) +
               ((double)matches / t_len) +
               (((double)matches - transpositions/2.0) / matches)) / 3.0;
   }
   public static void main(String[] args) {
       System.out.println(jaro(   "MARTHA",      "MARHTA"));
       System.out.println(jaro(    "DIXON",    "DICKSONX"));
       System.out.println(jaro("JELLYFISH",  "SMELLYFISH"));
   }

}</lang>

Output:
0.9444444444444445
0.7666666666666666
0.8962962962962964

PARI/GP

This version was translated from Java and Perl.

Works with: PARI/GP version 2.7.4 and above

<lang parigp> \\Jaro distance between 2 strings s1 and s2. \\ 4/12/16 aev jaroDist(s1,s2)={ my(vt1=Vecsmall(s1),vt2=Vecsmall(s2),n1=#s1,n2=#s2,d,

  md=max(n1,n2)\2-1,cs,ce,mc=0,tr=0,k=1,ds,
  s1m=vector(n1,z,0),s2m=vector(n2,z,0));

if(!n1||!n2, return(0)); for(i=1,n1,

 cs=max(1,i-md);
 ce=min(i+md+1,n2);
 for(j=cs,ce,
   if(s2m[j],next);
   if(vt1[i]!=vt2[j], next);
   mc++; s1m[i]=1; s2m[j]=1; break;
 );\\fend j

);\\fend i if(!mc, return(0)); for(i=1,n1,

 if(!s1m[i], next);
 while(!s2m[k], k++);
 if(vt1[i]!=vt2[k], tr++);
 k++

);\\fend i d=(mc/n1+mc/n2+(mc-tr/2)/mc)/3.0; ds=Strprintf("%.5f",d); print(" *** Jaro distance is: ",ds," for strings: ",s1,", ",s2); return(d); }

{ \\ Testing: jaroDist("MARTHA","MARHTA"); jaroDist("DIXON","DICKSONX"); jaroDist("JELLYFISH","SMELLYFISH"); jaroDist("DWAYNE","DUANE"); } </lang>

Output:
 *** Jaro distance is: 0.94444 for strings: MARTHA, MARHTA
 *** Jaro distance is: 0.76667 for strings: DIXON, DICKSONX
 *** Jaro distance is: 0.89630 for strings: JELLYFISH, SMELLYFISH
 *** Jaro distance is: 0.82222 for strings: DWAYNE, DUANE

Perl

<lang perl>use List::Util qw(min max);

sub jaro {

   my ($s, $t) = @_;
   my $s_len = length($s);
   my $t_len = length($t);
   return 1 if $s_len == 0 and $t_len == 0;
   my $match_distance = int(max($s_len, $t_len) / 2) - 1;
   my @s_matches;
   my @t_matches;
   my @s = split(//, $s);
   my @t = split(//, $t);
   my $matches = 0;
   foreach my $i (0 .. $#s) {
       my $start = max(0, $i - $match_distance);
       my $end = min($i + $match_distance + 1, $t_len);
       foreach my $j ($start .. $end - 1) {
           $t_matches[$j] and next;
           $s[$i] eq $t[$j] or next;
           $s_matches[$i] = 1;
           $t_matches[$j] = 1;
           $matches++;
           last;
       }
   }
   return 0 if $matches == 0;
   my $k              = 0;
   my $transpositions = 0;
   foreach my $i (0 .. $#s) {
       $s_matches[$i] or next;
       until ($t_matches[$k]) { ++$k }
       $s[$i] eq $t[$k] or ++$transpositions;
       ++$k;
   }
   (($matches / $s_len) + ($matches / $t_len) +
       (($matches - $transpositions / 2) / $matches)) / 3;

}

printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>

Output:
0.944444
0.766667
0.896296

Perl 6

Translation of: Perl

<lang perl6>sub jaro ($s, $t) {

   return 1 if $s eq $t;
   my $s_len = + my @s = $s.comb;
   my $t_len = + my @t = $t.comb;
   my $match_distance = ($s_len max $t_len) div 2 - 1;
   my @s_matches;
   my @t_matches;
   my $matches = 0;
   for ^@s -> $i {
       my $start = 0 max $i - $match_distance;
       my $end = $i + $match_distance min $t_len;
       for $start .. $end -> $j {
           @t_matches[$j] and next;
           @s[$i] eq @t[$j] or next;
           @s_matches[$i] = 1;
           @t_matches[$j] = 1;
           $matches++;
           last;
       }
   }
   return 0 if $matches == 0;
   my $k              = 0;
   my $transpositions = 0;
   for ^@s -> $i {
       @s_matches[$i] or next;
       until @t_matches[$k] { ++$k }
       @s[$i] eq @t[$k] or ++$transpositions;
       ++$k;
   }
   ($matches / $s_len + $matches / $t_len +
       (($matches - $transpositions / 2) / $matches)) / 3;

}

printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>

Output:
0.944444
0.766667
0.896296

Python

<lang python>from __future__ import division

def jaro(s, t):

   s_len = len(s)
   t_len = len(t)
   if s_len == 0 and t_len == 0:
       return 1
   match_distance = (max(s_len, t_len) // 2) - 1
   s_matches = [False] * s_len
   t_matches = [False] * t_len
   matches = 0
   transpositions = 0
   for i in range(s_len):
       start = max(0, i-match_distance)
       end = min(i+match_distance+1, t_len)
       for j in range(start, end):
           if t_matches[j]:
               continue
           if s[i] != t[j]:
               continue
           s_matches[i] = True
           t_matches[j] = True
           matches += 1
           break
   if matches == 0:
       return 0
   k = 0
   for i in range(s_len):
       if not s_matches[i]:
           continue
       while not t_matches[k]:
           k += 1
       if s[i] != t[k]:
           transpositions += 1
       k += 1
   return ((matches / s_len) +
           (matches / t_len) +
           ((matches - transpositions/2) / matches)) / 3

for s,t in [( 'MARTHA', 'MARHTA'),

           (    'DIXON',    'DICKSONX'),
           ('JELLYFISH',  'SMELLYFISH')]:
   print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444
jaro('DIXON', 'DICKSONX') = 0.7666666667
jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963

Racket

Translation of: C

(kinda)

Returns an exact value for the Jaro distance.

<lang racket>#lang racket/base

Translation of: C

(require data/bit-vector)

(define (jaro-distance str1 str2)

 (define str1-len (string-length str1))
 (define str2-len (string-length str2))
 (cond
   [(and (zero? str1-len) (zero? str2-len)) 0]
   [(or  (zero? str1-len) (zero? str2-len)) 1]
   [else     
    ;; vectors of bools that signify if that char in the matching string has a match
    (define str1-matches (make-bit-vector str1-len))
    (define str2-matches (make-bit-vector str2-len))
    (define matches
      ;; max distance between two chars to be considered matching
      (let ((match-distance (sub1 (quotient (max str1-len str2-len) 2))))
        (for/fold ((matches 0))
                  ((i (in-range 0 str1-len))
                   (c1 (in-string str1)))
          (define start (max 0 (- i match-distance)))
          (define end (min (+ i match-distance 1) str2-len))        
          (for/fold ((matches matches))
                    ((k (in-range start end))
                     (c2 (in-string str2 start))
                     #:unless (bit-vector-ref str2-matches k) ; if str2 already has a match continue
                     #:when (char=? c1 c2) ; if str1 and str2 are not
                     #:final #t)
            ;; otherwise assume there is a match
            (bit-vector-set! str1-matches i #t)
            (bit-vector-set! str2-matches k #t)
            (add1 matches)))))
    (cond
      [(zero? matches) 0]
      [else                                
       (define-values (transpositions*2 k+)
         (for/fold ((transpositions 0) (k 0))
                   ((i (in-range 0 str1-len))
                    (c1 (in-string str1))
                    (b1 (in-bit-vector str1-matches))
                    ;; if there are no matches in str1 continue
                    #:when b1)
           (define k+ (for/first ((k+ (in-range k str2-len))
                                  (b2 (in-bit-vector str2-matches k))
                                  #:when b2)
                        k+))        
           (values
            (+ transpositions (if (char=? c1 (string-ref str2 k+)) 0 1)) ; increment transpositions
            (add1 k+)))) ;; while there is no match in str2 increment k             
       
       ;; divide the number of transpositions by two as per the algorithm specs
       ;; this division is valid because the counted transpositions include both
       ;; instances of the transposed characters.
       (define transpositions (quotient transpositions*2 2))
       
       ;; return the Jaro distance
       (/ (+ (/ matches str1-len)
             (/ matches str2-len)
             (/ (- matches transpositions) matches))
          3)])]))

(module+ test

 (jaro-distance "MARTHA"    "MARHTA"); 0.944444 
 (exact->inexact (jaro-distance "MARTHA"    "MARHTA")); 0.944444
 (jaro-distance "DIXON"     "DICKSONX"); 0.766667
 (exact->inexact (jaro-distance "DIXON"     "DICKSONX")); 0.766667
 (jaro-distance "JELLYFISH" "SMELLYFISH"); 0.896296
 (exact->inexact (jaro-distance "JELLYFISH" "SMELLYFISH"))); 0.896296</lang>
Output:

The exact->inexact calls in the tests give an inexact, floating point version of the rational values.

17/18
0.9444444444444444
23/30
0.7666666666666667
121/135
0.8962962962962963

REXX

<lang rexx>/*REXX program computes the Jaro distance between two strings (or a list of strings).*/ @.= /*define a default for the @. array. */ parse arg @.1 /*obtain an optional character string. */ if @.1= then do /*nothing specified? Use the defaults.*/

               @.1='MARTHA    MARHTA'
               @.2='DIXON     DICKSONX'
               @.3='JELLYFISH SMELLYFISH'
               @.4='DWAYNE    DUANE'
               end
      do j=1  while @.j\==                    /*process all the strings in the list. */
      d=jaroDist(@.j)
      say 'Jaro distance is  '         format(d, , 5)        " for strings: "        @.j
      end   /*j*/                               /* └──── digits past the decimal point.*/

exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0

         if L1==0 | L2==0  then return 0        /*check if any string is a null string.*/
         f=max(L1,L2)%2 - 1                     /*calculate furthest distanced allowed.*/
         r.=0                                   /*the relative positions of the letters*/
              do k=1  for L1; _=substr(s.1,k,1)
              p=pos(_,s.2, max(1,k-f));   r.k=p /*see if the character is near enough. */
              if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */
                                     else r.k=0 /*       ··· otherwise, don't count it.*/
              end   /*k*/
         t=0
              do o=1  for L1;   v=o-1;   _=substr(s.1, o, 1)
              if pos(_, s.2)==0 | r.o==0 | r.v==0  then iterate
              if r.o<r.v  then t=t+1
              end   /*o*/                       /* [↑]  count number of transpositions.*/
         if m==0  then return 0
                       return (m/L1 + m/L2 + (m-t)/m) / 3</lang>

output   when using the default inputs:

Jaro distance is   0.94444  for strings:  MARTHA    MARHTA
Jaro distance is   0.76667  for strings:  DIXON     DICKSONX
Jaro distance is   0.89630  for strings:  JELLYFISH SMELLYFISH
Jaro distance is   0.82222  for strings:  DWAYNE    DUANE

Ruby

<lang ruby>def jaro(s, t)

   s_len = s.size
   t_len = t.size
   return 1 if s_len == 0 and t_len == 0
   match_distance = ([s_len, t_len].max / 2) - 1
   s_matches = []
   t_matches = []
   matches = 0.0
   transpositions = 0.0
   (0...s_len).each do |i|
       j_start = [0, i-match_distance].max
       j_end = [i+match_distance, t_len-1].min
       (j_start..j_end).each do |j|
           t_matches[j] && next
           s[i] == t[j] || next
           s_matches[i] = true
           t_matches[j] = true
           matches += 1.0
           break
       end
   end
   return 0.0 if matches == 0.0
   k = 0
   (0...s_len).each do |i|
       s_matches[i] || next
       until t_matches[k]
           k += 1
       end
       s[i] == t[k] || (transpositions += 1.0)
       k += 1
   end
   ((matches / s_len) +
     (matches / t_len) +
       ((matches - transpositions/2.0) / matches)) / 3.0

end

for s,t in %w(

      MARTHA      MARHTA
       DIXON    DICKSONX
   JELLYFISH  SMELLYFISH

).each_slice(2)

   puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"

end</lang>

Output:
jaro("MARTHA", "MARHTA") = 0.9444444444
jaro("DIXON", "DICKSONX") = 0.7666666667
jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963

Sidef

<lang ruby>func jaro(s, t) {

   return 1 if (s.is_empty && t.is_empty)
   var s_len = s.len
   var t_len = t.len
   var match_distance = (floor(max(s_len, t_len) / 2) - 1)
   var s_matches = []
   var t_matches = []
   var matches = 0
   var transpositions = 0
   for i in range(s_len) {
       var start = max(0, i-match_distance)
       var end = min(i+match_distance, t_len-1)
       for k in range(start, end) {
           t_matches[k] && next
           s[i] == t[k] || next
           s_matches[i] = true
           t_matches[k] = true
           matches++
           break
       }
   }
   return 0 if (matches == 0)
   var k = 0
   for i in range(s_len) {
       s_matches[i] || next
       while (!t_matches[k]) { ++k }
       s[i] == t[k] || ++transpositions
       ++k
   }
   ((matches / s_len) +
     (matches / t_len) +
       ((matches - transpositions/2) / matches)) / 3

}

for pair in [

   [%c"MARTHA",    %c"MARHTA"],
   [%c"DIXON",     %c"DICKSONX"],
   [%c"JELLYFISH", %c"SMELLYFISH"],

] {

   say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"

}</lang>

Output:
jaro("MARTHA", "MARHTA") = 0.9444444444
jaro("DIXON", "DICKSONX") = 0.7666666667
jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963

zkl

<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){

  cs:=Sink(String);
  foreach i,c in ([0..].zip(str1)){
     str2.find(c,(0).max(i - matchDistance),i + matchDistance) :
     if(Void!=_) cs.write(c);
  }
  cs.close()

}

fcn jaro(str1,str2){

  s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1;
  cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance); 
  matches:=cs12.len().toFloat();
  if(not matches) return(0.0);
  transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2;
  
  ( matches/s1Len + matches/s2Len +
     ((matches - transpositions)/matches) ) / 3.0

}</lang> <lang zkl>foreach s,t in (T(

    T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){
  println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));

}</lang>

Output:
jaro("MARTHA","MARHTA") = 0.9444444444
jaro("DIXON","DICKSONX") = 0.7666666667
jaro("JELLYFISH","SMELLYFISH") = 0.8962962963