Jaro similarity: Difference between revisions

From Rosetta Code
Content added Content deleted
(→‎{{header|Perl 6}}: Arggh. Replaced accidentally deleted content)
Line 234: Line 234:
</pre>
</pre>


=={{header|Perl}}==
<lang perl>use List::Util qw(min max);

sub jaro {
my ($s, $t) = @_;

my $s_len = length($s);
my $t_len = length($t);

return 1 if $s_len == 0 and $t_len == 0;

my $match_distance = int(max($s_len, $t_len) / 2) - 1;

my @s_matches;
my @t_matches;

my @s = split(//, $s);
my @t = split(//, $t);

my $matches = 0;
foreach my $i (0 .. $#s) {

my $start = max(0, $i - $match_distance);
my $end = min($i + $match_distance + 1, $t_len);

foreach my $j ($start .. $end - 1) {
$t_matches[$j] and next;
$s[$i] eq $t[$j] or next;
$s_matches[$i] = 1;
$t_matches[$j] = 1;
$matches++;
last;
}
}

return 0 if $matches == 0;

my $k = 0;
my $transpositions = 0;

foreach my $i (0 .. $#s) {
$s_matches[$i] or next;
until ($t_matches[$k]) { ++$k }
$s[$i] eq $t[$k] or ++$transpositions;
++$k;
}

(($matches / $s_len) + ($matches / $t_len) +
(($matches - $transpositions / 2) / $matches)) / 3;
}

printf("%f\n", jaro("MARTHA", "MARHTA"));
printf("%f\n", jaro("DIXON", "DICKSONX"));
printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
{{out}}
<pre>
0.944444
0.766667
0.896296
</pre>
=={{header|Perl 6}}==
=={{header|Perl 6}}==
{{trans|Perl}}
{{trans|Perl}}

Revision as of 17:30, 28 February 2016

Task
Jaro similarity
You are encouraged to solve this task according to the task description, using any language you may know.

The Jaro distance is a measure of similarity between two strings. The higher the Jaro distance for two strings is, the more similar the strings are. The score is normalized such that 0 equates to no similarity and 1 is an exact match.


Definition

The Jaro distance of two given strings and is

Where:

  • is the number of matching characters;
  • is half the number of transpositions.


Two characters from and respectively, are considered matching only if they are the same and not farther than .

Each character of is compared with all its matching characters in . The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.


Example

Given the strings DWAYNE and DUANE we find:


We find a Jaro score of:


Task

Implement the Jaro-distance algorithm and show the distances for each of the following pairs:

  • ("MARTHA", "MARHTA")
  • ("DIXON", "DICKSONX")
  • ("JELLYFISH", "SMELLYFISH")


See also


C

<lang C>#include <stdlib.h>

  1. include <string.h>
  2. include <ctype.h>
  3. include <stdio.h>
  1. define TRUE 1
  2. define FALSE 0
  1. define max(a, b) ((a) > (b) ? (a) : (b))
  2. define min(a, b) ((a) < (b) ? (a) : (b))

double jaro(const char *str1, const char *str2) {

   // length of the strings
   int str1_len = strlen(str1);
   int str2_len = strlen(str2);
   // if both strings are empty return 1
   // if only one of the strings is empty return 0
   if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
   // max distance between two chars to be considered matching
   // floor() is ommitted due to integer division rules
   int match_distance = (int) max(str1_len, str2_len)/2 - 1;
   // arrays of bools that signify if that char in the matching string has a match
   int *str1_matches = calloc(str1_len, sizeof(int));
   int *str2_matches = calloc(str2_len, sizeof(int));
   // number of matches and transpositions
   double matches = 0.0;
   double transpositions = 0.0;
   // find the matches
   for (int i = 0; i < str1_len; i++) {
       // start and end take into account the match distance
       int start = max(0, i - match_distance);
       int end = min(i + match_distance + 1, str2_len);
       for (int k = start; k < end; k++) {
           // if str2 already has a match continue
           if (str2_matches[k]) continue;
           // if str1 and str2 are not
           if (str1[i] != str2[k]) continue;
           // otherwise assume there is a match
           str1_matches[i] = TRUE;
           str2_matches[k] = TRUE;
           matches++;
           break;
       }
   }
   // if there are no matches return 0
   if (matches == 0) {
       free(str1_matches);
       free(str2_matches);
       return 0.0;
   }
   // count transpositions
   int k = 0;
   for (int i = 0; i < str1_len; i++) {
       // if there are no matches in str1 continue
       if (!str1_matches[i]) continue;
       // while there is no match in str2 increment k
       while (!str2_matches[k]) k++;
       // increment transpositions
       if (str1[i] != str2[k]) transpositions++;
       k++;
   }
   // divide the number of transpositions by two as per the algorithm specs
   // this division is valid because the counted transpositions include both
   // instances of the transposed characters.
   transpositions /= 2.0;
   // free the allocated memory
   free(str1_matches);
   free(str2_matches);
   // return the Jaro distance
   return ((matches / str1_len) +
       (matches / str2_len) +
       ((matches - transpositions) / matches)) / 3.0;

}

int main() {

   printf("%f\n", jaro("MARTHA",    "MARHTA"));
   printf("%f\n", jaro("DIXON",     "DICKSONX"));
   printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));

}</lang>

Output:
0.944444
0.766667
0.896296

J

Implementation:

<lang J>jaro=: dyad define

 d=. ((x >.&# y)%2)-1
 e=. (x =/y) * d >: |x -/&(i.@#) y
 xm=. (+./"1 e)#x
 ym=. (+./"2 e)#y
 m=. xm <.&# ym
 t=. (+/xm ~:&(m&{.) ym)%2
 s1=. #x
 s2=. #y
 ((m%s1)+(m%s2)+(m-t)%m)%3

)</lang>

Task examples:

<lang J> 'MARTHA' jaro 'MARHTA' 0.944444

  'DIXON' jaro 'DICKSONX'

0.766667

  'JELLYFISH' jaro 'SMELLYFISH'

0.896296</lang>

Java

<lang java>public class JaroDistance {

   public static double jaro(String s, String t) {
       int s_len = s.length();
       int t_len = t.length();
       if (s_len == 0 && t_len == 0) return 1;
       int match_distance = Integer.max(s_len, t_len) / 2 - 1;
       boolean[] s_matches = new boolean[s_len];
       boolean[] t_matches = new boolean[t_len];
       int matches = 0;
       int transpositions = 0;
       for (int i = 0; i < s_len; i++) {
           int start = Integer.max(0, i-match_distance);
           int end = Integer.min(i+match_distance+1, t_len);
           for (int j = start; j < end; j++) {
               if (t_matches[j]) continue;
               if (s.charAt(i) != t.charAt(j)) continue;
               s_matches[i] = true;
               t_matches[j] = true;
               matches++;
               break;
           }
       }
       if (matches == 0) return 0;
       int k = 0;
       for (int i = 0; i < s_len; i++) {
           if (!s_matches[i]) continue;
           while (!t_matches[k]) k++;
           if (s.charAt(i) != t.charAt(k)) transpositions++;
           k++;
       }
       return (((double)matches / s_len) +
               ((double)matches / t_len) +
               (((double)matches - transpositions/2.0) / matches)) / 3.0;
   }
   public static void main(String[] args) {
       System.out.println(jaro(   "MARTHA",      "MARHTA"));
       System.out.println(jaro(    "DIXON",    "DICKSONX"));
       System.out.println(jaro("JELLYFISH",  "SMELLYFISH"));
   }

}</lang>

Output:
0.9444444444444445
0.7666666666666666
0.8962962962962964

Perl

<lang perl>use List::Util qw(min max);

sub jaro {

   my ($s, $t) = @_;
   my $s_len = length($s);
   my $t_len = length($t);
   return 1 if $s_len == 0 and $t_len == 0;
   my $match_distance = int(max($s_len, $t_len) / 2) - 1;
   my @s_matches;
   my @t_matches;
   my @s = split(//, $s);
   my @t = split(//, $t);
   my $matches = 0;
   foreach my $i (0 .. $#s) {
       my $start = max(0, $i - $match_distance);
       my $end = min($i + $match_distance + 1, $t_len);
       foreach my $j ($start .. $end - 1) {
           $t_matches[$j] and next;
           $s[$i] eq $t[$j] or next;
           $s_matches[$i] = 1;
           $t_matches[$j] = 1;
           $matches++;
           last;
       }
   }
   return 0 if $matches == 0;
   my $k              = 0;
   my $transpositions = 0;
   foreach my $i (0 .. $#s) {
       $s_matches[$i] or next;
       until ($t_matches[$k]) { ++$k }
       $s[$i] eq $t[$k] or ++$transpositions;
       ++$k;
   }
   (($matches / $s_len) + ($matches / $t_len) +
       (($matches - $transpositions / 2) / $matches)) / 3;

}

printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>

Output:
0.944444
0.766667
0.896296

Perl 6

Translation of: Perl

<lang perl6>sub jaro ($s, $t) {

   return 1 if $s eq $t;
   my $s_len = + my @s = $s.comb;
   my $t_len = + my @t = $t.comb;
   my $match_distance = ($s_len max $t_len) div 2 - 1;
   my @s_matches;
   my @t_matches;
   my $matches = 0;
   for ^@s -> $i {
       my $start = 0 max $i - $match_distance;
       my $end = $i + $match_distance min $t_len;
       for $start .. $end -> $j {
           @t_matches[$j] and next;
           @s[$i] eq @t[$j] or next;
           @s_matches[$i] = 1;
           @t_matches[$j] = 1;
           $matches++;
           last;
       }
   }
   return 0 if $matches == 0;
   my $k              = 0;
   my $transpositions = 0;
   for ^@s -> $i {
       @s_matches[$i] or next;
       until @t_matches[$k] { ++$k }
       @s[$i] eq @t[$k] or ++$transpositions;
       ++$k;
   }
   ($matches / $s_len + $matches / $t_len +
       (($matches - $transpositions / 2) / $matches)) / 3;

}

printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>

Output:
0.944444
0.766667
0.896296

Python

<lang python>from __future__ import division

def jaro(s, t):

   s_len = len(s)
   t_len = len(t)
   if s_len == 0 and t_len == 0:
       return 1
   match_distance = (max(s_len, t_len) // 2) - 1
   s_matches = [False] * s_len
   t_matches = [False] * t_len
   matches = 0
   transpositions = 0
   for i in range(s_len):
       start = max(0, i-match_distance)
       end = min(i+match_distance+1, t_len)
       for j in range(start, end):
           if t_matches[j]:
               continue
           if s[i] != t[j]:
               continue
           s_matches[i] = True
           t_matches[j] = True
           matches += 1
           break
   if matches == 0:
       return 0
   k = 0
   for i in range(s_len):
       if not s_matches[i]:
           continue
       while not t_matches[k]:
           k += 1
       if s[i] != t[k]:
           transpositions += 1
       k += 1
   return ((matches / s_len) +
           (matches / t_len) +
           ((matches - transpositions/2) / matches)) / 3

for s,t in [( 'MARTHA', 'MARHTA'),

           (    'DIXON',    'DICKSONX'),
           ('JELLYFISH',  'SMELLYFISH')]:
   print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444
jaro('DIXON', 'DICKSONX') = 0.7666666667
jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963

Racket

Translation of: C

(kinda)

Returns an exact value for the Jaro distance.

<lang racket>#lang racket/base

Translation of: C

(require data/bit-vector)

(define (jaro-distance str1 str2)

 (define str1-len (string-length str1))
 (define str2-len (string-length str2))
 (cond
   [(and (zero? str1-len) (zero? str2-len)) 0]
   [(or  (zero? str1-len) (zero? str2-len)) 1]
   [else     
    ;; vectors of bools that signify if that char in the matching string has a match
    (define str1-matches (make-bit-vector str1-len))
    (define str2-matches (make-bit-vector str2-len))
    (define matches
      ;; max distance between two chars to be considered matching
      (let ((match-distance (sub1 (quotient (max str1-len str2-len) 2))))
        (for/fold ((matches 0))
                  ((i (in-range 0 str1-len))
                   (c1 (in-string str1)))
          (define start (max 0 (- i match-distance)))
          (define end (min (+ i match-distance 1) str2-len))        
          (for/fold ((matches matches))
                    ((k (in-range start end))
                     (c2 (in-string str2 start))
                     #:unless (bit-vector-ref str2-matches k) ; if str2 already has a match continue
                     #:when (char=? c1 c2) ; if str1 and str2 are not
                     #:final #t)
            ;; otherwise assume there is a match
            (bit-vector-set! str1-matches i #t)
            (bit-vector-set! str2-matches k #t)
            (add1 matches)))))
    (cond
      [(zero? matches) 0]
      [else                                
       (define-values (transpositions*2 k+)
         (for/fold ((transpositions 0) (k 0))
                   ((i (in-range 0 str1-len))
                    (c1 (in-string str1))
                    (b1 (in-bit-vector str1-matches))
                    ;; if there are no matches in str1 continue
                    #:when b1)
           (define k+ (for/first ((k+ (in-range k str2-len))
                                  (b2 (in-bit-vector str2-matches k))
                                  #:when b2)
                        k+))        
           (values
            (+ transpositions (if (char=? c1 (string-ref str2 k+)) 0 1)) ; increment transpositions
            (add1 k+)))) ;; while there is no match in str2 increment k             
       
       ;; divide the number of transpositions by two as per the algorithm specs
       ;; this division is valid because the counted transpositions include both
       ;; instances of the transposed characters.
       (define transpositions (quotient transpositions*2 2))
       
       ;; return the Jaro distance
       (/ (+ (/ matches str1-len)
             (/ matches str2-len)
             (/ (- matches transpositions) matches))
          3)])]))

(module+ test

 (jaro-distance "MARTHA"    "MARHTA"); 0.944444 
 (exact->inexact (jaro-distance "MARTHA"    "MARHTA")); 0.944444
 (jaro-distance "DIXON"     "DICKSONX"); 0.766667
 (exact->inexact (jaro-distance "DIXON"     "DICKSONX")); 0.766667
 (jaro-distance "JELLYFISH" "SMELLYFISH"); 0.896296
 (exact->inexact (jaro-distance "JELLYFISH" "SMELLYFISH"))); 0.896296</lang>
Output:

The exact->inexact calls in the tests give an inexact, floating point version of the rational values.

17/18
0.9444444444444444
23/30
0.7666666666666667
121/135
0.8962962962962963

REXX

<lang rexx>/*REXX pgm computes the Jaro distance between 2 strings (or a list of strings)*/ @.= /*define a default for the @. array. */ parse arg @.1 /*obtain an optional character string. */ if @.1= then do /*nothing specified? Use the defaults.*/

               @.1='MARTHA    MARHTA'
               @.2='DIXON     DICKSONX'
               @.3='JELLYFISH SMELLYFISH'
               @.4='DWAYNE    DUANE'
               end
      do j=1  while @.j\==          /*process all the strings in the list. */
      d=jaroDist(@.j)
      say 'Jaro distance is  '    format(d,,5)      " for strings: "      @.j
      end   /*j*/                     /*    └──── digits past decimal point. */

exit /*stick a fork in it, we're all done. */ /*────────────────────────────────────────────────────────────────────────────*/ jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0 if L1==0 | L2==0 then return 0 /*check if any string is a null string.*/ f=max(L1,L2)%2 - 1 /*calculate furthest distanced allowed.*/ r.=0 /*the relative positions of the letters*/

    do k=1  for L1; _=substr(s.1,k,1)
    p=pos(_,s.2, max(1,k-f));  r.k=p  /*see if the character is near enough. */
    if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */
                           else r.k=0 /*       ··· otherwise, don't count it.*/
    end   /*k*/

t=0

    do o=1  for L1;   v=o-1;   _=substr(s.1, o, 1)
    if pos(_, s.2)==0 | r.o==0 | r.v==0  then iterate
    if r.o<r.v  then t=t+1
    end   /*o*/                       /* [↑]  count number of transpositions.*/

if m==0 then return 0

             return (m/L1 + m/L2 + (m-t)/m) / 3</lang>

output   when using the default inputs:

Jaro distance is   0.94444  for strings:  MARTHA    MARHTA
Jaro distance is   0.76667  for strings:  DIXON     DICKSONX
Jaro distance is   0.89630  for strings:  JELLYFISH SMELLYFISH
Jaro distance is   0.82222  for strings:  DWAYNE    DUANE

Ruby

<lang ruby>def jaro(s, t)

   s_len = s.size
   t_len = t.size
   return 1 if s_len == 0 and t_len == 0
   match_distance = ([s_len, t_len].max / 2) - 1
   s_matches = []
   t_matches = []
   matches = 0.0
   transpositions = 0.0
   (0...s_len).each do |i|
       j_start = [0, i-match_distance].max
       j_end = [i+match_distance, t_len-1].min
       (j_start..j_end).each do |j|
           t_matches[j] && next
           s[i] == t[j] || next
           s_matches[i] = true
           t_matches[j] = true
           matches += 1.0
           break
       end
   end
   return 0.0 if matches == 0.0
   k = 0
   (0...s_len).each do |i|
       s_matches[i] || next
       until t_matches[k]
           k += 1
       end
       s[i] == t[k] || (transpositions += 1.0)
       k += 1
   end
   ((matches / s_len) +
     (matches / t_len) +
       ((matches - transpositions/2.0) / matches)) / 3.0

end

for s,t in %w(

      MARTHA      MARHTA
       DIXON    DICKSONX
   JELLYFISH  SMELLYFISH

).each_slice(2)

   puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"

end</lang>

Output:
jaro("MARTHA", "MARHTA") = 0.9444444444
jaro("DIXON", "DICKSONX") = 0.7666666667
jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963

Sidef

<lang ruby>func jaro(s, t) {

   return 1 if (s.is_empty && t.is_empty)
   var s_len = s.len
   var t_len = t.len
   var match_distance = (floor(max(s_len, t_len) / 2) - 1)
   var s_matches = []
   var t_matches = []
   var matches = 0
   var transpositions = 0
   for i in range(s_len) {
       var start = max(0, i-match_distance)
       var end = min(i+match_distance, t_len-1)
       for k in range(start, end) {
           t_matches[k] && next
           s[i] == t[k] || next
           s_matches[i] = true
           t_matches[k] = true
           matches++
           break
       }
   }
   return 0 if (matches == 0)
   var k = 0
   for i in range(s_len) {
       s_matches[i] || next
       while (!t_matches[k]) { ++k }
       s[i] == t[k] || ++transpositions
       ++k
   }
   ((matches / s_len) +
     (matches / t_len) +
       ((matches - transpositions/2) / matches)) / 3

}

for pair in [

   [%c"MARTHA",    %c"MARHTA"],
   [%c"DIXON",     %c"DICKSONX"],
   [%c"JELLYFISH", %c"SMELLYFISH"],

] {

   say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"

}</lang>

Output:
jaro("MARTHA", "MARHTA") = 0.9444444444
jaro("DIXON", "DICKSONX") = 0.7666666667
jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963

zkl

<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){

  cs:=Sink(String);
  foreach i,c in ([0..].zip(str1)){
     str2.find(c,(0).max(i - matchDistance),i + matchDistance) :
     if(Void!=_) cs.write(c);
  }
  cs.close()

}

fcn jaro(str1,str2){

  s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1;
  cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance); 
  matches:=cs12.len().toFloat();
  if(not matches) return(0.0);
  transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2;
  
  ( matches/s1Len + matches/s2Len +
     ((matches - transpositions)/matches) ) / 3.0

}</lang> <lang zkl>foreach s,t in (T(

    T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){
  println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));

}</lang>

Output:
jaro("MARTHA","MARHTA") = 0.9444444444
jaro("DIXON","DICKSONX") = 0.7666666667
jaro("JELLYFISH","SMELLYFISH") = 0.8962962963