Jaro similarity
The Jaro distance is a measure of similarity between two strings.
You are encouraged to solve this task according to the task description, using any language you may know.
The higher the Jaro distance for two strings is, the more similar the strings are.
The score is normalized such that 0 equates to no similarity and 1 is an exact match.
- Definition
The Jaro distance of two given strings and is
Where:
- is the number of matching characters;
- is half the number of transpositions.
Two characters from and respectively, are considered matching only if they are the same and not farther than .
Each character of is compared with all its matching characters in .
The number of matching (but different sequence order) characters divided by 2 defines the number of transpositions.
- Example
Given the strings DWAYNE and DUANE we find:
We find a Jaro score of:
- Task
Implement the Jaro-distance algorithm and show the distances for each of the following pairs:
- ("MARTHA", "MARHTA")
- ("DIXON", "DICKSONX")
- ("JELLYFISH", "SMELLYFISH")
- See also
- Jaro–Winkler distance on Wikipedia.
AWK
<lang AWK>
- syntax: GAWK -f JARO_DISTANCE.AWK
BEGIN {
main("DWAYNE","DUANE") main("MARTHA","MARHTA") main("DIXON","DICKSONX") main("JELLYFISH","SMELLYFISH") exit(0)
} function main(str1,str2) {
printf("%9.7f '%s' '%s'\n",jaro(str1,str2),str1,str2)
} function jaro(str1,str2, begin,end,i,j,k,leng1,leng2,match_distance,matches,str1_arr,str2_arr,transpositions) {
leng1 = length(str1) leng2 = length(str2) if (leng1 == 0 && leng2 == 0) { # both strings are empty return(1) } if (leng1 == 0 || leng2 == 0) { # only one string is empty return(0) } match_distance = int(max(leng1,leng2)/2-1) for (i=1; i<=leng1; i++) { # find matches begin = int(max(0,i-match_distance)) end = int(min(i+match_distance+1,leng2)) for (j=begin; j<=end; j++) { if (str2_arr[j]) { continue } if (substr(str1,i,1) != substr(str2,j,1)) { continue } str1_arr[i] = 1 str2_arr[j] = 1 matches++ break } } if (matches == 0) { return(0) } k = 0 for (i=1; i<=leng1; i++) { # count transpositions if (!str1_arr[i]) { continue } while (!str2_arr[k]) { k++ } if (substr(str1,i,1) != substr(str2,k,1)) { transpositions++ } k++ } transpositions /= 2 return((matches/leng1)+(matches/leng2)+((matches-transpositions)/matches))/3
} function max(x,y) { return((x > y) ? x : y) } function min(x,y) { return((x < y) ? x : y) } </lang>
- Output:
0.8222222 'DWAYNE' 'DUANE' 0.9444444 'MARTHA' 'MARHTA' 0.7666667 'DIXON' 'DICKSONX' 0.8962963 'JELLYFISH' 'SMELLYFISH'
C
<lang C>#include <stdlib.h>
- include <string.h>
- include <ctype.h>
- include <stdio.h>
- define TRUE 1
- define FALSE 0
- define max(a, b) ((a) > (b) ? (a) : (b))
- define min(a, b) ((a) < (b) ? (a) : (b))
double jaro(const char *str1, const char *str2) {
// length of the strings int str1_len = strlen(str1); int str2_len = strlen(str2);
// if both strings are empty return 1 // if only one of the strings is empty return 0 if (str1_len == 0) return str2_len == 0 ? 1.0 : 0.0;
// max distance between two chars to be considered matching // floor() is ommitted due to integer division rules int match_distance = (int) max(str1_len, str2_len)/2 - 1;
// arrays of bools that signify if that char in the matching string has a match int *str1_matches = calloc(str1_len, sizeof(int)); int *str2_matches = calloc(str2_len, sizeof(int));
// number of matches and transpositions double matches = 0.0; double transpositions = 0.0;
// find the matches for (int i = 0; i < str1_len; i++) { // start and end take into account the match distance int start = max(0, i - match_distance); int end = min(i + match_distance + 1, str2_len);
for (int k = start; k < end; k++) { // if str2 already has a match continue if (str2_matches[k]) continue; // if str1 and str2 are not if (str1[i] != str2[k]) continue; // otherwise assume there is a match str1_matches[i] = TRUE; str2_matches[k] = TRUE; matches++; break; } }
// if there are no matches return 0 if (matches == 0) { free(str1_matches); free(str2_matches); return 0.0; }
// count transpositions int k = 0; for (int i = 0; i < str1_len; i++) { // if there are no matches in str1 continue if (!str1_matches[i]) continue; // while there is no match in str2 increment k while (!str2_matches[k]) k++; // increment transpositions if (str1[i] != str2[k]) transpositions++; k++; }
// divide the number of transpositions by two as per the algorithm specs // this division is valid because the counted transpositions include both // instances of the transposed characters. transpositions /= 2.0;
// free the allocated memory free(str1_matches); free(str2_matches);
// return the Jaro distance return ((matches / str1_len) + (matches / str2_len) + ((matches - transpositions) / matches)) / 3.0;
}
int main() {
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
- Output:
0.944444 0.766667 0.896296
C++
<lang cpp>#include <algorithm>
- include <iostream>
- include <string>
double jaro(const std::string s1, const std::string s2) {
const uint l1 = s1.length(), l2 = s2.length(); if (l1 == 0) return l2 == 0 ? 1.0 : 0.0; const uint match_distance = std::max(l1, l2) / 2 - 1; bool s1_matches[l1]; bool s2_matches[l2]; std::fill(s1_matches, s1_matches + l1, false); std::fill(s2_matches, s2_matches + l2, false); uint matches = 0; for (uint i = 0; i < l1; i++) { const int end = std::min(i + match_distance + 1, l2); for (int k = std::max(0u, i - match_distance); k < end; k++) if (!s2_matches[k] && s1[i] == s2[k]) { s1_matches[i] = true; s2_matches[k] = true; matches++; break; } } if (matches == 0) return 0.0; double t = 0.0; uint k = 0; for (uint i = 0; i < l1; i++) if (s1_matches[i]) { while (!s2_matches[k]) k++; if (s1[i] != s2[k]) t += 0.5; k++; }
const double m = matches; return (m / l1 + m / l2 + (m - t) / m) / 3.0;
}
int main() {
using namespace std; cout << jaro("MARTHA", "MARHTA") << endl; cout << jaro("DIXON", "DICKSONX") << endl; cout << jaro("JELLYFISH", "SMELLYFISH") << endl; return 0;
}</lang>
Clojure
<lang clojure> (ns test-project-intellij.core
(:gen-class))
(defn find-matches [s t]
" find match locations in the two strings " " s_matches is set to true wherever there is a match in t and t_matches is set conversely " (let [s_len (count s) t_len (count t) match_distance (int (- (/ (max s_len t_len) 2) 1)) matches 0 transpositions 0 fn-start (fn [i] (max 0 (- i match_distance))) ; function to compute starting position fn-end (fn [i] (min (+ i match_distance 1) (- t_len 1))) ] ; function to compute end position (loop [i 0 start (fn-start i) end (fn-end i) k start s_matches (vec (repeat (count s) false)) t_matches (vec (repeat (count t) false)) matches 0]
(if (< i s_len)
(if (<= k end)
(if (get t_matches k) ; continue with next k (recur i start end (inc k) s_matches t_matches matches)
(if (= (get s i) (get t k)) ; match a position, so update matches, s_matches, t_matches to reflect match (recur (inc i) (fn-start (inc i)) (fn-end (inc i)) (fn-start (inc i)) (assoc s_matches i true) (assoc t_matches k true) (inc matches)) ; no match so try next k (recur i start end (inc k) s_matches t_matches matches)))
; End of k iterations, so increment i and set k to start based upon i (recur (inc i) (fn-start (inc i)) (fn-end (inc i)) (fn-start (inc i)) s_matches t_matches matches))
; End of i iterations [matches s_matches t_matches]))))
(defn count-transpositions [s t s_matches t_matches]
" Utility function to count the number of transpositions " (let [s_len (count s)] (loop [i 0 k 0 transpositions 0]
(if (< i s_len) ; still elements in s (since i < s_len) (if (not (get s_matches i nil)) ; skip to next i since there are no matches in s (recur (inc i) k transpositions) ; checking for match in t (if (not (get t_matches k nil)) ; keeping looping around as long as there are no matches in t (recur i (inc k) transpositions) (if (not= (get s i) (get t k)) ; increment transposition count (if strings don't equal at match location) (recur (inc i) (inc k) (inc transpositions)) ; was a match, so advance i and k without increasing transpositions count (recur (inc i) (inc k) transpositions)))) ; Return count transpositions))))
(defn jaro [s t]
" Main Jaro Distance routine" (if (= s t) 1 (let [[matches s_matches t_matches] (find-matches s t)] (if (= 0 matches) 0 (let [s_len (count s) t_len (count t) transpositions (count-transpositions s t s_matches t_matches)] (float (/ (+ (/ matches s_len) (/ matches t_len) (/ (- matches (/ transpositions 2)) matches)) 3)))))))
(println (jaro "MARTHA" "MARHTA"))
(println (jaro "DIXON" "DICKSONX"))
(println (jaro "JELLYFISH" "SMELLYFISH"))
</lang>
- Output:
0.9444444 0.76666665 0.8962963
CoffeeScript
<lang coffeescript>jaro = (s1, s2) ->
l1 = s1.length l2 = s2.length if l1 == 0 then return if l2 == 0 then 1.0 else 0.0 match_distance = Math.max(l1, l2) / 2 - 1 s1_matches = [] s2_matches = [] m = 0 for i in [0...l1] end = Math.min(i + match_distance + 1, l2) for k in [Math.max(0, i - match_distance)...end] if !s2_matches[k] and s1[i] == s2[k] s1_matches[i] = true s2_matches[k] = true m++ break if m == 0 0.0 else t = 0.0 k = 0 for i in [0...l1] if s1_matches[i] until s2_matches[k] then k++ if s1[i] != s2[k++] then t += 0.5 (m / l1 + m / l2 + (m - t) / m) / 3.0
console.log jaro "MARTHA", "MARHTA" console.log jaro "DIXON", "DICKSONX" console.log jaro "JELLYFISH", "SMELLYFISH"</lang>
- Output:
0.9444444444444445 0.7666666666666666 0.8962962962962964
D
<lang D>auto jaro(in string s1, in string s2) {
int s1_len = cast(int) s1.length; int s2_len = cast(int) s2.length; if (s1_len == 0 && s2_len == 0) return 1;
import std.algorithm.comparison: min, max; auto match_distance = max(s1_len, s2_len) / 2 - 1; auto s1_matches = new bool[s1_len]; auto s2_matches = new bool[s2_len]; int matches = 0; for (auto i = 0; i < s1_len; i++) { auto start = max(0, i - match_distance); auto end = min(i + match_distance + 1, s2_len); for (auto j = start; j < end; j++) if (!s2_matches[j] && s1[i] == s2[j]) { s1_matches[i] = true; s2_matches[j] = true; matches++; break; } } if (matches == 0) return 0;
auto t = 0.0; auto k = 0; for (auto i = 0; i < s1_len; i++) if (s1_matches[i]) { while (!s2_matches[k]) k++; if (s1[i] != s2[k++]) t += 0.5; } double m = matches; return (m / s1_len + m / s2_len + (m - t) / m) / 3.0;
}
void main() {
import std.stdio: writeln; writeln(jaro( "MARTHA", "MARHTA")); writeln(jaro( "DIXON", "DICKSONX")); writeln(jaro("JELLYFISH", "SMELLYFISH"));
}</lang>
0.944444 0.766667 0.896296
Elixir
<lang elixir>defmodule Jaro do
def distance(s, t) when is_binary(s) and is_binary(t), do: distance(to_charlist(s), to_charlist(t)) def distance(x, x), do: 1.0 def distance(s, t) do s_len = length(s) t_len = length(t) {s_matches, t_matches, matches} = matching(s, t, s_len, t_len) if matches == 0 do 0.0 else {k, transpositions} = transposition(s, t, s_matches, t_matches) ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3 end end defp matching(s, t, s_len, t_len) do match_distance = div(max(s_len, t_len), 2) - 1 ac0 = {List.duplicate(false, s_len), List.duplicate(false, t_len), 0} Enum.reduce(0..s_len-1, ac0, fn i,acc -> j_start = max(0, i-match_distance) j_end = min(i+match_distance, t_len-1) Enum.reduce_while(j_start..j_end, acc, fn j,{sm,tm,m} -> if Enum.at(tm, j) or Enum.at(s, i) != Enum.at(t, j) do {:cont, {sm, tm, m}} else {:halt, { List.replace_at(sm, i, true), List.replace_at(tm, j, true), m + 1 }} end end) end) end defp transposition(s, t, s_matches, t_matches) do Enum.reduce(0..length(s)-1, {0,0}, fn i,{k,transpositions} -> if Enum.at(s_matches, i) do k = k + (Enum.drop(t_matches, k) |> Enum.take_while(fn matche -> not matche end) |> length) if Enum.at(s, i) == Enum.at(t, k), do: {k+1, transpositions}, else: {k+1, transpositions+1} else {k, transpositions} end end) end
end
~w( MARTHA MARHTA
DIXON DICKSONX JELLYFISH SMELLYFISH )c
|> Enum.chunk(2) |> Enum.each(fn [s,t] ->
:io.format "jaro(~s, ~s) = ~.10f~n", [inspect(s), inspect(t), Jaro.distance(s, t)] end)</lang>
- Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444 jaro('DIXON', 'DICKSONX') = 0.7666666667 jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963
Elixir has a built-in function (String.jaro_distance
).
FreeBASIC
<lang freebasic>' version 09-10-2016 ' compile with: fbc -s console
- Macro max(x, y)
IIf((x) > (y), (x), (y))
- EndMacro
- Macro min(x, y)
IIf((x) < (y), (x), (y))
- EndMacro
Function jaro(word1 As String, word2 As String) As Double
If Len(word1) > Len(word2) Then Swap word1, word2
Dim As Long i, j, j1, m, t Dim As Long s1 = Len(word1) Dim As Long s2 = Len(word2) Dim As Long max_dist = s2 \ 2 -1 ' integer division
For i = 0 To s1 -1 If word1[i] = word2[j] Then m = m +1 word2[j] = 32 Else For j1 = max(0, i - max_dist) To min(s2 -1, i + max_dist) If word1[i] = word2[j1] Then t = t +1 m = m +1 word2[j1] = 32 If j1 > j Then j = j1 End If Next End If j = j + 1 Next If m = 0 Then Return 0
t = t \ 2 Return (m / s1 + m / s2 + ((m - t) / m)) / 3
End Function
' ------=< MAIN >=------
Print Print " jaro (MARTHA, MARHTA) ="; jaro("MARTHA", "MARHTA") Print " jaro (DIXON, DICKSONX) ="; jaro("DIXON", "DICKSONX") Print " jaro (JELLYFISH, SMELLYFISH) ="; jaro("JELLYFISH", "SMELLYFISH")
' empty keyboard buffer
While Inkey <> "" : Wend
Print : Print "hit any key to end program"
Sleep
End</lang>
- Output:
jaro (MARTHA, MARHTA) = 0.9444444444444444 jaro (DIXON, DICKSONX) = 0.7666666666666667 jaro (JELLYFISH, SMELLYFISH) = 0.8962962962962963
Go
<lang go>package main
import "fmt"
func jaro(str1, str2 string) float64 {
if len(str1) == 0 && len(str2) == 0 { return 1 } if len(str1) == 0 || len(str2) == 0 { return 0 } match_distance := len(str1) if len(str2) > match_distance { match_distance = len(str2) } match_distance = match_distance/2 - 1 str1_matches := make([]bool, len(str1)) str2_matches := make([]bool, len(str2)) matches := 0. transpositions := 0. for i := range str1 { start := i - match_distance if start < 0 { start = 0 } end := i + match_distance + 1 if end > len(str2) { end = len(str2) } for k := start; k < end; k++ { if str2_matches[k] { continue } if str1[i] != str2[k] { continue } str1_matches[i] = true str2_matches[k] = true matches++ break } } if matches == 0 { return 0 } k := 0 for i := range str1 { if !str1_matches[i] { continue } for !str2_matches[k] { k++ } if str1[i] != str2[k] { transpositions++ } k++ } transpositions /= 2 return (matches/float64(len(str1)) + matches/float64(len(str2)) + (matches-transpositions)/matches) / 3
}
func main() {
fmt.Printf("%f\n", jaro("MARTHA", "MARHTA")) fmt.Printf("%f\n", jaro("DIXON", "DICKSONX")) fmt.Printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"))
}</lang>
- Output:
0.944444 0.766667 0.896296
Haskell
<lang Haskell>import Data.Ord (comparing) import Text.Printf (printf) import Data.List (sortBy, elemIndex, intercalate)
jaro :: String -> String -> Float jaro x y =
let f = (fromIntegral . length) [m, t] = [f, fromIntegral . transpositions] <*> [matches x y] [s1, s2] = [f] <*> [x, y] in if m == 0 then 0 else (1 / 3) * ((m / s1) + (m / s2) + ((m - t) / m))
matches :: String -> String -> [(Int, Char)] matches s1 s2 =
let [(l1, xs), (l2, ys)] = sortBy (comparing fst) ((length >>= (,)) <$> [s1, s2]) r = quot l2 2 - 1 in foldr (\(c, n) a -> let offset = max 0 (n - (r + 1)) -- initial chars out of range ? -- Any index for this char within range ? in case elemIndex c (drop offset (take (n + r) ys)) of Just i -> (offset + i, c) : a Nothing -> a) [] (zip xs [1 ..])
transpositions :: [(Int, Char)] -> Int transpositions =
let f (x, y) = if fst x > fst y then succ else id in foldr f 0 . (zip <*> tail)
-- TEST ---------------------------------------------------------------------- main :: IO () main =
mapM_ putStrLn $ (\(s1, s2) -> intercalate " -> " [s1, s2, printf "%.3f\n" $ jaro s1 s2]) <$> [ ("DWAYNE", "DUANE") , ("MARTHA", "MARHTA") , ("DIXON", "DICKSONX") , ("JELLYFISH", "SMELLYFISH") ]</lang>
- Output:
DWAYNE -> DUANE -> 0.822 MARTHA -> MARHTA -> 0.944 DIXON -> DICKSONX -> 0.767 JELLYFISH -> SMELLYFISH -> 0.896
Haxe
<lang Haxe>class Jaro {
private static function jaro(s1: String, s2: String): Float { var s1_len = s1.length; var s2_len = s2.length; if (s1_len == 0 && s2_len == 0) return 1; var match_distance = Std.int(Math.max(s1_len, s2_len)) / 2 - 1; var matches = { s1: [for(n in 0...s1_len) false], s2: [for(n in 0...s2_len) false] }; var m = 0; for (i in 0...s1_len) { var start = Std.int(Math.max(0, i - match_distance)); var end = Std.int(Math.min(i + match_distance + 1, s2_len)); for (j in start...end) if (!matches.s2[j] && s1.charAt(i) == s2.charAt(j)) {
matches.s1[i] = true; matches.s2[j] = true; m++; break;
} } if (m == 0) return 0; var k = 0; var t = 0.; for (i in 0...s1_len) if (matches.s1[i]) { while (!matches.s2[k]) k++; if (s1.charAt(i) != s2.charAt(k++)) t += 0.5; } return (m / s1_len + m / s2_len + (m - t) / m) / 3.0; } public static function main() { Sys.println(jaro( "MARTHA", "MARHTA")); Sys.println(jaro( "DIXON", "DICKSONX")); Sys.println(jaro("JELLYFISH", "SMELLYFISH")); }
}</lang>
- Output:
0.944444444444445 0.766666666666667 0.896296296296296
J
Implementation:
<lang J>jaro=: dyad define
d=. ((x >.&# y)%2)-1 e=. (x =/y) * d >: |x -/&(i.@#) y xm=. (+./"1 e)#x ym=. (+./"2 e)#y m=. xm <.&# ym t=. (+/xm ~:&(m&{.) ym)%2 s1=. #x s2=. #y ((m%s1)+(m%s2)+(m-t)%m)%3
)</lang>
Task examples:
<lang J> 'MARTHA' jaro 'MARHTA' 0.944444
'DIXON' jaro 'DICKSONX'
0.766667
'JELLYFISH' jaro 'SMELLYFISH'
0.896296</lang>
Java
<lang java>public class JaroDistance {
public static double jaro(String s, String t) { int s_len = s.length(); int t_len = t.length();
if (s_len == 0 && t_len == 0) return 1;
int match_distance = Integer.max(s_len, t_len) / 2 - 1;
boolean[] s_matches = new boolean[s_len]; boolean[] t_matches = new boolean[t_len];
int matches = 0; int transpositions = 0;
for (int i = 0; i < s_len; i++) { int start = Integer.max(0, i-match_distance); int end = Integer.min(i+match_distance+1, t_len);
for (int j = start; j < end; j++) { if (t_matches[j]) continue; if (s.charAt(i) != t.charAt(j)) continue; s_matches[i] = true; t_matches[j] = true; matches++; break; } }
if (matches == 0) return 0;
int k = 0; for (int i = 0; i < s_len; i++) { if (!s_matches[i]) continue; while (!t_matches[k]) k++; if (s.charAt(i) != t.charAt(k)) transpositions++; k++; }
return (((double)matches / s_len) + ((double)matches / t_len) + (((double)matches - transpositions/2.0) / matches)) / 3.0; }
public static void main(String[] args) { System.out.println(jaro( "MARTHA", "MARHTA")); System.out.println(jaro( "DIXON", "DICKSONX")); System.out.println(jaro("JELLYFISH", "SMELLYFISH")); }
}</lang>
- Output:
0.9444444444444445 0.7666666666666666 0.8962962962962964
jq
def jaro(s1; s2): def when(p; q): if p then q else . end; (s1|length) as $len1 | (s2|length) as $len2 | (( [$len1, $len2] | max ) / 2 - 1) as $match_standard | {m:0, p:0} | reduce range(0; $len1) as $l1 (.; s1[$l1:$l1+1] as $t1 | reduce range(0; $len2) as $l2 (.; s2[$l2:$l2+1] as $t2 | when( $t1 == $t2; when( ($l2-$l1) <= $match_standard and ($l1-$l2) <= $match_standard; .m+=1) | when($l2 == $l1; .p += 1) ) ) ) | ((.m-.p)/2) as $t | ( (.m/$len1) + (.m/$len2) + ((.m-$t)/.m) ) / 3 ; jaro("MARTHA";"MARHTA") , jaro("DIXON"; "DICKSONX") , jaro("JELLYFISH";"SMELLYFISH")
Output:
0.9444444444444444 0.6833333333333332 0.8870370370370371
Julia
function jaro(s1,s2) s1::AbstractString s2::AbstractString m=0 t=0 p=0 l1=0 l2=0 match_standard=((max(length(s1),length(s2)))/2)-1 for i in s1[1:end] l1+=1 l2=0 for j in s2[1:end] l2+=1 if i==j if l2-l1<=match_standard && l1-l2<=match_standard m+=1 end if l2==l1 p+=1 end end end end t=(m-p)/2 d=1/3*((m/length(s1))+(m/length(s2))+((m-t)/m)) println(d) end
"MARTHA","MARHTA" : 0.9444444444444444444 "DIXON","DICKSONX" : 0.6833333333333333333 "JELLYFISH","SMELLYFISH" : 0.8870370370370
Kotlin
<lang scala>object Jaro {
fun distance(s1: String, s2: String): Double { val s1_len = s1.length val s2_len = s2.length if (s1_len == 0 && s2_len == 0) return 1.0 val match_distance = Math.max(s1_len, s2_len) / 2 - 1 val s1_matches = BooleanArray(s1_len) val s2_matches = BooleanArray(s2_len) var matches = 0 for (i in 0..s1_len - 1) { val start = Math.max(0, i - match_distance) val end = Math.min(i + match_distance + 1, s2_len) (start..end - 1).find { j -> !s2_matches[j] && s1[i] == s2[j] } ?. let { s1_matches[i] = true s2_matches[it] = true matches++ } } if (matches == 0) return 0.0 var t = 0.0 var k = 0 (0..s1_len - 1).filter { s1_matches[it] }.forEach { i -> while (!s2_matches[k]) k++ if (s1[i] != s2[k]) t += 0.5 k++ }
val m = matches.toDouble() return (m / s1_len + m / s2_len + (m - t) / m) / 3.0 }
}
fun main(args: Array<String>) {
println(Jaro.distance("MARTHA", "MARHTA")) println(Jaro.distance("DIXON", "DICKSONX")) println(Jaro.distance("JELLYFISH", "SMELLYFISH"))
}</lang>
PARI/GP
This version was translated from Java and Perl.
<lang parigp> \\Jaro distance between 2 strings s1 and s2. \\ 4/12/16 aev jaroDist(s1,s2)={ my(vt1=Vecsmall(s1),vt2=Vecsmall(s2),n1=#s1,n2=#s2,d,
md=max(n1,n2)\2-1,cs,ce,mc=0,tr=0,k=1,ds, s1m=vector(n1,z,0),s2m=vector(n2,z,0));
if(!n1||!n2, return(0)); for(i=1,n1,
cs=max(1,i-md); ce=min(i+md+1,n2); for(j=cs,ce, if(s2m[j],next); if(vt1[i]!=vt2[j], next); mc++; s1m[i]=1; s2m[j]=1; break; );\\fend j
);\\fend i if(!mc, return(0)); for(i=1,n1,
if(!s1m[i], next); while(!s2m[k], k++); if(vt1[i]!=vt2[k], tr++); k++
);\\fend i d=(mc/n1+mc/n2+(mc-tr/2)/mc)/3.0; ds=Strprintf("%.5f",d); print(" *** Jaro distance is: ",ds," for strings: ",s1,", ",s2); return(d); }
{ \\ Testing: jaroDist("MARTHA","MARHTA"); jaroDist("DIXON","DICKSONX"); jaroDist("JELLYFISH","SMELLYFISH"); jaroDist("DWAYNE","DUANE"); } </lang>
- Output:
*** Jaro distance is: 0.94444 for strings: MARTHA, MARHTA *** Jaro distance is: 0.76667 for strings: DIXON, DICKSONX *** Jaro distance is: 0.89630 for strings: JELLYFISH, SMELLYFISH *** Jaro distance is: 0.82222 for strings: DWAYNE, DUANE
Perl
<lang perl>use List::Util qw(min max);
sub jaro {
my ($s, $t) = @_;
my $s_len = length($s); my $t_len = length($t);
return 1 if $s_len == 0 and $t_len == 0;
my $match_distance = int(max($s_len, $t_len) / 2) - 1;
my @s_matches; my @t_matches;
my @s = split(//, $s); my @t = split(//, $t);
my $matches = 0; foreach my $i (0 .. $#s) {
my $start = max(0, $i - $match_distance); my $end = min($i + $match_distance + 1, $t_len);
foreach my $j ($start .. $end - 1) { $t_matches[$j] and next; $s[$i] eq $t[$j] or next; $s_matches[$i] = 1; $t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
foreach my $i (0 .. $#s) { $s_matches[$i] or next; until ($t_matches[$k]) { ++$k } $s[$i] eq $t[$k] or ++$transpositions; ++$k; }
(($matches / $s_len) + ($matches / $t_len) + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Perl 6
<lang perl6>sub jaro ($s, $t) {
return 1 if $s eq $t;
my $s_len = + my @s = $s.comb; my $t_len = + my @t = $t.comb;
my $match_distance = ($s_len max $t_len) div 2 - 1;
my @s_matches; my @t_matches; my $matches = 0;
for ^@s -> $i {
my $start = 0 max $i - $match_distance; my $end = $i + $match_distance min $t_len;
for $start .. $end -> $j { @t_matches[$j] and next; @s[$i] eq @t[$j] or next; @s_matches[$i] = 1; @t_matches[$j] = 1; $matches++; last; } }
return 0 if $matches == 0;
my $k = 0; my $transpositions = 0;
for ^@s -> $i { @s_matches[$i] or next; until @t_matches[$k] { ++$k } @s[$i] eq @t[$k] or ++$transpositions; ++$k; }
($matches / $s_len + $matches / $t_len + (($matches - $transpositions / 2) / $matches)) / 3;
}
printf("%f\n", jaro("MARTHA", "MARHTA")); printf("%f\n", jaro("DIXON", "DICKSONX")); printf("%f\n", jaro("JELLYFISH", "SMELLYFISH"));</lang>
- Output:
0.944444 0.766667 0.896296
Python
<lang python>from __future__ import division
def jaro(s, t):
s_len = len(s) t_len = len(t)
if s_len == 0 and t_len == 0: return 1
match_distance = (max(s_len, t_len) // 2) - 1
s_matches = [False] * s_len t_matches = [False] * t_len
matches = 0 transpositions = 0
for i in range(s_len): start = max(0, i-match_distance) end = min(i+match_distance+1, t_len)
for j in range(start, end): if t_matches[j]: continue if s[i] != t[j]: continue s_matches[i] = True t_matches[j] = True matches += 1 break
if matches == 0: return 0
k = 0 for i in range(s_len): if not s_matches[i]: continue while not t_matches[k]: k += 1 if s[i] != t[k]: transpositions += 1 k += 1
return ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
for s,t in [( 'MARTHA', 'MARHTA'),
( 'DIXON', 'DICKSONX'), ('JELLYFISH', 'SMELLYFISH')]: print("jaro(%r, %r) = %.10f" % (s, t, jaro(s, t)))</lang>
- Output:
jaro('MARTHA', 'MARHTA') = 0.9444444444 jaro('DIXON', 'DICKSONX') = 0.7666666667 jaro('JELLYFISH', 'SMELLYFISH') = 0.8962962963
Racket
(kinda)
Returns an exact value for the Jaro distance.
<lang racket>#lang racket/base
(require data/bit-vector)
(define (jaro-distance str1 str2)
(define str1-len (string-length str1)) (define str2-len (string-length str2)) (cond [(and (zero? str1-len) (zero? str2-len)) 0] [(or (zero? str1-len) (zero? str2-len)) 1] [else ;; vectors of bools that signify if that char in the matching string has a match (define str1-matches (make-bit-vector str1-len)) (define str2-matches (make-bit-vector str2-len)) (define matches ;; max distance between two chars to be considered matching (let ((match-distance (sub1 (quotient (max str1-len str2-len) 2)))) (for/fold ((matches 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1))) (define start (max 0 (- i match-distance))) (define end (min (+ i match-distance 1) str2-len)) (for/fold ((matches matches)) ((k (in-range start end)) (c2 (in-string str2 start)) #:unless (bit-vector-ref str2-matches k) ; if str2 already has a match continue #:when (char=? c1 c2) ; if str1 and str2 are not #:final #t) ;; otherwise assume there is a match (bit-vector-set! str1-matches i #t) (bit-vector-set! str2-matches k #t) (add1 matches))))) (cond [(zero? matches) 0] [else (define-values (transpositions*2 k+) (for/fold ((transpositions 0) (k 0)) ((i (in-range 0 str1-len)) (c1 (in-string str1)) (b1 (in-bit-vector str1-matches)) ;; if there are no matches in str1 continue #:when b1) (define k+ (for/first ((k+ (in-range k str2-len)) (b2 (in-bit-vector str2-matches k)) #:when b2) k+)) (values (+ transpositions (if (char=? c1 (string-ref str2 k+)) 0 1)) ; increment transpositions (add1 k+)))) ;; while there is no match in str2 increment k ;; divide the number of transpositions by two as per the algorithm specs ;; this division is valid because the counted transpositions include both ;; instances of the transposed characters. (define transpositions (quotient transpositions*2 2)) ;; return the Jaro distance (/ (+ (/ matches str1-len) (/ matches str2-len) (/ (- matches transpositions) matches)) 3)])]))
(module+ test
(jaro-distance "MARTHA" "MARHTA"); 0.944444 (exact->inexact (jaro-distance "MARTHA" "MARHTA")); 0.944444 (jaro-distance "DIXON" "DICKSONX"); 0.766667 (exact->inexact (jaro-distance "DIXON" "DICKSONX")); 0.766667 (jaro-distance "JELLYFISH" "SMELLYFISH"); 0.896296 (exact->inexact (jaro-distance "JELLYFISH" "SMELLYFISH"))); 0.896296</lang>
- Output:
The exact->inexact
calls in the tests give an inexact, floating point version of the rational values.
17/18 0.9444444444444444 23/30 0.7666666666666667 121/135 0.8962962962962963
REXX
<lang rexx>/*REXX program computes the Jaro distance between two strings (or a list of strings).*/ @.= /*define a default for the @. array. */ parse arg @.1 /*obtain an optional character string. */ if @.1= then do /*nothing specified? Use the defaults.*/
@.1= 'MARTHA MARHTA' @.2= 'DIXON DICKSONX' @.3= 'JELLYFISH SMELLYFISH' @.4= 'DWAYNE DUANE' end
do j=1 while @.j\== /*process all the strings in the list. */ d=jaroDist(@.j) say 'Jaro distance is ' format(d, , 5) " for strings: " @.j end /*j*/ /* └──── digits past the decimal point.*/
exit /*stick a fork in it, we're all done. */ /*──────────────────────────────────────────────────────────────────────────────────────*/ jaroDist: procedure; arg s.1 s.2 .; L1=length(s.1); L2=length(s.2); m=0
if L1==0 | L2==0 then return 0 /*check if any string is a null string.*/ f=max(L1,L2)%2 - 1 /*calculate furthest distanced allowed.*/ r.=0 /* [↓] see if the char is near enough.*/ do k=1 for L1; p=pos(substr(s.1, k, 1), s.2, max(1, k-f)); r.k=p if p\==0 & abs(p-k)<=f then m=m+1 /*if near enough, count it as a match. */ else r.k=0 /* ··· otherwise, don't count it.*/ end /*k*/ t=0 do o=1 for L1; om=o-1 if pos( substr(s.1, o, 1), s.2)==0 | r.o==0 | r.om==0 then iterate if r.o<r.om then t=t+1 end /*o*/ /* [↑] count number of transpositions.*/
if m==0 then return 0 return (m/L1 + m/L2 + (m-t)/m) / 3</lang>
output when using the default inputs:
Jaro distance is 0.94444 for strings: MARTHA MARHTA Jaro distance is 0.76667 for strings: DIXON DICKSONX Jaro distance is 0.89630 for strings: JELLYFISH SMELLYFISH Jaro distance is 0.82222 for strings: DWAYNE DUANE
Ruby
<lang ruby>def jaro(s, t)
return 1.0 if s == t s_len = s.size t_len = t.size match_distance = ([s_len, t_len].max / 2) - 1 s_matches = [] t_matches = [] matches = 0.0 s_len.times do |i| j_start = [0, i-match_distance].max j_end = [i+match_distance, t_len-1].min (j_start..j_end).each do |j| t_matches[j] && next s[i] == t[j] || next s_matches[i] = true t_matches[j] = true matches += 1.0 break end end return 0.0 if matches == 0.0 k = 0 transpositions = 0.0 s_len.times do |i| s_matches[i] || next k += 1 until t_matches[k] s[i] == t[k] || (transpositions += 1.0) k += 1 end ((matches / s_len) + (matches / t_len) + ((matches - transpositions/2.0) / matches)) / 3.0
end
%w(
MARTHA MARHTA DIXON DICKSONX JELLYFISH SMELLYFISH
).each_slice(2) do |s,t|
puts "jaro(#{s.inspect}, #{t.inspect}) = #{'%.10f' % jaro(s, t)}"
end</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
Rust
<lang rust>use std::cmp;
pub fn jaro(s1: &str, s2: &str) -> f64 {
let s1_len = s1.len(); let s2_len = s2.len(); if s1_len == 0 && s2_len == 0 { return 1.0; } let match_distance = cmp::max(s1_len, s2_len) / 2 - 1; let mut s1_matches = vec![false; s1_len]; let mut s2_matches = vec![false; s2_len]; let mut m: isize = 0; for i in 0..s1_len { let start = cmp::max(0, i as isize - match_distance as isize) as usize; let end = cmp::min(i + match_distance + 1, s2_len); for j in start..end { if !s2_matches[j] && s1.as_bytes()[i] == s2.as_bytes()[j] { s1_matches[i] = true; s2_matches[j] = true; m += 1; break; } } } if m == 0 { return 0.0; } let mut t = 0.0; let mut k = 0; for i in 0..s1_len { if s1_matches[i] { while !s2_matches[k] { k += 1; } if s1.as_bytes()[i] != s2.as_bytes()[k] { t += 0.5; } k += 1; } }
let m = m as f64; (m / s1_len as f64 + m / s2_len as f64 + (m - t) / m) / 3.0
}
fn main() {
let pairs = [("MARTHA", "MARHTA"), ("DIXON", "DICKSONX"), ("JELLYFISH", "SMELLYFISH")]; for p in pairs.iter() { println!("{}/{} = {}", p.0, p.1, jaro(p.0, p.1)); }
}</lang>
- Output:
MARTHA/MARHTA = 0.9444444444444445 DIXON/DICKSONX = 0.7666666666666666 JELLYFISH/SMELLYFISH = 0.8962962962962964
Scala
<lang scala>object Jaro extends App {
def distance(s1: String, s2: String): Double = { val s1_len = s1.length val s2_len = s2.length if (s1_len == 0 && s2_len == 0) return 1.0 val match_distance = Math.max(s1_len, s2_len) / 2 - 1 val s1_matches = Array.ofDim[Boolean](s1_len) val s2_matches = Array.ofDim[Boolean](s2_len) var matches = 0 for (i <- 0 until s1_len) { val start = Math.max(0, i - match_distance) val end = Math.min(i + match_distance + 1, s2_len) start until end find { j => !s2_matches(j) && s1(i) == s2(j) } match { case Some(j) => s1_matches(i) = true s2_matches(j) = true matches += 1 case None => } } if (matches == 0) return 0.0 var t = 0.0 var k = 0 0 until s1_len filter s1_matches foreach { i => while (!s2_matches(k)) k += 1 if (s1(i) != s2(k)) t += 0.5 k += 1 }
val m = matches.toDouble (m / s1_len + m / s2_len + (m - t) / m) / 3.0 }
val strings = List(("MARTHA", "MARHTA"), ("DIXON", "DICKSONX"), ("JELLYFISH", "SMELLYFISH")) strings.foreach { s => println(distance(s._1, s._2)) }
}</lang>
Sidef
<lang ruby>func jaro(s, t) {
return 1 if (s == t)
var s_len = s.len var t_len = t.len
var match_distance = ((s_len `max` t_len) // 2 - 1)
var s_matches = [] var t_matches = []
var matches = 0 var transpositions = 0
for i (^s_len) { var start = (0 `max` i-match_distance) var end = (i+match_distance `min` t_len-1)
for k (start..end) { t_matches[k] && next s[i] == t[k] || next s_matches[i] = true t_matches[k] = true matches++ break } }
return 0 if (matches == 0)
var k = 0 for i (^s_len) { s_matches[i] || next while (!t_matches[k]) { ++k } s[i] == t[k] || ++transpositions ++k }
((matches / s_len) + (matches / t_len) + ((matches - transpositions/2) / matches)) / 3
}
for pair in [
[%c"MARTHA", %c"MARHTA"], [%c"DIXON", %c"DICKSONX"], [%c"JELLYFISH", %c"SMELLYFISH"],
] {
say "jaro(#{pair.map{.join.dump}.join(', ')}) = #{'%.10f' % jaro(pair...)}"
}</lang>
- Output:
jaro("MARTHA", "MARHTA") = 0.9444444444 jaro("DIXON", "DICKSONX") = 0.7666666667 jaro("JELLYFISH", "SMELLYFISH") = 0.8962962963
Tcl
<lang Tcl>proc jaro {s1 s2} {
set l1 [string length $s1] set l2 [string length $s2] set dmax [expr {max($l1, $l2)/2 - 1}] ;# window size to scan for matches set m1 {} ;# match indices set m2 {} for {set i 0} {$i < $l1} {incr i} { set jmin [expr {$i - $dmax}] ;# don't worry about going out-of-bounds set jmax [expr {$i + $dmax}] ;# because [string index] will return {} safely for {set j $jmin} {$j <= $jmax} {incr j} { if {$j in $m2} continue ;# don't double-count matches if {[string index $s1 $i] eq [string index $s2 $j]} { lappend m1 $i lappend m2 $j break } } } set T 0 ;# number of transpositions set oj -1 foreach j $m2 { if {$j < $oj} {incr T} set oj $j } set T [expr {$T / 2.0}] set M [expr {1.0 * [llength $m1]}] ;# number of matches expr { ( ($M / $l1) + ($M / $l2) + (($M - $T) / $M) ) / 3.0 }
}
foreach {s t} {
DWAYNE DUANE MARTHA MARHTA DIXON DICKSONX JELLYFISH SMELLYFISH
} {
puts "[jaro $s $t]:\t$s / $t"
}</lang>
- Output:
0.8222222222222223: DWAYNE / DUANE 0.9722222222222222: MARTHA / MARHTA 0.7666666666666666: DIXON / DICKSONX 0.8962962962962964: JELLYFISH / SMELLYFISH
zkl
<lang zkl> //-->String of matched characters, ordered fcn _jaro(str1,str2, matchDistance){
cs:=Sink(String); foreach i,c in ([0..].zip(str1)){ str2.find(c,(0).max(i - matchDistance),i + matchDistance) : if(Void!=_) cs.write(c); } cs.close()
}
fcn jaro(str1,str2){
s1Len,s2Len,matchDistance := str1.len(), str2.len(), s1Len.max(s2Len)/2 - 1; cs12,cs21 := _jaro(str1,str2, matchDistance), _jaro(str2,str1, matchDistance);
matches:=cs12.len().toFloat(); if(not matches) return(0.0); transpositions:=cs12.walker().zipWith('!=,cs21).filter().sum(0)/2; ( matches/s1Len + matches/s2Len + ((matches - transpositions)/matches) ) / 3.0
}</lang> <lang zkl>foreach s,t in (T(
T("MARTHA","MARHTA"), T("DIXON","DICKSONX"), T("JELLYFISH","SMELLYFISH"))){ println(0'|jaro("%s","%s") = %.10f|.fmt(s,t,jaro(s,t)));
}</lang>
- Output:
jaro("MARTHA","MARHTA") = 0.9444444444 jaro("DIXON","DICKSONX") = 0.7666666667 jaro("JELLYFISH","SMELLYFISH") = 0.8962962963
ZX Spectrum Basic
<lang zxbasic>10 LET a$="MARTHA": LET b$="MARHTA": PRINT a$;", ";b$;": ";: GO SUB 1000: PRINT jaro 20 LET a$="DIXON": LET b$="DICKSONX": PRINT a$;", ";b$;": ";: GO SUB 1000: PRINT jaro 30 LET a$="JELLYFISH": LET b$="SMELLYFISH": PRINT a$;", ";b$;": ";: GO SUB 1000: PRINT jaro 900 STOP 1000 REM Jaro subroutine 1010 LET s1=LEN a$: LET s2=LEN b$: LET j=1: LET m=0: LET t=0 1030 IF s1>s2 THEN LET z$=a$: LET a$=b$: LET b$=z$: LET z=s1: LET s1=s2: LET s2=z 1035 LET maxdist=INT (s2/2) 1040 FOR i=1 TO s1 1050 IF a$(i)=b$(j) THEN LET m=m+1: LET b$(j)=" ": GO TO 2000 1080 FOR k=FN x(1,i-maxdist) TO FN n(s2,i+maxdist) 1090 IF a$(i)=b$(k) THEN LET t=t+1: LET m=m+1: LET b$(k)=" ": IF k>j THEN LET j=k 1100 NEXT k 2000 IF j<s2 THEN LET j=j+1: 2010 NEXT i 2020 IF m=0 THEN LET jaro=0: RETURN 2030 LET t=INT (t/2) 2040 LET jaro=(m/s1+m/s2+((m-t)/m))/3 2050 RETURN 5000 REM Functions 5010 DEF FN x(a,b)=(a AND a>b)+(b AND a<b)+(a AND a=b): REM max function 5020 DEF FN n(a,b)=(a AND a<b)+(b AND a>b)+(a AND a=b): REM min function</lang>
- Output:
MARTHA, MARHTA: 0.94444444 DIXON, DICKSONX: 0.76666667 JELLYFISH, SMELLYFISH: 0.8962963