Jaro-Winkler distance: Difference between revisions
Content added Content deleted
(Added Java solution) |
|||
Line 961: | Line 961: | ||
0.1111 switch |
0.1111 switch |
||
0.1111 twitch |
0.1111 twitch |
||
</pre> |
|||
=={{header|Java}}== |
|||
{{trans|C++}} |
|||
<lang java>import java.io.*; |
|||
import java.util.*; |
|||
public class JaroWinkler { |
|||
public static void main(String[] args) { |
|||
try { |
|||
List<String> words = loadDictionary("linuxwords.txt"); |
|||
String[] strings = { |
|||
"accomodate", "definately", "goverment", "occured", |
|||
"publically", "recieve", "seperate", "untill", "wich" |
|||
}; |
|||
for (String string : strings) { |
|||
System.out.printf("Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to '%s' are:\n" |
|||
+ " Word | Distance\n", string); |
|||
for (StringDistance s : withinDistance(words, 0.15, string, 5)) { |
|||
System.out.printf("%14s | %.4f\n", s.word, s.distance); |
|||
} |
|||
System.out.println(); |
|||
} |
|||
} catch (Exception e) { |
|||
e.printStackTrace(); |
|||
} |
|||
} |
|||
private static class StringDistance implements Comparable<StringDistance> { |
|||
private StringDistance(String word, double distance) { |
|||
this.word = word; |
|||
this.distance = distance; |
|||
} |
|||
public int compareTo(StringDistance s) { |
|||
return Double.compare(distance, s.distance); |
|||
} |
|||
private String word; |
|||
private double distance; |
|||
} |
|||
private static List<StringDistance> withinDistance(List<String> words, |
|||
double maxDistance, String string, int max) { |
|||
List<StringDistance> result = new ArrayList<>(); |
|||
for (String word : words) { |
|||
double distance = jaroWinklerDistance(word, string); |
|||
if (distance <= maxDistance) |
|||
result.add(new StringDistance(word, distance)); |
|||
} |
|||
Collections.sort(result); |
|||
if (result.size() > max) |
|||
result = result.subList(0, max); |
|||
return result; |
|||
} |
|||
private static double jaroWinklerDistance(String string1, String string2) { |
|||
int len1 = string1.length(); |
|||
int len2 = string2.length(); |
|||
if (len1 < len2) { |
|||
String s = string1; |
|||
string1 = string2; |
|||
string2 = s; |
|||
int tmp = len1; |
|||
len1 = len2; |
|||
len2 = tmp; |
|||
} |
|||
if (len2 == 0) |
|||
return len1 == 0 ? 0.0 : 1.0; |
|||
int delta = Math.max(1, len1 / 2) - 1; |
|||
boolean[] flag = new boolean[len2]; |
|||
Arrays.fill(flag, false); |
|||
char[] ch1Match = new char[len1]; |
|||
int matches = 0; |
|||
for (int i = 0; i < len1; ++i) { |
|||
char ch1 = string1.charAt(i); |
|||
for (int j = 0; j < len2; ++j) { |
|||
char ch2 = string2.charAt(j); |
|||
if (j <= i + delta && j + delta >= i && ch1 == ch2 && !flag[j]) { |
|||
flag[j] = true; |
|||
ch1Match[matches++] = ch1; |
|||
break; |
|||
} |
|||
} |
|||
} |
|||
if (matches == 0) |
|||
return 1.0; |
|||
int transpositions = 0; |
|||
for (int i = 0, j = 0; j < len2; ++j) { |
|||
if (flag[j]) { |
|||
if (string2.charAt(j) != ch1Match[i]) |
|||
++transpositions; |
|||
++i; |
|||
} |
|||
} |
|||
double m = matches; |
|||
double jaro = (m / len1 + m / len2 + (m - transpositions / 2.0) / m) / 3.0; |
|||
int commonPrefix = 0; |
|||
len2 = Math.min(4, len2); |
|||
for (int i = 0; i < len2; ++i) { |
|||
if (string1.charAt(i) == string2.charAt(i)) |
|||
++commonPrefix; |
|||
} |
|||
return 1.0 - (jaro + commonPrefix * 0.1 * (1.0 - jaro)); |
|||
} |
|||
private static List<String> loadDictionary(String path) throws IOException { |
|||
try (BufferedReader reader = new BufferedReader(new FileReader(path))) { |
|||
List<String> words = new ArrayList<>(); |
|||
String word; |
|||
while ((word = reader.readLine()) != null) |
|||
words.add(word); |
|||
return words; |
|||
} |
|||
} |
|||
}</lang> |
|||
{{out}} |
|||
<pre> |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'accomodate' are: |
|||
Word | Distance |
|||
accommodate | 0.0182 |
|||
accommodated | 0.0333 |
|||
accommodates | 0.0333 |
|||
accommodating | 0.0815 |
|||
accommodation | 0.0815 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'definately' are: |
|||
Word | Distance |
|||
definitely | 0.0400 |
|||
defiantly | 0.0422 |
|||
define | 0.0800 |
|||
definite | 0.0850 |
|||
definable | 0.0872 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'goverment' are: |
|||
Word | Distance |
|||
government | 0.0533 |
|||
govern | 0.0667 |
|||
governments | 0.0697 |
|||
movement | 0.0810 |
|||
governmental | 0.0833 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'occured' are: |
|||
Word | Distance |
|||
occurred | 0.0250 |
|||
occur | 0.0571 |
|||
occupied | 0.0786 |
|||
occurs | 0.0905 |
|||
accursed | 0.0917 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'publically' are: |
|||
Word | Distance |
|||
publicly | 0.0400 |
|||
public | 0.0800 |
|||
publicity | 0.1044 |
|||
publication | 0.1327 |
|||
biblically | 0.1400 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'recieve' are: |
|||
Word | Distance |
|||
receive | 0.0333 |
|||
received | 0.0625 |
|||
receiver | 0.0625 |
|||
receives | 0.0625 |
|||
relieve | 0.0667 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'seperate' are: |
|||
Word | Distance |
|||
desperate | 0.0708 |
|||
separate | 0.0917 |
|||
temperate | 0.1042 |
|||
separated | 0.1144 |
|||
separates | 0.1144 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'untill' are: |
|||
Word | Distance |
|||
until | 0.0333 |
|||
untie | 0.1067 |
|||
untimely | 0.1083 |
|||
till | 0.1111 |
|||
Antilles | 0.1264 |
|||
Close dictionary words (distance < 0.15 using Jaro-Winkler distance) to 'wich' are: |
|||
Word | Distance |
|||
witch | 0.0533 |
|||
which | 0.0600 |
|||
switch | 0.1111 |
|||
twitch | 0.1111 |
|||
witches | 0.1143 |
|||
</pre> |
</pre> |
||