Soundex: Difference between revisions
(Added D code) |
(added ocaml) |
||
Line 222: | Line 222: | ||
return (r + '000').slice(0, 4); |
return (r + '000').slice(0, 4); |
||
};</lang> |
};</lang> |
||
=={{header|OCaml}}== |
|||
Here is an implementation: |
|||
<lang ocaml>let c2d = function |
|||
| 'B' | 'F' | 'P' | 'V' -> "1" |
|||
| 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> "2" |
|||
| 'D' | 'T' -> "3" |
|||
| 'L' -> "4" |
|||
| 'M' | 'N' -> "5" |
|||
| 'R' -> "6" |
|||
| _ -> "" |
|||
let rec dbl acc = function |
|||
| [] -> (List.rev acc) |
|||
| c::[] -> List.rev(c::acc) |
|||
| c1::(c2::_ as tl) -> |
|||
if c1 = c2 |
|||
then dbl acc tl |
|||
else dbl (c1::acc) tl |
|||
let pad s = |
|||
match String.length s with |
|||
| 0 -> s ^ "000" |
|||
| 1 -> s ^ "00" |
|||
| 2 -> s ^ "0" |
|||
| 3 -> s |
|||
| _ -> String.sub s 0 3 |
|||
let soundex_aux rem = |
|||
pad(String.concat "" (dbl [] (List.map c2d rem))) |
|||
let soundex s = |
|||
let s = String.uppercase s in |
|||
let cl = ref [] in |
|||
String.iter (fun c -> cl := c :: !cl) s; |
|||
match dbl [] (List.rev !cl) with |
|||
| c::rem -> (String.make 1 c) ^ (soundex_aux rem) |
|||
| [] -> invalid_arg "soundex"</lang> |
|||
Test our implementation: |
|||
<lang ocaml>let tests = [ |
|||
"Soundex", "S532"; |
|||
"Example", "E251"; |
|||
"Sownteks", "S532"; |
|||
"Ekzampul", "E251"; |
|||
"Euler", "E460"; |
|||
"Gauss", "G200"; |
|||
"Hilbert", "H416"; |
|||
"Knuth", "K530"; |
|||
"Lloyd", "L300"; |
|||
"Lukasiewicz", "L222"; |
|||
"Ellery", "E460"; |
|||
"Ghosh", "G200"; |
|||
"Heilbronn", "H416"; |
|||
"Kant", "K530"; |
|||
"Ladd", "L300"; |
|||
"Lissajous", "L222"; |
|||
"Wheaton", "W350"; |
|||
"Ashcraft", "A226"; |
|||
"Burroughs", "B622"; |
|||
"Burrows", "B620"; |
|||
"O'Hara", "O600"; |
|||
] |
|||
let () = |
|||
print_endline " Word \t Code Found Status"; |
|||
List.iter (fun (word, code1) -> |
|||
let code2 = soundex word in |
|||
let status = if code1 = code2 then "OK " else "Arg" in |
|||
Printf.printf " \"%s\" \t %s %s %s\n" word code1 code2 status |
|||
) tests</lang> |
|||
This test outputs: |
|||
Word Code Found Status |
|||
"Soundex" S532 S532 OK |
|||
"Example" E251 E251 OK |
|||
"Sownteks" S532 S532 OK |
|||
"Ekzampul" E251 E251 OK |
|||
"Euler" E460 E460 OK |
|||
"Gauss" G200 G200 OK |
|||
"Hilbert" H416 H416 OK |
|||
"Knuth" K530 K530 OK |
|||
"Lloyd" L300 L300 OK |
|||
"Lukasiewicz" L222 L222 OK |
|||
"Ellery" E460 E460 OK |
|||
"Ghosh" G200 G200 OK |
|||
"Heilbronn" H416 H416 OK |
|||
"Kant" K530 K530 OK |
|||
"Ladd" L300 L300 OK |
|||
"Lissajous" L222 L222 OK |
|||
"Wheaton" W350 W350 OK |
|||
"Ashcraft" A226 A226 OK |
|||
"Burroughs" B622 B622 OK |
|||
"Burrows" B620 B620 OK |
|||
"O'Hara" O600 O600 OK |
|||
=={{header|Perl}}== |
=={{header|Perl}}== |
Revision as of 16:31, 2 January 2010
You are encouraged to solve this task according to the task description, using any language you may know.
Soundex is an algorithm for creating indices for words based on their pronunciation. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling (from the WP article).
C
<lang c>#include <stdio.h>
- include <stdlib.h>
- include <string.h>
char chr(int i) {
static char cs[]="0123456789"; return ((0<=i) && (i<=9)) ? cs[i] : '-';
}
char *soundex(char *word, char *bufr) {
static char *lset[] = { "", "BFPVbfpv", "CGJKQSXZcgjkqsxz", "DTdt", "Ll","MNmn", "Rr" };
- define LSIZE (sizeof(lset)/sizeof(char *))
char *ch, *bp; char prev = '-'; int j;
bp = bufr; ch = word;
while (*ch && (bp-bufr<4)) { for (j=1; j< LSIZE; j++) if (strchr( lset[j], *ch) ) break;
if ((j<LSIZE) && (chr(j) != prev)) *bp++ = prev = chr(j); else if (!strchr("WHwh", *ch)) prev = '7';
if (ch == word) { bp = bufr; *bp++ = toupper(*ch); } ch++; } while (bp-bufr < 4) *bp++ = '0'; *bp = 0; return bufr;
}
int main(int argc, char *argv[]) {
char *w, **wp; static char *testwords[] = { "Euler","Gauss","Hilbert","Knuth","Lloyd","Lukasiewicz","Ellery", "Ghosh", "Heilbronn","Kant","Ladd","Lissajous","Wheaton","Ashcraft", "Burroughs","Burrows","O'Hara", NULL}; char buffer[8];
w= "Soundex"; printf("soundex(%s) = %s\n", w, soundex(w,buffer)); w= "Example"; printf("soundex(%s) = %s\n", w, soundex(w,buffer)); w= "Sownteks"; printf("soundex(%s) = %s\n", w, soundex(w,buffer)); w= "Ekzampul"; printf("soundex(%s) = %s\n", w, soundex(w,buffer));
for (wp=testwords; *wp; wp++ ) printf("soundex(%s) = %s\n", *wp, soundex(*wp, buffer)); return 0;
}</lang>
D
<lang d>import std.string;
void main() {
char[][] input = ["Euler","Gauss","Hilbert","Knuth","Lloyd","Lukasiewicz","Ellery", "Ghosh", "Heilbronn","Kant","Ladd","Lissajous","Wheaton","Ashcraft", "Burroughs","Burrows","O'Hara"];
uint padding = 0; foreach(inp; input) if(inp.length > padding) padding = inp.length; char[] padstr = new char[](padding);
foreach(inp; input) { padstr[] = ' '; padstr[0..inp.length] = inp;
printf("%.*s -> %.*s\n", padstr, soundex(inp)); }
}</lang>
Forth
This implements the US Census rules, where W and H are ignored but, unlike vowels, aren't separators. Further corner cases welcome...
<lang forth>: alpha-table create does> swap 32 or [char] a - 0 max 26 min + 1+ c@ ;
alpha-table soundex-code
," 123 12. 22455 12623 1.2 2 " \ ABCDEFGHIJKLMNOPQRSTUVWXYZ
- soundex ( name len -- pad len )
over c@ pad c! \ First character verbatim pad 1+ 3 [char] 0 fill \ Pad to four characters with zeros 1 pad c@ soundex-code ( count code ) 2swap bounds do i c@ soundex-code ( count code next ) 2dup = if drop else \ runs are ignored dup [char] . = if drop else \ W, H don't separate runs of consonants dup bl = if nip else \ vowels separate consonants but aren't coded nip 2dup swap pad + c! swap 1+ tuck 4 = if leave then then then then loop 2drop pad 4 ;
\ Knuth's test cases s" Euler" soundex cr type \ E460 s" Gauss" soundex cr type \ G200 s" Hilbert" soundex cr type \ H416 s" Knuth" soundex cr type \ K530 s" Lloyd" soundex cr type \ L300 s" Lukasiewicz" soundex cr type \ L222 (W test) s" Ellery" soundex cr type \ E460 s" Ghosh" soundex cr type \ G200 s" Heilbronn" soundex cr type \ H416 s" Kant" soundex cr type \ K530 s" Ladd" soundex cr type \ L300 s" Lissajous" soundex cr type \ L222
s" Wheaton" soundex cr type \ W350 s" Ashcraft" soundex cr type \ A261 (H tests) s" Burroughs" soundex cr type \ B620 s" Burrows" soundex cr type \ B620 (W test) (any Welsh names?) s" O'Hara" soundex cr type \ O600 (punctuation test)</lang>
Haskell
<lang haskell>import Text.PhoneticCode.Soundex import Control.Arrow</lang> Example: <lang haskell>*Main> mapM_ print $ map (id &&& soundexSimple) ["Soundex", "Example", "Sownteks", "Ekzampul"] ("Soundex","S532") ("Example","E251") ("Sownteks","S532") ("Ekzampul","E251")</lang>
Java
<lang java>public static void main(String[] args){
System.out.println(soundex("Soundex")); System.out.println(soundex("Example")); System.out.println(soundex("Sownteks")); System.out.println(soundex("Ekzampul")); }
private static String getCode(char c){
switch(c){ case 'B': case 'F': case 'P': case 'V': return "1"; case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z': return "2"; case 'D': case 'T': return "3"; case 'L': return "4"; case 'M': case 'N': return "5"; case 'R': return "6"; default: return ""; }
}
public static String soundex(String s){
String code, previous, soundex; code = s.toUpperCase().charAt(0) + ""; previous = "7"; for(int i = 1;i < s.length();i++){ String current = getCode(s.toUpperCase().charAt(i)); if(current.length() > 0 && !current.equals(previous)){ code = code + current; } previous = current; } soundex = (code + "0000").substring(0, 4); return soundex;
}</lang> Output:
S532 E251 S532 E251
JavaScript
<lang javascript>var soundex = function (s) {
var a = s .substring(1, s.length) .toLowerCase() .split(), r = , codes = { a: , e: , i: , o: , u: , b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 };
r = s[0].toUpperCase() + a .filter(function (v, i, a) { return v !== a[i + 1]; }) .map(function (v, i, a) { return codes[v] }).join();
return (r + '000').slice(0, 4);
};</lang>
OCaml
Here is an implementation:
<lang ocaml>let c2d = function
| 'B' | 'F' | 'P' | 'V' -> "1" | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> "2" | 'D' | 'T' -> "3" | 'L' -> "4" | 'M' | 'N' -> "5" | 'R' -> "6" | _ -> ""
let rec dbl acc = function
| [] -> (List.rev acc) | c::[] -> List.rev(c::acc) | c1::(c2::_ as tl) -> if c1 = c2 then dbl acc tl else dbl (c1::acc) tl
let pad s =
match String.length s with | 0 -> s ^ "000" | 1 -> s ^ "00" | 2 -> s ^ "0" | 3 -> s | _ -> String.sub s 0 3
let soundex_aux rem =
pad(String.concat "" (dbl [] (List.map c2d rem)))
let soundex s =
let s = String.uppercase s in let cl = ref [] in String.iter (fun c -> cl := c :: !cl) s; match dbl [] (List.rev !cl) with | c::rem -> (String.make 1 c) ^ (soundex_aux rem) | [] -> invalid_arg "soundex"</lang>
Test our implementation:
<lang ocaml>let tests = [
"Soundex", "S532"; "Example", "E251"; "Sownteks", "S532"; "Ekzampul", "E251"; "Euler", "E460"; "Gauss", "G200"; "Hilbert", "H416"; "Knuth", "K530"; "Lloyd", "L300"; "Lukasiewicz", "L222"; "Ellery", "E460"; "Ghosh", "G200"; "Heilbronn", "H416"; "Kant", "K530"; "Ladd", "L300"; "Lissajous", "L222"; "Wheaton", "W350"; "Ashcraft", "A226"; "Burroughs", "B622"; "Burrows", "B620"; "O'Hara", "O600"; ]
let () =
print_endline " Word \t Code Found Status"; List.iter (fun (word, code1) -> let code2 = soundex word in let status = if code1 = code2 then "OK " else "Arg" in Printf.printf " \"%s\" \t %s %s %s\n" word code1 code2 status ) tests</lang>
This test outputs:
Word Code Found Status "Soundex" S532 S532 OK "Example" E251 E251 OK "Sownteks" S532 S532 OK "Ekzampul" E251 E251 OK "Euler" E460 E460 OK "Gauss" G200 G200 OK "Hilbert" H416 H416 OK "Knuth" K530 K530 OK "Lloyd" L300 L300 OK "Lukasiewicz" L222 L222 OK "Ellery" E460 E460 OK "Ghosh" G200 G200 OK "Heilbronn" H416 H416 OK "Kant" K530 K530 OK "Ladd" L300 L300 OK "Lissajous" L222 L222 OK "Wheaton" W350 W350 OK "Ashcraft" A226 A226 OK "Burroughs" B622 B622 OK "Burrows" B620 B620 OK "O'Hara" O600 O600 OK
Perl
The Text::Soundex core module supports various soundex algorithms. <lang perl>use Text::Soundex; print soundex("Soundex"), "\n"; # S532 print soundex("Example"), "\n"; # E251 print soundex("Sownteks"), "\n"; # S532 print soundex("Ekzampul"), "\n"; # E251</lang>
PHP
PHP already has a built-in soundex() function: <lang php><?php echo soundex("Soundex"), "\n"; // S532 echo soundex("Example"), "\n"; // E251 echo soundex("Sownteks"), "\n"; // S532 echo soundex("Ekzampul"), "\n"; // E251 ?></lang>
Python
<lang python>from itertools import groupby
def soundex(word):
codes = ("bfpv","cgjkqsxz", "dt", "l", "mn", "r") soundDict = dict((ch, str(ix+1)) for ix,cod in enumerate(codes) for ch in cod) cmap2 = lambda kar: soundDict.get(kar, '9') sdx = .join(cmap2(kar) for kar in word.lower()) sdx2 = word[0].upper() + .join(k for k,g in list(groupby(sdx))[1:] if k!='9') sdx3 = sdx2[0:4].ljust(4,'0') return sdx3
</lang> Example Output <lang Python>>>>print soundex("soundex") S532 >>>print soundex("example") E251 >>>print soundex("ciondecks") C532 >>>print soundex("ekzampul") E251</lang>
Ruby
Courtesy http://snippets.dzone.com/posts/show/4530 <lang ruby>class String
SoundexChars = 'BFPVCGJKQSXZDTLMNR' SoundexNums = '111122222222334556' SoundexCharsEx = '^' + SoundexChars SoundexCharsDel = '^A-Z'
# desc: http://en.wikipedia.org/wiki/Soundex def soundex(census = true) str = self.upcase.delete(SoundexCharsDel) str[0,1] + str[1..-1].delete(SoundexCharsEx). tr(SoundexChars, SoundexNums). squeeze[0 .. (census ? 2 : -1)]. ljust(3, '0') rescue end
def sounds_like(other) self.soundex == other.soundex end
end
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |word1, word2|
[word1, word2].each {|word| puts '%-8s -> %s' % [word, word.soundex]}
print "'#{word1}' " print word1.sounds_like(word2) ? "sounds" : "does not sound" print " like '#{word2}'\n"
end</lang>
Soundex -> S532 Sownteks -> S532 'Soundex' sounds like 'Sownteks' Example -> E251 Ekzampul -> E251 'Example' sounds like 'Ekzampul' foo -> F000 bar -> B600 'foo' does not sound like 'bar'
Tcl
contains an implementation of Knuth's soundex algorithm in the soundex
package.
<lang tcl>package require soundex
foreach string {"Soundex" "Example" "Sownteks" "Ekzampul"} {
set soundexCode [soundex::knuth $string] puts "\"$string\" has code $soundexCode"
}</lang> Which produces this output:
"Soundex" has code S532 "Example" has code E251 "Sownteks" has code S532 "Ekzampul" has code E251
VBScript
<lang vbscript>Function getCode(c)
Select Case c Case "B", "F", "P", "V" getCode = "1" Case "C", "G", "J", "K", "Q", "S", "X", "Z" getCode = "2" Case "D", "T" getCode = "3" Case "L" getCode = "4" Case "M", "N" getCode = "5" Case "R" getCode = "6" End Select
End Function
Function soundex(s)
Dim code, previous code = UCase(Mid(s, 1, 1)) previous = 7 For i = 2 to (Len(s) + 1) current = getCode(UCase(Mid(s, i, 1))) If Len(current) > 0 And current <> previous Then code = code & current End If previous = current Next soundex = Mid(code, 1, 4) If Len(code) < 4 Then soundex = soundex & String(4 - Len(code), "0") End If
End Function</lang>