Soundex: Difference between revisions
(Added BBC BASIC) |
(→{{header|AWK}}: Added AWK example) |
||
Line 185: | Line 185: | ||
MsgBox % Soundex("Soundex") "`n" Soundex("Sowndeks") "`n" Soundex("Ashcroft") "`n" Soundex("Ashkrofd")</lang> |
MsgBox % Soundex("Soundex") "`n" Soundex("Sowndeks") "`n" Soundex("Ashcroft") "`n" Soundex("Ashkrofd")</lang> |
||
=={{header|AWK}}== |
|||
The soundex function is embedded in a program to build a table of soundex "homonyms". |
|||
<lang awk>#!/usr/bin/awk -f |
|||
BEGIN { |
|||
delete homs |
|||
homSep = ", " |
|||
getHoms() |
|||
showHoms() |
|||
exit 1 |
|||
} |
|||
function strToSoundex(s, sdx, i, ch, cd, lch) { |
|||
if (length(s) == 0) return "" |
|||
s = tolower(s) |
|||
lch = substr(s, 1, 1); |
|||
sdx = toupper(lch) |
|||
lch = charToSoundex(lch) |
|||
for (i = 2; i <= length(s); i++) { |
|||
ch = substr(s, i, 1) |
|||
cd = charToSoundex(ch) |
|||
if (cd == 7) continue; |
|||
if (cd && cd != lch) sdx = sdx cd |
|||
lch = cd |
|||
} |
|||
sdx = substr(sdx "0000", 1, 4) |
|||
return sdx |
|||
} |
|||
function charToSoundex(ch, cd) { |
|||
if (ch ~ /[bfpv]/) cd = 1 |
|||
else if (ch ~ /[cgjkqsxz]/) cd = 2 |
|||
else if (ch ~ /[dt]/) cd = 3 |
|||
else if (ch == "l") cd = 4 |
|||
else if (ch ~ /[mn]/) cd = 5 |
|||
else if (ch == "r") cd = 6 |
|||
else if (ch ~ /[hw]/) cd = 7 |
|||
else cd = 0 |
|||
return cd; |
|||
} |
|||
function getHoms( eof, wd, ef, sd) { |
|||
while (1) { |
|||
ef = getline wd |
|||
if (ef < 1) break; |
|||
if (wd ~ /^[^'a-zA-Z]/) continue; |
|||
sd = strToSoundex(wd) |
|||
if (!(sd in homs)) homs[sd] = "" |
|||
homs[sd] = homs[sd] (homs[sd] == "" ? "" : homSep) wd |
|||
} |
|||
} |
|||
function showHoms() { |
|||
for (i in homs) { |
|||
printf i " " |
|||
n = split(homs[i], wl, homSep) |
|||
for (j = 1; j < 4 && j <= n; j++) { |
|||
printf wl[j] " " |
|||
} |
|||
print (n > 3 ? "..." : "") |
|||
} |
|||
} |
|||
</lang> |
|||
Example run: |
|||
<pre> |
|||
# ./soundex.awk ../unixdict.txt |sort |
|||
A000 a aaa aau ... |
|||
A100 a&p aba abbe ... |
|||
A110 ababa above aviv |
|||
A111 aboveboard |
|||
A112 aboveground |
|||
A114 affable |
|||
A115 abovementioned |
|||
A120 aback abase abash ... |
|||
A121 abusable abusive appeasable |
|||
A122 abacus abject abscess ... |
|||
A123 abstain abstention abstinent ... |
|||
A124 abigail absolute absolution ... |
|||
A125 absence absent absentee ... |
|||
A126 absorb absorbent absorption ... |
|||
A130 abate abbot abbott ... |
|||
A131 affidavit |
|||
A132 abdicate abduct abidjan ... |
|||
A133 abetted abutted apathetic ... |
|||
A135 abdomen abdominal abetting ... |
|||
A136 abater aftereffect afterglow ... |
|||
A140 abel able afoul ... |
|||
A141 appleby |
|||
A142 abelson ablaze abolish ... |
|||
. |
|||
. |
|||
. |
|||
Z324 zodiacal |
|||
Z400 zeal |
|||
Z420 zealous zilch zoology |
|||
Z430 zealot zloty |
|||
Z453 zealand |
|||
Z461 zellerbach |
|||
Z500 zan zen zion ... |
|||
Z510 zambia zomba zombie |
|||
Z520 zinc zing |
|||
Z521 zanzibar |
|||
Z525 zionism |
|||
Z530 zenith |
|||
Z532 zounds |
|||
Z565 zimmerman |
|||
Z600 zaire zero |
|||
Z620 zeroes zurich |
|||
Z623 zoroaster zoroastrian |
|||
Z625 zircon zirconium |
|||
Z630 zeroth |
|||
Z650 zorn |
|||
# |
|||
</pre> |
|||
=={{header|BBC BASIC}}== |
=={{header|BBC BASIC}}== |
||
<lang bbcbasic> DATA Ashcraft, Ashcroft, Gauss, Ghosh, Hilbert, Heilbronn, Lee, Lloyd |
<lang bbcbasic> DATA Ashcraft, Ashcroft, Gauss, Ghosh, Hilbert, Heilbronn, Lee, Lloyd |
Revision as of 22:10, 16 November 2012
You are encouraged to solve this task according to the task description, using any language you may know.
Soundex is an algorithm for creating indices for words based on their pronunciation. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling (from the WP article).
Ada
<lang Ada>with Ada.Text_IO; use Ada.Text_IO; with Ada.Strings.Unbounded; use Ada.Strings.Unbounded; with Ada.Characters.Handling; use Ada.Characters.Handling; procedure Soundex is
type UStrings is array(Natural range <>) of Unbounded_String; function "+"(S:String) return Unbounded_String renames To_Unbounded_String; function toSoundex (instr : String) return String is str : String := To_Upper(instr); output : String := "0000"; spos : Integer := str'First+1; opos : Positive := 2; map : array(0..255) of Character := (others => ' '); last : Integer := str'First; begin map(65..90) := " 123 12- 22455 12623 1-2 2"; for i in str'Range loop str(i) := map(Character'Pos(str(i))); end loop; output(1) := str(str'First); while (opos <= 4 and spos <= str'Last) loop if str(spos) /= '-' and str(spos) /= ' ' then if (str(spos-1) = '-' and last = spos-2) and then (str(spos) = str(spos-2)) then null; elsif (str(spos) = output(opos-1) and last = spos-1) then last := spos; else output(opos) := str(spos); opos := opos + 1; last := spos; end if; end if; spos := spos + 1; end loop; output(1) := To_Upper(instr(instr'First)); return output; end toSoundex; cases : constant UStrings := (+"Soundex", +"Example", +"Sownteks", +"Ekzampul", +"Euler", +"Gauss", +"Hilbert", +"Knuth", +"Lloyd", +"Lukasiewicz", +"Ellery", +"Ghosh", +"Heilbronn", +"Kant", +"Ladd", +"Lissajous", +"Wheaton", +"Burroughs", +"Burrows", +"O'Hara", +"Washington", +"Lee", +"Gutierrez", +"Pfister", +"Jackson", +"Tymczak", +"VanDeusen", +"Ashcraft");
begin
for i in cases'Range loop Put_Line(To_String(cases(i))&" = "&toSoundex(To_String(cases(i)))); end loop;
end Soundex;</lang>
- Output:
Soundex = S532 Example = E251 Sownteks = S532 Ekzampul = E251 Euler = E460 Gauss = G200 Hilbert = H416 Knuth = K530 Lloyd = L300 Lukasiewicz = L222 Ellery = E460 Ghosh = G200 Heilbronn = H416 Kant = K530 Ladd = L300 Lissajous = L222 Wheaton = W350 Burroughs = B620 Burrows = B620 O'Hara = O600 Washington = W252 Lee = L000 Gutierrez = G362 Pfister = P236 Jackson = J250 Tymczak = T522 VanDeusen = V532 Ashcraft = A261
ALGOL 68
Note: The only non-standard prelude functions used are to lower, is alpha, and is digit. These are easy enough to write, vide String case <lang Algol68> PROC soundex = (STRING s) STRING:
BEGIN PROC encode = (CHAR c) CHAR: BEGIN # We assume the alphabet is contiguous. # "-123-12*-22455-12623-1*2-2"[ABS to lower(c) - ABS "a" + 1] END; INT soundex code length = 4; STRING result := soundex code length * "0"; IF s /= "" THEN CHAR previous; INT j; result[j := 1] := s[1]; previous := encode(s[1]); FOR i FROM 2 TO UPB s WHILE j < soundex code length DO IF is alpha(s[i]) THEN CHAR code = encode(s[i]); IF is digit(code) AND code /= previous THEN result[j +:= 1] := code; previous := code ELIF code = "-" THEN # Only vowels (y counts here) hide the last-added character # previous := code FI FI OD FI; result END; # Test code to persuade one that it does work. # MODE TEST = STRUCT (STRING input, STRING expected output); [] TEST soundex test = ( ("Soundex", "S532"), ("Example", "E251"), ("Sownteks", "S532"), ("Ekzampul", "E251"), ("Euler", "E460"), ("Gauss", "G200"), ("Hilbert", "H416"), ("Knuth", "K530"), ("Lloyd", "L300"), ("Lukasiewicz", "L222"), ("Ellery", "E460"), ("Ghosh", "G200"), ("Heilbronn", "H416"), ("Kant", "K530"), ("Ladd", "L300"), ("Lissajous", "L222"), ("Wheaton", "W350"), ("Burroughs", "B620"), ("Burrows", "B620"), ("O'Hara", "O600"), ("Washington", "W252"), ("Lee", "L000"), ("Gutierrez", "G362"), ("Pfister", "P236"), ("Jackson", "J250"), ("Tymczak", "T522"), ("VanDeusen", "V532"), ("Ashcraft", "A261") ); # Apologies for the magic number in the padding of the input and the wired-in heading. # print(("Test name Code Got", newline, "----------------------", newline)); FOR i FROM LWB soundex test TO UPB soundex test DO STRING output = soundex(input OF soundex test[i]); printf(($g, n (12 - UPB input OF soundex test[i]) x$, input OF soundex test[i])); printf(($g, 1x, g, 1x$, expected output OF soundex test[i], output)); printf(($b("ok", "not ok"), 1l$, output = expected output OF soundex test[i])) OD</lang>
AutoHotkey
<lang AutoHotkey>getCode(c){
If c in B,F,P,V return 1 If c in C,G,J,K,Q,S,X,Z return 2 If c in D,T return 3 If c = L return 4 If c in M,N return 5 If c = R return 6
}
soundex(s){
code := SubStr(s, 1, 1) ,previous := 7 ,i := 1 While ++i <= StrLen(s){ current := getCode(SubStr(s, i, 1)) If StrLen(current) > 0 And current <> previous code := code . current previous := current } soundex := SubStr(code, 1, 4) If StrLen(code) < 4 soundex .= String(4 - StrLen(code), "0") return soundex
}
String(a, n){
Loop n o .= a return a
}
MsgBox % Soundex("Soundex") "`n" Soundex("Sowndeks") "`n" Soundex("Ashcroft") "`n" Soundex("Ashkrofd")</lang>
AWK
The soundex function is embedded in a program to build a table of soundex "homonyms".
<lang awk>#!/usr/bin/awk -f
BEGIN {
delete homs homSep = ", " getHoms() showHoms() exit 1
}
function strToSoundex(s, sdx, i, ch, cd, lch) {
if (length(s) == 0) return "" s = tolower(s) lch = substr(s, 1, 1); sdx = toupper(lch) lch = charToSoundex(lch) for (i = 2; i <= length(s); i++) { ch = substr(s, i, 1) cd = charToSoundex(ch) if (cd == 7) continue; if (cd && cd != lch) sdx = sdx cd lch = cd } sdx = substr(sdx "0000", 1, 4) return sdx
}
function charToSoundex(ch, cd) {
if (ch ~ /[bfpv]/) cd = 1 else if (ch ~ /[cgjkqsxz]/) cd = 2 else if (ch ~ /[dt]/) cd = 3 else if (ch == "l") cd = 4 else if (ch ~ /[mn]/) cd = 5 else if (ch == "r") cd = 6 else if (ch ~ /[hw]/) cd = 7 else cd = 0 return cd;
}
function getHoms( eof, wd, ef, sd) {
while (1) { ef = getline wd if (ef < 1) break; if (wd ~ /^[^'a-zA-Z]/) continue; sd = strToSoundex(wd) if (!(sd in homs)) homs[sd] = "" homs[sd] = homs[sd] (homs[sd] == "" ? "" : homSep) wd }
}
function showHoms() {
for (i in homs) { printf i " " n = split(homs[i], wl, homSep) for (j = 1; j < 4 && j <= n; j++) { printf wl[j] " " } print (n > 3 ? "..." : "") }
} </lang>
Example run:
# ./soundex.awk ../unixdict.txt |sort A000 a aaa aau ... A100 a&p aba abbe ... A110 ababa above aviv A111 aboveboard A112 aboveground A114 affable A115 abovementioned A120 aback abase abash ... A121 abusable abusive appeasable A122 abacus abject abscess ... A123 abstain abstention abstinent ... A124 abigail absolute absolution ... A125 absence absent absentee ... A126 absorb absorbent absorption ... A130 abate abbot abbott ... A131 affidavit A132 abdicate abduct abidjan ... A133 abetted abutted apathetic ... A135 abdomen abdominal abetting ... A136 abater aftereffect afterglow ... A140 abel able afoul ... A141 appleby A142 abelson ablaze abolish ... . . . Z324 zodiacal Z400 zeal Z420 zealous zilch zoology Z430 zealot zloty Z453 zealand Z461 zellerbach Z500 zan zen zion ... Z510 zambia zomba zombie Z520 zinc zing Z521 zanzibar Z525 zionism Z530 zenith Z532 zounds Z565 zimmerman Z600 zaire zero Z620 zeroes zurich Z623 zoroaster zoroastrian Z625 zircon zirconium Z630 zeroth Z650 zorn #
BBC BASIC
<lang bbcbasic> DATA Ashcraft, Ashcroft, Gauss, Ghosh, Hilbert, Heilbronn, Lee, Lloyd
DATA Moses, Pfister, Robert, Rupert, Rubin, Tymczak, Soundex, Example FOR i% = 1 TO 16 READ name$ PRINT """" name$ """" TAB(15) FNsoundex(name$) NEXT END DEF FNsoundex(name$) LOCAL i%, n%, p%, n$, s$ name$ = FNupper(name$) n$ = "01230129022455012623019202" s$ = LEFT$(name$,1) p% = VALMID$(n$, ASCs$ - 64, 1) FOR i% = 2 TO LEN(name$) n% = VALMID$(n$, ASCMID$(name$,i%,1) - 64, 1) IF n% IF n% <> 9 IF n% <> p% s$ += STR$(n%) IF n% <> 9 p% = n% NEXT = LEFT$(s$ + "000", 4) DEF FNupper(A$) LOCAL A%,C% FOR A% = 1 TO LEN(A$) C% = ASCMID$(A$,A%) IF C% >= 97 IF C% <= 122 MID$(A$,A%,1) = CHR$(C%-32) NEXT = A$</lang>
Output:
"Ashcraft" A261 "Ashcroft" A261 "Gauss" G200 "Ghosh" G200 "Hilbert" H416 "Heilbronn" H416 "Lee" L000 "Lloyd" L300 "Moses" M220 "Pfister" P236 "Robert" R163 "Rupert" R163 "Rubin" R150 "Tymczak" T522 "Soundex" S532 "Example" E251
C
Some string examples and rules from [[1]]. <lang c>#include <stdio.h>
- include <stdlib.h>
- include <string.h>
- include <ctype.h>
/* for ASCII only */ static char code[128] = { 0 }; void add_code(const char *s, int c) { while (*s) { code[(int)*s] = code[0x20 ^ (int)*s] = c; s++; } }
void init() { static const char *cls[] = { "AEIOU", "", "BFPV", "CGJKQSXZ", "DT", "L", "MN", "R", 0}; int i; for (i = 0; cls[i]; i++) add_code(cls[i], i - 1); }
static char out[5]; /* returns a static buffer; user must copy if want to save
result across calls */
char* soundex(char *s) { int c, prev, i;
out[0] = out[4] = 0; if (!s || !*s) return out;
out[0] = *s++;
/* first letter, though not coded, can still affect next letter: Pfister */ prev = code[(int)out[0]]; for (i = 1; *s && i < 4; s++) { if ((c = code[(int)*s]) == prev) continue;
if (c == -1) prev = 0; /* vowel as separator */ else if (c > 0) { out[i++] = c + '0'; prev = c; } } while (i < 4) out[i++] = '0'; return out; }
int main() { int i; char *sdx, *names[][2] = { {"Soundex", "S532"}, {"Example", "E251"}, {"Sownteks", "S532"}, {"Ekzampul", "E251"}, {"Euler", "E460"}, {"Gauss", "G200"}, {"Hilbert", "H416"}, {"Knuth", "K530"}, {"Lloyd", "L300"}, {"Lukasiewicz", "L222"}, {"Ellery", "E460"}, {"Ghosh", "G200"}, {"Heilbronn", "H416"}, {"Kant", "K530"}, {"Ladd", "L300"}, {"Lissajous", "L222"}, {"Wheaton", "W350"}, {"Burroughs", "B620"}, {"Burrows", "B620"}, {"O'Hara", "O600"}, {"Washington", "W252"}, {"Lee", "L000"}, {"Gutierrez", "G362"}, {"Pfister", "P236"}, {"Jackson", "J250"}, {"Tymczak", "T522"}, {"VanDeusen", "V532"}, {"Ashcraft", "A261"}, {0, 0} };
init();
puts(" Test name Code Got\n----------------------"); for (i = 0; names[i][0]; i++) { sdx = soundex(names[i][0]); printf("%11s %s %s ", names[i][0], names[i][1], sdx); printf("%s\n", strcmp(sdx, names[i][1]) ? "not ok" : "ok"); }
return 0; }</lang>
C#
<lang c sharp>using System; using System.Collections.Generic; using System.Linq;
namespace Soundex {
internal static class Program { private static void Main() { var testWords = new TestWords { {"Soundex", "S532"}, {"Example", "E251"}, {"Sownteks", "S532"}, {"Ekzampul", "E251"}, {"Euler", "E460"}, {"Gauss", "G200"}, {"Hilbert", "H416"}, {"Knuth", "K530"}, {"Lloyd", "L300"}, {"Lukasiewicz", "L222"}, {"Ellery", "E460"}, {"Ghosh", "G200"}, {"Heilbronn", "H416"}, {"Kant", "K530"}, {"Ladd", "L300"}, {"Lissajous", "L222"}, {"Wheaton", "W350"}, {"Burroughs", "B620"}, {"Burrows", "B620"}, {"O'Hara", "O600"}, {"Washington", "W252"}, {"Lee", "L000"}, {"Gutierrez", "G362"}, {"Pfister", "P236"}, {"Jackson", "J250"}, {"Tymczak", "T522"}, {"VanDeusen", "V532"}, {"Ashcraft", "A261"} };
foreach (var testWord in testWords) Console.WriteLine("{0} -> {1} ({2})", testWord.Word.PadRight(11), testWord.ActualSoundex, (testWord.ExpectedSoundex == testWord.ActualSoundex)); }
// List<TestWord> wrapper to make declaration simpler. private class TestWords : List<TestWord> { public void Add(string word, string expectedSoundex) { Add(new TestWord(word, expectedSoundex)); } }
private class TestWord { public TestWord(string word, string expectedSoundex) { Word = word; ExpectedSoundex = expectedSoundex; ActualSoundex = Soundex(word); }
public string Word { get; private set; } public string ExpectedSoundex { get; private set; } public string ActualSoundex { get; private set; } }
private static string Soundex(string word) { const string soundexAlphabet = "0123012#02245501262301#202"; string soundexString = ""; char lastSoundexChar = '?'; word = word.ToUpper();
foreach (var c in from ch in word where ch >= 'A' && ch <= 'Z' && soundexString.Length < 4 select ch) { char thisSoundexChar = soundexAlphabet[c - 'A'];
if (soundexString.Length == 0) soundexString += c; else if (thisSoundexChar == '#') continue; else if (thisSoundexChar != '0' && thisSoundexChar != lastSoundexChar) soundexString += thisSoundexChar;
lastSoundexChar = thisSoundexChar; }
return soundexString.PadRight(4, '0'); } }
}</lang>
Sample Output
Soundex -> S532 (True) Example -> E251 (True) Sownteks -> S532 (True) Ekzampul -> E251 (True) Euler -> E460 (True) Gauss -> G200 (True) Hilbert -> H416 (True) Knuth -> K530 (True) Lloyd -> L300 (True) Lukasiewicz -> L222 (True) Ellery -> E460 (True) Ghosh -> G200 (True) Heilbronn -> H416 (True) Kant -> K530 (True) Ladd -> L300 (True) Lissajous -> L222 (True) Wheaton -> W350 (True) Burroughs -> B620 (True) Burrows -> B620 (True) O'Hara -> O600 (True) Washington -> W252 (True) Lee -> L000 (True) Gutierrez -> G362 (True) Pfister -> P236 (True) Jackson -> J250 (True) Tymczak -> T522 (True) VanDeusen -> V532 (True) Ashcraft -> A261 (True)
Clipper/XBase++
<lang Clipper/XBase++>FUNCTION Soundex(cWord)
/* This is a Clipper/XBase++ implementation of the standard American Soundex procedure.
*/
LOCAL cSoundex, i, nLast, cChar, nCode
cWord:=ALLTRIM(UPPER(cWord)) cSoundex:=LEFT(cWord, 1) // first letter is first char nLast:=-1 FOR i:=2 TO LEN(cWord)
cChar:=SUBSTR(cWord, i, 1) // get char nCode:=SoundexCode(cChar) // get soundex code for char IF nCode=0 // if 0, ignore LOOP ENDIF IF nCode#nLast // if not same code, add to soundex nLast:=nCode // and replace the last one cSoundex+=STR(nCode, 1) ENDIF
NEXT cSoundex:=PADR(cSoundex, 4, "0")
RETURN(cSoundex)
STATIC FUNCTION SoundexCode(cLetter) LOCAL aCodes:={"BFPV", "CGJKQSXZ", "DT", "L", "MN", "R"}, i, nRet:=0
FOR i:=1 TO LEN(aCodes)
IF cLetter $ aCodes[i] nRet:=i EXIT ENDIF
NEXT
RETURN(nRet)
FUNCTION SoundexDifference(cSound1, cSound2) LOCAL nMatch:=0, nLen1, nLen2, i
nLen1:=LEN(cSound1) nLen2:=LEN(cSound2)
// make the two words the same length. This is a safety. They both should be 4 characters long. IF nLen1 > nLen2
cSound2:=PADR(cSound2, nLen1-nLen2, "0")
ELSEIF nLen1 < nLen2
cSound1:=PADR(cSound1, nLen2-nLen1, "0")
ENDIF
// compare the corresponding characters between the two words FOR i:=1 TO LEN(cSound1)
IF SUBSTR(cSound1, i, 1) == SUBSTR(cSound2, i, 1) ++nMatch ENDIF
NEXT
RETURN(nMatch)
- </lang>
--Clippersolutions 23:14, 4 November 2010 (UTC)--Clippersolutions 23:14, 4 November 2010 (UTC)
Clojure
<lang Clojure>(defn get-code [c]
(case c (\B \F \P \V) 1 (\C \G \J \K \Q \S \X \Z) 2 (\D \T) 3 \L 4 (\M \N) 5 \R 6 nil)) ;(\A \E \I \O \U \H \W \Y)
(defn soundex [s]
(let [[f & s] (.toUpperCase s)] (-> (map get-code s)
distinct (concat , "0000") (->> (cons f ,) (remove nil? ,) (take 4 ,) (apply str ,)))))</lang>
Bug here? The distinct function eliminates duplicates. What is needed in Soundex is to eliminate consecutive duplicates.
Common Lisp
<lang lisp>(defun get-code (c)
(case c ((#\B #\F #\P #\V) #\1) ((#\C #\G #\J #\K #\Q #\S #\X #\Z) #\2) ((#\D #\T) #\3) (#\L #\4) ((#\M #\N) #\5) (#\R #\6)))
(defun soundex (s)
(if (zerop (length s)) "" (let* ((l (coerce (string-upcase s) 'list)) (o (list (first l)))) (loop for c in (rest l) for cg = (get-code c) and for cp = #\Z then cg when (and cg (not (eql cg cp))) do (push cg o) finally (return (subseq (coerce (nreverse `(#\0 #\0 #\0 ,@o)) 'string) 0 4))))))</lang>
D
Standard Version
The D standard library (Phobos) contains a soundex function: <lang d>import std.stdio: writeln; import std.string: soundex;
void main() {
assert(soundex("soundex") == "S532"); assert(soundex("example") == "E251"); assert(soundex("ciondecks") == "C532"); assert(soundex("ekzampul") == "E251"); assert(soundex("Robert") == "R163"); assert(soundex("Rupert") == "R163"); assert(soundex("Rubin") == "R150"); assert(soundex("Ashcraft") == "A261"); assert(soundex("Ashcroft") == "A261"); assert(soundex("Tymczak") == "T522");
}</lang> It works according to this document: http://www.archives.gov/publications/general-info-leaflets/55.html So soundex("Ashcraft") is A-261 instead of A-226.
Alternative Version
The following version uses the Wikipedia algorithm, it's long because it contains a ddoc text, design by contract (a long post-condition), sanity asserts, unittests and comments. A quite shorter version may be written that loses the safety net that's necessary in serious coding.
This version uses dynamic heap allocations in some places (replace, toupper, several string join) to allow a higher level style of coding, but this function may also be written to perform zero heap allocations. It may even return a char[4] by value, or use a given buffer like the C version.
<lang d>import std.string: toUpper, replace; import std.ascii: isUpper;
/***************************** Soundex is a phonetic algorithm for indexing names by sound, as pronounced in English. See: http://en.wikipedia.org/wiki/Soundex
- /
/*pure nothrow*/ string soundex(in string name) // Adapted from public domain Python code by Gregory Jorgensen: // http://code.activestate.com/recipes/52213/ out(result) { // postcondition
assert(result.length == 4); assert(result[0] == '0' || isUpper(result[0]));
if (name.length == 0) assert(result == "0000");
// this is too much fiddly int charCount = 0; foreach (dchar c; name) if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z')) charCount++; assert((charCount == 0) == (result == "0000"));
} body {
// digits holds the soundex values for the alphabet static immutable digits = "01230120022455012623010202"; string firstChar, result;
// translate alpha chars in name to soundex digits foreach (dchar c; name.toUpper()) { if (c >= 'A' && c <= 'Z') { if (!firstChar.length) firstChar ~= c; // remember first letter immutable char d = digits[c - 'A']; // duplicate consecutive soundex digits are skipped if (!result.length || d != result[$ - 1]) result ~= d; } }
// return 0000 if the name is empty if (!firstChar.length) return "0000";
// replace first digit with first alpha character assert(result.length > 0); result = firstChar ~ result[1 .. $];
// remove all 0s from the soundex code result = result.replace("0", ""); // not pure
// return soundex code padded to 4 zeros return (result ~ "0000")[0 .. 4];
} unittest { // tests of soundex()
auto tests = [["", "0000"], ["12346", "0000"], ["he", "H000"], ["soundex", "S532"], ["example", "E251"], ["ciondecks", "C532"], ["ekzampul", "E251"], ["résumé", "R250"], ["Robert", "R163"], ["Rupert", "R163"], ["Rubin", "R150"], ["Ashcraft", "A226"], ["Ashcroft", "A226"]]; foreach (pair; tests) assert(soundex(pair[0]) == pair[1]);
}
void main() {}</lang>
Delphi
<lang Delphi> program SoundexDemo;
{$APPTYPE CONSOLE}
uses
SysUtils, StrUtils;
begin
Writeln(Soundex('Soundex')); Writeln(Soundex('Example')); Writeln(Soundex('Sownteks')); Writeln(Soundex('Ekzampul')); Readln;
end. </lang> Output:
S532 E251 S532 E251
Erlang
This implements the US Census rules, where W and H are ignored but, unlike vowels, are not separators. <lang Erlang>-module(soundex). -export([soundex/1]).
soundex([]) ->
[];
soundex(Str) ->
[Head|Tail] = string:to_upper(Str), [Head | isoundex(Tail, [], todigit(Head))].
isoundex([], Acc, _) ->
case length(Acc) of
N when N == 3 -> lists:reverse(Acc); N when N < 3 -> isoundex([], [$0 | Acc], ignore); N when N > 3 -> isoundex([], lists:sublist(Acc, N-2, N), ignore)
end;
isoundex([Head|Tail], Acc, Lastn) ->
Dig = todigit(Head), case Dig of
Dig when Dig /= $0, Dig /= Lastn -> isoundex(Tail, [Dig | Acc], Dig); _ -> case Head of $H -> isoundex(Tail, Acc, Lastn); $W -> isoundex(Tail, Acc, Lastn); N when N >= $A, N =< $Z -> isoundex(Tail, Acc, Dig); _ -> isoundex(Tail, Acc, Lastn) % This clause handles non alpha characters end
end.
todigit(Chr) ->
Digits = "01230120022455012623010202", HeadOff = Chr - $A + 1, case HeadOff of
N when N > 0, N < 27 -> lists:nth(HeadOff, Digits); _ -> % Treat non alpha characters as a vowel $0
end.
</lang>
Forth
This implements the US Census rules, where W and H are ignored but, unlike vowels, aren't separators. Further corner cases welcome...
<lang forth>: alpha-table create does> swap 32 or [char] a - 0 max 26 min + 1+ c@ ;
alpha-table soundex-code
," 123 12. 22455 12623 1.2 2 " \ ABCDEFGHIJKLMNOPQRSTUVWXYZ
- soundex ( name len -- pad len )
over c@ pad c! \ First character verbatim pad 1+ 3 [char] 0 fill \ Pad to four characters with zeros 1 pad c@ soundex-code ( count code ) 2swap bounds do i c@ soundex-code ( count code next ) 2dup = if drop else \ runs are ignored dup [char] . = if drop else \ W, H don't separate runs of consonants dup bl = if nip else \ vowels separate consonants but aren't coded nip 2dup swap pad + c! swap 1+ tuck 4 = if leave then then then then loop 2drop pad 4 ;
\ Knuth's test cases s" Euler" soundex cr type \ E460 s" Gauss" soundex cr type \ G200 s" Hilbert" soundex cr type \ H416 s" Knuth" soundex cr type \ K530 s" Lloyd" soundex cr type \ L300 s" Lukasiewicz" soundex cr type \ L222 (W test) s" Ellery" soundex cr type \ E460 s" Ghosh" soundex cr type \ G200 s" Heilbronn" soundex cr type \ H416 s" Kant" soundex cr type \ K530 s" Ladd" soundex cr type \ L300 s" Lissajous" soundex cr type \ L222
s" Wheaton" soundex cr type \ W350 s" Ashcraft" soundex cr type \ A261 (H tests) s" Burroughs" soundex cr type \ B620 s" Burrows" soundex cr type \ B620 (W test) (any Welsh names?) s" O'Hara" soundex cr type \ O600 (punctuation test)</lang>
Go
WP article rules, plus my interpretation for input validation. <lang go>package main
import (
"errors" "fmt" "unicode"
)
var code = []byte("01230127022455012623017202")
func soundex(s string) (string, error) {
var sx [4]byte var sxi int var cx, lastCode byte for i, c := range s { switch { case !unicode.IsLetter(c): if c < ' ' || c == 127 { return "", errors.New("ASCII control characters disallowed") } if i == 0 { return "", errors.New("initial character must be a letter") } lastCode = '0' continue case c >= 'A' && c <= 'Z': cx = byte(c - 'A') case c >= 'a' && c <= 'z': cx = byte(c - 'a') default: return "", errors.New("non-ASCII letters unsupported") } // cx is valid letter index at this point if i == 0 { sx[0] = cx + 'A' sxi = 1 continue } switch x := code[cx]; x { case '7', lastCode: case '0': lastCode = '0' default: sx[sxi] = x if sxi == 3 { return string(sx[:]), nil } sxi++ lastCode = x } } if sxi == 0 { return "", errors.New("no letters present") } for ; sxi < 4; sxi++ { sx[sxi] = '0' } return string(sx[:]), nil
}
func main() {
for _, s := range []string{ "Robert", // WP test case = R163 "Rupert", // WP test case = R163 "Rubin", // WP test case = R150 "ashcroft", // WP test case = A261 "ashcraft", // s and c combine across h, t not needed "moses", // s's don't combine across e "O'Mally", // apostrophe allowed, adjacent ll's combine "d jay", // spaces allowed "R2-D2", // digits, hyphen allowed "12p2", // just not in leading position "naïve", // non ASCII disallowed "", // empty string disallowed "bump\t", // ASCII control characters disallowed } { if x, err := soundex(s); err == nil { fmt.Println("soundex", s, "=", x) } else { fmt.Printf("\"%s\" fail. %s\n", s, err) } }
}</lang> Output:
soundex Robert = R163 soundex Rupert = R163 soundex Rubin = R150 soundex ashcroft = A261 soundex ashcraft = A261 soundex moses = M220 soundex O'Mally = O540 soundex d jay = D200 soundex R2-D2 = R300 "12p2" fail. initial character must be a letter "naïve" fail. non-ASCII letters unsupported "" fail. no letters present "bump " fail. ASCII control characters disallowed
Groovy
<lang groovy> def soundex(s) {
def code = "" def lookup = [ B : 1, F : 1, P : 1, V : 1, C : 2, G : 2, J : 2, K : 2, Q : 2, S : 2, X : 2, Z : 2, D : 3, T : 3, L : 4, M : 5, N : 5, R : 6 ] s[1..-1].toUpperCase().inject(7) { lastCode, letter -> def letterCode = lookup[letter] if(letterCode && letterCode != lastCode) { code += letterCode } } return "${s[0]}${code}0000"[0..3]
}
println(soundex("Soundex")) println(soundex("Sownteks")) println(soundex("Example")) println(soundex("Ekzampul")) </lang>
Haskell
<lang haskell>import Text.PhoneticCode.Soundex import Control.Arrow</lang> Example: <lang haskell>*Main> mapM_ print $ map (id &&& soundexSimple) ["Soundex", "Example", "Sownteks", "Ekzampul"] ("Soundex","S532") ("Example","E251") ("Sownteks","S532") ("Ekzampul","E251")</lang>
Icon and Unicon
<lang icon>procedure main(arglist) # computes soundex of each argument every write(x := !arglist, " => ",soundex(x)) end
procedure soundex(name)
local dig,i,x static con initial { # construct mapping x[i] => i all else . x := ["bfpv","cgjkqsxz","dt","l","mn","r"] every ( dig := con := "") ||:= repl(i := 1 to *x,*x[i]) do con ||:= x[i] con := map(map(&lcase,con,dig),&lcase,repl(".",*&lcase)) } name := map(name) # lower case name[1] := map(name[1],&lcase,&ucase) # upper case 1st name := map(name,&lcase,con) # map cons every x := !"123456" do while name[find(x||x,name)+:2] := x # kill duplicates while name[upto('.',name)] := "" # kill . return left(name,4,"0")
end</lang>
implements soundex. The above version is an adaptation of that procedure
J
Solution <lang j>removeDups =: {.;.1~ (1 , }. ~: }: ) codes =: ;: 'BFPV CGJKQSXZ DT L MN R HW'
soundex =: 3 : 0
if. 0=# k=.toupper y do. '0' return. end. ({.k), ,": ,. 3 {. 0-.~ }. removeDups 7 0:`(I.@:=)`]} , k >:@I.@:(e. &>)"0 _ codes
)</lang> Usage <lang j>names=: 'Lloyd Woolcock Donnell Baragwanath Williams Ashcroft Euler Ellery Gauss Ghosh Hilbert Heilbronn Knuth Kant Ladd Lukasiewicz Lissajous' soundexNames=: 'L300 W422 D540 B625 W452 A226 E460 E460 G200 G200 H416 H416 K530 K530 L300 L222 L222'
soundex &> ;: names
L300 W422 D540 B625 W452 ....</lang> Test <lang j> soundexNames-:(soundex &.>) &. ;: names 1</lang>
Java
<lang java>public static void main(String[] args){
System.out.println(soundex("Soundex")); System.out.println(soundex("Example")); System.out.println(soundex("Sownteks")); System.out.println(soundex("Ekzampul")); }
private static String getCode(char c){
switch(c){ case 'B': case 'F': case 'P': case 'V': return "1"; case 'C': case 'G': case 'J': case 'K': case 'Q': case 'S': case 'X': case 'Z': return "2"; case 'D': case 'T': return "3"; case 'L': return "4"; case 'M': case 'N': return "5"; case 'R': return "6"; default: return ""; }
}
public static String soundex(String s){
String code, previous, soundex; code = s.toUpperCase().charAt(0) + ""; previous = "7"; for(int i = 1;i < s.length();i++){ String current = getCode(s.toUpperCase().charAt(i)); if(current.length() > 0 && !current.equals(previous)){ code = code + current; } previous = current; } soundex = (code + "0000").substring(0, 4); return soundex;
}</lang> Output:
S532 E251 S532 E251
JavaScript
<lang javascript>var soundex = function (s) {
var a = s.toLowerCase().split() f = a.shift(), r = , codes = { a: , e: , i: , o: , u: , b: 1, f: 1, p: 1, v: 1, c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2, d: 3, t: 3, l: 4, m: 5, n: 5, r: 6 }; r = f + a .map(function (v, i, a) { return codes[v] }) .filter(function (v, i, a) { return ((i === 0) ? v !== codes[f] : v !== a[i - 1]); }) .join(); return (r + '000').slice(0, 4).toUpperCase();
};
var tests = {
"Soundex": "S532", "Example": "E251", "Sownteks": "S532", "Ekzampul": "E251", "Euler": "E460", "Gauss": "G200", "Hilbert": "H416", "Knuth": "K530", "Lloyd": "L300", "Lukasiewicz": "L222", "Ellery": "E460", "Ghosh": "G200", "Heilbronn": "H416", "Kant": "K530", "Ladd": "L300", "Lissajous": "L222", "Wheaton": "W350", "Ashcraft": "A226", "Burroughs": "B622", "Burrows": "B620", "O'Hara": "O600" };
for (var i in tests)
if (tests.hasOwnProperty(i)) { console.log( i + ' \t' + tests[i] + '\t' + soundex(i) + '\t' + (soundex(i) === tests[i]) );
}
// Soundex S532 S532 true // Example E251 E251 true // Sownteks S532 S532 true // Ekzampul E251 E251 true // Euler E460 E460 true // Gauss G200 G200 true // Hilbert H416 H416 true // Knuth K530 K530 true // Lloyd L300 L300 true // Lukasiewicz L222 L222 true // Ellery E460 E460 true // Ghosh G200 G200 true // Heilbronn H416 H416 true // Kant K530 K530 true // Ladd L300 L300 true // Lissajous L222 L222 true // Wheaton W350 W350 true // Ashcraft A226 A226 true // Burroughs B622 B622 true // Burrows B620 B620 true // O'Hara O600 O600 true</lang>
Mathematica
<lang Mathematica>Soundex[ input_ ] := Module[{x = input, head, body}, {head, body} = {First@#, Rest@#}&@ToLowerCase@Characters@x; body = (Select[body, FreeQ[Characters["aeiouyhw"],#]&] /. {("b"|"f"|"p"|"v")->1, ("c"|"g"|"j"|"k"|"q"|"s"|"x"|"z")->2, ("d"|"t")->3,"l"->4 ,("m"|"n")->5, "r"->6}); If[Length[body] < 3,
body = PadRight[body, 3], body = DeleteDuplicates[body]
]; StringJoin @@ ToString /@ PrependTo[ body1 ;; 3, ToUpperCase@head]]</lang> Example usage:
Map[Soundex,{"Soundex", "Sownteks", "Example", "Ekzampul"}] -> {S532, S532, E251, E251}
MUMPS
<lang MUMPS>SOUNDEX(X,NARA=0)
;Converts a string to its Soundex value. ;Empty strings return "0000". Non-alphabetic ASCII characters are ignored. ;X is the name to be converted to Soundex ;NARA is a flag, defaulting to zero, for which implementation to perform. ;If NARA is 0, do what seems to be the Knuth implementation ;If NARA is a positive integer, do the NARA implementation. ; This varies the soundex rule for "W" and "H", and adds variants for prefixed names separated by carets. ; http://www.archives.gov/publications/general-info-leaflets/55-census.html ;Y is the string to be returned ;UP is the list of upper case letters ;LO is the list of lower case letters ;PREFIX is a list of prefixes to be stripped off ;X1 is the upper case version of X ;X2 is the name without a prefix ;Y2 is the soundex of a name without a prefix ;C is a loop variable ;DX is a list of Soundex values, in alphabetical order. Underscores are used for the NARA variation letters ;XD is a partially processed translation of X into soundex values NEW Y,UP,LO,PREFIX,X1,X2,Y2,C,DX,XD SET UP="ABCDEFGHIJKLMNOPQRSTUVWXYZ" ;Upper case characters SET LO="abcdefghijklmnopqrstuvwxyz" ;Lower case characters SET DX=" 123 12_ 22455 12623 1_2 2" ;Soundex values SET PREFIX="VAN^CO^DE^LA^LE" ;Prefixes that could create an alternate soundex value SET Y="" ;Y is the value to be returned SET X1=$TRANSLATE(X,LO,UP) ;Make local copy, and force all letters to be upper case SET XD=$TRANSLATE(X1,UP,DX) ;Soundex values for string ; SET Y=$EXTRACT(X1,1,1) ;Get first character FOR C=2:1:$LENGTH(X1) QUIT:$L(Y)>=4 DO . ;ignore doubled letters OR and side-by-side soundex values OR same soundex on either side of "H" or "W" . QUIT:($EXTRACT(X1,C,C)=$EXTRACT(X1,C-1,C-1)) . QUIT:($EXTRACT(XD,C,C)=$EXTRACT(XD,C-1,C-1)) . ;ignore non-alphabetic characters . QUIT:UP'[($EXTRACT(X1,C,C)) . QUIT:NARA&(($EXTRACT(XD,C-1,C-1)="_")&(C>2))&($EXTRACT(XD,C,C)=$EXTRACT(XD,C-2,C-2)) . QUIT:" _"[$EXTRACT(XD,C,C) . SET Y=Y_$EXTRACT(XD,C,C) ; Pad with "0" so string length is 4 IF $LENGTH(Y)<4 FOR C=$L(Y):1:3 SET Y=Y_"0" IF NARA DO . FOR C=1:1:$LENGTH(PREFIX,"^") DO . . IF $EXTRACT(X1,1,$LENGTH($PIECE(PREFIX,"^",C)))=$PIECE(PREFIX,"^",C) DO . . . ;Take off the prefix, and any leading spaces . . . SET X2=$EXTRACT(X1,$LENGTH($PIECE(PREFIX,"^",C))+1,$LENGTH(X1)-$PIECE(PREFIX,"^",C)) FOR QUIT:UP[$E(X2,1,1) SET X2=$E(X2,2,$L(X2)) . . . SET Y2=$$SOUNDEX(X2,NARA) SET Y=Y_"^"_Y2 KILL UP,LO,PREFIX,X1,X2,Y2,C,DX,XD QUIT Y
</lang>
Examples:
USER>W $$SOUNDEX^SOUNDEX("") 0000 USER>W $$SOUNDEX^SOUNDEX("ASHCROFT") A226 USER>W $$SOUNDEX^SOUNDEX("ASHCROFT",1) A261 USER>W $$SOUNDEX^SOUNDEX("EULER") E460 USER>W $$SOUNDEX^SOUNDEX("O'HARA") O600 USER>W $$SOUNDEX^SOUNDEX("naïve") N100 USER>W $$SOUNDEX^SOUNDEX("Moses") M220 USER>W $$SOUNDEX^SOUNDEX("Omalley") O540 USER>W $$SOUNDEX^SOUNDEX("O'Malley") O540 USER>W $$SOUNDEX^SOUNDEX("Delarosa") D462 USER>W $$SOUNDEX^SOUNDEX("Delarosa",1) D462^L620^R200 USER>W $$SOUNDEX^SOUNDEX("De la Rosa") D462 USER>W $$SOUNDEX^SOUNDEX("de la Rosa",1) D462^L620^R200 USER>W $$SOUNDEX^SOUNDEX("Van de Graaff") V532 USER>W $$SOUNDEX^SOUNDEX("Van de Graaff",1) V532^D261^G610
There's just one small problem...
USER>W $$SOUNDEX^SOUNDEX("fish") F200 USER>W $$SOUNDEX^SOUNDEX("ghoti") G300
OCaml
Here is an implementation:
<lang ocaml>let c2d = function
| 'B' | 'F' | 'P' | 'V' -> "1" | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> "2" | 'D' | 'T' -> "3" | 'L' -> "4" | 'M' | 'N' -> "5" | 'R' -> "6" | _ -> ""
let rec dbl acc = function
| [] -> (List.rev acc) | [c] -> List.rev(c::acc) | c1::(c2::_ as tl) -> if c1 = c2 then dbl acc tl else dbl (c1::acc) tl
let pad s =
match String.length s with | 0 -> s ^ "000" | 1 -> s ^ "00" | 2 -> s ^ "0" | 3 -> s | _ -> String.sub s 0 3
let soundex_aux rem =
pad(String.concat "" (dbl [] (List.map c2d rem)))
let soundex s =
let s = String.uppercase s in let cl = ref [] in String.iter (fun c -> cl := c :: !cl) s; match dbl [] (List.rev !cl) with | c::rem -> (String.make 1 c) ^ (soundex_aux rem) | [] -> invalid_arg "soundex"</lang>
Test our implementation:
<lang ocaml>let tests = [
"Soundex", "S532"; "Example", "E251"; "Sownteks", "S532"; "Ekzampul", "E251"; "Euler", "E460"; "Gauss", "G200"; "Hilbert", "H416"; "Knuth", "K530"; "Lloyd", "L300"; "Lukasiewicz", "L222"; "Ellery", "E460"; "Ghosh", "G200"; "Heilbronn", "H416"; "Kant", "K530"; "Ladd", "L300"; "Lissajous", "L222"; "Wheaton", "W350"; "Ashcraft", "A226"; "Burroughs", "B622"; "Burrows", "B620"; "O'Hara", "O600"; ]
let () =
print_endline " Word \t Code Found Status"; List.iter (fun (word, code1) -> let code2 = soundex word in let status = if code1 = code2 then "OK " else "Arg" in Printf.printf " \"%s\" \t %s %s %s\n" word code1 code2 status ) tests</lang>
This test outputs:
Word Code Found Status "Soundex" S532 S532 OK "Example" E251 E251 OK "Sownteks" S532 S532 OK "Ekzampul" E251 E251 OK "Euler" E460 E460 OK "Gauss" G200 G200 OK "Hilbert" H416 H416 OK "Knuth" K530 K530 OK "Lloyd" L300 L300 OK "Lukasiewicz" L222 L222 OK "Ellery" E460 E460 OK "Ghosh" G200 G200 OK "Heilbronn" H416 H416 OK "Kant" K530 K530 OK "Ladd" L300 L300 OK "Lissajous" L222 L222 OK "Wheaton" W350 W350 OK "Ashcraft" A226 A226 OK "Burroughs" B622 B622 OK "Burrows" B620 B620 OK "O'Hara" O600 O600 OK
See Soundex/OCaml for a version that can switch the language (English, French...) with a type which definition is hidden in the interface.
Perl
The Text::Soundex core module supports various soundex algorithms. <lang perl>use Text::Soundex; print soundex("Soundex"), "\n"; # S532 print soundex("Example"), "\n"; # E251 print soundex("Sownteks"), "\n"; # S532 print soundex("Ekzampul"), "\n"; # E251</lang>
Perl 6
US census algorithm, so "Ashcraft" and "Burroughs" adjusted to match. We fake up a first consonant in some cases to make up for the fact that we always trim the first numeric code (so that the 'l' of 'Lloyd' is properly deleted). <lang perl6>sub soundex ($name --> Str) {
my $first = substr($name,0,1).uc; gather { take $first; my $fakefirst = ; $fakefirst = "de " if $first ~~ /^ <[AEIOUWH]> /; "$fakefirst$name".lc.trans('wh' => ) ~~ / ^ [ [ | <[ bfpv ]>+ { take 1 } | <[ cgjkqsxz ]>+ { take 2 } | <[ dt ]>+ { take 3 } | <[ l ]>+ { take 4 } | <[ mn ]>+ { take 5 } | <[ r ]>+ { take 6 } ] || . ]+ $ { take 0,0,0 } /; }.flat.[0,2,3,4].join;
}
for < Soundex S532
Example E251 Sownteks S532 Ekzampul E251 Euler E460 Gauss G200 Hilbert H416 Knuth K530 Lloyd L300 Lukasiewicz L222 Ellery E460 Ghosh G200 Heilbronn H416 Kant K530 Ladd L300 Lissajous L222 Wheaton W350 Ashcraft A261 Burroughs B620 Burrows B620 O'Hara O600 >
-> $n, $s {
my $s2 = soundex($n); say $n.fmt("%16s "), $s, $s eq $s2 ?? " OK" !! " NOT OK $s2";
}</lang> Output:
Soundex S532 OK Example E251 OK Sownteks S532 OK Ekzampul E251 OK Euler E460 OK Gauss G200 OK Hilbert H416 OK Knuth K530 OK Lloyd L300 OK Lukasiewicz L222 OK Ellery E460 OK Ghosh G200 OK Heilbronn H416 OK Kant K530 OK Ladd L300 OK Lissajous L222 OK Wheaton W350 OK Ashcraft A261 OK Burroughs B620 OK Burrows B620 OK O'Hara O600 OK
PHP
PHP already has a built-in soundex() function: <lang php><?php echo soundex("Soundex"), "\n"; // S532 echo soundex("Example"), "\n"; // E251 echo soundex("Sownteks"), "\n"; // S532 echo soundex("Ekzampul"), "\n"; // E251 ?></lang>
PicoLisp
<lang PicoLisp>(de soundex (Str)
(pack (pad -4 (cons (uppc (char (char Str))) (head 3 (let Last NIL (extract '((C) (and (setq C (case (uppc C) (`(chop "BFPV") "1") (`(chop "CGJKQSXZ") "2") (("D" "T") "3") ("L" "4") (("M" "N") "5") ("R" "6") ) ) (<> Last C) (setq Last C) ) ) (cdr (chop Str)) ) ) ) ) ) ) )</lang>
Output:
: (mapcar soundex '("Soundex" "Example" "Sownteks" "Ekzampul")) -> ("S532" "E251" "S532" "E251")
PL/I
<lang PL/I>Soundex: procedure (pword) returns (character(4));
declare pword character (*) varying, value character (length(pword)) varying; declare word character (length(pword)); declare (prevCode, currCode) character (1); declare alphabet CHARACTER (26) STATIC INITIAL ('AEIOUHWYBFPVCGJKQSXZDTLMNR'); declare replace character (26) static initial ('00000000111122222222334556'); declare i fixed binary;
word = pword;
/* Buffer to build up with character codes */ value = ;
/* Make sure the word is at least two characters in length. */ if length(word) <= 1 then return (word);
word = uppercase(word); /* Convert to uppercase. */
/* The current and previous character codes */ prevCode = '0';
value = substr(word, 1, 1); /* The first character is unchanged. */
word = Translate (word, replace, alphabet);
/* Loop through the remaining characters ... */ do i = 2 to length(word); currCode = substr(word, i, 1); /* Check to see if the current code is the same as the last one */ if currCode ^= prevCode & currCode ^= '0' then /* If the current code is a vowel, ignore it. */ value = value || currCode; /* Set the new previous character code */ prevCode = currCode; end; /* of do i = ... */
return ( left(value, 4, '0') ); /* Pad, if necessary. */
end Soundex;</lang>
PureBasic
<lang PureBasic>Procedure.s getCode(c.s)
Protected getCode.s = "" If FindString("BFPV", c ,1) : getCode = "1" : EndIf If FindString("CGJKQSXZ", c ,1) : getCode = "2" : EndIf If FindString("DT", c ,1) : getCode = "3" : EndIf If "L" = c : getCode = "4" : EndIf If FindString("MN", c ,1) : getCode = "5" : EndIf If "R" = c : getCode = "6" : EndIf If FindString("HW", c ,1) : getCode = "." : EndIf ProcedureReturn getCode
EndProcedure
Procedure.s soundex(word.s)
Protected.s previous.s = "" , code.s , current , soundex Protected.i i word = UCase(word) code = Mid(word,1,1) previous = "" For i = 2 To (Len(word) + 1) current = getCode(Mid(word, i, 1)) If current = "." : Continue : EndIf If Len(current) > 0 And current <> previous code + current EndIf previous = current If Len(code) = 4 Break EndIf Next If Len(code) < 4 code = LSet(code, 4,"0") EndIf ProcedureReturn code
EndProcedure
OpenConsole()
PrintN (soundex("Lukasiewicz")) PrintN("Press any key to exit"): Repeat: Until Inkey() <> ""</lang>
Python
<lang python>from itertools import groupby
def soundex(word):
codes = ("bfpv","cgjkqsxz", "dt", "l", "mn", "r") soundDict = dict((ch, str(ix+1)) for ix,cod in enumerate(codes) for ch in cod) cmap2 = lambda kar: soundDict.get(kar, '9') sdx = .join(cmap2(kar) for kar in word.lower()) sdx2 = word[0].upper() + .join(k for k,g in list(groupby(sdx))[1:] if k!='9') sdx3 = sdx2[0:4].ljust(4,'0') return sdx3
</lang> Example Output <lang Python>>>>print soundex("soundex") S532 >>>print soundex("example") E251 >>>print soundex("ciondecks") C532 >>>print soundex("ekzampul") E251</lang>
REXX
Some assumptions made:
- rules are from the algorithm for the American Soundex.
- rules were taken from the Wikipedia article: http://en.wikipedia.org/wiki/Soundex
- multiple words (like Van de Graaf) are treated as one word.
- anything that's not a letter of the Latin alphabet is ignored.
- words starting with a non-letter are processed.
- letters of the ASCII-extended character set are ignored.
- ASCII-extended characters (ÇüéâäàåçêëèïîìÄÅÉæÆôöòûùÿÖÜíóúñÑ) could be added to the program easily.
<lang rexx>/*REXX program demonstrates Soundex codes from some words | commandLine.*/ _=; @.= parse arg @.0 . /*allow input from command line. */
@.1 ='12346' ; #.1 ='0000' @.4 ='4-H' ; #.4 ='H000' @.11 ='Ashcraft' ; #.11 ='A261' @.12 ='Ashcroft' ; #.12 ='A261' @.18 ='auerbach' ; #.18 ='A612' @.20 ='Baragwanath' ; #.20 ='B625' @.22 ='bar' ; #.22 ='B600' @.23 ='barre' ; #.23 ='B600' @.20 ='Baragwanath' ; #.20 ='B625' @.28 ='Burroughs' ; #.28 ='B620' @.29 ='Burrows' ; #.29 ='B620' @.30 ='C.I.A.' ; #.30 ='C000' @.37 ='coöp' ; #.37 ='C100' @.43 ='D-day' ; #.43 ='D000' @.44 ='d jay' ; #.44 ='D200' @.45 ='de la Rosa' ; #.45 ='D462' @.46 ='Donnell' ; #.46 ='D540' @.47 ='Dracula' ; #.47 ='D624' @.48 ='Drakula' ; #.48 ='D624' @.49 ='Du Pont' ; #.49 ='D153' @.50 ='Ekzampul' ; #.50 ='E251' @.51 ='example' ; #.51 ='E251' @.55 ='Ellery' ; #.55 ='E460' @.59 ='Euler' ; #.59 ='E460' @.60 ='F.B.I.' ; #.60 ='F000' @.70 ='Gauss' ; #.70 ='G200' @.71 ='Ghosh' ; #.71 ='G200' @.72 ='Gutierrez' ; #.72 ='G362' @.80 ='he' ; #.80 ='H000' @.81 ='Heilbronn' ; #.81 ='H416' @.84 ='Hilbert' ; #.84 ='H416' @.100='Jackson' ; #.100='J250' @.104='Johnny' ; #.104='J500' @.105='Jonny' ; #.105='J500' @.110='Kant' ; #.110='K530' @.116='Knuth' ; #.116='K530' @.120='Ladd' ; #.120='L300' @.124='Llyod' ; #.124='L300' @.125='Lee' ; #.125='L000' @.126='Lissajous' ; #.126='L222' @.128='Lukasiewicz' ; #.128='L222' @.130='naïve' ; #.130='N100' @.141='Miller' ; #.141='M460' @.143='Moses' ; #.143='M220' @.146='Moskowitz' ; #.146='M232' @.147='Moskovitz' ; #.147='M213' @.150="O'Conner" ; #.150='O256' @.151="O'Connor" ; #.151='O256' @.152="O'Hara" ; #.152='O600' @.153="O'Mally" ; #.153='O540' @.161='Peters' ; #.161='P362' @.162='Peterson' ; #.162='P362' @.165='Pfister' ; #.165='P236' @.180='R2-D2' ; #.180='R300' @.182='rÄ≈sumÅ∙' ; #.182='R250' @.184='Robert' ; #.184='R163' @.185='Rupert' ; #.185='R163' @.187='Rubin' ; #.187='R150' @.191='Soundex' ; #.191='S532' @.192='sownteks' ; #.192='S532' @.199='Swhgler' ; #.199='S460' @.202="'til" ; #.202='T400' @.208='Tymczak' ; #.208='T522' @.216='Uhrbach' ; #.216='U612' @.221='Van de Graaff' ; #.221='V532' @.222='VanDeusen' ; #.222='V532' @.230='Washington' ; #.230='W252' @.233='Wheaton' ; #.233='W350' @.234='Williams' ; #.234='W452' @.236='Woolcock' ; #.236='W422'
do k=0 to 300; if @.k== then iterate; $=soundex(@.k) say word('nope [ok]',1+($==#.k|k==0)) _ $ 'is the Soundex for' @.k if k==0 then leave end
exit /*───────────────────────────────────SOUNDEX subroutine─────────────────*/ soundex: procedure; arg thing /*ARG automatically uppercases it*/ old_alphabet = 'AEIOUYHWBFPVCGJKQSXZDTLMNR' new_alphabet = '@@@@@@**111122222222334556' word=
do k=1 for length(thing) /*handle special chars: - ' _ etc*/ _=substr(thing,k,1) if datatype(_,'M') then word=word || _ /*it's a letter, then OK*/ end
value=strip(left(word,1)) /*first character is left alone. */ word=translate(word, new_alphabet, old_alphabet) prev=translate(value,new_alphabet, old_alphabet) /*the previous code.*/
do j=2 to length(word) /*process remainder of the word. */ ?=substr(word,j,1) if ?\==prev & datatype(?,'W') then do; value=value || ?; prev=?; end else if ?=='@' then prev=? end /*j*/
return left(value,4,0) /*return padded value with zeroes*/</lang> output when using the default input
[ok] 0000 is the Soundex for 12346 [ok] H000 is the Soundex for 4-H [ok] A261 is the Soundex for Ashcraft [ok] A261 is the Soundex for Ashcroft [ok] A612 is the Soundex for auerbach [ok] B625 is the Soundex for Baragwanath [ok] B600 is the Soundex for bar [ok] B600 is the Soundex for barre [ok] B620 is the Soundex for Burroughs [ok] B620 is the Soundex for Burrows [ok] C000 is the Soundex for C.I.A. [ok] C100 is the Soundex for coöp [ok] D000 is the Soundex for d-day [ok] D200 is the Soundex for d jay [ok] D462 is the Soundex for de la Rosa [ok] D540 is the Soundex for Donnell [ok] D624 is the Soundex for Dracula [ok] D624 is the Soundex for Drakula [ok] D153 is the Soundex for Du Pont [ok] E251 is the Soundex for Ekzampul [ok] E251 is the Soundex for example [ok] E460 is the Soundex for Ellery [ok] E460 is the Soundex for Euler [ok] F000 is the Soundex for F.B.I. [ok] G200 is the Soundex for Gauss [ok] G200 is the Soundex for Ghosh [ok] G362 is the Soundex for Gutierrez [ok] H000 is the Soundex for he [ok] H416 is the Soundex for Heilbronn [ok] H416 is the Soundex for Hilbert [ok] J250 is the Soundex for Jackson [ok] J500 is the Soundex for Johnny [ok] J500 is the Soundex for Jonny [ok] K530 is the Soundex for Kant [ok] K530 is the Soundex for Knuth [ok] L300 is the Soundex for Ladd [ok] L300 is the Soundex for Llyod [ok] L000 is the Soundex for Lee [ok] L222 is the Soundex for Lissajous [ok] L222 is the Soundex for Lukasiewicz [ok] N100 is the Soundex for naïve [ok] M460 is the Soundex for Miller [ok] M220 is the Soundex for Moses [ok] M232 is the Soundex for Moskowitz [ok] M213 is the Soundex for Moskovitz [ok] O256 is the Soundex for O'Conner [ok] O256 is the Soundex for O'Connor [ok] O600 is the Soundex for O'Hara [ok] O540 is the Soundex for O'Mally [ok] P362 is the Soundex for Peters [ok] P362 is the Soundex for Peterson [ok] P236 is the Soundex for Pfister [ok] R300 is the Soundex for R2-D2 [ok] R250 is the Soundex for rÄ≈sumÅ∙ [ok] R163 is the Soundex for Robert [ok] R163 is the Soundex for Rupert [ok] R150 is the Soundex for Rubin [ok] S532 is the Soundex for Soundex [ok] S532 is the Soundex for sownteks [ok] S460 is the Soundex for Swhgler [ok] T400 is the Soundex for 'til [ok] T522 is the Soundex for Tymczak [ok] U612 is the Soundex for Uhrbach [ok] V532 is the Soundex for Van de Graaff [ok] V532 is the Soundex for VanDeusen [ok] W252 is the Soundex for Washington [ok] W350 is the Soundex for Wheaton [ok] W452 is the Soundex for Williams [ok] W422 is the Soundex for Woolcock
Ruby
Courtesy http://snippets.dzone.com/posts/show/4530 <lang ruby>class String
SoundexChars = 'BFPVCGJKQSXZDTLMNR' SoundexNums = '111122222222334556' SoundexCharsEx = '^' + SoundexChars SoundexCharsDel = '^A-Z'
# desc: http://en.wikipedia.org/wiki/Soundex def soundex(census = true) str = self.upcase.delete(SoundexCharsDel) str[0,1] + str[1..-1].delete(SoundexCharsEx). tr_s(SoundexChars, SoundexNums)\ [0 .. (census ? 2 : -1)]. ljust(3, '0') rescue end
def sounds_like(other) self.soundex == other.soundex end
end
%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |word1, word2|
[word1, word2].each {|word| puts '%-8s -> %s' % [word, word.soundex]}
print "'#{word1}' " print word1.sounds_like(word2) ? "sounds" : "does not sound" print " like '#{word2}'\n"
end</lang>
Soundex -> S532 Sownteks -> S532 'Soundex' sounds like 'Sownteks' Example -> E251 Ekzampul -> E251 'Example' sounds like 'Ekzampul' foo -> F000 bar -> B600 'foo' does not sound like 'bar'
Run BASIC
Courtesy http://dkokenge.com/rbp <lang runbasic>global val$ val$(1) = "BPFV" val$(2) = "CSGJKQXZ" val$(3) = "DT" val$(4) = "L" val$(5) = "MN" val$(6) = "R"
' --------------------------------- ' show soundex on these words ' --------------------------------- w$(1) = "Robert" 'R163 w$(2) = "Rupert" 'R163 w$(3) = "Rubin" 'R150 w$(4) = "moses" 'M220 w$(5) = "O'Mally" 'O540 w$(6) = "d jay" 'D200
for i = 1 to 6
print w$(i);" soundex:";soundex$(w$(i))
next i wait
' --------------------------------- ' Return soundex of word ' --------------------------------- function soundex$(a$) a$ = upper$(a$) for i = 2 to len(a$)
theLtr$ = mid$(a$,i,1) s$ = "0" if instr("AEIOUYHW |",theLtr$) <> 0 then s$ = "" if theLtr$ <> preLtr$ then for j = 1 to 6 if instr(val$(j),theLtr$) <> 0 then s$ = str$(j) next j end if sdx$ = sdx$ + s$ preLtr$ = theLtr$
next i soundex$ = left$(a$,1) + left$(sdx$;"000",3) end function</lang>
Robert soundex:R163 Rupert soundex:R163 Rubin soundex:R150 moses soundex:M220 O'Mally soundex:O054 d jay soundex:D200
Scala
<lang scala>def soundex(s:String)={
var code=s.head.toUpper.toString var previous=getCode(code.head) for(ch <- s.drop(1); current=getCode(ch.toUpper)){ if (!current.isEmpty && current!=previous) code+=current previous=current } code+="0000" code.slice(0,4)
}
def getCode(c:Char)={
val code=Map("1"->List('B','F','P','V'), "2"->List('C','G','J','K','Q','S','X','Z'), "3"->List('D', 'T'), "4"->List('L'), "5"->List('M', 'N'), "6"->List('R'))
code.find(_._2.exists(_==c)) match { case Some((k,_)) => k case _ => "" }
}</lang>
<lang scala>def main(args: Array[String]): Unit = {
val tests=Map( "Soundex" -> "S532", "Euler" -> "E460", "Gauss" -> "G200", "Hilbert" -> "H416", "Knuth" -> "K530", "Lloyd" -> "L300", "Lukasiewicz" -> "L222", "Ellery" -> "E460", "Ghosh" -> "G200", "Heilbronn" -> "H416", "Kant" -> "K530", "Ladd" -> "L300", "Lissajous" -> "L222", "Wheaton" -> "W350", "Ashcraft" -> "A226", "Burroughs" -> "B622", "Burrows" -> "B620", "O'Hara" -> "O600")
tests.foreach{(v)=> val code=soundex(v._1) val status=if (code==v._2) "OK" else "ERROR" printf("Name: %-20s Code: %s Found: %s - %s\n", v._1, v._2, code, status) }
}</lang>
Scheme
This implements American Soundex as described at [2].
<lang scheme>;; The American Soundex System
- The soundex code consist of the first letter of the name followed
- by three digits. These three digits are determined by dropping the
- letters a, e, i, o, u, h, w and y and adding three digits from the
- remaining letters of the name according to the table below. There
- are only two additional rules. (1) If two or more consecutive
- letters have the same code, they are coded as one letter. (2) If
- there are an insufficient numbers of letters to make the three
- digits, the remaining digits are set to zero.
- Soundex Table
- 1 b,f,p,v
- 2 c,g,j,k,q,s,x,z
- 3 d, t
- 4 l
- 5 m, n
- 6 r
- Examples
- Miller M460
- Peterson P362
- Peters P362
- Auerbach A612
- Uhrbach U612
- Moskowitz M232
- Moskovitz M213
(define (char->soundex c)
(case (char-upcase c) ((#\B #\F #\P #\V) #\1) ((#\C #\G #\J #\K #\Q #\S #\X #\Z) #\2) ((#\D #\T) #\3) ((#\L) #\4) ((#\M #\N) #\5) ((#\R) #\6) (else #\nul)))
(define (collapse-dups lst)
(if (= (length lst) 1) lst (if (equal? (car lst) (cadr lst))
(collapse-dups (cdr lst)) (cons (car lst) (collapse-dups (cdr lst))))))
(define (remove-nul lst)
(filter (lambda (c)
(not (equal? c #\nul))) lst))
(define (force-len n lst)
(cond ((= n 0) '())
((null? lst) (force-len n (list #\0))) (else (cons (car lst) (force-len (- n 1) (cdr lst))))))
(define (soundex s)
(let ((slst (string->list s))) (force-len 4 (cons (char-upcase (car slst))
(remove-nul (collapse-dups (map char->soundex (cdr slst))))))))
(soundex "miller") (soundex "Peterson") (soundex "PETERS") (soundex "auerbach") (soundex "Uhrbach") (soundex "Moskowitz") (soundex "Moskovitz")</lang>
Sample Output
> "M460" > "P362" > "P362" > "A612" > "U612" > "M232" > "M213"
SNOBOL4
US National Archives (NARA) Soundex. Includes the "HW" rule omitted by Knuth and many other implementations.
<lang SNOBOL4>* # Soundex coding
- # ABCDEFGHIJKLMNOPQRSTUVWXYZ
- # 01230127022455012623017202
define('soundex(str)init,ch') :(soundex_end)
soundex sdxmap = '01230127022455012623017202'
str = replace(str,&lcase,&ucase)
sdx1 str notany(&ucase) = :s(sdx1)
init = substr(str,1,1) str = replace(str,&ucase,sdxmap)
sdx2 str len(1) $ ch span(*ch) = ch :s(sdx2)
- # Omit next line for Knuth's simple Soundex
sdx3 str len(1) $ ch ('7' *ch) = ch :s(sdx3)
str len(1) = init
sdx4 str any('07') = :s(sdx4)
str = substr(str,1,4) str = lt(size(str),4) str dupl('0',4 - size(str)) soundex = str :(return)
soundex_end
- # Test and display
test = " Washington Lee Gutierrez Pfister Jackson Tymczak"
+ " Ashcroft Swhgler O'Connor Rhys-Davies" loop test span(' ') break(' ') . name = :f(end)
output = soundex(name) ' ' name :(loop)
end</lang>
Output:
W252 Washington L000 Lee G362 Gutierrez P236 Pfister J250 Jackson T522 Tymczak A261 Ashcroft S460 Swhgler O256 O'Connor
Tcl
contains an implementation of Knuth's soundex algorithm.
<lang tcl>package require soundex
foreach string {"Soundex" "Example" "Sownteks" "Ekzampul"} {
set soundexCode [soundex::knuth $string] puts "\"$string\" has code $soundexCode"
}</lang> Which produces this output:
"Soundex" has code S532 "Example" has code E251 "Sownteks" has code S532 "Ekzampul" has code E251
TSE SAL
<lang TSE SAL>
// library: string: get: soundex <description></description> <version>1.0.0.0.35</version> <version control></version control> (filenamemacro=getstgso.s) [kn, ri, sa, 15-10-2011 18:23:04] STRING PROC FNStringGetSoundexS( STRING inS )
// Except the first character, you replace each character in the string with its corresponding mapping number // Idea is that you give characters with the same sound the same mapping number (e.g. 'c' is replaced by '2'. And 'k' which might sound the same as a 'c' is also replaced by the same '2' STRING map1S[255] = "AEHIOUWYBFPVCGJKQSXZDTLMNR" STRING map2S[255] = "00000000111122222222334556" STRING s[255] = Upper( inS ) STRING soundexS[255] = "" STRING characterCurrentS[255] = "" STRING characterPreviousS[255] = "?" STRING characterMapS[255] = "" INTEGER mapPositionI = 0 INTEGER minI = 1 INTEGER I = minI INTEGER maxI = Length( s ) I = minI characterCurrentS = SubStr( s, I, 1 ) mapPositionI = Pos( characterCurrentS, map1S ) WHILE ( ( I <= maxI ) AND ( Length( soundexS ) < 4 ) AND ( NOT ( mapPositionI == 0 ) ) ) // Skip double letters, like CC, KK, PP, ... IF ( NOT ( mapPositionI == 0 ) ) AND ( NOT ( characterCurrentS == characterPreviousS ) ) characterPreviousS = characterCurrentS // First character is extracted unchanged, for sorting purposes. IF ( I == minI ) soundexS = Format( soundexS, characterCurrentS ) ELSE mapPositionI = Pos( characterCurrentS, map1S ) IF ( NOT ( mapPositionI == 0 ) ) characterMapS = SubStr( map2S, mapPositionI, 1 ) // skip vowels A, E, I, O, U, further also H, W and Y. In general all characters which have a mapping value of "0" IF ( NOT ( characterMapS == "0" ) ) soundexS = Format( soundexS, characterMapS ) ENDIF ENDIF ENDIF ENDIF I = I + 1 characterCurrentS = SubStr( s, I, 1 ) ENDWHILE IF ( NOT ( soundexS == "" ) ) WHILE ( Length( soundexS ) < 4 ) soundexS = Format( soundexS, "0" ) ENDWHILE ENDIF RETURN( soundexS )
END
PROC Main()
STRING s1[255] = "John Doe" // Warn( Format( FNStringGetSoundexS( "Ashcraft" ) ) ) // gives e.g. "A226" // using another rule the value might be "A261" (see Wikipedia, soundex) // Warn( Format( FNStringGetSoundexS( "Ashcroft" ) ) ) // gives e.g. "A226" // using another rule the value might be "A261" (see Wikipedia, soundex) // Warn( Format( FNStringGetSoundexS( "Davidson, Greg" ) ) ) // gives e.g. "D132" // Warn( Format( FNStringGetSoundexS( "Dracula" ) ) ) // gives e.g. "D624" // Warn( Format( FNStringGetSoundexS( "Drakula" ) ) ) // gives e.g. "D624" // Warn( Format( FNStringGetSoundexS( "Darwin" ) ) ) // gives e.g. "D650" // Warn( Format( FNStringGetSoundexS( "Darwin, Daemon" ) ) ) // gives e.g. "D650" // Warn( Format( FNStringGetSoundexS( "Darwin, Ian" ) ) ) // gives e.g. "D650" // Warn( Format( FNStringGetSoundexS( "Derwin" ) ) ) // gives e.g. "D650" // Warn( Format( FNStringGetSoundexS( "Darwent, William" ) ) ) // gives e.g. "D653" // Warn( Format( FNStringGetSoundexS( "Ellery" ) ) ) // gives e.g. "E460" // Warn( Format( FNStringGetSoundexS( "Euler" ) ) ) // gives e.g. "E460" // Warn( Format( FNStringGetSoundexS( "Ghosh" ) ) ) // gives e.g. "G200" // Warn( Format( FNStringGetSoundexS( "Gauss" ) ) ) // gives e.g. "G200" // Warn( Format( FNStringGetSoundexS( "Heilbronn" ) ) ) // gives e.g. "H416" // Warn( Format( FNStringGetSoundexS( "Hilbert" ) ) ) // gives e.g. "H416" // Warn( Format( FNStringGetSoundexS( "Johnny" ) ) ) // gives e.g. "J500" // Warn( Format( FNStringGetSoundexS( "Jonny" ) ) ) // gives e.g. "J500" // Warn( Format( FNStringGetSoundexS( "Kant" ) ) ) // gives e.g. "K530" // Warn( Format( FNStringGetSoundexS( "Knuth" ) ) ) // gives e.g. "K530" // Warn( Format( FNStringGetSoundexS( "Lissajous" ) ) ) // gives e.g. "L222" // Warn( Format( FNStringGetSoundexS( "Lukasiewicz" ) ) ) // gives e.g. "L222" // Warn( Format( FNStringGetSoundexS( "Ladd" ) ) ) // gives e.g. "L300" // Warn( Format( FNStringGetSoundexS( "Lloyd" ) ) ) // gives e.g. "L300" // Warn( Format( FNStringGetSoundexS( "Rubin" ) ) ) // gives e.g. "R150" // Warn( Format( FNStringGetSoundexS( "Robert" ) ) ) // gives e.g. "R163" // Warn( Format( FNStringGetSoundexS( "Rupert" ) ) ) // gives e.g. "R163" REPEAT IF ( NOT ( Ask( "string: get: soundex = ", s1, _EDIT_HISTORY_ ) ) AND ( Length( s1 ) > 0 ) ) RETURN() ENDIF Warn( Format( FNStringGetSoundexS( s1 ) ) ) UNTIL FALSE
END
</lang>
TUSCRIPT
<lang tuscript> $$ MODE TUSCRIPT BUILD X_TABLE soundex = * DATA :b:1:f:1:p:1:v:1: DATA :c:2:g:2:j:2:k:2:1:2:s:2:x:2:z:2: DATA :d:3:t:3: DATA :l:4: DATA :m:5:n:5: DATA :r:6: names="soundex'Lloyd'Woolcock'Donnell'Baragwanath'Williams'Ashcroft'Euler'Ellery'Gauss'Ghosh'Hilbert'Heilbronn'Knuth'Kant'Ladd'Lukasiewicz'Lissajous'Wheaton'Burroughs'Burrows"
LOOP/CLEAR n=names first=EXTRACT (n,1,2),second=EXTRACT (n,2,3) IF (first==second) THEN
rest=EXTRACT (n,3,0)
ELSE
rest=EXTRACT (n,2,0)
ENDIF
soundex=EXCHANGE (rest,soundex) soundex=STRINGS (soundex,":>/:a:e:i:o:u:") soundex=REDUCE (soundex) soundex=STRINGS (soundex,":>/:",0,0,1,0,"") soundex=CONCAT (soundex,"000") soundex=EXTRACT (soundex,0,4)
PRINT first,soundex,"=",n ENDLOOP </lang> Output:
s532=soundex L300=Lloyd W422=Woolcock D540=Donnell B625=Baragwanath W452=Williams A261=Ashcroft E460=Euler E460=Ellery G200=Gauss G200=Ghosh H416=Hilbert H416=Heilbronn K530=Knuth K530=Kant L300=Ladd L222=Lukasiewicz L222=Lissajous W350=Wheaton B620=Burroughs B620=Burrows
TXR
Without Using TXR Lisp
This implements the full Soundex described in [U.S. National Archives Website]. Doubled letters are condensed before separating the first letter, so that for instance "Lloyd" is not treated as L followed by the coding of LOYD but as L followed by the coding of OYD. Consecutive consonants which map to the same code are not condensed to a single occurrence of the code if they are separated by vowels, but separating W and H do not thus intervene. Names with common prefixes are encoded in two ways.
<lang txr>@(next :args) @### @# soundex-related filters @### @(deffilter remdbl ("AA" "A") ("BB" "B") ("CC" "C") ("DD" "D") ("EE" "E")
("FF" "F") ("GG" "G") ("HH" "H") ("II" "I") ("JJ" "J") ("KK" "K") ("LL" "L") ("MM" "M") ("NN" "N") ("OO" "O") ("PP" "P") ("QQ" "Q") ("RR" "R") ("SS" "S") ("TT" "T") ("UU" "U") ("VV" "V") ("WW" "W") ("XX" "X") ("YY" "Y") ("ZZ" "Z"))
@(deffilter code ("B" "F" "P" "V" "1")
("C" "G" "J" "K" "Q" "S" "X" "Z" "2") ("D" "T" "3") ("L" "4") ("M" "N" "5") ("R" "6") ("A" "E" "I" "O" "U" "Y" "0") ("H" "W" ""))
@(deffilter squeeze ("11" "111" "1111" "11111" "1")
("22" "222" "2222" "22222" "2") ("33" "333" "3333" "33333" "3") ("44" "444" "4444" "44444" "4") ("55" "555" "5555" "55555" "5") ("66" "666" "6666" "66666" "6"))
@(bind prefix ("VAN" "CON" "DE" "DI" "LA" "LE")) @(deffilter remzero ("0" "")) @### @# soundex function @### @(define soundex (in out)) @ (local nodouble letters remainder first rest coded) @ (next :string in) @ (coll)@{letters /[A-Za-z]+/}@(end) @ (cat letters "") @ (output :into nodouble :filter (:upcase remdbl)) @letters @ (end) @ (next :list nodouble) @ (maybe) @prefix@remainder @ (output :into nodouble) @nodouble @remainder @ (end) @ (end) @ (next :list nodouble) @ (collect) @{first 1}@rest @ (output :filter (code squeeze remzero) :into coded) @{rest}000 @ (end) @ (next :list coded) @{digits 3}@(skip) @ (end) @ (output :into out) @ (rep):@first@digits@(first)@first@digits@(end) @ (end) @ (cat out) @(end) @### @# process arguments and list soundex codes @### @(collect :vars ()) @input @ (output :filter (:fun soundex)) @input @ (end) @(end) @### @# compare first and second argument under soundex @### @(bind (first_arg second_arg . rest_args) input) @(cases) @ (bind first_arg second_arg :filter (:fun soundex)) @ (output) "@first_arg" and "@second_arg" match under soundex @ (end) @(end)</lang>
Run:
$ txr soundex.txr example soundex Lloyd lee guttierez o\'hara vandeusen dimeola E251 E251 S532 L300 L000 G362 O600 V532:D250 D540:M400 "example" and "egsampul" match under soundex
With TXR Lisp
This solution is similar to some of the solutions in other languages. Its treatment of the algorithm is not as complete as the above solution.
<lang txr>@(next :args) @(do (defun get-code (c)
(cond ((memq c '(#\B #\F #\P #\V)) #\1) ((memq c '(#\C #\G #\J #\K #\Q #\S #\X #\Z)) #\2) ((memq c '(#\D #\T)) #\3) ((eql c #\L) #\4) ((memq c '(#\M #\N)) #\5) ((eq c #\R) #\6)))
(defun soundex (s) (if (zerop (length s)) "" (let* ((su (upcase-str s)) (o (chr-str su 0))) (for ((i 1) (l (length su)) cp cg) ((< i l) (sub-str (cat-str '(,o "000") nil) 0 4)) ((inc i) (set cp cg)) (set cg (get-code (chr-str su i))) (if (and cg (null (eql cg cp))) (set o (cat-str '(,o ,cg) nil))))))))
@(collect) @arg @ (output) @arg -> @(soundex arg) @ (end) @(end)</lang>
Run:
$ ./txr soundex-lisp.txr soundex sowndex soundex -> S532 sowndex -> S532
VBScript
<lang vbscript>Function getCode(c)
Select Case c Case "B", "F", "P", "V" getCode = "1" Case "C", "G", "J", "K", "Q", "S", "X", "Z" getCode = "2" Case "D", "T" getCode = "3" Case "L" getCode = "4" Case "M", "N" getCode = "5" Case "R" getCode = "6" End Select
End Function
Function soundex(s)
Dim code, previous code = UCase(Mid(s, 1, 1)) previous = 7 For i = 2 to (Len(s) + 1) current = getCode(UCase(Mid(s, i, 1))) If Len(current) > 0 And current <> previous Then code = code & current End If previous = current Next soundex = Mid(code, 1, 4) If Len(code) < 4 Then soundex = soundex & String(4 - Len(code), "0") End If
End Function</lang>
- Programming Tasks
- Text processing
- Ada
- ALGOL 68
- AutoHotkey
- AWK
- BBC BASIC
- C
- C sharp
- Clipper/XBase++
- Clojure
- Common Lisp
- D
- Delphi
- Erlang
- Forth
- Go
- Groovy
- Haskell
- Text.PhoneticCode.Soundex
- Icon
- Unicon
- Icon Programming Library
- J
- Java
- JavaScript
- Mathematica
- MUMPS
- OCaml
- Perl
- Perl 6
- PHP
- PicoLisp
- PL/I
- PureBasic
- Python
- REXX
- Ruby
- Run BASIC
- Scala
- Scheme
- SNOBOL4
- Tcl
- Tcllib
- TSE SAL
- TUSCRIPT
- TXR
- VBScript