Soundex: Difference between revisions

Content added Content deleted

Inline

Revision as of 00:58, 17 September 2011

Soundex is an algorithm for creating indices for words based on their pronunciation. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling (from the WP article).

AutoHotkey

Translation of: VBScript

<lang AutoHotkey>getCode(c){

       If c in B,F,P,V
           return 1
       If c in C,G,J,K,Q,S,X,Z
           return 2
       If c in D,T
           return 3
       If c = L
           return 4
       If c in M,N
           return 5
       If c = R
           return 6

}

soundex(s){

   code := SubStr(s, 1, 1)
  ,previous := 7
  ,i := 1
   While ++i <= StrLen(s){
       current := getCode(SubStr(s, i, 1))
       If StrLen(current) > 0 And current <> previous
           code := code . current
       previous := current
   }
   soundex := SubStr(code, 1, 4)
   If StrLen(code) < 4
       soundex .= String(4 - StrLen(code), "0")
   return soundex

}

String(a, n){

  Loop n
     o .= a
  return a

}

MsgBox % Soundex("Soundex") "`n" Soundex("Sowndeks") "`n" Soundex("Ashcroft") "`n" Soundex("Ashkrofd")</lang>

C

<lang c>#include <stdio.h>

include <stdlib.h>
include <string.h>

char chr(int i) {

   return ((0<=i) && (i<=9)) ? '0'+i : '-';

}

char *soundex(const char *word, char *bufr) {

   static const char *lset[] = {
       "", "BFPVbfpv", "CGJKQSXZcgjkqsxz", "DTdt", "Ll","MNmn", "Rr"  };

define LSIZE (sizeof(lset)/sizeof(char *))

   const char *ch;
   char *bp;
   char prev = '-';
   int j;

   bp = bufr;
   ch = word;

   while (*ch && (bp-bufr<4)) {
       for (j=1; j< LSIZE; j++)
           if (strchr( lset[j], *ch) ) break;

       if ((j<LSIZE) && (chr(j) != prev))
           *bp++ = prev = chr(j);
       else if (!strchr("WHwh", *ch)) 
           prev = '7';

       if (ch == word) {
           bp = bufr;
           *bp++ = toupper(*ch);
       }
       ch++;
   }
   while (bp-bufr < 4)
       *bp++ = '0';
   *bp = 0;
    return bufr;

}

int main() {

   const char *w, **wp;
   static const char *testwords[] = {
       "Euler","Gauss","Hilbert","Knuth","Lloyd","Lukasiewicz","Ellery",
       "Ghosh", "Heilbronn","Kant","Ladd","Lissajous","Wheaton","Ashcraft",
       "Burroughs","Burrows","O'Hara", NULL};
   char buffer[8];

   w= "Soundex";
   printf("soundex(%s) = %s\n", w, soundex(w,buffer));
   w= "Example";
   printf("soundex(%s) = %s\n", w, soundex(w,buffer));
   w= "Sownteks";
   printf("soundex(%s) = %s\n", w, soundex(w,buffer));
   w= "Ekzampul";
   printf("soundex(%s) = %s\n", w, soundex(w,buffer));

   for (wp=testwords; *wp; wp++ ) 
       printf("soundex(%s) = %s\n", *wp, soundex(*wp, buffer));
   
   return 0;

}</lang>

Clipper/XBase++

<lang Clipper/XBase++>FUNCTION Soundex(cWord)

/*

This is a Clipper/XBase++ implementation of the standard American Soundex procedure.

*/

LOCAL cSoundex, i, nLast, cChar, nCode

cWord:=ALLTRIM(UPPER(cWord)) cSoundex:=LEFT(cWord, 1) // first letter is first char nLast:=-1 FOR i:=2 TO LEN(cWord)

  cChar:=SUBSTR(cWord, i, 1)   // get char
  nCode:=SoundexCode(cChar)    // get soundex code for char
  IF nCode=0                   // if 0, ignore
     LOOP
  ENDIF
  IF nCode#nLast               // if not same code, add to soundex
     nLast:=nCode              // and replace the last one
     cSoundex+=STR(nCode, 1)
  ENDIF

NEXT cSoundex:=PADR(cSoundex, 4, "0")

RETURN(cSoundex)

STATIC FUNCTION SoundexCode(cLetter) LOCAL aCodes:={"BFPV", "CGJKQSXZ", "DT", "L", "MN", "R"}, i, nRet:=0

FOR i:=1 TO LEN(aCodes)

  IF cLetter $ aCodes[i]
     nRet:=i
     EXIT
  ENDIF

FUNCTION SoundexDifference(cSound1, cSound2) LOCAL nMatch:=0, nLen1, nLen2, i

nLen1:=LEN(cSound1) nLen2:=LEN(cSound2)

// make the two words the same length. This is a safety. They both should be 4 characters long. IF nLen1 > nLen2

  cSound2:=PADR(cSound2, nLen1-nLen2, "0")

ELSEIF nLen1 < nLen2

  cSound1:=PADR(cSound1, nLen2-nLen1, "0")

ENDIF

// compare the corresponding characters between the two words FOR i:=1 TO LEN(cSound1)

  IF SUBSTR(cSound1, i, 1) == SUBSTR(cSound2, i, 1)
     ++nMatch
  ENDIF

- - - - </lang>

--Clippersolutions 23:14, 4 November 2010 (UTC)--Clippersolutions 23:14, 4 November 2010 (UTC)

Clojure

<lang Clojure>(defn get-code [c]

 (case c
   (\B \F \P \V) 1
   (\C \G \J \K
    \Q \S \X \Z) 2
   (\D \T) 3
   \L 4
   (\M \N) 5
   \R 6
   nil)) ;(\A \E \I \O \U \H \W \Y)

(defn soundex [s]

 (let [[f & s] (.toUpperCase s)]
   (-> (map get-code s)

distinct (concat , "0000") (->> (cons f ,) (remove nil? ,) (take 4 ,) (apply str ,)))))</lang>

Common Lisp

<lang lisp> (defun get-code (c)

 (case c
   ((#\B #\F #\P #\V) #\1)
   ((#\C #\G #\J #\K
     #\Q #\S #\X #\Z) #\2)
   ((#\D #\T) #\3)
   (#\L #\4)
   ((#\M #\N) #\5)
   (#\R #\6)))

(defun soundex (s)

 (if (zerop (length s))
   ""
   (let* ((l (coerce (string-upcase s) 'list))
          (o (list (first l))))
     (loop for c in (rest l)
           for cg = (get-code c) and
           for cp = #\Z then cg
           when (and cg (not (eql cg cp))) do
             (push (get-code c) o)
           finally
             (return (subseq (coerce (nreverse `(#\0 #\0 #\0 #\0 ,@o)) 'string) 0 4))))))

</lang>

D

The D standard library (Phobos) contains a soundex function: <lang d>import std.stdio: writeln; import std.string: soundex;

void main() {

   assert(soundex("soundex") == "S532");
   assert(soundex("example") == "E251");
   assert(soundex("ciondecks") == "C532");
   assert(soundex("ekzampul") == "E251");
   assert(soundex("Robert") == "R163");
   assert(soundex("Rupert") == "R163");
   assert(soundex("Rubin") == "R150");
   assert(soundex("Ashcraft") == "A261");
   assert(soundex("Ashcroft") == "A261");
   assert(soundex("Tymczak") == "T522");

}</lang> It works according to this document: http://www.archives.gov/publications/general-info-leaflets/55.html So soundex("Ashcraft") is A-261 instead of A-226.

The following version uses the Wikipedia algorithm, it's long because it contains a ddoc text, design by contract (a long post-condition), sanity asserts, unittests and comments. A quite shorter version may be written that loses the safety net that's necessary in serious coding.

This version uses dynamic heap allocations in some places (replace, toupper, several string join) to allow a higher level style of coding, but this function may also be written to perform zero heap allocations. It may even return a char[4] by value, or use a given buffer like the C version.

(This function is not pure because the standard library Phobos of DMD 2.050 is not yet pure-corrected, so std.string.replace() is not pure yet). <lang d>import std.string: toupper, replace; import std.ctype: isupper;

/***************************** Soundex is a phonetic algorithm for indexing names by sound, as pronounced in English. See: http://en.wikipedia.org/wiki/Soundex

/

/*pure*/ string soundex(const string name)

   // Adapted from public domain Python code by Gregory Jorgensen:
   // http://code.activestate.com/recipes/52213/
   out(result) { // postcondition
       assert(result.length == 4);
       assert(result[0] == '0' || isupper(result[0]));

       if (name.length == 0)
           assert(result == "0000");

       // this is too much fiddly
       int charCount = 0;
       foreach (dchar c; name)
           if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z'))
               charCount++;
       assert((charCount == 0) == (result == "0000"));
   } body {
       // digits holds the soundex values for the alphabet
       enum digits = "01230120022455012623010202";
       string firstChar, result;

       // translate alpha chars in name to soundex digits
       foreach (dchar c; name.toupper()) {
           if (c >= 'A' && c <= 'Z') {
               if (!firstChar.length)
                   firstChar ~= c; // remember first letter
               char d = digits[c - 'A'];
               // duplicate consecutive soundex digits are skipped
               if (!result.length || d != result[$ - 1])
                   result ~= d;
           }
       }

       // return 0000 if the name is empty
       if (!firstChar.length)
           return "0000";

       // replace first digit with first alpha character
       assert(result.length > 0);
       result = firstChar ~ result[1 .. $];

       // remove all 0s from the soundex code
       result = result.replace("0", "");

       // return soundex code padded to 4 zeros
       return (result ~ "0000")[0 .. 4];
   }

unittest { // tests of soundex()

   auto tests = [["", "0000"], ["12346", "0000"], ["he", "H000"],
                 ["soundex", "S532"], ["example", "E251"],
                 ["ciondecks", "C532"], ["ekzampul", "E251"],
                 ["résumé", "R250"], ["Robert", "R163"],
                 ["Rupert", "R163"], ["Rubin", "R150"],
                 ["Ashcraft", "A226"], ["Ashcroft", "A226"]];
   foreach (pair; tests)
       assert(soundex(pair[0]) == pair[1]);

}

void main() {}</lang>

Delphi

<lang Delphi> program SoundexDemo;

{$APPTYPE CONSOLE}

uses

 SysUtils,
 StrUtils;

begin

 Writeln(Soundex('Soundex'));
 Writeln(Soundex('Example'));
 Writeln(Soundex('Sownteks'));
 Writeln(Soundex('Ekzampul'));
 Readln;

end. </lang> Output:

S532
E251
S532
E251

Erlang

This implements the US Census rules, where W and H are ignored but, unlike vowels, are not separators. <lang Erlang>-module(soundex). -export([soundex/1]).

soundex([]) ->

[];

soundex(Str) ->

   [Head|Tail] = string:to_upper(Str),
   [Head | isoundex(Tail, [], todigit(Head))].

isoundex([], Acc, _) ->

   case length(Acc) of

N when N == 3 -> lists:reverse(Acc); N when N < 3 -> isoundex([], [$0 | Acc], ignore); N when N > 3 -> isoundex([], lists:sublist(Acc, N-2, N), ignore)

   end;

isoundex([Head|Tail], Acc, Lastn) ->

   Dig = todigit(Head),
   case Dig of

Dig when Dig /= $0, Dig /= Lastn -> isoundex(Tail, [Dig | Acc], Dig); _ -> case Head of $H -> isoundex(Tail, Acc, Lastn); $W -> isoundex(Tail, Acc, Lastn); N when N >= $A, N =< $Z -> isoundex(Tail, Acc, Dig); _ -> isoundex(Tail, Acc, Lastn) % This clause handles non alpha characters end

   end.

todigit(Chr) ->

   Digits = "01230120022455012623010202",
   HeadOff = Chr - $A + 1,
   case HeadOff of

N when N > 0, N < 27 -> lists:nth(HeadOff, Digits); _ -> % Treat non alpha characters as a vowel $0

   end.

</lang>

Forth

This implements the US Census rules, where W and H are ignored but, unlike vowels, aren't separators. Further corner cases welcome...

<lang forth>: alpha-table create does> swap 32 or [char] a - 0 max 26 min + 1+ c@ ;

alpha-table soundex-code

 ,"  123 12. 22455 12623 1.2 2 "
  \ ABCDEFGHIJKLMNOPQRSTUVWXYZ

soundex ( name len -- pad len )

 over c@ pad c!                  \ First character verbatim
 pad 1+ 3 [char] 0 fill          \ Pad to four characters with zeros
 1 pad c@ soundex-code  ( count code )
 2swap bounds do
   i c@ soundex-code   ( count code next )
   2dup = if         drop else   \ runs are ignored
   dup [char] . = if drop else   \ W, H don't separate runs of consonants
   dup bl = if        nip else   \ vowels separate consonants but aren't coded
     nip
     2dup swap pad + c!
     swap 1+
     tuck 4 = if leave then
   then then then
 loop
 2drop pad 4 ;

\ Knuth's test cases s" Euler" soundex cr type \ E460 s" Gauss" soundex cr type \ G200 s" Hilbert" soundex cr type \ H416 s" Knuth" soundex cr type \ K530 s" Lloyd" soundex cr type \ L300 s" Lukasiewicz" soundex cr type \ L222 (W test) s" Ellery" soundex cr type \ E460 s" Ghosh" soundex cr type \ G200 s" Heilbronn" soundex cr type \ H416 s" Kant" soundex cr type \ K530 s" Ladd" soundex cr type \ L300 s" Lissajous" soundex cr type \ L222

s" Wheaton" soundex cr type \ W350 s" Ashcraft" soundex cr type \ A261 (H tests) s" Burroughs" soundex cr type \ B620 s" Burrows" soundex cr type \ B620 (W test) (any Welsh names?) s" O'Hara" soundex cr type \ O600 (punctuation test)</lang>

Go

WP article rules, plus my interpretation for input validation. <lang go>

package main

import (

   "fmt"
   "os"
   "unicode"

)

var code = []int("01230127022455012623017202")

func soundex(s string) (string, os.Error) {

   var sx [4]int
   var sxi int
   lastCode := '0'
   for i, c := range s {
       switch {
       case !unicode.IsLetter(c):
           if c < ' ' || c == 127 {
               return "", os.NewError("ASCII control characters disallowed")
           }
           if i == 0 {
               return "", os.NewError("initial character must be a letter")
           }
           lastCode = '0'
           continue
       case c >= 'A' && c <= 'Z':
           c -= 'A'
       case c >= 'a' && c <= 'z':
           c -= 'a'
       default:
           return "", os.NewError("non-ASCII letters unsupported")
       }
       // c is valid letter index at this point
       if i == 0 {
           sx[0] = c + 'A'
           sxi = 1
           continue
       }
       x := code[c]
       switch x {
       case '7', lastCode:
       case '0':
           lastCode = '0'
       default:
           sx[sxi] = x
           if sxi == 3 {
               return string(sx[:]), nil
           }
           sxi++
           lastCode = x
       }
   }
   if sxi == 0 {
       return "", os.NewError("no letters present")
   }
   for ; sxi < 4; sxi++ {
       sx[sxi] = '0'
   }
   return string(sx[:]), nil

}

func main() {

   for _, s := range []string{
       "Robert",   // WP test case = R163
       "Rupert",   // WP test case = R163
       "Rubin",    // WP test case = R150
       "ashcroft", // WP test case = A261
       "ashcraft", // s and c combine across h, t not needed
       "moses",    // s's don't combine across e
       "O'Mally",  // apostrophe allowed, adjacent ll's combine
       "d jay",    // spaces allowed
       "R2-D2",    // digits, hyphen allowed
       "12p2",     // just not in leading position
       "naïve",    // non ASCII disallowed
       "",         // empty string disallowed
       "bump\t",   // ASCII control characters disallowed
   } {
       x, err := soundex(s)
       if err == nil {
           fmt.Println("soundex", s, "=", x)
       } else {
           fmt.Printf("\"%s\" fail. %s\n", s, err)
       }
   }

} </lang> Output:

soundex Robert = R163
soundex Rupert = R163
soundex Rubin = R150
soundex ashcroft = A261
soundex ashcraft = A261
soundex moses = M220
soundex O'Mally = O540
soundex d jay = D200
soundex R2-D2 = R300
"12p2" fail. initial character must be a letter
"naïve" fail. non-ASCII letters unsupported
"" fail. no letters present
"bump   " fail. ASCII control characters disallowed

Groovy

<lang groovy> def soundex(s) {

   def code = ""
   def lookup = [
      B : 1, F : 1, P : 1, V : 1,
      C : 2, G : 2, J : 2, K : 2, Q : 2, S : 2, X : 2, Z : 2,
      D : 3, T : 3,
      L : 4,
      M : 5, N : 5,
      R : 6
   ]
   s[1..-1].toUpperCase().inject(7) { lastCode, letter ->
       def letterCode = lookup[letter]
       if(letterCode && letterCode != lastCode) {
           code += letterCode
       }
   }
   return "${s[0]}${code}0000"[0..3]

}

println(soundex("Soundex")) println(soundex("Sownteks")) println(soundex("Example")) println(soundex("Ekzampul")) </lang>

Haskell

Library: Text.PhoneticCode.Soundex

<lang haskell>import Text.PhoneticCode.Soundex import Control.Arrow</lang> Example: <lang haskell>*Main> mapM_ print $ map (id &&& soundexSimple) ["Soundex", "Example", "Sownteks", "Ekzampul"] ("Soundex","S532") ("Example","E251") ("Sownteks","S532") ("Ekzampul","E251")</lang>

Icon and Unicon

<lang icon>procedure main(arglist) # computes soundex of each argument every write(x := !arglist, " => ",soundex(x)) end

procedure soundex(name)

  local  dig,i,x
  static con
  initial {                                   # construct mapping x[i] => i all else .
     x := ["bfpv","cgjkqsxz","dt","l","mn","r"]
     every ( dig := con := "") ||:= repl(i := 1 to *x,*x[i]) do con ||:= x[i]
     con := map(map(&lcase,con,dig),&lcase,repl(".",*&lcase))
     } 
 
  name := map(name)                           # lower case
  name[1] := map(name[1],&lcase,&ucase)       # upper case 1st
  name := map(name,&lcase,con)                # map cons
  every x := !"123456" do
      while name[find(x||x,name)+:2] := x     # kill duplicates
  while name[upto('.',name)] := ""            # kill .
  return left(name,4,"0")

end</lang>

Library: Icon Programming Library

implements soundex. The above version is an adaptation of that procedure

J

Solution <lang j>removeDups =: {.;.1~ (1 , }. ~: }: ) codes =: ;: 'BFPV CGJKQSXZ DT L MN R HW'

soundex =: 3 : 0

if. 0=# k=.toupper y do. '0' return. end.
({.k), ,": ,. 3 {. 0-.~ }. removeDups 7 0:`(I.@:=)`]} , k >:@I.@:(e. &>)"0 _ codes

)</lang> Usage <lang j>names=: 'Lloyd Woolcock Donnell Baragwanath Williams Ashcroft Euler Ellery Gauss Ghosh Hilbert Heilbronn Knuth Kant Ladd Lukasiewicz Lissajous' soundexNames=: 'L300 W422 D540 B625 W452 A226 E460 E460 G200 G200 H416 H416 K530 K530 L300 L222 L222'

  soundex &> ;: names

L300 W422 D540 B625 W452 ....</lang> Test <lang j> soundexNames-:(soundex &.>) &. ;: names 1</lang>

Java

Translation of: VBScript

<lang java>public static void main(String[] args){

   System.out.println(soundex("Soundex"));
   System.out.println(soundex("Example"));
   System.out.println(soundex("Sownteks"));
   System.out.println(soundex("Ekzampul"));
 }

private static String getCode(char c){

 switch(c){
   case 'B': case 'F': case 'P': case 'V':
     return "1";
   case 'C': case 'G': case 'J': case 'K':
   case 'Q': case 'S': case 'X': case 'Z':
     return "2";
   case 'D': case 'T':
     return "3";
   case 'L':
     return "4";
   case 'M': case 'N':
     return "5";
   case 'R':
     return "6";
   default:
     return "";
 }

}

public static String soundex(String s){

 String code, previous, soundex;
 code = s.toUpperCase().charAt(0) + "";
 previous = "7";
 for(int i = 1;i < s.length();i++){
   String current = getCode(s.toUpperCase().charAt(i));
   if(current.length() > 0 && !current.equals(previous)){
     code = code + current;
   }
   previous = current;
 }
 soundex = (code + "0000").substring(0, 4);
 return soundex;

}</lang> Output:

S532
E251
S532
E251

JavaScript

<lang javascript>var soundex = function (s) {

    var a = s
            .substring(1, s.length)
            .toLowerCase()
            .split(),
        r = ,
        codes = {
            a: , e: , i: , o: , u: ,
            b: 1, f: 1, p: 1, v: 1,
            c: 2, g: 2, j: 2, k: 2, q: 2, s: 2, x: 2, z: 2,
            d: 3, t: 3,
            l: 4,
            m: 5, n: 5,
            r: 6
        };

    r = s[0].toUpperCase() +
        a
        .filter(function (v, i, a) { return v !== a[i + 1]; })
        .map(function (v, i, a) { return codes[v] }).join();

    return (r + '000').slice(0, 4);

};</lang>

MUMPS

<lang MUMPS>SOUNDEX(X,NARA=0)

;Converts a string to its Soundex value.
;Empty strings return "0000". Non-alphabetic ASCII characters are ignored.
;X is the name to be converted to Soundex
;NARA is a flag, defaulting to zero, for which implementation to perform.
;If NARA is 0, do what seems to be the Knuth implementation
;If NARA is a positive integer, do the NARA implementation.
; This varies the soundex rule for "W" and "H", and adds variants for prefixed names separated by carets.
; http://www.archives.gov/publications/general-info-leaflets/55-census.html
;Y is the string to be returned
;UP is the list of upper case letters
;LO is the list of lower case letters
;PREFIX is a list of prefixes to be stripped off
;X1 is the upper case version of X
;X2 is the name without a prefix
;Y2 is the soundex of a name without a prefix
;C is a loop variable
;DX is a list of Soundex values, in alphabetical order. Underscores are used for the NARA variation letters
;XD is a partially processed translation of X into soundex values
NEW Y,UP,LO,PREFIX,X1,X2,Y2,C,DX,XD
SET UP="ABCDEFGHIJKLMNOPQRSTUVWXYZ" ;Upper case characters
SET LO="abcdefghijklmnopqrstuvwxyz" ;Lower case characters
SET DX=" 123 12_ 22455 12623 1_2 2" ;Soundex values
SET PREFIX="VAN^CO^DE^LA^LE" ;Prefixes that could create an alternate soundex value
SET Y="" ;Y is the value to be returned
SET X1=$TRANSLATE(X,LO,UP) ;Make local copy, and force all letters to be upper case
SET XD=$TRANSLATE(X1,UP,DX) ;Soundex values for string
;
SET Y=$EXTRACT(X1,1,1) ;Get first character
FOR C=2:1:$LENGTH(X1) QUIT:$L(Y)>=4  DO
. ;ignore doubled letters OR and side-by-side soundex values OR same soundex on either side of "H" or "W"
. QUIT:($EXTRACT(X1,C,C)=$EXTRACT(X1,C-1,C-1))
. QUIT:($EXTRACT(XD,C,C)=$EXTRACT(XD,C-1,C-1))
. ;ignore non-alphabetic characters
. QUIT:UP'[($EXTRACT(X1,C,C))
. QUIT:NARA&(($EXTRACT(XD,C-1,C-1)="_")&(C>2))&($EXTRACT(XD,C,C)=$EXTRACT(XD,C-2,C-2))
. QUIT:" _"[$EXTRACT(XD,C,C)
. SET Y=Y_$EXTRACT(XD,C,C)
; Pad with "0" so string length is 4
IF $LENGTH(Y)<4 FOR C=$L(Y):1:3 SET Y=Y_"0"
IF NARA DO
. FOR C=1:1:$LENGTH(PREFIX,"^") DO
. . IF $EXTRACT(X1,1,$LENGTH($PIECE(PREFIX,"^",C)))=$PIECE(PREFIX,"^",C) DO
. . . ;Take off the prefix, and any leading spaces
. . . SET X2=$EXTRACT(X1,$LENGTH($PIECE(PREFIX,"^",C))+1,$LENGTH(X1)-$PIECE(PREFIX,"^",C)) FOR  QUIT:UP[$E(X2,1,1)  SET X2=$E(X2,2,$L(X2))
. . . SET Y2=$$SOUNDEX(X2,NARA) SET Y=Y_"^"_Y2
KILL UP,LO,PREFIX,X1,X2,Y2,C,DX,XD
QUIT Y

</lang>

Examples:

USER>W $$SOUNDEX^SOUNDEX("")
0000
USER>W $$SOUNDEX^SOUNDEX("ASHCROFT")
A226
USER>W $$SOUNDEX^SOUNDEX("ASHCROFT",1)
A261
USER>W $$SOUNDEX^SOUNDEX("EULER")
E460
USER>W $$SOUNDEX^SOUNDEX("O'HARA")
O600
USER>W $$SOUNDEX^SOUNDEX("naïve")
N100
USER>W $$SOUNDEX^SOUNDEX("Moses")
M220
USER>W $$SOUNDEX^SOUNDEX("Omalley")
O540
USER>W $$SOUNDEX^SOUNDEX("O'Malley")
O540
USER>W $$SOUNDEX^SOUNDEX("Delarosa")
D462
USER>W $$SOUNDEX^SOUNDEX("Delarosa",1)
D462^L620^R200
USER>W $$SOUNDEX^SOUNDEX("De la Rosa")
D462
USER>W $$SOUNDEX^SOUNDEX("de la Rosa",1)
D462^L620^R200
USER>W $$SOUNDEX^SOUNDEX("Van de Graaff")
V532
USER>W $$SOUNDEX^SOUNDEX("Van de Graaff",1)
V532^D261^G610

There's just one small problem...

USER>W $$SOUNDEX^SOUNDEX("fish")
F200
USER>W $$SOUNDEX^SOUNDEX("ghoti")
G300

OCaml

Here is an implementation:

<lang ocaml>let c2d = function

 | 'B' | 'F' | 'P' | 'V' -> "1"
 | 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' -> "2"
 | 'D' | 'T' -> "3"
 | 'L' -> "4"
 | 'M' | 'N' -> "5"
 | 'R' -> "6"
 | _ -> ""

let rec dbl acc = function

 | [] -> (List.rev acc)
 | [c] -> List.rev(c::acc)
 | c1::(c2::_ as tl) ->
     if c1 = c2
     then dbl acc tl
     else dbl (c1::acc) tl

let pad s =

 match String.length s with
 | 0 -> s ^ "000"
 | 1 -> s ^ "00"
 | 2 -> s ^ "0"
 | 3 -> s
 | _ -> String.sub s 0 3

let soundex_aux rem =

 pad(String.concat "" (dbl [] (List.map c2d rem)))

let soundex s =

 let s = String.uppercase s in
 let cl = ref [] in
 String.iter (fun c -> cl := c :: !cl) s;
 match dbl [] (List.rev !cl) with
 | c::rem -> (String.make 1 c) ^ (soundex_aux rem)
 | [] -> invalid_arg "soundex"</lang>

Test our implementation:

<lang ocaml>let tests = [

 "Soundex",     "S532";
 "Example",     "E251";
 "Sownteks",    "S532";
 "Ekzampul",    "E251";
 "Euler",       "E460";
 "Gauss",       "G200";
 "Hilbert",     "H416";
 "Knuth",       "K530";
 "Lloyd",       "L300";
 "Lukasiewicz", "L222";
 "Ellery",      "E460";
 "Ghosh",       "G200";
 "Heilbronn",   "H416";
 "Kant",        "K530";
 "Ladd",        "L300";
 "Lissajous",   "L222";
 "Wheaton",     "W350";
 "Ashcraft",    "A226";
 "Burroughs",   "B622";
 "Burrows",     "B620";
 "O'Hara",      "O600";
 ]

let () =

 print_endline " Word   \t Code  Found Status";
 List.iter (fun (word, code1) ->
   let code2 = soundex word in
   let status = if code1 = code2 then "OK " else "Arg" in
   Printf.printf " \"%s\" \t %s  %s  %s\n" word code1 code2 status
 ) tests</lang>

This test outputs:

Word            Code  Found Status
"Soundex"       S532  S532  OK 
"Example"       E251  E251  OK 
"Sownteks"      S532  S532  OK 
"Ekzampul"      E251  E251  OK 
"Euler"         E460  E460  OK 
"Gauss"         G200  G200  OK 
"Hilbert"       H416  H416  OK 
"Knuth"         K530  K530  OK 
"Lloyd"         L300  L300  OK 
"Lukasiewicz"   L222  L222  OK 
"Ellery"        E460  E460  OK 
"Ghosh"         G200  G200  OK 
"Heilbronn"     H416  H416  OK 
"Kant"          K530  K530  OK 
"Ladd"          L300  L300  OK 
"Lissajous"     L222  L222  OK 
"Wheaton"       W350  W350  OK 
"Ashcraft"      A226  A226  OK 
"Burroughs"     B622  B622  OK 
"Burrows"       B620  B620  OK 
"O'Hara"        O600  O600  OK

See Soundex/OCaml for a version that can switch the language (English, French...) with a type which definition is hidden in the interface.

Perl

The Text::Soundex core module supports various soundex algorithms. <lang perl>use Text::Soundex; print soundex("Soundex"), "\n"; # S532 print soundex("Example"), "\n"; # E251 print soundex("Sownteks"), "\n"; # S532 print soundex("Ekzampul"), "\n"; # E251</lang>

Perl 6

US census algorithm, so "Ashcraft" and "Burroughs" adjusted to match. We fake up a first consonant in some cases to make up for the fact that we always trim the first numeric code (so that the 'l' of 'Lloyd' is properly deleted). <lang perl6>sub soundex ($name --> Str) {

   my $first = substr($name,0,1).uc;
   gather {
       take $first;
       my $fakefirst = ;
       $fakefirst = "de " if $first ~~ /^ <[AEIOUWH]> /;
       "$fakefirst$name".lc.trans('wh' => ) ~~ /
           ^
           [
               [
               | <[ bfpv     ]>+ { take 1 }
               | <[ cgjkqsxz ]>+ { take 2 }
               | <[ dt       ]>+ { take 3 }
               | <[ l        ]>+ { take 4 }
               | <[ mn       ]>+ { take 5 }
               | <[ r        ]>+ { take 6 }
               ]
           || .
           ]+
           $ { take 0,0,0 }
       /;
   }.flat.[0,2,3,4].join;

}

for < Soundex S532

     Example     E251
     Sownteks    S532
     Ekzampul    E251
     Euler       E460
     Gauss       G200
     Hilbert     H416
     Knuth       K530
     Lloyd       L300
     Lukasiewicz L222
     Ellery      E460
     Ghosh       G200
     Heilbronn   H416
     Kant        K530
     Ladd        L300
     Lissajous   L222
     Wheaton     W350
     Ashcraft    A261
     Burroughs   B620
     Burrows     B620
     O'Hara      O600 >

-> $n, $s {

   my $s2 = soundex($n);
   say $n.fmt("%16s "), $s, $s eq $s2 ?? " OK" !! " NOT OK $s2";

}</lang> Output:

         Soundex S532 OK
         Example E251 OK
        Sownteks S532 OK
        Ekzampul E251 OK
           Euler E460 OK
           Gauss G200 OK
         Hilbert H416 OK
           Knuth K530 OK
           Lloyd L300 OK
     Lukasiewicz L222 OK
          Ellery E460 OK
           Ghosh G200 OK
       Heilbronn H416 OK
            Kant K530 OK
            Ladd L300 OK
       Lissajous L222 OK
         Wheaton W350 OK
        Ashcraft A261 OK
       Burroughs B620 OK
         Burrows B620 OK
          O'Hara O600 OK

PHP

PHP already has a built-in soundex() function: <lang php><?php echo soundex("Soundex"), "\n"; // S532 echo soundex("Example"), "\n"; // E251 echo soundex("Sownteks"), "\n"; // S532 echo soundex("Ekzampul"), "\n"; // E251 ?></lang>

PicoLisp

<lang PicoLisp>(de soundex (Str)

  (pack
     (pad -4
        (cons
           (uppc (char (char Str)))
           (head 3
              (let Last NIL
                 (extract
                    '((C)
                       (and
                          (setq C
                             (case (uppc C)
                                (`(chop "BFPV") "1")
                                (`(chop "CGJKQSXZ") "2")
                                (("D" "T") "3")
                                ("L" "4")
                                (("M" "N") "5")
                                ("R" "6") ) )
                          (<> Last C)
                          (setq Last C) ) )
                    (cdr (chop Str)) ) ) ) ) ) ) )</lang>

Output:

: (mapcar soundex '("Soundex" "Example" "Sownteks" "Ekzampul"))
-> ("S532" "E251" "S532" "E251")

PureBasic

<lang PureBasic>Procedure.s getCode(c.s)

   Protected  getCode.s = ""
    
   If FindString("BFPV", c ,1)     : getCode = "1" : EndIf
   If FindString("CGJKQSXZ", c ,1) : getCode = "2" : EndIf 
   If FindString("DT", c ,1)       : getCode = "3" : EndIf
   If "L" = c                      : getCode = "4" : EndIf
   If FindString("MN", c ,1)       : getCode = "5" : EndIf
   If "R" = c                      : getCode = "6" : EndIf 
   If FindString("HW", c ,1)       : getCode = "." : EndIf
   ProcedureReturn getCode

EndProcedure

Procedure.s soundex(word.s)

   Protected.s previous.s = "" , code.s , current , soundex
   Protected.i i
   
   word = UCase(word)
   code = Mid(word,1,1)
   previous = ""
   For i = 2 To (Len(word) + 1)
       current = getCode(Mid(word, i, 1))
       If current = "." : Continue : EndIf
       If Len(current) > 0 And current <> previous
           code + current
       EndIf
       previous = current
       If Len(code) = 4
         Break
       EndIf  
   Next
   If Len(code) < 4 
       code = LSet(code, 4,"0")
   EndIf
   ProcedureReturn code

EndProcedure

OpenConsole()

PrintN (soundex("Lukasiewicz")) PrintN("Press any key to exit"): Repeat: Until Inkey() <> ""</lang>

Python

<lang python>from itertools import groupby

def soundex(word):

  codes = ("bfpv","cgjkqsxz", "dt", "l", "mn", "r")
  soundDict = dict((ch, str(ix+1)) for ix,cod in enumerate(codes) for ch in cod)
  cmap2 = lambda kar: soundDict.get(kar, '9')
  sdx =  .join(cmap2(kar) for kar in word.lower())
  sdx2 = word[0].upper() + .join(k for k,g in list(groupby(sdx))[1:] if k!='9')
  sdx3 = sdx2[0:4].ljust(4,'0')
  return sdx3

</lang> Example Output <lang Python>>>>print soundex("soundex") S532 >>>print soundex("example") E251 >>>print soundex("ciondecks") C532 >>>print soundex("ekzampul") E251</lang>

Ruby

Courtesy http://snippets.dzone.com/posts/show/4530 <lang ruby>class String

 SoundexChars = 'BFPVCGJKQSXZDTLMNR'
 SoundexNums  = '111122222222334556'
 SoundexCharsEx = '^' + SoundexChars
 SoundexCharsDel = '^A-Z'

 # desc: http://en.wikipedia.org/wiki/Soundex
 def soundex(census = true)
   str = self.upcase.delete(SoundexCharsDel)
   str[0,1] + str[1..-1].delete(SoundexCharsEx).
                         tr(SoundexChars, SoundexNums).
                         squeeze[0 .. (census ? 2 : -1)].
                         ljust(3, '0') rescue 
 end

 def sounds_like(other)
   self.soundex == other.soundex
 end

end

%w(Soundex Sownteks Example Ekzampul foo bar).each_slice(2) do |word1, word2|

 [word1, word2].each {|word| puts '%-8s -> %s' % [word, word.soundex]}

 print "'#{word1}' "
 print word1.sounds_like(word2) ? "sounds" : "does not sound"
 print " like '#{word2}'\n"

end</lang>

Soundex  -> S532
Sownteks -> S532
'Soundex' sounds like 'Sownteks'
Example  -> E251
Ekzampul -> E251
'Example' sounds like 'Ekzampul'
foo      -> F000
bar      -> B600
'foo' does not sound like 'bar'

Scala

<lang scala>def soundex(s:String)={

  var code=s.head.toUpper.toString
  var previous=getCode(code.head)
  for(ch <- s.drop(1); current=getCode(ch.toUpper)){
     if (!current.isEmpty && current!=previous)
        code+=current
     previous=current
  }
  code+="0000"
  code.slice(0,4)

}

def getCode(c:Char)={

  val code=Map("1"->List('B','F','P','V'),
     "2"->List('C','G','J','K','Q','S','X','Z'),
     "3"->List('D', 'T'),
     "4"->List('L'),
     "5"->List('M', 'N'),
     "6"->List('R'))

  code.find(_._2.exists(_==c)) match {
     case Some((k,_)) => k
     case _ => ""
  }

}</lang>

<lang scala>def main(args: Array[String]): Unit = {

  val tests=Map(
     "Soundex"     -> "S532",
     "Euler"	    -> "E460",
     "Gauss"	    -> "G200",
     "Hilbert"	    -> "H416",
     "Knuth"	    -> "K530",
     "Lloyd"	    -> "L300",
     "Lukasiewicz" -> "L222",
     "Ellery"	    -> "E460",
     "Ghosh"	    -> "G200",
     "Heilbronn"   -> "H416",
     "Kant"	    -> "K530",
     "Ladd"	    -> "L300",
     "Lissajous"   -> "L222",
     "Wheaton"	    -> "W350",
     "Ashcraft"    -> "A226",
     "Burroughs"   -> "B622",
     "Burrows"	    -> "B620",
     "O'Hara"	    -> "O600")

  tests.foreach{(v)=>
     val code=soundex(v._1)
     val status=if (code==v._2) "OK" else "ERROR"
     printf("Name: %-20s  Code: %s   Found: %s  - %s\n", v._1, v._2, code, status)
  }

}</lang>

Scheme

This implements American Soundex as described at [1].

Works with: any R6RS Scheme

<lang scheme>;; The American Soundex System

The soundex code consist of the first letter of the name followed by three digits. These three digits are determined by dropping the letters a, e, i, o, u, h, w and y and adding three digits from the remaining letters of the name according to the table below. There are only two additional rules. (1) If two or more consecutive letters have the same code, they are coded as one letter. (2) If there are an insufficient numbers of letters to make the three digits, the remaining digits are set to zero.

Soundex Table

1 b,f,p,v 2 c,g,j,k,q,s,x,z 3 d, t 4 l 5 m, n 6 r

Examples

Miller M460 Peterson P362 Peters P362 Auerbach A612 Uhrbach U612 Moskowitz M232 Moskovitz M213

(define (char->soundex c)

 (case (char-upcase c)
   ((#\B #\F #\P #\V) #\1)
   ((#\C #\G #\J #\K #\Q #\S #\X #\Z) #\2)
   ((#\D #\T) #\3)
   ((#\L) #\4)
   ((#\M #\N) #\5)
   ((#\R) #\6)
   (else #\nul)))

(define (collapse-dups lst)

 (if (= (length lst) 1) lst
     (if (equal? (car lst) (cadr lst))

(collapse-dups (cdr lst)) (cons (car lst) (collapse-dups (cdr lst))))))

(define (remove-nul lst)

 (filter (lambda (c)

(not (equal? c #\nul))) lst))

(define (force-len n lst)

 (cond ((= n 0) '())

((null? lst) (force-len n (list #\0))) (else (cons (car lst) (force-len (- n 1) (cdr lst))))))

(define (soundex s)

 (let ((slst (string->list s)))
   (force-len 4 (cons (char-upcase (car slst))

(remove-nul (collapse-dups (map char->soundex (cdr slst))))))))

(soundex "miller") (soundex "Peterson") (soundex "PETERS") (soundex "auerbach") (soundex "Uhrbach") (soundex "Moskowitz") (soundex "Moskovitz")</lang>

Sample Output

> "M460"
> "P362"
> "P362"
> "A612"
> "U612"
> "M232"
> "M213"

SNOBOL4

Works with: Macro Spitbol

Works with: Snobol4+

Works with: CSnobol

US National Archives (NARA) Soundex. Includes the "HW" rule omitted by Knuth and many other implementations.

<lang SNOBOL4>* # Soundex coding

# ABCDEFGHIJKLMNOPQRSTUVWXYZ
# 01230127022455012623017202

       define('soundex(str)init,ch') :(soundex_end)

soundex sdxmap = '01230127022455012623017202'

       str = replace(str,&lcase,&ucase)

sdx1 str notany(&ucase) = :s(sdx1)

       init = substr(str,1,1)
       str = replace(str,&ucase,sdxmap)

sdx2 str len(1) $ ch span(*ch) = ch :s(sdx2)

# Omit next line for Knuth's simple Soundex

sdx3 str len(1) $ ch ('7' *ch) = ch :s(sdx3)

       str len(1) = init

sdx4 str any('07') = :s(sdx4)

       str = substr(str,1,4)
       str = lt(size(str),4) str dupl('0',4 - size(str))
       soundex = str :(return)

soundex_end

# Test and display

       test = " Washington Lee Gutierrez Pfister Jackson Tymczak"

+ " Ashcroft Swhgler O'Connor Rhys-Davies" loop test span(' ') break(' ') . name = :f(end)

       output = soundex(name) ' ' name :(loop)

end</lang>

Output:

W252 Washington
L000 Lee
G362 Gutierrez
P236 Pfister
J250 Jackson
T522 Tymczak
A261 Ashcroft
S460 Swhgler
O256 O'Connor

Tcl

Library: Tcllib (Package: soundex)

contains an implementation of Knuth's soundex algorithm.

<lang tcl>package require soundex

foreach string {"Soundex" "Example" "Sownteks" "Ekzampul"} {

   set soundexCode [soundex::knuth $string]
   puts "\"$string\" has code $soundexCode"

}</lang> Which produces this output:

"Soundex" has code S532
"Example" has code E251
"Sownteks" has code S532
"Ekzampul" has code E251

TUSCRIPT

<lang tuscript> $$ MODE TUSCRIPT BUILD X_TABLE soundex = * DATA :b:1:f:1:p:1:v:1: DATA :c:2:g:2:j:2:k:2:1:2:s:2:x:2:z:2: DATA :d:3:t:3: DATA :l:4: DATA :m:5:n:5: DATA :r:6: names="soundex'Lloyd'Woolcock'Donnell'Baragwanath'Williams'Ashcroft'Euler'Ellery'Gauss'Ghosh'Hilbert'Heilbronn'Knuth'Kant'Ladd'Lukasiewicz'Lissajous'Wheaton'Burroughs'Burrows"

LOOP/CLEAR n=names first=EXTRACT (n,1,2),second=EXTRACT (n,2,3) IF (first==second) THEN

rest=EXTRACT (n,3,0)

ELSE

rest=EXTRACT (n,2,0)

ENDIF

soundex=EXCHANGE (rest,soundex) soundex=STRINGS (soundex,":>/:a:e:i:o:u:") soundex=REDUCE (soundex) soundex=STRINGS (soundex,":>/:",0,0,1,0,"") soundex=CONCAT (soundex,"000") soundex=EXTRACT (soundex,0,4)

PRINT first,soundex,"=",n ENDLOOP </lang> Output:

s532=soundex
L300=Lloyd
W422=Woolcock
D540=Donnell
B625=Baragwanath
W452=Williams
A261=Ashcroft
E460=Euler
E460=Ellery
G200=Gauss
G200=Ghosh
H416=Hilbert
H416=Heilbronn
K530=Knuth
K530=Kant
L300=Ladd
L222=Lukasiewicz
L222=Lissajous
W350=Wheaton
B620=Burroughs
B620=Burrows

VBScript

<lang vbscript>Function getCode(c)

   Select Case c
       Case "B", "F", "P", "V"
           getCode = "1"
       Case "C", "G", "J", "K", "Q", "S", "X", "Z"
           getCode = "2"
       Case "D", "T"
           getCode = "3"
       Case "L"
           getCode = "4"
       Case "M", "N"
           getCode = "5"
       Case "R"
           getCode = "6"
   End Select

End Function

Function soundex(s)

   Dim code, previous
   code = UCase(Mid(s, 1, 1))
   previous = 7
   For i = 2 to (Len(s) + 1)
       current = getCode(UCase(Mid(s, i, 1)))
       If Len(current) > 0 And current <> previous Then
           code = code & current
       End If
       previous = current
   Next
   soundex = Mid(code, 1, 4)
   If Len(code) < 4 Then
       soundex = soundex & String(4 - Len(code), "0")
   End If

End Function</lang>