NYSIIS: Difference between revisions
(→Tcl: Added implementation) |
(Refactored 'Caché ObjectScript' section based on other code examples to reduce footprint.) |
||
Line 6: | Line 6: | ||
=={{header|Caché ObjectScript}}== |
=={{header|Caché ObjectScript}}== |
||
Refactored code based on other examples to reduce footprint. |
|||
<lang cache>Class Utils.Phonetic [ Abstract ] |
|||
{ |
|||
<lang cache> |
|||
ClassMethod UnitTest() As %Status |
|||
Class Utils.Phonetic [ Abstract ] |
|||
{ |
{ |
||
// define test list |
|||
Set testlist=$ListBuild( |
|||
$ListBuild("knight", "NAGT"), |
|||
$ListBuild("mitchell", "MATCAL"), |
|||
$ListBuild("o'daniel", "ODANAL"), |
|||
$ListBuild("brown sr", "BRAN"), |
|||
$ListBuild("browne III", "BRAN"), |
|||
$ListBuild("browne IV", "BRAN"), |
|||
$ListBuild("O'Banion", "OBANAN"), |
|||
$ListBuild("Mclaughlin", "MCLAGL[AN]"), |
|||
$ListBuild("McCormack", "MCARNA[C]"), |
|||
$ListBuild("Chapman", "CHAPNA[N]"), |
|||
$ListBuild("Silva", "SALV"), |
|||
$ListBuild("McDonald", "MCDANA[LD]"), |
|||
$ListBuild("Lawson", "LASAN"), |
|||
$ListBuild("Jacobs", "JACAB"), |
|||
$ListBuild("Greene", "GRAN"), |
|||
$ListBuild("O'Brien", "OBRAN"), |
|||
$ListBuild("Morrison", "MARASA[N]"), |
|||
$ListBuild("Larson", "LARSAN"), |
|||
$ListBuild("Willis", "WAL"), |
|||
$ListBuild("Mackenzie", "MCANSY"), |
|||
$ListBuild("Carr", "CAR"), |
|||
$ListBuild("Lawrence", "LARANC"), |
|||
$ListBuild("Matthews", "MAT"), |
|||
$ListBuild("Richards", "RACARD"), |
|||
$ListBuild("Bishop", "BASAP"), |
|||
$ListBuild("Franklin", "FRANCL[AN]"), |
|||
$ListBuild("McDaniel", "MCDANA[L]"), |
|||
$ListBuild("Harper", "HARPAR"), |
|||
$ListBuild("Lynch", "LYNC"), |
|||
$ListBuild("Watkins", "WATCAN"), |
|||
$ListBuild("Carlson", "CARLSA[N]"), |
|||
$ListBuild("Wheeler", "WHALAR") |
|||
) |
|||
// order through test list |
|||
Set ptr=0, sc=$$$ERROR($$$GeneralError, "No entries found.") |
|||
While $ListNext(testlist, ptr, val) { |
|||
Set sc=##class(Utils.Phonetic).Encode("nysiis", $List(val), .code,, 6) |
|||
If $$$ISERR(sc) Quit |
|||
If code'=$List(val, 2) Set sc=$$$ERROR($$$GeneralError, "Encoding did not match.") Quit |
|||
} |
|||
// finished |
|||
If $$$ISERR(sc) Quit sc |
|||
Quit $$$OK |
|||
} |
|||
ClassMethod Encode(pAlgorithm As %String = "", pName As %String = "", ByRef pCode As %String, pSuffixRem As %Boolean = 1, pTruncate As %Integer = 0) As %Status |
ClassMethod Encode(pAlgorithm As %String = "", pName As %String = "", ByRef pCode As %String, pSuffixRem As %Boolean = 1, pTruncate As %Integer = 0) As %Status |
||
{ |
{ |
||
// check algorithm |
// check algorithm and name |
||
Set pAlgorithm=$ZConvert(pAlgorithm, "l") |
Set pAlgorithm=$ZConvert(pAlgorithm, "l") |
||
If pAlgorithm="" Quit $$$ERROR($$$GeneralError, "No algorithm specified.") |
If pAlgorithm="" Quit $$$ERROR($$$GeneralError, "No algorithm specified.") |
||
If $Case(pAlgorithm, "nysiis":1, :0)=0 Quit $$$ERROR($$$GeneralError, "Unknown algorithm specified.") |
If $Case(pAlgorithm, "nysiis":1, :0)=0 Quit $$$ERROR($$$GeneralError, "Unknown algorithm specified.") |
||
// check name |
|||
If $Match(pName, ".*\d.*# no numbers") Quit $$$ERROR($$$GeneralError, "Name cannot contain numerics.") |
If $Match(pName, ".*\d.*# no numbers") Quit $$$ERROR($$$GeneralError, "Name cannot contain numerics.") |
||
// remove apostrophes |
// remove apostrophes, find punctuation and replace with spaces (exclude hyphens) |
||
Set pName=$Translate(pName, "'") |
Set pName=$Translate(pName, "'") |
||
// find punctuation and replace with spaces (exclude hyphens) |
|||
Set pun=$ZStrip(pName, "*E'P", "-") |
Set pun=$ZStrip(pName, "*E'P", "-") |
||
Set pName=$Translate(pName, pun, $Justify(" ", $Length(pun))) |
Set pName=$Translate(pName, pun, $Justify(" ", $Length(pun))) |
||
Line 83: | Line 31: | ||
// - http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom) |
// - http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom) |
||
If pSuffixRem { |
If pSuffixRem { |
||
Set ords=$ListBuild("KG", "LG", "KT", "LT", "GCB", "KCB", "DCB", "CB", "GCMG", "KCMG", "DCMG", "CMG", "DSO |
Set ords=$ListBuild("KG", "LG", "KT", "LT", "GCB", "KCB", "DCB", "CB", "GCMG", "KCMG", "DCMG", "CMG", "DSO", |
||
"KCVO", "DCVO", "CVO", "LVO", "MVO", "OM", "ISO", "GBE", "KBE", "DBE", "CBE", "OBE", "MBE", "CH") |
"GCVO", "KCVO", "DCVO", "CVO", "LVO", "MVO", "OM", "ISO", "GBE", "KBE", "DBE", "CBE", "OBE", "MBE", "CH") |
||
Set decs=$ListBuild("VC", "GC", "CGC", "RRC", "DSC", "MC", "DFC", "AFC", "ARRC", "OBI", "IOM") |
Set decs=$ListBuild("VC", "GC", "CGC", "RRC", "DSC", "MC", "DFC", "AFC", "ARRC", "OBI", "IOM") |
||
Set regexp="( )(SNR$|SR$|JNR$|JR$|ESQ$|"_$ListToString(ords, "$|")_"$|"_$ListToString(decs, "$|")_"$|[ |
Set regexp="( )(SNR$|SR$|JNR$|JR$|ESQ$|"_$ListToString(ords, "$|")_"$|"_$ListToString(decs, "$|")_"$|[IVX]+$)" |
||
Set rem=##class(%Regex.Matcher).%New(regexp, pName) |
|||
For { |
|||
Set pName=rem.ReplaceAll("") |
|||
Set locn=$Locate(pName, regexp) If 'locn Quit |
|||
Set pName=$Extract(pName, 1, locn-1) |
|||
} |
|||
} |
} |
||
// |
// replace hyphen and white space, plus some final validation |
||
Set pName=$ZStrip($Translate(pName, "-", " "), "<=>W") |
Set pName=$ZStrip($Translate(pName, "-", " "), "<=>W") |
||
// some final validation |
|||
If $Length($Piece(pName, " "))<2 Quit $$$ERROR($$$GeneralError, "Invalid name.") |
If $Length($Piece(pName, " "))<2 Quit $$$ERROR($$$GeneralError, "Invalid name.") |
||
// begin algorithm |
// begin algorithm and truncate result, if necessary |
||
Set pCode="" |
Set pCode="" |
||
For piece=1:1:$Length(pName, " ") { |
For piece=1:1:$Length(pName, " ") { |
||
If pAlgorithm="nysiis" Set pCode=pCode_..NYSIIS(pName) |
If pAlgorithm="nysiis" Set pCode=pCode_..NYSIIS(pName) |
||
} |
} |
||
// truncate string, if necessary |
|||
If pTruncate { |
If pTruncate { |
||
Set pName=pCode |
Set pName=pCode |
||
Line 124: | Line 66: | ||
- http://www.dropby.com/indexLF.html?content=/NYSIIS.html |
- http://www.dropby.com/indexLF.html?content=/NYSIIS.html |
||
*/ |
*/ |
||
// create regexp matcher instance, remove punctuation and convert all to upper case |
|||
Set rem=##class(%Regex.Matcher).%New(" ") |
|||
Set rem.Text=$ZConvert($ZStrip(pName, "*P"), "U") |
|||
// translate first characters of name: |
// translate first characters of name: |
||
// => MAC->MCC, KN->N, K->C, PH/PF->FF, SCH->SSS |
// => MAC->MCC, KN->N, K->C, PH/PF->FF, SCH->SSS |
||
For rule="^MAC->MCC", "^KN->N", "^K->C", "^(PH|PF)->FF", "SCH->SSS" { |
|||
Set pName1=$Extract(pName, 1) |
|||
Set rem.Pattern=$Piece(rule, "->") |
|||
If rem.Locate() Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit |
|||
Set pName3=$Extract(pName, 1, 3) |
|||
If pName3="MAC" { |
|||
Set $Extract(pName, 1, 3)="MCC" |
|||
} ElseIf pName2="KN" { |
|||
Set $Extract(pName, 1, 2)="N" |
|||
} ElseIf pName1="K" { |
|||
Set $Extract(pName, 1)="C" |
|||
} ElseIf pName2="PH" { |
|||
Set $Extract(pName, 1, 2)="FF" |
|||
} ElseIf pName2="PF" { |
|||
Set $Extract(pName, 1, 2)="FF" |
|||
} ElseIf pName3="SCH" { |
|||
Set $Extract(pName, 1, 3)="SSS" |
|||
} |
} |
||
// translate last characters of name: |
// translate last characters of name: |
||
// => EE/IE->Y, DT/RT/RD/NT/ND->D |
// => EE/IE->Y, DT/RT/RD/NT/ND->D |
||
For rule="(EE|IE)$->Y", "(DT|RT|RD|NT|ND)$->D" { |
|||
Set pNamexx=$Case($Extract(pName, *-1, *), "EE": "Y", "IE": "Y", |
|||
Set rem.Pattern=$Piece(rule, "->") |
|||
"DT": "D", "RT": "D", "RD": "D", "NT": "D", "ND": "D", :"") |
|||
If rem.Locate() Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit |
|||
} |
|||
// first character of key = first character of name |
// first character of key = first character of name |
||
Set pName1=$Extract( |
Set pName1=$Extract(rem.Text, 1), rem.Text=$Extract(rem.Text, 2, *) |
||
Set $Extract(pName, 1)="" |
|||
// translate remaining characters by following rules, incrementing by one character each time: |
// translate remaining characters by following rules, incrementing by one character each time: |
||
Line 162: | Line 96: | ||
// => W->if previous is vowel, A (A is the only vowel left) |
// => W->if previous is vowel, A (A is the only vowel left) |
||
// => add current to key if current is not same as the last key character |
// => add current to key if current is not same as the last key character |
||
Set ptr=0, rules=$ListBuild("EV->AF", "(A|E|I|O|U)->A", "Q->G", "Z->S", "M->N", "KN->N", "K->C", |
|||
Set pName=$Replace(pName, "EV", "AF") |
|||
"SCH->SSS", "PH->FF", "H[^A]", "[^A]H", "AW->A") |
|||
Set pName=$Translate(pName, "EIOU", "AAAA") |
|||
While $ListNext(rules, ptr, rule) { |
|||
Set pName=$Translate(pName, "QZM", "GSN") |
|||
Set rem.Pattern=$Piece(rule, "->") |
|||
If $Piece(rule, "->", 2)="", rem.Locate() { |
|||
Set $Piece(rule, "->", 2)=$Translate(rem.Group, "H") |
|||
} |
|||
Set pName=$Replace(pName, "PH", "FF") |
|||
Set rem.Text=rem.ReplaceAll($Piece(rule, "->", 2)) |
|||
Set locn=$Locate(pName, "H[^A]") If locn Set $Extract(pName, locn)="" |
|||
} |
|||
Set locn=$Locate(pName, "[^A]H") If locn Set $Extract(pName, locn+1)="" |
|||
Set pName=$ |
Set pName=$ZStrip(rem.Text, "=U") // remove duplicates |
||
Set pName=$ZStrip(pName,"=U") // remove duplicates |
|||
// if last character is S, remove it |
// if last character is S, remove it |
||
Line 187: | Line 120: | ||
} |
} |
||
} |
|||
}</lang> |
|||
</lang> |
|||
{{out|Examples}} |
{{out|Examples}} |
||
<pre> |
<pre> |
||
USER> |
USER>For { Read !, name Quit:name="" Set sc=##class(Utils.Phonetic).Encode("nysiis", name, .code,, 6) If sc Write " -> ", code } |
||
USER>Write code |
|||
MCLAGLAN |
|||
</pre> |
|||
knight -> NAGT |
|||
mitchell -> MATCAL |
|||
o'daniel -> ODANAL |
|||
brown sr -> BRAN |
|||
browne III -> BRAN |
|||
browne IV -> BRAN |
|||
O'Banion -> OBANAN |
|||
Mclaughlin -> MCLAGL[AN] |
|||
McCormack -> MCARNA[C] |
|||
Chapman -> CHAPNA[N] |
|||
Silva -> SALV |
|||
McDonald -> MCDANA[LD] |
|||
Lawson -> LASAN |
|||
Jacobs -> JACAB |
|||
Greene -> GRAN |
|||
O'Brien -> OBRAN |
|||
Morrison -> MARASA[N] |
|||
Larson -> LARSAN |
|||
Willis -> WAL |
|||
Mackenzie -> MCANSY |
|||
Carr -> CAR |
|||
Lawrence -> LARANC |
|||
Matthews -> MAT |
|||
Richards -> RACARD |
|||
Bishop -> BASAP |
|||
Franklin -> FRANCL[AN] |
|||
McDaniel -> MCDANA[L] |
|||
Harper -> HARPAR |
|||
Lynch -> LYNC |
|||
Watkins -> WATCAN |
|||
Carlson -> CARLSA[N] |
|||
Wheeler -> WHALAR |
|||
Louis XVI -> L |
|||
</pre> |
|||
=={{header|Perl 6}}== |
=={{header|Perl 6}}== |
Revision as of 12:42, 23 March 2013
You are encouraged to solve this task according to the task description, using any language you may know.
This page uses content from Wikipedia. The original article was at NYSIIS. The list of authors can be seen in the page history. As with Rosetta Code, the text of Wikipedia is available under the GNU FDL. (See links for details on variance) |
The New York State Identification and Intelligence System phonetic code, commonly known as NYSIIS, is a phonetic algorithm for creating indices for words based on their pronunciation. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling.
- See also
Caché ObjectScript
Refactored code based on other examples to reduce footprint.
<lang cache> Class Utils.Phonetic [ Abstract ] {
ClassMethod Encode(pAlgorithm As %String = "", pName As %String = "", ByRef pCode As %String, pSuffixRem As %Boolean = 1, pTruncate As %Integer = 0) As %Status { // check algorithm and name Set pAlgorithm=$ZConvert(pAlgorithm, "l") If pAlgorithm="" Quit $$$ERROR($$$GeneralError, "No algorithm specified.") If $Case(pAlgorithm, "nysiis":1, :0)=0 Quit $$$ERROR($$$GeneralError, "Unknown algorithm specified.") If $Match(pName, ".*\d.*# no numbers") Quit $$$ERROR($$$GeneralError, "Name cannot contain numerics.")
// remove apostrophes, find punctuation and replace with spaces (exclude hyphens) Set pName=$Translate(pName, "'") Set pun=$ZStrip(pName, "*E'P", "-") Set pName=$Translate(pName, pun, $Justify(" ", $Length(pun)))
// convert name(s) to uppercase and remove all white space Set pName=$ZStrip($ZConvert(pName, "U"), "<=>W")
// remove suffixes (e.g. 'Jnr', 'OBE', 'DSC', etc), including roman numerals (e.g. 'II', 'VIII') // - http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom) If pSuffixRem { Set ords=$ListBuild("KG", "LG", "KT", "LT", "GCB", "KCB", "DCB", "CB", "GCMG", "KCMG", "DCMG", "CMG", "DSO", "GCVO", "KCVO", "DCVO", "CVO", "LVO", "MVO", "OM", "ISO", "GBE", "KBE", "DBE", "CBE", "OBE", "MBE", "CH") Set decs=$ListBuild("VC", "GC", "CGC", "RRC", "DSC", "MC", "DFC", "AFC", "ARRC", "OBI", "IOM") Set regexp="( )(SNR$|SR$|JNR$|JR$|ESQ$|"_$ListToString(ords, "$|")_"$|"_$ListToString(decs, "$|")_"$|[IVX]+$)" Set rem=##class(%Regex.Matcher).%New(regexp, pName) Set pName=rem.ReplaceAll("") }
// replace hyphen and white space, plus some final validation Set pName=$ZStrip($Translate(pName, "-", " "), "<=>W") If $Length($Piece(pName, " "))<2 Quit $$$ERROR($$$GeneralError, "Invalid name.")
// begin algorithm and truncate result, if necessary Set pCode="" For piece=1:1:$Length(pName, " ") { If pAlgorithm="nysiis" Set pCode=pCode_..NYSIIS(pName) } If pTruncate { Set pName=pCode Set pCode=$Extract(pCode, 1, pTruncate) Set $Extract(pName, 1, pTruncate)="" If $Length(pName) Set pCode=pCode_"["_pName_"]" }
// finished Quit $$$OK }
ClassMethod NYSIIS(pName As %String) As %String { /* New York State Identification and Intelligence System (NYSIIS) Phonetic Encoder - http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System - http://www.dropby.com/indexLF.html?content=/NYSIIS.html */
// create regexp matcher instance, remove punctuation and convert all to upper case Set rem=##class(%Regex.Matcher).%New(" ") Set rem.Text=$ZConvert($ZStrip(pName, "*P"), "U")
// translate first characters of name: // => MAC->MCC, KN->N, K->C, PH/PF->FF, SCH->SSS For rule="^MAC->MCC", "^KN->N", "^K->C", "^(PH|PF)->FF", "SCH->SSS" { Set rem.Pattern=$Piece(rule, "->") If rem.Locate() Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit }
// translate last characters of name: // => EE/IE->Y, DT/RT/RD/NT/ND->D For rule="(EE|IE)$->Y", "(DT|RT|RD|NT|ND)$->D" { Set rem.Pattern=$Piece(rule, "->") If rem.Locate() Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit }
// first character of key = first character of name Set pName1=$Extract(rem.Text, 1), rem.Text=$Extract(rem.Text, 2, *)
// translate remaining characters by following rules, incrementing by one character each time: // => EV->AF else A,E,I,O,U->A // => Q->G, Z->S, M->N // => KN->N else K->C // => SCH->SSS, PH->FF // => H->if previous or next is non-vowel, previous // => W->if previous is vowel, A (A is the only vowel left) // => add current to key if current is not same as the last key character Set ptr=0, rules=$ListBuild("EV->AF", "(A|E|I|O|U)->A", "Q->G", "Z->S", "M->N", "KN->N", "K->C", "SCH->SSS", "PH->FF", "H[^A]", "[^A]H", "AW->A") While $ListNext(rules, ptr, rule) { Set rem.Pattern=$Piece(rule, "->") If $Piece(rule, "->", 2)="", rem.Locate() { Set $Piece(rule, "->", 2)=$Translate(rem.Group, "H") } Set rem.Text=rem.ReplaceAll($Piece(rule, "->", 2)) } Set pName=$ZStrip(rem.Text, "=U") // remove duplicates
// if last character is S, remove it If $Extract(pName, *)="S" Set pName=$Extract(pName, 1, *-1)
// if last characters are AY, replace with Y If $Extract(pName, *-1, *)="AY" Set pName=$Extract(pName, 1, *-2)_"Y"
// if last character is A, remove it If $Extract(pName, *)="A" Set pName=$Extract(pName, 1, *-1)
// append translated key to removed first character Quit pName1_pName }
} </lang>
- Examples:
USER>For { Read !, name Quit:name="" Set sc=##class(Utils.Phonetic).Encode("nysiis", name, .code,, 6) If sc Write " -> ", code } knight -> NAGT mitchell -> MATCAL o'daniel -> ODANAL brown sr -> BRAN browne III -> BRAN browne IV -> BRAN O'Banion -> OBANAN Mclaughlin -> MCLAGL[AN] McCormack -> MCARNA[C] Chapman -> CHAPNA[N] Silva -> SALV McDonald -> MCDANA[LD] Lawson -> LASAN Jacobs -> JACAB Greene -> GRAN O'Brien -> OBRAN Morrison -> MARASA[N] Larson -> LARSAN Willis -> WAL Mackenzie -> MCANSY Carr -> CAR Lawrence -> LARANC Matthews -> MAT Richards -> RACARD Bishop -> BASAP Franklin -> FRANCL[AN] McDaniel -> MCDANA[L] Harper -> HARPAR Lynch -> LYNC Watkins -> WATCAN Carlson -> CARLSA[N] Wheeler -> WHALAR Louis XVI -> L
Perl 6
This implementation removes common name suffixes similar to the reference implementation, even though it is not specified in the task description or on the linked NYSIIS page. This algorithm isn't too friendly to certain French kings. :)
<lang perl6>sub no_suffix ($name) {
$name.uc.subst: /\h (<[JS]>R) | (<[IVX]>+) $/, ;
}
sub nysiis ($name is copy) {
given $name .= uc { s:g/<-[A..Z]>//; s/^MAC/MCC/; s/^P<[FH]>/FF/; s/^SCH/SSS/; s/^KN/N/; s/<[IE]>E$ /Y/; s/<[DRN]>T$ /D/; s/<[RN]>D$ /D/; s:c(1):g/EV/AF/; s:c(1):g/<[AEIOU]>+/A/; s:c(1):g/Q/G/; s:c(1):g/Z/S/; s:c(1):g/M/N/; s:c(1):g/KN/N/; s:c(1):g/K/C/; s:c(1):g/SCH/S/; s:c(1):g/PF/F/; s:c(1):g/K/C/; s:c(1):g/H(<-[AEIOU]>)/$0/; s:g/(<-[AEIOU]>)H/$0/; s:g/(<-[AEIOU]>)W/$0/; s/ AY$ /Y/; s/ S$ //; s/ A$ //; s:g/ (.)$0+ /$0/; }
}
for «
knight mitchell "o'daniel" "brown sr" "browne III" "browne IV" "O'Banion" Mclaughlin McCormack Chapman Silva McDonald Lawson Jacobs Greene "O'Brien" Morrison Larson Willis Mackenzie Carr Lawrence Matthews Richards Bishop Franklin McDaniel Harper Lynch Watkins Carlson Wheeler "Louis XVI"
» {
my $nysiis = nysiis no_suffix $_; if $nysiis.chars > 6 { $nysiis .= subst: rx/ <after .**6> .+ /, -> $/ { "[$/]" }; } printf "%10s, %s\n", $_, $nysiis;
}</lang>
Output:
knight, NAGT mitchell, MATCAL o'daniel, ODANAL brown sr, BRAN browne III, BRAN browne IV, BRAN O'Banion, OBANAN Mclaughlin, MCLAGL[AN] McCormack, MCARNA[C] Chapman, CAPNAN Silva, SALV McDonald, MCDANA[LD] Lawson, LASAN Jacobs, JACAB Greene, GRAN O'Brien, OBRAN Morrison, MARASA[N] Larson, LARSAN Willis, WAL Mackenzie, MCANSY Carr, CAR Lawrence, LARANC Matthews, MAT Richards, RACARD Bishop, BASAP Franklin, FRANCL[AN] McDaniel, MCDANA[L] Harper, HARPAR Lynch, LYNC Watkins, WATCAN Carlson, CARLSA[N] Wheeler, WALAR Louis XVI, L
Tcl
<lang tcl>proc nysiis {name {truncate false}} {
# Normalize to first word, uppercased, without non-letters set name [regsub -all {[^A-Z]+} [string toupper [regexp -inline {\S+} $name]] ""] # Prefix map foreach {from to} {MAC MCC KN N K C PH FF PF FF SCH SSS} {
if {[regsub ^$from $name $to name]} break
} # Suffix map foreach {from to} {EE Y IE Y DT D RT D NT D ND D} {
if {[regsub $from$ $name $to name]} break
} # Split regexp (.)(.*) $name -> name rest # Reduce suffix regsub -all {[AEIOU]} [regsub -all EV $rest AF] A rest set rest [string map {Q G Z S M N KN N K C SCH SSS PH FF} $rest] regsub -all {([^A])H|(.)H(?=[^A])} $rest {\1\2} rest regsub -all AW $rest A rest regsub -all {(.)\1+} $rest {\1} rest regsub {S$} $rest "" rest regsub {A(Y?)$} $rest {\1} rest append name $rest # Apply truncation if needed if {$truncate} {
set name [string range $name 0 5]
} return $name
}</lang> Demonstrating: <lang tcl>foreach name {
knight mitchell "o'daniel" "brown sr" "browne III" "browne IV" "O'Banion" Mclaughlin McCormack Chapman Silva McDonald Lawson Jacobs Greene "O'Brien" Morrison Larson Willis Mackenzie Carr Lawrence Matthews Richards Bishop Franklin McDaniel Harper Lynch Watkins Carlson Wheeler "Louis XVI"
} {
puts "$name -> [nysiis $name]"
}</lang>
- Output:
knight -> NAGT mitchell -> MATCAL o'daniel -> ODANAL brown sr -> BRAN browne III -> BRAN browne IV -> BRAN O'Banion -> OBANAN Mclaughlin -> MCLAGLAN McCormack -> MCARNAC Chapman -> CHAPNAN Silva -> SALV McDonald -> MCDANALD Lawson -> LASAN Jacobs -> JACAB Greene -> GRAN O'Brien -> OBRAN Morrison -> MARASAN Larson -> LARSAN Willis -> WAL Mackenzie -> MCANSY Carr -> CAR Lawrence -> LARANC Matthews -> MAT Richards -> RACARD Bishop -> BASAP Franklin -> FRANCLAN McDaniel -> MCDANAL Harper -> HARPAR Lynch -> LYNC Watkins -> WATCAN Carlson -> CARLSAN Wheeler -> WHALAR Louis XVI -> L