NYSIIS

From Rosetta Code
Revision as of 20:14, 17 March 2013 by rosettacode>Toucanbird (New York State Identification and Intelligence System (NYSIIS) Phonetic Encoder)
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Task
NYSIIS
You are encouraged to solve this task according to the task description, using any language you may know.

The New York State Identification and Intelligence System Phonetic Code, commonly known as NYSIIS, is a phonetic algorithm for creating indices for words based on their pronunciation. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling (from the WP article).

Caché ObjectScript

Examples:

USER>Do ##class(Utils.Phonetic).Encode("nysiis", "Mclaughlin", .code)
USER>Write code
MCLAGLAN

Code: <lang Caché_ObjectScript> Class Utils.Phonetic [ Abstract ] { ClassMethod UnitTest() As %Status { // define test list Set testlist=$ListBuild( $ListBuild("knight", "NAGT"), $ListBuild("mitchell", "MATCAL"), $ListBuild("o'daniel", "ODANAL"), $ListBuild("brown sr", "BRAN"), $ListBuild("browne III", "BRAN"), $ListBuild("browne IV", "BRAN"), $ListBuild("O'Banion", "OBANAN"), $ListBuild("Mclaughlin", "MCLAGL[AN]"), $ListBuild("McCormack", "MCARNA[C]"), $ListBuild("Chapman", "CHAPNA[N]"), $ListBuild("Silva", "SALV"), $ListBuild("McDonald", "MCDANA[LD]"), $ListBuild("Lawson", "LASAN"), $ListBuild("Jacobs", "JACAB"), $ListBuild("Greene", "GRAN"), $ListBuild("O'Brien", "OBRAN"), $ListBuild("Morrison", "MARASA[N]"), $ListBuild("Larson", "LARSAN"), $ListBuild("Willis", "WAL"), $ListBuild("Mackenzie", "MCANSY"), $ListBuild("Carr", "CAR"), $ListBuild("Lawrence", "LARANC"), $ListBuild("Matthews", "MAT"), $ListBuild("Richards", "RACARD"), $ListBuild("Bishop", "BASAP"), $ListBuild("Franklin", "FRANCL[AN]"), $ListBuild("McDaniel", "MCDANA[L]"), $ListBuild("Harper", "HARPAR"), $ListBuild("Lynch", "LYNC"), $ListBuild("Watkins", "WATCAN"), $ListBuild("Carlson", "CARLSA[N]"), $ListBuild("Wheeler", "WHALAR") ) // order through test list Set ptr=0, sc=$$$ERROR($$$GeneralError, "No entries found.") While $ListNext(testlist, ptr, val) { Set sc=##class(Utils.Phonetic).Encode("nysiis", $List(val), .code,, 6) If $$$ISERR(sc) Quit If code'=$List(val, 2) Set sc=$$$ERROR($$$GeneralError, "Encoding did not match.") Quit } // finished If $$$ISERR(sc) Quit sc Quit $$$OK } ClassMethod Encode(pAlgorithm As %String = "", pName As %String = "", ByRef pCode As %String, pSuffixRem As %Boolean = 1, pTruncate As %Integer = 0) As %Status { // check algorithm Set pAlgorithm=$ZConvert(pAlgorithm, "l") If pAlgorithm="" Quit $$$ERROR($$$GeneralError, "No algorithm specified.") If $Case(pAlgorithm, "nysiis":1, :0)=0 Quit $$$ERROR($$$GeneralError, "Unknown algorithm specified.") // check name If $Match(pName, ".*\d.*# no numbers") Quit $$$ERROR($$$GeneralError, "Name cannot contain numerics.") // remove apostrophes Set pName=$Translate(pName, "'") // find punctuation and replace with spaces (exclude hyphens) Set pun=$ZStrip(pName, "*E'P", "-") Set pName=$Translate(pName, pun, $Justify(" ", $Length(pun))) // convert name(s) to uppercase and remove all white space Set pName=$ZStrip($ZConvert(pName, "U"), "<=>W") // remove suffixes (e.g. 'Jnr', 'OBE', 'DSC', etc), including roman numerals (e.g. 'II', 'VIII') // - http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom) If pSuffixRem { Set ords=$ListBuild("KG", "LG", "KT", "LT", "GCB", "KCB", "DCB", "CB", "GCMG", "KCMG", "DCMG", "CMG", "DSO", "GCVO", "KCVO", "DCVO", "CVO", "LVO", "MVO", "OM", "ISO", "GBE", "KBE", "DBE", "CBE", "OBE", "MBE", "CH") Set decs=$ListBuild("VC", "GC", "CGC", "RRC", "DSC", "MC", "DFC", "AFC", "ARRC", "OBI", "IOM") Set regexp="( )(SNR$|SR$|JNR$|JR$|ESQ$|"_$ListToString(ords, "$|")_"$|"_$ListToString(decs, "$|")_"$|[IV]+$)" For { Set locn=$Locate(pName, regexp) If 'locn Quit Set pName=$Extract(pName, 1, locn-1) } } // lastly replace hyphen with space Set pName=$ZStrip($Translate(pName, "-", " "), "<=>W") // some final validation If $Length($Piece(pName, " "))<2 Quit $$$ERROR($$$GeneralError, "Invalid name.") // begin algorithm Set pCode="" For piece=1:1:$Length(pName, " ") { If pAlgorithm="nysiis" Set pCode=pCode_..NYSIIS(pName) } // truncate string, if necessary If pTruncate { Set pName=pCode Set pCode=$Extract(pCode, 1, pTruncate) Set $Extract(pName, 1, pTruncate)="" If $Length(pName) Set pCode=pCode_"["_pName_"]" } // finished Quit $$$OK } ClassMethod NYSIIS(pName As %String) As %String { /* New York State Identification and Intelligence System (NYSIIS) Phonetic Encoder - http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System - http://www.dropby.com/indexLF.html?content=/NYSIIS.html */ // translate first characters of name: // => MAC->MCC, KN->N, K->C, PH/PF->FF, SCH->SSS Set pName1=$Extract(pName, 1) Set pName2=$Extract(pName, 1, 2) Set pName3=$Extract(pName, 1, 3) If pName3="MAC" { Set $Extract(pName, 1, 3)="MCC" } ElseIf pName2="KN" { Set $Extract(pName, 1, 2)="N" } ElseIf pName1="K" { Set $Extract(pName, 1)="C" } ElseIf pName2="PH" { Set $Extract(pName, 1, 2)="FF" } ElseIf pName2="PF" { Set $Extract(pName, 1, 2)="FF" } ElseIf pName3="SCH" { Set $Extract(pName, 1, 3)="SSS" } // translate last characters of name: // => EE/IE->Y, DT/RT/RD/NT/ND->D Set pNamexx=$Case($Extract(pName, *-1, *), "EE": "Y", "IE": "Y", "DT": "D", "RT": "D", "RD": "D", "NT": "D", "ND": "D", :"") If $Length(pNamexx) Set pName=$Extract(pName, 1, *-2)_pNamexx // first character of key = first character of name Set pName1=$Extract(pName, 1) Set $Extract(pName, 1)="" // translate remaining characters by following rules, incrementing by one character each time: // => EV->AF else A,E,I,O,U->A // => Q->G, Z->S, M->N // => KN->N else K->C // => SCH->SSS, PH->FF // => H->if previous or next is non-vowel, previous // => W->if previous is vowel, A (A is the only vowel left) // => add current to key if current is not same as the last key character Set pName=$Replace(pName, "EV", "AF") Set pName=$Translate(pName, "EIOU", "AAAA") Set pName=$Translate(pName, "QZM", "GSN") Set pName=$Replace(pName, "KN", "N") Set pName=$Translate(pName, "K", "C") Set pName=$Replace(pName, "SCH", "SSS") Set pName=$Replace(pName, "PH", "FF") Set locn=$Locate(pName, "H[^A]") If locn Set $Extract(pName, locn)="" Set locn=$Locate(pName, "[^A]H") If locn Set $Extract(pName, locn+1)="" Set pName=$Replace(pName, "AW", "A") Set pName=$ZStrip(pName,"=U") // remove duplicates // if last character is S, remove it If $Extract(pName, *)="S" Set pName=$Extract(pName, 1, *-1) // if last characters are AY, replace with Y If $Extract(pName, *-1, *)="AY" Set pName=$Extract(pName, 1, *-2)_"Y" // if last character is A, remove it If $Extract(pName, *)="A" Set pName=$Extract(pName, 1, *-1) // append translated key to removed first character Quit pName1_pName } } </lang>