NYSIIS Phonetic Encoder

Introduction
The New York State Identification and Intelligence System phonetic code, commonly known as NYSIIS, is a phonetic algorithm for creating indices for words based on their pronunciation. The goal is for homophones to be encoded to the same representation so that they can be matched despite minor differences in spelling.

Example
The following are examples of how to convert a name into a NYSIIS string.

USER>For { Read !, name Quit:name=""  Set sc=##class(Utils.Phonetic).Encode("nysiis", name, .code,, 6) If sc Write " -> ", code }

knight -> NAGT mitchell -> MATCAL o'daniel -> ODANAL brown sr -> BRAN browne III -> BRAN browne IV -> BRAN O'Banion -> OBANAN Mclaughlin -> MCLAGL[AN] McCormack -> MCARNA[C] Chapman -> CHAPNA[N] Silva -> SALV McDonald -> MCDANA[LD] Lawson -> LASAN Jacobs -> JACAB Greene -> GRAN O'Brien -> OBRAN Morrison -> MARASA[N] Larson -> LARSAN Willis -> WAL Mackenzie -> MCANSY Carr -> CAR Lawrence -> LARANC Matthews -> MAT Richards -> RACARD Bishop -> BASAP Franklin -> FRANCL[AN] McDaniel -> MCDANA[L] Harper -> HARPAR Lynch -> LYNC Watkins -> WATCAN Carlson -> CARLSA[N] Wheeler -> WHALAR Louis XVI -> L Hoyle-Johnson -> HAYLJA[NSAN] Vaughan Williams -> VAGANW[ALAN] D'Souza -> DSAS de Sousa -> DSAS

Code
The code to generate a NYSIIS string is as follows.

Class Utils.Phonetic [ Abstract ] {

ClassMethod Encode(pAlgorithm As %String = "", pName As %String = "", Output pCode As %String, pSuffixRem As %Boolean = 1, pTruncate As %Integer = 0) As %Status {	// check algorithm and name Set pAlgorithm=$ZConvert(pAlgorithm, "l") If pAlgorithm="" Quit $$$ERROR($$$GeneralError, "No algorithm specified.") If $Case(pAlgorithm, "nysiis":1, :0)=0 Quit $$$ERROR($$$GeneralError, "Unknown algorithm specified.") If $Match(pName, ".*\d.*# no numbers") Quit $$$ERROR($$$GeneralError, "Name cannot contain numerics.") // remove apostrophes, find punctuation and replace with spaces (exclude hyphens) Set pName=$Translate(pName, "'") Set pun=$ZStrip(pName, "*E'P", "-") Set pName=$Translate(pName, pun, $Justify(" ", $Length(pun))) // convert name(s) to uppercase and remove all white space Set pName=$ZStrip($ZConvert(pName, "U"), "<=>W") // remove suffixes (e.g. 'Jnr', 'OBE', 'DSC', etc), including roman numerals (e.g. 'II', 'VIII') // - http://en.wikipedia.org/wiki/List_of_post-nominal_letters_(United_Kingdom) If pSuffixRem { Set ords=$ListBuild("KG", "LG", "KT", "LT", "GCB", "KCB", "DCB", "CB", "GCMG", "KCMG", "DCMG", "CMG", "DSO", 			"GCVO", "KCVO", "DCVO", "CVO", "LVO", "MVO", "OM", "ISO", "GBE", "KBE", "DBE", "CBE", "OBE", "MBE", "CH") Set decs=$ListBuild("VC", "GC", "CGC", "RRC", "DSC", "MC", "DFC", "AFC", "ARRC", "OBI", "IOM") Set regexp="(SNR|SR|JNR|JR|ESQ|"_$ListToString(ords, "|")_"|"_$ListToString(decs, "|")_"|[IVX]+)" Set rem=##class(%Regex.Matcher).%New(regexp, pName) Set pName=rem.ReplaceAll("") }	// replace hyphen and white space, plus some final validation Set pName=$ZStrip($Translate(pName, "-", " "), "<=>W") If $Length($Piece(pName, " "))<2 Quit $$$ERROR($$$GeneralError, "Invalid name.") // begin algorithm and truncate result, if necessary Set pCode="" For piece=1:1:$Length(pName, " ") { If pAlgorithm="nysiis" Set pCode=pCode_..NYSIIS($Piece(pName, " ", piece)) }	If pTruncate { Set pName=pCode Set pCode=$Extract(pCode, 1, pTruncate) Set $Extract(pName, 1, pTruncate)="" If $Length(pName) Set pCode=pCode_"["_pName_"]" }	// finished Quit $$$OK }

ClassMethod NYSIIS(pName As %String) As %String {	/*		New York State Identification and Intelligence System (NYSIIS) Phonetic Encoder - http://en.wikipedia.org/wiki/New_York_State_Identification_and_Intelligence_System - http://www.dropby.com/indexLF.html?content=/NYSIIS.html */	// create regexp matcher instance, remove punctuation and convert all to upper case Set rem=##class(%Regex.Matcher).%New(" ") Set rem.Text=$ZConvert($ZStrip(pName, "*P"), "U") // translate first characters of name: // => MAC->MCC, KN->N, K->C, PH/PF->FF, SCH->SSS For rule="^MAC->MCC", "^KN->N", "^K->C", "^(PH|PF)->FF", "^SCH->SSS" { Set rem.Pattern=$Piece(rule, "->") If rem.Locate Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit }

// translate last characters of name: // => EE/IE->Y, DT/RT/RD/NT/ND->D For rule="(EE|IE)$->Y", "(DT|RT|RD|NT|ND)$->D" { Set rem.Pattern=$Piece(rule, "->") If rem.Locate Set rem.Text=rem.ReplaceFirst($Piece(rule, "->", 2)) Quit }	// first character of key = first character of name Set pName1=$Extract(rem.Text, 1), rem.Text=$Extract(rem.Text, 2, *) // translate remaining characters by following rules, incrementing by one character each time: // => EV->AF else A,E,I,O,U->A // => Q->G, Z->S, M->N // => KN->N else K->C // => SCH->SSS, PH->FF // => H->if previous or next is non-vowel, previous // => W->if previous is vowel, A (A is the only vowel left) // => add current to key if current is not same as the last key character Set ptr=0, rules=$ListBuild("EV->AF", "(A|E|I|O|U)->A", "Q->G", "Z->S", "M->N", "KN->N", "K->C", 		"SCH->SSS", "PH->FF", "H[^A]", "[^A]H", "AW->A") While $ListNext(rules, ptr, rule) { Set rem.Pattern=$Piece(rule, "->") If $Piece(rule, "->", 2)="", rem.Locate { Set $Piece(rule, "->", 2)=$Translate(rem.Group, "H") }		Set rem.Text=rem.ReplaceAll($Piece(rule, "->", 2)) }	Set pName=$ZStrip(rem.Text, "=U") // remove duplicates // if last character is S, remove it	If $Extract(pName, *)="S" Set pName=$Extract(pName, 1, *-1) // if last characters are AY, replace with Y	If $Extract(pName, *-1, *)="AY" Set pName=$Extract(pName, 1, *-2)_"Y" // if last character is A, remove it	If $Extract(pName, *)="A" Set pName=$Extract(pName, 1, *-1) // append translated key to removed first character Quit pName1_pName }

}