From: decvax!decwrl!dec-rhea!dec-viking!wasser_1@Ucb-Vax.ARPA To: net.sources Subject: Final English-to-Phoneme version! Date: 15 Apr 85 21:58:34 GMT Final Version of ENGLISH TO PHONEME TRANSLATION 4/15/85 Here it is one last time. I have fixed all of the bugs I heard about and added a new feature or two (it now talks money as well as numbers). I think that this version is good enough for most purposes. I have proof-read the phoneme rules (found one bug) and made the program more "robust". I added protection against the "toupper()" problem some people had with earlier versions. If you make a major addition (like better abbreviation handling or an exception dictionary) please send me a copy. As before, this is all public domain and I make no copyright claims on it. The part derived from the Naval Research Lab should be public anyway. Sell it if you can! -John A. Wasser Work address: ARPAnet: WASSER%VIKING.DEC@decwrl.ARPA Usenet: {allegra,Shasta,decvax}!decwrl!dec-rhea!dec-viking!wasser Easynet: VIKING::WASSER Telephone: (617)486-2505 USPS: Digital Equipment Corp. Mail stop: LJO2/E4 30 Porter Rd Littleton, MA 01460 The files that make up this package are: ENGLISH.C Translation rules. PHONEME.C Translate a single word. PARSE.C Split a file into words. SPELLWORD.C Spell an ASCII character or word. SAYNUM.C Say a cardinal or ordinal number (long int). ------------------------------------------------------------------------------- **FILE** ENGLISH.C Translation rules. ------------------------------------------------------------------------------- /* ** English to Phoneme rules. ** ** Derived from: ** ** AUTOMATIC TRANSLATION OF ENGLISH TEXT TO PHONETICS ** BY MEANS OF LETTER-TO-SOUND RULES ** ** NRL Report 7948 ** ** January 21st, 1976 ** Naval Research Laboratory, Washington, D.C. ** ** ** Published by the National Technical Information Service as ** document "AD/A021 929". ** ** ** ** The Phoneme codes: ** ** IY bEEt IH bIt ** EY gAte EH gEt ** AE fAt AA fAther ** AO lAWn OW lOne ** UH fUll UW fOOl ** ER mURdER AX About ** AH bUt AY hIde ** AW hOW OY tOY ** ** p Pack b Back ** t Time d Dime ** k Coat g Goat ** f Fault v Vault ** TH eTHer DH eiTHer ** s Sue z Zoo ** SH leaSH ZH leiSure ** HH How m suM ** n suN NG suNG ** l Laugh w Wear ** y Young r Rate ** CH CHar j Jar ** WH WHere ** ** ** Rules are made up of four parts: ** ** The left context. ** The text to match. ** The right context. ** The phonemes to substitute for the matched text. ** ** Procedure: ** ** Seperate each block of letters (apostrophes included) ** and add a space on each side. For each unmatched ** letter in the word, look through the rules where the ** text to match starts with the letter in the word. If ** the text to match is found and the right and left ** context patterns also match, output the phonemes for ** that rule and skip to the next unmatched letter. ** ** ** Special Context Symbols: ** ** # One or more vowels ** : Zero or more consonants ** ^ One consonant. ** . One of B, D, V, G, J, L, M, N, R, W or Z (voiced ** consonants) ** % One of ER, E, ES, ED, ING, ELY (a suffix) ** (Found in right context only) ** + One of E, I or Y (a "front" vowel) ** */ /* Context definitions */ static char Anything[] = ""; /* No context requirement */ static char Nothing[] = " "; /* Context is beginning or end of word */ /* Phoneme definitions */ static char Pause[] = " "; /* Short silence */ static char Silent[] = ""; /* No phonemes */ #define LEFT_PART 0 #define MATCH_PART 1 #define RIGHT_PART 2 #define OUT_PART 3 typedef char *Rule[4]; /* Rule is an array of 4 character pointers */ /*0 = Punctuation */ /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule punct_rules[] = { {Anything, " ", Anything, Pause }, {Anything, "-", Anything, Silent }, {".", "'S", Anything, "z" }, {"#:.E", "'S", Anything, "z" }, {"#", "'S", Anything, "z" }, {Anything, "'", Anything, Silent }, {Anything, ",", Anything, Pause }, {Anything, ".", Anything, Pause }, {Anything, "?", Anything, Pause }, {Anything, "!", Anything, Pause }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule A_rules[] = { {Anything, "A", Nothing, "AX" }, {Nothing, "ARE", Nothing, "AAr" }, {Nothing, "AR", "O", "AXr" }, {Anything, "AR", "#", "EHr" }, {"^", "AS", "#", "EYs" }, {Anything, "A", "WA", "AX" }, {Anything, "AW", Anything, "AO" }, {" :", "ANY", Anything, "EHnIY" }, {Anything, "A", "^+#", "EY" }, {"#:", "ALLY", Anything, "AXlIY" }, {Nothing, "AL", "#", "AXl" }, {Anything, "AGAIN", Anything, "AXgEHn"}, {"#:", "AG", "E", "IHj" }, {Anything, "A", "^+:#", "AE" }, {" :", "A", "^+ ", "EY" }, {Anything, "A", "^%", "EY" }, {Nothing, "ARR", Anything, "AXr" }, {Anything, "ARR", Anything, "AEr" }, {" :", "AR", Nothing, "AAr" }, {Anything, "AR", Nothing, "ER" }, {Anything, "AR", Anything, "AAr" }, {Anything, "AIR", Anything, "EHr" }, {Anything, "AI", Anything, "EY" }, {Anything, "AY", Anything, "EY" }, {Anything, "AU", Anything, "AO" }, {"#:", "AL", Nothing, "AXl" }, {"#:", "ALS", Nothing, "AXlz" }, {Anything, "ALK", Anything, "AOk" }, {Anything, "AL", "^", "AOl" }, {" :", "ABLE", Anything, "EYbAXl"}, {Anything, "ABLE", Anything, "AXbAXl"}, {Anything, "ANG", "+", "EYnj" }, {Anything, "A", Anything, "AE" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule B_rules[] = { {Nothing, "BE", "^#", "bIH" }, {Anything, "BEING", Anything, "bIYIHNG"}, {Nothing, "BOTH", Nothing, "bOWTH" }, {Nothing, "BUS", "#", "bIHz" }, {Anything, "BUIL", Anything, "bIHl" }, {Anything, "B", Anything, "b" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule C_rules[] = { {Nothing, "CH", "^", "k" }, {"^E", "CH", Anything, "k" }, {Anything, "CH", Anything, "CH" }, {" S", "CI", "#", "sAY" }, {Anything, "CI", "A", "SH" }, {Anything, "CI", "O", "SH" }, {Anything, "CI", "EN", "SH" }, {Anything, "C", "+", "s" }, {Anything, "CK", Anything, "k" }, {Anything, "COM", "%", "kAHm" }, {Anything, "C", Anything, "k" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule D_rules[] = { {"#:", "DED", Nothing, "dIHd" }, {".E", "D", Nothing, "d" }, {"#:^E", "D", Nothing, "t" }, {Nothing, "DE", "^#", "dIH" }, {Nothing, "DO", Nothing, "dUW" }, {Nothing, "DOES", Anything, "dAHz" }, {Nothing, "DOING", Anything, "dUWIHNG"}, {Nothing, "DOW", Anything, "dAW" }, {Anything, "DU", "A", "jUW" }, {Anything, "D", Anything, "d" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule E_rules[] = { {"#:", "E", Nothing, Silent }, {"':^", "E", Nothing, Silent }, {" :", "E", Nothing, "IY" }, {"#", "ED", Nothing, "d" }, {"#:", "E", "D ", Silent }, {Anything, "EV", "ER", "EHv" }, {Anything, "E", "^%", "IY" }, {Anything, "ERI", "#", "IYrIY" }, {Anything, "ERI", Anything, "EHrIH" }, {"#:", "ER", "#", "ER" }, {Anything, "ER", "#", "EHr" }, {Anything, "ER", Anything, "ER" }, {Nothing, "EVEN", Anything, "IYvEHn"}, {"#:", "E", "W", Silent }, {"T", "EW", Anything, "UW" }, {"S", "EW", Anything, "UW" }, {"R", "EW", Anything, "UW" }, {"D", "EW", Anything, "UW" }, {"L", "EW", Anything, "UW" }, {"Z", "EW", Anything, "UW" }, {"N", "EW", Anything, "UW" }, {"J", "EW", Anything, "UW" }, {"TH", "EW", Anything, "UW" }, {"CH", "EW", Anything, "UW" }, {"SH", "EW", Anything, "UW" }, {Anything, "EW", Anything, "yUW" }, {Anything, "E", "O", "IY" }, {"#:S", "ES", Nothing, "IHz" }, {"#:C", "ES", Nothing, "IHz" }, {"#:G", "ES", Nothing, "IHz" }, {"#:Z", "ES", Nothing, "IHz" }, {"#:X", "ES", Nothing, "IHz" }, {"#:J", "ES", Nothing, "IHz" }, {"#:CH", "ES", Nothing, "IHz" }, {"#:SH", "ES", Nothing, "IHz" }, {"#:", "E", "S ", Silent }, {"#:", "ELY", Nothing, "lIY" }, {"#:", "EMENT", Anything, "mEHnt" }, {Anything, "EFUL", Anything, "fUHl" }, {Anything, "EE", Anything, "IY" }, {Anything, "EARN", Anything, "ERn" }, {Nothing, "EAR", "^", "ER" }, {Anything, "EAD", Anything, "EHd" }, {"#:", "EA", Nothing, "IYAX" }, {Anything, "EA", "SU", "EH" }, {Anything, "EA", Anything, "IY" }, {Anything, "EIGH", Anything, "EY" }, {Anything, "EI", Anything, "IY" }, {Nothing, "EYE", Anything, "AY" }, {Anything, "EY", Anything, "IY" }, {Anything, "EU", Anything, "yUW" }, {Anything, "E", Anything, "EH" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule F_rules[] = { {Anything, "FUL", Anything, "fUHl" }, {Anything, "F", Anything, "f" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule G_rules[] = { {Anything, "GIV", Anything, "gIHv" }, {Nothing, "G", "I^", "g" }, {Anything, "GE", "T", "gEH" }, {"SU", "GGES", Anything, "gjEHs" }, {Anything, "GG", Anything, "g" }, {" B#", "G", Anything, "g" }, {Anything, "G", "+", "j" }, {Anything, "GREAT", Anything, "grEYt" }, {"#", "GH", Anything, Silent }, {Anything, "G", Anything, "g" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule H_rules[] = { {Nothing, "HAV", Anything, "hAEv" }, {Nothing, "HERE", Anything, "hIYr" }, {Nothing, "HOUR", Anything, "AWER" }, {Anything, "HOW", Anything, "hAW" }, {Anything, "H", "#", "h" }, {Anything, "H", Anything, Silent }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule I_rules[] = { {Nothing, "IN", Anything, "IHn" }, {Nothing, "I", Nothing, "AY" }, {Anything, "IN", "D", "AYn" }, {Anything, "IER", Anything, "IYER" }, {"#:R", "IED", Anything, "IYd" }, {Anything, "IED", Nothing, "AYd" }, {Anything, "IEN", Anything, "IYEHn" }, {Anything, "IE", "T", "AYEH" }, {" :", "I", "%", "AY" }, {Anything, "I", "%", "IY" }, {Anything, "IE", Anything, "IY" }, {Anything, "I", "^+:#", "IH" }, {Anything, "IR", "#", "AYr" }, {Anything, "IZ", "%", "AYz" }, {Anything, "IS", "%", "AYz" }, {Anything, "I", "D%", "AY" }, {"+^", "I", "^+", "IH" }, {Anything, "I", "T%", "AY" }, {"#:^", "I", "^+", "IH" }, {Anything, "I", "^+", "AY" }, {Anything, "IR", Anything, "ER" }, {Anything, "IGH", Anything, "AY" }, {Anything, "ILD", Anything, "AYld" }, {Anything, "IGN", Nothing, "AYn" }, {Anything, "IGN", "^", "AYn" }, {Anything, "IGN", "%", "AYn" }, {Anything, "IQUE", Anything, "IYk" }, {Anything, "I", Anything, "IH" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule J_rules[] = { {Anything, "J", Anything, "j" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule K_rules[] = { {Nothing, "K", "N", Silent }, {Anything, "K", Anything, "k" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule L_rules[] = { {Anything, "LO", "C#", "lOW" }, {"L", "L", Anything, Silent }, {"#:^", "L", "%", "AXl" }, {Anything, "LEAD", Anything, "lIYd" }, {Anything, "L", Anything, "l" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule M_rules[] = { {Anything, "MOV", Anything, "mUWv" }, {Anything, "M", Anything, "m" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule N_rules[] = { {"E", "NG", "+", "nj" }, {Anything, "NG", "R", "NGg" }, {Anything, "NG", "#", "NGg" }, {Anything, "NGL", "%", "NGgAXl"}, {Anything, "NG", Anything, "NG" }, {Anything, "NK", Anything, "NGk" }, {Nothing, "NOW", Nothing, "nAW" }, {Anything, "N", Anything, "n" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule O_rules[] = { {Anything, "OF", Nothing, "AXv" }, {Anything, "OROUGH", Anything, "EROW" }, {"#:", "OR", Nothing, "ER" }, {"#:", "ORS", Nothing, "ERz" }, {Anything, "OR", Anything, "AOr" }, {Nothing, "ONE", Anything, "wAHn" }, {Anything, "OW", Anything, "OW" }, {Nothing, "OVER", Anything, "OWvER" }, {Anything, "OV", Anything, "AHv" }, {Anything, "O", "^%", "OW" }, {Anything, "O", "^EN", "OW" }, {Anything, "O", "^I#", "OW" }, {Anything, "OL", "D", "OWl" }, {Anything, "OUGHT", Anything, "AOt" }, {Anything, "OUGH", Anything, "AHf" }, {Nothing, "OU", Anything, "AW" }, {"H", "OU", "S#", "AW" }, {Anything, "OUS", Anything, "AXs" }, {Anything, "OUR", Anything, "AOr" }, {Anything, "OULD", Anything, "UHd" }, {"^", "OU", "^L", "AH" }, {Anything, "OUP", Anything, "UWp" }, {Anything, "OU", Anything, "AW" }, {Anything, "OY", Anything, "OY" }, {Anything, "OING", Anything, "OWIHNG"}, {Anything, "OI", Anything, "OY" }, {Anything, "OOR", Anything, "AOr" }, {Anything, "OOK", Anything, "UHk" }, {Anything, "OOD", Anything, "UHd" }, {Anything, "OO", Anything, "UW" }, {Anything, "O", "E", "OW" }, {Anything, "O", Nothing, "OW" }, {Anything, "OA", Anything, "OW" }, {Nothing, "ONLY", Anything, "OWnlIY"}, {Nothing, "ONCE", Anything, "wAHns" }, {Anything, "ON'T", Anything, "OWnt" }, {"C", "O", "N", "AA" }, {Anything, "O", "NG", "AO" }, {" :^", "O", "N", "AH" }, {"I", "ON", Anything, "AXn" }, {"#:", "ON", Nothing, "AXn" }, {"#^", "ON", Anything, "AXn" }, {Anything, "O", "ST ", "OW" }, {Anything, "OF", "^", "AOf" }, {Anything, "OTHER", Anything, "AHDHER"}, {Anything, "OSS", Nothing, "AOs" }, {"#:^", "OM", Anything, "AHm" }, {Anything, "O", Anything, "AA" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule P_rules[] = { {Anything, "PH", Anything, "f" }, {Anything, "PEOP", Anything, "pIYp" }, {Anything, "POW", Anything, "pAW" }, {Anything, "PUT", Nothing, "pUHt" }, {Anything, "P", Anything, "p" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule Q_rules[] = { {Anything, "QUAR", Anything, "kwAOr" }, {Anything, "QU", Anything, "kw" }, {Anything, "Q", Anything, "k" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule R_rules[] = { {Nothing, "RE", "^#", "rIY" }, {Anything, "R", Anything, "r" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule S_rules[] = { {Anything, "SH", Anything, "SH" }, {"#", "SION", Anything, "ZHAXn" }, {Anything, "SOME", Anything, "sAHm" }, {"#", "SUR", "#", "ZHER" }, {Anything, "SUR", "#", "SHER" }, {"#", "SU", "#", "ZHUW" }, {"#", "SSU", "#", "SHUW" }, {"#", "SED", Nothing, "zd" }, {"#", "S", "#", "z" }, {Anything, "SAID", Anything, "sEHd" }, {"^", "SION", Anything, "SHAXn" }, {Anything, "S", "S", Silent }, {".", "S", Nothing, "z" }, {"#:.E", "S", Nothing, "z" }, {"#:^##", "S", Nothing, "z" }, {"#:^#", "S", Nothing, "s" }, {"U", "S", Nothing, "s" }, {" :#", "S", Nothing, "z" }, {Nothing, "SCH", Anything, "sk" }, {Anything, "S", "C+", Silent }, {"#", "SM", Anything, "zm" }, {"#", "SN", "'", "zAXn" }, {Anything, "S", Anything, "s" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule T_rules[] = { {Nothing, "THE", Nothing, "DHAX" }, {Anything, "TO", Nothing, "tUW" }, {Anything, "THAT", Nothing, "DHAEt" }, {Nothing, "THIS", Nothing, "DHIHs" }, {Nothing, "THEY", Anything, "DHEY" }, {Nothing, "THERE", Anything, "DHEHr" }, {Anything, "THER", Anything, "DHER" }, {Anything, "THEIR", Anything, "DHEHr" }, {Nothing, "THAN", Nothing, "DHAEn" }, {Nothing, "THEM", Nothing, "DHEHm" }, {Anything, "THESE", Nothing, "DHIYz" }, {Nothing, "THEN", Anything, "DHEHn" }, {Anything, "THROUGH", Anything, "THrUW" }, {Anything, "THOSE", Anything, "DHOWz" }, {Anything, "THOUGH", Nothing, "DHOW" }, {Nothing, "THUS", Anything, "DHAHs" }, {Anything, "TH", Anything, "TH" }, {"#:", "TED", Nothing, "tIHd" }, {"S", "TI", "#N", "CH" }, {Anything, "TI", "O", "SH" }, {Anything, "TI", "A", "SH" }, {Anything, "TIEN", Anything, "SHAXn" }, {Anything, "TUR", "#", "CHER" }, {Anything, "TU", "A", "CHUW" }, {Nothing, "TWO", Anything, "tUW" }, {Anything, "T", Anything, "t" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule U_rules[] = { {Nothing, "UN", "I", "yUWn" }, {Nothing, "UN", Anything, "AHn" }, {Nothing, "UPON", Anything, "AXpAOn"}, {"T", "UR", "#", "UHr" }, {"S", "UR", "#", "UHr" }, {"R", "UR", "#", "UHr" }, {"D", "UR", "#", "UHr" }, {"L", "UR", "#", "UHr" }, {"Z", "UR", "#", "UHr" }, {"N", "UR", "#", "UHr" }, {"J", "UR", "#", "UHr" }, {"TH", "UR", "#", "UHr" }, {"CH", "UR", "#", "UHr" }, {"SH", "UR", "#", "UHr" }, {Anything, "UR", "#", "yUHr" }, {Anything, "UR", Anything, "ER" }, {Anything, "U", "^ ", "AH" }, {Anything, "U", "^^", "AH" }, {Anything, "UY", Anything, "AY" }, {" G", "U", "#", Silent }, {"G", "U", "%", Silent }, {"G", "U", "#", "w" }, {"#N", "U", Anything, "yUW" }, {"T", "U", Anything, "UW" }, {"S", "U", Anything, "UW" }, {"R", "U", Anything, "UW" }, {"D", "U", Anything, "UW" }, {"L", "U", Anything, "UW" }, {"Z", "U", Anything, "UW" }, {"N", "U", Anything, "UW" }, {"J", "U", Anything, "UW" }, {"TH", "U", Anything, "UW" }, {"CH", "U", Anything, "UW" }, {"SH", "U", Anything, "UW" }, {Anything, "U", Anything, "yUW" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule V_rules[] = { {Anything, "VIEW", Anything, "vyUW" }, {Anything, "V", Anything, "v" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule W_rules[] = { {Nothing, "WERE", Anything, "wER" }, {Anything, "WA", "S", "wAA" }, {Anything, "WA", "T", "wAA" }, {Anything, "WHERE", Anything, "WHEHr" }, {Anything, "WHAT", Anything, "WHAAt" }, {Anything, "WHOL", Anything, "hOWl" }, {Anything, "WHO", Anything, "hUW" }, {Anything, "WH", Anything, "WH" }, {Anything, "WAR", Anything, "wAOr" }, {Anything, "WOR", "^", "wER" }, {Anything, "WR", Anything, "r" }, {Anything, "W", Anything, "w" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule X_rules[] = { {Anything, "X", Anything, "ks" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule Y_rules[] = { {Anything, "YOUNG", Anything, "yAHNG" }, {Nothing, "YOU", Anything, "yUW" }, {Nothing, "YES", Anything, "yEHs" }, {Nothing, "Y", Anything, "y" }, {"#:^", "Y", Nothing, "IY" }, {"#:^", "Y", "I", "IY" }, {" :", "Y", Nothing, "AY" }, {" :", "Y", "#", "AY" }, {" :", "Y", "^+:#", "IH" }, {" :", "Y", "^#", "AY" }, {Anything, "Y", Anything, "IH" }, {Anything, 0, Anything, Silent }, }; /* ** LEFT_PART MATCH_PART RIGHT_PART OUT_PART */ static Rule Z_rules[] = { {Anything, "Z", Anything, "z" }, {Anything, 0, Anything, Silent }, }; Rule *Rules[] = { punct_rules, A_rules, B_rules, C_rules, D_rules, E_rules, F_rules, G_rules, H_rules, I_rules, J_rules, K_rules, L_rules, M_rules, N_rules, O_rules, P_rules, Q_rules, R_rules, S_rules, T_rules, U_rules, V_rules, W_rules, X_rules, Y_rules, Z_rules }; ------------------------------------------------------------------------------- **FILE** PHONEME.C Translate a single word. ------------------------------------------------------------------------------- #include #include #define FALSE (0) #define TRUE (!0) /* ** English to Phoneme translation. ** ** Rules are made up of four parts: ** ** The left context. ** The text to match. ** The right context. ** The phonemes to substitute for the matched text. ** ** Procedure: ** ** Seperate each block of letters (apostrophes included) ** and add a space on each side. For each unmatched ** letter in the word, look through the rules where the ** text to match starts with the letter in the word. If ** the text to match is found and the right and left ** context patterns also match, output the phonemes for ** that rule and skip to the next unmatched letter. ** ** ** Special Context Symbols: ** ** # One or more vowels ** : Zero or more consonants ** ^ One consonant. ** . One of B, D, V, G, J, L, M, N, R, W or Z (voiced ** consonants) ** % One of ER, E, ES, ED, ING, ELY (a suffix) ** (Right context only) ** + One of E, I or Y (a "front" vowel) */ typedef char *Rule[4]; /* A rule is four character pointers */ extern Rule *Rules[]; /* An array of pointers to rules */ int isvowel(chr) char chr; { return (chr == 'A' || chr == 'E' || chr == 'I' || chr == 'O' || chr == 'U'); } int isconsonant(chr) char chr; { return (isupper(chr) && !isvowel(chr)); } xlate_word(word) char word[]; { int index; /* Current position in word */ int type; /* First letter of match part */ index = 1; /* Skip the initial blank */ do { if (isupper(word[index])) type = word[index] - 'A' + 1; else type = 0; index = find_rule(word, index, Rules[type]); } while (word[index] != '\0'); } find_rule(word, index, rules) char word[]; int index; Rule *rules; { Rule *rule; char *left, *match, *right, *output; int remainder; for (;;) /* Search for the rule */ { rule = rules++; match = (*rule)[1]; if (match == 0) /* bad symbol! */ { fprintf(stderr, "Error: Can't find rule for: '%c' in \"%s\"\n", word[index], word); return index+1; /* Skip it! */ } for (remainder = index; *match != '\0'; match++, remainder++) { if (*match != word[remainder]) break; } if (*match != '\0') /* found missmatch */ continue; /* printf("\nWord: \"%s\", Index:%4d, Trying: \"%s/%s/%s\" = \"%s\"\n", word, index, (*rule)[0], (*rule)[1], (*rule)[2], (*rule)[3]); */ left = (*rule)[0]; right = (*rule)[2]; if (!leftmatch(left, &word[index-1])) continue; /* printf("leftmatch(\"%s\",\"...%c\") succeded!\n", left, word[index-1]); */ if (!rightmatch(right, &word[remainder])) continue; /* printf("rightmatch(\"%s\",\"%s\") succeded!\n", right, &word[remainder]); */ output = (*rule)[3]; /* printf("Success: "); */ outstring(output); return remainder; } } leftmatch(pattern, context) char *pattern; /* first char of pattern to match in text */ char *context; /* last char of text to be matched */ { char *pat; char *text; int count; if (*pattern == '\0') /* null string matches any context */ { return TRUE; } /* point to last character in pattern string */ count = strlen(pattern); pat = pattern + (count - 1); text = context; for (; count > 0; pat--, count--) { /* First check for simple text or space */ if (isalpha(*pat) || *pat == '\'' || *pat == ' ') if (*pat != *text) return FALSE; else { text--; continue; } switch (*pat) { case '#': /* One or more vowels */ if (!isvowel(*text)) return FALSE; text--; while (isvowel(*text)) text--; break; case ':': /* Zero or more consonants */ while (isconsonant(*text)) text--; break; case '^': /* One consonant */ if (!isconsonant(*text)) return FALSE; text--; break; case '.': /* B, D, V, G, J, L, M, N, R, W, Z */ if (*text != 'B' && *text != 'D' && *text != 'V' && *text != 'G' && *text != 'J' && *text != 'L' && *text != 'M' && *text != 'N' && *text != 'R' && *text != 'W' && *text != 'Z') return FALSE; text--; break; case '+': /* E, I or Y (front vowel) */ if (*text != 'E' && *text != 'I' && *text != 'Y') return FALSE; text--; break; case '%': default: fprintf(stderr, "Bad char in left rule: '%c'\n", *pat); return FALSE; } } return TRUE; } rightmatch(pattern, context) char *pattern; /* first char of pattern to match in text */ char *context; /* last char of text to be matched */ { char *pat; char *text; if (*pattern == '\0') /* null string matches any context */ return TRUE; pat = pattern; text = context; for (pat = pattern; *pat != '\0'; pat++) { /* First check for simple text or space */ if (isalpha(*pat) || *pat == '\'' || *pat == ' ') if (*pat != *text) return FALSE; else { text++; continue; } switch (*pat) { case '#': /* One or more vowels */ if (!isvowel(*text)) return FALSE; text++; while (isvowel(*text)) text++; break; case ':': /* Zero or more consonants */ while (isconsonant(*text)) text++; break; case '^': /* One consonant */ if (!isconsonant(*text)) return FALSE; text++; break; case '.': /* B, D, V, G, J, L, M, N, R, W, Z */ if (*text != 'B' && *text != 'D' && *text != 'V' && *text != 'G' && *text != 'J' && *text != 'L' && *text != 'M' && *text != 'N' && *text != 'R' && *text != 'W' && *text != 'Z') return FALSE; text++; break; case '+': /* E, I or Y (front vowel) */ if (*text != 'E' && *text != 'I' && *text != 'Y') return FALSE; text++; break; case '%': /* ER, E, ES, ED, ING, ELY (a suffix) */ if (*text == 'E') { text++; if (*text == 'L') { text++; if (*text == 'Y') { text++; break; } else { text--; /* Don't gobble L */ break; } } else if (*text == 'R' || *text == 'S' || *text == 'D') text++; break; } else if (*text == 'I') { text++; if (*text == 'N') { text++; if (*text == 'G') { text++; break; } } return FALSE; } else return FALSE; default: fprintf(stderr, "Bad char in right rule:'%c'\n", *pat); return FALSE; } } return TRUE; } ------------------------------------------------------------------------------- **FILE** PARSE.C Split a file into words. ------------------------------------------------------------------------------- #include #include #define MAX_LENGTH 128 static FILE *In_file; static FILE *Out_file; static int Char, Char1, Char2, Char3; /* ** main(argc, argv) ** int argc; ** char *argv[]; ** ** This is the main program. It takes up to two file names (input ** and output) and translates the input file to phoneme codes ** (see ENGLISH.C) on the output file. */ main(argc, argv) int argc; char *argv[]; { if (argc > 3) { fputs("Usage: PHONEME [infile [outfile]]\n", stderr); exit(); } if (argc == 1) { fputs("Enter english text:\n", stderr); } if (argc > 1) { In_file = fopen(argv[1], "r"); if (In_file == 0) { fputs("Error: Cannot open input file.\n", stderr); fputs("Usage: PHONEME [infile [outfile]]\n", stderr); exit(); } } else In_file = stdin; if (argc > 2) { Out_file = fopen(argv[2], "w"); if (Out_file == 0) { fputs("Error: Cannot create output file.\n", stderr); fputs("Usage: PHONEME [infile [outfile]]\n", stderr); exit(); } } else Out_file = stdout; xlate_file(); } outstring(string) char *string; { while (*string != '\0') outchar(*string++); } outchar(chr) int chr; { fputc(chr,Out_file); } int makeupper(character) int character; { if (islower(character)) return toupper(character); else return character; } new_char() { /* If the cache is full of newline, time to prime the look-ahead again. If an EOF is found, fill the remainder of the queue with EOF's. */ if (Char == '\n' && Char1 == '\n' && Char2 == '\n' && Char3 == '\n') { /* prime the pump again */ Char = getc(In_file); if (Char == EOF) { Char1 = EOF; Char2 = EOF; Char3 = EOF; return Char; } if (Char == '\n') return Char; Char1 = getc(In_file); if (Char1 == EOF) { Char2 = EOF; Char3 = EOF; return Char; } if (Char1 == '\n') return Char; Char2 = getc(In_file); if (Char2 == EOF) { Char3 = EOF; return Char; } if (Char2 == '\n') return Char; Char3 = getc(In_file); } else { /* Buffer not full of newline, shuffle the characters and either get a new one or propagate a newline or EOF. */ Char = Char1; Char1 = Char2; Char2 = Char3; if (Char3 != '\n' && Char3 != EOF) Char3 = getc(In_file); } return Char; } /* ** xlate_file() ** ** This is the input file translator. It sets up the first character ** and uses it to determine what kind of text follows. */ xlate_file() { /* Prime the queue */ Char = '\n'; Char1 = '\n'; Char2 = '\n'; Char3 = '\n'; new_char(); /* Fill Char, Char1, Char2 and Char3 */ while (Char != EOF) /* All of the words in the file */ { if (isdigit(Char)) have_number(); else if (isalpha(Char) || Char == '\'') have_letter(); else if (Char == '$' && isdigit(Char1)) have_dollars(); else have_special(); } } have_dollars() { long int value; value = 0L; for (new_char() ; isdigit(Char) || Char == ',' ; new_char()) { if (Char != ',') value = 10 * value + (Char-'0'); } say_cardinal(value); /* Say number of whole dollars */ /* Found a character that is a non-digit and non-comma */ /* Check for no decimal or no cents digits */ if (Char != '.' || !isdigit(Char1)) { if (value == 1L) outstring("dAAlER "); else outstring("dAAlAArz "); return; } /* We have '.' followed by a digit */ new_char(); /* Skip the period */ /* If it is ".dd " say as " DOLLARS AND n CENTS " */ if (isdigit(Char1) && !isdigit(Char2)) { if (value == 1L) outstring("dAAlER "); else outstring("dAAlAArz "); if (Char == '0' && Char1 == '0') { new_char(); /* Skip tens digit */ new_char(); /* Skip units digit */ return; } outstring("AAnd "); value = (Char-'0')*10 + Char1-'0'; say_cardinal(value); if (value == 1L) outstring("sEHnt "); else outstring("sEHnts "); new_char(); /* Used Char (tens digit) */ new_char(); /* Used Char1 (units digit) */ return; } /* Otherwise say as "n POINT ddd DOLLARS " */ outstring("pOYnt "); for ( ; isdigit(Char) ; new_char()) { say_ascii(Char); } outstring("dAAlAArz "); return; } have_special() { if (Char == '\n') outchar('\n'); else if (!isspace(Char)) say_ascii(Char); new_char(); return; } have_number() { long int value; int lastdigit; value = Char - '0'; lastdigit = Char; for (new_char() ; isdigit(Char) ; new_char()) { value = 10 * value + (Char-'0'); lastdigit = Char; } /* Recognize ordinals based on last digit of number */ switch (lastdigit) { case '1': /* ST */ if (makeupper(Char) == 'S' && makeupper(Char1) == 'T' && !isalpha(Char2) && !isdigit(Char2)) { say_ordinal(value); new_char(); /* Used Char */ new_char(); /* Used Char1 */ return; } break; case '2': /* ND */ if (makeupper(Char) == 'N' && makeupper(Char1) == 'D' && !isalpha(Char2) && !isdigit(Char2)) { say_ordinal(value); new_char(); /* Used Char */ new_char(); /* Used Char1 */ return; } break; case '3': /* RD */ if (makeupper(Char) == 'R' && makeupper(Char1) == 'D' && !isalpha(Char2) && !isdigit(Char2)) { say_ordinal(value); new_char(); /* Used Char */ new_char(); /* Used Char1 */ return; } break; case '0': /* TH */ case '4': /* TH */ case '5': /* TH */ case '6': /* TH */ case '7': /* TH */ case '8': /* TH */ case '9': /* TH */ if (makeupper(Char) == 'T' && makeupper(Char1) == 'H' && !isalpha(Char2) && !isdigit(Char2)) { say_ordinal(value); new_char(); /* Used Char */ new_char(); /* Used Char1 */ return; } break; } say_cardinal(value); /* Recognize decimal points */ if (Char == '.' && isdigit(Char1)) { outstring("pOYnt "); for (new_char() ; isdigit(Char) ; new_char()) { say_ascii(Char); } } /* Spell out trailing abbreviations */ if (isalpha(Char)) { while (isalpha(Char)) { say_ascii(Char); new_char(); } } return; } have_letter() { char buff[MAX_LENGTH]; int count; count = 0; buff[count++] = ' '; /* Required initial blank */ buff[count++] = makeupper(Char); for (new_char() ; isalpha(Char) || Char == '\'' ; new_char()) { buff[count++] = makeupper(Char); if (count > MAX_LENGTH-2) { buff[count++] = ' '; buff[count++] = '\0'; xlate_word(buff); count = 1; } } buff[count++] = ' '; /* Required terminating blank */ buff[count++] = '\0'; /* Check for AAANNN type abbreviations */ if (isdigit(Char)) { spell_word(buff); return; } else if (strlen(buff) == 3) /* one character, two spaces */ say_ascii(buff[1]); else if (Char == '.') /* Possible abbreviation */ abbrev(buff); else xlate_word(buff); if (Char == '-' && isalpha(Char1)) new_char(); /* Skip hyphens */ } /* Handle abbreviations. Text in buff was followed by '.' */ abbrev(buff) char buff[]; { if (strcmp(buff, " DR ") == 0) { xlate_word(" DOCTOR "); new_char(); } else if (strcmp(buff, " MR ") == 0) { xlate_word(" MISTER "); new_char(); } else if (strcmp(buff, " MRS ") == 0) { xlate_word(" MISSUS "); new_char(); } else if (strcmp(buff, " PHD ") == 0) { spell_word(" PHD "); new_char(); } else xlate_word(buff); } ------------------------------------------------------------------------------- **FILE** SPELLWORD.C Spell an ASCII character or word. ------------------------------------------------------------------------------- #include static char *Ascii[] = { "nUWl ","stAArt AXv hEHdER ","stAArt AXv tEHkst ","EHnd AXv tEHkst ", "EHnd AXv trAEnsmIHSHAXn", "EHnkwAYr ","AEk ","bEHl ","bAEkspEYs ","tAEb ","lIHnIYfIYd ", "vERtIHkAXl tAEb ","fAOrmfIYd ","kAErAYj rIYtERn ","SHIHft AWt ", "SHIHft IHn ","dIHlIYt ","dIHvIHs kAAntrAAl wAHn ","dIHvIHs kAAntrAAl tUW ", "dIHvIHs kAAntrAAl THrIY ","dIHvIHs kAAntrAAl fOWr ","nAEk ","sIHnk ", "EHnd tEHkst blAAk ","kAEnsEHl ","EHnd AXv mEHsIHj ","sUWbstIHtUWt ", "EHskEYp ","fAYEHld sIYpERAEtER ","grUWp sIYpERAEtER ","rIYkAOrd sIYpERAEtER ", "yUWnIHt sIYpERAEtER ","spEYs ","EHksklAEmEYSHAXn mAArk ","dAHbl kwOWt ", "nUWmbER sAYn ","dAAlER sAYn ","pERsEHnt ","AEmpERsAEnd ","kwOWt ", "OWpEHn pEHrEHn ","klOWz pEHrEHn ","AEstEHrIHsk ","plAHs ","kAAmmAX ", "mIHnAHs ","pIYrIYAAd ","slAESH ", "zIHrOW ","wAHn ","tUW ","THrIY ","fOWr ", "fAYv ","sIHks ","sEHvAXn ","EYt ","nAYn ", "kAAlAXn ","sEHmIHkAAlAXn ","lEHs DHAEn ","EHkwAXl sAYn ","grEYtER DHAEn ", "kwEHsCHAXn mAArk ","AEt sAYn ", "EY ","bIY ","sIY ","dIY ","IY ","EHf ","jIY ", "EYtCH ","AY ","jEY ","kEY ","EHl ","EHm ","EHn ","AA ","pIY ", "kw ","AAr ","EHz ","tIY ","AHw ","vIY ", "dAHblyUWw ","EHks ","wAYIY ","zIY ", "lEHft brAEkEHt ","bAEkslAESH ","rAYt brAEkEHt ","kAErEHt ", "AHndERskAOr ","AEpAAstrAAfIH ", "EY ","bIY ","sIY ","dIY ","IY ","EHf ","jIY ", "EYtCH ","AY ","jEY ","kEY ","EHl ","EHm ","EHn ","AA ","pIY ", "kw ","AAr ","EHz ","tIY ","AHw ","vIY ", "dAHblyUWw ","EHks ","wAYIY ","zIY ", "lEHft brEYs ","vERtIHkAXl bAAr ","rAYt brEYs ","tAYld ","dEHl ", }; say_ascii(character) int character; { outstring(Ascii[character&0x7F]); } spell_word(word) char *word; { for (word++ ; word[1] != '\0' ; word++) outstring(Ascii[(*word)&0x7F]); } ------------------------------------------------------------------------------- **FILE** SAYNUM.C Say a cardinal or ordinal number (long int). ------------------------------------------------------------------------------- #include /* ** Integer to Readable ASCII Conversion Routine. ** ** Synopsis: ** ** say_cardinal(value) ** long int value; -- The number to output ** ** The number is translated into a string of phonemes ** */ static char *Cardinals[] = { "zIHrOW ", "wAHn ", "tUW ", "THrIY ", "fOWr ", "fAYv ", "sIHks ", "sEHvAXn ", "EYt ", "nAYn ", "tEHn ", "IYlEHvAXn ", "twEHlv ", "THERtIYn ", "fOWrtIYn ", "fIHftIYn ", "sIHkstIYn ", "sEHvEHntIYn ", "EYtIYn ", "nAYntIYn " } ; static char *Twenties[] = { "twEHntIY ", "THERtIY ", "fAOrtIY ", "fIHftIY ", "sIHkstIY ", "sEHvEHntIY ", "EYtIY ", "nAYntIY " } ; static char *Ordinals[] = { "zIHrOWEHTH ", "fERst ", "sEHkAHnd ", "THERd ", "fOWrTH ", "fIHfTH ", "sIHksTH ", "sEHvEHnTH ", "EYtTH ", "nAYnTH ", "tEHnTH ", "IYlEHvEHnTH ", "twEHlvTH ", "THERtIYnTH ", "fAOrtIYnTH ", "fIHftIYnTH ", "sIHkstIYnTH ", "sEHvEHntIYnTH ", "EYtIYnTH ", "nAYntIYnTH " } ; static char *Ord_twenties[] = { "twEHntIYEHTH ","THERtIYEHTH ", "fOWrtIYEHTH ", "fIHftIYEHTH ", "sIHkstIYEHTH ","sEHvEHntIYEHTH ","EYtIYEHTH ", "nAYntIYEHTH " } ; /* ** Translate a number to phonemes. This version is for CARDINAL numbers. ** Note: this is recursive. */ say_cardinal(value) long int value; { if (value < 0) { outstring("mAYnAHs "); value = (-value); if (value < 0) /* Overflow! -32768 */ { outstring("IHnfIHnIHtIY "); return; } } if (value >= 1000000000L) /* Billions */ { say_cardinal(value/1000000000L); outstring("bIHlIYAXn "); value = value % 1000000000; if (value == 0) return; /* Even billion */ if (value < 100) /* as in THREE BILLION AND FIVE */ outstring("AEnd "); } if (value >= 1000000L) /* Millions */ { say_cardinal(value/1000000L); outstring("mIHlIYAXn "); value = value % 1000000L; if (value == 0) return; /* Even million */ if (value < 100) /* as in THREE MILLION AND FIVE */ outstring("AEnd "); } /* Thousands 1000..1099 2000..99999 */ /* 1100 to 1999 is eleven-hunderd to ninteen-hunderd */ if ((value >= 1000L && value <= 1099L) || value >= 2000L) { say_cardinal(value/1000L); outstring("THAWzAEnd "); value = value % 1000L; if (value == 0) return; /* Even thousand */ if (value < 100) /* as in THREE THOUSAND AND FIVE */ outstring("AEnd "); } if (value >= 100L) { outstring(Cardinals[value/100]); outstring("hAHndrEHd "); value = value % 100; if (value == 0) return; /* Even hundred */ } if (value >= 20) { outstring(Twenties[(value-20)/ 10]); value = value % 10; if (value == 0) return; /* Even ten */ } outstring(Cardinals[value]); return; } /* ** Translate a number to phonemes. This version is for ORDINAL numbers. ** Note: this is recursive. */ say_ordinal(value) long int value; { if (value < 0) { outstring("mAHnAXs "); value = (-value); if (value < 0) /* Overflow! -32768 */ { outstring("IHnfIHnIHtIY "); return; } } if (value >= 1000000000L) /* Billions */ { say_cardinal(value/1000000000L); value = value % 1000000000; if (value == 0) { outstring("bIHlIYAXnTH "); return; /* Even billion */ } outstring("bIHlIYAXn "); if (value < 100) /* as in THREE BILLION AND FIVE */ outstring("AEnd "); } if (value >= 1000000L) /* Millions */ { say_cardinal(value/1000000L); value = value % 1000000L; if (value == 0) { outstring("mIHlIYAXnTH "); return; /* Even million */ } outstring("mIHlIYAXn "); if (value < 100) /* as in THREE MILLION AND FIVE */ outstring("AEnd "); } /* Thousands 1000..1099 2000..99999 */ /* 1100 to 1999 is eleven-hunderd to ninteen-hunderd */ if ((value >= 1000L && value <= 1099L) || value >= 2000L) { say_cardinal(value/1000L); value = value % 1000L; if (value == 0) { outstring("THAWzAEndTH "); return; /* Even thousand */ } outstring("THAWzAEnd "); if (value < 100) /* as in THREE THOUSAND AND FIVE */ outstring("AEnd "); } if (value >= 100L) { outstring(Cardinals[value/100]); value = value % 100; if (value == 0) { outstring("hAHndrEHdTH "); return; /* Even hundred */ } outstring("hAHndrEHd "); } if (value >= 20) { if ((value%10) == 0) { outstring(Ord_twenties[(value-20)/ 10]); return; /* Even ten */ } outstring(Twenties[(value-20)/ 10]); value = value % 10; } outstring(Ordinals[value]); return; } ------------------------------------------------------------------------------- **FILE** End of Distribution ------------------------------------------------------------------------------- -John A. Wasser Work address: ARPAnet: WASSER%VIKING.DEC@decwrl.ARPA Usenet: {allegra,Shasta,decvax}!decwrl!dec-rhea!dec-viking!wasser Easynet: VIKING::WASSER Telephone: (617)486-2505 USPS: Digital Equipment Corp. Mail stop: LJO2/E4 30 Porter Rd Littleton, MA 01460