diff --git a/source/org/thdl/tib/dictionary/DictionaryEntries.java b/source/org/thdl/tib/dictionary/DictionaryEntries.java index 335d1ed..c38b16e 100644 --- a/source/org/thdl/tib/dictionary/DictionaryEntries.java +++ b/source/org/thdl/tib/dictionary/DictionaryEntries.java @@ -4,6 +4,6 @@ import org.thdl.tib.dictionary.DictionaryEntry ; import java.util.Collection ; public interface DictionaryEntries extends Collection -{ +{ }; diff --git a/source/org/thdl/tib/dictionary/DictionaryEntry.java b/source/org/thdl/tib/dictionary/DictionaryEntry.java index 0b79be6..d009b49 100644 --- a/source/org/thdl/tib/dictionary/DictionaryEntry.java +++ b/source/org/thdl/tib/dictionary/DictionaryEntry.java @@ -5,8 +5,8 @@ import org.thdl.tib.dictionary.DictionaryEntryDefinitions ; public interface DictionaryEntry { - public TextBody getKeyword () ; - public String getPhonetic () ; - public DictionaryEntryDefinitions getDefinitions () ; + public TextBody getKeyword () ; + public String getPhonetic () ; + public DictionaryEntryDefinitions getDefinitions () ; } diff --git a/source/org/thdl/tib/dictionary/DictionaryEntryDefinition.java b/source/org/thdl/tib/dictionary/DictionaryEntryDefinition.java index ab41fd8..f0a07e4 100644 --- a/source/org/thdl/tib/dictionary/DictionaryEntryDefinition.java +++ b/source/org/thdl/tib/dictionary/DictionaryEntryDefinition.java @@ -2,5 +2,5 @@ package org.thdl.tib.dictionary ; public interface DictionaryEntryDefinition { - public String toString () ; + public String toString () ; }; diff --git a/source/org/thdl/tib/dictionary/DictionaryEntryDescription.java b/source/org/thdl/tib/dictionary/DictionaryEntryDescription.java index 732b7ba..54774fd 100644 --- a/source/org/thdl/tib/dictionary/DictionaryEntryDescription.java +++ b/source/org/thdl/tib/dictionary/DictionaryEntryDescription.java @@ -2,6 +2,6 @@ package org.thdl.tib.dictionary ; public interface DictionaryEntryDescription { - Collection getItems () ; + Collection getItems () ; }; diff --git a/source/org/thdl/tib/dictionary/DictionaryInterface.java b/source/org/thdl/tib/dictionary/DictionaryInterface.java index 881d43e..daee39a 100644 --- a/source/org/thdl/tib/dictionary/DictionaryInterface.java +++ b/source/org/thdl/tib/dictionary/DictionaryInterface.java @@ -5,6 +5,6 @@ import org.thdl.tib.dictionary.DictionaryEntries ; public interface DictionaryInterface { - DictionaryEntries lookup ( TextBody in ) ; + DictionaryEntries lookup ( TextBody in ) ; } diff --git a/source/org/thdl/tib/dictionary/Phonetics.java b/source/org/thdl/tib/dictionary/Phonetics.java index 5302acc..989dbf2 100644 --- a/source/org/thdl/tib/dictionary/Phonetics.java +++ b/source/org/thdl/tib/dictionary/Phonetics.java @@ -4,106 +4,106 @@ import org.thdl.tib.dictionary.StandardPronounciationEngine ; public class Phonetics { - public static final String THDL_ENGLISH = "THDL_ENGLISH" ; - public static boolean valid = false ; + public static final String THDL_ENGLISH = "THDL_ENGLISH" ; + public static boolean valid = false ; - static StandardPronounciationEngine pronounciationEngine = null ; + static StandardPronounciationEngine pronounciationEngine = null ; - static - { - try - { - pronounciationEngine = new StandardPronounciationEngine () ; - valid = true ; - } - catch ( Exception e ) - { - valid = false ; - } - } + static + { + try + { + pronounciationEngine = new StandardPronounciationEngine () ; + valid = true ; + } + catch ( Exception e ) + { + valid = false ; + } + } - public static boolean isValid () - { - return valid ; - } + public static boolean isValid () + { + return valid ; + } - public static String standardToLocalized ( String locale, String in ) - { - if ( locale.equals ( THDL_ENGLISH ) ) - { - // - // put back the roman digraphs - // - in = in.replaceAll ( "B", "bh" ) ; - in = in.replaceAll ( "D", "dz" ) ; - in = in.replaceAll ( "K", "kh" ) ; - in = in.replaceAll ( "N", "ng" ) ; - in = in.replaceAll ( "P", "p" ) ; - in = in.replaceAll ( "S", "sh" ) ; - in = in.replaceAll ( "T", "t" ) ; - in = in.replaceAll ( "X", "ts" ) ; - in = in.replaceAll ( "Q", "ts" ) ; - in = in.replaceAll ( "Z", "sh" ) ; - in = in.replaceAll ( "c", "ch" ) ; - in = in.replaceAll ( "C", "ch" ) ; - } - else if ( locale.equals ( "POLISH" ) ) - { - // - // put back the roman digraphs - // - in = in.replaceAll ( "ny", "ni" ) ; - in = in.replaceAll ( "w", "\u0142" ) ; - in = in.replaceAll ( "B", "bh" ) ; - in = in.replaceAll ( "C", "cz'" ) ; - in = in.replaceAll ( "D", "dz" ) ; - in = in.replaceAll ( "j", "dzi" ) ; - in = in.replaceAll ( "K", "k'" ) ; - in = in.replaceAll ( "N", "ng" ) ; - in = in.replaceAll ( "P", "p'" ) ; - in = in.replaceAll ( "S", "sz" ) ; - in = in.replaceAll ( "T", "t'" ) ; - in = in.replaceAll ( "X", "c" ) ; - in = in.replaceAll ( "Q", "ts'" ) ; - in = in.replaceAll ( "y", "j" ) ; - in = in.replaceAll ( "Z", "sz" ) ; - in = in.replaceAll ( "c", "cz" ) ; - } - else if ( locale.equals ( "CZECH" ) || locale.equals ( "SLOVAK" ) ) - { - // - // put back the roman digraphs - // - in = in.replaceAll ( "ny", "\u0148" ) ; - in = in.replaceAll ( "w", "v" ) ; - in = in.replaceAll ( "B", "bh" ) ; - in = in.replaceAll ( "C", "\u010d'" ) ; - in = in.replaceAll ( "D", "dz" ) ; - in = in.replaceAll ( "j", "d\u017e" ) ; - in = in.replaceAll ( "K", "k'" ) ; - in = in.replaceAll ( "N", "ng" ) ; - in = in.replaceAll ( "P", "p'" ) ; - in = in.replaceAll ( "S", "\u0161" ) ; - in = in.replaceAll ( "T", "t'" ) ; - in = in.replaceAll ( "X", "c" ) ; - in = in.replaceAll ( "Q", "ts'" ) ; - in = in.replaceAll ( "y", "j" ) ; - in = in.replaceAll ( "Z", "\u0161" ) ; - in = in.replaceAll ( "c", "\u010d'" ) ; - } + public static String standardToLocalized ( String locale, String in ) + { + if ( locale.equals ( THDL_ENGLISH ) ) + { + // + // put back the roman digraphs + // + in = in.replaceAll ( "B", "bh" ) ; + in = in.replaceAll ( "D", "dz" ) ; + in = in.replaceAll ( "K", "kh" ) ; + in = in.replaceAll ( "N", "ng" ) ; + in = in.replaceAll ( "P", "p" ) ; + in = in.replaceAll ( "S", "sh" ) ; + in = in.replaceAll ( "T", "t" ) ; + in = in.replaceAll ( "X", "ts" ) ; + in = in.replaceAll ( "Q", "ts" ) ; + in = in.replaceAll ( "Z", "sh" ) ; + in = in.replaceAll ( "c", "ch" ) ; + in = in.replaceAll ( "C", "ch" ) ; + } + else if ( locale.equals ( "POLISH" ) ) + { + // + // put back the roman digraphs + // + in = in.replaceAll ( "ny", "ni" ) ; + in = in.replaceAll ( "w", "\u0142" ) ; + in = in.replaceAll ( "B", "bh" ) ; + in = in.replaceAll ( "C", "cz'" ) ; + in = in.replaceAll ( "D", "dz" ) ; + in = in.replaceAll ( "j", "dzi" ) ; + in = in.replaceAll ( "K", "k'" ) ; + in = in.replaceAll ( "N", "ng" ) ; + in = in.replaceAll ( "P", "p'" ) ; + in = in.replaceAll ( "S", "sz" ) ; + in = in.replaceAll ( "T", "t'" ) ; + in = in.replaceAll ( "X", "c" ) ; + in = in.replaceAll ( "Q", "ts'" ) ; + in = in.replaceAll ( "y", "j" ) ; + in = in.replaceAll ( "Z", "sz" ) ; + in = in.replaceAll ( "c", "cz" ) ; + } + else if ( locale.equals ( "CZECH" ) || locale.equals ( "SLOVAK" ) ) + { + // + // put back the roman digraphs + // + in = in.replaceAll ( "ny", "\u0148" ) ; + in = in.replaceAll ( "w", "v" ) ; + in = in.replaceAll ( "B", "bh" ) ; + in = in.replaceAll ( "C", "\u010d'" ) ; + in = in.replaceAll ( "D", "dz" ) ; + in = in.replaceAll ( "j", "d\u017e" ) ; + in = in.replaceAll ( "K", "k'" ) ; + in = in.replaceAll ( "N", "ng" ) ; + in = in.replaceAll ( "P", "p'" ) ; + in = in.replaceAll ( "S", "\u0161" ) ; + in = in.replaceAll ( "T", "t'" ) ; + in = in.replaceAll ( "X", "c" ) ; + in = in.replaceAll ( "Q", "ts'" ) ; + in = in.replaceAll ( "y", "j" ) ; + in = in.replaceAll ( "Z", "\u0161" ) ; + in = in.replaceAll ( "c", "\u010d'" ) ; + } - return in ; - } + return in ; + } - public static String wylieToStandardPhonetic ( String wylie ) - { - try - { - return pronounciationEngine.processWylie ( wylie ) ; - } - catch ( Exception e ) - { - return "" ; - } - } + public static String wylieToStandardPhonetic ( String wylie ) + { + try + { + return pronounciationEngine.processWylie ( wylie ) ; + } + catch ( Exception e ) + { + return "" ; + } + } } diff --git a/source/org/thdl/tib/dictionary/ScannerBasedDictionary.java b/source/org/thdl/tib/dictionary/ScannerBasedDictionary.java index c6dc39b..ce6a90c 100644 --- a/source/org/thdl/tib/dictionary/ScannerBasedDictionary.java +++ b/source/org/thdl/tib/dictionary/ScannerBasedDictionary.java @@ -11,32 +11,32 @@ import org.thdl.tib.dictionary.SimpleDictionaryEntries ; public class ScannerBasedDictionary implements DictionaryInterface { - TibetanScanner scanner ; + TibetanScanner scanner ; - public ScannerBasedDictionary ( TibetanScanner ts ) + public ScannerBasedDictionary ( TibetanScanner ts ) + { + scanner = ts ; + } + + public DictionaryEntries lookup ( TextBody tb ) + { + DictionaryEntries entries = new SimpleDictionaryEntries () ; + // + // TibetanScanner expects romanized wylie for lookup + // + String input = tb.getRomanizedWylie () ; + + scanner.scanBody ( input ) ; + scanner.finishUp () ; + Word [] words = scanner.getWordArray () ; + for ( int i = 0; i < words.length; i++ ) { - scanner = ts ; + SimpleDictionaryEntry entry = SimpleDictionaryEntry.fromWord ( words [i] ) ; + entries.add ( entry ) ; } - public DictionaryEntries lookup ( TextBody tb ) - { - DictionaryEntries entries = new SimpleDictionaryEntries () ; - // - // TibetanScanner expects romanized wylie for lookup - // - String input = tb.getRomanizedWylie () ; - - scanner.scanBody ( input ) ; - scanner.finishUp () ; - Word [] words = scanner.getWordArray () ; - for ( int i = 0; i < words.length; i++ ) - { - SimpleDictionaryEntry entry = SimpleDictionaryEntry.fromWord ( words [i] ) ; - entries.add ( entry ) ; - } + scanner.clearTokens () ; - scanner.clearTokens () ; - - return entries ; - } + return entries ; + } } diff --git a/source/org/thdl/tib/dictionary/SimpleDictionaryEntries.java b/source/org/thdl/tib/dictionary/SimpleDictionaryEntries.java index 7f31e6a..08acb16 100644 --- a/source/org/thdl/tib/dictionary/SimpleDictionaryEntries.java +++ b/source/org/thdl/tib/dictionary/SimpleDictionaryEntries.java @@ -3,11 +3,11 @@ package org.thdl.tib.dictionary ; import org.thdl.tib.dictionary.DictionaryEntries ; import java.util.LinkedList ; -public class SimpleDictionaryEntries extends LinkedList implements DictionaryEntries +public class SimpleDictionaryEntries extends LinkedList implements DictionaryEntries { - public SimpleDictionaryEntries () - { - super () ; - } + public SimpleDictionaryEntries () + { + super () ; + } } diff --git a/source/org/thdl/tib/dictionary/SimpleDictionaryEntry.java b/source/org/thdl/tib/dictionary/SimpleDictionaryEntry.java index ceba9be..c62cc5e 100644 --- a/source/org/thdl/tib/dictionary/SimpleDictionaryEntry.java +++ b/source/org/thdl/tib/dictionary/SimpleDictionaryEntry.java @@ -7,70 +7,70 @@ import org.thdl.tib.scanner.Word ; public class SimpleDictionaryEntry implements DictionaryEntry { - TextBody keyWord ; - DictionaryEntryDefinitions definitions ; + TextBody keyWord ; + DictionaryEntryDefinitions definitions ; - static boolean useDashes = true ; + static boolean useDashes = true ; - public static SimpleDictionaryEntry fromWord ( Word word ) + public static SimpleDictionaryEntry fromWord ( Word word ) + { + SimpleDictionaryEntry sde = new SimpleDictionaryEntry () ; + + sde.definitions = SimpleDictionaryEntryDefinitions.fromDefinitions ( word.getDefs () ) ; + sde.keyWord = SimpleTextBody.fromWylie ( word.getWylie () ) ; + //sde.spaceInfo = word.getSpaceInfo or something like that + + return sde ; + } + + boolean hasSpaceBeforeSyllable ( int syllableIndex ) + { + // + // TODO + // + return false ; + } + + String joinSyllables ( String text ) + { + String [] syllables = text.split ( " " ) ; + + String out = "" ; + for ( int i = 0; i < syllables.length; i++ ) { - SimpleDictionaryEntry sde = new SimpleDictionaryEntry () ; + if ( i > 0 ) + { + if ( hasSpaceBeforeSyllable ( i ) ) + out += " " ; + else if ( useDashes ) + out += "-" ; + } - sde.definitions = SimpleDictionaryEntryDefinitions.fromDefinitions ( word.getDefs () ) ; - sde.keyWord = SimpleTextBody.fromWylie ( word.getWylie () ) ; - //sde.spaceInfo = word.getSpaceInfo or something like that - return sde ; + + out += syllables [i] ; } - boolean hasSpaceBeforeSyllable ( int syllableIndex ) - { - // - // TODO - // - return false ; - } + return out ; + } - String joinSyllables ( String text ) - { - String [] syllables = text.split ( " " ) ; + public TextBody getKeyword () + { + return SimpleTextBody.fromWylie ( joinSyllables ( keyWord.getWylie () ) ) ; + } - String out = "" ; - for ( int i = 0; i < syllables.length; i++ ) - { - if ( i > 0 ) - { - if ( hasSpaceBeforeSyllable ( i ) ) - out += " " ; - else if ( useDashes ) - out += "-" ; - } + public String getPhonetic () + { + // + // if phonetics specified in the dictionary - use it + // + // otherwise, generate one (currently the only option) + return joinSyllables ( Phonetics.wylieToStandardPhonetic ( keyWord.getWylie () ) ) ; + } - - - out += syllables [i] ; - } - - return out ; - } - - public TextBody getKeyword () - { - return SimpleTextBody.fromWylie ( joinSyllables ( keyWord.getWylie () ) ) ; - } - - public String getPhonetic () - { - // - // if phonetics specified in the dictionary - use it - // - // otherwise, generate one (currently the only option) - return joinSyllables ( Phonetics.wylieToStandardPhonetic ( keyWord.getWylie () ) ) ; - } - - public DictionaryEntryDefinitions getDefinitions () - { - return definitions ; - } + public DictionaryEntryDefinitions getDefinitions () + { + return definitions ; + } } diff --git a/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinition.java b/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinition.java index 23a97a5..fa8b5f4 100644 --- a/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinition.java +++ b/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinition.java @@ -4,15 +4,15 @@ import org.thdl.tib.dictionary.DictionaryEntryDefinition ; class SimpleDictionaryEntryDefinition implements DictionaryEntryDefinition { - String body ; + String body ; - public SimpleDictionaryEntryDefinition ( String theBody ) - { - body = theBody ; - } + public SimpleDictionaryEntryDefinition ( String theBody ) + { + body = theBody ; + } - public String toString () - { - return body ; - } + public String toString () + { + return body ; + } } diff --git a/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinitions.java b/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinitions.java index 442709f..c9a0b8b 100644 --- a/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinitions.java +++ b/source/org/thdl/tib/dictionary/SimpleDictionaryEntryDefinitions.java @@ -9,41 +9,41 @@ import org.thdl.tib.dictionary.SimpleDictionaryEntryDefinition ; class SimpleDictionaryEntryDefinitions extends Vector implements DictionaryEntryDefinitions { - public static SimpleDictionaryEntryDefinitions fromDefinitions ( Definitions defs ) + public static SimpleDictionaryEntryDefinitions fromDefinitions ( Definitions defs ) + { + SimpleDictionaryEntryDefinitions sded = new SimpleDictionaryEntryDefinitions () ; + sded.populate ( defs ) ; + + return sded ; + } + + protected void populate ( Definitions defs ) + { + DictionarySource source = defs.getDictionarySource () ; + String [] defArr = defs.def ; + + int i,j; + + if (FileSyllableListTree.versionNumber==2) { - SimpleDictionaryEntryDefinitions sded = new SimpleDictionaryEntryDefinitions () ; - sded.populate ( defs ) ; - - return sded ; + this.add ( new SimpleDictionaryEntryDefinition ( "(" + source.getTag(0) + ") " + defArr[0] ) ) ; + for (i=1; i" ; + } - ret = ret.replaceAll ( "[\\/\\_\\*]", " " ) ; - - return ret ; - } - - public String getUnicode () - { - if ( UNICODE_TYPE == basicType ) - { - return unicode ; - } - else if ( WYLIE_TYPE == basicType ) - { - return wylieToUnicode ( unicode ) ; - } - else - { - return "" ; - } - } - - public String getWylie () - { - if ( WYLIE_TYPE == basicType ) - { - return wylie ; - } - else if ( UNICODE_TYPE == basicType ) - { - return unicodeToWylie ( unicode ) ; - } - else - { - return "" ; - } - } - - protected static String unicodeToWylie ( String in ) - { - return "" ; - } - - protected static String wylieToUnicode ( String in ) - { - return "" ; - } + protected static String wylieToUnicode ( String in ) + { + return "" ; + } }; diff --git a/source/org/thdl/tib/dictionary/StandardPronounciationEngine.java b/source/org/thdl/tib/dictionary/StandardPronounciationEngine.java index 8df5722..854dfe9 100644 --- a/source/org/thdl/tib/dictionary/StandardPronounciationEngine.java +++ b/source/org/thdl/tib/dictionary/StandardPronounciationEngine.java @@ -1,541 +1,541 @@ -package org.thdl.tib.dictionary ; - +package org.thdl.tib.dictionary ; + import java.lang.* ; import java.util.regex.Pattern ; import java.util.regex.Matcher ; import java.util.Vector ; import java.util.Enumeration ; - -public class StandardPronounciationEngine -{ - public StandardPronounciationEngine () throws Exception - { - setRules () ; - lastWordOriginal = "" ; - } - - Rule [] rules ; - protected int stateFlag ; - protected int returnFlag ; - protected String lastWordOriginal ; +public class StandardPronounciationEngine +{ + public StandardPronounciationEngine () throws Exception + { + setRules () ; + lastWordOriginal = "" ; + } - class Rule - { - /** - * conditions - */ - public static final int STARTS_WITH = 0 ; - public static final int ENDS_WITH = 1 ; - public static final int ENDS_WITH_FOLLOWS_VOWEL = 2 ; - public static final int ENDS_WITH_FOLLOWS_CONSONANT = 3 ; - public static final int STARTS_WITH_BORDERS_VOWEL = 4 ; - public static final int STARTS_WITH_BORDERS_CONSONANT = 5 ; - public static final int EQUALS = 6 ; - public static final int CONTAINS = 7 ; - public static final int STARTS_WITH_BORDERS_CONSONANTS = 8 ; + Rule [] rules ; - /** - * actions - */ - public static final int REPLACE_MATCH = 0 ; - public static final int SET_FLAG = 1 ; - public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL = 2 ; - public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT = 3 ; - public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT = 4 ; - public static final int SET_RETURN_FLAG = 5 ; + protected int stateFlag ; + protected int returnFlag ; + protected String lastWordOriginal ; - /** - * flags - */ - public static final int FLAG_ENDS_WITH_VOWEL = 1 ; - public static final int FLAG_ENDS_WITH_CONSONANT_VOICED = 2 ; - public static final int FLAG_ENDS_WITH_CONSONANT_VOICELESS = 4 ; + class Rule + { + /** + * conditions + */ + public static final int STARTS_WITH = 0 ; + public static final int ENDS_WITH = 1 ; + public static final int ENDS_WITH_FOLLOWS_VOWEL = 2 ; + public static final int ENDS_WITH_FOLLOWS_CONSONANT = 3 ; + public static final int STARTS_WITH_BORDERS_VOWEL = 4 ; + public static final int STARTS_WITH_BORDERS_CONSONANT = 5 ; + public static final int EQUALS = 6 ; + public static final int CONTAINS = 7 ; + public static final int STARTS_WITH_BORDERS_CONSONANTS = 8 ; - /** - * return flags - */ - public static final int DO_NOTHING = 0 ; - public static final int DROP_SUFFIX_AND_NASALIZE = 1 ; + /** + * actions + */ + public static final int REPLACE_MATCH = 0 ; + public static final int SET_FLAG = 1 ; + public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL = 2 ; + public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT = 3 ; + public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT = 4 ; + public static final int SET_RETURN_FLAG = 5 ; - private int condition ; - private String condArg ; - private int action ; - private Object actionArg ; + /** + * flags + */ + public static final int FLAG_ENDS_WITH_VOWEL = 1 ; + public static final int FLAG_ENDS_WITH_CONSONANT_VOICED = 2 ; + public static final int FLAG_ENDS_WITH_CONSONANT_VOICELESS = 4 ; - private Pattern pattern ; + /** + * return flags + */ + public static final int DO_NOTHING = 0 ; + public static final int DROP_SUFFIX_AND_NASALIZE = 1 ; - private static final String vowelSet = "aeiou" ; - private static final String consonantSet = "bBcCdDfgGhjkKlmnNpPrsStTvwXzZ" ; + private int condition ; + private String condArg ; + private int action ; + private Object actionArg ; - /** - * constructior - */ - Rule ( int condition, String condArg, int action, Object actionArg ) throws Exception - { - this.condition = condition ; - this.action = action ; - this.condArg = condArg ; - this.actionArg = actionArg ; + private Pattern pattern ; - String patStr = "" ; + private static final String vowelSet = "aeiou" ; + private static final String consonantSet = "bBcCdDfgGhjkKlmnNpPrsStTvwXzZ" ; - switch ( condition ) - { - case CONTAINS : - patStr = condArg ; - break ; - case EQUALS : - patStr = "^" + condArg + "$" ; - break ; - case STARTS_WITH : - patStr = "^" + condArg ; - break ; - case ENDS_WITH : - patStr = condArg + "$" ; - break ; - case ENDS_WITH_FOLLOWS_VOWEL : - patStr = "([" + vowelSet + "]{1,2})" + condArg + "$" ; - if ( REPLACE_MATCH == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) - this.actionArg = "$1" + (String)actionArg ; - break ; - case ENDS_WITH_FOLLOWS_CONSONANT : - patStr = "([" + consonantSet + "])" + condArg + "$" ; - if ( REPLACE_MATCH == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) - this.actionArg = "$1" + (String)actionArg ; - break ; - case STARTS_WITH_BORDERS_VOWEL : - patStr = "^" + condArg + "([" + vowelSet + "]{1,2})" ; - if ( REPLACE_MATCH == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) - this.actionArg = (String)actionArg + "$1" ; - break ; - case STARTS_WITH_BORDERS_CONSONANT: - patStr = "^" + condArg + "(([" + consonantSet + "]))" ; - if ( REPLACE_MATCH == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) - this.actionArg = (String)actionArg + "$1" ; - break ; - case STARTS_WITH_BORDERS_CONSONANTS: - patStr = "^" + condArg + "([" + consonantSet + "]{2})" ; - if ( REPLACE_MATCH == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || - REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) - this.actionArg = (String)actionArg + "$1" ; - break ; - default: - throw new Exception ( "Invalid condition for a rule." ) ; - } + /** + * constructior + */ + Rule ( int condition, String condArg, int action, Object actionArg ) throws Exception + { + this.condition = condition ; + this.action = action ; + this.condArg = condArg ; + this.actionArg = actionArg ; - pattern = Pattern.compile ( patStr ) ; - } + String patStr = "" ; - /** - * property access - */ - public Object getActionArg () - { - return actionArg ; - } + switch ( condition ) + { + case CONTAINS : + patStr = condArg ; + break ; + case EQUALS : + patStr = "^" + condArg + "$" ; + break ; + case STARTS_WITH : + patStr = "^" + condArg ; + break ; + case ENDS_WITH : + patStr = condArg + "$" ; + break ; + case ENDS_WITH_FOLLOWS_VOWEL : + patStr = "([" + vowelSet + "]{1,2})" + condArg + "$" ; + if ( REPLACE_MATCH == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) + this.actionArg = "$1" + (String)actionArg ; + break ; + case ENDS_WITH_FOLLOWS_CONSONANT : + patStr = "([" + consonantSet + "])" + condArg + "$" ; + if ( REPLACE_MATCH == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) + this.actionArg = "$1" + (String)actionArg ; + break ; + case STARTS_WITH_BORDERS_VOWEL : + patStr = "^" + condArg + "([" + vowelSet + "]{1,2})" ; + if ( REPLACE_MATCH == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) + this.actionArg = (String)actionArg + "$1" ; + break ; + case STARTS_WITH_BORDERS_CONSONANT: + patStr = "^" + condArg + "(([" + consonantSet + "]))" ; + if ( REPLACE_MATCH == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) + this.actionArg = (String)actionArg + "$1" ; + break ; + case STARTS_WITH_BORDERS_CONSONANTS: + patStr = "^" + condArg + "([" + consonantSet + "]{2})" ; + if ( REPLACE_MATCH == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action || + REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action ) + this.actionArg = (String)actionArg + "$1" ; + break ; + default: + throw new Exception ( "Invalid condition for a rule." ) ; + } - public String getConditionArg () - { - return condArg ; - } + pattern = Pattern.compile ( patStr ) ; + } - public int getCondition () - { - return condition ; - } + /** + * property access + */ + public Object getActionArg () + { + return actionArg ; + } - public int getAction () - { - return action ; - } + public String getConditionArg () + { + return condArg ; + } - public Pattern getPattern () - { - return pattern ; - } - } - - /** - * setRules - */ - protected void setRules () throws Exception - { - // - // based on http://www.thdl.org/xml/showEssay.php?xml=/collections/langling/THDL_phonetics.xml&l=d1e671 - // - Rule [] thdlRules = - { - // - // 6. When ba and bo appear as the final syllable of a word, they are transcribed as "wa" and "wo," respectively. - // This also includes ba'i ( > wé, about which see rule 16 below) and bar ( > war) as final syllables, although the latter is - // more evident in literary forms - // - new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wa" ), - new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wa" ), - new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "pa" ), - new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wo" ), - new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wo" ), - new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "po" ), - new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "war" ), - new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "war" ), - new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "par" ), - new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wo" ), - new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wo" ), - new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "po" ), + public int getCondition () + { + return condition ; + } - // - // 7. The consonant clusters py, phy and by are transcribed respectively as "ch," "ch," and "j." - // - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "py", Rule.REPLACE_MATCH, "c" ), - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Py", Rule.REPLACE_MATCH, "c" ), - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "by", Rule.REPLACE_MATCH, "j" ) , + public int getAction () + { + return action ; + } - // - // 8. The consonant cluster my is transcribed as "ny." - // - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "my", Rule.REPLACE_MATCH, "ny" ) , + public Pattern getPattern () + { + return pattern ; + } + } - // - // 13. When the second syllable of a word begins with the prefix achung ('), nasalization occurs - // A. An 'n' is inserted after the first syllable, and the suffix of the first syllable (if there is one) is elided - // B. If the root letter of the second syllable is pha or ba, an 'm' is inserted after the first syllable, - // and the suffix of the first syllable (if there is one) is elided - new Rule ( Rule.STARTS_WITH, "'", Rule.SET_RETURN_FLAG, new Integer ( Rule.DROP_SUFFIX_AND_NASALIZE ) ), + /** + * setRules + */ + protected void setRules () throws Exception + { + // + // based on http://www.thdl.org/xml/showEssay.php?xml=/collections/langling/THDL_phonetics.xml&l=d1e671 + // + Rule [] thdlRules = + { + // + // 6. When ba and bo appear as the final syllable of a word, they are transcribed as "wa" and "wo," respectively. + // This also includes ba'i ( > wé, about which see rule 16 below) and bar ( > war) as final syllables, although the latter is + // more evident in literary forms + // + new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wa" ), + new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wa" ), + new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "pa" ), + new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wo" ), + new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wo" ), + new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "po" ), + new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "war" ), + new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "war" ), + new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "par" ), + new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wo" ), + new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wo" ), + new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "po" ), - // - // dirty workaround - // - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "g", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "d", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "b", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "m", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "'", Rule.REPLACE_MATCH, "" ), + // + // 7. The consonant clusters py, phy and by are transcribed respectively as "ch," "ch," and "j." + // + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "py", Rule.REPLACE_MATCH, "c" ), + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Py", Rule.REPLACE_MATCH, "c" ), + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "by", Rule.REPLACE_MATCH, "j" ) , - // - // 9. Consonant clusters with r subscripts (which are pronounced as retroflexes) are transcribed with an "r." - // - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "kr", Rule.REPLACE_MATCH, "tr" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Kr", Rule.REPLACE_MATCH, "tr" ), - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "gr", Rule.REPLACE_MATCH, "dr" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "nr", Rule.REPLACE_MATCH, "n" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "pr", Rule.REPLACE_MATCH, "tr" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Pr", Rule.REPLACE_MATCH, "tr" ), - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "br", Rule.REPLACE_MATCH, "dr" ) , + // + // 8. The consonant cluster my is transcribed as "ny." + // + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "my", Rule.REPLACE_MATCH, "ny" ) , - // - // For all other consonant clusters in which the r subscript is not pronounced, - // such as mr, sr, and so forth, THDL Simplified Phonetics simply drops the "r" - // in accordance with the general principle - // - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "mr", Rule.REPLACE_MATCH, "m" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Sr", Rule.REPLACE_MATCH, "S" ), - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "sr", Rule.REPLACE_MATCH, "s" ) , + // + // 13. When the second syllable of a word begins with the prefix achung ('), nasalization occurs + // A. An 'n' is inserted after the first syllable, and the suffix of the first syllable (if there is one) is elided + // B. If the root letter of the second syllable is pha or ba, an 'm' is inserted after the first syllable, + // and the suffix of the first syllable (if there is one) is elided + new Rule ( Rule.STARTS_WITH, "'", Rule.SET_RETURN_FLAG, new Integer ( Rule.DROP_SUFFIX_AND_NASALIZE ) ), - // - // 10. Consonant clusters containing a subscript la are transcribed as "l" - // with the exception of zl, which is transcribed as "d." - // - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "kl", Rule.REPLACE_MATCH, "l" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "gl", Rule.REPLACE_MATCH, "l" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "bl", Rule.REPLACE_MATCH, "l" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "rl", Rule.REPLACE_MATCH, "l" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "zl", Rule.REPLACE_MATCH, "d" ) , - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "sl", Rule.REPLACE_MATCH, "l" ) , + // + // dirty workaround + // + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "g", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "d", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "b", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "m", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "'", Rule.REPLACE_MATCH, "" ), - // - // 11. Consonant clusters with an l superscript and h root letter are rendered "lh." - // - new Rule ( Rule.STARTS_WITH, "lh", Rule.REPLACE_MATCH, "hl" ), - - // - // 12. Consonant clusters with a d prefix and b root letter undergo transformations in the following way, - // depending on whether the consonant cluster includes the subscripts y or r: - // - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "db", Rule.REPLACE_MATCH, "w" ), - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "dby", Rule.REPLACE_MATCH, "y" ), - new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "dbr", Rule.REPLACE_MATCH, "r" ), + // + // 9. Consonant clusters with r subscripts (which are pronounced as retroflexes) are transcribed with an "r." + // + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "kr", Rule.REPLACE_MATCH, "tr" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Kr", Rule.REPLACE_MATCH, "tr" ), + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "gr", Rule.REPLACE_MATCH, "dr" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "nr", Rule.REPLACE_MATCH, "n" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "pr", Rule.REPLACE_MATCH, "tr" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Pr", Rule.REPLACE_MATCH, "tr" ), + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "br", Rule.REPLACE_MATCH, "dr" ) , - // !!!!! TODO !!!!! - // Note: there are some exceptions to the form the nasalization takes: - // skyabs 'gro > kyamdro - // rten 'brel > temdrel - // lam 'bras > lamdré + // + // For all other consonant clusters in which the r subscript is not pronounced, + // such as mr, sr, and so forth, THDL Simplified Phonetics simply drops the "r" + // in accordance with the general principle + // + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "mr", Rule.REPLACE_MATCH, "m" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Sr", Rule.REPLACE_MATCH, "S" ), + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "sr", Rule.REPLACE_MATCH, "s" ) , - // - // http://www.thdl.org/xml/showEssay.php?xml=/collections/langling/THDL_phonetics.xml&l=d1e294 - // The THDL Simplified Phonetic system, in contrast to Wylie, drops all Tibetan letters not - // pronounced in a given syllable. This includes the superscribed consonants r, l, and s ; - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "r", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "l", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "s", Rule.REPLACE_MATCH, "" ), - - // - // .... the prefixes g, d, b, m, and ' ; - // - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "g", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "d", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "b", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "m", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "'", Rule.REPLACE_MATCH, "" ), + // + // 10. Consonant clusters containing a subscript la are transcribed as "l" + // with the exception of zl, which is transcribed as "d." + // + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "kl", Rule.REPLACE_MATCH, "l" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "gl", Rule.REPLACE_MATCH, "l" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "bl", Rule.REPLACE_MATCH, "l" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "rl", Rule.REPLACE_MATCH, "l" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "zl", Rule.REPLACE_MATCH, "d" ) , + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "sl", Rule.REPLACE_MATCH, "l" ) , - // the suffixes d , ' , and s ; ( THEY ARE HANDLED ELSEWHERE ) + // + // 11. Consonant clusters with an l superscript and h root letter are rendered "lh." + // + new Rule ( Rule.STARTS_WITH, "lh", Rule.REPLACE_MATCH, "hl" ), - // - // .... and the post-suffixes s and d. - // - new Rule ( Rule.ENDS_WITH_FOLLOWS_CONSONANT, "s", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.ENDS_WITH_FOLLOWS_CONSONANT, "d", Rule.REPLACE_MATCH, "" ), + // + // 12. Consonant clusters with a d prefix and b root letter undergo transformations in the following way, + // depending on whether the consonant cluster includes the subscripts y or r: + // + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "db", Rule.REPLACE_MATCH, "w" ), + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "dby", Rule.REPLACE_MATCH, "y" ), + new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "dbr", Rule.REPLACE_MATCH, "r" ), - // - // 15. When two of the same vowels are connected by an achung, they are transcribed by dropping the achung and - // combining the two vowels into one - // - new Rule ( Rule.CONTAINS, "a'a", Rule.REPLACE_MATCH, "a" ), - new Rule ( Rule.CONTAINS, "e'e", Rule.REPLACE_MATCH, "e" ), - new Rule ( Rule.CONTAINS, "i'i", Rule.REPLACE_MATCH, "i" ), - new Rule ( Rule.CONTAINS, "o'o", Rule.REPLACE_MATCH, "o" ), - new Rule ( Rule.CONTAINS, "u'u", Rule.REPLACE_MATCH, "u" ), + // !!!!! TODO !!!!! + // Note: there are some exceptions to the form the nasalization takes: + // skyabs 'gro > kyamdro + // rten 'brel > temdrel + // lam 'bras > lamdré - new Rule ( Rule.ENDS_WITH, "as", Rule.REPLACE_MATCH, "é" ) , - new Rule ( Rule.ENDS_WITH, "ad", Rule.REPLACE_MATCH, "é" ) , - new Rule ( Rule.ENDS_WITH, "an", Rule.REPLACE_MATCH, "én" ) , - new Rule ( Rule.ENDS_WITH, "al", Rule.REPLACE_MATCH, "él" ) , - new Rule ( Rule.ENDS_WITH, "os", Rule.REPLACE_MATCH, "ö" ) , - new Rule ( Rule.ENDS_WITH, "od", Rule.REPLACE_MATCH, "ö" ) , - new Rule ( Rule.ENDS_WITH, "on", Rule.REPLACE_MATCH, "ön" ) , - new Rule ( Rule.ENDS_WITH, "ol", Rule.REPLACE_MATCH, "öl" ) , - new Rule ( Rule.ENDS_WITH, "u'", Rule.REPLACE_MATCH, "u" ) , - new Rule ( Rule.ENDS_WITH, "us", Rule.REPLACE_MATCH, "ü" ) , - new Rule ( Rule.ENDS_WITH, "ud", Rule.REPLACE_MATCH, "ü" ) , - new Rule ( Rule.ENDS_WITH, "un", Rule.REPLACE_MATCH, "ün" ) , - new Rule ( Rule.ENDS_WITH, "ul", Rule.REPLACE_MATCH, "ül" ) , - new Rule ( Rule.ENDS_WITH, "es", Rule.REPLACE_MATCH, "e" ) , - new Rule ( Rule.ENDS_WITH, "ed", Rule.REPLACE_MATCH, "e" ) , - new Rule ( Rule.ENDS_WITH, "en", Rule.REPLACE_MATCH, "en" ) , - new Rule ( Rule.ENDS_WITH, "el", Rule.REPLACE_MATCH, "el" ) , - new Rule ( Rule.ENDS_WITH, "is", Rule.REPLACE_MATCH, "i" ) , - new Rule ( Rule.ENDS_WITH, "id", Rule.REPLACE_MATCH, "i" ) , + // + // http://www.thdl.org/xml/showEssay.php?xml=/collections/langling/THDL_phonetics.xml&l=d1e294 + // The THDL Simplified Phonetic system, in contrast to Wylie, drops all Tibetan letters not + // pronounced in a given syllable. This includes the superscribed consonants r, l, and s ; + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "r", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "l", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "s", Rule.REPLACE_MATCH, "" ), - // - // 5. The suffixes g and b are devoiced and rendered "k" and "p," respectively, - // since this most closely approximates actual pronunciation. - // - new Rule ( Rule.ENDS_WITH_FOLLOWS_VOWEL, "g", Rule.REPLACE_MATCH, "k" ), - new Rule ( Rule.ENDS_WITH_FOLLOWS_VOWEL, "b", Rule.REPLACE_MATCH, "p" ), - - // these rule must be *last* - new Rule ( Rule.ENDS_WITH, "[aeiou]", Rule.SET_FLAG, new Integer(Rule.FLAG_ENDS_WITH_VOWEL) ), - new Rule ( Rule.ENDS_WITH, "(b|d|g|l|m|n|N|r)", Rule.SET_FLAG, new Integer ( Rule.FLAG_ENDS_WITH_CONSONANT_VOICED ) ), - new Rule ( Rule.ENDS_WITH, "(k|p|s|t)", Rule.SET_FLAG, new Integer ( Rule.FLAG_ENDS_WITH_CONSONANT_VOICELESS ) ), + // + // .... the prefixes g, d, b, m, and ' ; + // + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "g", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "d", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "b", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "m", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "'", Rule.REPLACE_MATCH, "" ), - new Rule ( Rule.ENDS_WITH, "a'i", Rule.REPLACE_MATCH, "e" ) , - new Rule ( Rule.ENDS_WITH, "e'i", Rule.REPLACE_MATCH, "e" ) , - new Rule ( Rule.ENDS_WITH, "o'i", Rule.REPLACE_MATCH, "ö" ) , - new Rule ( Rule.ENDS_WITH, "u'i", Rule.REPLACE_MATCH, "ü" ), + // the suffixes d , ' , and s ; ( THEY ARE HANDLED ELSEWHERE ) - // - // 14. Multiple vowels that have discrete sounds and are connected by an achung (') are transcribed by dropping the achung - // (at this point all achungs have been removed by previous rules) - // - new Rule ( Rule.CONTAINS, "'", Rule.REPLACE_MATCH, "" ), + // + // .... and the post-suffixes s and d. + // + new Rule ( Rule.ENDS_WITH_FOLLOWS_CONSONANT, "s", Rule.REPLACE_MATCH, "" ), + new Rule ( Rule.ENDS_WITH_FOLLOWS_CONSONANT, "d", Rule.REPLACE_MATCH, "" ), - //new Rule ( Rule.CONTAINS, "X", Rule.REPLACE_MATCH, "ts" ), - //new Rule ( Rule.CONTAINS, "T", Rule.REPLACE_MATCH, "t" ), - //new Rule ( Rule.CONTAINS, "P", Rule.REPLACE_MATCH, "p" ), - - } ; + // + // 15. When two of the same vowels are connected by an achung, they are transcribed by dropping the achung and + // combining the two vowels into one + // + new Rule ( Rule.CONTAINS, "a'a", Rule.REPLACE_MATCH, "a" ), + new Rule ( Rule.CONTAINS, "e'e", Rule.REPLACE_MATCH, "e" ), + new Rule ( Rule.CONTAINS, "i'i", Rule.REPLACE_MATCH, "i" ), + new Rule ( Rule.CONTAINS, "o'o", Rule.REPLACE_MATCH, "o" ), + new Rule ( Rule.CONTAINS, "u'u", Rule.REPLACE_MATCH, "u" ), - rules = thdlRules ; - } - - /** - * applyRule - */ - protected String applyRule ( Rule rule, String in ) throws Exception - { - switch ( rule.getAction () ) - { - // - // Rule.REPLACE_MATCH - if text defined in condArg found, replace it with actionArg unconditionally - // - case Rule.REPLACE_MATCH : - { - Matcher matcher = rule.getPattern ().matcher ( in ) ; - return matcher.replaceFirst ( (String)rule.getActionArg () ) ; - } + new Rule ( Rule.ENDS_WITH, "as", Rule.REPLACE_MATCH, "é" ) , + new Rule ( Rule.ENDS_WITH, "ad", Rule.REPLACE_MATCH, "é" ) , + new Rule ( Rule.ENDS_WITH, "an", Rule.REPLACE_MATCH, "én" ) , + new Rule ( Rule.ENDS_WITH, "al", Rule.REPLACE_MATCH, "él" ) , + new Rule ( Rule.ENDS_WITH, "os", Rule.REPLACE_MATCH, "ö" ) , + new Rule ( Rule.ENDS_WITH, "od", Rule.REPLACE_MATCH, "ö" ) , + new Rule ( Rule.ENDS_WITH, "on", Rule.REPLACE_MATCH, "ön" ) , + new Rule ( Rule.ENDS_WITH, "ol", Rule.REPLACE_MATCH, "öl" ) , + new Rule ( Rule.ENDS_WITH, "u'", Rule.REPLACE_MATCH, "u" ) , + new Rule ( Rule.ENDS_WITH, "us", Rule.REPLACE_MATCH, "ü" ) , + new Rule ( Rule.ENDS_WITH, "ud", Rule.REPLACE_MATCH, "ü" ) , + new Rule ( Rule.ENDS_WITH, "un", Rule.REPLACE_MATCH, "ün" ) , + new Rule ( Rule.ENDS_WITH, "ul", Rule.REPLACE_MATCH, "ül" ) , + new Rule ( Rule.ENDS_WITH, "es", Rule.REPLACE_MATCH, "e" ) , + new Rule ( Rule.ENDS_WITH, "ed", Rule.REPLACE_MATCH, "e" ) , + new Rule ( Rule.ENDS_WITH, "en", Rule.REPLACE_MATCH, "en" ) , + new Rule ( Rule.ENDS_WITH, "el", Rule.REPLACE_MATCH, "el" ) , + new Rule ( Rule.ENDS_WITH, "is", Rule.REPLACE_MATCH, "i" ) , + new Rule ( Rule.ENDS_WITH, "id", Rule.REPLACE_MATCH, "i" ) , - // - // Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL - if text defined in condArg found, - // AND last syllable ends with a vowel - replace it with actionArg - // - case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL : - if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_VOWEL ) ) - { - Matcher matcher = rule.getPattern ().matcher ( in ) ; - return matcher.replaceFirst ( (String)rule.getActionArg () ) ; - } - break ; + // + // 5. The suffixes g and b are devoiced and rendered "k" and "p," respectively, + // since this most closely approximates actual pronunciation. + // + new Rule ( Rule.ENDS_WITH_FOLLOWS_VOWEL, "g", Rule.REPLACE_MATCH, "k" ), + new Rule ( Rule.ENDS_WITH_FOLLOWS_VOWEL, "b", Rule.REPLACE_MATCH, "p" ), - // - // Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT - if text defined in condArg found, - // AND last syllable ends with a voiced consonant - replace it with actionArg - // - case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT : - if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_CONSONANT_VOICED ) ) - { - Matcher matcher = rule.getPattern ().matcher ( in ) ; - return matcher.replaceFirst ( (String)rule.getActionArg () ) ; - } - break ; + // these rule must be *last* + new Rule ( Rule.ENDS_WITH, "[aeiou]", Rule.SET_FLAG, new Integer(Rule.FLAG_ENDS_WITH_VOWEL) ), + new Rule ( Rule.ENDS_WITH, "(b|d|g|l|m|n|N|r)", Rule.SET_FLAG, new Integer ( Rule.FLAG_ENDS_WITH_CONSONANT_VOICED ) ), + new Rule ( Rule.ENDS_WITH, "(k|p|s|t)", Rule.SET_FLAG, new Integer ( Rule.FLAG_ENDS_WITH_CONSONANT_VOICELESS ) ), - // - // Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT - if text defined in condArg found, - // AND last syllable ends with a voiceless consonant - replace it with actionArg - // - case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT : - if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_CONSONANT_VOICELESS ) ) - { - Matcher matcher = rule.getPattern ().matcher ( in ) ; - return matcher.replaceFirst ( (String)rule.getActionArg () ) ; - } - break ; + new Rule ( Rule.ENDS_WITH, "a'i", Rule.REPLACE_MATCH, "e" ) , + new Rule ( Rule.ENDS_WITH, "e'i", Rule.REPLACE_MATCH, "e" ) , + new Rule ( Rule.ENDS_WITH, "o'i", Rule.REPLACE_MATCH, "ö" ) , + new Rule ( Rule.ENDS_WITH, "u'i", Rule.REPLACE_MATCH, "ü" ), - // - // Rule.SET_FLAG - set the flag - // - case Rule.SET_FLAG : - if ( rule.getPattern ().matcher ( in ).find () ) - { - stateFlag = ((Integer) rule.getActionArg ()).intValue () ; - } - break ; + // + // 14. Multiple vowels that have discrete sounds and are connected by an achung (') are transcribed by dropping the achung + // (at this point all achungs have been removed by previous rules) + // + new Rule ( Rule.CONTAINS, "'", Rule.REPLACE_MATCH, "" ), - // - // Rule.SET_RETURN_FLAG - set the return flag - // - case Rule.SET_RETURN_FLAG : - if ( rule.getPattern ().matcher ( in ).find () ) - { - returnFlag = ((Integer) rule.getActionArg ()).intValue () ; - } - break ; - - // - // shouldn't ever happen - // - default: - throw new Exception ( "Invalid action." ) ; - } + //new Rule ( Rule.CONTAINS, "X", Rule.REPLACE_MATCH, "ts" ), + //new Rule ( Rule.CONTAINS, "T", Rule.REPLACE_MATCH, "t" ), + //new Rule ( Rule.CONTAINS, "P", Rule.REPLACE_MATCH, "p" ), - return in ; - } + } ; - /** - * processWord - */ - protected String processWord ( String in ) throws Exception - { - returnFlag = Rule.DO_NOTHING ; - lastWordOriginal = in ; + rules = thdlRules ; + } - // - // run the word through all rules - // - for ( int i = 0; i < rules.length; i++ ) - { - in = applyRule ( rules [i], in ) ; - } + /** + * applyRule + */ + protected String applyRule ( Rule rule, String in ) throws Exception + { + switch ( rule.getAction () ) + { + // + // Rule.REPLACE_MATCH - if text defined in condArg found, replace it with actionArg unconditionally + // + case Rule.REPLACE_MATCH : + { + Matcher matcher = rule.getPattern ().matcher ( in ) ; + return matcher.replaceFirst ( (String)rule.getActionArg () ) ; + } - return in ; - } - - private static boolean isVowel ( char c ) - { - return ( -1 != "aeiouéöü".indexOf ( c ) ) ; - } + // + // Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL - if text defined in condArg found, + // AND last syllable ends with a vowel - replace it with actionArg + // + case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL : + if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_VOWEL ) ) + { + Matcher matcher = rule.getPattern ().matcher ( in ) ; + return matcher.replaceFirst ( (String)rule.getActionArg () ) ; + } + break ; - private static String dropSuffix ( String in ) - { - while ( !isVowel ( in.charAt ( in.length () - 1 ) ) ) - in = in.substring ( 0, in.length () - 1 ) ; + // + // Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT - if text defined in condArg found, + // AND last syllable ends with a voiced consonant - replace it with actionArg + // + case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT : + if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_CONSONANT_VOICED ) ) + { + Matcher matcher = rule.getPattern ().matcher ( in ) ; + return matcher.replaceFirst ( (String)rule.getActionArg () ) ; + } + break ; - return in ; - } + // + // Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT - if text defined in condArg found, + // AND last syllable ends with a voiceless consonant - replace it with actionArg + // + case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT : + if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_CONSONANT_VOICELESS ) ) + { + Matcher matcher = rule.getPattern ().matcher ( in ) ; + return matcher.replaceFirst ( (String)rule.getActionArg () ) ; + } + break ; - private String nasalize ( String in, String followingSyllable ) - { - if ( followingSyllable.startsWith ( "'by" ) || followingSyllable.startsWith ( "'br" ) ) - in = in + "n" ; - else if ( followingSyllable.startsWith ( "'b" ) || lastWordOriginal.startsWith ( "'ph" ) ) - in = in + "m" ; - else - in = in + "n" ; + // + // Rule.SET_FLAG - set the flag + // + case Rule.SET_FLAG : + if ( rule.getPattern ().matcher ( in ).find () ) + { + stateFlag = ((Integer) rule.getActionArg ()).intValue () ; + } + break ; - return in ; - } + // + // Rule.SET_RETURN_FLAG - set the return flag + // + case Rule.SET_RETURN_FLAG : + if ( rule.getPattern ().matcher ( in ).find () ) + { + returnFlag = ((Integer) rule.getActionArg ()).intValue () ; + } + break ; - public String processWylie ( String in ) throws Exception - { - return process ( eliminateDigraphs ( in ) ) ; - } + // + // shouldn't ever happen + // + default: + throw new Exception ( "Invalid action." ) ; + } - public static String eliminateDigraphs ( String in ) - { - // - // replace the roman digraphs - // - in = in.replaceAll ( "bh", "B" ) ; - in = in.replaceAll ( "ch", "C" ) ; - in = in.replaceAll ( "dz", "D" ) ; - in = in.replaceAll ( "kh", "K" ) ; - in = in.replaceAll ( "ng", "N" ) ; - in = in.replaceAll ( "ph", "P" ) ; - in = in.replaceAll ( "sh", "S" ) ; - in = in.replaceAll ( "th", "T" ) ; - in = in.replaceAll ( "ts", "X" ) ; - in = in.replaceAll ( "tsh", "Q" ) ; - in = in.replaceAll ( "zh", "Z" ) ; - - return in ; - } + return in ; + } - /** - * process - */ - public String process ( String in ) throws Exception - { - stateFlag = 0 ; + /** + * processWord + */ + protected String processWord ( String in ) throws Exception + { + returnFlag = Rule.DO_NOTHING ; + lastWordOriginal = in ; - String out = "" ; + // + // run the word through all rules + // + for ( int i = 0; i < rules.length; i++ ) + { + in = applyRule ( rules [i], in ) ; + } - in = in.trim () ; - String [] matchWords = in.split ( "[^a-z'A-Z]+" ) ; + return in ; + } - for ( int i = 0; i < matchWords.length; i++ ) - { - String newWord = processWord ( matchWords [i] ) ; - if ( i > 0 && Rule.DROP_SUFFIX_AND_NASALIZE == returnFlag ) - { - out = dropSuffix ( out ) ; - out = nasalize ( out, newWord ) ; - } - - if ( out.length () > 0 ) - out += " " ; + private static boolean isVowel ( char c ) + { + return ( -1 != "aeiouéöü".indexOf ( c ) ) ; + } - out += newWord ; - } + private static String dropSuffix ( String in ) + { + while ( !isVowel ( in.charAt ( in.length () - 1 ) ) ) + in = in.substring ( 0, in.length () - 1 ) ; - return out ; - } + return in ; + } + + private String nasalize ( String in, String followingSyllable ) + { + if ( followingSyllable.startsWith ( "'by" ) || followingSyllable.startsWith ( "'br" ) ) + in = in + "n" ; + else if ( followingSyllable.startsWith ( "'b" ) || lastWordOriginal.startsWith ( "'ph" ) ) + in = in + "m" ; + else + in = in + "n" ; + + return in ; + } + + public String processWylie ( String in ) throws Exception + { + return process ( eliminateDigraphs ( in ) ) ; + } + + public static String eliminateDigraphs ( String in ) + { + // + // replace the roman digraphs + // + in = in.replaceAll ( "bh", "B" ) ; + in = in.replaceAll ( "ch", "C" ) ; + in = in.replaceAll ( "dz", "D" ) ; + in = in.replaceAll ( "kh", "K" ) ; + in = in.replaceAll ( "ng", "N" ) ; + in = in.replaceAll ( "ph", "P" ) ; + in = in.replaceAll ( "sh", "S" ) ; + in = in.replaceAll ( "th", "T" ) ; + in = in.replaceAll ( "ts", "X" ) ; + in = in.replaceAll ( "tsh", "Q" ) ; + in = in.replaceAll ( "zh", "Z" ) ; + + return in ; + } + + /** + * process + */ + public String process ( String in ) throws Exception + { + stateFlag = 0 ; + + String out = "" ; + + in = in.trim () ; + String [] matchWords = in.split ( "[^a-z'A-Z]+" ) ; + + for ( int i = 0; i < matchWords.length; i++ ) + { + String newWord = processWord ( matchWords [i] ) ; + if ( i > 0 && Rule.DROP_SUFFIX_AND_NASALIZE == returnFlag ) + { + out = dropSuffix ( out ) ; + out = nasalize ( out, newWord ) ; + } + + if ( out.length () > 0 ) + out += " " ; + + out += newWord ; + } + + return out ; + } } \ No newline at end of file diff --git a/source/org/thdl/tib/dictionary/TextBody.java b/source/org/thdl/tib/dictionary/TextBody.java index f3097bf..2918e73 100644 --- a/source/org/thdl/tib/dictionary/TextBody.java +++ b/source/org/thdl/tib/dictionary/TextBody.java @@ -4,39 +4,39 @@ import java.lang.String ; public interface TextBody { - /** - * setWylie - * - * populate TextBody based on romanized Wylie input string - */ - void setWylie ( String in ) ; - - /** - * setUnicode - * - * populate TextBody based on Unicode input string - */ - void setUnicode ( String in ) ; + /** + * setWylie + * + * populate TextBody based on romanized Wylie input string + */ + void setWylie ( String in ) ; - /** - * getRomanizedWylie - * - * populate TextBody based on romanized Wylie input string - */ - public String getRomanizedWylie () ; + /** + * setUnicode + * + * populate TextBody based on Unicode input string + */ + void setUnicode ( String in ) ; - /** - * getWylie - * - * populate TextBody based on Wylie input string - */ - public String getWylie () ; + /** + * getRomanizedWylie + * + * populate TextBody based on romanized Wylie input string + */ + public String getRomanizedWylie () ; - /** - * getUnicode - * - * populate TextBody based on Unicode input string - */ - public String getUnicode () ; + /** + * getWylie + * + * populate TextBody based on Wylie input string + */ + public String getWylie () ; + + /** + * getUnicode + * + * populate TextBody based on Unicode input string + */ + public String getUnicode () ; }