dictionary interfaces
This commit is contained in:
parent
6f87bbd844
commit
955abdf0cf
17 changed files with 1064 additions and 0 deletions
9
source/org/thdl/tib/dictionary/DictionaryEntries.java
Normal file
9
source/org/thdl/tib/dictionary/DictionaryEntries.java
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.DictionaryEntry ;
|
||||||
|
import java.util.Collection ;
|
||||||
|
|
||||||
|
/**
 * Collection type used to hold dictionary entries.
 *
 * Declares no members of its own; everything comes from
 * {@link java.util.Collection}.
 */
public interface DictionaryEntries extends Collection
{
}
|
||||||
|
|
12
source/org/thdl/tib/dictionary/DictionaryEntry.java
Normal file
12
source/org/thdl/tib/dictionary/DictionaryEntry.java
Normal file
|
@ -0,0 +1,12 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.TextBody ;
|
||||||
|
import org.thdl.tib.dictionary.DictionaryEntryDefinitions ;
|
||||||
|
|
||||||
|
public interface DictionaryEntry
|
||||||
|
{
|
||||||
|
public TextBody getKeyword () ;
|
||||||
|
public String getPhonetic () ;
|
||||||
|
public DictionaryEntryDefinitions getDefinitions () ;
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,6 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
/**
 * A single definition of a dictionary entry.
 *
 * The only contract is that the definition can be rendered as text.
 */
public interface DictionaryEntryDefinition
{
    /**
     * @return the textual form of this definition
     */
    String toString () ;
}
|
|
@ -0,0 +1,8 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import java.util.Collection ;
|
||||||
|
|
||||||
|
/**
 * Collection type used to hold the definitions of a dictionary entry.
 *
 * Declares no members of its own; everything comes from
 * {@link java.util.Collection}.
 */
public interface DictionaryEntryDefinitions extends Collection
{
}
|
||||||
|
|
|
@ -0,0 +1,7 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
/**
 * A description attached to a dictionary entry, exposed as a collection of items.
 */
public interface DictionaryEntryDescription
{
    /**
     * The type is fully qualified because this file has no import for
     * java.util.Collection; the unqualified name did not resolve.
     *
     * @return the items making up this description
     */
    java.util.Collection getItems () ;
}
|
||||||
|
|
|
@ -0,0 +1,8 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import java.util.Collection ;
|
||||||
|
|
||||||
|
/**
 * Collection type used to hold dictionary entry descriptions.
 *
 * Declares no members of its own; everything comes from
 * {@link java.util.Collection}.
 */
public interface DictionaryEntryDescriptions extends Collection
{
}
|
||||||
|
|
10
source/org/thdl/tib/dictionary/DictionaryInterface.java
Normal file
10
source/org/thdl/tib/dictionary/DictionaryInterface.java
Normal file
|
@ -0,0 +1,10 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.TextBody ;
|
||||||
|
import org.thdl.tib.dictionary.DictionaryEntries ;
|
||||||
|
|
||||||
|
public interface DictionaryInterface
|
||||||
|
{
|
||||||
|
DictionaryEntries lookup ( TextBody in ) ;
|
||||||
|
}
|
||||||
|
|
109
source/org/thdl/tib/dictionary/Phonetics.java
Normal file
109
source/org/thdl/tib/dictionary/Phonetics.java
Normal file
|
@ -0,0 +1,109 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.StandardPronounciationEngine ;
|
||||||
|
|
||||||
|
public class Phonetics
|
||||||
|
{
|
||||||
|
public static final String THDL_ENGLISH = "THDL_ENGLISH" ;
|
||||||
|
public static boolean valid = false ;
|
||||||
|
|
||||||
|
static StandardPronounciationEngine pronounciationEngine = null ;
|
||||||
|
|
||||||
|
static
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
pronounciationEngine = new StandardPronounciationEngine () ;
|
||||||
|
valid = true ;
|
||||||
|
}
|
||||||
|
catch ( Exception e )
|
||||||
|
{
|
||||||
|
valid = false ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public static boolean isValid ()
|
||||||
|
{
|
||||||
|
return valid ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String standardToLocalized ( String locale, String in )
|
||||||
|
{
|
||||||
|
if ( locale.equals ( THDL_ENGLISH ) )
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// put back the roman digraphs
|
||||||
|
//
|
||||||
|
in = in.replaceAll ( "B", "bh" ) ;
|
||||||
|
in = in.replaceAll ( "D", "dz" ) ;
|
||||||
|
in = in.replaceAll ( "K", "kh" ) ;
|
||||||
|
in = in.replaceAll ( "N", "ng" ) ;
|
||||||
|
in = in.replaceAll ( "P", "p" ) ;
|
||||||
|
in = in.replaceAll ( "S", "sh" ) ;
|
||||||
|
in = in.replaceAll ( "T", "t" ) ;
|
||||||
|
in = in.replaceAll ( "X", "ts" ) ;
|
||||||
|
in = in.replaceAll ( "Q", "ts" ) ;
|
||||||
|
in = in.replaceAll ( "Z", "zh" ) ;
|
||||||
|
in = in.replaceAll ( "c", "ch" ) ;
|
||||||
|
in = in.replaceAll ( "C", "ch" ) ;
|
||||||
|
}
|
||||||
|
else if ( locale.equals ( "POLISH" ) )
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// put back the roman digraphs
|
||||||
|
//
|
||||||
|
in = in.replaceAll ( "ny", "ni" ) ;
|
||||||
|
in = in.replaceAll ( "w", "\u0142" ) ;
|
||||||
|
in = in.replaceAll ( "B", "bh" ) ;
|
||||||
|
in = in.replaceAll ( "C", "cz'" ) ;
|
||||||
|
in = in.replaceAll ( "D", "dz" ) ;
|
||||||
|
in = in.replaceAll ( "j", "dzi" ) ;
|
||||||
|
in = in.replaceAll ( "K", "k'" ) ;
|
||||||
|
in = in.replaceAll ( "N", "ng" ) ;
|
||||||
|
in = in.replaceAll ( "P", "p'" ) ;
|
||||||
|
in = in.replaceAll ( "S", "sz" ) ;
|
||||||
|
in = in.replaceAll ( "T", "t'" ) ;
|
||||||
|
in = in.replaceAll ( "X", "c" ) ;
|
||||||
|
in = in.replaceAll ( "Q", "ts'" ) ;
|
||||||
|
in = in.replaceAll ( "y", "j" ) ;
|
||||||
|
in = in.replaceAll ( "Z", "sz" ) ;
|
||||||
|
in = in.replaceAll ( "c", "cz" ) ;
|
||||||
|
}
|
||||||
|
else if ( locale.equals ( "CZECH" ) || locale.equals ( "SLOVAK" ) )
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// put back the roman digraphs
|
||||||
|
//
|
||||||
|
in = in.replaceAll ( "ny", "\u0148" ) ;
|
||||||
|
in = in.replaceAll ( "w", "v" ) ;
|
||||||
|
in = in.replaceAll ( "B", "bh" ) ;
|
||||||
|
in = in.replaceAll ( "C", "\u010d'" ) ;
|
||||||
|
in = in.replaceAll ( "D", "dz" ) ;
|
||||||
|
in = in.replaceAll ( "j", "d\u017e" ) ;
|
||||||
|
in = in.replaceAll ( "K", "k'" ) ;
|
||||||
|
in = in.replaceAll ( "N", "ng" ) ;
|
||||||
|
in = in.replaceAll ( "P", "p'" ) ;
|
||||||
|
in = in.replaceAll ( "S", "\u0161" ) ;
|
||||||
|
in = in.replaceAll ( "T", "t'" ) ;
|
||||||
|
in = in.replaceAll ( "X", "c" ) ;
|
||||||
|
in = in.replaceAll ( "Q", "ts'" ) ;
|
||||||
|
in = in.replaceAll ( "y", "j" ) ;
|
||||||
|
in = in.replaceAll ( "Z", "\u0161" ) ;
|
||||||
|
in = in.replaceAll ( "c", "\u010d'" ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
return in ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String wylieToStandardPhonetic ( String wylie )
|
||||||
|
{
|
||||||
|
try
|
||||||
|
{
|
||||||
|
return pronounciationEngine.processWylie ( wylie ) ;
|
||||||
|
}
|
||||||
|
catch ( Exception e )
|
||||||
|
{
|
||||||
|
return "<INVALID>" ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
42
source/org/thdl/tib/dictionary/ScannerBasedDictionary.java
Normal file
42
source/org/thdl/tib/dictionary/ScannerBasedDictionary.java
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.scanner.TibetanScanner ;
|
||||||
|
import org.thdl.tib.scanner.LocalTibetanScanner ;
|
||||||
|
import org.thdl.tib.scanner.RemoteTibetanScanner ;
|
||||||
|
import org.thdl.tib.scanner.Word ;
|
||||||
|
import org.thdl.tib.dictionary.DictionaryEntries ;
|
||||||
|
import org.thdl.tib.dictionary.TextBody ;
|
||||||
|
import org.thdl.tib.dictionary.SimpleDictionaryEntry ;
|
||||||
|
import org.thdl.tib.dictionary.SimpleDictionaryEntries ;
|
||||||
|
|
||||||
|
public class ScannerBasedDictionary implements DictionaryInterface
|
||||||
|
{
|
||||||
|
TibetanScanner scanner ;
|
||||||
|
|
||||||
|
public ScannerBasedDictionary ( TibetanScanner ts )
|
||||||
|
{
|
||||||
|
scanner = ts ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DictionaryEntries lookup ( TextBody tb )
|
||||||
|
{
|
||||||
|
DictionaryEntries entries = new SimpleDictionaryEntries () ;
|
||||||
|
//
|
||||||
|
// TibetanScanner expects romanized wylie for lookup
|
||||||
|
//
|
||||||
|
String input = tb.getRomanizedWylie () ;
|
||||||
|
|
||||||
|
scanner.scanBody ( input ) ;
|
||||||
|
scanner.finishUp () ;
|
||||||
|
Word [] words = scanner.getWordArray () ;
|
||||||
|
for ( int i = 0; i < words.length; i++ )
|
||||||
|
{
|
||||||
|
SimpleDictionaryEntry entry = SimpleDictionaryEntry.fromWord ( words [i] ) ;
|
||||||
|
entries.add ( entry ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
scanner.clearTokens () ;
|
||||||
|
|
||||||
|
return entries ;
|
||||||
|
}
|
||||||
|
}
|
13
source/org/thdl/tib/dictionary/SimpleDictionaryEntries.java
Normal file
13
source/org/thdl/tib/dictionary/SimpleDictionaryEntries.java
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.DictionaryEntries ;
|
||||||
|
import java.util.LinkedList ;
|
||||||
|
|
||||||
|
public class SimpleDictionaryEntries extends LinkedList implements DictionaryEntries
|
||||||
|
{
|
||||||
|
public SimpleDictionaryEntries ()
|
||||||
|
{
|
||||||
|
super () ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
76
source/org/thdl/tib/dictionary/SimpleDictionaryEntry.java
Normal file
76
source/org/thdl/tib/dictionary/SimpleDictionaryEntry.java
Normal file
|
@ -0,0 +1,76 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.DictionaryEntry ;
|
||||||
|
import org.thdl.tib.dictionary.SimpleDictionaryEntryDefinitions ;
|
||||||
|
import org.thdl.tib.dictionary.Phonetics ;
|
||||||
|
import org.thdl.tib.scanner.Word ;
|
||||||
|
|
||||||
|
public class SimpleDictionaryEntry implements DictionaryEntry
|
||||||
|
{
|
||||||
|
TextBody keyWord ;
|
||||||
|
DictionaryEntryDefinitions definitions ;
|
||||||
|
|
||||||
|
static boolean useDashes = true ;
|
||||||
|
|
||||||
|
public static SimpleDictionaryEntry fromWord ( Word word )
|
||||||
|
{
|
||||||
|
SimpleDictionaryEntry sde = new SimpleDictionaryEntry () ;
|
||||||
|
|
||||||
|
sde.definitions = SimpleDictionaryEntryDefinitions.fromDefinitions ( word.getDefs () ) ;
|
||||||
|
sde.keyWord = SimpleTextBody.fromWylie ( word.getWylie () ) ;
|
||||||
|
//sde.spaceInfo = word.getSpaceInfo or something like that
|
||||||
|
|
||||||
|
return sde ;
|
||||||
|
}
|
||||||
|
|
||||||
|
boolean hasSpaceBeforeSyllable ( int syllableIndex )
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// TODO
|
||||||
|
//
|
||||||
|
return false ;
|
||||||
|
}
|
||||||
|
|
||||||
|
String joinSyllables ( String text )
|
||||||
|
{
|
||||||
|
String [] syllables = text.split ( " " ) ;
|
||||||
|
|
||||||
|
String out = "" ;
|
||||||
|
for ( int i = 0; i < syllables.length; i++ )
|
||||||
|
{
|
||||||
|
if ( i > 0 )
|
||||||
|
{
|
||||||
|
if ( hasSpaceBeforeSyllable ( i ) )
|
||||||
|
out += " " ;
|
||||||
|
else if ( useDashes )
|
||||||
|
out += "-" ;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
out += syllables [i] ;
|
||||||
|
}
|
||||||
|
|
||||||
|
return out ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public TextBody getKeyword ()
|
||||||
|
{
|
||||||
|
return SimpleTextBody.fromWylie ( joinSyllables ( keyWord.getWylie () ) ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getPhonetic ()
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// if phonetics specified in the dictionary - use it
|
||||||
|
//
|
||||||
|
// otherwise, generate one (currently the only option)
|
||||||
|
return joinSyllables ( Phonetics.wylieToStandardPhonetic ( keyWord.getWylie () ) ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public DictionaryEntryDefinitions getDefinitions ()
|
||||||
|
{
|
||||||
|
return definitions ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,18 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.DictionaryEntryDefinition ;
|
||||||
|
|
||||||
|
class SimpleDictionaryEntryDefinition implements DictionaryEntryDefinition
|
||||||
|
{
|
||||||
|
String body ;
|
||||||
|
|
||||||
|
public SimpleDictionaryEntryDefinition ( String theBody )
|
||||||
|
{
|
||||||
|
body = theBody ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String toString ()
|
||||||
|
{
|
||||||
|
return body ;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,49 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import java.util.Vector ;
|
||||||
|
import org.thdl.tib.scanner.Definitions ;
|
||||||
|
import org.thdl.tib.scanner.DictionarySource ;
|
||||||
|
import org.thdl.tib.scanner.ByteDictionarySource ;
|
||||||
|
import org.thdl.tib.scanner.FileSyllableListTree ;
|
||||||
|
import org.thdl.tib.dictionary.SimpleDictionaryEntryDefinition ;
|
||||||
|
|
||||||
|
class SimpleDictionaryEntryDefinitions extends Vector implements DictionaryEntryDefinitions
|
||||||
|
{
|
||||||
|
public static SimpleDictionaryEntryDefinitions fromDefinitions ( Definitions defs )
|
||||||
|
{
|
||||||
|
SimpleDictionaryEntryDefinitions sded = new SimpleDictionaryEntryDefinitions () ;
|
||||||
|
sded.populate ( defs ) ;
|
||||||
|
|
||||||
|
return sded ;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected void populate ( Definitions defs )
|
||||||
|
{
|
||||||
|
DictionarySource source = defs.getDictionarySource () ;
|
||||||
|
String [] defArr = defs.def ;
|
||||||
|
|
||||||
|
int i,j;
|
||||||
|
|
||||||
|
if (FileSyllableListTree.versionNumber==2)
|
||||||
|
{
|
||||||
|
this.add ( new SimpleDictionaryEntryDefinition ( "(" + source.getTag(0) + ") " + defArr[0] ) ) ;
|
||||||
|
for (i=1; i<defArr.length; i++)
|
||||||
|
this.add ( new SimpleDictionaryEntryDefinition ( "(" + source.getTag(i) + ") " + defArr[i] ) ) ;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
ByteDictionarySource sourceb = (ByteDictionarySource) source;
|
||||||
|
j=0;
|
||||||
|
while (sourceb.isEmpty(j)) j++;
|
||||||
|
|
||||||
|
this.add ( new SimpleDictionaryEntryDefinition ( "(" + sourceb.getTag(j) + ") " + defArr[0] ) ) ;
|
||||||
|
for (i=1; i<defArr.length; i++)
|
||||||
|
{
|
||||||
|
j++;
|
||||||
|
while (sourceb.isEmpty(j)) j++;
|
||||||
|
|
||||||
|
this.add ( new SimpleDictionaryEntryDefinition ( "(" + sourceb.getTag(j) + ") " + defArr[i] ) ) ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,12 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import java.util.Vector ;
|
||||||
|
import org.thdl.tib.scanner.Definitions ;
|
||||||
|
|
||||||
|
class SimpleDictionaryEntryDefinitions implements DictionaryEntryDescriptions extends Vector
|
||||||
|
{
|
||||||
|
public static fromDescriptions ( Definitions defs )
|
||||||
|
{
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
102
source/org/thdl/tib/dictionary/SimpleTextBody.java
Normal file
102
source/org/thdl/tib/dictionary/SimpleTextBody.java
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import org.thdl.tib.dictionary.TextBody ;
|
||||||
|
|
||||||
|
public class SimpleTextBody implements TextBody
|
||||||
|
{
|
||||||
|
static final int UNDEFINED_TYPE = 0 ;
|
||||||
|
static final int WYLIE_TYPE = 1 ;
|
||||||
|
static final int UNICODE_TYPE = 2 ;
|
||||||
|
|
||||||
|
protected int basicType = UNDEFINED_TYPE ;
|
||||||
|
|
||||||
|
String unicode ;
|
||||||
|
String wylie ;
|
||||||
|
|
||||||
|
SimpleTextBody ()
|
||||||
|
{
|
||||||
|
unicode = "" ;
|
||||||
|
wylie = "" ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static TextBody fromWylie ( String in )
|
||||||
|
{
|
||||||
|
SimpleTextBody stb = new SimpleTextBody () ;
|
||||||
|
stb.setWylie ( in ) ;
|
||||||
|
|
||||||
|
return stb ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static TextBody fromUnicode ( String in )
|
||||||
|
{
|
||||||
|
SimpleTextBody stb = new SimpleTextBody () ;
|
||||||
|
stb.setUnicode ( in ) ;
|
||||||
|
|
||||||
|
return stb ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setWylie ( String in )
|
||||||
|
{
|
||||||
|
wylie = in ;
|
||||||
|
basicType = WYLIE_TYPE ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setUnicode ( String in )
|
||||||
|
{
|
||||||
|
unicode = in ;
|
||||||
|
basicType = UNICODE_TYPE ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getRomanizedWylie ()
|
||||||
|
{
|
||||||
|
String ret = getWylie () ;
|
||||||
|
|
||||||
|
ret = ret.replaceAll ( "[\\/\\_\\*]", " " ) ;
|
||||||
|
|
||||||
|
return ret ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getUnicode ()
|
||||||
|
{
|
||||||
|
if ( UNICODE_TYPE == basicType )
|
||||||
|
{
|
||||||
|
return unicode ;
|
||||||
|
}
|
||||||
|
else if ( WYLIE_TYPE == basicType )
|
||||||
|
{
|
||||||
|
return wylieToUnicode ( unicode ) ;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return "" ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getWylie ()
|
||||||
|
{
|
||||||
|
if ( WYLIE_TYPE == basicType )
|
||||||
|
{
|
||||||
|
return wylie ;
|
||||||
|
}
|
||||||
|
else if ( UNICODE_TYPE == basicType )
|
||||||
|
{
|
||||||
|
return unicodeToWylie ( unicode ) ;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
return "" ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static String unicodeToWylie ( String in )
|
||||||
|
{
|
||||||
|
return "<INVALID>" ;
|
||||||
|
}
|
||||||
|
|
||||||
|
protected static String wylieToUnicode ( String in )
|
||||||
|
{
|
||||||
|
return "<INVALID>" ;
|
||||||
|
}
|
||||||
|
|
||||||
|
};
|
||||||
|
|
541
source/org/thdl/tib/dictionary/StandardPronounciationEngine.java
Normal file
541
source/org/thdl/tib/dictionary/StandardPronounciationEngine.java
Normal file
|
@ -0,0 +1,541 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import java.lang.* ;
|
||||||
|
import java.util.regex.Pattern ;
|
||||||
|
import java.util.regex.Matcher ;
|
||||||
|
import java.util.Vector ;
|
||||||
|
import java.util.Enumeration ;
|
||||||
|
|
||||||
|
public class StandardPronounciationEngine
|
||||||
|
{
|
||||||
|
/**
 * Creates the engine: installs the rule set, then resets the remembered
 * last word to the empty string.
 *
 * @throws Exception when {@code setRules ()} fails
 */
public StandardPronounciationEngine () throws Exception
{
    this.setRules () ;
    this.lastWordOriginal = "" ;
}
|
||||||
|
|
||||||
|
Rule [] rules ;
|
||||||
|
|
||||||
|
protected int stateFlag ;
|
||||||
|
protected int returnFlag ;
|
||||||
|
protected String lastWordOriginal ;
|
||||||
|
|
||||||
|
class Rule
|
||||||
|
{
|
||||||
|
/**
|
||||||
|
* conditions
|
||||||
|
*/
|
||||||
|
public static final int STARTS_WITH = 0 ;
|
||||||
|
public static final int ENDS_WITH = 1 ;
|
||||||
|
public static final int ENDS_WITH_FOLLOWS_VOWEL = 2 ;
|
||||||
|
public static final int ENDS_WITH_FOLLOWS_CONSONANT = 3 ;
|
||||||
|
public static final int STARTS_WITH_BORDERS_VOWEL = 4 ;
|
||||||
|
public static final int STARTS_WITH_BORDERS_CONSONANT = 5 ;
|
||||||
|
public static final int EQUALS = 6 ;
|
||||||
|
public static final int CONTAINS = 7 ;
|
||||||
|
public static final int STARTS_WITH_BORDERS_CONSONANTS = 8 ;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* actions
|
||||||
|
*/
|
||||||
|
public static final int REPLACE_MATCH = 0 ;
|
||||||
|
public static final int SET_FLAG = 1 ;
|
||||||
|
public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL = 2 ;
|
||||||
|
public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT = 3 ;
|
||||||
|
public static final int REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT = 4 ;
|
||||||
|
public static final int SET_RETURN_FLAG = 5 ;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* flags
|
||||||
|
*/
|
||||||
|
public static final int FLAG_ENDS_WITH_VOWEL = 1 ;
|
||||||
|
public static final int FLAG_ENDS_WITH_CONSONANT_VOICED = 2 ;
|
||||||
|
public static final int FLAG_ENDS_WITH_CONSONANT_VOICELESS = 4 ;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* return flags
|
||||||
|
*/
|
||||||
|
public static final int DO_NOTHING = 0 ;
|
||||||
|
public static final int DROP_SUFFIX_AND_NASALIZE = 1 ;
|
||||||
|
|
||||||
|
private int condition ;
|
||||||
|
private String condArg ;
|
||||||
|
private int action ;
|
||||||
|
private Object actionArg ;
|
||||||
|
|
||||||
|
private Pattern pattern ;
|
||||||
|
|
||||||
|
private static final String vowelSet = "aeiou" ;
|
||||||
|
private static final String consonantSet = "bBcCdDfgGhjkKlmnNpPrsStTvwXzZ" ;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* constructior
|
||||||
|
*/
|
||||||
|
Rule ( int condition, String condArg, int action, Object actionArg ) throws Exception
|
||||||
|
{
|
||||||
|
this.condition = condition ;
|
||||||
|
this.action = action ;
|
||||||
|
this.condArg = condArg ;
|
||||||
|
this.actionArg = actionArg ;
|
||||||
|
|
||||||
|
String patStr = "" ;
|
||||||
|
|
||||||
|
switch ( condition )
|
||||||
|
{
|
||||||
|
case CONTAINS :
|
||||||
|
patStr = condArg ;
|
||||||
|
break ;
|
||||||
|
case EQUALS :
|
||||||
|
patStr = "^" + condArg + "$" ;
|
||||||
|
break ;
|
||||||
|
case STARTS_WITH :
|
||||||
|
patStr = "^" + condArg ;
|
||||||
|
break ;
|
||||||
|
case ENDS_WITH :
|
||||||
|
patStr = condArg + "$" ;
|
||||||
|
break ;
|
||||||
|
case ENDS_WITH_FOLLOWS_VOWEL :
|
||||||
|
patStr = "([" + vowelSet + "]{1,2})" + condArg + "$" ;
|
||||||
|
if ( REPLACE_MATCH == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action )
|
||||||
|
this.actionArg = "$1" + (String)actionArg ;
|
||||||
|
break ;
|
||||||
|
case ENDS_WITH_FOLLOWS_CONSONANT :
|
||||||
|
patStr = "([" + consonantSet + "])" + condArg + "$" ;
|
||||||
|
if ( REPLACE_MATCH == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action )
|
||||||
|
this.actionArg = "$1" + (String)actionArg ;
|
||||||
|
break ;
|
||||||
|
case STARTS_WITH_BORDERS_VOWEL :
|
||||||
|
patStr = "^" + condArg + "([" + vowelSet + "]{1,2})" ;
|
||||||
|
if ( REPLACE_MATCH == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action )
|
||||||
|
this.actionArg = (String)actionArg + "$1" ;
|
||||||
|
break ;
|
||||||
|
case STARTS_WITH_BORDERS_CONSONANT:
|
||||||
|
patStr = "^" + condArg + "(([" + consonantSet + "]))" ;
|
||||||
|
if ( REPLACE_MATCH == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action )
|
||||||
|
this.actionArg = (String)actionArg + "$1" ;
|
||||||
|
break ;
|
||||||
|
case STARTS_WITH_BORDERS_CONSONANTS:
|
||||||
|
patStr = "^" + condArg + "([" + consonantSet + "]{2})" ;
|
||||||
|
if ( REPLACE_MATCH == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL == action ||
|
||||||
|
REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT == action )
|
||||||
|
this.actionArg = (String)actionArg + "$1" ;
|
||||||
|
break ;
|
||||||
|
default:
|
||||||
|
throw new Exception ( "Invalid condition for a rule." ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
pattern = Pattern.compile ( patStr ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* property access
|
||||||
|
*/
|
||||||
|
public Object getActionArg ()
|
||||||
|
{
|
||||||
|
return actionArg ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String getConditionArg ()
|
||||||
|
{
|
||||||
|
return condArg ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getCondition ()
|
||||||
|
{
|
||||||
|
return condition ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public int getAction ()
|
||||||
|
{
|
||||||
|
return action ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public Pattern getPattern ()
|
||||||
|
{
|
||||||
|
return pattern ;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* setRules
|
||||||
|
*/
|
||||||
|
protected void setRules () throws Exception
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// based on http://www.thdl.org/xml/showEssay.php?xml=/collections/langling/THDL_phonetics.xml&l=d1e671
|
||||||
|
//
|
||||||
|
Rule [] thdlRules =
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// 6. When ba and bo appear as the final syllable of a word, they are transcribed as "wa" and "wo," respectively.
|
||||||
|
// This also includes ba'i ( > wé, about which see rule 16 below) and bar ( > war) as final syllables, although the latter is
|
||||||
|
// more evident in literary forms
|
||||||
|
//
|
||||||
|
new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wa" ),
|
||||||
|
new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wa" ),
|
||||||
|
new Rule ( Rule.EQUALS, "ba", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "pa" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wo" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wo" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "po" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "war" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "war" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bar", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "par" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL, "wo" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT, "wo" ),
|
||||||
|
new Rule ( Rule.EQUALS, "bo", Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT, "po" ),
|
||||||
|
|
||||||
|
//
|
||||||
|
// 7. The consonant clusters py, phy and by are transcribed respectively as "ch," "ch," and "j."
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "py", Rule.REPLACE_MATCH, "c" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Py", Rule.REPLACE_MATCH, "c" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "by", Rule.REPLACE_MATCH, "j" ) ,
|
||||||
|
|
||||||
|
//
|
||||||
|
// 8. The consonant cluster my is transcribed as "ny."
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "my", Rule.REPLACE_MATCH, "ny" ) ,
|
||||||
|
|
||||||
|
//
|
||||||
|
// 13. When the second syllable of a word begins with the prefix achung ('), nasalization occurs
|
||||||
|
// A. An 'n' is inserted after the first syllable, and the suffix of the first syllable (if there is one) is elided
|
||||||
|
// B. If the root letter of the second syllable is pha or ba, an 'm' is inserted after the first syllable,
|
||||||
|
// and the suffix of the first syllable (if there is one) is elided
|
||||||
|
new Rule ( Rule.STARTS_WITH, "'", Rule.SET_RETURN_FLAG, new Integer ( Rule.DROP_SUFFIX_AND_NASALIZE ) ),
|
||||||
|
|
||||||
|
//
|
||||||
|
// dirty workaround
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "g", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "d", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "b", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "m", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANTS, "'", Rule.REPLACE_MATCH, "" ),
|
||||||
|
|
||||||
|
//
|
||||||
|
// 9. Consonant clusters with r subscripts (which are pronounced as retroflexes) are transcribed with an "r."
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "kr", Rule.REPLACE_MATCH, "tr" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Kr", Rule.REPLACE_MATCH, "tr" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "gr", Rule.REPLACE_MATCH, "dr" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "nr", Rule.REPLACE_MATCH, "n" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "pr", Rule.REPLACE_MATCH, "tr" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Pr", Rule.REPLACE_MATCH, "tr" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "br", Rule.REPLACE_MATCH, "dr" ) ,
|
||||||
|
|
||||||
|
//
|
||||||
|
// For all other consonant clusters in which the r subscript is not pronounced,
|
||||||
|
// such as mr, sr, and so forth, THDL Simplified Phonetics simply drops the "r"
|
||||||
|
// in accordance with the general principle
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "mr", Rule.REPLACE_MATCH, "m" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "Sr", Rule.REPLACE_MATCH, "S" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "sr", Rule.REPLACE_MATCH, "s" ) ,
|
||||||
|
|
||||||
|
//
|
||||||
|
// 10. Consonant clusters containing a subscript la are transcribed as "l"
|
||||||
|
// with the exception of zl, which is transcribed as "d."
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "kl", Rule.REPLACE_MATCH, "l" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "gl", Rule.REPLACE_MATCH, "l" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "bl", Rule.REPLACE_MATCH, "l" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "rl", Rule.REPLACE_MATCH, "l" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "zl", Rule.REPLACE_MATCH, "d" ) ,
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "sl", Rule.REPLACE_MATCH, "l" ) ,
|
||||||
|
|
||||||
|
//
|
||||||
|
// 11. Consonant clusters with an l superscript and h root letter are rendered "lh."
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH, "lh", Rule.REPLACE_MATCH, "hl" ),
|
||||||
|
|
||||||
|
//
|
||||||
|
// 12. Consonant clusters with a d prefix and b root letter undergo transformations in the following way,
|
||||||
|
// depending on whether the consonant cluster includes the subscripts y or r:
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "db", Rule.REPLACE_MATCH, "w" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "dby", Rule.REPLACE_MATCH, "y" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_VOWEL, "dbr", Rule.REPLACE_MATCH, "r" ),
|
||||||
|
|
||||||
|
// !!!!! TODO !!!!!
|
||||||
|
// Note: there are some exceptions to the form the nasalization takes:
|
||||||
|
// skyabs 'gro > kyamdro
|
||||||
|
// rten 'brel > temdrel
|
||||||
|
// lam 'bras > lamdré
|
||||||
|
|
||||||
|
//
|
||||||
|
// http://www.thdl.org/xml/showEssay.php?xml=/collections/langling/THDL_phonetics.xml&l=d1e294
|
||||||
|
// The THDL Simplified Phonetic system, in contrast to Wylie, drops all Tibetan letters not
|
||||||
|
// pronounced in a given syllable. This includes the superscribed consonants r, l, and s ;
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "r", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "l", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "s", Rule.REPLACE_MATCH, "" ),
|
||||||
|
|
||||||
|
//
|
||||||
|
// .... the prefixes g, d, b, m, and ' ;
|
||||||
|
//
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "g", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "d", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "b", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "m", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.STARTS_WITH_BORDERS_CONSONANT, "'", Rule.REPLACE_MATCH, "" ),
|
||||||
|
|
||||||
|
// the suffixes d , ' , and s ; ( THEY ARE HANDLED ELSEWHERE )
|
||||||
|
|
||||||
|
//
|
||||||
|
// .... and the post-suffixes s and d.
|
||||||
|
//
|
||||||
|
new Rule ( Rule.ENDS_WITH_FOLLOWS_CONSONANT, "s", Rule.REPLACE_MATCH, "" ),
|
||||||
|
new Rule ( Rule.ENDS_WITH_FOLLOWS_CONSONANT, "d", Rule.REPLACE_MATCH, "" ),
|
||||||
|
|
||||||
|
//
|
||||||
|
// 15. When two of the same vowels are connected by an achung, they are transcribed by dropping the achung and
|
||||||
|
// combining the two vowels into one
|
||||||
|
//
|
||||||
|
new Rule ( Rule.CONTAINS, "a'a", Rule.REPLACE_MATCH, "a" ),
|
||||||
|
new Rule ( Rule.CONTAINS, "e'e", Rule.REPLACE_MATCH, "e" ),
|
||||||
|
new Rule ( Rule.CONTAINS, "i'i", Rule.REPLACE_MATCH, "i" ),
|
||||||
|
new Rule ( Rule.CONTAINS, "o'o", Rule.REPLACE_MATCH, "o" ),
|
||||||
|
new Rule ( Rule.CONTAINS, "u'u", Rule.REPLACE_MATCH, "u" ),
|
||||||
|
|
||||||
|
new Rule ( Rule.ENDS_WITH, "as", Rule.REPLACE_MATCH, "é" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "ad", Rule.REPLACE_MATCH, "é" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "an", Rule.REPLACE_MATCH, "én" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "al", Rule.REPLACE_MATCH, "él" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "os", Rule.REPLACE_MATCH, "ö" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "od", Rule.REPLACE_MATCH, "ö" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "on", Rule.REPLACE_MATCH, "ön" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "ol", Rule.REPLACE_MATCH, "öl" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "u'", Rule.REPLACE_MATCH, "u" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "us", Rule.REPLACE_MATCH, "ü" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "ud", Rule.REPLACE_MATCH, "ü" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "un", Rule.REPLACE_MATCH, "ün" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "ul", Rule.REPLACE_MATCH, "ül" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "es", Rule.REPLACE_MATCH, "e" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "ed", Rule.REPLACE_MATCH, "e" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "en", Rule.REPLACE_MATCH, "en" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "el", Rule.REPLACE_MATCH, "el" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "is", Rule.REPLACE_MATCH, "i" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "id", Rule.REPLACE_MATCH, "i" ) ,
|
||||||
|
|
||||||
|
//
|
||||||
|
// 5. The suffixes g and b are devoiced and rendered "k" and "p," respectively,
|
||||||
|
// since this most closely approximates actual pronunciation.
|
||||||
|
//
|
||||||
|
new Rule ( Rule.ENDS_WITH_FOLLOWS_VOWEL, "g", Rule.REPLACE_MATCH, "k" ),
|
||||||
|
new Rule ( Rule.ENDS_WITH_FOLLOWS_VOWEL, "b", Rule.REPLACE_MATCH, "p" ),
|
||||||
|
|
||||||
|
// these rules must be *last*
|
||||||
|
new Rule ( Rule.ENDS_WITH, "[aeiou]", Rule.SET_FLAG, new Integer(Rule.FLAG_ENDS_WITH_VOWEL) ),
|
||||||
|
new Rule ( Rule.ENDS_WITH, "(b|d|g|l|m|n|N|r)", Rule.SET_FLAG, new Integer ( Rule.FLAG_ENDS_WITH_CONSONANT_VOICED ) ),
|
||||||
|
new Rule ( Rule.ENDS_WITH, "(k|p|s|t)", Rule.SET_FLAG, new Integer ( Rule.FLAG_ENDS_WITH_CONSONANT_VOICELESS ) ),
|
||||||
|
|
||||||
|
new Rule ( Rule.ENDS_WITH, "a'i", Rule.REPLACE_MATCH, "e" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "e'i", Rule.REPLACE_MATCH, "e" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "o'i", Rule.REPLACE_MATCH, "ö" ) ,
|
||||||
|
new Rule ( Rule.ENDS_WITH, "u'i", Rule.REPLACE_MATCH, "ü" ),
|
||||||
|
|
||||||
|
//
|
||||||
|
// 14. Multiple vowels that have discrete sounds and are connected by an achung (') are transcribed by dropping the achung
|
||||||
|
// (at this point all achungs have been removed by previous rules)
|
||||||
|
//
|
||||||
|
new Rule ( Rule.CONTAINS, "'", Rule.REPLACE_MATCH, "" ),
|
||||||
|
|
||||||
|
//new Rule ( Rule.CONTAINS, "X", Rule.REPLACE_MATCH, "ts" ),
|
||||||
|
//new Rule ( Rule.CONTAINS, "T", Rule.REPLACE_MATCH, "t" ),
|
||||||
|
//new Rule ( Rule.CONTAINS, "P", Rule.REPLACE_MATCH, "p" ),
|
||||||
|
|
||||||
|
} ;
|
||||||
|
|
||||||
|
rules = thdlRules ;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* applyRule
|
||||||
|
*/
|
||||||
|
protected String applyRule ( Rule rule, String in ) throws Exception
|
||||||
|
{
|
||||||
|
switch ( rule.getAction () )
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// Rule.REPLACE_MATCH - if text defined in condArg found, replace it with actionArg unconditionally
|
||||||
|
//
|
||||||
|
case Rule.REPLACE_MATCH :
|
||||||
|
{
|
||||||
|
Matcher matcher = rule.getPattern ().matcher ( in ) ;
|
||||||
|
return matcher.replaceFirst ( (String)rule.getActionArg () ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
//
|
||||||
|
// Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL - if text defined in condArg found,
|
||||||
|
// AND last syllable ends with a vowel - replace it with actionArg
|
||||||
|
//
|
||||||
|
case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOWEL :
|
||||||
|
if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_VOWEL ) )
|
||||||
|
{
|
||||||
|
Matcher matcher = rule.getPattern ().matcher ( in ) ;
|
||||||
|
return matcher.replaceFirst ( (String)rule.getActionArg () ) ;
|
||||||
|
}
|
||||||
|
break ;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT - if text defined in condArg found,
|
||||||
|
// AND last syllable ends with a voiced consonant - replace it with actionArg
|
||||||
|
//
|
||||||
|
case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICED_CONSONANT :
|
||||||
|
if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_CONSONANT_VOICED ) )
|
||||||
|
{
|
||||||
|
Matcher matcher = rule.getPattern ().matcher ( in ) ;
|
||||||
|
return matcher.replaceFirst ( (String)rule.getActionArg () ) ;
|
||||||
|
}
|
||||||
|
break ;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT - if text defined in condArg found,
|
||||||
|
// AND last syllable ends with a voiceless consonant - replace it with actionArg
|
||||||
|
//
|
||||||
|
case Rule.REPLACE_IF_LAST_SYLLABLE_ENDED_WITH_VOICELESS_CONSONANT :
|
||||||
|
if ( 0 != ( stateFlag & Rule.FLAG_ENDS_WITH_CONSONANT_VOICELESS ) )
|
||||||
|
{
|
||||||
|
Matcher matcher = rule.getPattern ().matcher ( in ) ;
|
||||||
|
return matcher.replaceFirst ( (String)rule.getActionArg () ) ;
|
||||||
|
}
|
||||||
|
break ;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Rule.SET_FLAG - set the flag
|
||||||
|
//
|
||||||
|
case Rule.SET_FLAG :
|
||||||
|
if ( rule.getPattern ().matcher ( in ).find () )
|
||||||
|
{
|
||||||
|
stateFlag = ((Integer) rule.getActionArg ()).intValue () ;
|
||||||
|
}
|
||||||
|
break ;
|
||||||
|
|
||||||
|
//
|
||||||
|
// Rule.SET_RETURN_FLAG - set the return flag
|
||||||
|
//
|
||||||
|
case Rule.SET_RETURN_FLAG :
|
||||||
|
if ( rule.getPattern ().matcher ( in ).find () )
|
||||||
|
{
|
||||||
|
returnFlag = ((Integer) rule.getActionArg ()).intValue () ;
|
||||||
|
}
|
||||||
|
break ;
|
||||||
|
|
||||||
|
//
|
||||||
|
// shouldn't ever happen
|
||||||
|
//
|
||||||
|
default:
|
||||||
|
throw new Exception ( "Invalid action." ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
return in ;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* processWord
|
||||||
|
*/
|
||||||
|
protected String processWord ( String in ) throws Exception
|
||||||
|
{
|
||||||
|
returnFlag = Rule.DO_NOTHING ;
|
||||||
|
lastWordOriginal = in ;
|
||||||
|
|
||||||
|
//
|
||||||
|
// run the word through all rules
|
||||||
|
//
|
||||||
|
for ( int i = 0; i < rules.length; i++ )
|
||||||
|
{
|
||||||
|
in = applyRule ( rules [i], in ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
return in ;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static boolean isVowel ( char c )
|
||||||
|
{
|
||||||
|
return ( -1 != "aeiouéöü".indexOf ( c ) ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String dropSuffix ( String in )
|
||||||
|
{
|
||||||
|
while ( !isVowel ( in.charAt ( in.length () - 1 ) ) )
|
||||||
|
in = in.substring ( 0, in.length () - 1 ) ;
|
||||||
|
|
||||||
|
return in ;
|
||||||
|
}
|
||||||
|
|
||||||
|
private String nasalize ( String in, String followingSyllable )
|
||||||
|
{
|
||||||
|
if ( followingSyllable.startsWith ( "'by" ) || followingSyllable.startsWith ( "'br" ) )
|
||||||
|
in = in + "n" ;
|
||||||
|
else if ( followingSyllable.startsWith ( "'b" ) || lastWordOriginal.startsWith ( "'ph" ) )
|
||||||
|
in = in + "m" ;
|
||||||
|
else
|
||||||
|
in = in + "n" ;
|
||||||
|
|
||||||
|
return in ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public String processWylie ( String in ) throws Exception
|
||||||
|
{
|
||||||
|
return process ( eliminateDigraphs ( in ) ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String eliminateDigraphs ( String in )
|
||||||
|
{
|
||||||
|
//
|
||||||
|
// replace the roman digraphs
|
||||||
|
//
|
||||||
|
in = in.replaceAll ( "bh", "B" ) ;
|
||||||
|
in = in.replaceAll ( "ch", "C" ) ;
|
||||||
|
in = in.replaceAll ( "dz", "D" ) ;
|
||||||
|
in = in.replaceAll ( "kh", "K" ) ;
|
||||||
|
in = in.replaceAll ( "ng", "N" ) ;
|
||||||
|
in = in.replaceAll ( "ph", "P" ) ;
|
||||||
|
in = in.replaceAll ( "sh", "S" ) ;
|
||||||
|
in = in.replaceAll ( "th", "T" ) ;
|
||||||
|
in = in.replaceAll ( "ts", "X" ) ;
|
||||||
|
in = in.replaceAll ( "tsh", "Q" ) ;
|
||||||
|
in = in.replaceAll ( "zh", "Z" ) ;
|
||||||
|
|
||||||
|
return in ;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* process
|
||||||
|
*/
|
||||||
|
public String process ( String in ) throws Exception
|
||||||
|
{
|
||||||
|
stateFlag = 0 ;
|
||||||
|
|
||||||
|
String out = "" ;
|
||||||
|
|
||||||
|
in = in.trim () ;
|
||||||
|
String [] matchWords = in.split ( "[^a-z'A-Z]+" ) ;
|
||||||
|
|
||||||
|
for ( int i = 0; i < matchWords.length; i++ )
|
||||||
|
{
|
||||||
|
String newWord = processWord ( matchWords [i] ) ;
|
||||||
|
if ( i > 0 && Rule.DROP_SUFFIX_AND_NASALIZE == returnFlag )
|
||||||
|
{
|
||||||
|
out = dropSuffix ( out ) ;
|
||||||
|
out = nasalize ( out, newWord ) ;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( out.length () > 0 )
|
||||||
|
out += " " ;
|
||||||
|
|
||||||
|
out += newWord ;
|
||||||
|
}
|
||||||
|
|
||||||
|
return out ;
|
||||||
|
}
|
||||||
|
}
|
42
source/org/thdl/tib/dictionary/TextBody.java
Normal file
42
source/org/thdl/tib/dictionary/TextBody.java
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
package org.thdl.tib.dictionary ;
|
||||||
|
|
||||||
|
import java.lang.String ;
|
||||||
|
|
||||||
|
/**
 * A body of text that can be filled from, and read back as, more than one
 * representation ( Wylie and Unicode ).
 *
 * NOTE(review): this interface does not say how getRomanizedWylie and
 * getWylie differ - confirm against the implementing classes.
 */
public interface TextBody
{
    /**
     * setWylie
     *
     * Populates this TextBody from a romanized Wylie input string.
     *
     * @param in the romanized Wylie text
     */
    void setWylie ( String in ) ;

    /**
     * setUnicode
     *
     * Populates this TextBody from a Unicode input string.
     *
     * @param in the Unicode text
     */
    void setUnicode ( String in ) ;

    /**
     * getRomanizedWylie
     *
     * @return this TextBody's text as a romanized Wylie string
     */
    String getRomanizedWylie () ;

    /**
     * getWylie
     *
     * @return this TextBody's text as a Wylie string
     */
    String getWylie () ;

    /**
     * getUnicode
     *
     * @return this TextBody's text as a Unicode string
     */
    String getUnicode () ;
}
|
||||||
|
|
Loading…
Reference in a new issue