2002-11-02 03:38:59 +00:00
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2004 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
package org.thdl.tib.text ;
import java.util.* ;
import javax.swing.* ;
import javax.swing.text.* ;
import javax.swing.text.rtf.RTFEditorKit ;
import java.io.* ;
import org.thdl.util.ThdlDebug ;
2003-10-19 20:16:06 +00:00
import org.thdl.tib.text.ttt.ACIPTshegBarScanner ;
import org.thdl.tib.text.ttt.ACIPConverter ;
2003-08-23 22:03:37 +00:00
import org.thdl.tib.text.tshegbar.LegalTshegBar ;
import org.thdl.tib.text.tshegbar.UnicodeConstants ;
import org.thdl.tib.text.tshegbar.UnicodeUtils ;
2002-11-02 03:38:59 +00:00
/**
 * Provides methods for converting back and forth between Extended
 * Wylie/ACIP and Tibetan represented in TibetanMachineWeb glyphs.
 * This class is not instantiable.
 *
 * <p>The class provides a variety of static methods for converting
 * back and forth between Extended Wylie or ACIP and TibetanMachineWeb.
 * The Wylie/ACIP can be accessed as a String, while the
 * TibetanMachineWeb can be exported as Rich Text Format.
 *
 * @author Edward Garrett, Tibetan and Himalayan Digital Library */
2003-03-31 00:33:50 +00:00
public class TibTextUtils implements THDLWylieConstants {
2003-07-08 14:30:17 +00:00
/** Change to true to see various things on System.out and
    System.err. */
private static final boolean debug = false ;
2002-11-02 03:38:59 +00:00
/** Do not use this constructor.  It is private so that this class,
    which offers only static members, cannot be instantiated from
    outside. */
private TibTextUtils ( ) { super ( ) ; }
/ * *
* Converts a list of glyphs into an array of { @link DuffData DuffData } .
* The motivation for this is that most processes - for example using
* TibetanMachineWeb in HTML - only need to know what
* text to output , and when to change fonts . In general , they don ' t
* need to have an explicit indication for each glyph of the font
* for that glyph .
* @param glyphs the list of TibetanMachineWeb glyphs
* you want to convert
* @return an array of DuffData corresponding to this
* list of glyphs
* /
2003-10-18 18:34:49 +00:00
public static DuffData [ ] convertGlyphs ( List glyphs ) {
if ( glyphs . size ( ) = = 0 )
return null ;
List data = new ArrayList ( ) ;
StringBuffer sb = new StringBuffer ( ) ;
Iterator iter = glyphs . iterator ( ) ;
DuffCode dc = ( DuffCode ) iter . next ( ) ;
int lastfont = dc . getFontNum ( ) ;
sb . append ( dc . getCharacter ( ) ) ;
while ( iter . hasNext ( ) ) {
dc = ( DuffCode ) iter . next ( ) ;
if ( dc . getFontNum ( ) = = lastfont )
sb . append ( dc . getCharacter ( ) ) ;
else {
data . add ( new DuffData ( sb . toString ( ) , lastfont ) ) ;
lastfont = dc . getFontNum ( ) ;
sb = new StringBuffer ( ) ;
sb . append ( dc . getCharacter ( ) ) ;
}
}
data . add ( new DuffData ( sb . toString ( ) , lastfont ) ) ;
DuffData [ ] dd = new DuffData [ 0 ] ;
dd = ( DuffData [ ] ) data . toArray ( dd ) ;
return dd ;
}
2002-11-02 03:38:59 +00:00
/ * *
2003-04-08 04:56:40 +00:00
* Figures out how to arrange a list of characters into glyphs . For
* example , if the user types ' bsgr ' using the Extended Wylie keyboard ,
* this method figures out that this should be represented as a 'b'
* glyph followed by a ' s - g - r ' glyph . If you know that the characters
* do not contain Sanskrit stacks , or do not contain Tibetan stacks ,
* then you can specify this to speed the process up . Otherwise , the
* method will first check to see if the characters correspond to any
* Tibetan stacks , and if not , then it will check for Sanskrit stacks .
* @param chars the list of Tibetan characters you want to find glyphs
* for
* @param areStacksOnRight whether stacking should try to maximize from
* right to left ( true ) or from left to right ( false ) . In the Extended
* Wylie keyboard , you try to stack from right to left . Thus , the
* character sequence r - g - r would be stacked as r followed by gr ,
* rather than rg followed by r . In the Sambhota and TCC keyboards , the
* stack direction is reversed .
* @param definitelyTibetan should be true if the characters are known
* to be Tibetan and not Sanskrit
* @param definitelySanskrit should be true if the characters are known
* to be Sanskrit and not Tibetan
2002-11-02 03:38:59 +00:00
* /
2003-10-18 18:34:49 +00:00
public static List getGlyphs ( List chars , boolean areStacksOnRight , boolean definitelyTibetan , boolean definitelySanskrit ) {
StringBuffer tibBuffer , sanBuffer ;
String tibCluster , sanCluster ;
2002-11-02 03:38:59 +00:00
2003-10-18 18:34:49 +00:00
boolean checkTibetan , checkSanskrit ;
2002-11-02 03:38:59 +00:00
2003-10-18 18:34:49 +00:00
if ( ! ( definitelyTibetan | | definitelySanskrit ) ) {
checkTibetan = true ;
checkSanskrit = true ;
}
else {
checkTibetan = definitelyTibetan ;
checkSanskrit = definitelySanskrit ;
}
2002-11-02 03:38:59 +00:00
2003-10-18 18:34:49 +00:00
int length = chars . size ( ) ;
2002-11-02 03:38:59 +00:00
2003-10-18 18:34:49 +00:00
List glyphs = new ArrayList ( ) ;
glyphs . clear ( ) ;
if ( areStacksOnRight ) {
for ( int i = 0 ; i < length ; i + + ) {
tibBuffer = new StringBuffer ( ) ;
tibCluster = null ;
sanBuffer = new StringBuffer ( ) ;
sanCluster = null ;
for ( int k = i ; k < length ; k + + ) {
String s = ( String ) chars . get ( k ) ;
if ( checkTibetan )
tibBuffer . append ( s ) ;
if ( checkSanskrit )
sanBuffer . append ( s ) ;
if ( k ! = length - 1 ) {
if ( checkTibetan )
tibBuffer . append ( " - " ) ;
if ( checkSanskrit )
sanBuffer . append ( " + " ) ;
}
}
if ( checkTibetan ) {
tibCluster = tibBuffer . toString ( ) ;
if ( TibetanMachineWeb . hasGlyph ( tibCluster ) ) {
Iterator iter = chars . iterator ( ) ;
for ( int k = 0 ; k < i ; k + + ) //should really check here to make sure glyphs exist FIXME
glyphs . add ( TibetanMachineWeb . getGlyph ( ( String ) iter . next ( ) ) ) ;
glyphs . add ( TibetanMachineWeb . getGlyph ( tibCluster ) ) ;
return glyphs ;
}
}
if ( checkSanskrit ) {
sanCluster = sanBuffer . toString ( ) ;
if ( TibetanMachineWeb . hasGlyph ( sanCluster ) ) {
Iterator iter = chars . iterator ( ) ;
for ( int k = 0 ; k < i ; k + + ) //should really check here to make sure glyphs exist FIXME
glyphs . add ( TibetanMachineWeb . getGlyph ( ( String ) iter . next ( ) ) ) ;
glyphs . add ( TibetanMachineWeb . getGlyph ( sanCluster ) ) ;
return glyphs ;
}
}
}
}
else {
for ( int i = length - 1 ; i > - 1 ; i - - ) {
tibBuffer = new StringBuffer ( ) ;
tibCluster = null ;
sanBuffer = new StringBuffer ( ) ;
sanCluster = null ;
Iterator iter = chars . iterator ( ) ;
for ( int k = 0 ; k < i + 1 ; k + + ) {
String s = ( String ) iter . next ( ) ;
if ( checkTibetan )
tibBuffer . append ( s ) ;
if ( checkSanskrit )
sanBuffer . append ( s ) ;
if ( k ! = i ) {
if ( checkTibetan )
tibBuffer . append ( " - " ) ;
if ( checkSanskrit )
sanBuffer . append ( " + " ) ;
}
}
if ( checkTibetan ) {
tibCluster = tibBuffer . toString ( ) ;
if ( TibetanMachineWeb . hasGlyph ( tibCluster ) ) {
glyphs . add ( TibetanMachineWeb . getGlyph ( tibCluster ) ) ;
for ( int k = i + 1 ; k < length ; k + + )
glyphs . add ( TibetanMachineWeb . getGlyph ( ( String ) iter . next ( ) ) ) ;
return glyphs ;
}
}
if ( checkSanskrit ) {
sanCluster = sanBuffer . toString ( ) ;
if ( TibetanMachineWeb . hasGlyph ( sanCluster ) ) {
glyphs . add ( TibetanMachineWeb . getGlyph ( sanCluster ) ) ;
for ( int k = i + 1 ; k < length ; k + + )
glyphs . add ( TibetanMachineWeb . getGlyph ( ( String ) iter . next ( ) ) ) ;
return glyphs ;
}
}
}
}
return null ;
}
2002-11-02 03:38:59 +00:00
/ * *
2003-04-08 04:56:40 +00:00
* Finds the first meaningful element to occur within a string of
* Extended Wylie . This could be a character , a vowel , punctuation , or
* formatting . For example , passed the string ' tshapo ' , this method
* will return ' tsh ' .
2002-11-02 03:38:59 +00:00
* @param wylie the String of wylie you want to scan
* @return the next meaningful subpart of this string , or null if
* no meaningful subpart can be found ( for example 'x' has no equivalent
* in Extended Wylie )
* /
2003-10-18 18:34:49 +00:00
public static String getNext ( String wylie ) {
boolean hasThereBeenValidity = false ;
boolean isThereValidity = false ;
String s ;
int i ;
int offset = 0 ;
char c = wylie . charAt ( offset ) ;
int k = ( int ) c ;
if ( k < 32 ) //return null if character is just formatting
return String . valueOf ( c ) ;
if ( c = = WYLIE_DISAMBIGUATING_KEY )
return String . valueOf ( WYLIE_DISAMBIGUATING_KEY ) ;
if ( c = = WYLIE_SANSKRIT_STACKING_KEY )
return String . valueOf ( WYLIE_SANSKRIT_STACKING_KEY ) ;
for ( i = offset + 1 ; i < wylie . length ( ) + 1 ; i + + ) {
s = wylie . substring ( offset , i ) ;
if ( ! isThereValidity ) {
if ( TibetanMachineWeb . isWyliePunc ( s ) | | TibetanMachineWeb . isWylieVowel ( s ) | | TibetanMachineWeb . isWylieChar ( s ) ) {
isThereValidity = true ;
hasThereBeenValidity = true ;
}
}
else {
if ( ! TibetanMachineWeb . isWyliePunc ( s ) & & ! TibetanMachineWeb . isWylieVowel ( s ) & & ! TibetanMachineWeb . isWylieChar ( s ) ) {
isThereValidity = false ;
break ;
}
}
}
if ( ! hasThereBeenValidity )
s = null ;
else {
if ( isThereValidity ) //the whole text region is valid
s = wylie . substring ( offset , wylie . length ( ) ) ;
else //the loop was broken out of
s = wylie . substring ( offset , i - 1 ) ;
}
return s ;
}
2002-11-02 03:38:59 +00:00
2003-07-01 03:43:33 +00:00
/** An array containing one boolean value.  Pass this to
    TibetanMachineWeb.getWylieForGlyph(..) if you don't care if a
    certain glyph has corresponding Wylie or not. */
public static final boolean [ ] weDoNotCareIfThereIsCorrespondingWylieOrNot
= new boolean [ ] { false } ;
2003-10-19 20:16:06 +00:00
/ * *
* Converts a string of ACIP into TibetanMachineWeb and inserts that
* into tdoc at offset loc .
* @param acip the ACIP you want to convert
* @param tdoc the document in which to insert the TMW
2003-10-26 00:24:28 +00:00
* @param loc the offset inside the document at which to insert the TMW
* @param withWarnings true if and only if you want warnings to appear
* in the output , such as " this could be a mistranscription of blah... "
2003-10-19 20:16:06 +00:00
* @throws InvalidACIPException if the ACIP is deemed invalid , i . e . if
* it does not conform to the ACIP transcription rules ( those in the
* official document and the subtler rules pieced together by David
* Chandler through study and private correspondence with Robert
2003-10-26 18:25:25 +00:00
* Chilton )
* @return the number of characters inserted into tdoc * /
public static int insertTibetanMachineWebForACIP ( String acip ,
TibetanDocument tdoc ,
int loc ,
boolean withWarnings )
2003-10-19 20:16:06 +00:00
throws InvalidACIPException
{
StringBuffer errors = new StringBuffer ( ) ;
2004-04-25 00:37:57 +00:00
String warningLevel = withWarnings ? " All " : " None " ;
ArrayList al = ACIPTshegBarScanner . scan ( acip , errors , 500 , false ,
warningLevel ) ;
2003-10-19 20:16:06 +00:00
if ( null = = al | | errors . length ( ) > 0 ) {
if ( errors . length ( ) > 0 )
throw new InvalidACIPException ( errors . toString ( ) ) ;
else
throw new InvalidACIPException ( " Fatal error converting ACIP to TMW. " ) ;
}
2003-11-23 01:22:27 +00:00
boolean colors = withWarnings ;
2003-10-19 20:16:06 +00:00
boolean putWarningsInOutput = false ;
if ( " None " ! = warningLevel ) {
putWarningsInOutput = true ;
}
try {
2003-10-26 18:25:25 +00:00
int tloc [ ] = new int [ ] { loc } ;
2003-12-14 07:41:15 +00:00
ACIPConverter . convertToTMW ( al , tdoc , null , null , null ,
2003-10-26 18:25:25 +00:00
putWarningsInOutput , warningLevel ,
2004-04-24 17:49:16 +00:00
false , colors , tloc ) ;
2003-10-26 18:25:25 +00:00
return tloc [ 0 ] - loc ;
2003-10-19 20:16:06 +00:00
} catch ( IOException e ) {
throw new Error ( " Can't happen: " + e ) ;
}
}
2002-11-02 03:38:59 +00:00
/ * *
* Converts a string of Extended Wylie into { @link DuffData DuffData } .
* @param wylie the Wylie you want to convert
* @return an array of TibetanMachineWeb data
* corresponding to the Wylie text
* @throws InvalidWylieException if the Wylie is deemed invalid ,
* i . e . if it does not conform to the Extended Wylie standard
* /
2003-10-19 20:16:06 +00:00
public static DuffData [ ] getTibetanMachineWebForEWTS ( String wylie ) throws InvalidWylieException {
2002-11-02 03:38:59 +00:00
List chars = new ArrayList ( ) ;
2003-10-18 18:34:49 +00:00
DuffCode dc ;
int start = 0 ;
boolean isSanskrit = false ;
boolean wasLastSanskritStackingKey = false ;
LinkedList glyphs = new LinkedList ( ) ;
while ( start < wylie . length ( ) ) {
String next = getNext ( wylie . substring ( start ) ) ;
if ( next = = null ) {
if ( ! chars . isEmpty ( ) ) {
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
chars . clear ( ) ;
isSanskrit = false ;
}
else { //could not convert - throw exception
2003-06-03 23:45:15 +00:00
// FIXME: we're printing to stdout!
2003-10-18 18:34:49 +00:00
if ( start + 5 < = wylie . length ( ) ) {
System . out . println ( " Bad wylie: "
2003-06-03 23:45:15 +00:00
+ wylie . substring ( start ,
start + 5 ) ) ;
} else {
2003-10-18 18:34:49 +00:00
System . out . println ( " Bad wylie: " + wylie . substring ( start ) ) ;
}
throw new InvalidWylieException ( wylie , start ) ;
}
}
else if ( TibetanMachineWeb . isWyliePunc ( next ) ) {
if ( ! chars . isEmpty ( ) )
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
chars . clear ( ) ;
if ( next . equals ( String . valueOf ( BINDU ) ) ) {
if ( glyphs . isEmpty ( ) )
dc = null ;
else
dc = ( DuffCode ) glyphs . removeLast ( ) ; //LinkedList implementation
getBindu ( glyphs , dc ) ;
}
else {
dc = TibetanMachineWeb . getGlyph ( next ) ;
glyphs . add ( dc ) ;
}
isSanskrit = false ;
}
else if ( TibetanMachineWeb . isWylieVowel ( next ) ) {
if ( ! chars . isEmpty ( ) ) {
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
dc = ( DuffCode ) glyphs . removeLast ( ) ; //LinkedList implementation
getVowel ( glyphs , dc , next ) ;
chars . clear ( ) ;
}
else { //if previous is punctuation or null, then achen plus vowel - otherwise, previous could be vowel
int size = glyphs . size ( ) ;
vowel_block : {
if ( size > 1 ) {
dc = ( DuffCode ) glyphs . get ( glyphs . size ( ) - 1 ) ;
if ( ! TibetanMachineWeb . isWyliePunc ( TibetanMachineWeb . getWylieForGlyph ( dc , weDoNotCareIfThereIsCorrespondingWylieOrNot ) ) ) {
DuffCode dc_2 = ( DuffCode ) glyphs . removeLast ( ) ;
DuffCode dc_1 = ( DuffCode ) glyphs . removeLast ( ) ;
getVowel ( glyphs , dc_1 , dc_2 , next ) ;
break vowel_block ;
}
}
DuffCode [ ] dc_array = ( DuffCode [ ] ) TibetanMachineWeb . getTibHash ( ) . get ( ACHEN ) ;
dc = dc_array [ TibetanMachineWeb . TMW ] ;
getVowel ( glyphs , dc , next ) ;
2003-06-03 23:45:15 +00:00
}
2003-10-18 18:34:49 +00:00
chars . clear ( ) ;
}
isSanskrit = false ;
}
else if ( TibetanMachineWeb . isWylieChar ( next ) ) {
if ( ! isSanskrit ) //add char to list - it is not sanskrit
chars . add ( next ) ;
else if ( wasLastSanskritStackingKey ) { //add char to list - it is still part of sanskrit stack
chars . add ( next ) ;
wasLastSanskritStackingKey = false ;
}
else { //char is no longer part of sanskrit stack, therefore compute and add previous stack
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
chars . clear ( ) ;
chars . add ( next ) ;
isSanskrit = false ;
wasLastSanskritStackingKey = false ;
}
}
else if ( next . equals ( String . valueOf ( WYLIE_DISAMBIGUATING_KEY ) ) ) {
if ( ! chars . isEmpty ( ) )
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
chars . clear ( ) ;
isSanskrit = false ;
}
else if ( next . equals ( String . valueOf ( WYLIE_SANSKRIT_STACKING_KEY ) ) ) {
if ( ! isSanskrit ) { //begin sanskrit stack
switch ( chars . size ( ) ) {
case 0 :
break ; //'+' is not "pre-stacking" key
case 1 :
isSanskrit = true ;
wasLastSanskritStackingKey = true ;
break ;
default :
String top_char = ( String ) chars . get ( chars . size ( ) - 1 ) ;
chars . remove ( chars . size ( ) - 1 ) ;
2003-09-05 05:08:47 +00:00
// DLC PERFORMANCE FIXME: make glyphs a parameter
2003-10-18 18:34:49 +00:00
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
chars . clear ( ) ;
chars . add ( top_char ) ;
isSanskrit = true ;
wasLastSanskritStackingKey = true ;
break ;
}
}
}
else if ( TibetanMachineWeb . isFormatting ( next . charAt ( 0 ) ) ) {
if ( ! chars . isEmpty ( ) )
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
dc = new DuffCode ( 1 , next . charAt ( 0 ) ) ;
glyphs . add ( dc ) ;
chars . clear ( ) ;
isSanskrit = false ;
}
if ( next ! = null )
start + = next . length ( ) ;
}
if ( ! chars . isEmpty ( ) ) {
glyphs . addAll ( getGlyphs ( chars , true , ! isSanskrit , isSanskrit ) ) ;
chars . clear ( ) ;
}
DuffData [ ] dd = convertGlyphs ( glyphs ) ;
return dd ;
}
2002-11-02 03:38:59 +00:00
/ * *
2003-09-12 05:06:37 +00:00
* Gets the bindu sequence for a given context . In the
* TibetanMachineWeb fonts , bindu ( anusvara ) is realized differently
* depending on which vowel it attaches to . Although the default bindu
* glyph is affixed to consonants and subscript vowels , for superscript
* vowels ( i , e , o , etc ) , there is a single glyph which merges the
* bindu and that vowel together . When you pass this method a glyph
* context and a list , it will append to that list glyphs which will either consist
* of the original glyph followed by the default bindu glyph , or a
* composite vowel + bindu glyph . Note that there is only one glyph in
* the context . This means that bindus will not affix properly if
* superscript vowels are allowed to directly precede subscript vowels
* ( e . g . pou ) .
* @param list a List of DuffCode glyphs to which will be appended the
* original dc ( if non - null ) as well as a bindu , or the one glyph that
* represents both
* @param dc the DuffCode of the glyph you want to attach a bindu to ,
* or null * /
public static void getBindu ( List list , DuffCode dc ) {
if ( null = = dc ) {
list . add ( TibetanMachineWeb . getGlyph ( String . valueOf ( BINDU ) ) ) ;
} else {
if ( ! TibetanMachineWeb . getBinduMap ( ) . containsKey ( dc ) ) {
list . add ( dc ) ;
list . add ( TibetanMachineWeb . getGlyph ( String . valueOf ( BINDU ) ) ) ;
} else {
list . add ( ( DuffCode ) TibetanMachineWeb . getBinduMap ( ) . get ( dc ) ) ;
}
}
}
2002-11-02 03:38:59 +00:00
/ * *
2003-09-12 05:06:37 +00:00
* Gets the vowel sequence for a given vowel in a given context . Given
* a context , this method affixes a vowel and returns the context ( iff
* context_added [ 0 ] is false ) plus the vowel . Generally , it is enough
* to provide just one glyph for context .
2002-11-02 03:38:59 +00:00
* @param context the glyph preceding the vowel you want to affix
* @param vowel the vowel you want to affix , in Wylie
2003-09-12 05:06:37 +00:00
* @param context_added an array of one boolean , an input / output
* parameter that , if true , means that only the vowel will be added to
* l , not the context , and if false , means that the context and the
* vowel will be added and that context_added [ 0 ] will be updated to be
* true
2002-11-02 03:38:59 +00:00
* @return a List of glyphs equal to the vowel in context
2003-09-12 05:06:37 +00:00
* @throws IllegalArgumentException if the given combination is not
* supported * /
public static void getVowel ( List l , DuffCode context , String vowel , boolean context_added [ ] ) {
getVowel ( l , null , context , vowel , context_added ) ;
}
/** Wrapper that calls for adding context to l. */
2003-09-05 05:08:47 +00:00
public static void getVowel ( List l , DuffCode context , String vowel ) {
2003-09-12 05:06:37 +00:00
getVowel ( l , null , context , vowel , new boolean [ ] { false } ) ;
}
/** Wrapper that calls for adding context to l. */
public static void getVowel ( List l , DuffCode context_1 , DuffCode context_2 , String vowel ) {
getVowel ( l , context_1 , context_2 , vowel , new boolean [ ] { false } ) ;
2003-09-05 05:08:47 +00:00
}
2002-11-02 03:38:59 +00:00
/ * *
2003-09-05 05:08:47 +00:00
* Gets the vowel sequence for a given vowel in a given context and
* appends it to l . Given a context , this method affixes a vowel and
2003-09-12 05:06:37 +00:00
* appends the context ( iff context_added [ 0 ] is false ) plus the vowel
* to l . Since the choice of vowel glyph depends on the consonant to
* which it is attached , generally it is enough to provide just the
* immediately preceding context . However , in some cases , double vowels
* are allowed - for example ' buo ' . To find the correct glyph for 'o' ,
* we need 'b' in this case , not 'u' . Note also that some Extended
* Wylie vowels correspond to multiple glyphs in TibetanMachineWeb . For
* example , the vowel I consists of both an achung and a reverse
* gigu . All required glyphs are appended to l .
2003-09-05 05:08:47 +00:00
* @param context_1 the glyph occurring two glyphs before the vowel you
* want to affix
* @param context_2 the glyph immediately before the vowel you want to
* affix
2003-09-12 05:06:37 +00:00
* @param vowel the vowel you want to affix , in Wylie
* @param context_added an array of one boolean , an input / output
* parameter that , if true , means that only the vowel will be added to
* l , not the context , and if false , means that the context and the
* vowel will be added and that context_added [ 0 ] will be updated to be
* true
* @throws IllegalArgumentException if the given combination is not
* supported * /
public static void getVowel ( List l , DuffCode context_1 , DuffCode context_2 ,
String vowel , boolean context_added [ ] )
throws IllegalArgumentException
{
//this vowel doesn't correspond to a glyph -
//so you just return the original context
if ( vowel . equals ( WYLIE_aVOWEL )
| | TibetanMachineWeb . isTopVowel ( context_2 ) ) {
if ( TibetanMachineWeb . isTopVowel ( context_2 ) )
2003-10-16 04:13:14 +00:00
throw new IllegalArgumentException ( " dropping vowels is bad1 " ) ;
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( context_1 ! = null )
l . add ( context_1 ) ;
l . add ( context_2 ) ;
}
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
//first, the three easiest cases: ai, au, and <i
//these vowels have one invariant form - therefore,
//dc_context is just returned along with that form
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( ai_VOWEL ) ) {
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( context_1 ! = null )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
l . add ( context_2 ) ;
}
DuffCode [ ] dc_v = ( DuffCode [ ] ) TibetanMachineWeb . getTibHash ( ) . get ( ai_VOWEL ) ;
l . add ( dc_v [ TibetanMachineWeb . TMW ] ) ;
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( au_VOWEL ) ) {
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( context_1 ! = null )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
l . add ( context_2 ) ;
}
DuffCode [ ] dc_v = ( DuffCode [ ] ) TibetanMachineWeb . getTibHash ( ) . get ( au_VOWEL ) ;
l . add ( dc_v [ TibetanMachineWeb . TMW ] ) ;
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( reverse_i_VOWEL ) ) {
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( context_1 ! = null )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
l . add ( context_2 ) ;
}
if ( ! TibetanMachineWeb . isTopVowel ( context_2 ) ) {
DuffCode [ ] dc_v = ( DuffCode [ ] ) TibetanMachineWeb . getTibHash ( ) . get ( reverse_i_VOWEL ) ;
l . add ( dc_v [ TibetanMachineWeb . TMW ] ) ;
2003-10-16 04:13:14 +00:00
} else throw new IllegalArgumentException ( " dropping vowels is bad2 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
//second, the vowels i, e, and o
//these vowels have many different glyphs each,
//whose correct selection depends on the
//preceding context. therefore, dc_context is
//returned along with the vowel appropriate to
//that context
if ( vowel . equals ( i_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_i ) ;
if ( null = = dc_v & & null ! = context_1 ) {
hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_1 ) ;
dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_i ) ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( context_1 ! = null )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
l . add ( context_2 ) ;
}
if ( null ! = dc_v )
l . add ( dc_v ) ;
2003-10-16 04:13:14 +00:00
else throw new IllegalArgumentException ( " dropping vowels is bad3 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
// DLC perfect TMW->Wylie wouldn't produce o'i for an input file containing merely TMW9.61 -- it would produce \u0f7c,\u0f60,\u0f72 -- round-trip shows why.
if ( vowel . equals ( e_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_e ) ;
if ( null = = dc_v & & null ! = context_1 ) {
hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_1 ) ;
dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_e ) ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( context_1 ! = null )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
l . add ( context_2 ) ;
}
if ( null ! = dc_v )
l . add ( dc_v ) ;
2003-10-16 04:13:14 +00:00
else throw new IllegalArgumentException ( " dropping vowels is bad4 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( o_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_o ) ;
if ( null = = dc_v & & null ! = context_1 ) {
hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_1 ) ;
dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_o ) ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( context_1 ! = null )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
l . add ( context_2 ) ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( null ! = dc_v )
l . add ( dc_v ) ;
2003-10-16 04:13:14 +00:00
else throw new IllegalArgumentException ( " dropping vowels is bad5 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
//next come the vowels u, A, and U
//these three vowels are grouped together because they all
//can cause the preceding context to change. in particular,
//both u and A cannot be affixed to ordinary k or g, but
//rather the shortened versions of k and g - therefore,
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( u_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode halfHeight = TibetanMachineWeb . getHalfHeightGlyph ( hashKey_context ) ;
DuffCode dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_u ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( null ! = context_1 )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( null = = halfHeight )
l . add ( context_2 ) ;
else
l . add ( halfHeight ) ;
}
if ( null ! = dc_v )
l . add ( dc_v ) ;
2003-10-16 04:13:14 +00:00
else throw new IllegalArgumentException ( " dropping vowels is bad6 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( A_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode halfHeight = TibetanMachineWeb . getHalfHeightGlyph ( hashKey_context ) ;
DuffCode dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_A ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( null ! = context_1 )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( null = = halfHeight )
l . add ( context_2 ) ;
else
l . add ( halfHeight ) ;
}
if ( null ! = dc_v )
l . add ( dc_v ) ;
2003-10-16 04:13:14 +00:00
else throw new IllegalArgumentException ( " dropping vowels is bad7 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( U_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode halfHeight = TibetanMachineWeb . getHalfHeightGlyph ( hashKey_context ) ;
DuffCode dc_v = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_U ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( null ! = context_1 )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( null = = halfHeight )
l . add ( context_2 ) ;
else
l . add ( halfHeight ) ;
}
if ( null ! = dc_v & & ! TibetanMachineWeb . isTopVowel ( context_2 ) )
l . add ( dc_v ) ;
2003-10-16 04:13:14 +00:00
else throw new IllegalArgumentException ( " dropping vowels is bad8 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
//finally, the vowels I and <I
//these vowels are unique in that they both
//require a change from the previous character,
//and consist of two glyphs themselves
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( I_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode halfHeight = TibetanMachineWeb . getHalfHeightGlyph ( hashKey_context ) ;
DuffCode dc_v_sub = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_A ) ;
DuffCode dc_v_sup = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_i ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( null ! = context_1 )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( null = = halfHeight )
l . add ( context_2 ) ;
else
l . add ( halfHeight ) ;
}
if ( null ! = dc_v_sub & & null ! = dc_v_sup ) {
l . add ( dc_v_sub ) ;
l . add ( dc_v_sup ) ;
2003-10-16 04:13:14 +00:00
} else throw new IllegalArgumentException ( " dropping vowels is bad9 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( vowel . equals ( reverse_I_VOWEL ) ) {
String hashKey_context = TibetanMachineWeb . getHashKeyForGlyph ( context_2 ) ;
DuffCode halfHeight = TibetanMachineWeb . getHalfHeightGlyph ( hashKey_context ) ;
DuffCode dc_v_sub = TibetanMachineWeb . getVowel ( hashKey_context , TibetanMachineWeb . VOWEL_A ) ;
DuffCode [ ] tv_array = ( DuffCode [ ] ) TibetanMachineWeb . getTibHash ( ) . get ( reverse_i_VOWEL ) ;
DuffCode dc_v_sup = tv_array [ TibetanMachineWeb . TMW ] ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( ! context_added [ 0 ] ) {
context_added [ 0 ] = true ;
if ( null ! = context_1 )
l . add ( context_1 ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
if ( null = = halfHeight )
l . add ( context_2 ) ;
else
l . add ( halfHeight ) ;
}
if ( null ! = dc_v_sub & & null ! = dc_v_sup ) {
l . add ( dc_v_sub ) ;
l . add ( dc_v_sup ) ;
2003-10-16 04:13:14 +00:00
} else throw new IllegalArgumentException ( " dropping vowels is bad10 " ) ;
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
return ;
}
2002-11-02 03:38:59 +00:00
2003-09-12 05:06:37 +00:00
throw new IllegalArgumentException ( " bad vowel " + vowel ) ;
}
2002-11-02 03:38:59 +00:00
2003-03-30 02:31:16 +00:00
/ * *
* True if you want TibetanMachineWeb - to - Extended - Wylie conversion
* to produce Wylie that , if typed , will produce the same sequence
* of TibetanMachineWeb glyphs . Without it , converting the glyphs
* you get from typing jskad , skaska , skaskaska , skaskaskaska ,
* etc . will not give you Wylie , that , if typed in again , will
* produce the original glyphs . Hence , if this is true , then you
* get working , end - to - end Wylie for syntactically illegal
* sequences of glyphs . * /
// Hard-wired to true, so the round-trip behavior described above is always
// the one requested.
// NOTE(review): no read of this flag is visible in this part of the file --
// confirm it is still consulted elsewhere before relying on it.
private static final boolean makeIllegalTibetanGoEndToEnd = true ;
2003-09-02 06:39:33 +00:00
/ * * Returns " a " / " A " , unless wylie ( which really is EWTS , not ACIP )
is already " a " . * /
// Chooses the explicit vowel text that callers in this file append after a
// cluster's transliteration.
// @param EWTSNotACIP true selects the EWTS result (WYLIE_aVOWEL), false the
//        hard-coded ACIP literal
// @param wylie the EWTS of the preceding cluster; must be non-null because
//        equals() is invoked on it
// @return the vowel text to append
private static String aVowelToUseAfter ( boolean EWTSNotACIP , String wylie ) {
2004-04-14 05:44:51 +00:00
// Only the EWTS path special-cases achen; in ACIP mode achen falls through
// to the else branch like every other input.
if ( wylie . equals ( ACHEN ) & & EWTSNotACIP ) {
/ * it ' s EWTS { a } , not EWTS { aa } , for achen alone . But it ' s
ACIP { AA } . * /
// NOTE(review): per the comment above this is presumably the empty string;
// the padding inside the literal looks like an extraction artifact -- confirm.
return " " ;
} else
return ( ( EWTSNotACIP )
? WYLIE_aVOWEL : " A " /* hard-coded ACIP constant */ ) ;
2003-03-30 02:31:16 +00:00
}
2003-09-02 06:39:33 +00:00
// Joins the transliteration of two consecutive clusters that follow the
// vowel-bearing cluster, inserting a disambiguator between them when needed.
// @param EWTSNotACIP true to build the result from wylie1/wylie2, false to
//        build it from acip1/acip2
// @param wylie1 EWTS of the first cluster; always used for the ambiguity test
// @param wylie2 EWTS of the second cluster; must be non-null because
//        equals() is invoked on it
// @param acip1 ACIP of the first cluster; only read when EWTSNotACIP is false
//        (the callers visible in this file pass null otherwise)
// @param acip2 ACIP of the second cluster; same remark as acip1
// @return first + disambiguator + second in the selected scheme
private static String unambiguousPostAVowelTranslit ( boolean EWTSNotACIP ,
String wylie1 ,
String wylie2 ,
String acip1 ,
String acip2 ) {
2003-03-30 02:31:16 +00:00
// NOTE(review): presumably the empty string (no disambiguator by default);
// the padding inside the literal looks like an extraction artifact -- confirm.
String disambiguator = " " ;
// type "lard" vs. "lar.d", and you'll see the need for this
// disambiguation of suffix and postsuffix. sa doesn't take
// any head letters, so only da needs to be considered.
// The ambiguity test is made on the EWTS arguments even in ACIP mode; only
// the disambiguator text itself depends on the scheme.
if ( TibetanMachineWeb . isWylieTop ( wylie1 )
& & wylie2 . equals ( /* FIXME: hard-coded */ " d " ) )
2003-09-02 06:39:33 +00:00
disambiguator = ( EWTSNotACIP ) ? WYLIE_DISAMBIGUATING_KEY_STRING : " - " ;
if ( EWTSNotACIP )
return wylie1 + disambiguator + wylie2 ;
else
return acip1 + disambiguator + acip2 ;
2003-03-30 02:31:16 +00:00
}
2002-11-02 03:38:59 +00:00
/ * *
2003-09-02 06:39:33 +00:00
* Gets the Extended Wylie for the given sequence of glyphs if
* EWTSNotACIP is true , or the ACIP otherwise .
* @param EWTSNotACIP true if you want THDL Extended Wylie , false if
* you want ACIP
2004-04-14 05:44:51 +00:00
* @param dcs an array of TMW glyphs
2003-09-02 06:39:33 +00:00
* @param noSuch an array which will not be touched if this is
* successful ; however , if there is no THDL Extended Wylie / ACIP
* corresponding to these glyphs , then noSuch [ 0 ] will be set to true
* @return the Extended Wylie / ACIP corresponding to these glyphs , or
* null * /
// Thin public wrapper: all conversion work is delegated to
// getTranslitImplementation, and its result is returned unchanged.
// The only thing added here is optional debug reporting of warnings.
public static String getTranslit ( boolean EWTSNotACIP ,
DuffCode [ ] dcs ,
boolean noSuch [ ] ) {
2003-07-08 14:30:17 +00:00
// A warnings buffer is allocated only when the debug flag (declared outside
// this part of the file) is set; otherwise null is passed down.
StringBuffer warnings = ( debug ? new StringBuffer ( ) : null ) ;
2003-09-02 06:39:33 +00:00
String ans
= getTranslitImplementation ( EWTSNotACIP , dcs , noSuch , warnings ) ;
2003-07-08 14:30:17 +00:00
// The debug test comes first so warnings is never dereferenced on the
// path where it was left null above.
if ( debug & & warnings . length ( ) > 0 )
System . out . println ( " DEBUG: warnings in TMW->Wylie: " + warnings ) ;
return ans ;
}
2003-03-30 02:31:16 +00:00
2003-07-08 14:30:17 +00:00
/ * * True for and only for ma and nga because ' am and ' ang are
appendages . * /
private static final boolean isAppendageNonVowelWylie ( String wylie ) {
2003-10-18 03:04:47 +00:00
return ( MA . equals ( wylie ) /* 'AM */
| | NGA . equals ( wylie ) /* 'ANG, 'UNG */
| | SA . equals ( wylie ) /* 'OS, 'US, maybe 'IS */
| | RA . equals ( wylie ) /* 'UR */
) ;
2003-07-08 14:30:17 +00:00
}
2003-03-30 02:31:16 +00:00
2003-08-31 20:38:28 +00:00
// DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
// David Chapman and I both need a comprehensive list of these
2003-09-12 05:06:37 +00:00
// guys. Get it from Unicode 4.0 spec?
2004-04-14 05:44:51 +00:00
/ * * Scans the TMW glyphs in glyphList and creates the returned
list of grapheme clusters based on them . A grapheme cluster
is a consonant or consonant stack with optional adornment or a
2003-07-08 14:30:17 +00:00
number ( possibly super - or subscribed ) or some other glyph
alone . * /
2003-08-10 19:02:56 +00:00
private static TGCList breakTshegBarIntoGraphemeClusters ( java . util . List glyphList ,
boolean noSuchWylie [ ] ) {
2003-07-08 14:30:17 +00:00
// Definition: adornment means vowels and achungs and bindus.
2003-08-31 20:38:28 +00:00
// It should be this, though (FIXME): any combining
// characters.
2003-08-10 19:02:56 +00:00
2003-07-08 14:30:17 +00:00
int sz = glyphList . size ( ) ;
ThdlDebug . verify ( sz > 0 ) ;
// A list of grapheme clusters (see UnicodeGraphemeCluster).
// sz is an overestimate (speeds us up, wastes some memory).
2003-08-10 19:02:56 +00:00
TMWGCList gcs = new TMWGCList ( sz ) ;
2003-07-08 14:30:17 +00:00
2003-08-23 22:03:37 +00:00
StringBuffer buildingUpVowel = new StringBuffer ( ) ; // for {cui}, we append to this guy twice.
String nonVowelWylie = null ; // for the "c" in {cui}
int pairType = TGCPair . TYPE_OTHER ;
2003-07-08 14:30:17 +00:00
for ( int i = 0 ; i < sz ; i + + ) {
DuffCode dc = ( DuffCode ) glyphList . get ( i ) ;
String wylie = TibetanMachineWeb . getWylieForGlyph ( dc , noSuchWylie ) ;
boolean buildingUpSanskritNext = false ;
if ( ( buildingUpSanskritNext
2004-04-14 05:44:51 +00:00
= ( TibetanMachineWeb . isWylieSanskritConsonantStack ( wylie )
| |
/ * U + 0FAD , which should become ACIP " V " , not " W " ,
though the EWTS is " w " just as it is for
TMW ( fontNum = = 1 ) . 53 : * /
( 8 = = dc . getFontNum ( ) & & 69 = = dc . getCharNum ( ) ) ) )
2003-07-08 14:30:17 +00:00
| | TibetanMachineWeb . isWylieTibetanConsonantOrConsonantStack ( wylie ) ) {
2003-08-23 22:03:37 +00:00
if ( buildingUpVowel . length ( ) > 0 | | null ! = nonVowelWylie ) {
gcs . add ( new TGCPair ( nonVowelWylie ,
buildingUpVowel . toString ( ) ,
pairType ) ) ;
buildingUpVowel . delete ( 0 , buildingUpVowel . length ( ) ) ;
2003-03-30 02:31:16 +00:00
}
2003-08-23 22:03:37 +00:00
// We want {p-y}, not {py}.
nonVowelWylie
= TibetanMachineWeb . getHashKeyForGlyph ( dc . getFontNum ( ) , dc . getCharNum ( ) ) ;
pairType = ( buildingUpSanskritNext
? TGCPair . TYPE_SANSKRIT
: TGCPair . TYPE_TIBETAN ) ;
} else if ( TibetanMachineWeb . isWylieAdornmentAndContainsVowel ( wylie )
2003-07-08 14:30:17 +00:00
| | TibetanMachineWeb . isWylieAdornment ( wylie ) ) {
2003-08-23 22:03:37 +00:00
buildingUpVowel . append ( wylie ) ;
2003-09-12 05:06:37 +00:00
// DLC FIXME: I bet three or four vowels together
// breaks TMW->ACIP and TMW->EWTS. Test it. When it
// does, revamp TGCPair to have a set of vowels. The
// output order should be consistent with the
// Unicode-imposed order on vowels.
2003-07-08 14:30:17 +00:00
} else {
// number or weird thing:
2003-08-23 22:03:37 +00:00
if ( buildingUpVowel . length ( ) > 0 | | null ! = nonVowelWylie ) {
gcs . add ( new TGCPair ( nonVowelWylie ,
buildingUpVowel . toString ( ) ,
pairType ) ) ;
buildingUpVowel . delete ( 0 , buildingUpVowel . length ( ) ) ;
nonVowelWylie = null ;
2003-03-30 02:31:16 +00:00
}
2003-08-23 22:03:37 +00:00
gcs . add ( new TGCPair ( wylie , null , TGCPair . TYPE_OTHER ) ) ;
pairType = TGCPair . TYPE_OTHER ;
2003-03-30 02:31:16 +00:00
}
2003-07-08 14:30:17 +00:00
}
2003-08-23 22:03:37 +00:00
if ( buildingUpVowel . length ( ) > 0 | | null ! = nonVowelWylie ) {
gcs . add ( new TGCPair ( nonVowelWylie ,
buildingUpVowel . toString ( ) ,
pairType ) ) ;
2003-07-08 14:30:17 +00:00
}
return gcs ;
}
2003-03-30 02:31:16 +00:00
2003-08-23 22:03:37 +00:00
/ * * Returns a string that classifies gcs as a legal Tibetan tsheg
* bar , a single Sanskrit grapheme cluster
* ( " single-sanskrit-gc " ) , or invalid ( " invalid " ) . If
* noPrefixTests is true , then ggyi will be seen as a
* " prefix-root " , even though gya doesn ' t take a ga prefix . * /
// Implementation overview: a state machine that walks the grapheme clusters
// left to right.  The current state is the string candidateType, which is
// refined as each further cluster is seen.
//
// candidateType is compared with == and != throughout.  That is reference
// comparison; it works only because every value assigned to it is either a
// string literal or is explicitly passed through intern().
//
// @param gcs the grapheme clusters of one tsheg bar
// @param warnings receives a human-readable diagnosis for most of the
//        paths that end in "invalid"; may be null, in which case nothing is
//        recorded (every append below is guarded by a null test)
// @param noPrefixTests if true, isLegalPrefixRootCombo is never consulted
// @return the classification name; "invalid" when gcs is empty or matches no
//         legal pattern
// Throws Error when an internal inconsistency is detected (an unknown
// cluster classification or a state missing from the chain below).
2003-08-10 19:02:56 +00:00
public static String getClassificationOfTshegBar ( TGCList gcs ,
// DLC the warnings are Wylie-specific
2003-08-23 22:03:37 +00:00
StringBuffer warnings ,
boolean noPrefixTests ) {
2003-07-08 14:30:17 +00:00
String candidateType = null ;
// Now that we have grapheme clusters, see if they match any
// of the "legal tsheg bars":
int sz = gcs . size ( ) ;
2003-08-10 19:02:56 +00:00
// A lone Sanskrit cluster is reported immediately; inside the loop below
// any Sanskrit cluster makes the tsheg bar invalid.
if ( sz = = 1 ) {
TGCPair tp = gcs . get ( 0 ) ;
int cls = tp . classification ;
if ( TGCPair . SANSKRIT_WITHOUT_VOWEL = = cls
| | TGCPair . SANSKRIT_WITH_VOWEL = = cls )
return " single-sanskrit-gc " ;
}
2003-08-23 22:03:37 +00:00
// The cluster handled by the previous iteration.  It is assigned at the
// bottom of the loop body, so it is non-null whenever candidateType is
// non-null at the top of an iteration.
TGCPair lastPair = null ;
2003-07-08 14:30:17 +00:00
for ( int i = 0 ; i < sz ; i + + ) {
2003-08-10 19:02:56 +00:00
TGCPair tp = gcs . get ( i ) ;
2003-07-08 14:30:17 +00:00
int cls = tp . classification ;
2003-08-23 22:03:37 +00:00
String wylie = tp . getWylie ( ) ;
2003-07-08 14:30:17 +00:00
// OTHER clusters are acceptable only as numbers, and only if everything
// seen so far was a number too.
if ( TGCPair . OTHER = = cls ) {
if ( TibetanMachineWeb . isWylieNumber ( wylie ) ) {
if ( null = = candidateType ) {
candidateType = " number " ;
} else {
if ( " number " ! = candidateType ) {
if ( null ! = warnings )
warnings . append ( " Found something odd; the wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
2003-03-30 02:31:16 +00:00
}
2003-07-08 14:30:17 +00:00
}
} else {
if ( null ! = warnings )
warnings . append ( " Found something odd; the wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( TGCPair . SANSKRIT_WITHOUT_VOWEL = = cls
| | TGCPair . SANSKRIT_WITH_VOWEL = = cls ) {
// No warning is recorded on this path.
candidateType = " invalid " ;
2003-07-13 17:38:00 +00:00
break ;
2003-07-08 14:30:17 +00:00
} else if ( TGCPair . CONSONANTAL_WITHOUT_VOWEL = = cls
| | TGCPair . CONSONANTAL_WITH_VOWEL = = cls ) {
// First consonantal cluster: it is either something that could be a
// prefix or the root ("prefix/root"), or definitely the root.
if ( null = = candidateType ) {
if ( TibetanMachineWeb . isWylieLeft ( wylie ) ) {
candidateType = " prefix/root " ;
2003-03-30 02:31:16 +00:00
} else {
2003-07-08 14:30:17 +00:00
candidateType = " root " ;
2003-03-30 02:31:16 +00:00
}
} else {
2003-07-08 14:30:17 +00:00
// Later clusters: dispatch on the state reached so far.
if ( " prefix/root " = = candidateType ) {
if ( ACHUNG . equals ( wylie ) ) {
// peek ahead to distinguish between ba's,
// ba'ala and ba'am:
2003-08-10 19:02:56 +00:00
TGCPair nexttp = ( i + 1 < sz ) ? gcs . get ( i + 1 ) : null ;
2003-08-23 22:03:37 +00:00
String nextwylie = ( nexttp = = null ) ? " " : nexttp . getWylie ( ) ;
2003-07-08 14:30:17 +00:00
if ( isAppendageNonVowelWylie ( nextwylie ) ) {
candidateType = " maybe-appendaged-prefix/root " ;
} else {
2003-08-23 22:03:37 +00:00
if ( noPrefixTests
| | isLegalPrefixRootCombo ( lastPair . getConsonantWylie ( ) ,
tp . getConsonantWylie ( ) ) )
candidateType = " prefix/root-root/suffix " ;
else
candidateType = " root-suffix " ;
2003-07-08 14:30:17 +00:00
}
} else if ( TibetanMachineWeb . isWylieRight ( wylie ) ) {
2003-08-23 22:03:37 +00:00
if ( noPrefixTests
| | isLegalPrefixRootCombo ( lastPair . getConsonantWylie ( ) ,
tp . getConsonantWylie ( ) ) )
candidateType = " prefix/root-root/suffix " ;
else
candidateType = " root-suffix " ;
2003-07-08 14:30:17 +00:00
} else if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
candidateType = " appendaged-prefix/root " ;
} else {
2003-08-23 22:03:37 +00:00
if ( noPrefixTests
| | isLegalPrefixRootCombo ( lastPair . getConsonantWylie ( ) ,
tp . getConsonantWylie ( ) ) )
candidateType = " prefix-root " ;
else {
// NOTE(review): unlike most warnings in this method, this message is not
// terminated with a newline -- confirm whether that is intended.
if ( null ! = warnings )
warnings . append ( " Found what would be a prefix-root combo, but the root stack with wylie " + wylie + " does not take the prefix with wylie " + lastPair . getConsonantWylie ( ) ) ;
candidateType = " invalid " ;
break ;
}
2003-07-08 14:30:17 +00:00
}
} else if ( " root " = = candidateType ) {
if ( ACHUNG . equals ( wylie ) ) {
// peek ahead to distinguish between pa's,
// pa'ala and pa'am:
2003-08-10 19:02:56 +00:00
TGCPair nexttp = ( i + 1 < sz ) ? gcs . get ( i + 1 ) : null ;
2003-08-23 22:03:37 +00:00
String nextwylie = ( nexttp = = null ) ? " " : nexttp . getWylie ( ) ;
2003-07-08 14:30:17 +00:00
if ( isAppendageNonVowelWylie ( nextwylie ) ) {
candidateType = " maybe-appendaged-root " ;
} else {
candidateType = " root-suffix " ;
}
} else if ( TibetanMachineWeb . isWylieRight ( wylie ) ) {
candidateType = " root-suffix " ;
} else if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
candidateType = " appendaged-root " ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a non-prefix consonant or consonant stack followed by a consonant or consonant stack that is not simply a suffix; that thing's wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( " prefix-root " = = candidateType ) {
if ( ACHUNG . equals ( wylie ) ) {
// peek ahead to distinguish between bpa's,
// bpa'ala and bpa'am:
2003-08-10 19:02:56 +00:00
TGCPair nexttp = ( i + 1 < sz ) ? gcs . get ( i + 1 ) : null ;
2003-08-23 22:03:37 +00:00
String nextwylie = ( nexttp = = null ) ? " " : nexttp . getWylie ( ) ;
2003-07-08 14:30:17 +00:00
if ( isAppendageNonVowelWylie ( nextwylie ) ) {
candidateType = " maybe-appendaged-prefix-root " ;
} else {
candidateType = " prefix-root-suffix " ;
}
} else if ( TibetanMachineWeb . isWylieRight ( wylie ) ) {
candidateType = " prefix-root-suffix " ;
} else if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
candidateType = " appendaged-prefix-root " ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a prefix plus a root stack plus a non-suffix consonant or consonant stack whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( " prefix/root-root/suffix " = = candidateType ) {
// this has no peekahead, gag'am works.
if ( ACHUNG . equals ( wylie ) ) {
// peek ahead to distinguish between
// gga'am and gaga'ala:
2003-08-10 19:02:56 +00:00
TGCPair nexttp = ( i + 1 < sz ) ? gcs . get ( i + 1 ) : null ;
2003-08-23 22:03:37 +00:00
String nextwylie = ( nexttp = = null ) ? " " : nexttp . getWylie ( ) ;
2003-07-08 14:30:17 +00:00
if ( isAppendageNonVowelWylie ( nextwylie ) ) {
candidateType = " maybe-appendaged-prefix/root-root/suffix " ;
} else {
candidateType = " prefix-root-suffix " ;
}
} else if ( TibetanMachineWeb . isWylieFarRight ( wylie ) ) {
candidateType = " prefix/root-root/suffix-suffix/postsuffix " ;
} else if ( TibetanMachineWeb . isWylieRight ( wylie ) ) {
candidateType = " prefix-root-suffix " ;
} else if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
candidateType = " appendaged-prefix/root-root/suffix " ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a prefix/root stack plus a suffix/root stack plus a non-suffix, non-postsuffix consonant or consonant stack whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( " root-suffix " = = candidateType ) {
// This has no peekahead w.r.t. 'am and 'ang,
// but it needs none because we peeked to be
// sure that this was root-suffix and not
// maybe-appendaged-root.
if ( TibetanMachineWeb . isWylieFarRight ( wylie ) ) {
candidateType = " root-suffix-postsuffix " ;
} else if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
candidateType = " appendaged-root-suffix " ;
} else if ( ACHUNG . equals ( wylie ) ) {
candidateType = " maybe-appendaged-root-suffix " ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a root stack plus a suffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( " prefix/root-root/suffix-suffix/postsuffix " = = candidateType
| | " prefix-root-suffix " = = candidateType ) {
// this has no peekahead and needs none.
if ( TibetanMachineWeb . isWylieFarRight ( wylie ) ) {
candidateType = " prefix-root-suffix-postsuffix " ;
} else if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
// if we simply prepended to
// candidateType, we wouldn't get interned
// strings.
candidateType = ( " appendaged- " + candidateType ) . intern ( ) ;
} else if ( ACHUNG . equals ( wylie ) ) {
candidateType = ( " maybe-appendaged- " + candidateType ) . intern ( ) ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a prefix/root stack plus a suffix/root stack plus a suffix/postsuffix plus a non-postsuffix consonant or consonant stack whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( " prefix-root-suffix-postsuffix " = = candidateType ) {
// this has no peekahead and needs none.
if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
candidateType = " appendaged-prefix-root-suffix-postsuffix " ;
} else if ( ACHUNG . equals ( wylie ) ) {
candidateType = " maybe-appendaged-prefix-root-suffix-postsuffix " ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a prefix plus root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( " root-suffix-postsuffix " = = candidateType ) {
// this has no peekahead and needs none.
if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie ) ) {
candidateType = " appendaged-root-suffix-postsuffix " ;
} else if ( ACHUNG . equals ( wylie ) ) {
candidateType = " maybe-appendaged-root-suffix-postsuffix " ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a root stack plus suffix plus postsuffix; then found yet another consonant or consonant stack whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
// A "maybe-appendaged-" state means a bare achung was just seen.  It is
// confirmed (the "maybe-" part is dropped) only if this cluster is one of
// the consonants isAppendageNonVowelWylie accepts.
} else if ( candidateType . startsWith ( " maybe-appendaged- " ) ) {
if ( isAppendageNonVowelWylie ( wylie ) ) {
candidateType
= candidateType . substring ( " maybe- " . length ( ) ) . intern ( ) ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a tsheg bar that has an achung ( " + ACHUNG + " ) tacked on, followed by some other thing whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
} else if ( candidateType . startsWith ( " appendaged- " ) ) {
2003-10-18 03:04:47 +00:00
// Besides this cluster alone, the concatenation of the last two and of the
// last three clusters' wylie is also tested as an appendage.
if ( TibetanMachineWeb . isWylieAchungAppendage ( wylie )
// 'ang:
| | TibetanMachineWeb . isWylieAchungAppendage ( lastPair . getWylie ( ) + wylie )
// 'ongs, as in ma'ongs:
| | ( i > 1
& & TibetanMachineWeb . isWylieAchungAppendage ( gcs . get ( i - 2 ) . getWylie ( ) + lastPair . getWylie ( ) + wylie ) ) ) {
2003-07-08 14:30:17 +00:00
// candidateType stays what it is.
} else if ( ACHUNG . equals ( wylie ) ) {
candidateType = ( " maybe- " + candidateType ) . intern ( ) ;
} else {
if ( null ! = warnings )
warnings . append ( " Found a tsheg bar that has a 'i, 'e, 'o, 'u, or 'ang 'am appendage already and then found yet another consonant or consonant stack whose wylie is " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
}
2003-03-30 02:31:16 +00:00
} else {
2003-07-13 17:38:00 +00:00
// Only "number" can legitimately reach this point; the two Errors below
// guard against states that were forgotten in the chain above.
if ( " invalid " = = candidateType )
throw new Error ( " forgot to break out of the for loop after diagnosing invalidity. " ) ;
2003-07-08 14:30:17 +00:00
if ( " number " ! = candidateType )
2003-07-13 17:38:00 +00:00
throw new Error ( " missed a case; case is " + candidateType ) ;
2003-07-08 14:30:17 +00:00
if ( null ! = warnings )
warnings . append ( " Found a consonant or consonant stack after something odd; the consonantish thing has wylie " + wylie + " \ n " ) ;
candidateType = " invalid " ;
break ;
2003-03-30 02:31:16 +00:00
}
}
2003-07-08 14:30:17 +00:00
} else if ( TGCPair . LONE_VOWEL = = cls ) {
// NOTE(review): this message is not newline-terminated either -- confirm.
if ( null ! = warnings )
warnings . append ( " Found a vowel that did not follow either a Tibetan consonant or consonant stack or another vowel. " ) ;
candidateType = " invalid " ;
break ;
2003-03-30 02:31:16 +00:00
} else {
2003-07-08 14:30:17 +00:00
throw new Error ( " bad cls " ) ;
2003-03-30 02:31:16 +00:00
}
2003-08-23 22:03:37 +00:00
lastPair = tp ;
2003-07-08 14:30:17 +00:00
}
2003-10-16 04:13:14 +00:00
// candidateType is still null only if the loop body never ran (sz == 0).
if ( null = = candidateType ) candidateType = " invalid " ;
2003-07-08 14:30:17 +00:00
// A trailing achung that was never confirmed as an appendage is an error.
if ( candidateType . startsWith ( " maybe-appendaged- " ) ) {
if ( null ! = warnings )
warnings . append ( " Found a tsheg bar that has an extra achung ( " + ACHUNG + " ) tacked on \ n " ) ;
candidateType = " invalid " ;
}
return candidateType ;
}
2002-11-02 03:38:59 +00:00
2003-09-02 06:39:33 +00:00
/ * * Appends to translitBuffer the EWTS / ACIP for the glyph list
glyphList ( which should be an ArrayList for speed ) . This will
be very user - friendly for " legal tsheg bars " and will be
valid , but possibly ugly ( interspersed with disambiguators or
extra vowels , etc . ) Wylie / ACIP for other things , such as
Sanskrit transliteration . Updates warnings and noSuch like
the caller does .
2003-07-08 14:30:17 +00:00
< p > What constitutes a legal , non - punctuation , non - whitespace
tsheg bar ? The following are the only such : < / p >
< ul >
< li > one or more numbers < / li >
< li > a single , possibly adorned consonant stack < / li >
< li > a legal " tyllable " appended with zero or more particles
from the set { ' i , ' o , ' u , ' e , ' ang , ' am } < / li >
< / ul >
< p > A " tyllable " is , by definition , one of the following : < / p >
< ul >
< li > a single , possibly adorned consonant stack < / li >
< li > two consonant stacks where one is a single ,
unadorned consonant ( and is a prefix it it is first and
a suffix if it is last ) and the other is possibly
adorned < / li >
< li > three consonant stacks where at most one has adornment .
If the second has adornment , then the first must be an
unadorned prefix consonant and the last must be an
unadorned suffix consonant . If the first has adornment ,
then the second must be an unadorned suffix consonant
and the third must be an unadorned secondary suffix
consonant . < / li >
< li > four consonant stacks where either none is adorned or
only the second consonant stack is adorned , the first is
an unadorned prefix consonant , the third is an unadorned
suffix consonant , and the fourth is an unadorned
secondary suffix consonant . < / li >
< / ul >
< p > When there are three unadorned consonant stacks in a
tyllable , a hard - coded list of valid Tibetan tsheg bars is
2003-09-02 06:39:33 +00:00
relied upon to determine if the 'a' / 'A' vowel comes after
the first or the second consonant . < / p > * /
// Implementation overview: the glyphs are split into grapheme clusters and
// classified with getClassificationOfTshegBar (prefix tests enabled).  An
// "invalid" or "single-sanskrit-gc" result takes the verbose path (every
// cluster is emitted on its own); any other result takes the compact path,
// which emits a type-specific head and then the remaining clusters.
//
// @param EWTSNotACIP true to emit EWTS, false to emit ACIP
// @param glyphList the DuffCode glyphs of one tsheg bar
// @param noSuch handed on to breakTshegBarIntoGraphemeClusters
// @param warnings handed on to getClassificationOfTshegBar; not written to
//        directly in this method
// @param translitBuffer receives the output; only ever appended to
// Throws Error if the classification is one the compact path does not know.
private static void getTshegBarTranslit ( boolean EWTSNotACIP ,
java . util . List glyphList ,
boolean noSuch [ ] ,
StringBuffer warnings ,
StringBuffer translitBuffer ) {
2003-08-10 19:02:56 +00:00
TGCList gcs
2003-09-02 06:39:33 +00:00
= breakTshegBarIntoGraphemeClusters ( glyphList , noSuch ) ;
2003-08-23 22:03:37 +00:00
String candidateType = getClassificationOfTshegBar ( gcs , warnings , false ) ;
2003-07-08 14:30:17 +00:00
int sz = gcs . size ( ) ;
2003-08-10 19:02:56 +00:00
// candidateType is compared by reference (==) here and below; this depends
// on getClassificationOfTshegBar and getCandidateTypeModuloAppendage handing
// back literal or interned strings.
if ( candidateType = = " invalid "
| | candidateType = = " single-sanskrit-gc " ) {
2003-07-08 14:30:17 +00:00
// Forget beauty and succintness -- just be sure to
2003-09-02 06:39:33 +00:00
// generate transliteration that can be converted
// unambiguously into Tibetan. Use a disambiguator or
// vowel after each grapheme cluster.
2003-07-08 14:30:17 +00:00
//
// If we truly didn't care about beauty, we'd just lump
// SANSKRIT_WITHOUT_VOWEL and SANSKRIT_WITH_VOWEL into
// OTHER.
for ( int i = 0 ; i < sz ; i + + ) {
TGCPair tp = ( TGCPair ) gcs . get ( i ) ;
int cls = tp . classification ;
2003-08-23 22:03:37 +00:00
String wylie = tp . getWylie ( ) ;
2003-09-02 06:39:33 +00:00
String translit = ( EWTSNotACIP ) ? wylie : tp . getACIP ( ) ;
2003-11-23 01:22:27 +00:00
// NOTE(review): WYLIE_DISAMBIGUATING_KEY is appended here without
// consulting EWTSNotACIP, whereas the separator appended further down in
// this loop switches to '-' for ACIP -- confirm this is intended.
if ( TibetanMachineWeb . isWylieVowel ( wylie ) & & i > 0 ) {
// au would be achen with au vowel, so use a.u; ai
// would be achen with ai vowel, so use a.i; l-i
// won't happen, you'd see la-i or gla-i, not l-i
// or gl-i; similarly for r-i, r-I, and l-I.
// Even though we only need it for ka.u and ka.i
// and a.u and a.i, we always do it (see Rule 10
// of the September 1, 2003 draft of EWTS
// standard).
translitBuffer . append ( WYLIE_DISAMBIGUATING_KEY ) ;
}
2003-09-02 06:39:33 +00:00
translitBuffer . append ( translit ) ;
2003-07-08 14:30:17 +00:00
// A cluster whose wylie is a bare consonant stack gets an explicit vowel;
// otherwise a separator is emitted, but only between clusters and only if
// the cluster is not classified as already carrying a vowel.
if ( TibetanMachineWeb . isWylieTibetanConsonantOrConsonantStack ( wylie )
| | TibetanMachineWeb . isWylieSanskritConsonantStack ( wylie ) ) {
2003-09-02 06:39:33 +00:00
translitBuffer . append ( aVowelToUseAfter ( EWTSNotACIP , wylie ) ) ;
2003-09-12 05:06:37 +00:00
} else if ( i + 1 < sz ) {
2003-07-08 14:30:17 +00:00
if ( TGCPair . CONSONANTAL_WITH_VOWEL ! = cls
& & TGCPair . SANSKRIT_WITH_VOWEL ! = cls )
2003-09-02 06:39:33 +00:00
translitBuffer . append ( EWTSNotACIP ? WYLIE_DISAMBIGUATING_KEY : '-' ) ;
2003-07-08 14:30:17 +00:00
}
}
} else {
2003-09-02 06:39:33 +00:00
// Generate perfect, beautiful transliteration, using the
// minimum number of vowels and disambiguators.
2002-11-02 03:38:59 +00:00
2003-07-08 14:30:17 +00:00
// leftover is the index of the first cluster NOT emitted by the
// type-specific code below; the loop at the end emits clusters from that
// index on.  It starts at sz + 1, and every recognized type overwrites it
// (unrecognized types throw Error).
int leftover = sz + 1 ;
2002-11-02 03:38:59 +00:00
2003-07-08 14:30:17 +00:00
// Appendaged vs. not appendaged? it affects nothing at
2003-09-12 05:06:37 +00:00
// this stage except for pa'm vs. pa'am.
boolean appendaged = ( candidateType . startsWith ( " appendaged- " ) ) ;
2003-08-10 19:02:56 +00:00
candidateType = getCandidateTypeModuloAppendage ( candidateType ) ;
2003-04-08 04:56:40 +00:00
2003-07-08 14:30:17 +00:00
if ( " prefix/root-root/suffix-suffix/postsuffix " = = candidateType ) {
/ * Yes , this is ambiguous . How do we handle it ? See
* this from Andres :
*
* I ' m posting this upon David Chandler ' s
* request . According to Lobsang Thonden in Modern
* Tibetan Grammar Language ( page 42 ) , with regards to
* identifying the root letter in 3 lettered words
* there are only 23 ambiguous cases . He writes :
*
* If the last letter is ' sa ' and the first two
* letters are affixes , then the SECOND ONE is the
* root letter in the following 9 WORDS ONLY :
*
2003-08-09 17:05:43 +00:00
* gdas gnas gsas dgas dmas bdas mdas ' gas ' das [ NOTE :
* Andres later came across ' bad , so we handle it this
* way also ]
2003-07-08 14:30:17 +00:00
*
* And the FIRST is the root letter in the following
* 14 WORDS ONLY :
*
* rags lags nags bags bangs gangs rangs langs nangs
* sangs babs rabs rams nams
*
* As I mentioned before , I think that the best
* solution for now is to hard - wire these cases . Even
* if the list is not exhaustive , at least we ' ll have
* most cases covered . * /
leftover = 3 ;
/ * FIXME : these constants are hard - wired here , rather
2003-11-23 01:22:27 +00:00
* than in THDLWylieConstants , because I ' m lazy . * /
2003-08-23 22:03:37 +00:00
String wylie1 = ( ( TGCPair ) gcs . get ( 0 ) ) . getWylie ( ) ;
String wylie2 = ( ( TGCPair ) gcs . get ( 1 ) ) . getWylie ( ) ;
String wylie3 = ( ( TGCPair ) gcs . get ( 2 ) ) . getWylie ( ) ;
2003-09-02 06:39:33 +00:00
// The ACIP strings are fetched only in ACIP mode; in EWTS mode they stay
// null and are never appended.
String acip1 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 0 ) ) . getACIP ( ) ;
String acip2 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 1 ) ) . getACIP ( ) ;
String acip3 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 2 ) ) . getACIP ( ) ;
2003-07-08 14:30:17 +00:00
// Hard-wired first/second letter pairs for which the SECOND cluster is
// taken as the root (vowel after the second); everything else puts the
// vowel after the first cluster.
// NOTE(review): only the first two clusters are tested here; the third
// is not compared against the word list quoted above -- confirm.
if ( ( wylie1 . equals ( " g " ) & & ( wylie2 . equals ( " d " ) | | wylie2 . equals ( " n " ) | | wylie2 . equals ( " s " ) ) )
| | ( wylie1 . equals ( " d " ) & & ( wylie2 . equals ( " g " ) | | wylie2 . equals ( " m " ) ) )
| | ( wylie1 . equals ( " b " ) & & wylie2 . equals ( " d " ) )
| | ( wylie1 . equals ( " m " ) & & wylie2 . equals ( " d " ) )
2003-08-09 17:05:43 +00:00
| | ( wylie1 . equals ( " ' " ) & & ( wylie2 . equals ( " g " ) | | wylie2 . equals ( " d " ) | | wylie2 . equals ( " b " ) ) ) ) {
2003-07-08 14:30:17 +00:00
if ( TibetanMachineWeb . isAmbiguousWylie ( wylie1 , wylie2 ) )
2003-09-02 06:39:33 +00:00
if ( EWTSNotACIP )
translitBuffer . append ( wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2 ) ;
else
translitBuffer . append ( acip1 + '-' + acip2 ) ;
2003-07-08 14:30:17 +00:00
else
2003-09-02 06:39:33 +00:00
if ( EWTSNotACIP )
translitBuffer . append ( wylie1 + wylie2 ) ;
else
translitBuffer . append ( acip1 + acip2 ) ;
2002-11-02 03:38:59 +00:00
2003-09-02 06:39:33 +00:00
translitBuffer . append ( aVowelToUseAfter ( EWTSNotACIP , wylie2 )
+ ( EWTSNotACIP ? wylie3 : acip3 ) ) ;
2003-07-08 14:30:17 +00:00
} else {
2003-09-02 06:39:33 +00:00
if ( EWTSNotACIP )
translitBuffer . append ( wylie1
+ aVowelToUseAfter ( EWTSNotACIP , wylie1 )
+ unambiguousPostAVowelTranslit ( EWTSNotACIP ,
wylie2 ,
wylie3 ,
acip2 ,
acip3 ) ) ;
else
translitBuffer . append ( acip1
+ aVowelToUseAfter ( EWTSNotACIP , wylie1 )
+ unambiguousPostAVowelTranslit ( EWTSNotACIP ,
wylie2 ,
wylie3 ,
acip2 ,
acip3 ) ) ;
2003-07-08 14:30:17 +00:00
}
// Types whose FIRST cluster takes the vowel.
} else if ( " root " = = candidateType
| | " prefix/root-root/suffix " = = candidateType
| | " prefix/root " = = candidateType
| | " root-suffix-postsuffix " = = candidateType
| | " root-suffix " = = candidateType ) {
2003-08-23 22:03:37 +00:00
String wylie1 = ( ( TGCPair ) gcs . get ( 0 ) ) . getWylie ( ) ;
2003-09-02 06:39:33 +00:00
String acip1 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 0 ) ) . getACIP ( ) ;
2003-07-08 14:30:17 +00:00
leftover = 1 ;
2003-09-02 06:39:33 +00:00
translitBuffer . append ( ( EWTSNotACIP ) ? wylie1 : acip1 ) ;
2003-07-08 14:30:17 +00:00
// An explicit vowel is added only if the cluster is not classified as
// already having one.
if ( ( ( TGCPair ) gcs . get ( 0 ) ) . classification
! = TGCPair . CONSONANTAL_WITH_VOWEL ) {
ThdlDebug . verify ( TGCPair . CONSONANTAL_WITHOUT_VOWEL
= = ( ( TGCPair ) gcs . get ( 0 ) ) . classification ) ;
2003-09-02 06:39:33 +00:00
translitBuffer . append ( aVowelToUseAfter ( EWTSNotACIP , wylie1 ) ) ;
2003-07-08 14:30:17 +00:00
if ( debug ) System . out . println ( " DEBUG: appending vowel " ) ;
} else {
if ( debug ) System . out . println ( " DEBUG: already has vowel 2 " ) ;
}
if ( " root-suffix-postsuffix " = = candidateType ) {
leftover = 3 ;
2003-08-23 22:03:37 +00:00
String wylie2 = ( ( TGCPair ) gcs . get ( 1 ) ) . getWylie ( ) ;
String wylie3 = ( ( TGCPair ) gcs . get ( 2 ) ) . getWylie ( ) ;
2003-09-02 06:39:33 +00:00
String acip2 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 1 ) ) . getACIP ( ) ;
String acip3 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 2 ) ) . getACIP ( ) ;
translitBuffer . append ( unambiguousPostAVowelTranslit ( EWTSNotACIP ,
wylie2 ,
wylie3 ,
acip2 ,
acip3 ) ) ;
2003-07-08 14:30:17 +00:00
}
// Types whose SECOND cluster takes the vowel (the first is a prefix).
} else if ( " prefix-root-suffix " = = candidateType
| | " prefix-root " = = candidateType
| | " prefix-root-suffix-postsuffix " = = candidateType ) {
2003-08-23 22:03:37 +00:00
String wylie1 = ( ( TGCPair ) gcs . get ( 0 ) ) . getWylie ( ) ;
String wylie2 = ( ( TGCPair ) gcs . get ( 1 ) ) . getWylie ( ) ;
2003-09-02 06:39:33 +00:00
String acip1 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 0 ) ) . getACIP ( ) ;
String acip2 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 1 ) ) . getACIP ( ) ;
2003-07-08 14:30:17 +00:00
leftover = 2 ;
if ( TibetanMachineWeb . isAmbiguousWylie ( wylie1 , wylie2 ) )
2003-09-02 06:39:33 +00:00
if ( EWTSNotACIP )
translitBuffer . append ( wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2 ) ;
else
translitBuffer . append ( acip1 + '-' + acip2 ) ;
2003-07-08 14:30:17 +00:00
else
2003-09-02 06:39:33 +00:00
if ( EWTSNotACIP )
translitBuffer . append ( wylie1 + wylie2 ) ;
else
translitBuffer . append ( acip1 + acip2 ) ;
2003-07-08 14:30:17 +00:00
if ( ( ( TGCPair ) gcs . get ( 1 ) ) . classification
! = TGCPair . CONSONANTAL_WITH_VOWEL ) {
ThdlDebug . verify ( TGCPair . CONSONANTAL_WITHOUT_VOWEL
= = ( ( TGCPair ) gcs . get ( 1 ) ) . classification ) ;
if ( debug ) System . out . println ( " DEBUG: appending vowel " ) ;
2003-09-02 06:39:33 +00:00
translitBuffer . append ( aVowelToUseAfter ( EWTSNotACIP , wylie2 ) ) ;
2003-07-08 14:30:17 +00:00
} else {
if ( debug ) System . out . println ( " DEBUG: already has vowel 1 " ) ;
}
if ( " prefix-root-suffix-postsuffix " = = candidateType ) {
leftover = 4 ;
2003-08-23 22:03:37 +00:00
String wylie3 = ( ( TGCPair ) gcs . get ( 2 ) ) . getWylie ( ) ;
String wylie4 = ( ( TGCPair ) gcs . get ( 3 ) ) . getWylie ( ) ;
2003-09-02 06:39:33 +00:00
String acip3 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 2 ) ) . getACIP ( ) ;
String acip4 = ( EWTSNotACIP ) ? null : ( ( TGCPair ) gcs . get ( 3 ) ) . getACIP ( ) ;
translitBuffer . append ( unambiguousPostAVowelTranslit ( EWTSNotACIP ,
wylie3 ,
wylie4 ,
acip3 ,
acip4 ) ) ;
2003-07-08 14:30:17 +00:00
}
} else if ( " number " = = candidateType ) {
// Nothing is emitted here; the loop below emits every cluster.
leftover = 0 ;
} else {
throw new Error ( " missed a case down here " ) ;
}
2002-11-02 03:38:59 +00:00
2003-09-02 06:39:33 +00:00
// append the wylie/ACIP left over:
2003-10-18 03:04:47 +00:00
// lastPairTranslit is the text of the previously emitted cluster; it is
// passed to the one-argument getWylie/getACIP overloads below and is only
// maintained when the tsheg bar is appendaged.
// NOTE(review): what those overloads do with the argument is not visible
// in this part of the file -- presumably context for the appendage.
String lastPairTranslit = null ;
if ( appendaged & & leftover > = 1 ) {
TGCPair tp = ( TGCPair ) gcs . get ( leftover - 1 ) ;
lastPairTranslit = ( EWTSNotACIP
? tp . getWylie ( null )
: tp . getACIP ( null ) ) ;
2004-05-01 19:11:15 +00:00
// If the buffer does not end with that cluster's text, the last character
// in the buffer is verified to be the vowel and is then added to
// lastPairTranslit.
if ( ! translitBuffer . toString ( ) . endsWith ( lastPairTranslit ) ) {
int l ;
if ( ( l = translitBuffer . length ( ) ) > 0 ) {
char lc = translitBuffer . charAt ( l - 1 ) ;
ThdlDebug . verify ( lc = = ( ( EWTSNotACIP ) ? 'a' : 'A' ) /* hard-coded ACIP and EWTS values */ ) ;
lastPairTranslit = lastPairTranslit + lc ; /* 'da'i can cause this */
} else {
ThdlDebug . verify ( false ) ; // this better never happen.
lastPairTranslit = null ;
}
}
2003-10-18 03:04:47 +00:00
}
2003-07-08 14:30:17 +00:00
for ( int i = leftover ; i < sz ; i + + ) {
TGCPair tp = ( TGCPair ) gcs . get ( i ) ;
2003-10-18 03:04:47 +00:00
String y ;
2003-09-12 05:06:37 +00:00
translitBuffer . append ( EWTSNotACIP
2003-10-18 03:04:47 +00:00
? ( y = tp . getWylie ( lastPairTranslit ) )
: ( y = tp . getACIP ( lastPairTranslit ) ) ) ;
if ( appendaged )
lastPairTranslit = y ;
2003-07-08 14:30:17 +00:00
}
}
}
2002-11-02 03:38:59 +00:00
/ * *
2003-09-02 06:39:33 +00:00
* Gets the Extended Wylie / ACIP for a sequence of glyphs . This works
* as follows :
2003-07-08 14:30:17 +00:00
*
* < p > We run along until we hit whitespace or punctuation . We take
* everything before that and we see if it ' s a legal Tibetan tsheg bar ,
* either a number or a word fragment . If it is , we insert only one
* vowel in the correct place . If not , then we throw a disambiguating
* key or a vowel after each stack .
*
2003-09-02 06:39:33 +00:00
* @param EWTSNotACIP true if you want THDL Extended Wylie , false if
* you want ACIP
2002-11-02 03:38:59 +00:00
* @param dcs an array of glyphs
2003-09-02 06:39:33 +00:00
* @param noSuch an array which will not be touched if this is
* successful ; however , if there is no THDL Extended Wylie / ACIP
* corresponding to these glyphs , then noSuch [ 0 ] will be set to true
2003-07-08 14:30:17 +00:00
* @param warnings either null or a buffer to which will be appended
* warnings about illegal tsheg bars
2003-09-02 06:39:33 +00:00
* @return the Extended Wylie / ACIP corresponding to these glyphs , or
* null * /
private static String getTranslitImplementation ( boolean EWTSNotACIP ,
DuffCode [ ] dcs ,
boolean noSuch [ ] ,
StringBuffer warnings ) {
2003-07-08 14:30:17 +00:00
if ( dcs . length = = 0 )
return null ;
2002-11-02 03:38:59 +00:00
2003-07-08 14:30:17 +00:00
ArrayList glyphList = new ArrayList ( ) ;
2003-09-02 06:39:33 +00:00
StringBuffer translitBuffer = new StringBuffer ( ) ;
2002-11-02 03:38:59 +00:00
2004-04-14 05:44:51 +00:00
// DLC FIXME: " " should become " " for ACIP
2003-07-08 14:30:17 +00:00
for ( int i = 0 ; i < dcs . length ; i + + ) {
char ch = dcs [ i ] . getCharacter ( ) ;
2003-05-31 17:02:06 +00:00
int k = dcs [ i ] . getCharNum ( ) ;
// int fontNum = dcs[i].getFontNum();
2003-03-30 02:31:16 +00:00
if ( k < 32 ) {
2003-07-08 14:30:17 +00:00
if ( ! glyphList . isEmpty ( ) ) {
2003-09-02 06:39:33 +00:00
getTshegBarTranslit ( EWTSNotACIP , glyphList , noSuch ,
warnings , translitBuffer ) ;
2003-03-30 02:31:16 +00:00
glyphList . clear ( ) ;
2003-07-08 14:30:17 +00:00
if ( null ! = warnings )
warnings . append ( " Some glyphs came right before a newline; they did not have a tsheg or shad come first. " ) ;
2003-03-30 02:31:16 +00:00
}
2003-09-12 05:06:37 +00:00
// In ACIP, \n\n (or \r\n\r\n with DOS line feeds)
// indicates a real line break.
if ( ! EWTSNotACIP & & '\n' = = ch ) {
if ( i > 0 & & dcs [ i - 1 ] . getCharacter ( ) = = '\r' )
translitBuffer . append ( " \ r \ n " ) ;
else
translitBuffer . append ( ch ) ;
}
2003-09-02 06:39:33 +00:00
translitBuffer . append ( ch ) ;
2003-03-30 02:31:16 +00:00
} else {
2003-09-02 06:39:33 +00:00
String wylie = TibetanMachineWeb . getWylieForGlyph ( dcs [ i ] , noSuch ) ;
2004-04-10 18:26:57 +00:00
String acip = null ;
if ( ! EWTSNotACIP ) {
// U+0F04 and U+0F05 -- these require lookahead to
// see if the ACIP is # (two shishes) or * (one
// swish)
int howManyConsumed [ ] = new int [ ] { - 1 /* invalid */ } ;
acip = TibetanMachineWeb . getACIPForGlyph ( dcs [ i ] ,
( ( i + 1 < dcs . length )
? dcs [ i + 1 ]
: null ) ,
2004-04-14 05:44:51 +00:00
( ( i + 2 < dcs . length )
? dcs [ i + 2 ]
: null ) ,
2004-04-10 18:26:57 +00:00
noSuch ,
howManyConsumed ) ;
if ( howManyConsumed [ 0 ] = = 1 ) {
// nothing to do
2004-04-14 05:44:51 +00:00
} else if ( howManyConsumed [ 0 ] = = 2 ) {
2004-04-10 18:26:57 +00:00
+ + i ;
2004-04-14 05:44:51 +00:00
} else {
ThdlDebug . verify ( howManyConsumed [ 0 ] = = 3 ) ;
+ + i ; + + i ;
2004-04-10 18:26:57 +00:00
}
}
2003-07-08 14:30:17 +00:00
if ( TibetanMachineWeb . isWyliePunc ( wylie )
& & ! TibetanMachineWeb . isWylieAdornment ( wylie ) ) {
if ( ! glyphList . isEmpty ( ) ) {
2003-09-02 06:39:33 +00:00
getTshegBarTranslit ( EWTSNotACIP , glyphList , noSuch ,
warnings , translitBuffer ) ;
2003-07-08 14:30:17 +00:00
glyphList . clear ( ) ;
2003-03-30 02:31:16 +00:00
}
2003-09-02 06:39:33 +00:00
translitBuffer . append ( EWTSNotACIP ? wylie : acip ) ; //append the punctuation
2003-07-08 14:30:17 +00:00
} else {
glyphList . add ( dcs [ i ] ) ;
2003-03-30 02:31:16 +00:00
}
}
}
2003-09-02 06:39:33 +00:00
// replace remaining TMW with transliteration
2003-03-30 02:31:16 +00:00
if ( ! glyphList . isEmpty ( ) ) {
2003-09-02 06:39:33 +00:00
getTshegBarTranslit ( EWTSNotACIP , glyphList , noSuch ,
warnings , translitBuffer ) ;
2003-07-08 14:30:17 +00:00
// glyphList.clear() if we weren't about to exit...
if ( null ! = warnings )
warnings . append ( " The stretch of Tibetan ended without final punctuation. " ) ;
2003-03-30 02:31:16 +00:00
}
2004-04-14 05:44:51 +00:00
if ( translitBuffer . length ( ) > 0 ) {
2003-09-02 06:39:33 +00:00
return translitBuffer . toString ( ) ;
2004-04-14 05:44:51 +00:00
}
2003-03-30 02:31:16 +00:00
else
return null ;
2003-07-08 14:30:17 +00:00
}
2003-08-10 19:02:56 +00:00
/** Returns "root" instead of "appendaged-root", for example. */
private static final String getCandidateTypeModuloAppendage ( String candidateType ) {
if ( candidateType . startsWith ( " appendaged- " ) ) {
candidateType
= candidateType . substring ( " appendaged- " . length ( ) ) . intern ( ) ;
}
return candidateType ;
2003-07-08 14:30:17 +00:00
}
2003-08-10 19:02:56 +00:00
/ * * Returns an array of size 2 that lists all the possible indices
* of the root stack given the chosen candidate type . A negative
* number appears if there are not that many possible positions
* for the root . ( You ' ll get two negative numbers if there is no
* root stack . ) * /
public static final int [ ] getIndicesOfRootForCandidateType ( String candidateType ) {
// Appendaged vs. not appendaged? it affects nothing.
candidateType = getCandidateTypeModuloAppendage ( candidateType ) ;
int [ ] rv = new int [ ] { - 1 , - 1 } ;
if ( candidateType = = " prefix/root "
| | candidateType . startsWith ( " root " ) ) {
rv [ 0 ] = 0 ;
} else if ( candidateType . startsWith ( " prefix/root- " ) ) {
rv [ 0 ] = 0 ;
rv [ 1 ] = 1 ;
} else if ( candidateType . startsWith ( " prefix-root " ) ) {
rv [ 0 ] = 1 ;
}
return rv ;
2003-07-08 14:30:17 +00:00
}
2003-08-23 22:03:37 +00:00
/ * * Returns true if and only if the stack with Wylie < i > root < / i >
* can take the prefix < i > prefix < / i > . * /
private static boolean isLegalPrefixRootCombo ( String prefix , String root ) {
// This will be decomposed enough. If you can decompose it,
// then it doesn't take a prefix!
if ( ! TibetanMachineWeb . isKnownHashKey ( root ) ) {
root = root . replace ( '+' , '-' ) ;
if ( ! TibetanMachineWeb . isKnownHashKey ( root ) ) {
2003-10-19 03:47:33 +00:00
// If the glyph isn't even in TibetanMachine, then
// it's not able to take any prefix.
return false ;
2003-08-23 22:03:37 +00:00
}
}
String ru = TibetanMachineWeb . getUnicodeForWylieForGlyph ( root ) ;
// ru may be for (head, root, sub), (head, root), (root), or
// (root, sub). Try all possibilities that are possible with
// a String of length ru. If there's a wa-zur, then we say
// (FIXME: do we say correctly?) that a stack with wa-zur can
// take a prefix if and only if the stack without can take a
// prefix.
if ( ru = = null ) throw new Error ( " how? root is " + root ) ; // FIXME: make this an assertion
int rl = ru . length ( ) ;
if ( ru . charAt ( rl - 1 ) = = UnicodeConstants . EWSUB_wa_zur )
- - rl ; // forget about wa-zur: see above.
if ( rl = = 2 ) {
char ch0 = ru . charAt ( 0 ) ;
char ch1 = UnicodeUtils . getNominalRepresentationOfSubscribedConsonant ( ru . charAt ( 1 ) ) ;
// (head, root) and (root, sub) are possibilities.
if ( ACHUNG . equals ( prefix ) ) {
return LegalTshegBar . takesAchungPrefix ( ch0 , ch1 , UnicodeConstants . EW_ABSENT )
| | LegalTshegBar . takesAchungPrefix ( UnicodeConstants . EW_ABSENT , ch0 , ch1 ) ;
} else if ( " b " . equals ( prefix ) ) {
return LegalTshegBar . takesBao ( ch0 , ch1 , UnicodeConstants . EW_ABSENT )
| | LegalTshegBar . takesBao ( UnicodeConstants . EW_ABSENT , ch0 , ch1 ) ;
} else if ( " m " . equals ( prefix ) ) {
return LegalTshegBar . takesMao ( ch0 , ch1 , UnicodeConstants . EW_ABSENT )
| | LegalTshegBar . takesMao ( UnicodeConstants . EW_ABSENT , ch0 , ch1 ) ;
} else if ( " g " . equals ( prefix ) ) {
return LegalTshegBar . takesGao ( ch0 , ch1 , UnicodeConstants . EW_ABSENT )
| | LegalTshegBar . takesGao ( UnicodeConstants . EW_ABSENT , ch0 , ch1 ) ;
} else if ( " d " . equals ( prefix ) ) {
return LegalTshegBar . takesDao ( ch0 , ch1 , UnicodeConstants . EW_ABSENT )
| | LegalTshegBar . takesDao ( UnicodeConstants . EW_ABSENT , ch0 , ch1 ) ;
} else {
throw new IllegalArgumentException ( " prefix is " + prefix ) ;
}
} else if ( rl = = 1 ) {
char ch0 = ru . charAt ( 0 ) ;
// (root) is the only choice.
if ( ACHUNG . equals ( prefix ) ) {
return LegalTshegBar . takesAchungPrefix ( UnicodeConstants . EW_ABSENT , ch0 , UnicodeConstants . EW_ABSENT ) ;
} else if ( " b " . equals ( prefix ) ) {
return LegalTshegBar . takesBao ( UnicodeConstants . EW_ABSENT , ch0 , UnicodeConstants . EW_ABSENT ) ;
} else if ( " m " . equals ( prefix ) ) {
return LegalTshegBar . takesMao ( UnicodeConstants . EW_ABSENT , ch0 , UnicodeConstants . EW_ABSENT ) ;
} else if ( " g " . equals ( prefix ) ) {
return LegalTshegBar . takesGao ( UnicodeConstants . EW_ABSENT , ch0 , UnicodeConstants . EW_ABSENT ) ;
} else if ( " d " . equals ( prefix ) ) {
return LegalTshegBar . takesDao ( UnicodeConstants . EW_ABSENT , ch0 , UnicodeConstants . EW_ABSENT ) ;
} else {
throw new IllegalArgumentException ( " prefix is " + prefix ) ;
}
} else if ( rl = = 3 ) {
char ch0 = ru . charAt ( 0 ) ;
char ch1 = UnicodeUtils . getNominalRepresentationOfSubscribedConsonant ( ru . charAt ( 1 ) ) ;
char ch2 = UnicodeUtils . getNominalRepresentationOfSubscribedConsonant ( ru . charAt ( 2 ) ) ;
// (head, root, sub) is the only choice.
if ( ACHUNG . equals ( prefix ) ) {
return LegalTshegBar . takesAchungPrefix ( ch0 , ch1 , ch2 ) ;
} else if ( " b " . equals ( prefix ) ) {
return LegalTshegBar . takesBao ( ch0 , ch1 , ch2 ) ;
} else if ( " m " . equals ( prefix ) ) {
return LegalTshegBar . takesMao ( ch0 , ch1 , ch2 ) ;
} else if ( " g " . equals ( prefix ) ) {
return LegalTshegBar . takesGao ( ch0 , ch1 , ch2 ) ;
} else if ( " d " . equals ( prefix ) ) {
return LegalTshegBar . takesDao ( ch0 , ch1 , ch2 ) ;
} else {
throw new IllegalArgumentException ( " prefix is " + prefix ) ;
}
} else {
return false ;
}
}
2002-11-02 03:38:59 +00:00
}