2003-08-10 19:02:56 +00:00
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
package org.thdl.tib.text ;
2005-02-07 03:17:40 +00:00
import org.thdl.util.ThdlDebug ;
2003-08-10 19:02:56 +00:00
/** An ordered pair consisting of a Tibetan grapheme cluster's (see
    {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
    definition of the term) classification and its
    context-insensitive THDL Extended Wylie representation.  NOTE
    WELL: this is not a real grapheme cluster; I'm misusing the term
    (FIXME).  It's actually whole or part of one.  It's part of one
    when this is U+0F7F alone.

    @author David Chandler */
2003-11-23 01:22:27 +00:00
public class TGCPair implements THDLWylieConstants {
2003-08-10 19:02:56 +00:00
public static final int OTHER = 1 ;
// a standalone achen would fall into this category:
public static final int CONSONANTAL_WITHOUT_VOWEL = 2 ;
public static final int CONSONANTAL_WITH_VOWEL = 3 ;
public static final int LONE_VOWEL = 4 ;
public static final int SANSKRIT_WITHOUT_VOWEL = 5 ;
public static final int SANSKRIT_WITH_VOWEL = 6 ;
2003-08-31 20:38:28 +00:00
/ * * Returns a human - readable ( well , programmer - readable )
representation of one of the public enumerations in this
class . * /
public static final String enumToString ( int cls ) {
if ( OTHER = = cls ) return " OTHER " ;
if ( LONE_VOWEL = = cls ) return " LONE_VOWEL " ;
if ( SANSKRIT_WITH_VOWEL = = cls ) return " SANSKRIT_WITH_VOWEL " ;
if ( SANSKRIT_WITHOUT_VOWEL = = cls ) return " SANSKRIT_WITHOUT_VOWEL " ;
if ( CONSONANTAL_WITH_VOWEL = = cls ) return " CONSONANTAL_WITH_VOWEL " ;
if ( CONSONANTAL_WITHOUT_VOWEL = = cls ) return " CONSONANTAL_WITHOUT_VOWEL " ;
if ( TYPE_OTHER = = cls ) return " TYPE_OTHER " ;
if ( TYPE_SANSKRIT = = cls ) return " TYPE_SANSKRIT " ;
if ( TYPE_TIBETAN = = cls ) return " TYPE_TIBETAN " ;
return null ;
}
2003-08-23 22:03:37 +00:00
public static final int TYPE_OTHER = 31 ;
public static final int TYPE_SANSKRIT = 32 ;
public static final int TYPE_TIBETAN = 33 ;
// Sanskrit or Tibetan consonant, or number, or oddball:
private String consonantWylie ;
private String vowelWylie ;
public String getConsonantWylie ( ) {
return consonantWylie ;
}
public String getVowelWylie ( ) {
return vowelWylie ;
}
/** Cludge. */
public void setWylie ( String x ) {
consonantWylie = x ;
vowelWylie = null ;
}
public String getWylie ( ) {
2003-10-18 03:04:47 +00:00
return getWylie ( null ) ;
2003-09-12 05:06:37 +00:00
}
2003-10-18 03:04:47 +00:00
/ * * Returns the EWTS for this pair , given that , if and only if
this pair is part of an appendaged tsheg bar like ma ' ongs or
pa ' am or spre ' ur , previousTranslitIfAppendaged is non - null .
If it is non - null , then it must be equal to the EWTS
transliteration of the previous pair .
@see # getACIP ( String )
* /
public String getWylie ( String previousTranslitIfAppendaged ) {
2003-11-23 01:22:27 +00:00
if ( ACHEN . equals ( consonantWylie ) ) {
// Unlike ACIP, EWTS uses e for achen with e vowel, not ae.
if ( null = = vowelWylie )
return ACHEN ;
else
return vowelWylie ;
}
2003-08-23 22:03:37 +00:00
StringBuffer b = new StringBuffer ( ) ;
if ( consonantWylie ! = null ) {
2003-10-18 03:04:47 +00:00
// Think of pa'am... we want 'am, not 'm; 'ang, not 'ng. But we want 'ur, not 'uar, 'is, not 'ias.
if ( null ! = previousTranslitIfAppendaged
& & " ' " . equals ( previousTranslitIfAppendaged ) ) {
b . append ( " a " ) ;
}
2003-09-12 05:06:37 +00:00
2003-08-23 22:03:37 +00:00
// we may have {p-y}, but the user wants to see {py}.
for ( int i = 0 ; i < consonantWylie . length ( ) ; i + + ) {
char ch = consonantWylie . charAt ( i ) ;
if ( '-' ! = ch )
b . append ( ch ) ;
}
}
2005-02-07 03:17:40 +00:00
if ( vowelWylie ! = null ) {
if ( consonantWylie ! = null // we need a stack to put an 'a' onto
& & ! TibetanMachineWeb . startsWithWylieVowelSequence ( vowelWylie ) ) {
b . append ( " a " ) ; // we want laM, not lM -- see bug 998476
}
2003-08-23 22:03:37 +00:00
b . append ( vowelWylie ) ;
2005-02-07 03:17:40 +00:00
}
2003-08-23 22:03:37 +00:00
return b . toString ( ) ;
}
2003-09-02 06:39:33 +00:00
public String getACIP ( ) {
2003-10-18 03:04:47 +00:00
return getACIP ( null ) ;
2003-09-12 05:06:37 +00:00
}
2004-04-14 05:44:51 +00:00
/ * * Like { @link # getWylie ( String ) } but for ACIP transliteration ,
not EWTS . * /
2003-10-18 03:04:47 +00:00
public String getACIP ( String previousTranslitIfAppendaged ) {
2003-09-02 06:39:33 +00:00
// DLC FIXME: has the EWTS change affected Manipulate.acipToWylie?
StringBuffer b = new StringBuffer ( ) ;
if ( consonantWylie ! = null ) {
2003-09-12 05:06:37 +00:00
String consonantACIP
2004-04-14 05:44:51 +00:00
= null ;
if ( " w " . equals ( consonantWylie )
& & ( SANSKRIT_WITHOUT_VOWEL = = classification
| | SANSKRIT_WITH_VOWEL = = classification ) )
consonantACIP = " V " ;
else
consonantACIP
= org . thdl . tib . text . ttt . ACIPRules . getACIPForEWTS ( consonantWylie ) ;
2003-09-12 05:06:37 +00:00
if ( null = = consonantACIP ) {
2004-04-14 05:44:51 +00:00
if ( null ! = consonantWylie & & consonantWylie . startsWith ( " R+ " ) )
return TibetanMachineWeb . getTMWToACIPErrorString ( " glyph with THDL Extended Wylie " + consonantWylie , " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A) " ) ;
return TibetanMachineWeb . getTMWToACIPErrorString ( " glyph with THDL Extended Wylie " + consonantWylie , " " ) ;
2003-09-12 05:06:37 +00:00
} else {
2004-04-14 05:44:51 +00:00
// Think of pa'am... we want 'am, not 'm; 'ang, not
// 'ng. But we want 'ur, not 'uar, 'is, not 'ias.
2003-10-18 03:04:47 +00:00
if ( null ! = previousTranslitIfAppendaged
& & " ' " . equals ( previousTranslitIfAppendaged ) ) {
b . append ( " A " ) ;
}
2003-09-12 05:06:37 +00:00
// we may have {P-Y}, but the user wants to see {PY}.
for ( int i = 0 ; i < consonantACIP . length ( ) ; i + + ) {
char ch = consonantACIP . charAt ( i ) ;
if ( '-' ! = ch )
b . append ( ch ) ;
}
2003-09-02 06:39:33 +00:00
}
}
if ( vowelWylie ! = null ) {
2003-09-12 05:06:37 +00:00
String vowelACIP
= org . thdl . tib . text . ttt . ACIPRules . getACIPForEWTS ( vowelWylie ) ;
if ( null = = vowelACIP ) {
2004-04-14 05:44:51 +00:00
return TibetanMachineWeb . getTMWToACIPErrorString ( " glyph with THDL Extended Wylie " + vowelWylie , " " ) ;
2003-09-12 05:06:37 +00:00
} else {
b . append ( vowelACIP ) ;
}
2003-09-02 06:39:33 +00:00
}
return b . toString ( ) ;
}
2003-08-10 19:02:56 +00:00
public int classification ;
2003-08-23 22:03:37 +00:00
/ * * Constructs a new TGCPair with ( Tibetan or Sanskrit ) consonant
* consonantWylie and vowel vowelWylie . Use
* classification = = TYPE_OTHER for numbers , lone vowels , marks ,
* etc . Use classification = = TYPE_TIBETAN for Tibetan ( not
* Tibetanized Sanskrit ) and classification = TYPE_SANSKRIT for
* Tibetanized Sanskrit . * /
public TGCPair ( String consonantWylie , String vowelWylie , int classification ) {
if ( " " . equals ( vowelWylie ) )
vowelWylie = null ;
// Technically, we don't need the following check, but it's
// nice for consistency's sake.
if ( " " . equals ( consonantWylie ) )
consonantWylie = null ;
// DLC FIXME: for speed, make these assertions:
if ( classification ! = TYPE_OTHER
& & classification ! = TYPE_TIBETAN
& & classification ! = TYPE_SANSKRIT ) {
throw new IllegalArgumentException ( " Bad classification " + classification + " . " ) ;
}
int realClassification = - 37 ;
if ( vowelWylie = = null & & classification = = TYPE_TIBETAN )
realClassification = CONSONANTAL_WITHOUT_VOWEL ;
2005-02-07 03:17:40 +00:00
if ( vowelWylie ! = null & & classification = = TYPE_TIBETAN ) {
2003-08-23 22:03:37 +00:00
realClassification = CONSONANTAL_WITH_VOWEL ;
2005-02-07 03:17:40 +00:00
}
2003-08-23 22:03:37 +00:00
if ( vowelWylie = = null & & classification = = TYPE_SANSKRIT )
realClassification = SANSKRIT_WITHOUT_VOWEL ;
if ( vowelWylie ! = null & & classification = = TYPE_SANSKRIT )
realClassification = SANSKRIT_WITH_VOWEL ;
if ( consonantWylie = = null ) {
if ( classification ! = TYPE_OTHER )
throw new IllegalArgumentException ( " That's the very definition of a lone vowel. " ) ;
realClassification = LONE_VOWEL ;
} else {
if ( classification = = TYPE_OTHER )
realClassification = OTHER ;
}
this . consonantWylie = consonantWylie ;
2003-09-12 05:06:37 +00:00
if ( null ! = vowelWylie ) {
if ( vowelWylie . equals ( " iA " ) | | vowelWylie . equals ( " Ai " ) )
vowelWylie = " I " ;
if ( vowelWylie . equals ( " uA " ) | | vowelWylie . equals ( " Au " ) )
vowelWylie = " U " ;
}
2005-02-07 03:17:40 +00:00
// Normalize vowelWylie such that any real vowel (i.e., a
// vowel for which TibetanMachineWeb.isWylieVowel(..) returns
// true) comes before any non-vowel but vowelish combining
// character like 'M', '?', '~M', '~M`', or 'H':
this . vowelWylie = normalizedVowel ( vowelWylie ) ;
2003-08-23 22:03:37 +00:00
this . classification = realClassification ;
2003-08-10 19:02:56 +00:00
}
2003-08-23 22:03:37 +00:00
2005-02-07 03:17:40 +00:00
private static final int MAX_CHARACTERS_IN_VOWELISH = 3 ; // ~M` has 3 characters
/ * * Returns v such that all the normal vowels in v come first and
the combining marks ( Sanskrit - ish ) come last . Relative order
in each of the two groups is preserved . * /
private static String normalizedVowel ( String v ) {
if ( null = = v | | " " . equals ( v ) ) return null ;
StringBuffer vowel_sb = new StringBuffer ( ) ;
StringBuffer vowelish_sb = new StringBuffer ( ) ;
int mark = 0 ;
int maxVowelishLength
= Math . max ( TibetanMachineWeb . getMaxEwtsVowelLength ( ) ,
TGCPair . MAX_CHARACTERS_IN_VOWELISH ) ;
for ( int i = 0 ; i < v . length ( ) ; i + + ) {
// Grab ~M` if it's there because you don't want to grab
// ~M and then have ` left over, which is not vowelish.
for ( int j = maxVowelishLength ; j > 0 ; j - - ) {
if ( i + j < = v . length ( ) ) {
String x = v . substring ( mark , i + j ) ;
if ( TibetanMachineWeb . isWylieVowel ( x ) ) {
mark = i + j ;
i + = j - 1 ;
vowel_sb . append ( x ) ;
} else if ( isWylieVowelishButNotVowel ( x ) ) {
mark = i + j ;
i + = j - 1 ;
vowelish_sb . append ( x ) ;
}
}
}
}
if ( mark < v . length ( ) ) {
vowelish_sb . append ( v . substring ( mark ) ) ;
ThdlDebug . noteIffyCode ( ) ;
// FIXME(dchandler): what should I do here? I doubt v is
// valid.
}
if ( vowelish_sb . length ( ) > 0 ) { // just an optimization
vowel_sb . append ( vowelish_sb ) ;
return vowel_sb . toString ( ) ;
} else {
ThdlDebug . verify ( vowel_sb . toString ( ) . equals ( v ) ) ;
return v ;
}
}
/** Returns true if v is in the set { "M", "H", "~M", "~M`", "?" }. */
private static boolean isWylieVowelishButNotVowel ( String v ) {
boolean ans = ( v . equals ( " M " )
| | v . equals ( " H " )
| | v . equals ( " ~M` " )
| | v . equals ( " ~M " )
| | v . equals ( " ? " ) ) ;
ThdlDebug . verify ( ! ans | | TGCPair . MAX_CHARACTERS_IN_VOWELISH > = v . length ( ) ) ;
return ans ;
}
2003-08-10 19:02:56 +00:00
public String toString ( ) {
2003-08-23 22:03:37 +00:00
return " <TGCPair wylie= " + getWylie ( ) + " classification= "
2003-08-10 19:02:56 +00:00
+ classification + " /> " ;
}
}