2003-08-10 19:30:07 +00:00
/ *
The contents of this file are subject to the THDL Open Community License
Version 1 . 0 ( the " License " ) ; you may not use this file except in compliance
with the License . You may obtain a copy of the License on the THDL web site
( http : //www.thdl.org/).
Software distributed under the License is distributed on an " AS IS " basis ,
WITHOUT WARRANTY OF ANY KIND , either express or implied . See the
License for the specific terms governing rights and limitations under the
License .
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library ( THDL ) . Portions created by the THDL are Copyright 2003 THDL .
All Rights Reserved .
Contributor ( s ) : ______________________________________ .
* /
2005-02-21 01:16:10 +00:00
// TODO(DLC)[EWTS->Tibetan]: If EWTS still has 'v', warn about it if it looks like someone thinks that ACIP's usage of it for wa-zur is how EWTS does things.
2003-08-10 19:30:07 +00:00
package org.thdl.tib.text.ttt ;
/ * * A factory for creating { @link TPairList TPairLists } from
* Strings of ACIP .
* @author David Chandler * /
2005-06-20 06:18:00 +00:00
// TODO(DLC)[EWTS->Tibetan]: kill this class; put it all in TTraits.
2003-08-10 19:30:07 +00:00
class TPairListFactory {
/** This class is not instantiable. */
private TPairListFactory ( ) { }
2005-06-20 06:18:00 +00:00
/** See {@link TTraits#breakTshegBarIntoChunks}. */
static TPairList [ ] breakACIPIntoChunks ( String tt ,
boolean specialHandlingForAppendages ) {
TTraits ttraits = ACIPTraits . instance ( ) ;
TPairList a = breakHelperACIP ( tt , true , false , ttraits ) ;
TPairList b = null ;
if ( specialHandlingForAppendages )
b = breakHelperACIP ( tt , false , false , ttraits ) ;
if ( null ! = b & & a . equals ( b ) )
return new TPairList [ ] { a , null } ;
else
return new TPairList [ ] { a , b } ;
2005-02-21 01:16:10 +00:00
}
2004-04-17 15:48:50 +00:00
/ * * Helps { @link # breakACIPIntoChunks ( String , boolean ) } .
2003-10-18 17:49:29 +00:00
* @param tickIsVowel true if and only if you want to treat the
* ACIP { ' } as an U + 0F71 vowel instead of the full - sized
* consonant in special , " this might be an appendage like 'AM or
* ' ANG " circumstances
* @param weHaveSeenVowelAlready true if and only if , in our
* recursion , we ' ve already found one vowel ( not a disambiguator ,
2003-11-30 02:06:48 +00:00
* but a vowel like " A " , " E " , " Um: " , " m " , " 'U " , etc . ) * /
2005-02-21 01:16:10 +00:00
private static TPairList breakHelperACIP ( String acip , boolean tickIsVowel ,
boolean weHaveSeenVowelAlready ,
TTraits ttraits ) {
2003-08-10 19:30:07 +00:00
// base case for our recursion:
if ( " " . equals ( acip ) )
2005-02-22 04:36:54 +00:00
return new TPairList ( ttraits ) ;
2003-08-10 19:30:07 +00:00
StringBuffer acipBuf = new StringBuffer ( acip ) ;
int howMuchBuf [ ] = new int [ 1 ] ;
2005-02-21 01:16:10 +00:00
TPair head = getFirstConsonantAndVowel ( acipBuf , howMuchBuf , ttraits ) ;
2003-08-10 19:30:07 +00:00
int howMuch = howMuchBuf [ 0 ] ;
2003-10-16 04:15:10 +00:00
if ( ! tickIsVowel
& & null ! = head . getLeft ( )
& & null ! = head . getRight ( )
2003-10-18 17:49:29 +00:00
& & weHaveSeenVowelAlready
2005-02-22 04:36:54 +00:00
& & ttraits . isSuffix ( head . getLeft ( ) ) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
2003-10-16 04:15:10 +00:00
& & head . getRight ( ) . startsWith ( " ' " ) ) {
2005-02-22 04:36:54 +00:00
head = new TPair ( ttraits , head . getLeft ( ) ,
2003-10-16 04:15:10 +00:00
// Without this disambiguator, we are
// less efficient (8 parses, not 4) and
// we can't handle PA'AM'ANG etc.
" - " ) ;
howMuch = head . getLeft ( ) . length ( ) ;
}
2003-08-10 19:30:07 +00:00
TPairList tail ;
if ( ( tail
2005-02-21 01:16:10 +00:00
= breakHelperACIP ( acipBuf . substring ( howMuch ) ,
tickIsVowel ,
weHaveSeenVowelAlready
| | ( head . getRight ( ) ! = null
& & ! " + " . equals ( head . getRight ( ) )
& & ! " - " . equals ( head . getRight ( ) ) ) ,
2005-06-20 06:18:00 +00:00
ttraits ) ) . hasSimpleError ( ) ) {
2003-08-10 19:30:07 +00:00
for ( int i = 1 ; i < howMuch ; i + + ) {
// try giving i characters back if that leaves us with
// a legal head and makes the rest free of simple
// errors.
TPairList newTail = null ;
TPair newHead ;
2005-02-21 01:16:10 +00:00
if ( ( newHead = head . minusNRightmostTransliterationCharacters ( i ) ) . isLegal ( )
2003-08-10 19:30:07 +00:00
& & ! ( newTail
2005-02-21 01:16:10 +00:00
= breakHelperACIP ( acipBuf . substring ( howMuch - i ) ,
tickIsVowel ,
weHaveSeenVowelAlready
| | ( newHead . getRight ( ) ! = null
& & ! " + " . equals ( newHead . getRight ( ) )
& & ! " - " . equals ( newHead . getRight ( ) ) ) ,
2005-06-20 06:18:00 +00:00
ttraits ) ) . hasSimpleError ( ) ) {
2005-02-21 01:16:10 +00:00
newTail . prepend ( newHead ) ;
return newTail ;
}
}
// It didn't work. Return the first thing we'd thought
// of: head appended with tail. (I.e., fall through.)
}
tail . prepend ( head ) ;
return tail ;
}
2005-06-20 06:18:00 +00:00
/** See {@link TTraits#breakTshegBarIntoChunks}. */
static TPairList [ ] breakEWTSIntoChunks ( String ewts )
throws IllegalArgumentException
{
EWTSTraits traits = EWTSTraits . instance ( ) ;
TPairList pl = breakHelperEWTS ( ewts , traits ) ;
TPairList npl = pl ;
// TODO(DLC)[EWTS->Tibetan]: this crap ain't workin' for kaHM. But kaeM and kaMe shouldn't work, right? Figure out what EWTS really says...
// TODO(DLC)[EWTS->Tibetan]: for "a\\0f86" e.g.:
if ( pl . size ( ) > 1 ) {
npl = new TPairList ( traits , pl . size ( ) ) ;
for ( int i = pl . size ( ) - 1 ; i > = 1 ; i - - ) {
TPair left = pl . get ( i - 1 ) ;
TPair right = pl . get ( i ) ;
if ( traits . aVowel ( ) . equals ( left . getRight ( ) )
& & left . getLeft ( ) = = null
& & right . getLeft ( ) = = null
& & traits . isWowelThatRequiresAChen ( right . getRight ( ) ) ) {
npl . prepend ( new TPair ( traits , traits . aVowel ( ) , right . getRight ( ) ) ) ;
- - i ;
} else if ( traits . aVowel ( ) . equals ( left . getRight ( ) )
& & left . getLeft ( ) ! = null
& & right . getLeft ( ) = = null
& & traits . isWowelThatRequiresAChen ( right . getRight ( ) )
& & false /* TODO(DLC)[EWTS->Tibetan]: ewts kaM is bothersome now */ ) {
npl . prepend ( new TPair ( traits , left . getLeft ( ) , right . getRight ( ) ) ) ;
- - i ;
} else {
npl . prepend ( right ) ;
if ( i = = 1 )
npl . prepend ( left ) ;
}
}
}
TPairList nnpl ;
if ( true ) {
// Collapse ( . wowel1) ( . wowel2) into (
// . wowel1+wowel2). Then collapse (* . a) ( . x) into (*
// . x). Also, if an a-chen (\u0f68) is implied, then
// insert it.
TPairList xnnpl = new TPairList ( traits , pl . size ( ) ) ;
for ( int i = 0 ; i < npl . size ( ) ; ) {
TPair p = npl . get ( i ) ;
int set_i_to = i + 1 ;
if ( p . getLeft ( ) = = null
& & p . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p . getRight ( ) )
& & ! " + " . equals ( p . getRight ( ) ) ) {
StringBuffer sb = new StringBuffer ( p . getRight ( ) ) ;
for ( int j = i + 1 ; j < npl . size ( ) ; j + + ) {
TPair p2 = npl . get ( j ) ;
if ( p2 . getLeft ( ) = = null
& & p2 . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p2 . getRight ( ) )
& & ! " + " . equals ( p2 . getRight ( ) ) )
{
sb . append ( " + " + p2 . getRight ( ) ) ;
set_i_to = j + 1 ;
} else {
break ;
}
}
p = new TPair ( traits , traits . aVowel ( ) , sb . toString ( ) ) ;
}
// TODO(DLC)[EWTS->Tibetan]: Do we still have "ai" converting to the wrong thing. "ae"?
xnnpl . append ( p ) ;
i = set_i_to ;
}
nnpl = new TPairList ( traits , pl . size ( ) ) ;
// (* . a ) ( . x) ... ( . y) -> (* . a+x+...+y)
for ( int i = 0 ; i < xnnpl . size ( ) ; ) {
TPair p = xnnpl . get ( i ) ;
int set_i_to = i + 1 ;
if ( traits . aVowel ( ) . equals ( p . getRight ( ) ) ) {
StringBuffer sb = new StringBuffer ( p . getRight ( ) ) ;
for ( int j = i + 1 ; j < xnnpl . size ( ) ; j + + ) {
TPair p2 = xnnpl . get ( j ) ;
if ( p2 . getLeft ( ) = = null
& & p2 . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p2 . getRight ( ) )
& & ! " + " . equals ( p2 . getRight ( ) ) )
{
// TODO(DLC)[EWTS->Tibetan] a+o+e is what we'll get.. maybe we want just o+e?
sb . append ( " + " + p2 . getRight ( ) ) ;
set_i_to = j + 1 ;
} else {
break ;
}
}
p = new TPair ( traits , p . getLeft ( ) , sb . toString ( ) ) ;
}
if ( false ) { // TODO(DLC)[EWTS->Tibetan]: bra is screwed up, do in it stacklist?
// EWTS does not think that kra is k+ra. Replace
// (consonant . ) with (consonant . DISAMBIGUATOR):
if ( p . getRight ( ) = = null & & p . getLeft ( ) ! = null
& & i + 1 < xnnpl . size ( ) )
p = new TPair ( traits , p . getLeft ( ) , traits . disambiguator ( ) ) ;
}
nnpl . append ( p ) ;
i = set_i_to ;
}
} else {
// TODO(DLC)[EWTS->Tibetan]: this block is not executing. kill it after testing and thinking
nnpl = new TPairList ( traits , pl . size ( ) ) ;
for ( int i = npl . size ( ) - 1 ; i > = 0 ; i - - ) {
TPair p = npl . get ( i ) ;
if ( p . getLeft ( ) = = null
& & p . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p . getRight ( ) )
& & ! " + " . equals ( p . getRight ( ) ) ) /* TODO(DLC)[EWTS->Tibetan] this should be equivalent to isWowel(p.getRight()) but o+o shows that's not true yet */
p = new TPair ( traits , traits . aVowel ( ) , p . getRight ( ) ) ;
// TODO(DLC)[EWTS->Tibetan]: do you still have "ai" converting to the wrong thing? ("ae" also?)
nnpl . prepend ( p ) ;
}
}
// TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it!
return new TPairList [ ] {
nnpl , null
} ;
}
2005-02-21 01:16:10 +00:00
// TODO(DLC)[EWTS->Tibetan]: doc
2005-02-22 04:36:54 +00:00
private static TPairList breakHelperEWTS ( String ewts , TTraits ttraits ) {
2005-02-21 01:16:10 +00:00
// base case for our recursion:
if ( " " . equals ( ewts ) )
2005-02-22 04:36:54 +00:00
return new TPairList ( ttraits ) ;
2005-02-21 01:16:10 +00:00
StringBuffer ewtsBuf = new StringBuffer ( ewts ) ;
int howMuchBuf [ ] = new int [ 1 ] ;
TPair head = getFirstConsonantAndVowel ( ewtsBuf , howMuchBuf , ttraits ) ;
int howMuch = howMuchBuf [ 0 ] ;
TPairList tail ;
if ( ( tail = breakHelperEWTS ( ewtsBuf . substring ( howMuch ) ,
2005-06-20 06:18:00 +00:00
ttraits ) ) . hasSimpleError ( ) ) {
2005-02-21 01:16:10 +00:00
for ( int i = 1 ; i < howMuch ; i + + ) {
// try giving i characters back if that leaves us with
// a legal head and makes the rest free of simple
// errors.
TPairList newTail = null ;
TPair newHead ;
if ( ( newHead = head . minusNRightmostTransliterationCharacters ( i ) ) . isLegal ( )
& & ! ( newTail
2005-06-20 06:18:00 +00:00
= breakHelperEWTS ( ewtsBuf . substring ( howMuch - i ) , ttraits ) ) . hasSimpleError ( ) ) {
2003-08-10 19:30:07 +00:00
newTail . prepend ( newHead ) ;
return newTail ;
}
}
// It didn't work. Return the first thing we'd thought
// of: head appended with tail. (I.e., fall through.)
}
tail . prepend ( head ) ;
return tail ;
}
2005-06-20 06:18:00 +00:00
private static String GetInitialVowel ( TTraits ttraits , String tx ,
String startOfVowel ) {
if ( null = = startOfVowel ) startOfVowel = " " ;
boolean startsWithPlus = false ;
if ( ! " " . equals ( startOfVowel )
& & ( ! ttraits . vowelsMayStack ( )
| | ( tx . length ( ) < 1 | | ! ( startsWithPlus = tx . substring ( 0 , 1 ) . equals ( " + " ) ) ) ) )
return ( " " . equals ( startOfVowel ) ? null : startOfVowel ) ;
if ( startsWithPlus )
tx = tx . substring ( 1 ) ;
for ( int i = Math . min ( ttraits . maxWowelLength ( ) , tx . length ( ) ) ; i > = 1 ; i - - ) {
String t = tx . substring ( 0 , i ) ;
if ( ttraits . isWowel ( t )
| | ( ttraits . isACIP ( )
// Or these, which we massage into "Am", "Am:", and
// "A:" because I didn't think {Pm} should be treated
// like {PAm} originally:
// TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE
& & ( " m " . equals ( t ) | | " m: " . equals ( t ) | | " : " . equals ( t ) ) ) ) {
// If this is followed by +wowel[+wowel[+wowel... in EWTS then that's part of the vowel also:
return GetInitialVowel ( ttraits ,
tx . substring ( i ) ,
startOfVowel + ( startsWithPlus ? " + " : " " ) + t ) ;
}
}
return null ;
}
/ * * Returns the largest TPair we can make from the transliteration
* starting from the left . This will return a size zero pair if
* and only if tx is the empty string ; otherwise , it may return a
* pair with either the left or right component empty . [ FOR
* ACIP : ] This mutates tx when we run into { NA + YA } ; it mutates tx
* into { N + YA } . For { NE + YA } , it does not mutate tx or behave
* intelligently . A later phase will need to turn that into
* { N + YE } or an error or whatever you like . howMuch [ 0 ] will be
* set to the number of characters of tx that this call has
* consumed . * /
private static TPair getFirstConsonantAndVowel ( StringBuffer tx , // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it?
2005-02-21 01:16:10 +00:00
int howMuch [ ] ,
TTraits ttraits ) {
2005-06-20 06:18:00 +00:00
// To handle EWTS "phywa\\u0f84\u0f86" [yes that's two slashes
// and then one slash], for example, we need to make the wowel
// (the getRight() field of the returned TPair) contain
// everything that it should.
//
// It can't hurt in ACIP, though I don't recall if ACIP's lexer
// allows Unicode characters.
TPair og = helpGetFirstConsonantAndVowel ( tx , howMuch , ttraits ) ;
int len = tx . length ( ) ;
StringBuffer x = null ;
while ( howMuch [ 0 ] < len ) {
if ( isUnicodeWowelChar ( tx . charAt ( howMuch [ 0 ] ) ) ) {
if ( null = = x ) x = new StringBuffer ( ) ; // rarely happens
if ( x . length ( ) > 0 ) x . append ( '+' ) ;
x . append ( tx . charAt ( howMuch [ 0 ] + + ) ) ;
} else {
break ;
}
}
// In EWTS, deal with M, ~M`, etc. They're much like
// UnicodeWowelCharacters.
if ( ttraits instanceof EWTSTraits ) {
EWTSTraits tt = ( EWTSTraits ) ttraits ;
while ( howMuch [ 0 ] < len ) {
int howMuchExtra [ ] = new int [ ] { 0 } ;
TPair p
= helpGetFirstConsonantAndVowel ( new StringBuffer ( tx . substring ( howMuch [ 0 ] ) ) ,
howMuchExtra ,
ttraits ) ;
if ( p . getLeft ( ) = = null
& & p . getRight ( ) ! = null
& & tt . isWowelThatRequiresAChen ( p . getRight ( ) ) ) {
if ( null = = x ) x = new StringBuffer ( ) ; // rarely happens
String extra ;
if ( x . length ( ) > 0 ) x . append ( '+' ) ;
x . append ( extra = tx . substring ( howMuch [ 0 ] , howMuch [ 0 ] + howMuchExtra [ 0 ] ) ) ;
// System.out.println("extra is " + extra); TODO(DLC)[EWTS->Tibetan]
howMuch [ 0 ] + = howMuchExtra [ 0 ] ;
} else {
break ;
}
}
}
if ( null ! = x )
return new TPair ( ttraits , og . getLeft ( ) ,
( null = = og . getRight ( ) | | ttraits . aVowel ( ) . equals ( og . getRight ( ) ) )
? x . toString ( )
: ( og . getRight ( ) + " + " + x . toString ( ) ) ) ;
else
return og ;
}
private static TPair helpGetFirstConsonantAndVowel ( StringBuffer tx , // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it?
int howMuch [ ] ,
TTraits ttraits ) {
// Note that it is *not* the case that if tx.substring(0, N)
2003-08-10 19:30:07 +00:00
// is legal (according to TPair.isLegal()), then
2005-06-20 06:18:00 +00:00
// tx.substring(0, N-1) is legal for all N. For example,
2005-02-21 01:16:10 +00:00
// think of ACIP's {shA} and {KshA}. However, 's' is the only
2005-06-20 06:18:00 +00:00
// tricky fellow in ACIP, so in ACIP it is true that
// tx.substring(0, N-1) is either legal or ends with 's' if
// tx.substring(0, N) is legal.
2003-08-10 19:30:07 +00:00
//
// We don't, however, use this approach. We just try to find
// a consonant of length 3, and then, failing that, of length
// 2, etc. Likewise with vowels. This avoids the issue.
2005-06-20 06:18:00 +00:00
int i , xl = tx . length ( ) ;
// TODO(DLC)[EWTS->Tibetan]: nasty special case!
if ( false & & ! ttraits . isACIP ( ) /* TODO(DLC)[EWTS->Tibetan]: isEWTS! */
& & xl > = 2 & & tx . charAt ( 0 ) = = 'a' & & ( tx . charAt ( 1 ) = = 'i' | | tx . charAt ( 1 ) = = 'u' ) ) {
howMuch [ 0 ] = 2 ;
return new TPair ( ttraits , null , tx . substring ( 0 , 2 ) ) ;
// TODO(DLC)[EWTS->Tibetan]: test that "au" alone is \u0f68\u0f7d, "ai" alone is \u0f68\u0f7b in EWTS.
}
2003-08-10 19:30:07 +00:00
if ( 0 = = xl ) {
howMuch [ 0 ] = 0 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , null , null ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
if ( tx . charAt ( 0 ) = = ttraits . disambiguatorChar ( ) ) {
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = 1 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , null , ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
char ch = tx . charAt ( 0 ) ;
2003-08-10 19:30:07 +00:00
// Numbers never appear in stacks, so if you see 1234, that's
2005-06-20 06:18:00 +00:00
// like seeing 1-2-3-4. Though in EWTS you can have '0\u0f19'
2003-08-10 19:30:07 +00:00
if ( ch > = '0' & & ch < = '9' ) {
2005-06-20 06:18:00 +00:00
// TODO(DLC)[EWTS->Tibetan]: test case: 0e should have a-chen and 0\u0f74 should go through without errors.
if ( xl > 1 & & ttraits . isUnicodeWowel ( tx . charAt ( 1 ) ) ) {
howMuch [ 0 ] = 2 ;
return new TPair ( ttraits , tx . substring ( 0 , 1 ) , tx . substring ( 1 , 2 ) ) ;
}
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = 1 ; // not 2...
2005-06-20 06:18:00 +00:00
return new TPair ( ttraits , tx . substring ( 0 , 1 ) , ( xl = = 1 ) ? null : ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
String l = null , r = null ;
2005-02-21 01:16:10 +00:00
for ( i = Math . min ( ttraits . maxConsonantLength ( ) , xl ) ; i > = 1 ; i - - ) {
2003-08-10 19:30:07 +00:00
String t = null ;
2005-06-20 06:18:00 +00:00
if ( ttraits . isConsonant ( t = tx . substring ( 0 , i ) )
| | ( ttraits . vowelAloneImpliesAChen ( ) // handle EWTS {a+yo}
& & ttraits . aVowel ( ) . equals ( tx . substring ( 0 , i ) )
& & i < xl & & tx . substring ( i , i + i ) . equals ( " + " ) ) ) {
2003-08-10 19:30:07 +00:00
l = t ;
break ;
}
}
int ll = ( null = = l ) ? 0 : l . length ( ) ;
2005-06-20 06:18:00 +00:00
if ( null ! = l & & xl > ll & & tx . charAt ( ll ) = = ttraits . disambiguatorChar ( ) ) {
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = l . length ( ) + 1 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , l , ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
if ( null ! = l & & xl > ll & & tx . charAt ( ll ) = = '+' ) {
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = l . length ( ) + 1 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , l , " + " ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
int mod = 0 ;
r = GetInitialVowel ( ttraits , tx . substring ( ll ) , null ) ;
if ( ttraits . isACIP ( ) ) {
// Treat {BATA+SA'I} like {BAT+SA'I}: // TODO(DLC)[EWTS->Tibetan]: in EWTS???
int z ;
if ( null ! = l
& & ttraits . aVowel ( ) . equals ( r )
& & ( ( z = ll + ttraits . aVowel ( ) . length ( ) ) < xl )
& & tx . charAt ( z ) = = '+' ) {
tx . deleteCharAt ( z - 1 ) ;
howMuch [ 0 ] = l . length ( ) + 1 ;
return new TPair ( ttraits , l , " + " ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: in EWTS? */
if ( " m " . equals ( r ) ) { r = " Am " ; mod = - 1 ; }
if ( " : " . equals ( r ) ) { r = " A: " ; mod = - 1 ; }
if ( " m: " . equals ( r ) ) { r = " Am: " ; mod = - 1 ; }
if ( " :m " . equals ( r ) ) { r = " A:m " ; mod = - 1 ; } // not seen, though...
2003-08-10 19:30:07 +00:00
}
2005-02-21 01:16:10 +00:00
// what if we see a character that's not part of any wowel or
2003-08-10 19:30:07 +00:00
// consonant? We return it.
if ( null = = l & & null = = r ) {
howMuch [ 0 ] = 1 ; // not 2...
2005-02-21 01:16:10 +00:00
// add a disambiguator to avoid exponential running time:
2005-06-20 06:18:00 +00:00
return new TPair ( ttraits , tx . substring ( 0 , 1 ) ,
2005-02-21 01:16:10 +00:00
( xl = = 1 ) ? null : ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
howMuch [ 0 ] = ( ( ( l = = null ) ? 0 : l . length ( ) )
2003-11-30 02:06:48 +00:00
+ ( ( r = = null ) ? 0 : r . length ( ) )
+ mod ) ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , l , r ) ;
2005-02-21 01:16:10 +00:00
} // TODO(DLC)[EWTS->Tibetan]:
2005-06-20 06:18:00 +00:00
private static boolean isUnicodeWowelChar ( char ch ) {
return ( ( ch > = '\u0f71' & & ch < = '\u0f84' )
| | " \ u0f35 \ u0f37 \ u0f18 \ u0f19 \ u0f3e \ u0f3f \ u0f86 \ u0f87 \ u0fc6 " . indexOf ( ch ) > = 0 ) ;
// TODO(dchandler): should we really allow "phywa\\u0f18", or
// does \u0f18 only combine with digits?
}
2003-08-10 19:30:07 +00:00
}
2003-11-09 01:07:45 +00:00
// FIXME: test for nested comments
2003-08-10 19:30:07 +00:00
2003-11-09 01:07:45 +00:00
// FIXME: see Translit directory on ACIP v4 CD-ROM