2003-08-10 19:30:07 +00:00
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
2005-02-21 01:16:10 +00:00
// TODO(DLC)[EWTS->Tibetan]: If EWTS still has 'v', warn about it if it looks like someone thinks that ACIP's usage of it for wa-zur is how EWTS does things.
2003-08-10 19:30:07 +00:00
package org.thdl.tib.text.ttt ;
Numerous EWTS->Unicode and especially EWTS->TMW improvements.
Fixed ordering of Unicode wowels. [ku+A] gives the correct Unicode
now, e.g.
EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A].
EWTS->TMW should now give errors any time the full input isn't used.
Previously, wacky wowels like [kai+-i] would lead to some droppage.
EWTS->TMW->Unicode testing is now in effect. This found a ton of
EWTS->TMW bugs, most or all of which are fixed now.
TMW->Unicode is improved/fixed for {
\u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is
discouraged in favor of "\u0f71\u0f74".)
NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly
builds' sake, but I ran it in my sandbox and it passed.
2005-07-11 02:51:06 +00:00
import java.util.Arrays ;
import java.util.Comparator ;
2005-07-11 03:10:32 +00:00
import java.util.List ;
Numerous EWTS->Unicode and especially EWTS->TMW improvements.
Fixed ordering of Unicode wowels. [ku+A] gives the correct Unicode
now, e.g.
EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A].
EWTS->TMW should now give errors any time the full input isn't used.
Previously, wacky wowels like [kai+-i] would lead to some droppage.
EWTS->TMW->Unicode testing is now in effect. This found a ton of
EWTS->TMW bugs, most or all of which are fixed now.
TMW->Unicode is improved/fixed for {
\u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is
discouraged in favor of "\u0f71\u0f74".)
NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly
builds' sake, but I ran it in my sandbox and it passed.
2005-07-11 02:51:06 +00:00
import org.thdl.tib.text.THDLWylieConstants ;
2005-07-11 03:10:32 +00:00
import org.thdl.tib.text.TibetanMachineWeb ;
2005-07-06 02:19:38 +00:00
2003-08-10 19:30:07 +00:00
/** A factory for creating {@link TPairList TPairLists} from
 *  Strings of ACIP or EWTS transliteration.
 *  @author David Chandler */
2005-06-20 06:18:00 +00:00
// TODO(DLC)[EWTS->Tibetan]: kill this class; put it all in TTraits.
2003-08-10 19:30:07 +00:00
class TPairListFactory {
/** Private constructor: this class only offers static methods, so it
 *  is not instantiable. */
private TPairListFactory ( ) { }
2005-06-20 06:18:00 +00:00
/** See {@link TTraits#breakTshegBarIntoChunks}. */
static TPairList [ ] breakACIPIntoChunks ( String tt ,
boolean specialHandlingForAppendages ) {
TTraits ttraits = ACIPTraits . instance ( ) ;
TPairList a = breakHelperACIP ( tt , true , false , ttraits ) ;
TPairList b = null ;
if ( specialHandlingForAppendages )
b = breakHelperACIP ( tt , false , false , ttraits ) ;
if ( null ! = b & & a . equals ( b ) )
return new TPairList [ ] { a , null } ;
else
return new TPairList [ ] { a , b } ;
2005-02-21 01:16:10 +00:00
}
2004-04-17 15:48:50 +00:00
/ * * Helps { @link # breakACIPIntoChunks ( String , boolean ) } .
2003-10-18 17:49:29 +00:00
* @param tickIsVowel true if and only if you want to treat the
* ACIP { ' } as an U + 0F71 vowel instead of the full - sized
* consonant in special , " this might be an appendage like 'AM or
* ' ANG " circumstances
* @param weHaveSeenVowelAlready true if and only if , in our
* recursion , we ' ve already found one vowel ( not a disambiguator ,
2003-11-30 02:06:48 +00:00
* but a vowel like " A " , " E " , " Um: " , " m " , " 'U " , etc . ) * /
2005-02-21 01:16:10 +00:00
private static TPairList breakHelperACIP ( String acip , boolean tickIsVowel ,
boolean weHaveSeenVowelAlready ,
TTraits ttraits ) {
2003-08-10 19:30:07 +00:00
// base case for our recursion:
if ( " " . equals ( acip ) )
2005-02-22 04:36:54 +00:00
return new TPairList ( ttraits ) ;
2003-08-10 19:30:07 +00:00
StringBuffer acipBuf = new StringBuffer ( acip ) ;
int howMuchBuf [ ] = new int [ 1 ] ;
2005-02-21 01:16:10 +00:00
TPair head = getFirstConsonantAndVowel ( acipBuf , howMuchBuf , ttraits ) ;
2003-08-10 19:30:07 +00:00
int howMuch = howMuchBuf [ 0 ] ;
2003-10-16 04:15:10 +00:00
if ( ! tickIsVowel
& & null ! = head . getLeft ( )
& & null ! = head . getRight ( )
2003-10-18 17:49:29 +00:00
& & weHaveSeenVowelAlready
2005-02-22 04:36:54 +00:00
& & ttraits . isSuffix ( head . getLeft ( ) ) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
2003-10-16 04:15:10 +00:00
& & head . getRight ( ) . startsWith ( " ' " ) ) {
2005-02-22 04:36:54 +00:00
head = new TPair ( ttraits , head . getLeft ( ) ,
2003-10-16 04:15:10 +00:00
// Without this disambiguator, we are
// less efficient (8 parses, not 4) and
// we can't handle PA'AM'ANG etc.
" - " ) ;
howMuch = head . getLeft ( ) . length ( ) ;
}
2003-08-10 19:30:07 +00:00
TPairList tail ;
if ( ( tail
2005-02-21 01:16:10 +00:00
= breakHelperACIP ( acipBuf . substring ( howMuch ) ,
tickIsVowel ,
weHaveSeenVowelAlready
| | ( head . getRight ( ) ! = null
& & ! " + " . equals ( head . getRight ( ) )
& & ! " - " . equals ( head . getRight ( ) ) ) ,
2005-06-20 06:18:00 +00:00
ttraits ) ) . hasSimpleError ( ) ) {
2003-08-10 19:30:07 +00:00
for ( int i = 1 ; i < howMuch ; i + + ) {
// try giving i characters back if that leaves us with
// a legal head and makes the rest free of simple
// errors.
TPairList newTail = null ;
TPair newHead ;
2005-02-21 01:16:10 +00:00
if ( ( newHead = head . minusNRightmostTransliterationCharacters ( i ) ) . isLegal ( )
2003-08-10 19:30:07 +00:00
& & ! ( newTail
2005-02-21 01:16:10 +00:00
= breakHelperACIP ( acipBuf . substring ( howMuch - i ) ,
tickIsVowel ,
weHaveSeenVowelAlready
| | ( newHead . getRight ( ) ! = null
& & ! " + " . equals ( newHead . getRight ( ) )
& & ! " - " . equals ( newHead . getRight ( ) ) ) ,
2005-06-20 06:18:00 +00:00
ttraits ) ) . hasSimpleError ( ) ) {
2005-02-21 01:16:10 +00:00
newTail . prepend ( newHead ) ;
return newTail ;
}
}
// It didn't work. Return the first thing we'd thought
// of: head appended with tail. (I.e., fall through.)
}
tail . prepend ( head ) ;
return tail ;
}
2005-07-06 02:19:38 +00:00
/** When true, breakEWTSIntoChunks prints each of its intermediate
 *  pair lists to System.out. */
private static final boolean debug = false ;
2005-06-20 06:18:00 +00:00
/** See {@link TTraits#breakTshegBarIntoChunks}. */
static TPairList [ ] breakEWTSIntoChunks ( String ewts )
throws IllegalArgumentException
{
EWTSTraits traits = EWTSTraits . instance ( ) ;
TPairList pl = breakHelperEWTS ( ewts , traits ) ;
2005-07-06 02:19:38 +00:00
if ( debug ) System . out . println ( " breakEWTSIntoChunks: pl is " + pl ) ;
2005-06-20 06:18:00 +00:00
TPairList npl = pl ;
// TODO(DLC)[EWTS->Tibetan]: this crap ain't workin' for kaHM. But kaeM and kaMe shouldn't work, right? Figure out what EWTS really says...
// TODO(DLC)[EWTS->Tibetan]: for "a\\0f86" e.g.:
if ( pl . size ( ) > 1 ) {
npl = new TPairList ( traits , pl . size ( ) ) ;
for ( int i = pl . size ( ) - 1 ; i > = 1 ; i - - ) {
TPair left = pl . get ( i - 1 ) ;
TPair right = pl . get ( i ) ;
if ( traits . aVowel ( ) . equals ( left . getRight ( ) )
& & left . getLeft ( ) = = null
& & right . getLeft ( ) = = null
& & traits . isWowelThatRequiresAChen ( right . getRight ( ) ) ) {
npl . prepend ( new TPair ( traits , traits . aVowel ( ) , right . getRight ( ) ) ) ;
- - i ;
} else if ( traits . aVowel ( ) . equals ( left . getRight ( ) )
& & left . getLeft ( ) ! = null
& & right . getLeft ( ) = = null
& & traits . isWowelThatRequiresAChen ( right . getRight ( ) )
& & false /* TODO(DLC)[EWTS->Tibetan]: ewts kaM is bothersome now */ ) {
npl . prepend ( new TPair ( traits , left . getLeft ( ) , right . getRight ( ) ) ) ;
- - i ;
} else {
npl . prepend ( right ) ;
if ( i = = 1 )
npl . prepend ( left ) ;
}
}
}
2005-07-06 02:19:38 +00:00
pl = null ;
if ( debug ) System . out . println ( " breakEWTSIntoChunks: npl is " + npl ) ;
2005-06-20 06:18:00 +00:00
TPairList nnpl ;
if ( true ) {
2005-07-06 02:19:38 +00:00
// TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it!
2005-06-20 06:18:00 +00:00
// Collapse ( . wowel1) ( . wowel2) into (
// . wowel1+wowel2). Then collapse (* . a) ( . x) into (*
// . x). Also, if an a-chen (\u0f68) is implied, then
// insert it.
2005-07-06 02:19:38 +00:00
TPairList xnnpl = new TPairList ( traits , npl . size ( ) ) ;
2005-06-20 06:18:00 +00:00
for ( int i = 0 ; i < npl . size ( ) ; ) {
TPair p = npl . get ( i ) ;
int set_i_to = i + 1 ;
if ( p . getLeft ( ) = = null
& & p . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p . getRight ( ) )
& & ! " + " . equals ( p . getRight ( ) ) ) {
StringBuffer sb = new StringBuffer ( p . getRight ( ) ) ;
for ( int j = i + 1 ; j < npl . size ( ) ; j + + ) {
TPair p2 = npl . get ( j ) ;
if ( p2 . getLeft ( ) = = null
& & p2 . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p2 . getRight ( ) )
& & ! " + " . equals ( p2 . getRight ( ) ) )
{
sb . append ( " + " + p2 . getRight ( ) ) ;
set_i_to = j + 1 ;
} else {
break ;
}
}
p = new TPair ( traits , traits . aVowel ( ) , sb . toString ( ) ) ;
}
// TODO(DLC)[EWTS->Tibetan]: Do we still have "ai" converting to the wrong thing. "ae"?
xnnpl . append ( p ) ;
i = set_i_to ;
}
2005-07-06 02:19:38 +00:00
nnpl = new TPairList ( traits , xnnpl . size ( ) ) ;
2005-06-20 06:18:00 +00:00
// (* . a ) ( . x) ... ( . y) -> (* . a+x+...+y)
for ( int i = 0 ; i < xnnpl . size ( ) ; ) {
TPair p = xnnpl . get ( i ) ;
int set_i_to = i + 1 ;
if ( traits . aVowel ( ) . equals ( p . getRight ( ) ) ) {
StringBuffer sb = new StringBuffer ( p . getRight ( ) ) ;
for ( int j = i + 1 ; j < xnnpl . size ( ) ; j + + ) {
TPair p2 = xnnpl . get ( j ) ;
if ( p2 . getLeft ( ) = = null
& & p2 . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p2 . getRight ( ) )
& & ! " + " . equals ( p2 . getRight ( ) ) )
{
// TODO(DLC)[EWTS->Tibetan] a+o+e is what we'll get.. maybe we want just o+e?
sb . append ( " + " + p2 . getRight ( ) ) ;
set_i_to = j + 1 ;
} else {
break ;
}
}
p = new TPair ( traits , p . getLeft ( ) , sb . toString ( ) ) ;
}
if ( false ) { // TODO(DLC)[EWTS->Tibetan]: bra is screwed up, do in it stacklist?
// EWTS does not think that kra is k+ra. Replace
// (consonant . ) with (consonant . DISAMBIGUATOR):
if ( p . getRight ( ) = = null & & p . getLeft ( ) ! = null
& & i + 1 < xnnpl . size ( ) )
p = new TPair ( traits , p . getLeft ( ) , traits . disambiguator ( ) ) ;
}
nnpl . append ( p ) ;
i = set_i_to ;
}
} else {
// TODO(DLC)[EWTS->Tibetan]: this block is not executing. kill it after testing and thinking
2005-07-06 02:19:38 +00:00
nnpl = new TPairList ( traits , npl . size ( ) ) ;
2005-06-20 06:18:00 +00:00
for ( int i = npl . size ( ) - 1 ; i > = 0 ; i - - ) {
TPair p = npl . get ( i ) ;
if ( p . getLeft ( ) = = null
& & p . getRight ( ) ! = null
& & ! traits . disambiguator ( ) . equals ( p . getRight ( ) )
& & ! " + " . equals ( p . getRight ( ) ) ) /* TODO(DLC)[EWTS->Tibetan] this should be equivalent to isWowel(p.getRight()) but o+o shows that's not true yet */
p = new TPair ( traits , traits . aVowel ( ) , p . getRight ( ) ) ;
// TODO(DLC)[EWTS->Tibetan]: do you still have "ai" converting to the wrong thing? ("ae" also?)
nnpl . prepend ( p ) ;
}
}
2005-07-06 02:19:38 +00:00
npl = null ;
if ( debug ) System . out . println ( " breakEWTSIntoChunks: nnpl is " + nnpl ) ;
TPairList nnnpl = transformNativeStacks ( traits , nnpl ) ;
if ( debug ) System . out . println ( " breakEWTSIntoChunks: nnnpl is " + nnnpl ) ;
2005-06-20 06:18:00 +00:00
return new TPairList [ ] {
2005-07-06 02:19:38 +00:00
nnnpl , null
2005-06-20 06:18:00 +00:00
} ;
}
2005-07-06 02:19:38 +00:00
/ * * EWTS helper function that transforms native stacks to include
* pluses : [ ( ph . ) ( y . ) ( w . * ) ] - > [ ( ph . + ) ( y . + ) ( w
2005-07-06 07:46:21 +00:00
* . * ) ] , e . g . The tricky case is something like [ brgyad ] or
* [ brjod ] because b + r is a native stack and so is r + g + y ( and in
* fact r + g + y accepts a bao prefix ) . It ' s not quite safe to
* always grab the rightmost native stack from a stretch , as
* [ drwa ] proves . You must grab the longest , rightmost stack .
* In most cases , either way you did it it ' d be illegal . In the
* rest , the only way it can be legal is if there ' s a prefix and
* the rightmost stack .
2005-07-06 02:19:38 +00:00
* @param traits must mesh with orig * /
private static TPairList transformNativeStacks ( TTraits traits ,
TPairList orig ) {
// TODO(DLC)[EWTS->Tibetan]: instead of using
// TibetanMachineWeb's knowledge of the hash keys in tibwn.ini
// (ph-y-w is a hash key, e.g.), we assume that 3 is the
// maximum size of a native stack.
final int maxNativeStackSize = 3 ;
// [(s . *)] alone doesn't need transformation. [(s . )
// (k . *)] does:
final int minNativeStackSize = 2 ;
TPairList result = new TPairList ( traits , orig . size ( ) ) ;
for ( int i = 0 ; i < orig . size ( ) ;
) { // we increment i inside the loop
// If, upon looking ahead, we see a native stack of
// size 3, we transform three pairs. Failing that, if
// we see a native stack of size 2, we transform it.
boolean found_something = false ;
2005-07-06 07:46:21 +00:00
TPair p [ ]
= new TPair [ maxNativeStackSize + 1 ] ; // plus one for [brgyad]
for ( int j = 0 ; j < maxNativeStackSize + 1 ; j + + ) {
2005-07-06 02:19:38 +00:00
if ( i + j < orig . size ( ) )
p [ j ] = orig . get ( i + j ) ;
else
p [ j ] = null ;
}
// Now p[0] is current pair, p[1] is the one after that, etc.
2005-07-06 07:46:21 +00:00
if ( null ! = p [ 0 ] . getLeft ( )
& & null = = p [ 0 ] . getRight ( ) ) {
// TODO(dchandler): The way I do this [drwa] case,
// does it rely on the fact that maxNativeStackSize ==
// 3? Let's have it not rely on that...
int h ;
if ( 0 = = ( h = helper ( traits , 0 , maxNativeStackSize , p , result ) ) ) { // [drwa]
// [brgyad] makes us go from right to left.
// (TODO(dchandler): It's a shame we're doing this
// stuff when we have the code to figure out, for
// ACIP, that [BRGYAD] is what it is.)
for ( int offset = 1 ; offset > = 0 ; offset - - ) {
if ( found_something ) break ;
for ( int nss = maxNativeStackSize ;
nss > = minNativeStackSize ;
nss - - ) {
if ( 0 ! = ( h = helper ( traits , offset , nss , p , result ) ) ) {
found_something = true ;
i + = h ;
break ;
}
}
2005-07-06 02:19:38 +00:00
}
2005-07-06 07:46:21 +00:00
} else {
i + = h ;
2005-07-06 02:19:38 +00:00
found_something = true ;
}
}
if ( ! found_something ) {
+ + i ;
result . append ( p [ 0 ] ) ;
}
}
if ( result . size ( ) ! = orig . size ( ) ) {
throw new Error ( " orig= " + orig + " \ nresult= " + result ) ; // TODO(dchandler): make this an assertion.
}
return result ;
}
2005-07-06 07:46:21 +00:00
/ * * We mutate result and return the number of TPairs we scarfed if
* we find a native stack of size nss at p [ offset ] , p [ offset +
* 1 ] , . . . , p [ offset + nss - 1 ] . * /
private static int helper ( TTraits traits , int offset , int nss , TPair p [ ] ,
TPairList result ) {
String hashKey = " " ;
int good = 0 ;
for ( int k = 0 ; k < nss - 1 ; k + + ) {
if ( null ! = p [ k + offset ]
& & null ! = p [ k + offset ] . getLeft ( )
& & null = = p [ k + offset ] . getRight ( ) ) {
hashKey + = p [ k + offset ] . getLeft ( ) + " - " ;
+ + good ;
}
}
if ( null ! = p [ nss - 1 + offset ]
& & null ! = p [ nss - 1 + offset ] . getLeft ( )
& & ! " + " . equals ( p [ nss - 1 + offset ] . getRight ( ) ) ) {
hashKey + = p [ nss - 1 + offset ] . getLeft ( ) ;
+ + good ;
}
if ( nss = = good
& & TibetanMachineWeb . isKnownHashKey ( hashKey ) ) {
int i = 0 ;
if ( 1 = = offset ) {
+ + i ;
result . append ( p [ 0 ] ) ;
}
for ( int n = 0 ; n < nss - 1 ; n + + ) {
+ + i ;
result . append ( new TPair ( traits ,
p [ n + offset ] . getLeft ( ) ,
" + " ) ) ;
}
+ + i ;
result . append ( p [ nss - 1 + offset ] ) ;
return i ;
}
return 0 ;
}
Numerous EWTS->Unicode and especially EWTS->TMW improvements.
Fixed ordering of Unicode wowels. [ku+A] gives the correct Unicode
now, e.g.
EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A].
EWTS->TMW should now give errors any time the full input isn't used.
Previously, wacky wowels like [kai+-i] would lead to some droppage.
EWTS->TMW->Unicode testing is now in effect. This found a ton of
EWTS->TMW bugs, most or all of which are fixed now.
TMW->Unicode is improved/fixed for {
\u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is
discouraged in favor of "\u0f71\u0f74".)
NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly
builds' sake, but I ran it in my sandbox and it passed.
2005-07-11 02:51:06 +00:00
/ * * Returns a TPair just like tp ( sometimes the very same ,
* unchanged instance ) except that the wowel , if present , is in
* the order that Section 9 . 11 of the Unicode Standard , version
* 4 . 0 . 1 , would have us use . * /
private static TPair ewtsSortWowels ( TPair tp ) {
if ( tp . getRight ( ) ! = null
& & tp . getRight ( ) . length ( ) > 0
& & ! " + " . equals ( tp . getRight ( ) ) ) {
class WowelComparator implements Comparator {
/ * * @see
* org . thdl . tib . text . tshegbar . UnicodeUtils # fixSomeOrderingErrorsInTibetanUnicode ( StringBuffer ) * /
private List order = Arrays . asList ( new String [ ] {
// equivalence class:
" \ u0f39 " , THDLWylieConstants . WYLIE_TSA_PHRU ,
// equivalence class:
THDLWylieConstants . WYLIE_aVOWEL ,
// equivalence class:
" \ u0f71 " , THDLWylieConstants . A_VOWEL ,
" \ u0f73 " , THDLWylieConstants . I_VOWEL , // TODO(dchandler): in a perfect world, we'd decompose and sort the components.
" \ u0f75 " , THDLWylieConstants . U_VOWEL , // TODO(dchandler): in a perfect world, we'd decompose and sort the components.
" \ u0f81 " , THDLWylieConstants . reverse_I_VOWEL , // TODO(dchandler): in a perfect world, we'd decompose and sort the components.
" \ u0f74 " , THDLWylieConstants . u_VOWEL ,
2005-08-01 05:54:20 +00:00
// TODO(dchandler): equivalence classes I'm not
// sure.
// http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml
// says to go above base and then upwards. Think
// it over.
Numerous EWTS->Unicode and especially EWTS->TMW improvements.
Fixed ordering of Unicode wowels. [ku+A] gives the correct Unicode
now, e.g.
EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A].
EWTS->TMW should now give errors any time the full input isn't used.
Previously, wacky wowels like [kai+-i] would lead to some droppage.
EWTS->TMW->Unicode testing is now in effect. This found a ton of
EWTS->TMW bugs, most or all of which are fixed now.
TMW->Unicode is improved/fixed for {
\u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is
discouraged in favor of "\u0f71\u0f74".)
NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly
builds' sake, but I ran it in my sandbox and it passed.
2005-07-11 02:51:06 +00:00
// equivalence class:
" \ u0f72 " , THDLWylieConstants . i_VOWEL ,
" \ u0f7a " , THDLWylieConstants . e_VOWEL ,
" \ u0f7b " , THDLWylieConstants . ai_VOWEL ,
" \ u0f7c " , THDLWylieConstants . o_VOWEL ,
" \ u0f7d " , THDLWylieConstants . au_VOWEL ,
" \ u0f80 " , THDLWylieConstants . reverse_i_VOWEL ,
// equivalence class:
" \ u0f7e " , THDLWylieConstants . BINDU ,
" \ u0f82 " , THDLWylieConstants . U0F82 ,
" \ u0f83 " , THDLWylieConstants . U0F83 ,
" \ u0f86 " , THDLWylieConstants . U0F86 ,
" \ u0f87 " , THDLWylieConstants . U0F87 ,
// NOTE: we always say "e" comes before "o" but
// either order would work.
/ * TODO ( dchandler ) : should these go with other
* under - line wowels like \ u0f74 ? They ' re for the
* whole tsheg - bar , so they ' re oddballs . . .
*
* bestEwtsMap . put ( " \ u0f35 " , THDLWylieConstants . U0F35 ) ;
*
* bestEwtsMap . put ( " \ u0f37 " , THDLWylieConstants . U0F37 ) ;
*
* bestEwtsMap . put ( " \ u0f84 " , THDLWylieConstants . U0F84 ) ;
*
* bestEwtsMap . put ( " \ u0fc6 " , THDLWylieConstants . U0FC6 ) ;
* /
} ) ;
public int compare ( Object o1 , Object o2 ) {
int i1 = order . indexOf ( o1 ) ;
int i2 = order . indexOf ( o2 ) ;
if ( i1 < 0 ) i1 = order . size ( ) ;
if ( i2 < 0 ) i2 = order . size ( ) ;
return i1 - i2 ;
}
}
String wowels [ ] = tp . getRight ( ) . split ( " \\ + " ) ;
java . util . Arrays . sort ( wowels , new WowelComparator ( ) ) ;
StringBuffer sb = new StringBuffer ( ) ;
for ( int i = 0 ; i < wowels . length ; i + + ) {
sb . append ( wowels [ i ] ) ;
if ( i + 1 < wowels . length )
sb . append ( '+' ) ;
}
return new TPair ( tp . getTraits ( ) , tp . getLeft ( ) , sb . toString ( ) ) ;
} else {
return tp ;
}
}
2005-02-21 01:16:10 +00:00
// TODO(DLC)[EWTS->Tibetan]: doc
2005-02-22 04:36:54 +00:00
private static TPairList breakHelperEWTS ( String ewts , TTraits ttraits ) {
2005-02-21 01:16:10 +00:00
// base case for our recursion:
if ( " " . equals ( ewts ) )
2005-02-22 04:36:54 +00:00
return new TPairList ( ttraits ) ;
2005-02-21 01:16:10 +00:00
StringBuffer ewtsBuf = new StringBuffer ( ewts ) ;
int howMuchBuf [ ] = new int [ 1 ] ;
Numerous EWTS->Unicode and especially EWTS->TMW improvements.
Fixed ordering of Unicode wowels. [ku+A] gives the correct Unicode
now, e.g.
EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A].
EWTS->TMW should now give errors any time the full input isn't used.
Previously, wacky wowels like [kai+-i] would lead to some droppage.
EWTS->TMW->Unicode testing is now in effect. This found a ton of
EWTS->TMW bugs, most or all of which are fixed now.
TMW->Unicode is improved/fixed for {
\u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is
discouraged in favor of "\u0f71\u0f74".)
NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly
builds' sake, but I ran it in my sandbox and it passed.
2005-07-11 02:51:06 +00:00
TPair head = ewtsSortWowels ( getFirstConsonantAndVowel ( ewtsBuf ,
howMuchBuf ,
ttraits ) ) ;
2005-02-21 01:16:10 +00:00
int howMuch = howMuchBuf [ 0 ] ;
TPairList tail ;
if ( ( tail = breakHelperEWTS ( ewtsBuf . substring ( howMuch ) ,
2005-06-20 06:18:00 +00:00
ttraits ) ) . hasSimpleError ( ) ) {
2005-02-21 01:16:10 +00:00
for ( int i = 1 ; i < howMuch ; i + + ) {
// try giving i characters back if that leaves us with
// a legal head and makes the rest free of simple
// errors.
TPairList newTail = null ;
TPair newHead ;
if ( ( newHead = head . minusNRightmostTransliterationCharacters ( i ) ) . isLegal ( )
& & ! ( newTail
2005-06-20 06:18:00 +00:00
= breakHelperEWTS ( ewtsBuf . substring ( howMuch - i ) , ttraits ) ) . hasSimpleError ( ) ) {
2003-08-10 19:30:07 +00:00
newTail . prepend ( newHead ) ;
return newTail ;
}
}
// It didn't work. Return the first thing we'd thought
// of: head appended with tail. (I.e., fall through.)
}
tail . prepend ( head ) ;
return tail ;
}
2005-06-20 06:18:00 +00:00
private static String GetInitialVowel ( TTraits ttraits , String tx ,
String startOfVowel ) {
if ( null = = startOfVowel ) startOfVowel = " " ;
boolean startsWithPlus = false ;
if ( ! " " . equals ( startOfVowel )
& & ( ! ttraits . vowelsMayStack ( )
| | ( tx . length ( ) < 1 | | ! ( startsWithPlus = tx . substring ( 0 , 1 ) . equals ( " + " ) ) ) ) )
return ( " " . equals ( startOfVowel ) ? null : startOfVowel ) ;
if ( startsWithPlus )
tx = tx . substring ( 1 ) ;
for ( int i = Math . min ( ttraits . maxWowelLength ( ) , tx . length ( ) ) ; i > = 1 ; i - - ) {
String t = tx . substring ( 0 , i ) ;
if ( ttraits . isWowel ( t )
| | ( ttraits . isACIP ( )
// Or these, which we massage into "Am", "Am:", and
// "A:" because I didn't think {Pm} should be treated
// like {PAm} originally:
// TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE
& & ( " m " . equals ( t ) | | " m: " . equals ( t ) | | " : " . equals ( t ) ) ) ) {
// If this is followed by +wowel[+wowel[+wowel... in EWTS then that's part of the vowel also:
return GetInitialVowel ( ttraits ,
tx . substring ( i ) ,
startOfVowel + ( startsWithPlus ? " + " : " " ) + t ) ;
}
}
return null ;
}
/ * * Returns the largest TPair we can make from the transliteration
* starting from the left . This will return a size zero pair if
* and only if tx is the empty string ; otherwise , it may return a
* pair with either the left or right component empty . [ FOR
* ACIP : ] This mutates tx when we run into { NA + YA } ; it mutates tx
* into { N + YA } . For { NE + YA } , it does not mutate tx or behave
* intelligently . A later phase will need to turn that into
* { N + YE } or an error or whatever you like . howMuch [ 0 ] will be
* set to the number of characters of tx that this call has
* consumed . * /
Numerous EWTS->Unicode and especially EWTS->TMW improvements.
Fixed ordering of Unicode wowels. [ku+A] gives the correct Unicode
now, e.g.
EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A].
EWTS->TMW should now give errors any time the full input isn't used.
Previously, wacky wowels like [kai+-i] would lead to some droppage.
EWTS->TMW->Unicode testing is now in effect. This found a ton of
EWTS->TMW bugs, most or all of which are fixed now.
TMW->Unicode is improved/fixed for {
\u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is
discouraged in favor of "\u0f71\u0f74".)
NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly
builds' sake, but I ran it in my sandbox and it passed.
2005-07-11 02:51:06 +00:00
private static TPair getFirstConsonantAndVowel ( StringBuffer tx ,
2005-02-21 01:16:10 +00:00
int howMuch [ ] ,
TTraits ttraits ) {
2005-06-20 06:18:00 +00:00
// To handle EWTS "phywa\\u0f84\u0f86" [yes that's two slashes
// and then one slash], for example, we need to make the wowel
// (the getRight() field of the returned TPair) contain
// everything that it should.
//
// It can't hurt in ACIP, though I don't recall if ACIP's lexer
// allows Unicode characters.
TPair og = helpGetFirstConsonantAndVowel ( tx , howMuch , ttraits ) ;
int len = tx . length ( ) ;
StringBuffer x = null ;
while ( howMuch [ 0 ] < len ) {
if ( isUnicodeWowelChar ( tx . charAt ( howMuch [ 0 ] ) ) ) {
if ( null = = x ) x = new StringBuffer ( ) ; // rarely happens
if ( x . length ( ) > 0 ) x . append ( '+' ) ;
x . append ( tx . charAt ( howMuch [ 0 ] + + ) ) ;
} else {
break ;
}
}
// In EWTS, deal with M, ~M`, etc. They're much like
// UnicodeWowelCharacters.
if ( ttraits instanceof EWTSTraits ) {
EWTSTraits tt = ( EWTSTraits ) ttraits ;
while ( howMuch [ 0 ] < len ) {
int howMuchExtra [ ] = new int [ ] { 0 } ;
TPair p
= helpGetFirstConsonantAndVowel ( new StringBuffer ( tx . substring ( howMuch [ 0 ] ) ) ,
howMuchExtra ,
ttraits ) ;
if ( p . getLeft ( ) = = null
& & p . getRight ( ) ! = null
& & tt . isWowelThatRequiresAChen ( p . getRight ( ) ) ) {
if ( null = = x ) x = new StringBuffer ( ) ; // rarely happens
String extra ;
if ( x . length ( ) > 0 ) x . append ( '+' ) ;
x . append ( extra = tx . substring ( howMuch [ 0 ] , howMuch [ 0 ] + howMuchExtra [ 0 ] ) ) ;
// System.out.println("extra is " + extra); TODO(DLC)[EWTS->Tibetan]
howMuch [ 0 ] + = howMuchExtra [ 0 ] ;
} else {
break ;
}
}
}
if ( null ! = x )
return new TPair ( ttraits , og . getLeft ( ) ,
( null = = og . getRight ( ) | | ttraits . aVowel ( ) . equals ( og . getRight ( ) ) )
? x . toString ( )
: ( og . getRight ( ) + " + " + x . toString ( ) ) ) ;
else
return og ;
}
private static TPair helpGetFirstConsonantAndVowel ( StringBuffer tx , // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it?
int howMuch [ ] ,
TTraits ttraits ) {
// Note that it is *not* the case that if tx.substring(0, N)
2003-08-10 19:30:07 +00:00
// is legal (according to TPair.isLegal()), then
2005-06-20 06:18:00 +00:00
// tx.substring(0, N-1) is legal for all N. For example,
2005-02-21 01:16:10 +00:00
// think of ACIP's {shA} and {KshA}. However, 's' is the only
2005-06-20 06:18:00 +00:00
// tricky fellow in ACIP, so in ACIP it is true that
// tx.substring(0, N-1) is either legal or ends with 's' if
// tx.substring(0, N) is legal.
2003-08-10 19:30:07 +00:00
//
// We don't, however, use this approach. We just try to find
// a consonant of length 3, and then, failing that, of length
// 2, etc. Likewise with vowels. This avoids the issue.
2005-06-20 06:18:00 +00:00
int i , xl = tx . length ( ) ;
// TODO(DLC)[EWTS->Tibetan]: nasty special case!
if ( false & & ! ttraits . isACIP ( ) /* TODO(DLC)[EWTS->Tibetan]: isEWTS! */
& & xl > = 2 & & tx . charAt ( 0 ) = = 'a' & & ( tx . charAt ( 1 ) = = 'i' | | tx . charAt ( 1 ) = = 'u' ) ) {
howMuch [ 0 ] = 2 ;
return new TPair ( ttraits , null , tx . substring ( 0 , 2 ) ) ;
// TODO(DLC)[EWTS->Tibetan]: test that "au" alone is \u0f68\u0f7d, "ai" alone is \u0f68\u0f7b in EWTS.
}
2003-08-10 19:30:07 +00:00
if ( 0 = = xl ) {
howMuch [ 0 ] = 0 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , null , null ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
if ( tx . charAt ( 0 ) = = ttraits . disambiguatorChar ( ) ) {
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = 1 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , null , ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
char ch = tx . charAt ( 0 ) ;
2003-08-10 19:30:07 +00:00
// Numbers never appear in stacks, so if you see 1234, that's
2005-06-20 06:18:00 +00:00
// like seeing 1-2-3-4. Though in EWTS you can have '0\u0f19'
2003-08-10 19:30:07 +00:00
if ( ch > = '0' & & ch < = '9' ) {
2005-06-20 06:18:00 +00:00
// TODO(DLC)[EWTS->Tibetan]: test case: 0e should have a-chen and 0\u0f74 should go through without errors.
if ( xl > 1 & & ttraits . isUnicodeWowel ( tx . charAt ( 1 ) ) ) {
howMuch [ 0 ] = 2 ;
return new TPair ( ttraits , tx . substring ( 0 , 1 ) , tx . substring ( 1 , 2 ) ) ;
}
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = 1 ; // not 2...
2005-06-20 06:18:00 +00:00
return new TPair ( ttraits , tx . substring ( 0 , 1 ) , ( xl = = 1 ) ? null : ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
String l = null , r = null ;
2005-02-21 01:16:10 +00:00
for ( i = Math . min ( ttraits . maxConsonantLength ( ) , xl ) ; i > = 1 ; i - - ) {
2003-08-10 19:30:07 +00:00
String t = null ;
2005-06-20 06:18:00 +00:00
if ( ttraits . isConsonant ( t = tx . substring ( 0 , i ) )
| | ( ttraits . vowelAloneImpliesAChen ( ) // handle EWTS {a+yo}
& & ttraits . aVowel ( ) . equals ( tx . substring ( 0 , i ) )
& & i < xl & & tx . substring ( i , i + i ) . equals ( " + " ) ) ) {
2003-08-10 19:30:07 +00:00
l = t ;
break ;
}
}
int ll = ( null = = l ) ? 0 : l . length ( ) ;
2005-06-20 06:18:00 +00:00
if ( null ! = l & & xl > ll & & tx . charAt ( ll ) = = ttraits . disambiguatorChar ( ) ) {
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = l . length ( ) + 1 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , l , ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
if ( null ! = l & & xl > ll & & tx . charAt ( ll ) = = '+' ) {
2003-08-10 19:30:07 +00:00
howMuch [ 0 ] = l . length ( ) + 1 ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , l , " + " ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
int mod = 0 ;
r = GetInitialVowel ( ttraits , tx . substring ( ll ) , null ) ;
if ( ttraits . isACIP ( ) ) {
// Treat {BATA+SA'I} like {BAT+SA'I}: // TODO(DLC)[EWTS->Tibetan]: in EWTS???
int z ;
if ( null ! = l
& & ttraits . aVowel ( ) . equals ( r )
& & ( ( z = ll + ttraits . aVowel ( ) . length ( ) ) < xl )
& & tx . charAt ( z ) = = '+' ) {
tx . deleteCharAt ( z - 1 ) ;
howMuch [ 0 ] = l . length ( ) + 1 ;
return new TPair ( ttraits , l , " + " ) ;
2003-08-10 19:30:07 +00:00
}
2005-06-20 06:18:00 +00:00
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: in EWTS? */
if ( " m " . equals ( r ) ) { r = " Am " ; mod = - 1 ; }
if ( " : " . equals ( r ) ) { r = " A: " ; mod = - 1 ; }
if ( " m: " . equals ( r ) ) { r = " Am: " ; mod = - 1 ; }
if ( " :m " . equals ( r ) ) { r = " A:m " ; mod = - 1 ; } // not seen, though...
2003-08-10 19:30:07 +00:00
}
2005-02-21 01:16:10 +00:00
// what if we see a character that's not part of any wowel or
2003-08-10 19:30:07 +00:00
// consonant? We return it.
if ( null = = l & & null = = r ) {
howMuch [ 0 ] = 1 ; // not 2...
2005-02-21 01:16:10 +00:00
// add a disambiguator to avoid exponential running time:
2005-06-20 06:18:00 +00:00
return new TPair ( ttraits , tx . substring ( 0 , 1 ) ,
2005-02-21 01:16:10 +00:00
( xl = = 1 ) ? null : ttraits . disambiguator ( ) ) ;
2003-08-10 19:30:07 +00:00
}
howMuch [ 0 ] = ( ( ( l = = null ) ? 0 : l . length ( ) )
2003-11-30 02:06:48 +00:00
+ ( ( r = = null ) ? 0 : r . length ( ) )
+ mod ) ;
2005-02-22 04:36:54 +00:00
return new TPair ( ttraits , l , r ) ;
2005-02-21 01:16:10 +00:00
} // TODO(DLC)[EWTS->Tibetan]:
2005-06-20 06:18:00 +00:00
private static boolean isUnicodeWowelChar ( char ch ) {
return ( ( ch > = '\u0f71' & & ch < = '\u0f84' )
| | " \ u0f35 \ u0f37 \ u0f18 \ u0f19 \ u0f3e \ u0f3f \ u0f86 \ u0f87 \ u0fc6 " . indexOf ( ch ) > = 0 ) ;
// TODO(dchandler): should we really allow "phywa\\u0f18", or
// does \u0f18 only combine with digits?
}
2003-08-10 19:30:07 +00:00
}
2003-11-09 01:07:45 +00:00
// FIXME: test for nested comments
2003-08-10 19:30:07 +00:00
2003-11-09 01:07:45 +00:00
// FIXME: see Translit directory on ACIP v4 CD-ROM