2005-02-22 04:36:54 +00:00
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License.  You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied.  See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL).  Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt ;
2005-06-20 06:18:00 +00:00
import java.math.BigInteger ;
2005-02-22 04:36:54 +00:00
import java.util.ArrayList ;
/ * *
* This singleton class is able to break up Strings of EWTS text ( for
* example , an entire sutra file ) into tsheg bars , comments , etc .
* Non - Tibetan parts are segregated ( so that consumers can ensure that
* they remain non - Tibetan ) , and Tibetan passages are broken up into
* tsheg bars .
*
* This is not public because you should use { @link EWTSTraits # scanner ( ) } .
*
* @author David Chandler * /
class EWTSTshegBarScanner extends TTshegBarScanner {
2005-06-20 06:18:00 +00:00
/** Returns true iff ch can appear within an EWTS tsheg bar. */
protected static boolean isValidInsideTshegBar ( char ch ) {
// '\\' is absent, but should it be? TODO(DLC)[EWTS->Tibetan]
return ( ( ch > = '0' & & ch < = '9' )
| | ( ch > = '\u0f71' & & ch < = '\u0f84' )
| | EWTSTraits . instance ( ) . isUnicodeConsonant ( ch )
| | EWTSTraits . instance ( ) . isUnicodeWowel ( ch )
| | ( ch > = '\u0f20' & & ch < = '\u0f33' )
| | " khgncjytdpbmtstdzwzz'rlafvTDNSWYReuioIAUMHX?^ \ u0f39 \ u0f35 \ u0f37.+~'`- \ u0f19 \ u0f18 \ u0f3f \ u0f3e \ u0f86 \ u0f87 \ u0f88 " . indexOf ( ch ) > = 0 ) ;
}
2005-02-22 04:36:54 +00:00
/ * * See the comment in TTshegBarScanner . This does not find
2005-06-20 06:18:00 +00:00
errors and warnings that you ' d think of a parser finding ( TODO ( DLC ) [ EWTS - > Tibetan ] :
2005-02-22 04:36:54 +00:00
DOES IT ? ) . * /
2005-06-20 06:18:00 +00:00
public ArrayList scan ( String s , StringBuffer errors , int maxErrors , // TODO(DLC)[EWTS->Tibetan]: ignored
2005-02-22 04:36:54 +00:00
boolean shortMessages , String warningLevel ) {
// the size depends on whether it's mostly Tibetan or mostly
// Latin and a number of other factors. This is meant to be
// an underestimate, but not too much of an underestimate.
ArrayList al = new ArrayList ( s . length ( ) / 10 ) ;
2005-06-20 06:18:00 +00:00
// TODO(DLC)[EWTS->Tibetan]: use jflex, javacc or something similar
// TODO(DLC)[EWTS->Tibetan]: what about Unicode escapes like \u0f20? When do you do that? Immediately like Java source files? I think so and then we can say that oddballs like \u0f19 are valid within tsheg bars.
StringBuffer sb = new StringBuffer ( s ) ;
ExpandEscapeSequences ( sb ) ;
int sl = sb . length ( ) ;
for ( int i = 0 ; i < sl ; i + + ) {
if ( isValidInsideTshegBar ( sb . charAt ( i ) ) ) {
StringBuffer tbsb = new StringBuffer ( ) ;
for ( ; i < sl ; i + + ) {
if ( isValidInsideTshegBar ( sb . charAt ( i ) ) )
tbsb . append ( sb . charAt ( i ) ) ;
else {
- - i ;
break ;
}
}
al . add ( new TString ( " EWTS " , tbsb . toString ( ) ,
TString . TIBETAN_NON_PUNCTUATION ) ) ;
} else {
if ( " /;|!:=_@#$%<>() \ r \ n \ t " . indexOf ( sb . charAt ( i ) ) > = 0 )
al . add ( new TString ( " EWTS " , sb . substring ( i , i + 1 ) ,
TString . TIBETAN_PUNCTUATION ) ) ;
else
al . add ( new TString ( " EWTS " , " ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb . substring ( i , i + 1 ) ,
TString . ERROR ) ) ;
}
}
return al ;
}
/ * * Modifies the EWTS in sb such that Unicode escape sequences are
* expanded . * /
public static void ExpandEscapeSequences ( StringBuffer sb ) {
int sl ;
for ( int i = 0 ; i < ( sl = sb . length ( ) ) ; i + + ) {
if ( i + " \\ u00000000 " . length ( ) < = sl ) {
if ( sb . charAt ( i ) = = '\\' & & sb . charAt ( i + 1 ) = = 'u' | | sb . charAt ( i + 1 ) = = 'U' ) {
boolean isEscape = true ;
for ( int j = 0 ; j < " 00000000 " . length ( ) ; j + + ) {
char ch = sb . charAt ( i + " \\ u " . length ( ) + j ) ;
if ( ! ( ( ch < = '9' & & ch > = '0' )
| | ( ch < = 'F' & & ch > = 'A' )
| | ( ch < = 'f' & & ch > = 'a' ) ) ) {
isEscape = false ;
break ;
}
}
if ( isEscape ) {
long x = - 1 ;
try {
BigInteger bigx = new java . math . BigInteger ( sb . substring ( i + 2 , i + 10 ) , 16 ) ;
x = bigx . longValue ( ) ;
if ( ! ( bigx . compareTo ( new BigInteger ( " 0 " , 16 ) ) > = 0
& & bigx . compareTo ( new BigInteger ( " FFFFFFFF " , 16 ) ) < = 0 ) )
x = - 1 ;
} catch ( NumberFormatException e ) {
// leave x == -1
}
if ( x > = 0 & & x < = 0xFFFF ) {
sb . replace ( i , i + " \\ uXXXXyyyy " . length ( ) , new String ( new char [ ] { ( char ) x } ) ) ;
continue ;
} else if ( x > = 0x00000000L
& & x < = 0xFFFFFFFFL ) {
// TODO(DLC)[EWTS->Tibetan]: do nothing? test errors al.add(new TString("EWTS", "Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.",
//TString.ERROR));
i + = " uXXXXYYYY " . length ( ) ;
continue ;
}
}
}
}
if ( i + " \\ u0000 " . length ( ) < = sl ) {
if ( sb . charAt ( i ) = = '\\' & & sb . charAt ( i + 1 ) = = 'u' | | sb . charAt ( i + 1 ) = = 'U' ) {
boolean isEscape = true ;
for ( int j = 0 ; j < " 0000 " . length ( ) ; j + + ) {
char ch = sb . charAt ( i + " \\ u " . length ( ) + j ) ;
if ( ! ( ( ch < = '9' & & ch > = '0' )
| | ( ch < = 'F' & & ch > = 'A' )
| | ( ch < = 'f' & & ch > = 'a' ) ) ) {
isEscape = false ;
break ;
}
}
if ( isEscape ) {
int x = - 1 ;
try {
if ( ! ( ( x = Integer . parseInt ( sb . substring ( i + 2 , i + 6 ) , 16 ) ) > = 0x0000
& & x < = 0xFFFF ) )
x = - 1 ;
} catch ( NumberFormatException e ) {
// leave x == -1
}
if ( x > = 0 ) {
sb . replace ( i , i + " \\ uXXXX " . length ( ) , new String ( new char [ ] { ( char ) x } ) ) ;
continue ;
}
}
}
}
}
2005-02-22 04:36:54 +00:00
}
/** non-public because this is a singleton */
protected EWTSTshegBarScanner ( ) { }
private static EWTSTshegBarScanner singleton = null ;
/** Returns the sole instance of this class. */
public synchronized static EWTSTshegBarScanner instance ( ) {
if ( null = = singleton ) {
singleton = new EWTSTshegBarScanner ( ) ;
}
return singleton ;
}
}