2003-08-18 02:38:54 +00:00
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003-2004 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt ;
import java.io.* ;
import java.util.ArrayList ;
import java.util.Stack ;
2003-09-05 05:54:35 +00:00
import java.awt.Color ;
2003-08-18 02:38:54 +00:00
import org.thdl.util.ThdlDebug ;
import org.thdl.util.ThdlOptions ;
2003-08-31 16:06:35 +00:00
import org.thdl.tib.text.TibetanDocument ;
import org.thdl.tib.text.TibetanMachineWeb ;
import org.thdl.tib.text.DuffCode ;
2003-08-18 02:38:54 +00:00
/**
 * This class is able to convert an ACIP file into Tibetan Machine Web
 * and an ACIP file into Unicode. ACIP-&gt;Unicode should yield the same
 * results as ACIP-&gt;TMW followed by TMW-&gt;Unicode (FIXME: test it!)
 *
 * @author David Chandler
 */
2005-02-21 01:35:23 +00:00
public class TConverter {
2003-09-06 22:56:10 +00:00
2004-04-24 17:49:16 +00:00
/ * * Command - line converter for testing only - - use
* org . thdl . tib . input . TibetanConverter for production work .
* Gives error messages on standard output about why we can ' t
* convert the document perfectly and exits with non - zero return
* code , or is silent otherwise and exits with code zero .
*
* < p > FIXME : not so efficient ; copies the whole file into memory
* first . * /
2003-08-18 02:38:54 +00:00
public static void main ( String [ ] args )
2003-08-24 06:40:53 +00:00
throws IOException
2003-08-18 02:38:54 +00:00
{
2003-09-04 04:04:21 +00:00
// We don't want to load the TM or TMW font files ourselves:
ThdlOptions . setUserPreference ( " thdl.rely.on.system.tmw.fonts " , true ) ;
ThdlOptions . setUserPreference ( " thdl.rely.on.system.tm.fonts " , true ) ;
ThdlOptions . setUserPreference ( " thdl.debug " , true ) ;
2004-04-24 17:49:16 +00:00
// Only developers should use this.
if ( ! ThdlOptions . getBooleanOption ( " thdl.debug " ) ) {
2005-02-21 01:35:23 +00:00
System . err . println ( " Use org.thdl.tib.input.TibetanConverter for production work, not TConverter. " ) ;
2004-04-24 17:49:16 +00:00
System . exit ( 1 ) ;
}
2003-08-18 02:38:54 +00:00
boolean verbose = true ;
2003-08-24 06:40:53 +00:00
if ( args . length ! = 1 ) {
System . out . println ( " Bad args! Need just the name of the ACIP text file. " ) ;
2003-08-18 02:38:54 +00:00
}
StringBuffer errors = new StringBuffer ( ) ;
2004-04-24 17:49:16 +00:00
int maxErrors = 1000 ; // FIXME: make this PER CAPITA or else large ACIP Tibetan files are not converted for fear that they are English
boolean shortMessages = false ;
2004-04-25 00:37:57 +00:00
String warningLevel = " Most " ;
ArrayList al
= ACIPTshegBarScanner . scanFile ( args [ 0 ] , errors ,
maxErrors - 1 , shortMessages ,
warningLevel ) ;
2003-08-18 02:38:54 +00:00
if ( null = = al ) {
2003-08-23 22:03:37 +00:00
System . err . println ( maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this " ) ;
2003-08-18 02:38:54 +00:00
System . err . println ( " Tibetan or English input? " ) ;
System . err . println ( " " ) ;
2003-08-23 22:03:37 +00:00
if ( false ) {
// Nobody wants to see this. FIXME: maybe somebody; have an option.
System . err . println ( " First " + maxErrors + " lexical errors scanning ACIP input file: " ) ;
System . err . println ( errors ) ;
}
System . err . println ( " Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again. " ) ;
2003-08-18 02:38:54 +00:00
System . exit ( 1 ) ;
}
2003-08-24 06:40:53 +00:00
final boolean abortUponScanningError = false ;
2003-08-18 02:38:54 +00:00
if ( errors . length ( ) > 0 ) {
System . err . println ( " Errors scanning ACIP input file: " ) ;
System . err . println ( errors ) ;
2003-08-23 22:03:37 +00:00
if ( abortUponScanningError ) {
System . err . println ( " Exiting; please fix input file and try again. " ) ;
System . exit ( 1 ) ;
}
2003-08-18 02:38:54 +00:00
}
2003-09-06 22:56:10 +00:00
boolean colors = true ;
2003-08-24 06:40:53 +00:00
StringBuffer warnings = null ;
boolean putWarningsInOutput = false ;
if ( " None " ! = warningLevel ) {
warnings = new StringBuffer ( ) ;
putWarningsInOutput = true ;
}
2003-12-14 07:41:15 +00:00
convertToTMW ( al , System . out , errors , warnings , null ,
2004-04-24 17:49:16 +00:00
putWarningsInOutput , warningLevel , shortMessages , colors ) ;
2003-08-31 16:06:35 +00:00
int retCode = 0 ;
2003-08-18 02:38:54 +00:00
if ( errors . length ( ) > 0 ) {
System . err . println ( " Errors converting ACIP input file: " ) ;
System . err . println ( errors ) ;
2003-08-23 22:03:37 +00:00
System . err . println ( " The output contains these errors. " ) ;
2003-08-18 02:38:54 +00:00
System . err . println ( " Exiting; please fix input file and try again. " ) ;
2003-08-31 16:06:35 +00:00
retCode = 2 ;
2003-08-18 02:38:54 +00:00
}
2003-08-24 06:40:53 +00:00
if ( null ! = warnings & & warnings . length ( ) > 0 ) {
2003-08-23 22:03:37 +00:00
System . err . println ( " Warnings converting ACIP input file: " ) ;
System . err . println ( warnings ) ;
if ( putWarningsInOutput )
System . err . println ( " The output contains these warnings. " ) ;
2003-08-31 16:06:35 +00:00
retCode = 2 ;
2003-08-23 22:03:37 +00:00
}
2003-08-31 16:06:35 +00:00
if ( 0 = = retCode ) {
if ( verbose ) System . err . println ( " Converted " + args [ 0 ] + " perfectly. " ) ;
}
System . exit ( retCode ) ;
2003-08-18 02:38:54 +00:00
}
/ * * Writes TMW / Latin to out . If errors occur in converting a
2003-08-31 16:06:35 +00:00
* tsheg bar , then they are written into the output , and also
* appended to errors if errors is non - null . If warnings occur
* in converting a tsheg bar , then they are written into the
* output if writeWarningsToResult is true , and also appended to
* warnings if warnings is non - null . Returns true upon perfect
* success or if there were merely warnings , false if errors
2003-08-18 02:38:54 +00:00
* occurred .
2003-09-06 22:56:10 +00:00
* @param colors true if and only if you want Sanskrit in one
* color , errors / warnings in another , and tsheg - bars affected by
* prefix rules in another
2003-08-18 02:38:54 +00:00
* @throws IOException if we cannot write to out
* /
2003-08-31 16:06:35 +00:00
public static boolean convertToTMW ( ArrayList scan ,
OutputStream out ,
StringBuffer errors ,
StringBuffer warnings ,
2003-12-14 07:41:15 +00:00
boolean [ ] hasWarnings ,
2003-08-31 16:06:35 +00:00
boolean writeWarningsToResult ,
2003-09-06 22:56:10 +00:00
String warningLevel ,
2004-04-24 17:49:16 +00:00
boolean shortMessages ,
2003-09-06 22:56:10 +00:00
boolean colors )
2003-08-31 16:06:35 +00:00
throws IOException
{
TibetanDocument tdoc = new TibetanDocument ( ) ;
boolean rv
2003-12-14 07:41:15 +00:00
= convertToTMW ( scan , tdoc , errors , warnings , hasWarnings ,
2004-04-24 17:49:16 +00:00
writeWarningsToResult , warningLevel ,
shortMessages , colors ,
2003-10-26 18:25:25 +00:00
new int [ ] { tdoc . getLength ( ) } ) ;
2003-08-31 16:06:35 +00:00
tdoc . writeRTFOutputStream ( out ) ;
return rv ;
}
2003-10-26 18:25:25 +00:00
/ * * Turns the list of TStrings scan into TibetanMachineWeb and
Roman warnings and error messages that are inserted at
2003-11-09 01:07:45 +00:00
position loc in tdoc . FIXME : DOC better
2003-10-26 18:25:25 +00:00
@param loc an input - output parameter . On input , loc [ 0 ] is the
offset from zero inside tdoc at which conversion results will
be placed . On output , loc [ 0 ] is one past the offset of the
last of the conversion results . * /
2003-10-19 20:16:06 +00:00
public static boolean convertToTMW ( ArrayList scan ,
TibetanDocument tdoc ,
StringBuffer errors ,
StringBuffer warnings ,
2003-12-14 07:41:15 +00:00
boolean [ ] hasWarnings ,
2003-10-19 20:16:06 +00:00
boolean writeWarningsToResult ,
String warningLevel ,
2004-04-24 17:49:16 +00:00
boolean shortMessages ,
2003-10-19 20:16:06 +00:00
boolean colors ,
2003-10-26 18:25:25 +00:00
int [ ] loc )
2003-08-18 02:38:54 +00:00
throws IOException
{
2003-10-19 22:19:16 +00:00
return convertTo ( false , true , scan , null , tdoc , errors , warnings ,
2003-12-14 07:41:15 +00:00
hasWarnings , writeWarningsToResult , warningLevel ,
2004-04-24 17:49:16 +00:00
shortMessages , colors , loc ,
loc [ 0 ] = = tdoc . getLength ( ) ) ;
2003-08-18 02:38:54 +00:00
}
2003-08-31 16:06:35 +00:00
2003-08-18 02:38:54 +00:00
/ * * Returns UTF - 8 encoded Unicode . A bit indirect , so use this
* for testing only if performance is a concern . If errors occur
* in scanning the ACIP or in converting a tsheg bar , then they
2003-08-23 22:03:37 +00:00
* are appended to errors if errors is non - null , as well as
* written to the result . If warnings occur in scanning the ACIP
* or in converting a tsheg bar , then they are appended to
* warnings if warnings is non - null , and they are written to the
2004-04-24 17:49:16 +00:00
* result if writeWarningsToResult is true . Error and warning
* messages are long and self - contained unless shortMessages is
* true . Returns the conversion upon perfect success or if there
* were merely warnings , null if errors occurred . * /
2005-02-21 01:35:23 +00:00
public static String convertToUnicodeText ( String transliteration ,
String acip ,
2003-10-19 22:19:16 +00:00
StringBuffer errors ,
StringBuffer warnings ,
boolean writeWarningsToResult ,
2004-04-24 17:49:16 +00:00
String warningLevel ,
boolean shortMessages ) {
2005-02-21 01:35:23 +00:00
if ( transliteration ! = " ACIP " ) {
ThdlDebug . noteIffyCode ( ) ;
throw new IllegalArgumentException ( " Unsupported transliteration " ) ;
}
2003-08-18 02:38:54 +00:00
ByteArrayOutputStream sw = new ByteArrayOutputStream ( ) ;
2004-04-25 00:37:57 +00:00
ArrayList al = ACIPTshegBarScanner . scan ( acip , errors , - 1 , shortMessages ,
warningLevel ) ;
2003-08-18 02:38:54 +00:00
try {
2003-09-07 16:19:50 +00:00
if ( null ! = al ) {
2003-10-19 22:19:16 +00:00
convertToUnicodeText ( al , sw , errors ,
2003-12-14 07:41:15 +00:00
warnings , null , writeWarningsToResult ,
2004-04-24 17:49:16 +00:00
warningLevel , shortMessages ) ;
2003-08-18 02:38:54 +00:00
return sw . toString ( " UTF-8 " ) ;
} else {
return null ;
}
2003-09-07 18:30:59 +00:00
} catch ( IOException e ) {
2003-08-18 02:38:54 +00:00
throw new Error ( e . toString ( ) ) ;
}
}
2003-09-06 22:56:10 +00:00
/ * * Writes Unicode text ( not RTF ) to out . < em > NOTE WELL : This
* inherently cannot describe the ACIP { ( KA ) KHA } properly , as
* that requires showing KA in a smaller font than KHA , which is
* not possible in plain text . < / em > If errors occur in converting
* a tsheg bar , then they are appended to errors if errors is
* non - null . Furthermore , errors are written to out . If
* writeWarningsToOut is true , then warnings also will be written
* to out .
2003-08-24 06:40:53 +00:00
* @return true upon perfect success , false if errors occurred .
2003-08-23 22:03:37 +00:00
* @param scan result of ACIPTshegBarScanner . scan ( . . )
* @param out stream to which to write converted text
* @param errors if non - null , all error messages are appended
2003-09-06 22:56:10 +00:00
* @param warnings if non - null , all warning messages appropriate
* to warningLevel are appended
2003-12-14 07:41:15 +00:00
* @param hasWarnings if non - null , then hasWarnings [ 0 ] will be
* updated to true if and only if warnings are encountered and
* false otherwise
2003-08-23 22:03:37 +00:00
* @param writeWarningsToOut if true , then all warning messages
* are written to out in the appropriate places
2003-08-18 02:38:54 +00:00
* @throws IOException if we cannot write to out
* /
2003-10-19 22:19:16 +00:00
public static boolean convertToUnicodeText ( ArrayList scan ,
OutputStream out ,
StringBuffer errors ,
StringBuffer warnings ,
2003-12-14 07:41:15 +00:00
boolean [ ] hasWarnings ,
2003-10-19 22:19:16 +00:00
boolean writeWarningsToOut ,
2004-04-24 17:49:16 +00:00
String warningLevel ,
boolean shortMessages )
2003-08-18 02:38:54 +00:00
throws IOException
2003-08-31 16:06:35 +00:00
{
2003-10-19 22:19:16 +00:00
return convertTo ( true , false , scan , out , null , errors , warnings ,
2004-04-24 17:49:16 +00:00
hasWarnings , writeWarningsToOut , warningLevel ,
shortMessages , false , new int [ ] { - 1 } , true ) ;
2003-08-31 16:06:35 +00:00
}
2003-10-04 01:22:59 +00:00
private static boolean peekaheadFindsSpacesAndComma ( ArrayList /* of TString */ scan ,
2003-09-04 04:04:21 +00:00
int pos ) {
int sz = scan . size ( ) ;
while ( pos < sz ) {
2003-10-04 01:22:59 +00:00
TString s = ( TString ) scan . get ( pos + + ) ;
if ( s . getType ( ) = = TString . TIBETAN_PUNCTUATION & & s . getText ( ) . equals ( " " ) ) {
2003-09-04 04:04:21 +00:00
// keep going
} else {
2003-10-04 01:22:59 +00:00
if ( s . getType ( ) = = TString . TIBETAN_PUNCTUATION & & s . getText ( ) . equals ( " , " ) ) {
2003-09-04 04:04:21 +00:00
return true ;
} else {
return false ;
}
}
}
return false ;
}
2003-08-31 16:06:35 +00:00
private static boolean convertTo ( boolean toUnicode , // else to TMW
2003-10-19 22:19:16 +00:00
boolean toRTF , // else to UTF-8-encoded text
2003-08-31 16:06:35 +00:00
ArrayList scan ,
2003-10-19 22:19:16 +00:00
OutputStream out , // for (toUnicode && !toRTF) mode
TibetanDocument tdoc , // for !toUnicode mode or (toUnicode && toRTF) mode
2003-08-31 16:06:35 +00:00
StringBuffer errors ,
StringBuffer warnings ,
2003-12-14 07:41:15 +00:00
boolean [ ] hasWarnings ,
2003-08-31 16:06:35 +00:00
boolean writeWarningsToOut ,
2003-09-06 22:56:10 +00:00
String warningLevel ,
2004-04-24 17:49:16 +00:00
boolean shortMessages ,
2003-10-19 20:16:06 +00:00
boolean colors ,
2003-10-26 18:25:25 +00:00
// tdocLocation[0] is an
// input-output parameter. It's
// the starting location on input
// and the location just past the
// end on output.
int [ ] tdocLocation ,
2003-10-19 20:16:06 +00:00
boolean isCleanDoc )
2003-08-31 16:06:35 +00:00
throws IOException
2003-08-18 02:38:54 +00:00
{
2003-10-19 20:16:06 +00:00
try {
2003-11-29 22:56:18 +00:00
if ( null ! = tdoc & & ( toUnicode & & ! toRTF ) )
throw new Error ( " Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go. " ) ;
2003-10-19 22:19:16 +00:00
if ( toUnicode & & toRTF )
2003-11-09 01:07:45 +00:00
throw new Error ( " FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591. " ) ;
2003-10-19 22:19:16 +00:00
if ( ! toUnicode & & ! toRTF )
throw new IllegalArgumentException ( " ACIP->Uni.rtf, ACIP->Uni.txt, and ACIP->TMW.rtf are supported, but not ACIP->TMW.txt " ) ;
if ( toUnicode & & toRTF & & null = = tdoc )
throw new IllegalArgumentException ( " ACIP->Uni.rtf requires a TibetanDocument " ) ;
if ( null ! = out & & ! ( toUnicode & & ! toRTF ) )
throw new IllegalArgumentException ( " That stream is only used in ACIP->Uni.txt mode " ) ;
2004-04-24 17:49:16 +00:00
if ( null ! = out & & null ! = tdoc )
throw new IllegalArgumentException ( " Errors are not treated properly yet; do one conversion and then the other. Is performance important enough to risk improper output for you? " ) ;
if ( null = = out & & null = = tdoc )
throw new IllegalArgumentException ( " Why would you? " ) ;
2003-09-07 18:30:59 +00:00
int smallFontSize = - 1 ;
int regularFontSize = - 1 ;
if ( null ! = tdoc ) {
String latinFont
= ThdlOptions . getStringOption ( " thdl.acip.to.x.latin.font " ,
2003-09-07 22:08:35 +00:00
" Times New Roman " ) ;
2003-09-07 18:30:59 +00:00
int latinFontSize
= ThdlOptions . getIntegerOption ( " thdl.acip.to.x.latin.font.size " ,
2003-09-07 22:08:35 +00:00
18 ) ;
2003-09-07 18:30:59 +00:00
tdoc . setRomanAttributeSet ( latinFont , latinFontSize ) ;
regularFontSize = tdoc . getTibetanFontSize ( ) ;
smallFontSize = ( int ) ( 0 . 75 * regularFontSize ) ;
if ( smallFontSize > = regularFontSize )
smallFontSize = regularFontSize - 1 ;
2003-10-19 22:19:16 +00:00
if ( colors )
tdoc . enableColors ( ) ;
else
tdoc . disableColors ( ) ;
2003-09-07 18:30:59 +00:00
}
2003-08-18 02:38:54 +00:00
int sz = scan . size ( ) ;
boolean hasErrors = false ;
2003-12-14 07:41:15 +00:00
if ( null ! = hasWarnings ) hasWarnings [ 0 ] = false ;
2003-08-31 16:06:35 +00:00
BufferedWriter writer = null ;
2003-10-19 22:19:16 +00:00
if ( toUnicode & & ! toRTF )
2003-08-31 16:06:35 +00:00
writer
= new BufferedWriter ( new OutputStreamWriter ( out , " UTF-8 " ) ) ;
2003-09-04 04:04:21 +00:00
boolean lastGuyWasNonPunct = false ;
TStackList lastGuy = null ;
2004-08-19 14:59:06 +00:00
Color lastColor = Color . black ;
Color color = Color . black ;
2003-12-14 08:47:03 +00:00
boolean outputCurlyBracketsAroundFolioMarkers
= ThdlOptions . getBooleanOption ( " thdl.acip.to.x.output.curly.brackets.around.folio.markers " ) ;
2003-08-18 02:38:54 +00:00
for ( int i = 0 ; i < sz ; i + + ) {
2003-10-04 01:22:59 +00:00
TString s = ( TString ) scan . get ( i ) ;
2003-08-18 02:38:54 +00:00
int stype = s . getType ( ) ;
2003-10-04 01:22:59 +00:00
if ( stype = = TString . ERROR ) {
2003-11-11 03:43:11 +00:00
// leave lastGuyWasNonPunct and lastGuy alone; WARNINGs and ERRORs are invisible.
2003-08-18 02:38:54 +00:00
hasErrors = true ;
2004-04-24 17:49:16 +00:00
String text = " [#ERROR " + s . getText ( ) + " ] " ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( text ) ;
2003-10-19 20:16:06 +00:00
if ( null ! = tdoc ) {
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] , text , Color . RED ) ;
tdocLocation [ 0 ] + = text . length ( ) ;
2003-10-19 20:16:06 +00:00
}
2003-10-04 01:22:59 +00:00
} else if ( stype = = TString . TSHEG_BAR_ADORNMENT ) {
2003-09-07 16:19:50 +00:00
if ( lastGuyWasNonPunct ) {
2004-04-24 17:49:16 +00:00
String err = " [#ERROR " + ErrorsAndWarnings . getMessage ( 133 , shortMessages , s . getText ( ) ) + " ] " ;
2003-09-07 16:19:50 +00:00
if ( null ! = writer ) {
String uni = ACIPRules . getUnicodeFor ( s . getText ( ) , false ) ;
if ( null = = uni ) {
hasErrors = true ;
uni = err ;
}
2003-11-30 02:06:48 +00:00
writer . write ( uni ) ;
2003-09-07 16:19:50 +00:00
}
if ( null ! = tdoc ) {
String wylie
= ACIPRules . getWylieForACIPOther ( s . getText ( ) ) ;
if ( null = = wylie ) {
hasErrors = true ;
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] , err , Color . RED ) ;
tdocLocation [ 0 ] + = err . length ( ) ;
2003-09-07 16:19:50 +00:00
} else {
2003-10-26 18:25:25 +00:00
tdoc . appendDuffCode ( tdocLocation [ 0 ] + + ,
2003-10-19 20:16:06 +00:00
TibetanMachineWeb . getGlyph ( wylie ) ,
2004-08-19 14:59:06 +00:00
Color . black ) ;
2003-09-07 16:19:50 +00:00
}
}
} else {
hasErrors = true ;
}
lastGuyWasNonPunct = true ; // this stuff is not really punctuation
lastGuy = null ;
2003-10-04 01:22:59 +00:00
} else if ( stype = = TString . WARNING ) {
2003-11-11 03:43:11 +00:00
// leave lastGuyWasNonPunct and lastGuy alone; WARNINGs and ERRORs are invisible.
2003-08-24 06:40:53 +00:00
if ( writeWarningsToOut ) {
2004-04-24 17:49:16 +00:00
String text = " [#WARNING " + s . getText ( ) + " ] " ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( text ) ;
2003-10-19 20:16:06 +00:00
if ( null ! = tdoc ) {
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] , text , Color . RED ) ;
tdocLocation [ 0 ] + = text . length ( ) ;
2003-10-19 20:16:06 +00:00
}
2003-08-24 06:40:53 +00:00
}
2003-12-14 08:38:10 +00:00
if ( null ! = hasWarnings ) hasWarnings [ 0 ] = true ;
2003-08-24 06:40:53 +00:00
if ( null ! = warnings ) {
2004-04-24 17:49:16 +00:00
warnings . append ( " Warning: " ) ;
2003-08-24 06:40:53 +00:00
warnings . append ( s . getText ( ) ) ;
warnings . append ( '\n' ) ;
}
2003-08-18 02:38:54 +00:00
} else {
2003-11-29 22:56:18 +00:00
if ( s . isLatin ( ) ) {
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = false ;
lastGuy = null ;
2003-08-31 16:06:35 +00:00
String text
2003-12-14 08:47:03 +00:00
= ( ( ( outputCurlyBracketsAroundFolioMarkers
& & stype = = TString . FOLIO_MARKER ) ? " { " : " " )
2003-08-31 16:06:35 +00:00
+ s . getText ( )
2003-12-14 08:47:03 +00:00
+ ( ( outputCurlyBracketsAroundFolioMarkers
& & stype = = TString . FOLIO_MARKER ) ? " } " : " " ) ) ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( text ) ;
2003-10-19 20:16:06 +00:00
if ( null ! = tdoc ) {
2004-08-19 14:59:06 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] , text , Color . black ) ;
2003-10-26 18:25:25 +00:00
tdocLocation [ 0 ] + = text . length ( ) ;
2003-10-19 20:16:06 +00:00
}
2003-08-18 02:38:54 +00:00
} else {
String unicode = null ;
2003-10-16 04:15:10 +00:00
Object [ ] duff = null ;
2003-10-04 01:22:59 +00:00
if ( stype = = TString . TIBETAN_NON_PUNCTUATION ) {
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = true ;
2003-10-19 20:48:22 +00:00
TPairList pls [ ] = TPairListFactory . breakACIPIntoChunks ( s . getText ( ) , false ) ;
2003-08-18 02:38:54 +00:00
String acipError ;
2004-04-24 17:49:16 +00:00
if ( ( acipError = pls [ 0 ] . getACIPError ( s . getText ( ) , shortMessages ) ) ! = null
& & ( null = = pls [ 1 ] | | pls [ 1 ] . getACIPError ( s . getText ( ) , shortMessages ) ! = null ) ) {
2003-08-18 02:38:54 +00:00
hasErrors = true ;
2004-04-24 17:49:16 +00:00
String errorMessage = " [#ERROR " + acipError + " ] " ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( errorMessage ) ;
2003-10-19 20:16:06 +00:00
if ( null ! = tdoc ) {
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] , errorMessage ,
2003-10-19 20:16:06 +00:00
Color . RED ) ;
2003-10-26 18:25:25 +00:00
tdocLocation [ 0 ] + = errorMessage . length ( ) ;
2003-10-19 20:16:06 +00:00
}
2003-08-18 02:38:54 +00:00
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
} else {
2003-10-16 04:15:10 +00:00
TParseTree pt0 = pls [ 0 ] . getParseTree ( ) ;
TParseTree pt1 = ( ( null = = pls [ 1 ] )
? null : pls [ 1 ] . getParseTree ( ) ) ;
if ( null = = pt0 & & null = = pt1 ) {
2003-08-18 02:38:54 +00:00
hasErrors = true ;
2004-04-24 17:49:16 +00:00
String errorMessage
= ( " [#ERROR "
+ ErrorsAndWarnings . getMessage ( 130 , shortMessages , s . getText ( ) )
+ " ] " ) ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( errorMessage ) ;
2003-10-19 20:16:06 +00:00
if ( null ! = tdoc ) {
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] , errorMessage ,
2003-10-19 20:16:06 +00:00
Color . RED ) ;
2003-10-26 18:25:25 +00:00
tdocLocation [ 0 ] + = errorMessage . length ( ) ;
2003-10-19 20:16:06 +00:00
}
2003-08-18 02:38:54 +00:00
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
} else {
2003-10-16 04:15:10 +00:00
TStackList sl0 = pt0 . getBestParse ( ) ;
TStackList sl1 = ( ( null = = pt1 )
? null : pt1 . getBestParse ( ) ) ;
if ( null = = sl0 & & null = = sl1 ) {
2004-04-24 17:49:16 +00:00
// {A-DZU} causes this, for example.
2003-08-18 02:38:54 +00:00
hasErrors = true ;
2004-04-24 17:49:16 +00:00
String errorMessage =
" [#ERROR "
+ ErrorsAndWarnings . getMessage ( 134 ,
shortMessages ,
s . getText ( ) )
+ " ] " ;
if ( null ! = writer )
writer . write ( errorMessage ) ;
2003-10-19 20:16:06 +00:00
if ( null ! = tdoc ) {
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] ,
2003-10-19 20:16:06 +00:00
errorMessage ,
Color . RED ) ;
2003-10-26 18:25:25 +00:00
tdocLocation [ 0 ] + = errorMessage . length ( ) ;
2003-10-19 20:16:06 +00:00
}
2003-08-18 02:38:54 +00:00
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
} else {
2003-10-16 04:15:10 +00:00
TStackList sl = sl0 ;
TPairList pl = pls [ 0 ] ;
TParseTree pt = pt0 ;
// set sl equal to the best choice of sl0 and sl1.
if ( null ! = sl1 ) {
BoolTriple sl0bt = sl0 . isLegalTshegBar ( false ) ;
BoolTriple sl1bt = sl1 . isLegalTshegBar ( false ) ;
int ct ;
if ( ( ct = sl0bt . compareTo ( sl1bt ) ) < 0 ) {
sl = sl1 ;
pl = pls [ 1 ] ;
pt = pt1 ;
} else if ( 0 = = ct ) {
// sl remains sl0 -- '* is
// a vowel unless it's
// clearly part of an
// appendage like 'AM.
}
}
2003-09-04 04:04:21 +00:00
lastGuy = sl ;
2003-10-04 16:12:48 +00:00
String warning = null ;
if ( " None " ! = warningLevel ) {
warning = pt . getWarning ( warningLevel ,
pl ,
2004-04-24 17:49:16 +00:00
s . getText ( ) ,
shortMessages ) ;
2003-10-04 16:12:48 +00:00
}
2003-08-23 22:03:37 +00:00
if ( null ! = warning ) {
if ( writeWarningsToOut ) {
2003-08-31 16:06:35 +00:00
String text
2004-04-24 17:49:16 +00:00
= ( " [#WARNING "
2003-08-31 16:06:35 +00:00
+ warning + " ] " ) ;
if ( null ! = writer ) writer . write ( text ) ;
2003-10-19 20:16:06 +00:00
if ( null ! = tdoc ) {
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] ,
2003-10-19 20:16:06 +00:00
text ,
Color . RED ) ;
2003-10-26 18:25:25 +00:00
tdocLocation [ 0 ] + = text . length ( ) ;
2003-10-19 20:16:06 +00:00
}
2003-08-23 22:03:37 +00:00
}
2003-12-14 08:38:10 +00:00
if ( null ! = hasWarnings ) hasWarnings [ 0 ] = true ;
2003-08-23 22:03:37 +00:00
if ( null ! = warnings ) {
warnings . append ( warning ) ;
warnings . append ( '\n' ) ;
}
}
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) {
unicode = sl . getUnicode ( ) ;
if ( null = = unicode ) throw new Error ( " FIXME: make this an assertion 4 " ) ;
2003-12-16 07:45:40 +00:00
// Warn if any of the stacks
// in this tsheg bar do not
// have corresponding glyphs
// in TMW. That means there
// was probably a typo in the
// input.
2004-04-24 17:49:16 +00:00
if ( ErrorsAndWarnings . isEnabled ( 511 , warningLevel ) ) {
Object [ ] trialDuff
= sl . getDuff ( shortMessages ,
false ) ;
2003-12-16 07:45:40 +00:00
for ( int ii = 0 ; ii < trialDuff . length ; ii + + ) {
if ( trialDuff [ ii ] instanceof String ) {
2004-04-24 17:49:16 +00:00
if ( ! ( ( String ) trialDuff [ ii ] ) . startsWith ( " 511 " ) )
throw new Error ( " I thought 511 was the only beast like this; FIXME: make this an assertion 324xd3 " ) ;
2003-12-16 07:45:40 +00:00
String bwarning
2004-04-24 17:49:16 +00:00
= " [#WARNING "
2003-12-16 07:45:40 +00:00
+ ( String ) trialDuff [ ii ] + " ] " ;
unicode = bwarning + unicode ;
if ( null ! = hasWarnings ) hasWarnings [ 0 ] = true ;
if ( null ! = warnings ) {
warnings . append ( bwarning ) ;
warnings . append ( '\n' ) ;
}
}
}
}
2003-08-31 16:06:35 +00:00
}
if ( null ! = tdoc ) {
2004-04-24 17:49:16 +00:00
duff = sl . getDuff ( shortMessages , true ) ;
2003-10-26 18:56:48 +00:00
BoolTriple bt ;
2003-09-07 22:08:35 +00:00
if ( colors & & sl . isLegalTshegBar ( true ) . isLegal & & ! sl . isLegalTshegBar ( false ) . isLegal ) {
2003-09-05 06:05:46 +00:00
color = Color . YELLOW ;
2003-10-26 18:56:48 +00:00
} else if ( colors & & ( bt = sl . isLegalTshegBar ( false ) ) . isLegal & & ! bt . isLegalButSanskrit ( ) ) {
2004-08-19 14:59:06 +00:00
color = Color . black ;
2003-09-05 05:54:35 +00:00
} else {
2003-10-26 18:56:48 +00:00
// Sanskrit.
2003-09-06 22:56:10 +00:00
2003-10-26 18:56:48 +00:00
// FIXME: should a funny
// vowel cause green to
// appear too? G'EEm is
// black, not green, right
// now, though GA: is
// green.
2003-09-05 06:05:46 +00:00
color = Color . GREEN ;
2003-09-05 05:54:35 +00:00
}
2003-08-31 16:06:35 +00:00
if ( 0 = = duff . length ) {
throw new Error ( " No DuffCodes for stack list " + sl ) ; // FIXME: make this an assertion
}
}
2003-08-18 02:38:54 +00:00
}
}
}
} else {
2004-08-19 14:59:06 +00:00
color = Color . black ;
2003-10-04 01:22:59 +00:00
if ( stype = = TString . START_SLASH ) {
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) unicode = " \ u0F3C " ;
2003-10-16 04:15:10 +00:00
if ( null ! = tdoc ) duff = new Object [ ] { TibetanMachineWeb . getGlyph ( " ( " ) } ;
2003-10-04 01:22:59 +00:00
} else if ( stype = = TString . END_SLASH ) {
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) unicode = " \ u0F3D " ;
2003-10-16 04:15:10 +00:00
if ( null ! = tdoc ) duff = new Object [ ] { TibetanMachineWeb . getGlyph ( " ) " ) } ;
2003-10-04 01:22:59 +00:00
} else if ( stype = = TString . TIBETAN_PUNCTUATION ) {
2003-09-04 04:04:21 +00:00
// For ACIP, tshegs are used as both
// tshegs and whitespace. We treat a
// space as a tsheg if and only if it
// occurs after TIBETAN_NON_PUNCTUATION.
// But "SHIG ,MDO" is an example of a
// special case, needed because a tsheg is
// not used after a GA in Tibetan
// typesetting.
boolean done = false ;
2003-11-09 01:07:45 +00:00
// what about after numbers? marks? FIXME: test
2003-09-05 05:08:47 +00:00
TPairList lpl = null ;
2003-09-04 04:04:21 +00:00
if ( s . getText ( ) . equals ( " " ) ) {
if ( ! lastGuyWasNonPunct
| | ( null ! = lastGuy
& & ( lpl = lastGuy . get ( lastGuy . size ( ) - 1 ) ) . size ( ) = = 1
2003-09-12 05:06:37 +00:00
// "GU ," and "KU ," each have
// tshegs, but "GI ," and "KI
// ," each have a Tibetan
// space.
& & ( ( lpl . get ( 0 ) . getLeft ( ) . equals ( " G " )
| | lpl . get ( 0 ) . getLeft ( ) . equals ( " K " ) )
2003-10-04 01:22:59 +00:00
& & ( null = = lpl . get ( 0 ) . getRight ( )
| | lpl . get ( 0 ) . getRight ( ) . indexOf ( 'U' ) < 0 ) )
2003-09-12 05:06:37 +00:00
& &
// it's (G . anything)
// followed by some number of
// spaces (at least one, this
// one) and then a comma:
2003-09-04 04:04:21 +00:00
peekaheadFindsSpacesAndComma ( scan , i + 1 ) ) ) {
if ( null ! = writer ) {
2003-12-14 08:38:10 +00:00
unicode = " " ; // DLC NOW FIXME: allow for U+00A0 between two <i>shad</i>s (0F0D or 0F0E), and optionally insert a U+200B after the <i>shad</i> following the whitespace so that stupid software will break lines more nicely
2003-09-04 04:04:21 +00:00
done = true ;
}
if ( null ! = tdoc ) {
2003-12-14 08:38:10 +00:00
DuffCode spaceDuff = TibetanMachineWeb . getGlyph ( " _ " ) ;
if ( null = = spaceDuff ) throw new Error ( " whitespace duff " ) ;
tdoc . appendDuffCode ( tdocLocation [ 0 ] + + ,
2004-08-19 14:59:06 +00:00
spaceDuff , Color . black ) ;
2003-12-14 08:38:10 +00:00
continue ; // FIXME: if null != writer, output was just dropped.
2003-09-04 04:04:21 +00:00
}
2003-08-31 16:06:35 +00:00
}
2003-09-05 05:08:47 +00:00
} else if ( s . getText ( ) . equals ( " , " )
& & lastGuyWasNonPunct
& & null ! = lastGuy
& & ( lpl = lastGuy . get ( lastGuy . size ( ) - 1 ) ) . size ( ) = = 1
& & lpl . get ( 0 ) . getLeft ( ) . equals ( " NG " ) ) {
2003-12-10 06:50:14 +00:00
// {NGO,} is not acceptable;
// typesetting requires we treat this
// like {NGO\u0F0C,}.
if ( null ! = writer ) {
writer . write ( " \ u0F0C " ) ;
}
2003-10-19 22:19:16 +00:00
if ( null ! = tdoc ) {
2003-12-10 06:50:14 +00:00
DuffCode tshegDuff = TibetanMachineWeb . getGlyph ( " * " ) ;
if ( null = = tshegDuff ) throw new Error ( " non-breaking tsheg duff " ) ;
2003-10-26 18:25:25 +00:00
tdoc . appendDuffCode ( tdocLocation [ 0 ] + + ,
2003-10-19 22:19:16 +00:00
tshegDuff , lastColor ) ;
}
2003-09-04 04:04:21 +00:00
}
2003-09-05 05:08:47 +00:00
2003-09-04 04:04:21 +00:00
if ( ! done ) {
if ( null ! = writer ) unicode = ACIPRules . getUnicodeFor ( s . getText ( ) , false ) ;
if ( null ! = tdoc ) {
2003-09-04 04:34:18 +00:00
if ( s . getText ( ) . equals ( " \ r " )
| | s . getText ( ) . equals ( " \ t " )
| | s . getText ( ) . equals ( " \ n " )
| | s . getText ( ) . equals ( " \ r \ n " ) ) {
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] , s . getText ( ) ,
2004-08-19 14:59:06 +00:00
Color . black ) ;
2003-10-26 18:25:25 +00:00
tdocLocation [ 0 ] + = s . getText ( ) . length ( ) ;
2003-11-29 22:56:18 +00:00
continue ; // FIXME: this means the unicode above doesn't go into the output if null != writer && null != tdoc?
2003-09-04 04:34:18 +00:00
} else {
2004-04-14 05:44:51 +00:00
if ( " # " . equals ( s . getText ( ) ) ) { // hard-coded ACIP value
duff = new Object [ ] {
TibetanMachineWeb . getGlyph ( " @# " ) ,
TibetanMachineWeb . getGlyph ( " # " )
} ; // hard-coded EWTS values
} else {
String wy = ACIPRules . getWylieForACIPOther ( s . getText ( ) ) ;
if ( null = = wy ) throw new Error ( " No wylie for ACIP " + s . getText ( ) ) ;
duff = new Object [ ] { TibetanMachineWeb . getGlyph ( wy ) } ;
}
2003-09-04 04:04:21 +00:00
}
2003-08-31 16:06:35 +00:00
}
2005-02-21 01:16:10 +00:00
} // TODO(DLC)[EWTS->Tibetan]: change this to have a "parse" phase that puts out error messages like 142 and figures out what a space means. This is a very long function that is difficult to maintain, and we want EWTS->Tibetan to be clean.
2003-10-04 01:22:59 +00:00
} else if ( stype = = TString . START_PAREN ) {
2004-06-06 21:59:16 +00:00
if ( null ! = writer )
writer . write ( " [ERROR "
+ ErrorsAndWarnings . getMessage ( 142 ,
shortMessages ,
" ( " /* hard-coded ACIP value */ ) + " ] " ) ;
2003-09-07 18:30:59 +00:00
if ( null ! = tdoc ) {
tdoc . setTibetanFontSize ( smallFontSize ) ;
}
continue ;
2003-10-04 01:22:59 +00:00
} else if ( stype = = TString . END_PAREN ) {
2004-06-06 21:59:16 +00:00
if ( null ! = writer )
writer . write ( " [ERROR "
+ ErrorsAndWarnings . getMessage ( 143 ,
shortMessages ,
" ) " /* hard-coded ACIP value */ ) + " ] " ) ;
2003-09-07 18:30:59 +00:00
if ( null ! = tdoc ) {
tdoc . setTibetanFontSize ( regularFontSize ) ;
}
continue ;
2003-11-29 22:56:18 +00:00
} else if ( stype = = TString . UNICODE_CHARACTER ) {
2003-12-08 07:15:27 +00:00
ThdlDebug . verify ( 1 = = s . getText ( ) . length ( ) ) ;
2003-11-29 22:56:18 +00:00
if ( null ! = writer ) {
2003-12-08 07:15:27 +00:00
char ch = s . getText ( ) . charAt ( 0 ) ;
if ( ch > = '\uF021' & & ch < = '\uF0FF' ) {
hasErrors = true ;
2004-04-24 17:49:16 +00:00
String errorMessage =
" [#ERROR "
+ ErrorsAndWarnings . getMessage ( 135 ,
shortMessages ,
" " + ch )
+ " ] " ;
writer . write ( errorMessage ) ;
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
continue ; // FIXME: dropping output if null != tdoc
} else if ( org . thdl . tib . text . tshegbar . UnicodeUtils . isReservedTibetanCode ( ch ) ) {
hasErrors = true ;
String errorMessage =
" [#ERROR "
+ ErrorsAndWarnings . getMessage ( 138 ,
shortMessages ,
" " + ch )
+ " ] " ;
2003-12-08 07:15:27 +00:00
writer . write ( errorMessage ) ;
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
continue ; // FIXME: dropping output if null != tdoc
} else
unicode = s . getText ( ) ;
2003-11-29 22:56:18 +00:00
}
if ( null ! = tdoc ) {
duff = TibetanMachineWeb . mapUnicodeToTMW ( s . getText ( ) . charAt ( 0 ) ) ;
if ( null = = duff ) {
hasErrors = true ;
2004-04-24 17:49:16 +00:00
String errorMessage =
" [#ERROR "
+ ErrorsAndWarnings . getMessage ( 136 ,
shortMessages ,
s . getText ( ) )
+ " ] " ;
2003-11-29 22:56:18 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] ,
errorMessage ,
Color . RED ) ;
tdocLocation [ 0 ] + = errorMessage . length ( ) ;
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
continue ; // FIXME: if null != writer, we dropped some output.
}
}
2003-09-05 05:08:47 +00:00
} else {
throw new Error ( " forgot a case " ) ;
2003-08-31 16:06:35 +00:00
}
if ( null ! = writer & & null = = unicode )
throw new Error ( " FIXME: make this an assertion 1 " ) ;
if ( null ! = tdoc & & ( null = = duff | | 0 = = duff . length ) )
throw new Error ( " FIXME: make this an assertion 2 " ) ;
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = false ;
lastGuy = null ;
2003-08-18 02:38:54 +00:00
}
2003-08-31 16:06:35 +00:00
if ( null ! = writer & & null ! = unicode ) writer . write ( unicode ) ;
if ( null ! = tdoc ) {
if ( null ! = duff & & 0 ! = duff . length ) {
2003-10-16 04:15:10 +00:00
for ( int j = 0 ; j < duff . length ; j + + ) {
if ( duff [ j ] instanceof DuffCode )
2003-10-26 18:25:25 +00:00
tdoc . appendDuffCode ( tdocLocation [ 0 ] + + ,
2003-10-19 20:16:06 +00:00
( DuffCode ) duff [ j ] ,
2003-10-16 04:15:10 +00:00
color ) ;
else {
hasErrors = true ;
2003-12-16 07:45:40 +00:00
String emsg
2004-04-24 17:49:16 +00:00
= " [ERROR " + ( String ) duff [ j ] + " ] " ;
2003-10-16 04:15:10 +00:00
if ( null ! = errors )
2003-12-16 07:45:40 +00:00
errors . append ( emsg + " \ n " ) ;
2003-10-26 18:25:25 +00:00
tdoc . appendRoman ( tdocLocation [ 0 ] ,
2003-12-16 07:45:40 +00:00
emsg ,
2003-10-16 04:15:10 +00:00
Color . RED ) ;
2003-12-16 07:45:40 +00:00
tdocLocation [ 0 ] + = emsg . length ( ) ;
2003-10-16 04:15:10 +00:00
}
}
2003-08-31 16:06:35 +00:00
} else {
// this happens when you have an
// [#ERROR]-producing tsheg bar.
// System.err.println("Bad tsheg bar with ACIP {" + s.getText() + "}");
}
2003-08-18 02:38:54 +00:00
}
}
}
2003-09-05 05:54:35 +00:00
lastColor = color ;
2003-08-18 02:38:54 +00:00
}
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) {
writer . close ( ) ;
}
2003-10-26 18:25:25 +00:00
if ( isCleanDoc & & null ! = tdoc & & tdocLocation [ 0 ] ! = tdoc . getLength ( ) )
throw new Error ( " Oops -- we dropped something from the output! tdocLocation[0]++; and tdocLocation[0]+=xyz; are not being used correctly. " ) ;
2003-08-18 02:38:54 +00:00
return ! hasErrors ;
2003-10-19 22:19:16 +00:00
} catch ( javax . swing . text . BadLocationException e ) {
2003-10-26 18:25:25 +00:00
throw new IllegalArgumentException ( " tdocLocation[0] is no good: " + tdocLocation [ 0 ] ) ;
2003-10-19 22:19:16 +00:00
}
2003-08-18 02:38:54 +00:00
}
}
2003-11-11 03:43:11 +00:00