2003-08-18 02:38:54 +00:00
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt ;
import java.io.* ;
import java.util.ArrayList ;
import java.util.Stack ;
import org.thdl.util.ThdlDebug ;
import org.thdl.util.ThdlOptions ;
2003-08-31 16:06:35 +00:00
import org.thdl.tib.text.TibetanDocument ;
import org.thdl.tib.text.TibetanMachineWeb ;
import org.thdl.tib.text.DuffCode ;
2003-08-18 02:38:54 +00:00
/**
 * This class is able to convert an ACIP file into Unicode and an
 * ACIP file into Tibetan Machine Web (TMW).  ACIP->Unicode should
 * yield the same results as ACIP->TMW followed by TMW->Unicode
 * (FIXME: test it!)
 *
 * @author David Chandler
 */
public class ACIPConverter {
2003-08-24 06:40:53 +00:00
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
2003-08-18 02:38:54 +00:00
/ * * Command - line converter . Gives error messages on standard
* output about why we can ' t convert the document perfectly and
* exits with non - zero return code , or is silent otherwise and
* exits with code zero . < p > FIXME : not so efficient ; copies the
* whole file into memory first . * /
public static void main ( String [ ] args )
2003-08-24 06:40:53 +00:00
throws IOException
2003-08-18 02:38:54 +00:00
{
2003-09-04 04:04:21 +00:00
// We don't want to load the TM or TMW font files ourselves:
ThdlOptions . setUserPreference ( " thdl.rely.on.system.tmw.fonts " , true ) ;
ThdlOptions . setUserPreference ( " thdl.rely.on.system.tm.fonts " , true ) ;
ThdlOptions . setUserPreference ( " thdl.debug " , true ) ;
2003-08-18 02:38:54 +00:00
boolean verbose = true ;
2003-08-24 06:40:53 +00:00
if ( args . length ! = 1 ) {
System . out . println ( " Bad args! Need just the name of the ACIP text file. " ) ;
2003-08-18 02:38:54 +00:00
}
StringBuffer errors = new StringBuffer ( ) ;
int maxErrors = 250 ;
2003-08-24 06:40:53 +00:00
ArrayList al = ACIPTshegBarScanner . scanFile ( args [ 0 ] , errors , maxErrors - 1 ) ;
2003-08-18 02:38:54 +00:00
if ( null = = al ) {
2003-08-23 22:03:37 +00:00
System . err . println ( maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this " ) ;
2003-08-18 02:38:54 +00:00
System . err . println ( " Tibetan or English input? " ) ;
System . err . println ( " " ) ;
2003-08-23 22:03:37 +00:00
if ( false ) {
// Nobody wants to see this. FIXME: maybe somebody; have an option.
System . err . println ( " First " + maxErrors + " lexical errors scanning ACIP input file: " ) ;
System . err . println ( errors ) ;
}
System . err . println ( " Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again. " ) ;
2003-08-18 02:38:54 +00:00
System . exit ( 1 ) ;
}
2003-08-24 06:40:53 +00:00
final boolean abortUponScanningError = false ;
2003-08-23 22:03:37 +00:00
// DLC NOW: BAo isn't converting.
2003-08-18 02:38:54 +00:00
if ( errors . length ( ) > 0 ) {
System . err . println ( " Errors scanning ACIP input file: " ) ;
System . err . println ( errors ) ;
2003-08-23 22:03:37 +00:00
if ( abortUponScanningError ) {
System . err . println ( " Exiting; please fix input file and try again. " ) ;
System . exit ( 1 ) ;
}
2003-08-18 02:38:54 +00:00
}
2003-08-24 06:40:53 +00:00
String warningLevel = " Most " ; // DLC make me configurable.
StringBuffer warnings = null ;
boolean putWarningsInOutput = false ;
if ( " None " ! = warningLevel ) {
warnings = new StringBuffer ( ) ;
putWarningsInOutput = true ;
}
2003-08-31 16:06:35 +00:00
convertToTMW ( al , System . out , errors , warnings ,
putWarningsInOutput , warningLevel ) ;
int retCode = 0 ;
2003-08-18 02:38:54 +00:00
if ( errors . length ( ) > 0 ) {
System . err . println ( " Errors converting ACIP input file: " ) ;
System . err . println ( errors ) ;
2003-08-23 22:03:37 +00:00
System . err . println ( " The output contains these errors. " ) ;
2003-08-18 02:38:54 +00:00
System . err . println ( " Exiting; please fix input file and try again. " ) ;
2003-08-31 16:06:35 +00:00
retCode = 2 ;
2003-08-18 02:38:54 +00:00
}
2003-08-24 06:40:53 +00:00
if ( null ! = warnings & & warnings . length ( ) > 0 ) {
2003-08-23 22:03:37 +00:00
System . err . println ( " Warnings converting ACIP input file: " ) ;
System . err . println ( warnings ) ;
if ( putWarningsInOutput )
System . err . println ( " The output contains these warnings. " ) ;
2003-08-31 16:06:35 +00:00
retCode = 2 ;
2003-08-23 22:03:37 +00:00
}
2003-08-31 16:06:35 +00:00
if ( 0 = = retCode ) {
if ( verbose ) System . err . println ( " Converted " + args [ 0 ] + " perfectly. " ) ;
}
System . exit ( retCode ) ;
// DLC NOW: tRAStA is not converter correctly to Unicode, and
// no warning is given when converting to TMW.
2003-08-18 02:38:54 +00:00
}
/ * * Writes TMW / Latin to out . If errors occur in converting a
2003-08-31 16:06:35 +00:00
* tsheg bar , then they are written into the output , and also
* appended to errors if errors is non - null . If warnings occur
* in converting a tsheg bar , then they are written into the
* output if writeWarningsToResult is true , and also appended to
* warnings if warnings is non - null . Returns true upon perfect
* success or if there were merely warnings , false if errors
2003-08-18 02:38:54 +00:00
* occurred .
* @throws IOException if we cannot write to out
* /
2003-08-31 16:06:35 +00:00
public static boolean convertToTMW ( ArrayList scan ,
OutputStream out ,
StringBuffer errors ,
StringBuffer warnings ,
boolean writeWarningsToResult ,
String warningLevel )
throws IOException
{
TibetanDocument tdoc = new TibetanDocument ( ) ;
tdoc . setRomanAttributeSet ( " Courier " , 14 ) ; // DLC make me configurable.
boolean rv
= convertToTMW ( scan , tdoc , errors , warnings ,
writeWarningsToResult , warningLevel ) ;
tdoc . writeRTFOutputStream ( out ) ;
return rv ;
}
private static boolean convertToTMW ( ArrayList scan ,
TibetanDocument tdoc ,
StringBuffer errors ,
StringBuffer warnings ,
boolean writeWarningsToResult ,
String warningLevel )
2003-08-18 02:38:54 +00:00
throws IOException
{
2003-08-31 16:06:35 +00:00
return convertTo ( false , scan , null , tdoc , errors , warnings ,
writeWarningsToResult , warningLevel ) ;
2003-08-18 02:38:54 +00:00
}
2003-08-31 16:06:35 +00:00
2003-08-23 22:03:37 +00:00
// DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a
// space. Treat it as a tsheg only when it appears after a
// syllable or another tsheg.
2003-08-18 02:38:54 +00:00
/ * * Returns UTF - 8 encoded Unicode . A bit indirect , so use this
* for testing only if performance is a concern . If errors occur
* in scanning the ACIP or in converting a tsheg bar , then they
2003-08-23 22:03:37 +00:00
* are appended to errors if errors is non - null , as well as
* written to the result . If warnings occur in scanning the ACIP
* or in converting a tsheg bar , then they are appended to
* warnings if warnings is non - null , and they are written to the
* result if writeWarningsToResult is true . Returns the
2003-08-31 16:06:35 +00:00
* conversion upon perfect success or if there were merely
* warnings , null if errors occurred .
2003-08-18 02:38:54 +00:00
* /
public static String convertToUnicode ( String acip ,
2003-08-23 22:03:37 +00:00
StringBuffer errors ,
StringBuffer warnings ,
2003-08-24 06:40:53 +00:00
boolean writeWarningsToResult ,
String warningLevel ) {
2003-08-18 02:38:54 +00:00
ByteArrayOutputStream sw = new ByteArrayOutputStream ( ) ;
2003-08-24 06:40:53 +00:00
ArrayList al = ACIPTshegBarScanner . scan ( acip , errors , - 1 ) ;
2003-08-18 02:38:54 +00:00
try {
2003-08-23 22:03:37 +00:00
if ( null ! = al
& & convertToUnicode ( al , sw , errors ,
2003-08-24 06:40:53 +00:00
warnings , writeWarningsToResult ,
warningLevel ) ) {
2003-08-18 02:38:54 +00:00
return sw . toString ( " UTF-8 " ) ;
} else {
return null ;
}
} catch ( Exception e ) {
throw new Error ( e . toString ( ) ) ;
}
}
2003-08-23 22:03:37 +00:00
/ * * Writes Unicode to out . If errors occur in converting a tsheg
* bar , then they are appended to errors if errors is non - null .
* Furthermore , errors are written to out . If writeWarningsToOut
2003-08-24 06:40:53 +00:00
* is true , then warnings also will be written to out .
* @return true upon perfect success , false if errors occurred .
2003-08-23 22:03:37 +00:00
* @param scan result of ACIPTshegBarScanner . scan ( . . )
* @param out stream to which to write converted text
* @param errors if non - null , all error messages are appended
* @param warnings if non - null , all warning messages are appended
* to this
* @param writeWarningsToOut if true , then all warning messages
* are written to out in the appropriate places
2003-08-18 02:38:54 +00:00
* @throws IOException if we cannot write to out
* /
public static boolean convertToUnicode ( ArrayList scan ,
OutputStream out ,
2003-08-23 22:03:37 +00:00
StringBuffer errors ,
StringBuffer warnings ,
2003-08-24 06:40:53 +00:00
boolean writeWarningsToOut ,
String warningLevel )
2003-08-18 02:38:54 +00:00
throws IOException
2003-08-31 16:06:35 +00:00
{
return convertTo ( true , scan , out , null , errors , warnings ,
writeWarningsToOut , warningLevel ) ;
}
2003-09-04 04:04:21 +00:00
private static boolean peekaheadFindsSpacesAndComma ( ArrayList /* of ACIPString */ scan ,
int pos ) {
int sz = scan . size ( ) ;
while ( pos < sz ) {
ACIPString s = ( ACIPString ) scan . get ( pos + + ) ;
if ( s . getType ( ) = = ACIPString . TIBETAN_PUNCTUATION & & s . getText ( ) . equals ( " " ) ) {
// keep going
} else {
if ( s . getType ( ) = = ACIPString . TIBETAN_PUNCTUATION & & s . getText ( ) . equals ( " , " ) ) {
return true ;
} else {
return false ;
}
}
}
return false ;
}
2003-08-31 16:06:35 +00:00
private static boolean convertTo ( boolean toUnicode , // else to TMW
ArrayList scan ,
OutputStream out , // for toUnicode mode
TibetanDocument tdoc , // for !toUnicode mode
StringBuffer errors ,
StringBuffer warnings ,
boolean writeWarningsToOut ,
String warningLevel )
throws IOException
2003-08-18 02:38:54 +00:00
{
int sz = scan . size ( ) ;
boolean hasErrors = false ;
2003-08-31 16:06:35 +00:00
BufferedWriter writer = null ;
if ( toUnicode )
writer
= new BufferedWriter ( new OutputStreamWriter ( out , " UTF-8 " ) ) ;
2003-09-04 04:04:21 +00:00
boolean lastGuyWasNonPunct = false ;
TStackList lastGuy = null ;
2003-08-18 02:38:54 +00:00
for ( int i = 0 ; i < sz ; i + + ) {
ACIPString s = ( ACIPString ) scan . get ( i ) ;
int stype = s . getType ( ) ;
if ( stype = = ACIPString . ERROR ) {
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = false ;
lastGuy = null ;
2003-08-18 02:38:54 +00:00
hasErrors = true ;
2003-08-31 16:06:35 +00:00
String text = " [#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s . getText ( ) + " ] " ;
if ( null ! = writer ) writer . write ( text ) ;
if ( null ! = tdoc ) tdoc . appendRoman ( text ) ;
2003-08-24 06:40:53 +00:00
} else if ( stype = = ACIPString . WARNING ) {
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = false ;
lastGuy = null ;
2003-08-24 06:40:53 +00:00
if ( writeWarningsToOut ) {
2003-08-31 16:06:35 +00:00
String text = " [#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s . getText ( ) + " ] " ;
if ( null ! = writer ) writer . write ( text ) ;
if ( null ! = tdoc ) tdoc . appendRoman ( text ) ;
2003-08-24 06:40:53 +00:00
}
2003-08-31 16:06:35 +00:00
// DLC NOW: Warning: We're going with {'}{R}{DA}, but only because our knowledge of prefix rules says that {'}{R+DA} is not a legal Tibetan tsheg bar ("syllable")
2003-08-24 06:40:53 +00:00
if ( null ! = warnings ) {
warnings . append ( " Warning: Lexical warning: " ) ;
warnings . append ( s . getText ( ) ) ;
warnings . append ( '\n' ) ;
}
2003-08-18 02:38:54 +00:00
} else {
if ( s . isLatin ( stype ) ) {
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = false ;
lastGuy = null ;
2003-08-31 16:06:35 +00:00
String text
= ( ( ( stype = = ACIPString . FOLIO_MARKER ) ? " { " : " " )
+ s . getText ( )
+ ( ( stype = = ACIPString . FOLIO_MARKER ) ? " } " : " " ) ) ;
if ( null ! = writer ) writer . write ( text ) ;
if ( null ! = tdoc ) tdoc . appendRoman ( text ) ;
2003-08-18 02:38:54 +00:00
} else {
String unicode = null ;
2003-08-31 16:06:35 +00:00
DuffCode [ ] duff = null ;
2003-08-18 02:38:54 +00:00
if ( stype = = ACIPString . TIBETAN_NON_PUNCTUATION ) {
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = true ;
2003-08-18 02:38:54 +00:00
TPairList pl = TPairListFactory . breakACIPIntoChunks ( s . getText ( ) ) ;
String acipError ;
if ( ( acipError = pl . getACIPError ( ) ) ! = null ) {
hasErrors = true ;
String errorMessage = " [#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR ( \" SYLLABLE \" ) " + s . getText ( ) + " HAS THESE ERRORS: " + acipError + " ] " ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( errorMessage ) ;
if ( null ! = tdoc ) tdoc . appendRoman ( errorMessage ) ;
2003-08-18 02:38:54 +00:00
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
} else {
TParseTree pt = pl . getParseTree ( ) ;
if ( null = = pt ) {
hasErrors = true ;
String errorMessage = " [#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR ( \" SYLLABLE \" ) " + s . getText ( ) + " IS ESSENTIALLY NOTHING.] " ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( errorMessage ) ;
if ( null ! = tdoc ) tdoc . appendRoman ( errorMessage ) ;
2003-08-18 02:38:54 +00:00
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
} else {
TStackList sl = pt . getBestParse ( ) ;
if ( null = = sl ) {
hasErrors = true ;
String errorMessage = " [#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR ( \" SYLLABLE \" ) " + s . getText ( ) + " HAS NO LEGAL PARSES.] " ;
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) writer . write ( errorMessage ) ;
if ( null ! = tdoc ) tdoc . appendRoman ( errorMessage ) ;
2003-08-18 02:38:54 +00:00
if ( null ! = errors )
errors . append ( errorMessage + " \ n " ) ;
} else {
2003-09-04 04:04:21 +00:00
lastGuy = sl ;
2003-08-23 22:03:37 +00:00
String warning
2003-08-24 06:40:53 +00:00
= pt . getWarning ( warningLevel ,
2003-08-23 22:03:37 +00:00
pl ,
s . getText ( ) ) ;
if ( null ! = warning ) {
if ( writeWarningsToOut ) {
2003-08-31 16:06:35 +00:00
String text
= ( " [#WARNING CONVERTING ACIP DOCUMENT: "
+ warning + " ] " ) ;
if ( null ! = writer ) writer . write ( text ) ;
if ( null ! = tdoc ) tdoc . appendRoman ( text ) ;
2003-08-23 22:03:37 +00:00
}
if ( null ! = warnings ) {
warnings . append ( warning ) ;
warnings . append ( '\n' ) ;
}
}
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) {
unicode = sl . getUnicode ( ) ;
if ( null = = unicode ) throw new Error ( " FIXME: make this an assertion 4 " ) ;
}
if ( null ! = tdoc ) {
duff = sl . getDuff ( ) ;
if ( 0 = = duff . length ) {
throw new Error ( " No DuffCodes for stack list " + sl ) ; // FIXME: make this an assertion
}
}
2003-08-18 02:38:54 +00:00
}
}
}
} else {
2003-08-31 16:06:35 +00:00
if ( stype = = ACIPString . START_SLASH ) {
if ( null ! = writer ) unicode = " \ u0F3C " ;
if ( null ! = tdoc ) duff = new DuffCode [ ] { TibetanMachineWeb . getGlyph ( " ( " ) } ;
} else if ( stype = = ACIPString . END_SLASH ) {
if ( null ! = writer ) unicode = " \ u0F3D " ;
if ( null ! = tdoc ) duff = new DuffCode [ ] { TibetanMachineWeb . getGlyph ( " ) " ) } ;
} else {
2003-09-04 04:04:21 +00:00
// For ACIP, tshegs are used as both
// tshegs and whitespace. We treat a
// space as a tsheg if and only if it
// occurs after TIBETAN_NON_PUNCTUATION.
// But "SHIG ,MDO" is an example of a
// special case, needed because a tsheg is
// not used after a GA in Tibetan
// typesetting.
boolean done = false ;
// DLC what about after numbers? marks?
if ( s . getText ( ) . equals ( " " ) ) {
TPairList lpl = null ;
if ( ! lastGuyWasNonPunct
| | ( null ! = lastGuy
& & ( lpl = lastGuy . get ( lastGuy . size ( ) - 1 ) ) . size ( ) = = 1
& & lpl . get ( 0 ) . getLeft ( ) . equals ( " G " )
& & // it's (G . anything)
// followed by some number
// of spaces (at least one,
// this one) and then a
// comma:
peekaheadFindsSpacesAndComma ( scan , i + 1 ) ) ) {
if ( null ! = writer ) {
unicode = " " ;
done = true ;
}
if ( null ! = tdoc ) {
tdoc . appendRoman ( " " ) ;
continue ;
}
2003-08-31 16:06:35 +00:00
}
2003-09-04 04:04:21 +00:00
}
if ( ! done ) {
if ( null ! = writer ) unicode = ACIPRules . getUnicodeFor ( s . getText ( ) , false ) ;
if ( null ! = tdoc ) {
if ( s . getText ( ) . equals ( " \ r " ) | | s . getText ( ) . equals ( " \ t " ) | | s . getText ( ) . equals ( " \ n " ) ) {
tdoc . appendRoman ( s . getText ( ) ) ;
continue ;
}
else {
String wy = ACIPRules . getWylieForACIPOther ( s . getText ( ) ) ;
if ( null = = wy ) throw new Error ( " No wylie for ACIP " + s . getText ( ) ) ;
duff = new DuffCode [ ] { TibetanMachineWeb . getGlyph ( wy ) } ;
}
2003-08-31 16:06:35 +00:00
}
}
}
if ( null ! = writer & & null = = unicode )
throw new Error ( " FIXME: make this an assertion 1 " ) ;
if ( null ! = tdoc & & ( null = = duff | | 0 = = duff . length ) )
throw new Error ( " FIXME: make this an assertion 2 " ) ;
2003-09-04 04:04:21 +00:00
lastGuyWasNonPunct = false ;
lastGuy = null ;
2003-08-18 02:38:54 +00:00
}
2003-08-31 16:06:35 +00:00
if ( null ! = writer & & null ! = unicode ) writer . write ( unicode ) ;
if ( null ! = tdoc ) {
if ( null ! = duff & & 0 ! = duff . length ) {
tdoc . appendDuffCodes ( duff ) ;
// DLC NOW FIXME: use TibTextUtils.getVowel logic to make the output beautiful.
} else {
// this happens when you have an
// [#ERROR]-producing tsheg bar.
// System.err.println("Bad tsheg bar with ACIP {" + s.getText() + "}");
}
2003-08-18 02:38:54 +00:00
}
}
}
}
2003-08-31 16:06:35 +00:00
if ( null ! = writer ) {
writer . close ( ) ;
}
2003-08-18 02:38:54 +00:00
return ! hasErrors ;
}
}
// DLC FIXME: putting Tibetan in black, Sanskrit in green, and Latin
// in yellow would help you quickly decide if ZHIGN maybe should've
// been ZHING.