Jskad's converter now has ACIP-to-Unicode built in. There are known
bugs; it is pre-alpha.  It's usable, though, and finds tons of errors
in ACIP input files, with the user deciding just how pedantic to be.
The biggest outstanding bug is the silent one: treating { } (a space) as
a tsheg instead of as whitespace when we ought to know better.
This commit is contained in:
dchandler 2003-08-24 06:40:53 +00:00
parent d5ad760230
commit 1982c5847b
11 changed files with 355 additions and 244 deletions

View file

@ -26,6 +26,10 @@ import javax.swing.text.StyleConstants;
import org.thdl.util.*;
import org.thdl.tib.text.*;
import org.thdl.tib.text.ttt.ACIPConverter;
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
import java.util.ArrayList;
/** TibetanConverter is a command-line utility for converting to
* and from Tibetan Machine Web (TMW). It converts TMW to Wylie, to
* Unicode, or to Tibetan Machine (TM). It also converts TM to TMW.
@ -66,6 +70,7 @@ public class TibetanConverter implements FontConverterConstants {
try {
boolean convertToUnicodeMode = false;
boolean convertToTMMode = false;
boolean convertACIPToUniMode = false;
boolean convertToTMWMode = false;
boolean convertToWylieMode = false;
boolean findSomeNonTMWMode = false;
@ -84,6 +89,8 @@ public class TibetanConverter implements FontConverterConstants {
= args[0].equals("--to-tibetan-machine"))
|| (convertToTMWMode
= args[0].equals("--to-tibetan-machine-web"))
|| (convertACIPToUniMode
= args[0].equals("--acip-to-unicode"))
|| (convertToUnicodeMode
= args[0].equals("--to-unicode"))
|| (convertToWylieMode
@ -98,6 +105,7 @@ public class TibetanConverter implements FontConverterConstants {
out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
out.println(" | --to-unicode | --to-wylie] RTF_file");
out.println(" | TibetanConverter --acip-to-unicode TXT_file");
out.println(" | TibetanConverter [--version | -v | --help | -h]");
out.println("");
out.println("Distributed under the terms of the THDL Open Community License Version 1.0.");
@ -105,6 +113,11 @@ public class TibetanConverter implements FontConverterConstants {
out.println("Usage:");
out.println(" -v | --version for version info");
out.println(" -h | --help for this message");
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
out.println(" --acip-to-unicode to convert ACIP text file to Unicode text file");
out.println(" --find-all-non-tmw to locate all characters in the input document that are");
out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
@ -113,14 +126,12 @@ public class TibetanConverter implements FontConverterConstants {
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --find-some-non-tm to locate all distinct characters in the input document");
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
out.println("");
out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
out.println(" In --to... and --acip-to... modes, needs one argument, the name of the");
out.println(" TibetanMachineWeb RTF");
out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the");
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web) or the name of the");
out.println(" ACIP text file (for --acip-to-unicode). Writes the");
out.println(" result to standard output (after dealing with the curly brace problem if");
out.println(" the input is TibetanMachineWeb). Exit code is zero on success, 42 if some");
out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
@ -135,11 +146,10 @@ public class TibetanConverter implements FontConverterConstants {
out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
out.println(" conversion so that you have confidence in the conversion's correctness.");
// DLC add Wylie->TMW mode.
return 77;
}
if (args[0].equals("--version") || args[0].equals("-v")) {
out.println("TibetanConverter version 0.82");
out.println("TibetanConverter version 0.83");
out.println("Compiled at "
+ ThdlVersion.getTimeOfCompilation());
return 77;
@ -168,12 +178,15 @@ public class TibetanConverter implements FontConverterConstants {
conversionTag = TMW_TO_UNI;
} else if (convertToTMWMode) {
conversionTag = TM_TO_TMW;
} else if (convertACIPToUniMode) {
conversionTag = ACIP_TO_UNI;
} else {
ThdlDebug.verify(convertToTMMode);
conversionTag = TMW_TO_TM;
}
}
return reallyConvert(in, out, conversionTag);
return reallyConvert(in, out, conversionTag, "Most" // DLC make me configurable
);
} catch (ThdlLazyException e) {
out.println("TibetanConverter has a BUG:");
e.getRealException().printStackTrace(out);
@ -190,132 +203,155 @@ public class TibetanConverter implements FontConverterConstants {
number of strings -- see the code. Returns an appropriate
return code so that TibetanConverter's usage message is
honored. */
static int reallyConvert(InputStream in, PrintStream out, String ct) {
TibetanDocument tdoc = new TibetanDocument();
{
SimpleAttributeSet ras = new SimpleAttributeSet();
StyleConstants.setFontFamily(ras,
ThdlOptions.getStringOption("thdl.default.roman.font.face",
"Serif"));
StyleConstants.setFontSize(ras,
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
14));
tdoc.setRomanAttributeSet(ras);
}
try {
// Read in the rtf file.
if (debug) System.err.println("Start: reading in old RTF file");
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
in = new RTFFixerInputStream(in);
(new RTFEditorKit()).read(in, tdoc, 0);
if (debug) System.err.println("End : reading in old RTF file");
} catch (Exception e) {
out.println("TibetanConverter:\n"
+ rtfErrorMessage);
return 3;
}
try {
in.close();
} catch (IOException e) {
// silently ignore; we don't care about the input so much...
ThdlDebug.noteIffyCode();
}
if (FIND_ALL_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_ALL_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else { // conversion {to Wylie or TM} mode
// Fix curly braces in the entire document if the input is TMW:
if (TM_TO_TMW != ct) {
// DLC make me optional
if (debug) System.err.println("Start: solving curly brace problem");
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
if (debug) System.err.println("End : solving curly brace problem");
}
int exitCode = 0;
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
+ ((TMW_TO_UNI == ct) ? 1 : 0)
+ ((TM_TO_TMW == ct) ? 1 : 0)
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
== 1);
long numAttemptedReplacements[] = new long[] { 0 };
if (TMW_TO_WYLIE == ct) {
// Convert to THDL Wylie:
if (!tdoc.toWylie(0,
tdoc.getLength(),
numAttemptedReplacements)) {
exitCode = 44;
}
} else if (TMW_TO_UNI == ct) {
StringBuffer errors = new StringBuffer();
// Convert to Unicode:
if (tdoc.convertToUnicode(0,
tdoc.getLength(),
errors,
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else if (TM_TO_TMW == ct) {
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachineWeb:
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else {
ThdlDebug.verify(TMW_TO_TM == ct);
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachine:
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
}
// Write to standard output the result:
static int reallyConvert(InputStream in, PrintStream out, String ct,
String warningLevel) {
if (ACIP_TO_UNI == ct) {
try {
tdoc.writeRTFOutputStream(out);
ArrayList al = ACIPTshegBarScanner.scanStream(in, null,
250 - 1 // DLC FIXME: make me configurable
);
if (null == al)
return 47;
StringBuffer warnings = new StringBuffer();
boolean embeddedWarnings = (warningLevel != "None");
if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
embeddedWarnings,
warningLevel))
return 46;
if (embeddedWarnings && warnings.length() > 0)
return 45;
else
return 0;
} catch (IOException e) {
exitCode = 40;
return 48;
}
} else {
TibetanDocument tdoc = new TibetanDocument();
{
SimpleAttributeSet ras = new SimpleAttributeSet();
StyleConstants.setFontFamily(ras,
ThdlOptions.getStringOption("thdl.default.roman.font.face",
"Serif"));
StyleConstants.setFontSize(ras,
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
14));
tdoc.setRomanAttributeSet(ras);
}
try {
// Read in the rtf file.
if (debug) System.err.println("Start: reading in old RTF file");
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
in = new RTFFixerInputStream(in);
(new RTFEditorKit()).read(in, tdoc, 0);
if (debug) System.err.println("End : reading in old RTF file");
} catch (Exception e) {
out.println("TibetanConverter:\n"
+ rtfErrorMessage);
return 3;
}
if (out.checkError())
exitCode = 41;
if (numAttemptedReplacements[0] < 1)
exitCode = 43;
return exitCode;
try {
in.close();
} catch (IOException e) {
// silently ignore; we don't care about the input so much...
ThdlDebug.noteIffyCode();
}
if (FIND_ALL_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_ALL_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else { // conversion {to Wylie or TM} mode
// Fix curly braces in the entire document if the input is TMW:
if (TM_TO_TMW != ct) {
// DLC make me optional
if (debug) System.err.println("Start: solving curly brace problem");
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
if (debug) System.err.println("End : solving curly brace problem");
}
int exitCode = 0;
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
+ ((TMW_TO_UNI == ct) ? 1 : 0)
+ ((TM_TO_TMW == ct) ? 1 : 0)
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
== 1);
long numAttemptedReplacements[] = new long[] { 0 };
if (TMW_TO_WYLIE == ct) {
// Convert to THDL Wylie:
if (!tdoc.toWylie(0,
tdoc.getLength(),
numAttemptedReplacements)) {
exitCode = 44;
}
} else if (TMW_TO_UNI == ct) {
StringBuffer errors = new StringBuffer();
// Convert to Unicode:
if (tdoc.convertToUnicode(0,
tdoc.getLength(),
errors,
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else if (TM_TO_TMW == ct) {
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachineWeb:
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else {
ThdlDebug.verify(TMW_TO_TM == ct);
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachine:
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
}
// Write to standard output the result:
try {
tdoc.writeRTFOutputStream(out);
} catch (IOException e) {
exitCode = 40;
}
if (out.checkError())
exitCode = 41;
if (numAttemptedReplacements[0] < 1)
exitCode = 43;
return exitCode;
}
}
}
}