Jskad's converter now has ACIP-to-Unicode conversion built in. There are known
bugs; it is pre-alpha. It is usable, though, and it finds tons of errors in ACIP input files, with the user deciding just how pedantic to be. The biggest outstanding bug is the silent one: treating { } (a space) as a tsheg instead of as whitespace in cases where we ought to know better.
This commit is contained in:
parent
d5ad760230
commit
1982c5847b
11 changed files with 355 additions and 244 deletions
|
@ -26,6 +26,10 @@ import javax.swing.text.StyleConstants;
|
|||
import org.thdl.util.*;
|
||||
import org.thdl.tib.text.*;
|
||||
|
||||
import org.thdl.tib.text.ttt.ACIPConverter;
|
||||
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** TibetanConverter is a command-line utility for converting to
|
||||
* and from Tibetan Machine Web (TMW). It converts TMW to Wylie, to
|
||||
* Unicode, or to Tibetan Machine (TM). It also converts TM to TMW.
|
||||
|
@ -66,6 +70,7 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
try {
|
||||
boolean convertToUnicodeMode = false;
|
||||
boolean convertToTMMode = false;
|
||||
boolean convertACIPToUniMode = false;
|
||||
boolean convertToTMWMode = false;
|
||||
boolean convertToWylieMode = false;
|
||||
boolean findSomeNonTMWMode = false;
|
||||
|
@ -84,6 +89,8 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
= args[0].equals("--to-tibetan-machine"))
|
||||
|| (convertToTMWMode
|
||||
= args[0].equals("--to-tibetan-machine-web"))
|
||||
|| (convertACIPToUniMode
|
||||
= args[0].equals("--acip-to-unicode"))
|
||||
|| (convertToUnicodeMode
|
||||
= args[0].equals("--to-unicode"))
|
||||
|| (convertToWylieMode
|
||||
|
@ -98,6 +105,7 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
|
||||
out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
|
||||
out.println(" | --to-unicode | --to-wylie] RTF_file");
|
||||
out.println(" | TibetanConverter --acip-to-unicode TXT_file");
|
||||
out.println(" | TibetanConverter [--version | -v | --help | -h]");
|
||||
out.println("");
|
||||
out.println("Distributed under the terms of the THDL Open Community License Version 1.0.");
|
||||
|
@ -105,6 +113,11 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
out.println("Usage:");
|
||||
out.println(" -v | --version for version info");
|
||||
out.println(" -h | --help for this message");
|
||||
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
|
||||
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
|
||||
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
|
||||
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
|
||||
out.println(" --acip-to-unicode to convert ACIP text file to Unicode text file");
|
||||
out.println(" --find-all-non-tmw to locate all characters in the input document that are");
|
||||
out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
|
||||
out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
|
||||
|
@ -113,14 +126,12 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
|
||||
out.println(" --find-some-non-tm to locate all distinct characters in the input document");
|
||||
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
|
||||
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
|
||||
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
|
||||
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
|
||||
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
|
||||
out.println("");
|
||||
out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
|
||||
out.println(" In --to... and --acip-to... modes, needs one argument, the name of the");
|
||||
out.println(" TibetanMachineWeb RTF");
|
||||
out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
|
||||
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the");
|
||||
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web) or the name of the");
|
||||
out.println(" ACIP text file (for --acip-to-unicode). Writes the");
|
||||
out.println(" result to standard output (after dealing with the curly brace problem if");
|
||||
out.println(" the input is TibetanMachineWeb). Exit code is zero on success, 42 if some");
|
||||
out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
|
||||
|
@ -135,11 +146,10 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
|
||||
out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
|
||||
out.println(" conversion so that you have confidence in the conversion's correctness.");
|
||||
// DLC add Wylie->TMW mode.
|
||||
return 77;
|
||||
}
|
||||
if (args[0].equals("--version") || args[0].equals("-v")) {
|
||||
out.println("TibetanConverter version 0.82");
|
||||
out.println("TibetanConverter version 0.83");
|
||||
out.println("Compiled at "
|
||||
+ ThdlVersion.getTimeOfCompilation());
|
||||
return 77;
|
||||
|
@ -168,12 +178,15 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
conversionTag = TMW_TO_UNI;
|
||||
} else if (convertToTMWMode) {
|
||||
conversionTag = TM_TO_TMW;
|
||||
} else if (convertACIPToUniMode) {
|
||||
conversionTag = ACIP_TO_UNI;
|
||||
} else {
|
||||
ThdlDebug.verify(convertToTMMode);
|
||||
conversionTag = TMW_TO_TM;
|
||||
}
|
||||
}
|
||||
return reallyConvert(in, out, conversionTag);
|
||||
return reallyConvert(in, out, conversionTag, "Most" // DLC make me configurable
|
||||
);
|
||||
} catch (ThdlLazyException e) {
|
||||
out.println("TibetanConverter has a BUG:");
|
||||
e.getRealException().printStackTrace(out);
|
||||
|
@ -190,132 +203,155 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
number of strings -- see the code. Returns an appropriate
|
||||
return code so that TibetanConverter's usage message is
|
||||
honored. */
|
||||
static int reallyConvert(InputStream in, PrintStream out, String ct) {
|
||||
TibetanDocument tdoc = new TibetanDocument();
|
||||
{
|
||||
SimpleAttributeSet ras = new SimpleAttributeSet();
|
||||
StyleConstants.setFontFamily(ras,
|
||||
ThdlOptions.getStringOption("thdl.default.roman.font.face",
|
||||
"Serif"));
|
||||
StyleConstants.setFontSize(ras,
|
||||
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
|
||||
14));
|
||||
tdoc.setRomanAttributeSet(ras);
|
||||
}
|
||||
try {
|
||||
// Read in the rtf file.
|
||||
if (debug) System.err.println("Start: reading in old RTF file");
|
||||
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
|
||||
in = new RTFFixerInputStream(in);
|
||||
(new RTFEditorKit()).read(in, tdoc, 0);
|
||||
if (debug) System.err.println("End : reading in old RTF file");
|
||||
} catch (Exception e) {
|
||||
out.println("TibetanConverter:\n"
|
||||
+ rtfErrorMessage);
|
||||
return 3;
|
||||
}
|
||||
|
||||
try {
|
||||
in.close();
|
||||
} catch (IOException e) {
|
||||
// silently ignore; we don't care about the input so much...
|
||||
ThdlDebug.noteIffyCode();
|
||||
}
|
||||
|
||||
|
||||
if (FIND_ALL_NON_TMW == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findAllNonTMWCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else if (FIND_SOME_NON_TMW == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findSomeNonTMWCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else if (FIND_SOME_NON_TM == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findSomeNonTMCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else if (FIND_ALL_NON_TM == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findAllNonTMCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else { // conversion {to Wylie or TM} mode
|
||||
// Fix curly braces in the entire document if the input is TMW:
|
||||
if (TM_TO_TMW != ct) {
|
||||
// DLC make me optional
|
||||
if (debug) System.err.println("Start: solving curly brace problem");
|
||||
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
|
||||
if (debug) System.err.println("End : solving curly brace problem");
|
||||
}
|
||||
|
||||
int exitCode = 0;
|
||||
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
|
||||
+ ((TMW_TO_UNI == ct) ? 1 : 0)
|
||||
+ ((TM_TO_TMW == ct) ? 1 : 0)
|
||||
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
|
||||
== 1);
|
||||
long numAttemptedReplacements[] = new long[] { 0 };
|
||||
if (TMW_TO_WYLIE == ct) {
|
||||
// Convert to THDL Wylie:
|
||||
if (!tdoc.toWylie(0,
|
||||
tdoc.getLength(),
|
||||
numAttemptedReplacements)) {
|
||||
exitCode = 44;
|
||||
}
|
||||
} else if (TMW_TO_UNI == ct) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
// Convert to Unicode:
|
||||
if (tdoc.convertToUnicode(0,
|
||||
tdoc.getLength(),
|
||||
errors,
|
||||
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
|
||||
numAttemptedReplacements)) {
|
||||
System.err.println(errors);
|
||||
exitCode = 42;
|
||||
}
|
||||
} else if (TM_TO_TMW == ct) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
// Convert to TibetanMachineWeb:
|
||||
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
|
||||
numAttemptedReplacements)) {
|
||||
System.err.println(errors);
|
||||
exitCode = 42;
|
||||
}
|
||||
} else {
|
||||
ThdlDebug.verify(TMW_TO_TM == ct);
|
||||
StringBuffer errors = new StringBuffer();
|
||||
// Convert to TibetanMachine:
|
||||
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
|
||||
numAttemptedReplacements)) {
|
||||
System.err.println(errors);
|
||||
exitCode = 42;
|
||||
}
|
||||
}
|
||||
|
||||
// Write to standard output the result:
|
||||
static int reallyConvert(InputStream in, PrintStream out, String ct,
|
||||
String warningLevel) {
|
||||
if (ACIP_TO_UNI == ct) {
|
||||
try {
|
||||
tdoc.writeRTFOutputStream(out);
|
||||
ArrayList al = ACIPTshegBarScanner.scanStream(in, null,
|
||||
250 - 1 // DLC FIXME: make me configurable
|
||||
);
|
||||
if (null == al)
|
||||
return 47;
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
boolean embeddedWarnings = (warningLevel != "None");
|
||||
if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
|
||||
embeddedWarnings,
|
||||
warningLevel))
|
||||
return 46;
|
||||
if (embeddedWarnings && warnings.length() > 0)
|
||||
return 45;
|
||||
else
|
||||
return 0;
|
||||
} catch (IOException e) {
|
||||
exitCode = 40;
|
||||
return 48;
|
||||
}
|
||||
} else {
|
||||
TibetanDocument tdoc = new TibetanDocument();
|
||||
{
|
||||
SimpleAttributeSet ras = new SimpleAttributeSet();
|
||||
StyleConstants.setFontFamily(ras,
|
||||
ThdlOptions.getStringOption("thdl.default.roman.font.face",
|
||||
"Serif"));
|
||||
StyleConstants.setFontSize(ras,
|
||||
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
|
||||
14));
|
||||
tdoc.setRomanAttributeSet(ras);
|
||||
}
|
||||
try {
|
||||
// Read in the rtf file.
|
||||
if (debug) System.err.println("Start: reading in old RTF file");
|
||||
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
|
||||
in = new RTFFixerInputStream(in);
|
||||
(new RTFEditorKit()).read(in, tdoc, 0);
|
||||
if (debug) System.err.println("End : reading in old RTF file");
|
||||
} catch (Exception e) {
|
||||
out.println("TibetanConverter:\n"
|
||||
+ rtfErrorMessage);
|
||||
return 3;
|
||||
}
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
if (numAttemptedReplacements[0] < 1)
|
||||
exitCode = 43;
|
||||
|
||||
return exitCode;
|
||||
try {
|
||||
in.close();
|
||||
} catch (IOException e) {
|
||||
// silently ignore; we don't care about the input so much...
|
||||
ThdlDebug.noteIffyCode();
|
||||
}
|
||||
|
||||
|
||||
if (FIND_ALL_NON_TMW == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findAllNonTMWCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else if (FIND_SOME_NON_TMW == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findSomeNonTMWCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else if (FIND_SOME_NON_TM == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findSomeNonTMCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else if (FIND_ALL_NON_TM == ct) {
|
||||
// 0, -1 is the entire document.
|
||||
int exitCode
|
||||
= tdoc.findAllNonTMCharacters(0, -1, out);
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
return exitCode;
|
||||
} else { // conversion {to Wylie or TM} mode
|
||||
// Fix curly braces in the entire document if the input is TMW:
|
||||
if (TM_TO_TMW != ct) {
|
||||
// DLC make me optional
|
||||
if (debug) System.err.println("Start: solving curly brace problem");
|
||||
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
|
||||
if (debug) System.err.println("End : solving curly brace problem");
|
||||
}
|
||||
|
||||
int exitCode = 0;
|
||||
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
|
||||
+ ((TMW_TO_UNI == ct) ? 1 : 0)
|
||||
+ ((TM_TO_TMW == ct) ? 1 : 0)
|
||||
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
|
||||
== 1);
|
||||
long numAttemptedReplacements[] = new long[] { 0 };
|
||||
if (TMW_TO_WYLIE == ct) {
|
||||
// Convert to THDL Wylie:
|
||||
if (!tdoc.toWylie(0,
|
||||
tdoc.getLength(),
|
||||
numAttemptedReplacements)) {
|
||||
exitCode = 44;
|
||||
}
|
||||
} else if (TMW_TO_UNI == ct) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
// Convert to Unicode:
|
||||
if (tdoc.convertToUnicode(0,
|
||||
tdoc.getLength(),
|
||||
errors,
|
||||
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
|
||||
numAttemptedReplacements)) {
|
||||
System.err.println(errors);
|
||||
exitCode = 42;
|
||||
}
|
||||
} else if (TM_TO_TMW == ct) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
// Convert to TibetanMachineWeb:
|
||||
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
|
||||
numAttemptedReplacements)) {
|
||||
System.err.println(errors);
|
||||
exitCode = 42;
|
||||
}
|
||||
} else {
|
||||
ThdlDebug.verify(TMW_TO_TM == ct);
|
||||
StringBuffer errors = new StringBuffer();
|
||||
// Convert to TibetanMachine:
|
||||
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
|
||||
numAttemptedReplacements)) {
|
||||
System.err.println(errors);
|
||||
exitCode = 42;
|
||||
}
|
||||
}
|
||||
|
||||
// Write to standard output the result:
|
||||
try {
|
||||
tdoc.writeRTFOutputStream(out);
|
||||
} catch (IOException e) {
|
||||
exitCode = 40;
|
||||
}
|
||||
if (out.checkError())
|
||||
exitCode = 41;
|
||||
if (numAttemptedReplacements[0] < 1)
|
||||
exitCode = 43;
|
||||
|
||||
return exitCode;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue