Jskad's converter now has ACIP-to-Unicode built in. There are known
bugs; it is pre-alpha.  It's usable, though, and finds tons of errors
in ACIP input files, with the user deciding just how pedantic to be.
The biggest outstanding bug is a silent one: the converter treats
{ } (a space) as a tsheg instead of as whitespace in cases where it
ought to know better.
This commit is contained in:
dchandler 2003-08-24 06:40:53 +00:00
parent d5ad760230
commit 1982c5847b
11 changed files with 355 additions and 244 deletions

View file

@ -46,11 +46,13 @@ class ConvertDialog extends JDialog
JComboBox choices;
private JComboBox warningLevels;
JTextField oldTextField, newTextField;
JButton browseOld, browseNew, convert, cancel, openDocOld, openDocNew, about;
JLabel type, oldLabel, newLabel;
JLabel oldLabel, newLabel;
String[] choiceNames;
@ -68,6 +70,12 @@ class ConvertDialog extends JDialog
public void theRealActionPerformed(ActionEvent e) {
ConvertDialog.this.theRealActionPerformed(e);
}};
/** Enables the warning-level combo box while the ACIP-to-Unicode
 *  conversion is the selected item of <code>choices</code>, and
 *  disables it for any other selection. */
private void updateWarningLevels() {
    // Compare by value: getSelectedItem() returns an Object, and
    // equals() does not depend on the selected item being the very
    // same String instance as the ACIP_TO_UNI constant.
    boolean acipSelected = ACIP_TO_UNI.equals(choices.getSelectedItem());
    // setEnabled(boolean) replaces Component.enable()/disable(), which
    // are deprecated and do not go through JComboBox's own
    // setEnabled override.
    this.warningLevels.setEnabled(acipSelected);
}
private void init()
{
jfc = new JFileChooser(controller.getDefaultDirectory());
@ -76,9 +84,17 @@ class ConvertDialog extends JDialog
content = new JPanel(new GridLayout(0,1));
JPanel temp = new JPanel(new FlowLayout(FlowLayout.CENTER,5,5));
type = new JLabel("Type of Conversion: ");
temp.add(type);
temp.add(new JLabel("Type of Conversion: "));
temp.add(choices);
temp.add(Box.createHorizontalStrut(20));
temp.add(new JLabel("Warning Level: "));
this.warningLevels
= new JComboBox(new String[] { "None", "Some", "Most", "All" });
this.warningLevels.setSelectedItem("Most");
this.warningLevels.addActionListener(tal);
updateWarningLevels();
temp.add(warningLevels);
content.add(temp);
temp = new JPanel(new FlowLayout(FlowLayout.CENTER,5,5));
@ -260,7 +276,8 @@ class ConvertDialog extends JDialog
controller.doConversion(this,
origFile,
convertedFile,
(String)choices.getSelectedItem());
(String)choices.getSelectedItem(),
(String)warningLevels.getSelectedItem());
} catch (OutOfMemoryError e) {
JOptionPane.showMessageDialog(this,
"The converter ran out of memory. Please give the\nJVM more memory by using java -XmxYYYm where YYY\nis the amount of memory your system has, or\nsomething close to it. E.g., try\n'java -Xmx512m -jar Jskad.jar'.",
@ -316,7 +333,11 @@ class ConvertDialog extends JDialog
"About",
JOptionPane.PLAIN_MESSAGE);
} else if (cmd.equals("comboBoxChanged")) {
updateNewFileGuess();
JComboBox src = (JComboBox)ae.getSource();
if (src == choices) {
updateNewFileGuess();
updateWarningLevels();
}
}
}
@ -400,7 +421,7 @@ class ConvertDialog extends JDialog
} else { // conversion {to Wylie or TM} mode
if (TMW_TO_WYLIE == ct) {
newFileNamePrefix = suggested_WYLIE_prefix;
} else if (TMW_TO_UNI == ct) {
} else if (TMW_TO_UNI == ct || ACIP_TO_UNI == ct) {
newFileNamePrefix = suggested_TO_UNI_prefix;
} else if (TM_TO_TMW == ct) {
newFileNamePrefix = suggested_TO_TMW_prefix;

View file

@ -48,14 +48,15 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
}
public boolean doConversion(ConvertDialog cd, File oldFile, File newFile,
String whichConversion) {
String whichConversion, String warningLevel) {
PrintStream ps;
try {
returnCode
= TibetanConverter.reallyConvert(new FileInputStream(oldFile),
ps = new PrintStream(new FileOutputStream(newFile),
false),
whichConversion);
whichConversion,
warningLevel);
ps.close();
} catch (FileNotFoundException e) {
returnCode = 39;
@ -89,6 +90,28 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
"Errors in Conversion",
JOptionPane.ERROR_MESSAGE);
return false;
} else if (45 == returnCode) {
if (warningLevel == "None") throw new Error("FIXME: make this an assertion");
JOptionPane.showMessageDialog(cd,
"No errors occurred, but some warnings are embedded in\nthe output as [#WARNING...].",
"Warnings in Conversion",
JOptionPane.ERROR_MESSAGE);
return false;
} else if (46 == returnCode) {
JOptionPane.showMessageDialog(cd,
"Errors occurred, and are embedded in the output\nas [#ERROR...]."
+ ((warningLevel == "None")
? ""
: " Warnings may have occurred; if so,\nthey are embedded in the output as [#WARNING...]."),
"Errors in Conversion",
JOptionPane.ERROR_MESSAGE);
return false;
} else if (47 == returnCode) {
JOptionPane.showMessageDialog(cd,
"So many errors occurred that the document is likely\nEnglish, not Tibetan. No output was produced.",
"Many Errors in Conversion",
JOptionPane.ERROR_MESSAGE);
return false;
} else if (1 == returnCode) {
if (FIND_SOME_NON_TMW == whichConversion
|| FIND_ALL_NON_TMW == whichConversion) {
@ -102,6 +125,8 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
"Something besides TibetanMachine was found; see output file.",
"Not entirely TM",
JOptionPane.PLAIN_MESSAGE);
} else {
throw new Error("Who returned this??");
}
return false;
} else if (0 != returnCode) {
@ -150,6 +175,7 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
try {
final ConvertDialog convDialog;
String[] choices = new String[]{
ACIP_TO_UNI,
TM_TO_TMW,
TMW_TO_UNI,
TMW_TO_WYLIE,

View file

@ -372,6 +372,9 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("bskyUMbs");
ensureKeysGiveCorrectWylie("bskyUMbsHgro ");
ensureKeysGiveCorrectWylie("gyurd", "gyurda");
ensureKeysGiveCorrectWylie("gyur.d");
ensureKeysGiveCorrectWylie("favakakhagangacachajanyatathadanapaphabamatsatshadzawazhaza'ayaralashasahaTaThaDaNaSha");
ensureKeysGiveCorrectWylie("fevekekhegengecechejenyetethedenepephebemetsetshedzewezheze'eyerelesheseheTeTheDeNeShe");
ensureKeysGiveCorrectWylie("fuvukukhugungucuchujunyututhudunupuphubumutsutshudzuwuzhuzu'uyurulushusuhuTuThuDuNuShu");

View file

@ -37,5 +37,6 @@ interface FontConversion
whichConversion, which must be one of the known conversions.
@return true on success, false otherwise */
boolean doConversion(ConvertDialog cd, File oldFile,
File newFile, String whichConversion);
File newFile, String whichConversion,
String warningLevel);
}

View file

@ -26,6 +26,7 @@ import java.awt.*;
@author Nathaniel Garson, Tibetan and Himalayan Digital Library */
interface FontConverterConstants
{
final String ACIP_TO_UNI = "ACIP to Unicode";
final String TM_TO_TMW = "TM to TMW";
final String TMW_TO_UNI = "TMW to Unicode";
final String TMW_TO_WYLIE = "TMW to Wylie";

View file

@ -26,6 +26,10 @@ import javax.swing.text.StyleConstants;
import org.thdl.util.*;
import org.thdl.tib.text.*;
import org.thdl.tib.text.ttt.ACIPConverter;
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
import java.util.ArrayList;
/** TibetanConverter is a command-line utility for converting to
* and from Tibetan Machine Web (TMW). It converts TMW to Wylie, to
* Unicode, or to Tibetan Machine (TM). It also converts TM to TMW.
@ -66,6 +70,7 @@ public class TibetanConverter implements FontConverterConstants {
try {
boolean convertToUnicodeMode = false;
boolean convertToTMMode = false;
boolean convertACIPToUniMode = false;
boolean convertToTMWMode = false;
boolean convertToWylieMode = false;
boolean findSomeNonTMWMode = false;
@ -84,6 +89,8 @@ public class TibetanConverter implements FontConverterConstants {
= args[0].equals("--to-tibetan-machine"))
|| (convertToTMWMode
= args[0].equals("--to-tibetan-machine-web"))
|| (convertACIPToUniMode
= args[0].equals("--acip-to-unicode"))
|| (convertToUnicodeMode
= args[0].equals("--to-unicode"))
|| (convertToWylieMode
@ -98,6 +105,7 @@ public class TibetanConverter implements FontConverterConstants {
out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
out.println(" | --to-unicode | --to-wylie] RTF_file");
out.println(" | TibetanConverter --acip-to-unicode TXT_file");
out.println(" | TibetanConverter [--version | -v | --help | -h]");
out.println("");
out.println("Distributed under the terms of the THDL Open Community License Version 1.0.");
@ -105,6 +113,11 @@ public class TibetanConverter implements FontConverterConstants {
out.println("Usage:");
out.println(" -v | --version for version info");
out.println(" -h | --help for this message");
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
out.println(" --acip-to-unicode to convert ACIP text file to Unicode text file");
out.println(" --find-all-non-tmw to locate all characters in the input document that are");
out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
@ -113,14 +126,12 @@ public class TibetanConverter implements FontConverterConstants {
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --find-some-non-tm to locate all distinct characters in the input document");
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
out.println("");
out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
out.println(" In --to... and --acip-to... modes, needs one argument, the name of the");
out.println(" TibetanMachineWeb RTF");
out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the");
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web) or the name of the");
out.println(" ACIP text file (for --acip-to-unicode). Writes the");
out.println(" result to standard output (after dealing with the curly brace problem if");
out.println(" the input is TibetanMachineWeb). Exit code is zero on success, 42 if some");
out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
@ -135,11 +146,10 @@ public class TibetanConverter implements FontConverterConstants {
out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
out.println(" conversion so that you have confidence in the conversion's correctness.");
// DLC add Wylie->TMW mode.
return 77;
}
if (args[0].equals("--version") || args[0].equals("-v")) {
out.println("TibetanConverter version 0.82");
out.println("TibetanConverter version 0.83");
out.println("Compiled at "
+ ThdlVersion.getTimeOfCompilation());
return 77;
@ -168,12 +178,15 @@ public class TibetanConverter implements FontConverterConstants {
conversionTag = TMW_TO_UNI;
} else if (convertToTMWMode) {
conversionTag = TM_TO_TMW;
} else if (convertACIPToUniMode) {
conversionTag = ACIP_TO_UNI;
} else {
ThdlDebug.verify(convertToTMMode);
conversionTag = TMW_TO_TM;
}
}
return reallyConvert(in, out, conversionTag);
return reallyConvert(in, out, conversionTag, "Most" // DLC make me configurable
);
} catch (ThdlLazyException e) {
out.println("TibetanConverter has a BUG:");
e.getRealException().printStackTrace(out);
@ -190,132 +203,155 @@ public class TibetanConverter implements FontConverterConstants {
number of strings -- see the code. Returns an appropriate
return code so that TibetanConverter's usage message is
honored. */
static int reallyConvert(InputStream in, PrintStream out, String ct) {
TibetanDocument tdoc = new TibetanDocument();
{
SimpleAttributeSet ras = new SimpleAttributeSet();
StyleConstants.setFontFamily(ras,
ThdlOptions.getStringOption("thdl.default.roman.font.face",
"Serif"));
StyleConstants.setFontSize(ras,
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
14));
tdoc.setRomanAttributeSet(ras);
}
try {
// Read in the rtf file.
if (debug) System.err.println("Start: reading in old RTF file");
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
in = new RTFFixerInputStream(in);
(new RTFEditorKit()).read(in, tdoc, 0);
if (debug) System.err.println("End : reading in old RTF file");
} catch (Exception e) {
out.println("TibetanConverter:\n"
+ rtfErrorMessage);
return 3;
}
try {
in.close();
} catch (IOException e) {
// silently ignore; we don't care about the input so much...
ThdlDebug.noteIffyCode();
}
if (FIND_ALL_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_ALL_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else { // conversion {to Wylie or TM} mode
// Fix curly braces in the entire document if the input is TMW:
if (TM_TO_TMW != ct) {
// DLC make me optional
if (debug) System.err.println("Start: solving curly brace problem");
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
if (debug) System.err.println("End : solving curly brace problem");
}
int exitCode = 0;
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
+ ((TMW_TO_UNI == ct) ? 1 : 0)
+ ((TM_TO_TMW == ct) ? 1 : 0)
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
== 1);
long numAttemptedReplacements[] = new long[] { 0 };
if (TMW_TO_WYLIE == ct) {
// Convert to THDL Wylie:
if (!tdoc.toWylie(0,
tdoc.getLength(),
numAttemptedReplacements)) {
exitCode = 44;
}
} else if (TMW_TO_UNI == ct) {
StringBuffer errors = new StringBuffer();
// Convert to Unicode:
if (tdoc.convertToUnicode(0,
tdoc.getLength(),
errors,
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else if (TM_TO_TMW == ct) {
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachineWeb:
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else {
ThdlDebug.verify(TMW_TO_TM == ct);
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachine:
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
}
// Write to standard output the result:
static int reallyConvert(InputStream in, PrintStream out, String ct,
String warningLevel) {
if (ACIP_TO_UNI == ct) {
try {
tdoc.writeRTFOutputStream(out);
ArrayList al = ACIPTshegBarScanner.scanStream(in, null,
250 - 1 // DLC FIXME: make me configurable
);
if (null == al)
return 47;
StringBuffer warnings = new StringBuffer();
boolean embeddedWarnings = (warningLevel != "None");
if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
embeddedWarnings,
warningLevel))
return 46;
if (embeddedWarnings && warnings.length() > 0)
return 45;
else
return 0;
} catch (IOException e) {
exitCode = 40;
return 48;
}
} else {
TibetanDocument tdoc = new TibetanDocument();
{
SimpleAttributeSet ras = new SimpleAttributeSet();
StyleConstants.setFontFamily(ras,
ThdlOptions.getStringOption("thdl.default.roman.font.face",
"Serif"));
StyleConstants.setFontSize(ras,
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
14));
tdoc.setRomanAttributeSet(ras);
}
try {
// Read in the rtf file.
if (debug) System.err.println("Start: reading in old RTF file");
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
in = new RTFFixerInputStream(in);
(new RTFEditorKit()).read(in, tdoc, 0);
if (debug) System.err.println("End : reading in old RTF file");
} catch (Exception e) {
out.println("TibetanConverter:\n"
+ rtfErrorMessage);
return 3;
}
if (out.checkError())
exitCode = 41;
if (numAttemptedReplacements[0] < 1)
exitCode = 43;
return exitCode;
try {
in.close();
} catch (IOException e) {
// silently ignore; we don't care about the input so much...
ThdlDebug.noteIffyCode();
}
if (FIND_ALL_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_ALL_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else { // conversion {to Wylie or TM} mode
// Fix curly braces in the entire document if the input is TMW:
if (TM_TO_TMW != ct) {
// DLC make me optional
if (debug) System.err.println("Start: solving curly brace problem");
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
if (debug) System.err.println("End : solving curly brace problem");
}
int exitCode = 0;
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
+ ((TMW_TO_UNI == ct) ? 1 : 0)
+ ((TM_TO_TMW == ct) ? 1 : 0)
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
== 1);
long numAttemptedReplacements[] = new long[] { 0 };
if (TMW_TO_WYLIE == ct) {
// Convert to THDL Wylie:
if (!tdoc.toWylie(0,
tdoc.getLength(),
numAttemptedReplacements)) {
exitCode = 44;
}
} else if (TMW_TO_UNI == ct) {
StringBuffer errors = new StringBuffer();
// Convert to Unicode:
if (tdoc.convertToUnicode(0,
tdoc.getLength(),
errors,
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else if (TM_TO_TMW == ct) {
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachineWeb:
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else {
ThdlDebug.verify(TMW_TO_TM == ct);
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachine:
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
}
// Write to standard output the result:
try {
tdoc.writeRTFOutputStream(out);
} catch (IOException e) {
exitCode = 40;
}
if (out.checkError())
exitCode = 41;
if (numAttemptedReplacements[0] < 1)
exitCode = 43;
return exitCode;
}
}
}
}