Jskad's converter now has ACIP-to-Unicode conversion built in.

There are known bugs; it is pre-alpha.  It's usable, though, and finds tons of errors
in ACIP input files, with the user deciding just how pedantic to be.
The biggest outstanding bug is the silent one: treating a space (written { } in
ACIP) as a tsheg instead of as whitespace in cases where we ought to know better.
This commit is contained in:
dchandler 2003-08-24 06:40:53 +00:00
parent d5ad760230
commit 1982c5847b
11 changed files with 355 additions and 244 deletions

View file

@ -46,11 +46,13 @@ class ConvertDialog extends JDialog
JComboBox choices; JComboBox choices;
private JComboBox warningLevels;
JTextField oldTextField, newTextField; JTextField oldTextField, newTextField;
JButton browseOld, browseNew, convert, cancel, openDocOld, openDocNew, about; JButton browseOld, browseNew, convert, cancel, openDocOld, openDocNew, about;
JLabel type, oldLabel, newLabel; JLabel oldLabel, newLabel;
String[] choiceNames; String[] choiceNames;
@ -68,6 +70,12 @@ class ConvertDialog extends JDialog
public void theRealActionPerformed(ActionEvent e) { public void theRealActionPerformed(ActionEvent e) {
ConvertDialog.this.theRealActionPerformed(e); ConvertDialog.this.theRealActionPerformed(e);
}}; }};
/** Enables the warning-level combo box if and only if the
 *  currently selected conversion is ACIP-to-Unicode; for every
 *  other selection (including no selection at all) the combo box
 *  is disabled. */
private void updateWarningLevels() {
    // equals() rather than == so that the check does not depend on
    // the selected item being the very same String instance as the
    // constant; it is also false, not an error, for a null selection.
    boolean acipSelected = ACIP_TO_UNI.equals(choices.getSelectedItem());
    // setEnabled(boolean) is the documented replacement for the
    // deprecated Component.enable()/disable().
    this.warningLevels.setEnabled(acipSelected);
}
private void init() private void init()
{ {
jfc = new JFileChooser(controller.getDefaultDirectory()); jfc = new JFileChooser(controller.getDefaultDirectory());
@ -76,9 +84,17 @@ class ConvertDialog extends JDialog
content = new JPanel(new GridLayout(0,1)); content = new JPanel(new GridLayout(0,1));
JPanel temp = new JPanel(new FlowLayout(FlowLayout.CENTER,5,5)); JPanel temp = new JPanel(new FlowLayout(FlowLayout.CENTER,5,5));
type = new JLabel("Type of Conversion: "); temp.add(new JLabel("Type of Conversion: "));
temp.add(type);
temp.add(choices); temp.add(choices);
temp.add(Box.createHorizontalStrut(20));
temp.add(new JLabel("Warning Level: "));
this.warningLevels
= new JComboBox(new String[] { "None", "Some", "Most", "All" });
this.warningLevels.setSelectedItem("Most");
this.warningLevels.addActionListener(tal);
updateWarningLevels();
temp.add(warningLevels);
content.add(temp); content.add(temp);
temp = new JPanel(new FlowLayout(FlowLayout.CENTER,5,5)); temp = new JPanel(new FlowLayout(FlowLayout.CENTER,5,5));
@ -260,7 +276,8 @@ class ConvertDialog extends JDialog
controller.doConversion(this, controller.doConversion(this,
origFile, origFile,
convertedFile, convertedFile,
(String)choices.getSelectedItem()); (String)choices.getSelectedItem(),
(String)warningLevels.getSelectedItem());
} catch (OutOfMemoryError e) { } catch (OutOfMemoryError e) {
JOptionPane.showMessageDialog(this, JOptionPane.showMessageDialog(this,
"The converter ran out of memory. Please give the\nJVM more memory by using java -XmxYYYm where YYY\nis the amount of memory your system has, or\nsomething close to it. E.g., try\n'java -Xmx512m -jar Jskad.jar'.", "The converter ran out of memory. Please give the\nJVM more memory by using java -XmxYYYm where YYY\nis the amount of memory your system has, or\nsomething close to it. E.g., try\n'java -Xmx512m -jar Jskad.jar'.",
@ -316,7 +333,11 @@ class ConvertDialog extends JDialog
"About", "About",
JOptionPane.PLAIN_MESSAGE); JOptionPane.PLAIN_MESSAGE);
} else if (cmd.equals("comboBoxChanged")) { } else if (cmd.equals("comboBoxChanged")) {
updateNewFileGuess(); JComboBox src = (JComboBox)ae.getSource();
if (src == choices) {
updateNewFileGuess();
updateWarningLevels();
}
} }
} }
@ -400,7 +421,7 @@ class ConvertDialog extends JDialog
} else { // conversion {to Wylie or TM} mode } else { // conversion {to Wylie or TM} mode
if (TMW_TO_WYLIE == ct) { if (TMW_TO_WYLIE == ct) {
newFileNamePrefix = suggested_WYLIE_prefix; newFileNamePrefix = suggested_WYLIE_prefix;
} else if (TMW_TO_UNI == ct) { } else if (TMW_TO_UNI == ct || ACIP_TO_UNI == ct) {
newFileNamePrefix = suggested_TO_UNI_prefix; newFileNamePrefix = suggested_TO_UNI_prefix;
} else if (TM_TO_TMW == ct) { } else if (TM_TO_TMW == ct) {
newFileNamePrefix = suggested_TO_TMW_prefix; newFileNamePrefix = suggested_TO_TMW_prefix;

View file

@ -48,14 +48,15 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
} }
public boolean doConversion(ConvertDialog cd, File oldFile, File newFile, public boolean doConversion(ConvertDialog cd, File oldFile, File newFile,
String whichConversion) { String whichConversion, String warningLevel) {
PrintStream ps; PrintStream ps;
try { try {
returnCode returnCode
= TibetanConverter.reallyConvert(new FileInputStream(oldFile), = TibetanConverter.reallyConvert(new FileInputStream(oldFile),
ps = new PrintStream(new FileOutputStream(newFile), ps = new PrintStream(new FileOutputStream(newFile),
false), false),
whichConversion); whichConversion,
warningLevel);
ps.close(); ps.close();
} catch (FileNotFoundException e) { } catch (FileNotFoundException e) {
returnCode = 39; returnCode = 39;
@ -89,6 +90,28 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
"Errors in Conversion", "Errors in Conversion",
JOptionPane.ERROR_MESSAGE); JOptionPane.ERROR_MESSAGE);
return false; return false;
} else if (45 == returnCode) {
if (warningLevel == "None") throw new Error("FIXME: make this an assertion");
JOptionPane.showMessageDialog(cd,
"No errors occurred, but some warnings are embedded in\nthe output as [#WARNING...].",
"Warnings in Conversion",
JOptionPane.ERROR_MESSAGE);
return false;
} else if (46 == returnCode) {
JOptionPane.showMessageDialog(cd,
"Errors occurred, and are embedded in the output\nas [#ERROR...]."
+ ((warningLevel == "None")
? ""
: " Warnings may have occurred; if so,\nthey are embedded in the output as [#WARNING...]."),
"Errors in Conversion",
JOptionPane.ERROR_MESSAGE);
return false;
} else if (47 == returnCode) {
JOptionPane.showMessageDialog(cd,
"So many errors occurred that the document is likely\nEnglish, not Tibetan. No output was produced.",
"Many Errors in Conversion",
JOptionPane.ERROR_MESSAGE);
return false;
} else if (1 == returnCode) { } else if (1 == returnCode) {
if (FIND_SOME_NON_TMW == whichConversion if (FIND_SOME_NON_TMW == whichConversion
|| FIND_ALL_NON_TMW == whichConversion) { || FIND_ALL_NON_TMW == whichConversion) {
@ -102,6 +125,8 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
"Something besides TibetanMachine was found; see output file.", "Something besides TibetanMachine was found; see output file.",
"Not entirely TM", "Not entirely TM",
JOptionPane.PLAIN_MESSAGE); JOptionPane.PLAIN_MESSAGE);
} else {
throw new Error("Who returned this??");
} }
return false; return false;
} else if (0 != returnCode) { } else if (0 != returnCode) {
@ -150,6 +175,7 @@ public class ConverterGUI implements FontConversion, FontConverterConstants {
try { try {
final ConvertDialog convDialog; final ConvertDialog convDialog;
String[] choices = new String[]{ String[] choices = new String[]{
ACIP_TO_UNI,
TM_TO_TMW, TM_TO_TMW,
TMW_TO_UNI, TMW_TO_UNI,
TMW_TO_WYLIE, TMW_TO_WYLIE,

View file

@ -372,6 +372,9 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("bskyUMbs"); ensureKeysGiveCorrectWylie("bskyUMbs");
ensureKeysGiveCorrectWylie("bskyUMbsHgro "); ensureKeysGiveCorrectWylie("bskyUMbsHgro ");
ensureKeysGiveCorrectWylie("gyurd", "gyurda");
ensureKeysGiveCorrectWylie("gyur.d");
ensureKeysGiveCorrectWylie("favakakhagangacachajanyatathadanapaphabamatsatshadzawazhaza'ayaralashasahaTaThaDaNaSha"); ensureKeysGiveCorrectWylie("favakakhagangacachajanyatathadanapaphabamatsatshadzawazhaza'ayaralashasahaTaThaDaNaSha");
ensureKeysGiveCorrectWylie("fevekekhegengecechejenyetethedenepephebemetsetshedzewezheze'eyerelesheseheTeTheDeNeShe"); ensureKeysGiveCorrectWylie("fevekekhegengecechejenyetethedenepephebemetsetshedzewezheze'eyerelesheseheTeTheDeNeShe");
ensureKeysGiveCorrectWylie("fuvukukhugungucuchujunyututhudunupuphubumutsutshudzuwuzhuzu'uyurulushusuhuTuThuDuNuShu"); ensureKeysGiveCorrectWylie("fuvukukhugungucuchujunyututhudunupuphubumutsutshudzuwuzhuzu'uyurulushusuhuTuThuDuNuShu");

View file

@ -37,5 +37,6 @@ interface FontConversion
whichConversion, which must be one of the known conversions. whichConversion, which must be one of the known conversions.
@return true on success, false otherwise */ @return true on success, false otherwise */
boolean doConversion(ConvertDialog cd, File oldFile, boolean doConversion(ConvertDialog cd, File oldFile,
File newFile, String whichConversion); File newFile, String whichConversion,
String warningLevel);
} }

View file

@ -26,6 +26,7 @@ import java.awt.*;
@author Nathaniel Garson, Tibetan and Himalayan Digital Library */ @author Nathaniel Garson, Tibetan and Himalayan Digital Library */
interface FontConverterConstants interface FontConverterConstants
{ {
final String ACIP_TO_UNI = "ACIP to Unicode";
final String TM_TO_TMW = "TM to TMW"; final String TM_TO_TMW = "TM to TMW";
final String TMW_TO_UNI = "TMW to Unicode"; final String TMW_TO_UNI = "TMW to Unicode";
final String TMW_TO_WYLIE = "TMW to Wylie"; final String TMW_TO_WYLIE = "TMW to Wylie";

View file

@ -26,6 +26,10 @@ import javax.swing.text.StyleConstants;
import org.thdl.util.*; import org.thdl.util.*;
import org.thdl.tib.text.*; import org.thdl.tib.text.*;
import org.thdl.tib.text.ttt.ACIPConverter;
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
import java.util.ArrayList;
/** TibetanConverter is a command-line utility for converting to /** TibetanConverter is a command-line utility for converting to
* and from Tibetan Machine Web (TMW). It converts TMW to Wylie, to * and from Tibetan Machine Web (TMW). It converts TMW to Wylie, to
* Unicode, or to Tibetan Machine (TM). It also converts TM to TMW. * Unicode, or to Tibetan Machine (TM). It also converts TM to TMW.
@ -66,6 +70,7 @@ public class TibetanConverter implements FontConverterConstants {
try { try {
boolean convertToUnicodeMode = false; boolean convertToUnicodeMode = false;
boolean convertToTMMode = false; boolean convertToTMMode = false;
boolean convertACIPToUniMode = false;
boolean convertToTMWMode = false; boolean convertToTMWMode = false;
boolean convertToWylieMode = false; boolean convertToWylieMode = false;
boolean findSomeNonTMWMode = false; boolean findSomeNonTMWMode = false;
@ -84,6 +89,8 @@ public class TibetanConverter implements FontConverterConstants {
= args[0].equals("--to-tibetan-machine")) = args[0].equals("--to-tibetan-machine"))
|| (convertToTMWMode || (convertToTMWMode
= args[0].equals("--to-tibetan-machine-web")) = args[0].equals("--to-tibetan-machine-web"))
|| (convertACIPToUniMode
= args[0].equals("--acip-to-unicode"))
|| (convertToUnicodeMode || (convertToUnicodeMode
= args[0].equals("--to-unicode")) = args[0].equals("--to-unicode"))
|| (convertToWylieMode || (convertToWylieMode
@ -98,6 +105,7 @@ public class TibetanConverter implements FontConverterConstants {
out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw"); out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
out.println(" | --to-tibetan-machine | --to-tibetan-machine-web"); out.println(" | --to-tibetan-machine | --to-tibetan-machine-web");
out.println(" | --to-unicode | --to-wylie] RTF_file"); out.println(" | --to-unicode | --to-wylie] RTF_file");
out.println(" | TibetanConverter --acip-to-unicode TXT_file");
out.println(" | TibetanConverter [--version | -v | --help | -h]"); out.println(" | TibetanConverter [--version | -v | --help | -h]");
out.println(""); out.println("");
out.println("Distributed under the terms of the THDL Open Community License Version 1.0."); out.println("Distributed under the terms of the THDL Open Community License Version 1.0.");
@ -105,6 +113,11 @@ public class TibetanConverter implements FontConverterConstants {
out.println("Usage:"); out.println("Usage:");
out.println(" -v | --version for version info"); out.println(" -v | --version for version info");
out.println(" -h | --help for this message"); out.println(" -h | --help for this message");
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
out.println(" --acip-to-unicode to convert ACIP text file to Unicode text file");
out.println(" --find-all-non-tmw to locate all characters in the input document that are"); out.println(" --find-all-non-tmw to locate all characters in the input document that are");
out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found"); out.println(" not in Tibetan Machine Web fonts, exit zero if and only if none found");
out.println(" --find-some-non-tmw to locate all distinct characters in the input document"); out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
@ -113,14 +126,12 @@ public class TibetanConverter implements FontConverterConstants {
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found"); out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --find-some-non-tm to locate all distinct characters in the input document"); out.println(" --find-some-non-tm to locate all distinct characters in the input document");
out.println(" not in Tibetan Machine fonts, exit zero if and only if none found"); out.println(" not in Tibetan Machine fonts, exit zero if and only if none found");
out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
out.println(""); out.println("");
out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF"); out.println(" In --to... and --acip-to... modes, needs one argument, the name of the");
out.println(" TibetanMachineWeb RTF");
out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of"); out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web). Writes the"); out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web) or the name of the");
out.println(" ACIP text file (for --acip-to-unicode). Writes the");
out.println(" result to standard output (after dealing with the curly brace problem if"); out.println(" result to standard output (after dealing with the curly brace problem if");
out.println(" the input is TibetanMachineWeb). Exit code is zero on success, 42 if some"); out.println(" the input is TibetanMachineWeb). Exit code is zero on success, 42 if some");
out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),"); out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
@ -135,11 +146,10 @@ public class TibetanConverter implements FontConverterConstants {
out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or"); out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a"); out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
out.println(" conversion so that you have confidence in the conversion's correctness."); out.println(" conversion so that you have confidence in the conversion's correctness.");
// DLC add Wylie->TMW mode.
return 77; return 77;
} }
if (args[0].equals("--version") || args[0].equals("-v")) { if (args[0].equals("--version") || args[0].equals("-v")) {
out.println("TibetanConverter version 0.82"); out.println("TibetanConverter version 0.83");
out.println("Compiled at " out.println("Compiled at "
+ ThdlVersion.getTimeOfCompilation()); + ThdlVersion.getTimeOfCompilation());
return 77; return 77;
@ -168,12 +178,15 @@ public class TibetanConverter implements FontConverterConstants {
conversionTag = TMW_TO_UNI; conversionTag = TMW_TO_UNI;
} else if (convertToTMWMode) { } else if (convertToTMWMode) {
conversionTag = TM_TO_TMW; conversionTag = TM_TO_TMW;
} else if (convertACIPToUniMode) {
conversionTag = ACIP_TO_UNI;
} else { } else {
ThdlDebug.verify(convertToTMMode); ThdlDebug.verify(convertToTMMode);
conversionTag = TMW_TO_TM; conversionTag = TMW_TO_TM;
} }
} }
return reallyConvert(in, out, conversionTag); return reallyConvert(in, out, conversionTag, "Most" // DLC make me configurable
);
} catch (ThdlLazyException e) { } catch (ThdlLazyException e) {
out.println("TibetanConverter has a BUG:"); out.println("TibetanConverter has a BUG:");
e.getRealException().printStackTrace(out); e.getRealException().printStackTrace(out);
@ -190,132 +203,155 @@ public class TibetanConverter implements FontConverterConstants {
number of strings -- see the code. Returns an appropriate number of strings -- see the code. Returns an appropriate
return code so that TibetanConverter's usage message is return code so that TibetanConverter's usage message is
honored. */ honored. */
static int reallyConvert(InputStream in, PrintStream out, String ct) { static int reallyConvert(InputStream in, PrintStream out, String ct,
TibetanDocument tdoc = new TibetanDocument(); String warningLevel) {
{ if (ACIP_TO_UNI == ct) {
SimpleAttributeSet ras = new SimpleAttributeSet();
StyleConstants.setFontFamily(ras,
ThdlOptions.getStringOption("thdl.default.roman.font.face",
"Serif"));
StyleConstants.setFontSize(ras,
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
14));
tdoc.setRomanAttributeSet(ras);
}
try {
// Read in the rtf file.
if (debug) System.err.println("Start: reading in old RTF file");
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
in = new RTFFixerInputStream(in);
(new RTFEditorKit()).read(in, tdoc, 0);
if (debug) System.err.println("End : reading in old RTF file");
} catch (Exception e) {
out.println("TibetanConverter:\n"
+ rtfErrorMessage);
return 3;
}
try {
in.close();
} catch (IOException e) {
// silently ignore; we don't care about the input so much...
ThdlDebug.noteIffyCode();
}
if (FIND_ALL_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_ALL_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else { // conversion {to Wylie or TM} mode
// Fix curly braces in the entire document if the input is TMW:
if (TM_TO_TMW != ct) {
// DLC make me optional
if (debug) System.err.println("Start: solving curly brace problem");
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
if (debug) System.err.println("End : solving curly brace problem");
}
int exitCode = 0;
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
+ ((TMW_TO_UNI == ct) ? 1 : 0)
+ ((TM_TO_TMW == ct) ? 1 : 0)
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
== 1);
long numAttemptedReplacements[] = new long[] { 0 };
if (TMW_TO_WYLIE == ct) {
// Convert to THDL Wylie:
if (!tdoc.toWylie(0,
tdoc.getLength(),
numAttemptedReplacements)) {
exitCode = 44;
}
} else if (TMW_TO_UNI == ct) {
StringBuffer errors = new StringBuffer();
// Convert to Unicode:
if (tdoc.convertToUnicode(0,
tdoc.getLength(),
errors,
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else if (TM_TO_TMW == ct) {
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachineWeb:
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else {
ThdlDebug.verify(TMW_TO_TM == ct);
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachine:
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
}
// Write to standard output the result:
try { try {
tdoc.writeRTFOutputStream(out); ArrayList al = ACIPTshegBarScanner.scanStream(in, null,
250 - 1 // DLC FIXME: make me configurable
);
if (null == al)
return 47;
StringBuffer warnings = new StringBuffer();
boolean embeddedWarnings = (warningLevel != "None");
if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
embeddedWarnings,
warningLevel))
return 46;
if (embeddedWarnings && warnings.length() > 0)
return 45;
else
return 0;
} catch (IOException e) { } catch (IOException e) {
exitCode = 40; return 48;
}
} else {
TibetanDocument tdoc = new TibetanDocument();
{
SimpleAttributeSet ras = new SimpleAttributeSet();
StyleConstants.setFontFamily(ras,
ThdlOptions.getStringOption("thdl.default.roman.font.face",
"Serif"));
StyleConstants.setFontSize(ras,
ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
14));
tdoc.setRomanAttributeSet(ras);
}
try {
// Read in the rtf file.
if (debug) System.err.println("Start: reading in old RTF file");
if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
in = new RTFFixerInputStream(in);
(new RTFEditorKit()).read(in, tdoc, 0);
if (debug) System.err.println("End : reading in old RTF file");
} catch (Exception e) {
out.println("TibetanConverter:\n"
+ rtfErrorMessage);
return 3;
} }
if (out.checkError())
exitCode = 41;
if (numAttemptedReplacements[0] < 1)
exitCode = 43;
return exitCode; try {
in.close();
} catch (IOException e) {
// silently ignore; we don't care about the input so much...
ThdlDebug.noteIffyCode();
}
if (FIND_ALL_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TMW == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMWCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_SOME_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findSomeNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else if (FIND_ALL_NON_TM == ct) {
// 0, -1 is the entire document.
int exitCode
= tdoc.findAllNonTMCharacters(0, -1, out);
if (out.checkError())
exitCode = 41;
return exitCode;
} else { // conversion {to Wylie or TM} mode
// Fix curly braces in the entire document if the input is TMW:
if (TM_TO_TMW != ct) {
// DLC make me optional
if (debug) System.err.println("Start: solving curly brace problem");
tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
if (debug) System.err.println("End : solving curly brace problem");
}
int exitCode = 0;
ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
+ ((TMW_TO_UNI == ct) ? 1 : 0)
+ ((TM_TO_TMW == ct) ? 1 : 0)
+ ((TMW_TO_WYLIE == ct) ? 1 : 0)
== 1);
long numAttemptedReplacements[] = new long[] { 0 };
if (TMW_TO_WYLIE == ct) {
// Convert to THDL Wylie:
if (!tdoc.toWylie(0,
tdoc.getLength(),
numAttemptedReplacements)) {
exitCode = 44;
}
} else if (TMW_TO_UNI == ct) {
StringBuffer errors = new StringBuffer();
// Convert to Unicode:
if (tdoc.convertToUnicode(0,
tdoc.getLength(),
errors,
ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else if (TM_TO_TMW == ct) {
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachineWeb:
if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
} else {
ThdlDebug.verify(TMW_TO_TM == ct);
StringBuffer errors = new StringBuffer();
// Convert to TibetanMachine:
if (tdoc.convertToTM(0, tdoc.getLength(), errors,
numAttemptedReplacements)) {
System.err.println(errors);
exitCode = 42;
}
}
// Write to standard output the result:
try {
tdoc.writeRTFOutputStream(out);
} catch (IOException e) {
exitCode = 40;
}
if (out.checkError())
exitCode = 41;
if (numAttemptedReplacements[0] < 1)
exitCode = 43;
return exitCode;
}
} }
} }
} }

View file

@ -38,24 +38,23 @@ public class ACIPConverter {
ThdlOptions.setUserPreference("thdl.debug", true); ThdlOptions.setUserPreference("thdl.debug", true);
} }
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
/** Command-line converter. Gives error messages on standard /** Command-line converter. Gives error messages on standard
* output about why we can't convert the document perfectly and * output about why we can't convert the document perfectly and
* exits with non-zero return code, or is silent otherwise and * exits with non-zero return code, or is silent otherwise and
* exits with code zero. <p>FIXME: not so efficient; copies the * exits with code zero. <p>FIXME: not so efficient; copies the
* whole file into memory first. */ * whole file into memory first. */
public static void main(String[] args) public static void main(String[] args)
throws IOException // DLC FIXME: give nice error messages throws IOException
{ {
boolean verbose = true; boolean verbose = true;
boolean strict = true; if (args.length != 1) {
if (args.length != 2 System.out.println("Bad args! Need just the name of the ACIP text file.");
|| (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
System.err.println("Bad args! Need '--strict filename' or '--lenient filename'.");
System.exit(1);
} }
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
int maxErrors = 250; int maxErrors = 250;
ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1); ArrayList al = ACIPTshegBarScanner.scanFile(args[0], errors, maxErrors - 1);
if (null == al) { if (null == al) {
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this"); System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
@ -69,7 +68,7 @@ public class ACIPConverter {
System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again."); System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again.");
System.exit(1); System.exit(1);
} }
final boolean abortUponScanningError = false; // DLC MAKE ME CONFIGURABLE final boolean abortUponScanningError = false;
// DLC NOW: BAo isn't converting. // DLC NOW: BAo isn't converting.
if (errors.length() > 0) { if (errors.length() > 0) {
System.err.println("Errors scanning ACIP input file: "); System.err.println("Errors scanning ACIP input file: ");
@ -80,10 +79,15 @@ public class ACIPConverter {
} }
} }
StringBuffer warnings = new StringBuffer(); String warningLevel = "Most"; // DLC make me configurable.
boolean putWarningsInOutput = true; // DLC make me configurable. StringBuffer warnings = null;
boolean putWarningsInOutput = false;
if ("None" != warningLevel) {
warnings = new StringBuffer();
putWarningsInOutput = true;
}
convertToUnicode(al, System.out, errors, warnings, convertToUnicode(al, System.out, errors, warnings,
putWarningsInOutput); putWarningsInOutput, warningLevel);
if (errors.length() > 0) { if (errors.length() > 0) {
System.err.println("Errors converting ACIP input file: "); System.err.println("Errors converting ACIP input file: ");
System.err.println(errors); System.err.println(errors);
@ -91,14 +95,14 @@ public class ACIPConverter {
System.err.println("Exiting; please fix input file and try again."); System.err.println("Exiting; please fix input file and try again.");
System.exit(2); System.exit(2);
} }
if (warnings.length() > 0) { if (null != warnings && warnings.length() > 0) {
System.err.println("Warnings converting ACIP input file: "); System.err.println("Warnings converting ACIP input file: ");
System.err.println(warnings); System.err.println(warnings);
if (putWarningsInOutput) if (putWarningsInOutput)
System.err.println("The output contains these warnings."); System.err.println("The output contains these warnings.");
System.exit(2); System.exit(2);
} }
if (verbose) System.err.println("Converted " + args[1] + " perfectly."); if (verbose) System.err.println("Converted " + args[0] + " perfectly.");
System.exit(0); System.exit(0);
} }
@ -131,16 +135,17 @@ public class ACIPConverter {
public static String convertToUnicode(String acip, public static String convertToUnicode(String acip,
StringBuffer errors, StringBuffer errors,
StringBuffer warnings, StringBuffer warnings,
boolean writeWarningsToResult) { boolean writeWarningsToResult,
String warningLevel) {
ByteArrayOutputStream sw = new ByteArrayOutputStream(); ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1); ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1);
try { try {
if (null != al if (null != al
&& convertToUnicode(al, sw, errors, && convertToUnicode(al, sw, errors,
warnings, writeWarningsToResult)) { warnings, writeWarningsToResult,
warningLevel)) {
return sw.toString("UTF-8"); return sw.toString("UTF-8");
} else { } else {
System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
return null; return null;
} }
} catch (Exception e) { } catch (Exception e) {
@ -151,8 +156,8 @@ public class ACIPConverter {
/** Writes Unicode to out. If errors occur in converting a tsheg /** Writes Unicode to out. If errors occur in converting a tsheg
* bar, then they are appended to errors if errors is non-null. * bar, then they are appended to errors if errors is non-null.
* Furthermore, errors are written to out. If writeWarningsToOut * Furthermore, errors are written to out. If writeWarningsToOut
* is true, then warnings also will be written to out. Returns * is true, then warnings also will be written to out.
* true upon perfect success, false if errors occurred. * @return true upon perfect success, false if errors occurred.
* @param scan result of ACIPTshegBarScanner.scan(..) * @param scan result of ACIPTshegBarScanner.scan(..)
* @param out stream to which to write converted text * @param out stream to which to write converted text
* @param errors if non-null, all error messages are appended * @param errors if non-null, all error messages are appended
@ -166,7 +171,8 @@ public class ACIPConverter {
OutputStream out, OutputStream out,
StringBuffer errors, StringBuffer errors,
StringBuffer warnings, StringBuffer warnings,
boolean writeWarningsToOut) boolean writeWarningsToOut,
String warningLevel)
throws IOException throws IOException
{ {
int sz = scan.size(); int sz = scan.size();
@ -181,8 +187,18 @@ public class ACIPConverter {
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: "); writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: ");
writer.write(s.getText()); writer.write(s.getText());
writer.write("]"); writer.write("]");
} else if (stype == ACIPString.WARNING) {
if (writeWarningsToOut) {
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: ");
writer.write(s.getText());
writer.write("]");
}
if (null != warnings) {
warnings.append("Warning: Lexical warning: ");
warnings.append(s.getText());
warnings.append('\n');
}
} else { } else {
// DLC FIXME: what about 'no A on root stack' and 'no A on such-and-such stack' warnings?
if (s.isLatin(stype)) { if (s.isLatin(stype)) {
if (stype == ACIPString.FOLIO_MARKER) if (stype == ACIPString.FOLIO_MARKER)
writer.write("{"); writer.write("{");
@ -219,7 +235,7 @@ public class ACIPConverter {
errors.append(errorMessage + "\n"); errors.append(errorMessage + "\n");
} else { } else {
String warning String warning
= pt.getWarning(false, // DLC: make me configurable = pt.getWarning(warningLevel,
pl, pl,
s.getText()); s.getText());
if (null != warning) { if (null != warning) {
@ -234,7 +250,7 @@ public class ACIPConverter {
} }
} }
unicode = sl.getUnicode(); unicode = sl.getUnicode();
if (null == unicode) throw new Error("DLC: HOW?"); if (null == unicode) throw new Error("FIXME: make this an assertion");
} }
} }
} }
@ -245,7 +261,7 @@ public class ACIPConverter {
unicode = "\u0F3D"; unicode = "\u0F3D";
else else
unicode = ACIPRules.getUnicodeFor(s.getText(), false); unicode = ACIPRules.getUnicodeFor(s.getText(), false);
if (null == unicode) throw new Error("DLC: HOW?"); if (null == unicode) throw new Error("FIXME: make this an assertion");
} }
if (null != unicode) { if (null != unicode) {
writer.write(unicode); writer.write(unicode);

View file

@ -75,9 +75,11 @@ public class ACIPString {
public static final int START_PAREN = 15; public static final int START_PAREN = 15;
/** For the closing ) in (NYA) */ /** For the closing ) in (NYA) */
public static final int END_PAREN = 16; public static final int END_PAREN = 16;
/** For things that may not be legal syntax, such as {KA . KHA} */
public static final int WARNING = 17;
/** For things that are not legal syntax, such as a file that /** For things that are not legal syntax, such as a file that
* contains just "[# HALF A COMMEN" */ * contains just "[# HALF A COMMEN" */
public static final int ERROR = 17; public static final int ERROR = 18;
/** Returns true if and only if this string is Latin (usually /** Returns true if and only if this string is Latin (usually
* English). Returns false if this string is transliteration of * English). Returns false if this string is transliteration of
@ -132,6 +134,7 @@ public class ACIPString {
if (type == END_SLASH) typeString = "END_SLASH"; if (type == END_SLASH) typeString = "END_SLASH";
if (type == START_PAREN) typeString = "START_PAREN"; if (type == START_PAREN) typeString = "START_PAREN";
if (type == END_PAREN) typeString = "END_PAREN"; if (type == END_PAREN) typeString = "END_PAREN";
if (type == WARNING) typeString = "WARNING";
if (type == ERROR) typeString = "ERROR"; if (type == ERROR) typeString = "ERROR";
return typeString + ":{" + getText() + "}"; return typeString + ":{" + getText() + "}";
} }

View file

@ -39,15 +39,13 @@ public class ACIPTshegBarScanner {
* with code zero. <p>FIXME: not so efficient; copies the whole * with code zero. <p>FIXME: not so efficient; copies the whole
* file into memory first. */ * file into memory first. */
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
boolean strict = true; if (args.length != 1) {
if (args.length != 2 System.out.println("Bad args! Need just the name of the ACIP text file.");
|| (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
System.out.println("Bad args! Need '--strict filename' or '--lenient filename'.");
System.exit(1); System.exit(1);
} }
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
int maxErrors = 250; int maxErrors = 250;
ArrayList al = scanFile(args[1], errors, strict, maxErrors - 1); ArrayList al = scanFile(args[0], errors, maxErrors - 1);
if (null == al) { if (null == al) {
System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this"); System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
@ -70,27 +68,39 @@ public class ACIPTshegBarScanner {
} }
/** Scans an ACIP file with path fname into tsheg bars. If errors /** Scans an ACIP file with path fname into tsheg bars. If errors
* is non-null, error messages will be appended to it. If strict * is non-null, error messages will be appended to it. Returns a
* is true, then you're more likely to see error * list of ACIPStrings that is the scan. <p>FIXME: not so
* messages. Returns a list of ACIPStrings that is the * efficient; copies the whole file into memory first.
* scan. <p>FIXME: not so efficient; copies the whole file into
* memory first.
* @throws IOException if we cannot read in the ACIP input file */ * @throws IOException if we cannot read in the ACIP input file */
public static ArrayList scanFile(String fname, StringBuffer errors, boolean strict, int maxErrors) public static ArrayList scanFile(String fname, StringBuffer errors, int maxErrors)
throws IOException
{
return scanStream(new FileInputStream(fname),
errors, maxErrors);
}
/** Scans a stream of ACIP into tsheg bars. If errors is
* non-null, error messages will be appended to it. You can
* recover both errors and warnings (modulo offset information)
* from the result, though. Returns a list of ACIPStrings that
* is the scan, or null if more than maxErrors occur. <p>FIXME:
* not so efficient; copies the whole file into memory first.
* @throws IOException if we cannot read the whole ACIP stream */
public static ArrayList scanStream(InputStream stream, StringBuffer errors,
int maxErrors)
throws IOException throws IOException
{ {
StringBuffer s = new StringBuffer(); StringBuffer s = new StringBuffer();
char ch[] = new char[8192]; char ch[] = new char[8192];
BufferedReader in BufferedReader in
= new BufferedReader(new InputStreamReader(new FileInputStream(fname), = new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
"US-ASCII"));
int amt; int amt;
while (-1 != (amt = in.read(ch))) { while (-1 != (amt = in.read(ch))) {
s.append(ch, 0, amt); s.append(ch, 0, amt);
} }
in.close(); in.close();
return scan(s.toString(), errors, !strict, maxErrors); return scan(s.toString(), errors, maxErrors);
} }
/** Returns a list of {@link ACIPString ACIPStrings} corresponding /** Returns a list of {@link ACIPString ACIPStrings} corresponding
@ -99,26 +109,25 @@ public class ACIPTshegBarScanner {
* text, a tsheg bar (minus the tsheg or shad or whatever), a * text, a tsheg bar (minus the tsheg or shad or whatever), a
* String of inter-tsheg-bar punctuation, etc. * String of inter-tsheg-bar punctuation, etc.
* *
* <p>This not only scans; it finds all the errors a parser would * <p>This not only scans; it finds all the errors and warnings a
* too, like "NYA x" and "(" and ")" and "/NYA" etc. It puts * parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
* those in as ACIPStrings with type {@link ACIPString#ERROR}, * It puts those in as ACIPStrings with type {@link
* and also, if errors is non-null, appends helpful messages to * ACIPString#ERROR} or {@link ACIPString#WARNING}, and also, if
* errors, each followed by a '\n'. There is at least one case * errors is non-null, appends helpful messages to errors, each
* where no ERROR ACIPString will appear but errors will be * followed by a '\n'.
* modified.
* @param s the ACIP text * @param s the ACIP text
* @param errors if non-null, the buffer to which to append error * @param errors if non-null, the buffer to which to append error
* messages * messages (DLC FIXME: cludge, just get this info by scanning
* @param lenientPeriods if and only if this is true, periods * the result for ACIPString.ERROR (and maybe ACIPString.WARNING,
* will never cause errors, even if iffy text like "PAS... LA " * if you care about warnings), but then we'd have to put the
* appears. * Offset info in the ACIPString)
* @param maxErrors if nonnegative, then scanning will stop when * @param maxErrors if nonnegative, then scanning will stop when
* more than maxErrors errors occur. In this event, null is * more than maxErrors errors occur. In this event, null is
* returned. * returned.
* @return null if more than maxErrors errors occur, or the scan * @return null if more than maxErrors errors occur, or the scan
* otherwise * otherwise
*/ */
public static ArrayList scan(String s, StringBuffer errors, boolean lenientPeriods, int maxErrors) { public static ArrayList scan(String s, StringBuffer errors, int maxErrors) {
// the size depends on whether it's mostly Tibetan or mostly // the size depends on whether it's mostly Tibetan or mostly
// Latin and a number of other factors. This is meant to be // Latin and a number of other factors. This is meant to be
@ -159,9 +168,9 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new ACIPString(s.substring(startOfString, i),
currentType)); currentType));
} }
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
ACIPString.ERROR));
if (!waitingForMatchingIllegalClose) { if (!waitingForMatchingIllegalClose) {
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
ACIPString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched close bracket, ] or }.\n"); + "Found a truly unmatched close bracket, ] or }.\n");
@ -169,6 +178,8 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
waitingForMatchingIllegalClose = false; waitingForMatchingIllegalClose = false;
al.add(new ACIPString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
@ -422,9 +433,9 @@ public class ACIPTshegBarScanner {
// This is an error. Sometimes [COMMENTS APPEAR // This is an error. Sometimes [COMMENTS APPEAR
// WITHOUT # MARKS]. Though "... [" could cause // WITHOUT # MARKS]. Though "... [" could cause
// this too. // this too.
al.add(new ACIPString("Found an illegal open bracket: " + s.substring(i, i+1),
ACIPString.ERROR));
if (waitingForMatchingIllegalClose) { if (waitingForMatchingIllegalClose) {
al.add(new ACIPString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
ACIPString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n"); + "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
@ -443,6 +454,8 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
ACIPString.ERROR));
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n"); + "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
@ -729,23 +742,17 @@ public class ACIPTshegBarScanner {
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
// . is used for a non-breaking tsheg, such as in // . is used for a non-breaking tsheg, such as in
// {NGO.,} and {....,DAM}. We give an error unless , // {NGO.,} and {....,DAM}. We give a warning unless ,
// or ., or [A-Za-z] follows '.'. // or ., or [A-Za-z] follows '.'.
if (lenientPeriods al.add(new ACIPString(s.substring(i, i+1),
|| (i + 1 < sl ACIPString.TIBETAN_PUNCTUATION));
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ',' if (!(i + 1 < sl
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n') && (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z') || (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) { || (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
al.add(new ACIPString(s.substring(i, i+1), || (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
ACIPString.TIBETAN_PUNCTUATION));
} else {
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".", al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
ACIPString.ERROR)); ACIPString.WARNING));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
startOfString = i+1; startOfString = i+1;
break; // end '.' case break; // end '.' case
@ -832,16 +839,11 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
if (!bracketTypeStack.empty()) { if (!bracketTypeStack.empty()) {
al.add(new ACIPString("UNEXPECTED END OF INPUT", al.add(new ACIPString("Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) { if (null != errors) {
if (ACIPString.COMMENT == currentType) { errors.append("Offset END: "
errors.append("Offset END: " + "Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.\n");
+ "Unmatched open bracket found. A comment does not terminate.\n");
} else {
errors.append("Offset END: "
+ "Unmatched open bracket found. A correction does not terminate.\n");
}
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }

View file

@ -102,10 +102,10 @@ public class PackageTest extends TestCase {
assertTrue(null == expectedLegalParses || expectedLegalParses.length == 0); assertTrue(null == expectedLegalParses || expectedLegalParses.length == 0);
return; return;
} else { } else {
if (pt.getWarning(false, l, acip) != null) { if (pt.getWarning("Most", l, acip) != null) {
System.out.println(pt.getWarning(false, l, acip)); System.out.println(pt.getWarning("Most", l, acip));
} else if (pt.getWarning(true, l, acip) != null) } else if (pt.getWarning("All", l, acip) != null)
if (sdebug || debug) System.out.println("Paranoiac warning is this: " + pt.getWarning(true, l, acip)); if (sdebug || debug) System.out.println("Paranoiac warning is this: " + pt.getWarning("All", l, acip));
} }
int np = pt.numberOfParses(); int np = pt.numberOfParses();
boolean goodness = expectedParses == null || expectedParses.length == np; boolean goodness = expectedParses == null || expectedParses.length == np;
@ -7049,12 +7049,8 @@ tstHelper("ZUR");
} }
private static void shelp(String s, String expectedErrors, String expectedScan) { private static void shelp(String s, String expectedErrors, String expectedScan) {
shelp(s, expectedErrors, false, expectedScan);
}
private static void shelp(String s, String expectedErrors, boolean lenientPeriods, String expectedScan) {
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.scan(s, errors, lenientPeriods, -1); ArrayList al = ACIPTshegBarScanner.scan(s, errors, -1);
if (null != expectedScan) { if (null != expectedScan) {
if (!al.toString().equals(expectedScan)) { if (!al.toString().equals(expectedScan)) {
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:"); System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
@ -7075,18 +7071,14 @@ tstHelper("ZUR");
} }
} }
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, boolean, int)}. */ /** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, int)}. */
public void testScanner() { public void testScanner() {
shelp("LA...SGRUB", shelp("LA...SGRUB",
"", "",
"[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME "[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]");
shelp("PAS... LA",
"Offset 5 or maybe 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n",
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
shelp("PAS... LA", shelp("PAS... LA",
"", "",
true, "[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, WARNING:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
shelp("^GONG SA,", shelp("^GONG SA,",
"", "",
"[TIBETAN_NON_PUNCTUATION:{^GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]"); "[TIBETAN_NON_PUNCTUATION:{^GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
@ -7220,7 +7212,7 @@ tstHelper("ZUR");
} }
private static void uhelp(String acip, String expectedUnicode) { private static void uhelp(String acip, String expectedUnicode) {
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true); String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true, "Most");
if (null == unicode) { if (null == unicode) {
if (null != expectedUnicode && "none" != expectedUnicode) { if (null != expectedUnicode && "none" != expectedUnicode) {
System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));

View file

@ -139,7 +139,7 @@ class TParseTree {
// We give a warning about these, optionally, so that // We give a warning about these, optionally, so that
// users can produce output that even a dumb ACIP reader // users can produce output that even a dumb ACIP reader
// can understand. See getWarning(true, ..). // can understand. See getWarning("All", ..).
// if j is in this list, then up.get(j) is still a // if j is in this list, then up.get(j) is still a
// potential winner. // potential winner.
@ -246,16 +246,24 @@ class TParseTree {
/** Returns null if this parse tree is perfectly legal and valid. /** Returns null if this parse tree is perfectly legal and valid.
* Returns a warning for users otherwise. If and only if * Returns a warning for users otherwise. If and only if
* paranoid is true, then even unambiguous ACIP like PADMA, which * warningLevel is "All", then even unambiguous ACIP like PADMA,
* could be improved by being written as PAD+MA, will cause a * which could be improved by being written as PAD+MA, will cause
* warning. * a warning.
* @param paranoid true if you do not mind a lot of warnings * @param warningLevel "All" if you're paranoid, "Most" to see
* warnings about lacking vowels on final stacks, "Some" to see
* warnings about lacking vowels on non-final stacks and also
* warnings about when prefix rules affect you, "None" if you
* like to see IllegalArgumentExceptions.
* @param pl the pair list from which this parse tree originated * @param pl the pair list from which this parse tree originated
* @param originalACIP the original ACIP, or null if you want * @param originalACIP the original ACIP, or null if you want
* this parse tree to make a best guess. */ * this parse tree to make a best guess. */
public String getWarning(boolean paranoid, public String getWarning(String warningLevel,
TPairList pl, TPairList pl,
String originalACIP) { String originalACIP) {
if (warningLevel != "Some"
&& warningLevel != "Most"
&& warningLevel != "All")
throw new IllegalArgumentException("warning level bad: is it interned?");
{ {
TStackList bestParse = getBestParse(); TStackList bestParse = getBestParse();
@ -276,19 +284,21 @@ class TParseTree {
} else { } else {
if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) { if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) {
if (isLastStack[0]) { if (isLastStack[0]) {
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; if (warningLevel == "All" || warningLevel == "Most")
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
} else { } else {
return "Warning: There is a stack, before the last stack, without a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; return "Warning: There is a stack, before the last stack, without a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
} }
} }
if (paranoid) { if ("All" == warningLevel) {
return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful."; return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful.";
} }
} }
} else { } else {
if (nip.get(0).hasStackWithoutVowel(pl, isLastStack)) { if (nip.get(0).hasStackWithoutVowel(pl, isLastStack)) {
if (isLastStack[0]) { if (isLastStack[0]) {
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; if (warningLevel == "All" || warningLevel == "Most")
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
} else { } else {
return "Warning: There is a stack, before the last stack, without a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; return "Warning: There is a stack, before the last stack, without a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
} }