Jskad's converter now has ACIP-to-Unicode built in. There are known bugs; it is pre-alpha. It's usable, though, and finds tons of errors in ACIP input files, with the user deciding just how pedantic to be. The biggest outstanding bug is the silent one: treating { }, space, as tsheg instead of whitespace when we ought to know better.
2003-08-24 06:40:53 +00:00 · 2003-08-24 06:40:53 +00:00 · 1982c5847b
commit 1982c5847b
parent d5ad760230
11 changed files with 355 additions and 244 deletions
--- a/source/org/thdl/tib/input/TibetanConverter.java
+++ b/source/org/thdl/tib/input/TibetanConverter.java
@ -26,6 +26,10 @@ import javax.swing.text.StyleConstants;
 import org.thdl.util.*;
 import org.thdl.tib.text.*;

+import org.thdl.tib.text.ttt.ACIPConverter;
+import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
+import java.util.ArrayList;
+
 /** TibetanConverter is a command-line utility for converting to
 *  and from Tibetan Machine Web (TMW).  It converts TMW to Wylie, to
 *  Unicode, or to Tibetan Machine (TM).  It also converts TM to TMW.
@ -66,6 +70,7 @@ public class TibetanConverter implements FontConverterConstants {
        try {
            boolean convertToUnicodeMode = false;
            boolean convertToTMMode = false;
+            boolean convertACIPToUniMode = false;
            boolean convertToTMWMode = false;
            boolean convertToWylieMode = false;
            boolean findSomeNonTMWMode = false;
@ -84,6 +89,8 @@ public class TibetanConverter implements FontConverterConstants {
                             = args[0].equals("--to-tibetan-machine"))
                         || (convertToTMWMode
                             = args[0].equals("--to-tibetan-machine-web"))
+                         || (convertACIPToUniMode
+                             = args[0].equals("--acip-to-unicode"))
                         || (convertToUnicodeMode
                             = args[0].equals("--to-unicode"))
                         || (convertToWylieMode
@ -98,6 +105,7 @@ public class TibetanConverter implements FontConverterConstants {
                out.println("TibetanConverter [--find-all-non-tmw | --find-some-non-tmw");
                out.println("                  | --to-tibetan-machine | --to-tibetan-machine-web");
                out.println("                  | --to-unicode | --to-wylie] RTF_file");
+                out.println(" | TibetanConverter --acip-to-unicode TXT_file");
                out.println(" | TibetanConverter [--version | -v | --help | -h]");
                out.println("");
                out.println("Distributed under the terms of the THDL Open Community License Version 1.0.");
@ -105,6 +113,11 @@ public class TibetanConverter implements FontConverterConstants {
                out.println("Usage:");
                out.println(" -v | --version for version info");
                out.println(" -h | --help for this message");
+                out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
+                out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
+                out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
+                out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
+                out.println(" --acip-to-unicode to convert ACIP text file to Unicode text file");
                out.println(" --find-all-non-tmw to locate all characters in the input document that are");
                out.println("   not in Tibetan Machine Web fonts, exit zero if and only if none found");
                out.println(" --find-some-non-tmw to locate all distinct characters in the input document");
@ -113,14 +126,12 @@ public class TibetanConverter implements FontConverterConstants {
                out.println("   not in Tibetan Machine fonts, exit zero if and only if none found");
                out.println(" --find-some-non-tm to locate all distinct characters in the input document");
                out.println("   not in Tibetan Machine fonts, exit zero if and only if none found");
-                out.println(" --to-tibetan-machine to convert TibetanMachineWeb to TibetanMachine");
-                out.println(" --to-unicode to convert TibetanMachineWeb to Unicode");
-                out.println(" --to-tibetan-machine-web to convert TibetanMachine to TibetanMachineWeb");
-                out.println(" --to-wylie to convert TibetanMachineWeb to THDL Extended Wylie");
                out.println("");
-                out.println(" In --to... modes, needs one argument, the name of the TibetanMachineWeb RTF");
+                out.println(" In --to... and --acip-to... modes, needs one argument, the name of the");
+                out.println(" TibetanMachineWeb RTF");
                out.println(" file (for --to-wylie, --to-unicode, and --to-tibetan-machine) or the name of");
-                out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web).  Writes the");
+                out.println(" the TibetanMachine RTF file (for --to-tibetan-machine-web) or the name of the");
+                out.println(" ACIP text file (for --acip-to-unicode).  Writes the");
                out.println(" result to standard output (after dealing with the curly brace problem if");
                out.println(" the input is TibetanMachineWeb).  Exit code is zero on success, 42 if some");
                out.println(" glyphs couldn't be converted (in which case the output is just those glyphs),");
@ -135,11 +146,10 @@ public class TibetanConverter implements FontConverterConstants {
                out.println(" You may find it helpful to use `--find-some-non-tmw' mode (or");
                out.println(" `--find-some-non-tm' mode for Tibetan Machine input) before doing a");
                out.println(" conversion so that you have confidence in the conversion's correctness.");
-                // DLC add Wylie->TMW mode.
                return 77;
            }
            if (args[0].equals("--version") || args[0].equals("-v")) {
-                out.println("TibetanConverter version 0.82");
+                out.println("TibetanConverter version 0.83");
                out.println("Compiled at "
                            + ThdlVersion.getTimeOfCompilation());
                return 77;
@ -168,12 +178,15 @@ public class TibetanConverter implements FontConverterConstants {
                    conversionTag = TMW_TO_UNI;
                } else if (convertToTMWMode) {
                    conversionTag = TM_TO_TMW;
+                } else if (convertACIPToUniMode) {
+                    conversionTag = ACIP_TO_UNI;
                } else {
                    ThdlDebug.verify(convertToTMMode);
                    conversionTag = TMW_TO_TM;
                }
            }
-            return reallyConvert(in, out, conversionTag);
+            return reallyConvert(in, out, conversionTag, "Most" // DLC make me configurable
+                                 );
        } catch (ThdlLazyException e) {
            out.println("TibetanConverter has a BUG:");
            e.getRealException().printStackTrace(out);
@ -190,132 +203,155 @@ public class TibetanConverter implements FontConverterConstants {
        number of strings -- see the code.  Returns an appropriate
        return code so that TibetanConverter's usage message is
        honored. */
-    static int reallyConvert(InputStream in, PrintStream out, String ct) {
-        TibetanDocument tdoc = new TibetanDocument();
-        {
-            SimpleAttributeSet ras = new SimpleAttributeSet();
-            StyleConstants.setFontFamily(ras,
-                                         ThdlOptions.getStringOption("thdl.default.roman.font.face",
-                                                                     "Serif"));
-            StyleConstants.setFontSize(ras,
-                                       ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
-                                                                    14));
-            tdoc.setRomanAttributeSet(ras);
-        }
-        try {
-            // Read in the rtf file.
-            if (debug) System.err.println("Start: reading in old RTF file");
-            if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
-                in = new RTFFixerInputStream(in);
-            (new RTFEditorKit()).read(in, tdoc, 0);
-            if (debug) System.err.println("End  : reading in old RTF file");
-        } catch (Exception e) {
-            out.println("TibetanConverter:\n"
-                        + rtfErrorMessage);
-            return 3;
-        }
-
-        try {
-            in.close();
-        } catch (IOException e) {
-            // silently ignore; we don't care about the input so much...
-            ThdlDebug.noteIffyCode();
-        }
-
-
-        if (FIND_ALL_NON_TMW == ct) {
-            // 0, -1 is the entire document.
-            int exitCode
-                = tdoc.findAllNonTMWCharacters(0, -1, out);
-            if (out.checkError())
-                exitCode = 41;
-            return exitCode;
-        } else if (FIND_SOME_NON_TMW == ct) {
-            // 0, -1 is the entire document.
-            int exitCode
-                = tdoc.findSomeNonTMWCharacters(0, -1, out);
-            if (out.checkError())
-                exitCode = 41;
-            return exitCode;
-        } else if (FIND_SOME_NON_TM == ct) {
-            // 0, -1 is the entire document.
-            int exitCode
-                = tdoc.findSomeNonTMCharacters(0, -1, out);
-            if (out.checkError())
-                exitCode = 41;
-            return exitCode;
-        } else if (FIND_ALL_NON_TM == ct) {
-            // 0, -1 is the entire document.
-            int exitCode
-                = tdoc.findAllNonTMCharacters(0, -1, out);
-            if (out.checkError())
-                exitCode = 41;
-            return exitCode;
-        } else { // conversion {to Wylie or TM} mode
-            // Fix curly braces in the entire document if the input is TMW:
-            if (TM_TO_TMW != ct) {
-                // DLC make me optional
-                if (debug) System.err.println("Start: solving curly brace problem");
-                tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
-                if (debug) System.err.println("End  : solving curly brace problem");
-            }
-
-            int exitCode = 0;
-            ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
-                             + ((TMW_TO_UNI == ct) ? 1 : 0)
-                             + ((TM_TO_TMW == ct) ? 1 : 0)
-                             + ((TMW_TO_WYLIE == ct) ? 1 : 0)
-                             == 1);
-            long numAttemptedReplacements[] = new long[] { 0 };
-            if (TMW_TO_WYLIE == ct) {
-                // Convert to THDL Wylie:
-                if (!tdoc.toWylie(0,
-                                  tdoc.getLength(),
-                                  numAttemptedReplacements)) {
-                    exitCode = 44;
-                }
-            } else if (TMW_TO_UNI == ct) {
-                StringBuffer errors = new StringBuffer();
-                // Convert to Unicode:
-                if (tdoc.convertToUnicode(0,
-                                          tdoc.getLength(),
-                                          errors,
-                                          ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
-                                          numAttemptedReplacements)) {
-                    System.err.println(errors);
-                    exitCode = 42;
-                }
-            } else if (TM_TO_TMW == ct) {
-                StringBuffer errors = new StringBuffer();
-                // Convert to TibetanMachineWeb:
-                if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
-                                      numAttemptedReplacements)) {
-                    System.err.println(errors);
-                    exitCode = 42;
-                }
-            } else {
-                ThdlDebug.verify(TMW_TO_TM == ct);
-                StringBuffer errors = new StringBuffer();
-                // Convert to TibetanMachine:
-                if (tdoc.convertToTM(0, tdoc.getLength(), errors,
-                                     numAttemptedReplacements)) {
-                    System.err.println(errors);
-                    exitCode = 42;
-                }
-            }
-
-            // Write to standard output the result:
+    static int reallyConvert(InputStream in, PrintStream out, String ct,
+                             String warningLevel) { // warningLevel: "None" turns embedded warnings off; it is also handed to ACIPConverter as-is
+        if (ACIP_TO_UNI == ct) { // ACIP input is handled here without TibetanDocument or the RTF reader
            try {
-                tdoc.writeRTFOutputStream(out);
+                ArrayList al = ACIPTshegBarScanner.scanStream(in, null,
+                                                              250 - 1 // DLC FIXME: make me configurable
+                                                              );
+                if (null == al)
+                    return 47; // scanStream returned null
+                StringBuffer warnings = new StringBuffer();
+                boolean embeddedWarnings = !"None".equals(warningLevel); // compare by value: a non-interned "None" must still disable warnings
+                if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
+                                                    embeddedWarnings,
+                                                    warningLevel))
+                    return 46; // convertToUnicode returned false
+                if (embeddedWarnings && warnings.length() > 0)
+                    return 45; // converted, but the warnings buffer is non-empty
+                else
+                    return 0;
            } catch (IOException e) {
-                exitCode = 40;
+                return 48; // IOException while scanning or converting
+            }
+        } else {
+            TibetanDocument tdoc = new TibetanDocument();
+            {
+                SimpleAttributeSet ras = new SimpleAttributeSet();
+                StyleConstants.setFontFamily(ras,
+                                             ThdlOptions.getStringOption("thdl.default.roman.font.face",
+                                                                         "Serif"));
+                StyleConstants.setFontSize(ras,
+                                           ThdlOptions.getIntegerOption("thdl.default.roman.font.size",
+                                                                        14));
+                tdoc.setRomanAttributeSet(ras);
+            }
+            try {
+                // Read in the rtf file.
+                if (debug) System.err.println("Start: reading in old RTF file");
+                if (!ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
+                    in = new RTFFixerInputStream(in);
+                (new RTFEditorKit()).read(in, tdoc, 0);
+                if (debug) System.err.println("End  : reading in old RTF file");
+            } catch (Exception e) {
+                out.println("TibetanConverter:\n"
+                            + rtfErrorMessage);
+                return 3;
            }
-            if (out.checkError())
-                exitCode = 41;
-            if (numAttemptedReplacements[0] < 1)
-                exitCode = 43;

-            return exitCode;
+            try {
+                in.close();
+            } catch (IOException e) {
+                // silently ignore; we don't care about the input so much...
+                ThdlDebug.noteIffyCode();
+            }
+
+
+            if (FIND_ALL_NON_TMW == ct) {
+                // 0, -1 is the entire document.
+                int exitCode
+                    = tdoc.findAllNonTMWCharacters(0, -1, out);
+                if (out.checkError())
+                    exitCode = 41;
+                return exitCode;
+            } else if (FIND_SOME_NON_TMW == ct) {
+                // 0, -1 is the entire document.
+                int exitCode
+                    = tdoc.findSomeNonTMWCharacters(0, -1, out);
+                if (out.checkError())
+                    exitCode = 41;
+                return exitCode;
+            } else if (FIND_SOME_NON_TM == ct) {
+                // 0, -1 is the entire document.
+                int exitCode
+                    = tdoc.findSomeNonTMCharacters(0, -1, out);
+                if (out.checkError())
+                    exitCode = 41;
+                return exitCode;
+            } else if (FIND_ALL_NON_TM == ct) {
+                // 0, -1 is the entire document.
+                int exitCode
+                    = tdoc.findAllNonTMCharacters(0, -1, out);
+                if (out.checkError())
+                    exitCode = 41;
+                return exitCode;
+            } else { // conversion {to Wylie or TM} mode
+                // Fix curly braces in the entire document if the input is TMW:
+                if (TM_TO_TMW != ct) {
+                    // DLC make me optional
+                    if (debug) System.err.println("Start: solving curly brace problem");
+                    tdoc.replaceTahomaCurlyBracesAndBackslashes(0, -1);
+                    if (debug) System.err.println("End  : solving curly brace problem");
+                }
+
+                int exitCode = 0;
+                ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0)
+                                 + ((TMW_TO_UNI == ct) ? 1 : 0)
+                                 + ((TM_TO_TMW == ct) ? 1 : 0)
+                                 + ((TMW_TO_WYLIE == ct) ? 1 : 0)
+                                 == 1);
+                long numAttemptedReplacements[] = new long[] { 0 };
+                if (TMW_TO_WYLIE == ct) {
+                    // Convert to THDL Wylie:
+                    if (!tdoc.toWylie(0,
+                                      tdoc.getLength(),
+                                      numAttemptedReplacements)) {
+                        exitCode = 44;
+                    }
+                } else if (TMW_TO_UNI == ct) {
+                    StringBuffer errors = new StringBuffer();
+                    // Convert to Unicode:
+                    if (tdoc.convertToUnicode(0,
+                                              tdoc.getLength(),
+                                              errors,
+                                              ThdlOptions.getStringOption("thdl.tmw.to.unicode.font").intern(),
+                                              numAttemptedReplacements)) {
+                        System.err.println(errors);
+                        exitCode = 42;
+                    }
+                } else if (TM_TO_TMW == ct) {
+                    StringBuffer errors = new StringBuffer();
+                    // Convert to TibetanMachineWeb:
+                    if (tdoc.convertToTMW(0, tdoc.getLength(), errors,
+                                          numAttemptedReplacements)) {
+                        System.err.println(errors);
+                        exitCode = 42;
+                    }
+                } else {
+                    ThdlDebug.verify(TMW_TO_TM == ct);
+                    StringBuffer errors = new StringBuffer();
+                    // Convert to TibetanMachine:
+                    if (tdoc.convertToTM(0, tdoc.getLength(), errors,
+                                         numAttemptedReplacements)) {
+                        System.err.println(errors);
+                        exitCode = 42;
+                    }
+                }
+
+                // Write to standard output the result:
+                try {
+                    tdoc.writeRTFOutputStream(out);
+                } catch (IOException e) {
+                    exitCode = 40;
+                }
+                if (out.checkError())
+                    exitCode = 41;
+                if (numAttemptedReplacements[0] < 1)
+                    exitCode = 43;
+
+                return exitCode;
+            }
        }
    }
 }