ACIP->Unicode, without going through TMW, is now possible, so long as

\, the Sanskrit virama, is not used. Of the 1370-odd ACIP texts I've got here, about 57% make it through the gauntlet (fewer if you demand a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
2003-08-18 02:38:54 +00:00 · 2003-08-18 02:38:54 +00:00 · 1afb3a0fdd
commit 1afb3a0fdd
parent 245aac4911
12 changed files with 646 additions and 40 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@ -0,0 +1,208 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.ttt;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Stack;
+
+import org.thdl.util.ThdlDebug;
+import org.thdl.util.ThdlOptions;
+
+/**
+* This class is able to convert an ACIP file into Tibetan Machine Web.
+* From there, TMW->Unicode takes you to Unicode.
+* @author David Chandler
+*/
+public class ACIPConverter {
+    static {
+        // We don't want to load the TM or TMW font files ourselves:
+        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
+        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
+        ThdlOptions.setUserPreference("thdl.debug", true);
+    }
+
+    /** Command-line converter.  Gives error messages on standard
+     *  output about why we can't convert the document perfectly and
+     *  exits with non-zero return code, or is silent otherwise and
+     *  exits with code zero.  <p>FIXME: not so efficient; copies the
+     *  whole file into memory first. */
+    public static void main(String[] args)
+        throws IOException // DLC FIXME: give nice error messages
+    {
+        boolean verbose = true;
+        boolean strict = true;
+        if (args.length != 2
+            || (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
+            System.err.println("Bad args!  Need '--strict filename' or '--lenient filename'.");
+            System.exit(1);
+        }
+        StringBuffer errors = new StringBuffer();
+        int maxErrors = 250;
+        ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);
+
+        if (null == al) {
+            System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
+            System.err.println("Tibetan or English input?");
+            System.err.println("");
+            System.err.println("First " + maxErrors + " errors scanning ACIP input file: ");
+            System.err.println(errors);
+            System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
+            System.exit(1);
+        }
+        if (errors.length() > 0) {
+            System.err.println("Errors scanning ACIP input file: ");
+            System.err.println(errors);
+            System.err.println("Exiting; please fix input file and try again.");
+            System.exit(1);
+        }
+
+        convertToUnicode(al, System.out, errors);
+        if (errors.length() > 0) {
+            System.err.println("Errors converting ACIP input file: ");
+            System.err.println(errors);
+            System.err.println("Exiting; please fix input file and try again.");
+            System.exit(2);
+        }
+        if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
+        System.exit(0);
+    }
+
+    /** Writes TMW/Latin to out.  If errors occur in converting a
+     *  tsheg bar, then they are appended to errors if errors is
+     *  non-null.  Returns true upon perfect success, false if errors
+     *  occurred.
+     *  @throws IOException if we cannot write to out
+     */
+    public static boolean convertToTMW(ArrayList scan, String latinFont,
+                                       OutputStream out, StringBuffer errors)
+        throws IOException
+    {
+        throw new Error("DLC UNIMPLEMENTED");
+    }
+
+    /** Returns UTF-8 encoded Unicode.  A bit indirect, so use this
+     *  for testing only if performance is a concern.  If errors occur
+     *  in scanning the ACIP or in converting a tsheg bar, then they
+     *  are appended to errors if errors is non-null.  Returns the
+     *  conversion upon perfect success, null if errors occurred.
+     */
+    public static String convertToUnicode(String acip,
+                                          StringBuffer errors) {
+        ByteArrayOutputStream sw = new ByteArrayOutputStream();
+        ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
+        try {
+            if (null != al && convertToUnicode(al, sw, errors)) {
+                return sw.toString("UTF-8");
+            } else {
+                System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
+                return null;
+            }
+        } catch (Exception e) {
+            throw new Error(e.toString());
+        }
+    }
+
+    /** Writes Unicode to out.  If errors occur in converting a
+     *  tsheg bar, then they are appended to errors if errors is
+     *  non-null.  Returns true upon perfect success, false if errors
+     *  occurred.
+     *  @throws IOException if we cannot write to out
+     */
+    public static boolean convertToUnicode(ArrayList scan,
+                                           OutputStream out,
+                                           StringBuffer errors)
+        throws IOException
+    {
+        int sz = scan.size();
+        boolean hasErrors = false;
+        BufferedWriter writer
+            = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
+        for (int i = 0; i < sz; i++) {
+            ACIPString s = (ACIPString)scan.get(i);
+            int stype = s.getType();
+            if (stype == ACIPString.ERROR) {
+                hasErrors = true;
+                writer.write("[#ERROR CONVERTING ACIP DOCUMENT: ");
+                writer.write(s.getText());
+                writer.write("]");
+            } else {
+                // DLC FIXME: what about 'no A on root stack' and 'no A on such-and-such stack' warnings?
+                if (s.isLatin(stype)) {
+                    if (stype == ACIPString.FOLIO_MARKER)
+                        writer.write("{");
+                    writer.write(s.getText());
+                    if (stype == ACIPString.FOLIO_MARKER)
+                        writer.write("}");
+                } else {
+                    String unicode = null;
+                    if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
+                        TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
+                        String acipError;
+
+                        if ((acipError = pl.getACIPError()) != null) {
+                            hasErrors = true;
+                            String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]";
+                            writer.write(errorMessage);
+                            if (null != errors)
+                                errors.append(errorMessage + "\n");
+                        } else {
+                            TParseTree pt = pl.getParseTree();
+                            if (null == pt) {
+                                hasErrors = true;
+                                String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]";
+                                writer.write(errorMessage);
+                                if (null != errors)
+                                    errors.append(errorMessage + "\n");
+                            } else {
+                                TStackList sl = pt.getBestParse();
+                                if (null == sl) {
+                                    hasErrors = true;
+                                    String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]";
+                                    writer.write(errorMessage);
+                                    if (null != errors)
+                                        errors.append(errorMessage + "\n");
+                                } else {
+                                    unicode = sl.getUnicode();
+                                    if (null == unicode) throw new Error("DLC: HOW?");
+                                }
+                            }
+                        }
+                    } else {
+                        if (stype == ACIPString.START_SLASH)
+                            unicode = "\u0F3C";
+                        else if (stype == ACIPString.END_SLASH)
+                            unicode = "\u0F3D";
+                        else
+                            unicode = ACIPRules.getUnicodeFor(s.getText(), false);
+                        if (null == unicode) throw new Error("DLC: HOW?");
+                    }
+                    if (null != unicode) {
+                        writer.write(unicode);
+                    }
+                }
+            }
+        }
+        writer.close();
+        return !hasErrors;
+    }
+}
+// DLC FIXME: putting Tibetan in black, Sanskrit in green, and Latin
+// in yellow would help you quickly decide if ZHIGN maybe should've
+// been ZHING.