ACIP->Unicode, without going through TMW, is now possible, so long as \, the Sanskrit virama, is not used.

Of the 1370-odd ACIP texts I've got here, about 57% make it through the gauntlet (fewer if you demand a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
parent 245aac4911
commit 1afb3a0fdd
12 changed files with 646 additions and 40 deletions
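For reference, a minimal sketch of how the new ACIP->Unicode entry point can be driven from code, using the public ACIPConverter.convertToUnicode(String, StringBuffer) method added in this commit; the sample input and expected output come from the new PackageTest cases below, and the wrapper class name is made up for illustration:

    import org.thdl.tib.text.ttt.ACIPConverter;

    /** Illustrative only: converts one ACIP string and prints the result. */
    public class AcipToUnicodeSketch {
        public static void main(String[] args) {
            StringBuffer errors = new StringBuffer();
            // Per the new tests, "KA" should come out as U+0F40.
            String unicode = ACIPConverter.convertToUnicode("KA", errors);
            if (null == unicode) {
                System.err.println("Conversion failed; errors were:\n" + errors);
            } else {
                System.out.println(unicode);
            }
        }
    }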
@@ -318,13 +318,18 @@ Contributor(s): ______________________________________.
        <param name="my.included.source.file"
               value="org/thdl/tib/text/TibetanHTML.java"/>
      </antcall>
      <!-- Put TibetanConverter in Jskad's jar for those who want
           to use it. -->
      <!-- Put TibetanConverter and ACIPConverter in Jskad's jar for
           those who want to use them. -->
      <antcall target="our-internal-javac-task">
        <param name="mybin" value="${jskadbin}"/>
        <param name="my.included.source.file"
               value="org/thdl/tib/input/TibetanConverter.java"/>
      </antcall>
      <antcall target="our-internal-javac-task">
        <param name="mybin" value="${jskadbin}"/>
        <param name="my.included.source.file"
               value="org/thdl/tib/text/ttt/ACIPConverter.java"/>
      </antcall>
      <antcall target="our-internal-javac-task">
        <param name="mybin" value="${jskadbin}"/>
        <param name="my.included.source.file"
@@ -341,7 +341,7 @@ public final class LegalTshegBar
            EWC_ta, EWC_tha, EWC_da, EWC_na,
            EWC_pa, EWC_pha, EWC_ba, EWC_ma,
            EWC_tsa, EWC_tsha, EWC_dza, EWC_wa,
            EWC_zha, EWC_za, EWC_achung, EWC_ya,
            EWC_zha, EWC_za, EWC_achung, EWC_ya,
            EWC_ra, EWC_la, EWC_sha, EWC_sa,
            EWC_ha, EWC_a
        });
@@ -833,7 +833,7 @@ public final class LegalTshegBar
            return internalThrowThing(throwIfIllegal,
                                      errorBuf,
                                      "Illegal suffix -- not one of the ten legal suffixes: "
                                      + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
                                      + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0), false));
        }
    }
}
@@ -286,8 +286,41 @@ public class UnicodeUtils implements UnicodeConstants {
    }

    /** Returns a human-readable, ASCII form of the Unicode codepoint
        cp. */
    public static String unicodeCodepointToString(char cp) {
        cp. If shortenIfPossible is true, then printable ASCII
        characters will appear as themselves. */
    public static String unicodeCodepointToString(char cp,
                                                  boolean shortenIfPossible) {
        if (shortenIfPossible) {
            if ((cp >= 'a' && cp <= 'z')
                || (cp >= 'A' && cp <= 'Z')
                || (cp >= '0' && cp <= '9')
                || cp == '.'
                || cp == ','
                || cp == ' '
                || cp == '\''
                || cp == '"'
                || cp == '+'
                || cp == '-'
                || cp == '='
                || cp == '_'
                || cp == '@'
                || cp == '!'
                || cp == '#'
                || cp == '$'
                || cp == '%'
                || cp == '^'
                || cp == '&'
                || cp == '*'
                || cp == '\t'
                || cp == ':'
                || cp == '['
                || cp == ']'
                || cp == '('
                || cp == ')'
                || cp == '{'
                || cp == '}')
                return new String(new char[] { cp });
        }
        if (cp < '\u0010')
            return "\\u000" + Integer.toHexString((int)cp);
        else if (cp < '\u0100')
@@ -304,7 +337,19 @@ public class UnicodeUtils implements UnicodeConstants {
    public static String unicodeStringToString(String s) {
        StringBuffer sb = new StringBuffer(s.length() * 6);
        for (int i = 0; i < s.length(); i++) {
            sb.append(unicodeCodepointToString(s.charAt(i)));
            sb.append(unicodeCodepointToString(s.charAt(i), false));
        }
        return sb.toString();
    }

    /**
     * Returns the most succinct possible, human-readable, ASCII form
     * of the String s of Unicode codepoints. */
    public static String unicodeStringToPrettyString(String s) {
        if (s == null) return "null";
        StringBuffer sb = new StringBuffer(s.length() * 6);
        for (int i = 0; i < s.length(); i++) {
            sb.append(unicodeCodepointToString(s.charAt(i), true));
        }
        return sb.toString();
    }
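A sketch of what the new shortenIfPossible flag buys you; the expected strings follow the javadoc above and the UnicodeUtilsTest assertions below (the wrapper class is hypothetical):

    import org.thdl.tib.text.tshegbar.UnicodeUtils;

    /** Illustrative only: strict escaping vs. the new shortened, pretty form. */
    public class PrettyPrintSketch {
        public static void main(String[] args) {
            // Strict form always escapes: prints "\u0f40"
            System.out.println(UnicodeUtils.unicodeCodepointToString('\u0F40', false));
            // Shortened form leaves printable ASCII alone: prints "K"
            System.out.println(UnicodeUtils.unicodeCodepointToString('K', true));
            // The new unicodeStringToPrettyString keeps ASCII, escapes the rest: "K\u0f40"
            System.out.println(UnicodeUtils.unicodeStringToPrettyString("K\u0F40"));
        }
    }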
@@ -321,15 +321,15 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
     * Tests the {@link UnicodeUtils#unicodeCodepointToString(char)}
     * method. */
    public void testUnicodeCodepointToString() {
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000').equals("\\u0000"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001').equals("\\u0001"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F').equals("\\u000f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F').equals("\\u001f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF').equals("\\u00ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF').equals("\\u01ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF').equals("\\u0fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF').equals("\\u1fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF').equals("\\uffff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000', false).equals("\\u0000"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001', false).equals("\\u0001"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F', false).equals("\\u000f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F', false).equals("\\u001f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF', false).equals("\\u00ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF', false).equals("\\u01ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF', false).equals("\\u0fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF', false).equals("\\u1fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF', false).equals("\\uffff"));
    }

    /**
source/org/thdl/tib/text/ttt/ACIPConverter.java (new file, 208 lines)
@@ -0,0 +1,208 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/

package org.thdl.tib.text.ttt;

import java.io.*;
import java.util.ArrayList;
import java.util.Stack;

import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;

/**
 * This class is able to convert an ACIP file into Tibetan Machine Web.
 * From there, TMW->Unicode takes you to Unicode.
 * @author David Chandler
 */
public class ACIPConverter {
    static {
        // We don't want to load the TM or TMW font files ourselves:
        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

    /** Command-line converter. Gives error messages on standard
     * output about why we can't convert the document perfectly and
     * exits with non-zero return code, or is silent otherwise and
     * exits with code zero. <p>FIXME: not so efficient; copies the
     * whole file into memory first. */
    public static void main(String[] args)
        throws IOException // DLC FIXME: give nice error messages
    {
        boolean verbose = true;
        boolean strict = true;
        if (args.length != 2
            || (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
            System.err.println("Bad args! Need '--strict filename' or '--lenient filename'.");
            System.exit(1);
        }
        StringBuffer errors = new StringBuffer();
        int maxErrors = 250;
        ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);

        if (null == al) {
            System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
            System.err.println("Tibetan or English input?");
            System.err.println("");
            System.err.println("First " + maxErrors + " errors scanning ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
            System.exit(1);
        }
        if (errors.length() > 0) {
            System.err.println("Errors scanning ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting; please fix input file and try again.");
            System.exit(1);
        }

        convertToUnicode(al, System.out, errors);
        if (errors.length() > 0) {
            System.err.println("Errors converting ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting; please fix input file and try again.");
            System.exit(2);
        }
        if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
        System.exit(0);
    }

    /** Writes TMW/Latin to out. If errors occur in converting a
     * tsheg bar, then they are appended to errors if errors is
     * non-null. Returns true upon perfect success, false if errors
     * occurred.
     * @throws IOException if we cannot write to out
     */
    public static boolean convertToTMW(ArrayList scan, String latinFont,
                                       OutputStream out, StringBuffer errors)
        throws IOException
    {
        throw new Error("DLC UNIMPLEMENTED");
    }

    /** Returns UTF-8 encoded Unicode. A bit indirect, so use this
     * for testing only if performance is a concern. If errors occur
     * in scanning the ACIP or in converting a tsheg bar, then they
     * are appended to errors if errors is non-null. Returns the
     * conversion upon perfect success, null if errors occurred.
     */
    public static String convertToUnicode(String acip,
                                          StringBuffer errors) {
        ByteArrayOutputStream sw = new ByteArrayOutputStream();
        ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
        try {
            if (null != al && convertToUnicode(al, sw, errors)) {
                return sw.toString("UTF-8");
            } else {
                System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
                return null;
            }
        } catch (Exception e) {
            throw new Error(e.toString());
        }
    }

    /** Writes Unicode to out. If errors occur in converting a
     * tsheg bar, then they are appended to errors if errors is
     * non-null. Returns true upon perfect success, false if errors
     * occurred.
     * @throws IOException if we cannot write to out
     */
    public static boolean convertToUnicode(ArrayList scan,
                                           OutputStream out,
                                           StringBuffer errors)
        throws IOException
    {
        int sz = scan.size();
        boolean hasErrors = false;
        BufferedWriter writer
            = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
        for (int i = 0; i < sz; i++) {
            ACIPString s = (ACIPString)scan.get(i);
            int stype = s.getType();
            if (stype == ACIPString.ERROR) {
                hasErrors = true;
                writer.write("[#ERROR CONVERTING ACIP DOCUMENT: ");
                writer.write(s.getText());
                writer.write("]");
            } else {
                // DLC FIXME: what about 'no A on root stack' and 'no A on such-and-such stack' warnings?
                if (s.isLatin(stype)) {
                    if (stype == ACIPString.FOLIO_MARKER)
                        writer.write("{");
                    writer.write(s.getText());
                    if (stype == ACIPString.FOLIO_MARKER)
                        writer.write("}");
                } else {
                    String unicode = null;
                    if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
                        TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
                        String acipError;

                        if ((acipError = pl.getACIPError()) != null) {
                            hasErrors = true;
                            String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]";
                            writer.write(errorMessage);
                            if (null != errors)
                                errors.append(errorMessage + "\n");
                        } else {
                            TParseTree pt = pl.getParseTree();
                            if (null == pt) {
                                hasErrors = true;
                                String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]";
                                writer.write(errorMessage);
                                if (null != errors)
                                    errors.append(errorMessage + "\n");
                            } else {
                                TStackList sl = pt.getBestParse();
                                if (null == sl) {
                                    hasErrors = true;
                                    String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]";
                                    writer.write(errorMessage);
                                    if (null != errors)
                                        errors.append(errorMessage + "\n");
                                } else {
                                    unicode = sl.getUnicode();
                                    if (null == unicode) throw new Error("DLC: HOW?");
                                }
                            }
                        }
                    } else {
                        if (stype == ACIPString.START_SLASH)
                            unicode = "\u0F3C";
                        else if (stype == ACIPString.END_SLASH)
                            unicode = "\u0F3D";
                        else
                            unicode = ACIPRules.getUnicodeFor(s.getText(), false);
                        if (null == unicode) throw new Error("DLC: HOW?");
                    }
                    if (null != unicode) {
                        writer.write(unicode);
                    }
                }
            }
        }
        writer.close();
        return !hasErrors;
    }
}
// DLC FIXME: putting Tibetan in black, Sanskrit in green, and Latin
// in yellow would help you quickly decide if ZHIGN maybe should've
// been ZHING.
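A sketch of the same scan-then-convert pipeline that ACIPConverter.main uses, but writing to a file rather than standard output. It is placed in the org.thdl.tib.text.ttt package on the assumption that ACIPTshegBarScanner.scanFile may not be public, and the meaning of the boolean/int arguments (strict mode, error cap) is taken from the calls in main above:

    package org.thdl.tib.text.ttt;   // assumed, so package-level members are visible

    import java.io.FileOutputStream;
    import java.io.IOException;
    import java.util.ArrayList;

    /** Illustrative only: ACIP file named by args[0], UTF-8 Unicode out to args[1]. */
    class AcipFileSketch {
        public static void main(String[] args) throws IOException {
            StringBuffer errors = new StringBuffer();
            // false = not strict (main's --lenient mode); -1 = no cap on errors
            ArrayList scan = ACIPTshegBarScanner.scanFile(args[0], errors, false, -1);
            if (null == scan || errors.length() > 0) {
                System.err.println("Scanning problems:\n" + errors);
                return;
            }
            FileOutputStream out = new FileOutputStream(args[1]);
            // convertToUnicode closes its writer, which closes out as well
            boolean perfect = ACIPConverter.convertToUnicode(scan, out, errors);
            if (!perfect) System.err.println("Conversion problems:\n" + errors);
        }
    }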
@@ -28,9 +28,9 @@ class ACIPRules {
     * three. */
    public static int MAX_CONSONANT_LENGTH = 3;

    /** {'im:}, the longest "vowel", has 4 characters, so this is
     * four. */
    public static int MAX_VOWEL_LENGTH = 4;
    /** {'EEm:}, the longest "vowel", has 5 characters, so this is
     * five. */
    public static int MAX_VOWEL_LENGTH = 5;

    /** For O(1) {@link #isVowel(String)} calls. */
    private static HashSet acipVowels = null;
@@ -42,18 +42,9 @@ class ACIPRules {
        { "U", "u" },
        { "E", "e" },
        { "O", "o" },
        { "'I", "I" },
        { "'U", "U" },
        { "EE", "ai" },
        { "OO", "au" },
        { "i", "-i" },
        { "'i", "-I" },
        { "'A", "A" },
        { "'O", "Ao" },
        { "'E", "Ae" }
        // DLC I'm on my own with 'O and 'E, but GANG'O appears
        // and I wonder... so here are 'O and 'E. It's
        // consistent with 'I and 'A and 'U, at least.
        { "i", "-i" }
    };

    /** Returns true if and only if s is an ACIP "vowel". You can't
@@ -61,14 +52,24 @@ class ACIPRules {
     * ACIP, so you have to call this in the right context. */
    public static boolean isVowel(String s) {
        if (null == acipVowels) {
            acipVowels = new HashSet();
            acipVowels = new HashSet(baseVowels.length * 8);
            for (int i = 0; i < baseVowels.length; i++) {
                acipVowels.add(baseVowels[i][0]);
                acipVowels.add(baseVowels[i][0] + 'm');
                acipVowels.add(baseVowels[i][0] + ':');
                acipVowels.add(baseVowels[i][0] + "m:");
                // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
                // DLC I'm on my own with 'O and 'E and 'OO and 'EE, but
                // GANG'O appears and I wonder... so here they are. It's
                // consistent with 'I and 'A and 'U, at least: all the vowels
                // may appear as K'vowel.

                acipVowels.add(baseVowels[i][0]);
                acipVowels.add('\'' + baseVowels[i][0]);
                acipVowels.add(baseVowels[i][0] + 'm');
                acipVowels.add('\'' + baseVowels[i][0] + 'm');
                acipVowels.add(baseVowels[i][0] + ':');
                acipVowels.add('\'' + baseVowels[i][0] + ':');
                acipVowels.add(baseVowels[i][0] + "m:");
                acipVowels.add('\'' + baseVowels[i][0] + "m:");
                // DLC keep this code in sync with getUnicodeFor.

                // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
            }
        }
        return (acipVowels.contains(s));
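A small sketch of what the rewritten isVowel now accepts; because ACIPRules is package-private, the check is shown as it would run from inside org.thdl.tib.text.ttt (an assumption), and the expected answers follow from the generated set above:

    package org.thdl.tib.text.ttt;   // assumed: ACIPRules is package-private

    /** Illustrative only: each base vowel now also matches its ', m and : variants. */
    class VowelSketch {
        public static void main(String[] args) {
            System.out.println(ACIPRules.isVowel("EE"));     // true: a base vowel
            System.out.println(ACIPRules.isVowel("'EEm:"));  // true: generated ' + EE + m + : variant
            System.out.println(ACIPRules.isVowel("K"));      // false: a consonant, not a vowel
        }
    }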
@@ -204,4 +205,212 @@ class ACIPRules {
        }
        return (String)acipVowel2wylie.get(acip);
    }

    private static HashMap superACIP2unicode = null;
    private static HashMap subACIP2unicode = null;
    /** If acip is an ACIP consonant or vowel or punctuation mark,
     * then this returns the Unicode for it. The Unicode for the
     * subscribed form of the glyph is returned if subscribed is
     * true. Returns null if acip is unknown. */
    static String getUnicodeFor(String acip, boolean subscribed) {
        if (superACIP2unicode == null) {
            superACIP2unicode = new HashMap(144);
            subACIP2unicode = new HashMap(42);

            // oddball:
            subACIP2unicode.put("V", "\u0FAD");

            superACIP2unicode.put("DH", "\u0F52");
            subACIP2unicode.put("DH", "\u0FA2");
            superACIP2unicode.put("BH", "\u0F57");
            subACIP2unicode.put("BH", "\u0FA7");
            superACIP2unicode.put("dH", "\u0F4D");
            subACIP2unicode.put("dH", "\u0F9D");
            superACIP2unicode.put("DZH", "\u0F5C");
            subACIP2unicode.put("DZH", "\u0FAC");
            superACIP2unicode.put("Ksh", "\u0F69");
            subACIP2unicode.put("Ksh", "\u0FB9");
            superACIP2unicode.put("GH", "\u0F43");
            subACIP2unicode.put("GH", "\u0F93");
            superACIP2unicode.put("K", "\u0F40");
            subACIP2unicode.put("K", "\u0F90");
            superACIP2unicode.put("KH", "\u0F41");
            subACIP2unicode.put("KH", "\u0F91");
            superACIP2unicode.put("G", "\u0F42");
            subACIP2unicode.put("G", "\u0F92");
            superACIP2unicode.put("NG", "\u0F44");
            subACIP2unicode.put("NG", "\u0F94");
            superACIP2unicode.put("C", "\u0F45");
            subACIP2unicode.put("C", "\u0F95");
            superACIP2unicode.put("CH", "\u0F46");
            subACIP2unicode.put("CH", "\u0F96");
            superACIP2unicode.put("J", "\u0F47");
            subACIP2unicode.put("J", "\u0F97");
            superACIP2unicode.put("NY", "\u0F49");
            subACIP2unicode.put("NY", "\u0F99");
            superACIP2unicode.put("T", "\u0F4F");
            subACIP2unicode.put("T", "\u0F9F");
            superACIP2unicode.put("TH", "\u0F50");
            subACIP2unicode.put("TH", "\u0FA0");
            superACIP2unicode.put("D", "\u0F51");
            subACIP2unicode.put("D", "\u0FA1");
            superACIP2unicode.put("N", "\u0F53");
            subACIP2unicode.put("N", "\u0FA3");
            superACIP2unicode.put("P", "\u0F54");
            subACIP2unicode.put("P", "\u0FA4");
            superACIP2unicode.put("PH", "\u0F55");
            subACIP2unicode.put("PH", "\u0FA5");
            superACIP2unicode.put("B", "\u0F56");
            subACIP2unicode.put("B", "\u0FA6");
            superACIP2unicode.put("M", "\u0F58");
            subACIP2unicode.put("M", "\u0FA8");
            superACIP2unicode.put("TZ", "\u0F59");
            subACIP2unicode.put("TZ", "\u0FA9");
            superACIP2unicode.put("TS", "\u0F5A");
            subACIP2unicode.put("TS", "\u0FAA");
            superACIP2unicode.put("DZ", "\u0F5B");
            subACIP2unicode.put("DZ", "\u0FAB");
            superACIP2unicode.put("W", "\u0F5D");
            subACIP2unicode.put("W", "\u0FBA"); // oddball
            superACIP2unicode.put("ZH", "\u0F5E");
            subACIP2unicode.put("ZH", "\u0FAE");
            superACIP2unicode.put("Z", "\u0F5F");
            subACIP2unicode.put("Z", "\u0FAF");
            superACIP2unicode.put("'", "\u0F60");
            subACIP2unicode.put("'", "\u0FB0");
            superACIP2unicode.put("Y", "\u0F61");
            subACIP2unicode.put("Y", "\u0FB1");
            superACIP2unicode.put("R", "\u0F62");
            subACIP2unicode.put("R", "\u0FB2");
            superACIP2unicode.put("L", "\u0F63");
            subACIP2unicode.put("L", "\u0FB3");
            superACIP2unicode.put("SH", "\u0F64");
            subACIP2unicode.put("SH", "\u0FB4");
            superACIP2unicode.put("S", "\u0F66");
            subACIP2unicode.put("S", "\u0FB6");
            superACIP2unicode.put("H", "\u0F67");
            subACIP2unicode.put("H", "\u0FB7");
            superACIP2unicode.put("A", "\u0F68");
            subACIP2unicode.put("A", "\u0FB8");
            superACIP2unicode.put("t", "\u0F4A");
            subACIP2unicode.put("t", "\u0F9A");
            superACIP2unicode.put("th", "\u0F4B");
            subACIP2unicode.put("th", "\u0F9B");
            superACIP2unicode.put("d", "\u0F4C");
            subACIP2unicode.put("d", "\u0F9C");
            superACIP2unicode.put("n", "\u0F4E");
            subACIP2unicode.put("n", "\u0F9E");
            superACIP2unicode.put("sh", "\u0F65");
            subACIP2unicode.put("sh", "\u0FB5");

            superACIP2unicode.put("I", "\u0F72");
            superACIP2unicode.put("E", "\u0F7A");
            superACIP2unicode.put("O", "\u0F7C");
            superACIP2unicode.put("U", "\u0F74");
            superACIP2unicode.put("OO", "\u0F7D");
            superACIP2unicode.put("EE", "\u0F7B");
            superACIP2unicode.put("i", "\u0F80");
            superACIP2unicode.put("'A", "\u0F71");
            superACIP2unicode.put("'I", "\u0F71\u0F72");
            superACIP2unicode.put("'E", "\u0F71\u0F7A");
            superACIP2unicode.put("'O", "\u0F71\u0F7C");
            superACIP2unicode.put("'U", "\u0F71\u0F74");
            superACIP2unicode.put("'OO", "\u0F71\u0F7D");
            superACIP2unicode.put("'EE", "\u0F71\u0F7B");
            superACIP2unicode.put("'i", "\u0F71\u0F80");

            superACIP2unicode.put("Im", "\u0F72\u0F7E");
            superACIP2unicode.put("Em", "\u0F7A\u0F7E");
            superACIP2unicode.put("Om", "\u0F7C\u0F7E");
            superACIP2unicode.put("Um", "\u0F74\u0F7E");
            superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
            superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
            superACIP2unicode.put("im", "\u0F80\u0F7E");
            superACIP2unicode.put("'Am", "\u0F71\u0F7E");
            superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
            superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
            superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
            superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
            superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
            superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
            superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");

            superACIP2unicode.put("I:", "\u0F72\u0F7F");
            superACIP2unicode.put("E:", "\u0F7A\u0F7F");
            superACIP2unicode.put("O:", "\u0F7C\u0F7F");
            superACIP2unicode.put("U:", "\u0F74\u0F7F");
            superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
            superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
            superACIP2unicode.put("i:", "\u0F80\u0F7F");
            superACIP2unicode.put("'A:", "\u0F71\u0F7F");
            superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
            superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
            superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
            superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
            superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
            superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
            superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");

            superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
            superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
            superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
            superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
            superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
            superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
            superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
            superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
            superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
            superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
            superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
            superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
            superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
            superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
            superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
            // :m does not appear, though you'd think it's as valid as m:.

            // I doubt these will occur alone:
            superACIP2unicode.put("m", "\u0F7E");
            superACIP2unicode.put(":", "\u0F7F");

            superACIP2unicode.put("Am", "\u0F7E");
            superACIP2unicode.put("A:", "\u0F7F");

            superACIP2unicode.put("0", "\u0F20");
            superACIP2unicode.put("1", "\u0F21");
            superACIP2unicode.put("2", "\u0F22");
            superACIP2unicode.put("3", "\u0F23");
            superACIP2unicode.put("4", "\u0F24");
            superACIP2unicode.put("5", "\u0F25");
            superACIP2unicode.put("6", "\u0F26");
            superACIP2unicode.put("7", "\u0F27");
            superACIP2unicode.put("8", "\u0F28");
            superACIP2unicode.put("9", "\u0F29");

            // DLC punctuation
            superACIP2unicode.put("&", "\u0F85");
            superACIP2unicode.put(",", "\u0F0D");
            superACIP2unicode.put(" ", "\u0F0B");
            superACIP2unicode.put(".", "\u0F0C");
            superACIP2unicode.put("`", "\u0F08");
            superACIP2unicode.put("`", "\u0F08");
            superACIP2unicode.put("*", "\u0F04\u0F05");
            superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
            superACIP2unicode.put("%", "\u0F35");
            superACIP2unicode.put(";", "\u0F11");
            superACIP2unicode.put("\r", "\r");
            superACIP2unicode.put("\t", "\t");
            superACIP2unicode.put("\n", "\n");
            superACIP2unicode.put("\\", "\u0F84"); // DLC FIXME: make this like a vowel
            // DLC FIXME: what's the Unicode for caret, ^?
            // DLC FIXME: what's the Unicode for o?
            // DLC FIXME: what's the Unicode for x?

        }
        if (subscribed) {
            String u = (String)subACIP2unicode.get(acip);
            if (null != u) return u;
        }
        return (String)superACIP2unicode.get(acip);

    }
}
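A sketch of the super/subscribed lookup that getUnicodeFor performs; the values are read straight out of the tables above, and the package placement is again an assumption since the method is package-private:

    package org.thdl.tib.text.ttt;   // assumed: getUnicodeFor is package-private

    /** Illustrative only: root vs. subjoined consonants, and compound vowels. */
    class GetUnicodeSketch {
        public static void main(String[] args) {
            System.out.println(ACIPRules.getUnicodeFor("K", false));     // "\u0F40" (root KA)
            System.out.println(ACIPRules.getUnicodeFor("K", true));      // "\u0F90" (subjoined KA)
            System.out.println(ACIPRules.getUnicodeFor("'EEm:", false)); // "\u0F71\u0F7B\u0F7E\u0F7F"
            System.out.println(ACIPRules.getUnicodeFor("Q", false));     // null: not in either table
        }
    }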
@@ -30,6 +30,15 @@ public class ACIPString {
    private int type;
    private String text;

    /** Returns true if and only if an ACIPString with type type is to
     * be converted to Latin, not Tibetan, text. */
    public static boolean isLatin(int type) {
        return (type != TIBETAN_NON_PUNCTUATION
                && type != TIBETAN_PUNCTUATION
                && type != START_SLASH
                && type != END_SLASH);
    }

    /** For [#COMMENTS] */
    public static final int COMMENT = 0;
    /** For Folio markers like @012B */
@@ -57,7 +57,6 @@ public class ACIPTshegBarScanner {
            System.out.println(errors);
            System.out.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
            System.exit(1);
        } else {
        }
        if (errors.length() > 0) {
            System.out.println("Errors scanning ACIP input file: ");
@@ -90,6 +89,7 @@ public class ACIPTshegBarScanner {
        while (-1 != (amt = in.read(ch))) {
            s.append(ch, 0, amt);
        }
        in.close();
        return scan(s.toString(), errors, !strict, maxErrors);
    }

@@ -621,6 +621,18 @@ public class ACIPTshegBarScanner {
                }

                if (startSlashIndex >= 0) {
                    if (startSlashIndex + 1 == i) {
                        /* //NYA\\ appears in ACIP input, and I think
                         * it means /NYA/. We warn about // for this
                         * reason. \\ causes a tsheg-bar error (DLC
                         * FIXME: verify this is so). */
                        al.add(new ACIPString("//", ACIPString.ERROR));
                        if (errors != null) {
                            errors.append("Offset " + i + ": "
                                          + "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
                        }
                        if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
                    }
                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.END_SLASH));
                    startOfString = i+1;
@@ -766,6 +778,9 @@ public class ACIPTshegBarScanner {
                    if ((int)ch == 65533) {
                        errors.append("Offset " + i + ": "
                                      + "Found an illegal, unprintable character.\n");
                    } else if ('\\' == ch) {
                        errors.append("Offset " + i + ": "
                                      + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
                    } else {
                        errors.append("Offset " + i + ": "
                                      + "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
@@ -849,7 +864,7 @@ public class ACIPTshegBarScanner {
                   || ch == 'x'
                   || ch == ':'
                   || ch == '^'
                   || ch == '\\'
                   // DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\'

                   || ch == '-'
                   || ch == '+'
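A sketch of the scanner on its own, tokenizing a short ACIP string the way the shelp tests in PackageTest below do. The scan signature and argument meanings (errors buffer, lenient flag, error cap) are inferred from the calls above, and the package placement is an assumption:

    package org.thdl.tib.text.ttt;   // assumed, as in the other sketches

    import java.util.ArrayList;

    /** Illustrative only: prints each token's numeric type and text
        (the shelp expectations show the symbolic type names instead). */
    class ScannerSketch {
        public static void main(String[] args) {
            StringBuffer errors = new StringBuffer();
            ArrayList tokens = ACIPTshegBarScanner.scan(",NGES ? PA", errors, true, -1);
            if (null == tokens) {
                System.err.println(errors);
                return;
            }
            for (int i = 0; i < tokens.size(); i++) {
                ACIPString t = (ACIPString) tokens.get(i);
                System.out.println(t.getType() + ": {" + t.getText() + "}");
            }
            if (errors.length() > 0) System.err.println(errors);
        }
    }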
@@ -292,6 +292,12 @@ public class PackageTest extends TestCase {
                  new String[] { "{SH}{LO}", "{SH+LO}" },
                  new String[] { "{SH+LO}" });
        tstHelper("ZLUM", "{Z}{LU}{M}", new String[] { "{Z}{LU}{M}", "{Z+LU}{M}" }, new String[] { "{Z+LU}{M}" });
        tstHelper("K'EE", "{K'EE}");
        tstHelper("K'O", "{K'O}");
        tstHelper("K'OO", "{K'OO}");
        tstHelper("K'II", "{K'I}{I}");
        tstHelper("K'i", "{K'i}");
        tstHelper("K'A", "{K'A}");
        tstHelper("B+DDZ", "{B+}{D}{DZ}",
                  new String[] { "{B+D}{DZ}",
                                 "{B+D+DZ}" }); // we're conservative.
@@ -6984,7 +6990,7 @@ tstHelper("ZUR");
        shelp("DD]",
              "Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");

        shelp("///NYA", "Offset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
        shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
        shelp("/NYA/", "");
        shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", "");
        shelp("[LS][# A [[[[[COMMENT][LS]",
@@ -7029,14 +7035,26 @@ tstHelper("ZUR");
        shelp("?", "", "[QUESTION:{?}]");
        shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n");
        shelp("[* Correction with []]",
              "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 15: Found an illegal character, i, with ordinal 105.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
              "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");

        // DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.

        // DLC FIXME: @0B1 isn't handled correctly!

        shelp(",NGES ? PA", "", "[TIBETAN_PUNCTUATION:{,}, TIBETAN_NON_PUNCTUATION:{NGES}, TIBETAN_PUNCTUATION:{ }, QUESTION:{?}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{PA}]");
        shelp("K\\,", "", "[TIBETAN_NON_PUNCTUATION:{K\\}, TIBETAN_PUNCTUATION:{,}]");

        // FIXME: just until we treat viramas correctly:
        if (false) {
            uhelp("1\\", "\u0f21\u0f84");
            uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b");
        }
        shelp("K\\,",
              "Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
              "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]");

        shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
        shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]");
        shelp("......,DAM ",
@@ -7078,8 +7096,70 @@ tstHelper("ZUR");

        shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT
        shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT
        shelp("//NYA\\\\",
              "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
              "[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]");

    }
    private static void uhelp(String acip) {
        uhelp(acip, null);
    }
    private static void uhelp(String acip, String expectedUnicode) {
        StringBuffer errors = new StringBuffer();
        String unicode = ACIPConverter.convertToUnicode(acip, errors);
        if (null == unicode) {
            if (null != expectedUnicode && "none" != expectedUnicode) {
                System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
                assertTrue(false);
            }
            System.out.println("DLC: Unicode for " + acip + " can't be had; errors are " + errors);
        } else {
            if (null != expectedUnicode && !expectedUnicode.equals(unicode)) {
                System.out.println("The unicode for " + acip + " is " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(unicode) + ", but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
                assertTrue(false);
            }
        }
    }

    public void testACIPConversion() {
        uhelp("G+DHA", "\u0f42\u0fa2");
        uhelp("P'EE", "\u0f54\u0f71\u0f7b");

        uhelp("KA", "\u0f40");
        uhelp("KI", "\u0f40\u0f72");
        uhelp("KO", "\u0f40\u0f7c");
        uhelp("KE", "\u0f40\u0f7a");
        uhelp("KU", "\u0f40\u0f74");
        uhelp("KOO", "\u0f40\u0f7d");
        uhelp("KEE", "\u0f40\u0f7b");
        uhelp("KEEm", "\u0f40\u0f7b\u0f7e");
        uhelp("KEEm:", "\u0f40\u0f7b\u0f7e\u0f7f");
        uhelp("KEE:", "\u0f40\u0f7b\u0f7f");

        uhelp("K'I", "\u0f40\u0f71\u0f72");
        uhelp("K'O", "\u0f40\u0f71\u0f7c");
        uhelp("K'E", "\u0f40\u0f71\u0f7a");
        uhelp("K'U", "\u0f40\u0f71\u0f74");
        uhelp("K'OO", "\u0f40\u0f71\u0f7d");
        uhelp("K'EE", "\u0f40\u0f71\u0f7b");
        uhelp("K'EEm", "\u0f40\u0f71\u0f7b\u0f7e");
        tstHelper("K'EEm:", "{K'EEm:}",
                  new String[] { "{K'EEm:}" },
                  new String[] { },
                  "{K'EEm:}");
        uhelp("K'EEm:", "\u0f40\u0f71\u0f7b\u0f7e\u0f7f");
        uhelp("K'EE:", "\u0f40\u0f71\u0f7b\u0f7f");

        uhelp("K'A:", "\u0f40\u0f71\u0f7f");

        // DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make
        // text go from 24-point to 18-point. Thus, ACIP->Unicode.txt
        // is fundamentally flawed, whereas ACIP->Unicode.rtf is OK.

        uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
        uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
        uhelp("*#HUm: K+DHA GRO`;.,", "none");
    }

    /** Tests some more tsheg bars, these from Dr. Lacey's critical
        edition of Mahavyutpatti.
@@ -167,4 +167,19 @@ class TPair {
        if (null == rightWylie) rightWylie = "";
        return leftWylie + rightWylie;
    }

    /** Appends legal Unicode corresponding to this (possible
     * subscribed) pair to sb. DLC FIXME: which normalization form,
     * if any? */
    void getUnicode(StringBuffer sb, boolean subscribed) {
        if (null != getLeft()) {
            String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
            if (null != x) sb.append(x);
        }
        if (null != getRight()
            && !("-".equals(getRight()) || "A".equals(getRight()))) {
            String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
            if (null != x) sb.append(x);
        }
    }
}
@@ -603,5 +603,16 @@ class TPairList {
            }
        }
    }

    /** Appends legal Unicode corresponding to this stack to sb. DLC
     * FIXME: which normalization form, if any? */
    void getUnicode(StringBuffer sb) {
        boolean subscribed = false;
        for (int i = 0; i < size(); i++) {
            get(i).getUnicode(sb, subscribed);
            subscribed = true;
        }
    }

}
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.
@@ -205,6 +205,15 @@ class TStackList {
            throw new IllegalArgumentException("opl (" + opl + ") is bad for this stack list (" + toString() + ")");
        return false;
    }

    /** Returns legal Unicode corresponding to this tsheg bar. DLC FIXME: which normalization form, if any? */
    String getUnicode() {
        StringBuffer u = new StringBuffer(size());
        for (int i = 0; i < size(); i++) {
            get(i).getUnicode(u);
        }
        return u.toString();
    }
}

class BoolPair {
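The three new getUnicode methods chain together: TStackList.getUnicode walks its stacks, TPairList.getUnicode walks the pairs within one stack, and TPair.getUnicode emits the subscribed form of every pair after the first while skipping "-" and the inherent "A" vowel on the right. A sketch of the end-to-end effect, with the expected result taken from the G+DHA test above (the wrapper class is hypothetical):

    import org.thdl.tib.text.ttt.ACIPConverter;
    import org.thdl.tib.text.tshegbar.UnicodeUtils;

    /** Illustrative only: G+DHA is one stack of two pairs, so the G comes out
        as root U+0F42 and the DH as subjoined U+0FA2. */
    public class GetUnicodeChainSketch {
        public static void main(String[] args) {
            StringBuffer errors = new StringBuffer();
            String u = ACIPConverter.convertToUnicode("G+DHA", errors);
            if (null == u) {
                System.err.println(errors);
                return;
            }
            // Expected per the tests: "\u0f42\u0fa2"
            System.out.println(UnicodeUtils.unicodeStringToPrettyString(u));
        }
    }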