ACIP->Unicode, without going through TMW, is now possible, so long as

\, the Sanskrit virama, is not used. Of the 1370-odd ACIP texts I've got here, about 57% make it through the gauntlet (fewer if you demand a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
2003-08-18 02:38:54 +00:00 · 2003-08-18 02:38:54 +00:00 · 1afb3a0fdd
commit 1afb3a0fdd
parent 245aac4911
12 changed files with 646 additions and 40 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPRules.java
+++ b/source/org/thdl/tib/text/ttt/ACIPRules.java
@@ -28,9 +28,9 @@ class ACIPRules {
     *  three. */
    public static int MAX_CONSONANT_LENGTH = 3;

-    /** {'im:}, the longest "vowel", has 4 characters, so this is
-     *  four. */
-    public static int MAX_VOWEL_LENGTH = 4;
+    /** {'EEm:}, the longest "vowel", has 5 characters, so this is
+     *  five. */
+    public static int MAX_VOWEL_LENGTH = 5;

    /** For O(1) {@link #isVowel(String)} calls. */
    private static HashSet acipVowels = null;
@@ -42,18 +42,9 @@ class ACIPRules {
        { "U", "u" },
        { "E", "e" },
        { "O", "o" },
-        { "'I", "I" },
-        { "'U", "U" },
        { "EE", "ai" },
        { "OO", "au" },
-        { "i", "-i" },
-        { "'i", "-I" },
-        { "'A", "A" },
-        { "'O", "Ao" },
-        { "'E", "Ae" }
-        // DLC I'm on my own with 'O and 'E, but GANG'O appears
-        // and I wonder... so here are 'O and 'E.  It's
-        // consistent with 'I and 'A and 'U, at least.
+        { "i", "-i" }
    };

    /** Returns true if and only if s is an ACIP "vowel".  You can't
@@ -61,14 +52,24 @@ class ACIPRules {
     *  ACIP, so you have to call this in the right context. */
    public static boolean isVowel(String s) {
        if (null == acipVowels) {
-            acipVowels = new HashSet();
+            acipVowels = new HashSet(baseVowels.length * 8);
            for (int i = 0; i < baseVowels.length; i++) {
-                acipVowels.add(baseVowels[i][0]);
-                acipVowels.add(baseVowels[i][0] + 'm');
-                acipVowels.add(baseVowels[i][0] + ':');
-                acipVowels.add(baseVowels[i][0] + "m:");
-                // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
+                // DLC I'm on my own with 'O and 'E and 'OO and 'EE, but
+                // GANG'O appears and I wonder... so here they are.  It's
+                // consistent with 'I and 'A and 'U, at least: all the vowels
+                // may appear as K'vowel.

+                acipVowels.add(baseVowels[i][0]);
+                acipVowels.add('\'' + baseVowels[i][0]);
+                acipVowels.add(baseVowels[i][0] + 'm');
+                acipVowels.add('\'' + baseVowels[i][0] + 'm');
+                acipVowels.add(baseVowels[i][0] + ':');
+                acipVowels.add('\'' + baseVowels[i][0] + ':');
+                acipVowels.add(baseVowels[i][0] + "m:");
+                acipVowels.add('\'' + baseVowels[i][0] + "m:");
+                // DLC keep this code in sync with getUnicodeFor.
+
+                // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
            }
        }
        return (acipVowels.contains(s));
@@ -204,4 +205,212 @@ class ACIPRules {
        }
        return (String)acipVowel2wylie.get(acip);
    }
+
+    private static HashMap superACIP2unicode = null;
+    private static HashMap subACIP2unicode = null;
+    /** If acip is an ACIP consonant or vowel or punctuation mark,
+     *  then this returns the Unicode for it.  The Unicode for the
+     *  subscribed form of the glyph is returned if subscribed is
+     *  true.  Returns null if acip is unknown. */
+    static String getUnicodeFor(String acip, boolean subscribed) {
+        if (superACIP2unicode == null) {
+            superACIP2unicode = new HashMap(144);
+            subACIP2unicode = new HashMap(42);
+
+            // oddball:
+            subACIP2unicode.put("V", "\u0FAD");
+
+            superACIP2unicode.put("DH", "\u0F52");
+            subACIP2unicode.put("DH", "\u0FA2");
+            superACIP2unicode.put("BH", "\u0F57");
+            subACIP2unicode.put("BH", "\u0FA7");
+            superACIP2unicode.put("dH", "\u0F4D");
+            subACIP2unicode.put("dH", "\u0F9D");
+            superACIP2unicode.put("DZH", "\u0F5C");
+            subACIP2unicode.put("DZH", "\u0FAC");
+            superACIP2unicode.put("Ksh", "\u0F69");
+            subACIP2unicode.put("Ksh", "\u0FB9");
+            superACIP2unicode.put("GH", "\u0F43");
+            subACIP2unicode.put("GH", "\u0F93");
+            superACIP2unicode.put("K", "\u0F40");
+            subACIP2unicode.put("K", "\u0F90");
+            superACIP2unicode.put("KH", "\u0F41");
+            subACIP2unicode.put("KH", "\u0F91");
+            superACIP2unicode.put("G", "\u0F42");
+            subACIP2unicode.put("G", "\u0F92");
+            superACIP2unicode.put("NG", "\u0F44");
+            subACIP2unicode.put("NG", "\u0F94");
+            superACIP2unicode.put("C", "\u0F45");
+            subACIP2unicode.put("C", "\u0F95");
+            superACIP2unicode.put("CH", "\u0F46");
+            subACIP2unicode.put("CH", "\u0F96");
+            superACIP2unicode.put("J", "\u0F47");
+            subACIP2unicode.put("J", "\u0F97");
+            superACIP2unicode.put("NY", "\u0F49");
+            subACIP2unicode.put("NY", "\u0F99");
+            superACIP2unicode.put("T", "\u0F4F");
+            subACIP2unicode.put("T", "\u0F9F");
+            superACIP2unicode.put("TH", "\u0F50");
+            subACIP2unicode.put("TH", "\u0FA0");
+            superACIP2unicode.put("D", "\u0F51");
+            subACIP2unicode.put("D", "\u0FA1");
+            superACIP2unicode.put("N", "\u0F53");
+            subACIP2unicode.put("N", "\u0FA3");
+            superACIP2unicode.put("P", "\u0F54");
+            subACIP2unicode.put("P", "\u0FA4");
+            superACIP2unicode.put("PH", "\u0F55");
+            subACIP2unicode.put("PH", "\u0FA5");
+            superACIP2unicode.put("B", "\u0F56");
+            subACIP2unicode.put("B", "\u0FA6");
+            superACIP2unicode.put("M", "\u0F58");
+            subACIP2unicode.put("M", "\u0FA8");
+            superACIP2unicode.put("TZ", "\u0F59");
+            subACIP2unicode.put("TZ", "\u0FA9");
+            superACIP2unicode.put("TS", "\u0F5A");
+            subACIP2unicode.put("TS", "\u0FAA");
+            superACIP2unicode.put("DZ", "\u0F5B");
+            subACIP2unicode.put("DZ", "\u0FAB");
+            superACIP2unicode.put("W", "\u0F5D");
+            subACIP2unicode.put("W", "\u0FBA"); // oddball
+            superACIP2unicode.put("ZH", "\u0F5E");
+            subACIP2unicode.put("ZH", "\u0FAE");
+            superACIP2unicode.put("Z", "\u0F5F");
+            subACIP2unicode.put("Z", "\u0FAF");
+            superACIP2unicode.put("'", "\u0F60");
+            subACIP2unicode.put("'", "\u0FB0");
+            superACIP2unicode.put("Y", "\u0F61");
+            subACIP2unicode.put("Y", "\u0FB1");
+            superACIP2unicode.put("R", "\u0F62");
+            subACIP2unicode.put("R", "\u0FB2");
+            superACIP2unicode.put("L", "\u0F63");
+            subACIP2unicode.put("L", "\u0FB3");
+            superACIP2unicode.put("SH", "\u0F64");
+            subACIP2unicode.put("SH", "\u0FB4");
+            superACIP2unicode.put("S", "\u0F66");
+            subACIP2unicode.put("S", "\u0FB6");
+            superACIP2unicode.put("H", "\u0F67");
+            subACIP2unicode.put("H", "\u0FB7");
+            superACIP2unicode.put("A", "\u0F68");
+            subACIP2unicode.put("A", "\u0FB8");
+            superACIP2unicode.put("t", "\u0F4A");
+            subACIP2unicode.put("t", "\u0F9A");
+            superACIP2unicode.put("th", "\u0F4B");
+            subACIP2unicode.put("th", "\u0F9B");
+            superACIP2unicode.put("d", "\u0F4C");
+            subACIP2unicode.put("d", "\u0F9C");
+            superACIP2unicode.put("n", "\u0F4E");
+            subACIP2unicode.put("n", "\u0F9E");
+            superACIP2unicode.put("sh", "\u0F65");
+            subACIP2unicode.put("sh", "\u0FB5");
+
+            superACIP2unicode.put("I", "\u0F72");
+            superACIP2unicode.put("E", "\u0F7A");
+            superACIP2unicode.put("O", "\u0F7C");
+            superACIP2unicode.put("U", "\u0F74");
+            superACIP2unicode.put("OO", "\u0F7D");
+            superACIP2unicode.put("EE", "\u0F7B");
+            superACIP2unicode.put("i", "\u0F80");
+            superACIP2unicode.put("'A", "\u0F71");
+            superACIP2unicode.put("'I", "\u0F71\u0F72");
+            superACIP2unicode.put("'E", "\u0F71\u0F7A");
+            superACIP2unicode.put("'O", "\u0F71\u0F7C");
+            superACIP2unicode.put("'U", "\u0F71\u0F74");
+            superACIP2unicode.put("'OO", "\u0F71\u0F7D");
+            superACIP2unicode.put("'EE", "\u0F71\u0F7B");
+            superACIP2unicode.put("'i", "\u0F71\u0F80");
+
+            superACIP2unicode.put("Im", "\u0F72\u0F7E");
+            superACIP2unicode.put("Em", "\u0F7A\u0F7E");
+            superACIP2unicode.put("Om", "\u0F7C\u0F7E");
+            superACIP2unicode.put("Um", "\u0F74\u0F7E");
+            superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
+            superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
+            superACIP2unicode.put("im", "\u0F80\u0F7E");
+            superACIP2unicode.put("'Am", "\u0F71\u0F7E");
+            superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
+            superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
+            superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
+            superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
+            superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
+            superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
+            superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
+
+            superACIP2unicode.put("I:", "\u0F72\u0F7F");
+            superACIP2unicode.put("E:", "\u0F7A\u0F7F");
+            superACIP2unicode.put("O:", "\u0F7C\u0F7F");
+            superACIP2unicode.put("U:", "\u0F74\u0F7F");
+            superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
+            superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
+            superACIP2unicode.put("i:", "\u0F80\u0F7F");
+            superACIP2unicode.put("'A:", "\u0F71\u0F7F");
+            superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
+            superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
+            superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
+            superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
+            superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
+            superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
+            superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
+
+            superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
+            superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
+            superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
+            superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
+            superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
+            superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
+            superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
+            superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
+            superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
+            superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
+            superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
+            superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
+            superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
+            superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
+            superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
+            // :m does not appear, though you'd think it's as valid as m:.
+
+            // I doubt these will occur alone:
+            superACIP2unicode.put("m", "\u0F7E");
+            superACIP2unicode.put(":", "\u0F7F");
+
+            superACIP2unicode.put("Am", "\u0F7E");
+            superACIP2unicode.put("A:", "\u0F7F");
+
+            superACIP2unicode.put("0", "\u0F20");
+            superACIP2unicode.put("1", "\u0F21");
+            superACIP2unicode.put("2", "\u0F22");
+            superACIP2unicode.put("3", "\u0F23");
+            superACIP2unicode.put("4", "\u0F24");
+            superACIP2unicode.put("5", "\u0F25");
+            superACIP2unicode.put("6", "\u0F26");
+            superACIP2unicode.put("7", "\u0F27");
+            superACIP2unicode.put("8", "\u0F28");
+            superACIP2unicode.put("9", "\u0F29");
+
+            // DLC punctuation
+            superACIP2unicode.put("&", "\u0F85");
+            superACIP2unicode.put(",", "\u0F0D");
+            superACIP2unicode.put(" ", "\u0F0B");
+            superACIP2unicode.put(".", "\u0F0C");
+            superACIP2unicode.put("`", "\u0F08");
+            superACIP2unicode.put("`", "\u0F08");
+            superACIP2unicode.put("*", "\u0F04\u0F05");
+            superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
+            superACIP2unicode.put("%", "\u0F35");
+            superACIP2unicode.put(";", "\u0F11");
+            superACIP2unicode.put("\r", "\r");
+            superACIP2unicode.put("\t", "\t");
+            superACIP2unicode.put("\n", "\n");
+            superACIP2unicode.put("\\", "\u0F84"); // DLC FIXME: make this like a vowel
+            // DLC FIXME: what's the Unicode for caret, ^?
+            // DLC FIXME: what's the Unicode for o?
+            // DLC FIXME: what's the Unicode for x?
+
+        }
+        if (subscribed) {
+            String u = (String)subACIP2unicode.get(acip);
+            if (null != u) return u;
+        }
+        return (String)superACIP2unicode.get(acip);
+
+    }
 }