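ACIP->Unicode now emits two characters instead of one for the consonants DH, BH, dH, DZH, Ksh, and GH (e.g. U+0F51 U+0FB7 rather than U+0F52), unless the option thdl.acip.to.unicode.conversions.use.0F52.et.cetera is set. This is in keeping with the general avoidance of precomposed characters such as U+0F77.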

ACIP->Tibetan was not giving an error for BCWA because it parsed like BCVA. Fixed.
2003-12-15 07:32:14 +00:00 · 2003-12-15 07:32:14 +00:00 · e7a9e7968f
commit e7a9e7968f
parent e9f7b2dfed
4 changed files with 102 additions and 54 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPRules.java
+++ b/source/org/thdl/tib/text/ttt/ACIPRules.java
@ -24,6 +24,7 @@ import java.util.HashMap;
 import java.util.StringTokenizer;
 import java.util.List;

+import org.thdl.util.ThdlOptions;
 import org.thdl.tib.text.DuffCode;
 import org.thdl.tib.text.THDLWylieConstants;
 import org.thdl.tib.text.TibetanMachineWeb;
@ -206,7 +207,11 @@ public class ACIPRules {
    private static HashMap acipConsonant2wylie = null;
    /** Returns the EWTS corresponding to the given ACIP consonant
     *  (without the "A" vowel).  Returns null if there is no such
-     *  EWTS. */
+     *  EWTS.
+     *
+     *  <p>Returns "W" for ACIP "W", "r" for ACIP "R", "y" for ACIP "Y",
+     *  even though sometimes the EWTS for those is "w", "R", or "Y".
+     *  Handle that in the caller. */
    static final String getWylieForACIPConsonant(String acip) {
        if (acipConsonant2wylie == null) {
            acipConsonant2wylie = new HashMap(37);
@ -242,7 +247,15 @@ public class ACIPRules {
            putMapping(acipConsonant2wylie, "TZ", "ts");
            putMapping(acipConsonant2wylie, "TS", "tsh");
            putMapping(acipConsonant2wylie, "DZ", "dz");
-            putMapping(acipConsonant2wylie, "W", "w");
+            putMapping(acipConsonant2wylie, "W", "W"
+                       /* NOTE WELL: sometimes "w", sometimes "W".
+                          Handle this in the caller.
+                          
+                          Reasoning for "W" instead of "w": r-w and
+                          r+w are both known hash keys.  We sort 'em
+                          out this way.  (They are the only things
+                          like this according to bug report #800166.)  */
+                       );
            putMapping(acipConsonant2wylie, "ZH", "zh");
            putMapping(acipConsonant2wylie, "Z", "z");
            putMapping(acipConsonant2wylie, "'", "'");
@ -329,24 +342,26 @@ public class ACIPRules {
     *  true.  Returns null if acip is unknown. */
    static String getUnicodeFor(String acip, boolean subscribed) {
        if (superACIP2unicode == null) {
+            final boolean compactUnicode
+                = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
            superACIP2unicode = new HashMap(144);
            subACIP2unicode = new HashMap(42);

            // oddball:
            subACIP2unicode.put("V", "\u0FAD");

-            superACIP2unicode.put("DH", "\u0F52");
-            subACIP2unicode.put("DH", "\u0FA2");
-            superACIP2unicode.put("BH", "\u0F57");
-            subACIP2unicode.put("BH", "\u0FA7");
-            superACIP2unicode.put("dH", "\u0F4D");
-            subACIP2unicode.put("dH", "\u0F9D");
-            superACIP2unicode.put("DZH", "\u0F5C");
-            subACIP2unicode.put("DZH", "\u0FAC");
-            superACIP2unicode.put("Ksh", "\u0F69");
-            subACIP2unicode.put("Ksh", "\u0FB9");
-            superACIP2unicode.put("GH", "\u0F43");
-            subACIP2unicode.put("GH", "\u0F93");
+            superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
+            subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
+            superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
+            subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
+            superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
+            subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
+            superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
+            subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
+            superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
+            subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
+            superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
+            subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
            superACIP2unicode.put("K", "\u0F40");
            subACIP2unicode.put("K", "\u0F90");
            superACIP2unicode.put("KH", "\u0F41");
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@ -7449,13 +7449,13 @@ M+NA
        uhelp("NA+YA", "\u0f53\u0fb1"); // FIXME: warn about the extra A
        uhelp("NE+YA", "[#ERROR CONVERTING ACIP DOCUMENT: The tsheg bar (\"syllable\") NE+YA has these errors: Cannot convert ACIP NE+-YA because + is not an ACIP consonant]");
        uhelp("tRAStA", "\u0f4a\u0fb2\u0f66\u0f9a");
-        uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fac"); // tricky because DZHDZA is not in TMW but DZHDZHA is
-        uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fab");
+        uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab\u0fb7"); // tricky because DZHDZA is not in TMW but DZHDZHA is
+        uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab");
        uhelp("P+S+N+YA", "\u0f54\u0fb6\u0fa3\u0fb1");
        uhelp("P+S+NYA", "\u0f54\u0fb6\u0f99");
        uhelp("PSNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP PSNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f54\u0fb6\u0f99"); // Is this P+S+N+YA?  No, it's P+S+NYA.  But warn!
        uhelp("NNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP NNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f53\u0f99");
-        uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f43\u0f99");
+        uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f42\u0fb7\u0f99");

        // TS+NYA and T+S+N+YA are both legal, so what is TSNYA?
        // Private correspondence with Robert Chilton says that it is
@ -7471,7 +7471,7 @@ M+NA
        uhelp("KAo KHA", "\u0f40\u0f37\u0f0b\u0f41");
        uhelp("KA KAo KHA", "\u0f40\u0f0b\u0f40\u0f37\u0f0b\u0f41");
        uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot convert the ACIP {x} to Tibetan because it is unclear what the result should be.]");
-        uhelp("G+DHA", "\u0f42\u0fa2");
+        uhelp("G+DHA", "\u0f42\u0fa1\u0fb7");
        uhelp("P'EE", "\u0f54\u0f71\u0f7b");

        uhelp("KA", "\u0f40");
@ -7523,9 +7523,9 @@ M+NA

        uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
        uhelp("*#HUm: G+DHOO GRO`;.,",
-              "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
+              "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa1\u0fb7\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
        uhelp("*#HUm: K+DHA GRO`;.,",
-              "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa2\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
+              "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa1\u0fb7\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
        uhelp("HA,\nHA\n\nHA", "\u0f67\u0f0d \u0f67\u0f0b\n\n\u0f67");
        uhelp("NGA,", "\u0f44\u0f0c\u0f0d");
        uhelp("NGA,\nHA\n\nHA", "\u0f44\u0f0c\u0f0d \u0f67\u0f0b\n\n\u0f67");
@ -7538,6 +7538,36 @@ M+NA
        uhelp("GU, ,KHO", "\u0f42\u0f74\u0f0d \u0f0d\u0f41\u0f7c");
        uhelp("GU  ,KHO", "\u0f42\u0f74\u0f0b \u0f0d\u0f41\u0f7c"); // FIXME: missing a shad after GU, warn about that.
        uhelp("GA  HA", "\u0f42\u0f0b \u0f67");
+        uhelp("WA", "\u0f5d");
+        uhelp("W", "\u0f5d");
+        uhelp("WO", "\u0f5d\u0f7c");
+        uhelp("WWA", "\u0f5d\u0fba");
+        uhelp("W+WA", "\u0f5d\u0fba");
+        uhelp("WNA", "\u0f5d\u0fa3");
+        uhelp("WN", "\u0f5d\u0fa3");
+        uhelp("W+NA", "\u0f5d\u0fa3");
+        uhelp("W+N", "\u0f5d\u0fa3");
+        uhelp("W+YA", "\u0f5d\u0fb1");
+        uhelp("W+Y", "\u0f5d\u0fb1");
+        uhelp("WYA", "\u0f5d\u0fb1");
+        uhelp("WY", "\u0f5d\u0fb1");
+        uhelp("WR", "\u0f5d\u0fb2");
+        uhelp("WRA", "\u0f5d\u0fb2");
+        uhelp("W+RA", "\u0f5d\u0fb2");
+        uhelp("W+R", "\u0f5d\u0fb2");
+        uhelp("BCWA", "\u0f56\u0f95\u0fba");
+        uhelp("BCW", "\u0f56\u0f95\u0fba");
+        uhelp("BCWO", "\u0f56\u0f95\u0fba\u0f7c");
+        uhelp("BCVA", "\u0f56\u0f45\u0fad");
+        uhelp("BCV", "\u0f56\u0f45\u0fad");
+        uhelp("BCV'O", "\u0f56\u0f45\u0fad\u0f71\u0f7c");
+        uhelp("BCV'A", "\u0f56\u0f45\u0fad\u0f71");
+        uhelp("BCV'", "\u0f56\u0f95\u0fad\u0fb0");
+        uhelp("GYA", "\u0f42\u0fb1");
+        uhelp("GY", "\u0f42\u0fb1");
+        uhelp("G-YA", "\u0f42\u0f61");
+        uhelp("GA-YA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YA.]\u0f42\u0f61");
+        uhelp("GA-YO", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YO.]\u0f42\u0f61\u0F7c");
    }
    public void testFixedFormSubjoinedConsonants() {
        // Usual subjoined RA:
@ -7563,7 +7593,7 @@ M+NA
              + "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
-              + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR
+              + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f40\u0fb5\u0fbc\u0f0b" // KshR
              + "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
              + "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
              + "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE
--- a/source/org/thdl/tib/text/ttt/TPair.java
+++ b/source/org/thdl/tib/text/ttt/TPair.java
@ -184,7 +184,11 @@ class TPair {

    /** Returns the EWTS Wylie that corresponds to this pair if
     *  justLeft is false, or the EWTS Wylie that corresponds to just
-     *  {@link #getLeft()} if justLeft is true. */
+     *  {@link #getLeft()} if justLeft is true.
+     *
+     *  <p>Returns "W" for ACIP "W", "r" for ACIP "R", "y" for ACIP "Y",
+     *  even though sometimes the EWTS for those is "w", "R", or "Y".
+     *  Handle that in the caller. */
    String getWylie(boolean justLeft) {
        String leftWylie = null;
        if (getLeft() != null) {
--- a/source/org/thdl/tib/text/ttt/TPairList.java
+++ b/source/org/thdl/tib/text/ttt/TPairList.java
@ -660,25 +660,7 @@ class TPairList {
        TPair lastPair = get(size() - 1);
        wylieForConsonant.append(lastPair.getWylie(true));
        String hashKey = wylieForConsonant.toString();
-        // r-w and r+w are both known hash keys.  Sort 'em out.  They
-        // are the only things like this according to bug report
-        // #800166.
-        if ("r+w".equals(hashKey)) {
-            boolean sawWazur = false;
-            for (int x = 0; x < size(); x++) {
-                TPair p = get(x);
-                if ("V".equals(get(x).getLeft())) {
-                    sawWazur = true;
-                    break;
-                }
-            }
-            if (sawWazur)
-                hashKey = "r-w";
-            else
-                hashKey = "r+W"; // because EWTS has special handling
-                                 // for full-formed subjoined
-                                 // consonants
-        } else {
+
        // Because EWTS has special handling for full-formed
        // subjoined consonants, we have special handling here.
        if ("r+y".equals(hashKey))
@ -691,7 +673,24 @@ class TPairList {
            hashKey = "N+D+R+y";
        else if ("k+Sh+r".equals(hashKey))
            hashKey = "k+Sh+R";
-        }
+        
+        // TPair.getWylie(..) returns "W" sometimes when "w" is what
+        // really should be returned.  ("V" always causes "w" to be
+        // returned, which is fine.)  We'll change "W" to "w" here if
+        // we need to.  We do it only for a few known stacks (the ones
+        // in TMW).
+        if ("W".equals(hashKey))
+            hashKey = "w";
+        else if ("W+y".equals(hashKey))
+            hashKey = "w+y";
+        else if ("W+r".equals(hashKey))
+            hashKey = "w+r";
+        else if ("W+n".equals(hashKey))
+            hashKey = "w+n";
+        else if ("W+W".equals(hashKey))
+            hashKey = "w+W";
+        // We're NOT doing it for r+W etc., on purpose.
+
        if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
            hashKey = hashKey.replace('+', '-');
            if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {