Fixed ACIP->Unicode bug for YYE etc., things with full-formed subjoined consonants and vowels.

Fixed ACIP->TMW for YYA etc., things with full-formed subjoined consonants.
2003-12-14 07:36:21 +00:00 · 2003-12-14 07:36:21 +00:00 · 76c2e969ac
commit 76c2e969ac
parent f625c937ee
4 changed files with 91 additions and 29 deletions
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@@ -7553,6 +7553,21 @@ M+NA
        uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb");
        uhelp("Y+Y", "\u0f61\u0fbb");
        uhelp("R+Y", "\u0f62\u0fbb");
+
+        uhelp("RVA R+VEE RWA R+WEE YYA Y+YEE ndRYA n+d+R+YEE KshR K+sh+REE ndY n+d+YEE,",
+              "\u0f62\u0fad\u0f0b" // RVA
+              + "\u0f62\u0fad\u0f7b\u0f0b" //R+VEE
+              + "\u0f62\u0fba\u0f0b" // RWA
+              + "\u0f62\u0fba\u0f7b\u0f0b" // R+WEE
+              + "\u0f61\u0fbb\u0f0b" // YYA
+              + "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
+              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
+              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
+              + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR
+              + "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
+              + "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
+              + "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE
+              ); // DLC FIXME: test the TMW for these, too, it was broken once.
    }

    /** Tests some more tsheg bars, these from Dr. Lacey's critical
--- a/source/org/thdl/tib/text/ttt/TPair.java
+++ b/source/org/thdl/tib/text/ttt/TPair.java
@@ -211,16 +211,25 @@ class TPair {
     *  subscribed) pair to sb.  FIXME: which normalization form,
     *  if any? */
    void getUnicode(StringBuffer sb, boolean subscribed) {
+        getUnicode(sb, sb, subscribed);
+    }
+
+    /** Appends legal Unicode corresponding to this (possible
+     *  subscribed) pair to consonantSB (for the non-vowel part) and
+     *  vowelSB (for the vowelish part ({'EEm:}, e.g.).  FIXME: which
+     *  normalization form, if any? */
+    void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
+                    boolean subscribed) {
        if (null != getLeft()) {
            String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
            if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
-            sb.append(x);
+            consonantSB.append(x);
        }
        if (null != getRight()
            && !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
            String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
            if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
-            sb.append(x);
+            vowelSB.append(x);
        }
    }

--- a/source/org/thdl/tib/text/ttt/TPairList.java
+++ b/source/org/thdl/tib/text/ttt/TPairList.java
@@ -23,6 +23,7 @@ import org.thdl.tib.text.DuffCode;
 import org.thdl.tib.text.TGCPair;
 import org.thdl.util.ThdlDebug;

+import java.util.HashMap;
 import java.util.ArrayList;

 /** A list of {@link TPair TPairs}, typically corresponding to
@@ -592,14 +593,58 @@ class TPairList {
        }
    }

+    private static HashMap unicodeExceptionsMap = null;
+
    /** Appends legal Unicode corresponding to this stack to sb.
     *  FIXME: which normalization form, if any? */
    void getUnicode(StringBuffer sb) {
+        // The question is this: U+0FB1 or U+0FBB?  U+0FB2 or U+0FBC?
+        // The answer: always the usual form, not the full form,
+        // except for a few known stacks (all the ones with full-form,
+        // non-WA subjoined consonants in TMW: [in EWTS, they are:]
+        // r+Y, N+D+Y, N+D+R+y, k+Sh+R).  Note that wa-zur, U+0FAD, is
+        // never confused for U+0FBA because "V" and "W" are different
+        // transliterations.  EWTS {r+W} thus needs no special
+        // treatment during ACIP->Unicode.
+
+        StringBuffer nonVowelSB = new StringBuffer();
+        int beginningIndex = sb.length();
        boolean subscribed = false;
-        for (int i = 0; i < size(); i++) {
-            get(i).getUnicode(sb, subscribed);
+        int szz = size();
+        int i;
+        for (i = 0; i + ((1 == szz) ? 0 : 1) < szz; i++) {
+            TPair p = get(i);
+
+            // FIXME: change this to an assertion:
+            if ((1 != szz) && null != p.getRight() && !"+".equals(p.getRight()))
+                throw new Error("Oops -- this stack (i.e., " + toString() + ") is funny, so we can't generate proper Unicode for it.  i is " + i + " and size is " + szz);
+
+            p.getUnicode(nonVowelSB, subscribed);
            subscribed = true;
        }
+        if (szz > 1) {
+            TPair p = get(i);
+            StringBuffer vowelSB = new StringBuffer();
+            p.getUnicode(nonVowelSB, vowelSB, subscribed /* which is true */);
+
+            if (null == unicodeExceptionsMap) {
+                unicodeExceptionsMap = new HashMap();
+                unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR (variety 1)
+                unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR (variety 2)
+                unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
+                unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
+                unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
+                unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
+            }
+            String mapEntry = (String)unicodeExceptionsMap.get(nonVowelSB.toString());
+            if (null != mapEntry)
+                sb.append(mapEntry);
+            else
+                sb.append(nonVowelSB);
+            sb.append(vowelSB);
+        } else {
+            sb.append(nonVowelSB);
+        }
    }

    /** Appends the DuffCodes that correspond to this grapheme cluster
@@ -629,6 +674,23 @@ class TPairList {
            }
            if (sawWazur)
                hashKey = "r-w";
+            else
+                hashKey = "r+W"; // because EWTS has special handling
+                                 // for full-formed subjoined
+                                 // consonants
+        } else {
+            // Because EWTS has special handling for full-formed
+            // subjoined consonants, we have special handling here.
+            if ("r+y".equals(hashKey))
+                hashKey = "r+Y";
+            else if ("y+y".equals(hashKey))
+                hashKey = "y+Y";
+            else if ("N+D+y".equals(hashKey))
+                hashKey = "N+D+Y";
+            else if ("N+D+r+y".equals(hashKey))
+                hashKey = "N+D+R+y";
+            else if ("k+Sh+r".equals(hashKey))
+                hashKey = "k+Sh+R";
        }
        if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
            hashKey = hashKey.replace('+', '-');
--- a/source/org/thdl/tib/text/ttt/TStackList.java
+++ b/source/org/thdl/tib/text/ttt/TStackList.java
@@ -225,37 +225,13 @@ class TStackList {
        return false;
    }

-    private static HashMap unicodeExceptionsMap = null;
-
    /** Returns legal Unicode corresponding to this tsheg bar.  FIXME: which normalization form, if any? */
    String getUnicode() {
-        // The question is this: U+0FB1 or U+0FBB?  U+0FB2 or
-        // U+0FBC?  The answer: always the usual form, not the
-        // full form, except for a few known stacks (all the ones
-        // with full form subjoined consonants in TMW).  Note that
-        // wa-zur, U+0FAD, is never confused for U+0FBA because
-        // "V" and "W" are different transliterations.
-
        StringBuffer u = new StringBuffer(size());
        for (int i = 0; i < size(); i++) {
            get(i).getUnicode(u);
        }
-
-        String us = u.toString();
-        if (null == unicodeExceptionsMap) {
-            unicodeExceptionsMap = new HashMap();
-            unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR
-            unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR
-            unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
-            unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
-            unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
-            unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
-        }
-        String mapEntry = (String)unicodeExceptionsMap.get(us);
-        if (null != mapEntry)
-            return mapEntry;
-        else
-            return us;
+        return u.toString();
    }

    /** Returns the DuffCodes and errors corresponding to this stack