Fixed ACIP->Unicode bug for YYE etc., things with full-formed subjoined consonants and vowels.
Fixed ACIP->TMW for YYA etc., things with full-formed subjoined consonants.
2003-12-14 07:36:21 +00:00 · 2003-12-14 07:36:21 +00:00 · 76c2e969ac
commit 76c2e969ac
parent f625c937ee
4 changed files with 91 additions and 29 deletions
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@ -7553,6 +7553,21 @@ M+NA
        uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb");
        uhelp("Y+Y", "\u0f61\u0fbb");
        uhelp("R+Y", "\u0f62\u0fbb");
        uhelp("RVA R+VEE RWA R+WEE YYA Y+YEE ndRYA n+d+R+YEE KshR K+sh+REE ndY n+d+YEE,",
              "\u0f62\u0fad\u0f0b" // RVA
              + "\u0f62\u0fad\u0f7b\u0f0b" //R+VEE
              + "\u0f62\u0fba\u0f0b" // RWA
              + "\u0f62\u0fba\u0f7b\u0f0b" // R+WEE
              + "\u0f61\u0fbb\u0f0b" // YYA
              + "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
              + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR
              + "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
              + "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
              + "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE
              ); // DLC FIXME: test the TMW for these, too, it was broken once.
    }
    /** Tests some more tsheg bars, these from Dr. Lacey's critical
--- a/source/org/thdl/tib/text/ttt/TPair.java
+++ b/source/org/thdl/tib/text/ttt/TPair.java
@ -211,16 +211,25 @@ class TPair {
     *  subscribed) pair to sb.  FIXME: which normalization form,
     *  if any? */
    void getUnicode(StringBuffer sb, boolean subscribed) {
        // Single-buffer convenience form: the same buffer is passed as
        // both the consonant target and the vowel target of the
        // three-argument overload, so everything is appended to sb.
        getUnicode(sb, sb, subscribed);
    }
    /** Appends legal Unicode corresponding to this (possible
     *  subscribed) pair to consonantSB (for the non-vowel part) and
     *  vowelSB (for the vowelish part ({'EEm:}, e.g.).  FIXME: which
     *  normalization form, if any? */
    void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
                    boolean subscribed) {
        if (null != getLeft()) {
            String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
            if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
-            sb.append(x);
+            consonantSB.append(x);
        }
        if (null != getRight()
            && !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
            String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
            if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
-            sb.append(x);
+            vowelSB.append(x);
        }
    }
--- a/source/org/thdl/tib/text/ttt/TPairList.java
+++ b/source/org/thdl/tib/text/ttt/TPairList.java
@ -23,6 +23,7 @@ import org.thdl.tib.text.DuffCode;
 import org.thdl.tib.text.TGCPair;
 import org.thdl.util.ThdlDebug;
 import java.util.HashMap;
 import java.util.ArrayList;
 /** A list of {@link TPair TPairs}, typically corresponding to
@ -592,14 +593,58 @@ class TPairList {
        }
    }
    private static HashMap unicodeExceptionsMap = null;
    /** Appends legal Unicode corresponding to this stack to sb.
     *  FIXME: which normalization form, if any? */
    void getUnicode(StringBuffer sb) {
        // The question is this: U+0FB1 or U+0FBB?  U+0FB2 or U+0FBC?
        // The answer: always the usual form, not the full form,
        // except for a few known stacks (all the ones with full-form,
        // non-WA subjoined consonants in TMW: [in EWTS, they are:]
        // r+Y, N+D+Y, N+D+R+y, k+Sh+R).  Note that wa-zur, U+0FAD, is
        // never confused for U+0FBA because "V" and "W" are different
        // transliterations.  EWTS {r+W} thus needs no special
        // treatment during ACIP->Unicode.
        StringBuffer nonVowelSB = new StringBuffer();
        int beginningIndex = sb.length();
        boolean subscribed = false;
-        for (int i = 0; i < size(); i++) {
+        int szz = size();
-            get(i).getUnicode(sb, subscribed);
+        int i;
        for (i = 0; i + ((1 == szz) ? 0 : 1) < szz; i++) {
            TPair p = get(i);
            // FIXME: change this to an assertion:
            if ((1 != szz) && null != p.getRight() && !"+".equals(p.getRight()))
                throw new Error("Oops -- this stack (i.e., " + toString() + ") is funny, so we can't generate proper Unicode for it.  i is " + i + " and size is " + szz);
            p.getUnicode(nonVowelSB, subscribed);
            subscribed = true;
        }
        if (szz > 1) {
            TPair p = get(i);
            StringBuffer vowelSB = new StringBuffer();
            p.getUnicode(nonVowelSB, vowelSB, subscribed /* which is true */);
            if (null == unicodeExceptionsMap) {
                unicodeExceptionsMap = new HashMap();
                unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR (variety 1)
                unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR (variety 2)
                unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
                unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
                unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
                unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
            }
            String mapEntry = (String)unicodeExceptionsMap.get(nonVowelSB.toString());
            if (null != mapEntry)
                sb.append(mapEntry);
            else
                sb.append(nonVowelSB);
            sb.append(vowelSB);
        } else {
            sb.append(nonVowelSB);
        }
    }
    /** Appends the DuffCodes that correspond to this grapheme cluster
@ -629,6 +674,23 @@ class TPairList {
            }
            if (sawWazur)
                hashKey = "r-w";
            else
                hashKey = "r+W"; // because EWTS has special handling
                                 // for full-formed subjoined
                                 // consonants
        } else {
            // Because EWTS has special handling for full-formed
            // subjoined consonants, we have special handling here.
            if ("r+y".equals(hashKey))
                hashKey = "r+Y";
            else if ("y+y".equals(hashKey))
                hashKey = "y+Y";
            else if ("N+D+y".equals(hashKey))
                hashKey = "N+D+Y";
            else if ("N+D+r+y".equals(hashKey))
                hashKey = "N+D+R+y";
            else if ("k+Sh+r".equals(hashKey))
                hashKey = "k+Sh+R";
        }
        if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
            hashKey = hashKey.replace('+', '-');
--- a/source/org/thdl/tib/text/ttt/TStackList.java
+++ b/source/org/thdl/tib/text/ttt/TStackList.java
@ -225,37 +225,13 @@ class TStackList {
        return false;
    }
    private static HashMap unicodeExceptionsMap = null;
    /** Returns legal Unicode corresponding to this tsheg bar.  FIXME: which normalization form, if any? */
    String getUnicode() {
        // The question is this: U+0FB1 or U+0FBB?  U+0FB2 or
        // U+0FBC?  The answer: always the usual form, not the
        // full form, except for a few known stacks (all the ones
        // with full form subjoined consonants in TMW).  Note that
        // wa-zur, U+0FAD, is never confused for U+0FBA because
        // "V" and "W" are different transliterations.
        StringBuffer u = new StringBuffer(size());
        for (int i = 0; i < size(); i++) {
            get(i).getUnicode(u);
        }
-
+        return u.toString();
        String us = u.toString();
        if (null == unicodeExceptionsMap) {
            unicodeExceptionsMap = new HashMap();
            unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR
            unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR
            unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
            unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
            unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
            unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
        }
        String mapEntry = (String)unicodeExceptionsMap.get(us);
        if (null != mapEntry)
            return mapEntry;
        else
            return us;
    }
    /** Returns the DuffCodes and errors corresponding to this stack