From 76c2e969ac9079014028bc1035efe931ed05ef1f Mon Sep 17 00:00:00 2001
From: dchandler <dchandler>
Date: Sun, 14 Dec 2003 07:36:21 +0000
Subject: [PATCH] Fixed ACIP->Unicode bug for YYE etc., things with full-formed
 subjoined consonants and vowels.

Fixed ACIP->TMW for YYA etc., things with full-formed subjoined
consonants.
---
 source/org/thdl/tib/text/ttt/PackageTest.java | 15 +++++
 source/org/thdl/tib/text/ttt/TPair.java       | 13 +++-
 source/org/thdl/tib/text/ttt/TPairList.java   | 66 ++++++++++++++++++-
 source/org/thdl/tib/text/ttt/TStackList.java  | 26 +-------
 4 files changed, 91 insertions(+), 29 deletions(-)
diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java
index 0c97c6d..a69b750 100644
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@@ -7553,6 +7553,21 @@ M+NA
         uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb");
         uhelp("Y+Y", "\u0f61\u0fbb");
         uhelp("R+Y", "\u0f62\u0fbb");
+
+        uhelp("RVA R+VEE RWA R+WEE YYA Y+YEE ndRYA n+d+R+YEE KshR K+sh+REE ndY n+d+YEE,",
+              "\u0f62\u0fad\u0f0b" // RVA
+              + "\u0f62\u0fad\u0f7b\u0f0b" //R+VEE
+              + "\u0f62\u0fba\u0f0b" // RWA
+              + "\u0f62\u0fba\u0f7b\u0f0b" // R+WEE
+              + "\u0f61\u0fbb\u0f0b" // YYA
+              + "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
+              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
+              + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
+              + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR
+              + "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
+              + "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
+              + "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE
+              ); // DLC FIXME: test the TMW for these, too, it was broken once.
     }
 
     /** Tests some more tsheg bars, these from Dr. Lacey's critical
diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java
index 4a50fa0..318f62d 100644
--- a/source/org/thdl/tib/text/ttt/TPair.java
+++ b/source/org/thdl/tib/text/ttt/TPair.java
@@ -211,16 +211,25 @@ class TPair {
      *  subscribed) pair to sb.  FIXME: which normalization form,
      *  if any? */
     void getUnicode(StringBuffer sb, boolean subscribed) {
+        getUnicode(sb, sb, subscribed);
+    }
+
+    /** Appends legal Unicode for this (possibly subscribed) pair: the
+     *  left part goes to consonantSB and the right (vowelish, e.g.
+     *  {'EEm:}) part, unless it is "-", "+", or "A", goes to vowelSB.
+     *  FIXME: which normalization form, if any? */
+    void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
+                    boolean subscribed) {
         if (null != getLeft()) {
             String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
             if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
-            sb.append(x);
+            consonantSB.append(x);
         }
         if (null != getRight()
             && !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
             String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
             if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
-            sb.append(x);
+            vowelSB.append(x);
         }
     }
 
diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java
index 991bef7..b9dcaee 100644
--- a/source/org/thdl/tib/text/ttt/TPairList.java
+++ b/source/org/thdl/tib/text/ttt/TPairList.java
@@ -23,6 +23,7 @@ import org.thdl.tib.text.DuffCode;
 import org.thdl.tib.text.TGCPair;
 import org.thdl.util.ThdlDebug;
 
+import java.util.HashMap;
 import java.util.ArrayList;
 
 /** A list of {@link TPair TPairs}, typically corresponding to
@@ -592,14 +593,58 @@ class TPairList {
         }
     }
 
+    private static HashMap unicodeExceptionsMap = null;
+
     /** Appends legal Unicode corresponding to this stack to sb.
      *  FIXME: which normalization form, if any? */
     void getUnicode(StringBuffer sb) {
+        // The question is this: U+0FB1 or U+0FBB?  U+0FB2 or U+0FBC?
+        // The answer: always the usual form, not the full form,
+        // except for a few known stacks (all the ones with full-form,
+        // non-WA subjoined consonants in TMW; in EWTS, they are
+        // r+Y, y+Y, N+D+Y, N+D+R+y, k+Sh+R).  Note that wa-zur, U+0FAD,
+        // is never confused for U+0FBA because "V" and "W" are
+        // different transliterations.  EWTS {r+W} thus needs no
+        // special treatment during ACIP->Unicode.
+
+        StringBuffer nonVowelSB = new StringBuffer();
+        int beginningIndex = sb.length();
         boolean subscribed = false;
-        for (int i = 0; i < size(); i++) {
-            get(i).getUnicode(sb, subscribed);
+        int szz = size();
+        int i;
+        for (i = 0; i + ((1 == szz) ? 0 : 1) < szz; i++) { // every pair but the last, unless there is only one
+            TPair p = get(i);
+
+            // FIXME: change this to an assertion:
+            if ((1 != szz) && null != p.getRight() && !"+".equals(p.getRight()))
+                throw new Error("Oops -- this stack (i.e., " + toString() + ") is funny, so we can't generate proper Unicode for it.  i is " + i + " and size is " + szz);
+
+            p.getUnicode(nonVowelSB, subscribed);
             subscribed = true;
         }
+        if (szz > 1) { // i now indexes the last pair; its right part is kept apart in vowelSB
+            TPair p = get(i);
+            StringBuffer vowelSB = new StringBuffer();
+            p.getUnicode(nonVowelSB, vowelSB, subscribed /* which is true */);
+
+            if (null == unicodeExceptionsMap) {
+                unicodeExceptionsMap = new HashMap();
+                unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR (variety 1)
+                unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR (variety 2)
+                unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
+                unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
+                unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
+                unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
+            }
+            String mapEntry = (String)unicodeExceptionsMap.get(nonVowelSB.toString()); // key excludes the last pair's right part
+            if (null != mapEntry)
+                sb.append(mapEntry);
+            else
+                sb.append(nonVowelSB);
+            sb.append(vowelSB);
+        } else {
+            sb.append(nonVowelSB);
+        }
     }
 
     /** Appends the DuffCodes that correspond to this grapheme cluster
@@ -629,6 +674,23 @@ class TPairList {
             }
             if (sawWazur)
                 hashKey = "r-w";
+            else
+                hashKey = "r+W"; // because EWTS has special handling
+                                 // for full-formed subjoined
+                                 // consonants
+        } else {
+            // Because EWTS has special handling for full-formed
+            // subjoined consonants, we have special handling here.
+            if ("r+y".equals(hashKey))
+                hashKey = "r+Y";
+            else if ("y+y".equals(hashKey))
+                hashKey = "y+Y";
+            else if ("N+D+y".equals(hashKey))
+                hashKey = "N+D+Y";
+            else if ("N+D+r+y".equals(hashKey))
+                hashKey = "N+D+R+y";
+            else if ("k+Sh+r".equals(hashKey))
+                hashKey = "k+Sh+R";
         }
         if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
             hashKey = hashKey.replace('+', '-');
diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java
index b27c407..412a15f 100644
--- a/source/org/thdl/tib/text/ttt/TStackList.java
+++ b/source/org/thdl/tib/text/ttt/TStackList.java
@@ -225,37 +225,13 @@ class TStackList {
         return false;
     }
 
-    private static HashMap unicodeExceptionsMap = null;
-
     /** Returns legal Unicode corresponding to this tsheg bar.  FIXME: which normalization form, if any? */
     String getUnicode() {
-        // The question is this: U+0FB1 or U+0FBB?  U+0FB2 or
-        // U+0FBC?  The answer: always the usual form, not the
-        // full form, except for a few known stacks (all the ones
-        // with full form subjoined consonants in TMW).  Note that
-        // wa-zur, U+0FAD, is never confused for U+0FBA because
-        // "V" and "W" are different transliterations.
-
         StringBuffer u = new StringBuffer(size());
         for (int i = 0; i < size(); i++) {
             get(i).getUnicode(u);
         }
-
-        String us = u.toString();
-        if (null == unicodeExceptionsMap) {
-            unicodeExceptionsMap = new HashMap();
-            unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR
-            unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR
-            unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
-            unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
-            unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
-            unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
-        }
-        String mapEntry = (String)unicodeExceptionsMap.get(us);
-        if (null != mapEntry)
-            return mapEntry;
-        else
-            return us;
+        return u.toString();
     }
 
     /** Returns the DuffCodes and errors corresponding to this stack