diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index 0c97c6d..a69b750 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -7553,6 +7553,21 @@ M+NA uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb"); uhelp("Y+Y", "\u0f61\u0fbb"); uhelp("R+Y", "\u0f62\u0fbb"); + + uhelp("RVA R+VEE RWA R+WEE YYA Y+YEE ndRYA n+d+R+YEE KshR K+sh+REE ndY n+d+YEE,", + "\u0f62\u0fad\u0f0b" // RVA + + "\u0f62\u0fad\u0f7b\u0f0b" //R+VEE + + "\u0f62\u0fba\u0f0b" // RWA + + "\u0f62\u0fba\u0f7b\u0f0b" // R+WEE + + "\u0f61\u0fbb\u0f0b" // YYA + + "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE + + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA + + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE + + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR + + "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE + + "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY + + "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE + ); // DLC FIXME: test the TMW for these, too, it was broken once. } /** Tests some more tsheg bars, these from Dr. Lacey's critical diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java index 4a50fa0..318f62d 100644 --- a/source/org/thdl/tib/text/ttt/TPair.java +++ b/source/org/thdl/tib/text/ttt/TPair.java @@ -211,16 +211,25 @@ class TPair { * subscribed) pair to sb. FIXME: which normalization form, * if any? */ void getUnicode(StringBuffer sb, boolean subscribed) { + getUnicode(sb, sb, subscribed); + } + + /** Appends legal Unicode corresponding to this (possible + * subscribed) pair to consonantSB (for the non-vowel part) and + * vowelSB (for the vowelish part ({'EEm:}, e.g.). FIXME: which + * normalization form, if any? */ + void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB, + boolean subscribed) { if (null != getLeft()) { String x = ACIPRules.getUnicodeFor(getLeft(), subscribed); if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni"); - sb.append(x); + consonantSB.append(x); } if (null != getRight() && !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) { String x = ACIPRules.getUnicodeFor(getRight(), subscribed); if (null == x) throw new Error("TPair: " + getRight() + " has no Uni"); - sb.append(x); + vowelSB.append(x); } } diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 991bef7..b9dcaee 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -23,6 +23,7 @@ import org.thdl.tib.text.DuffCode; import org.thdl.tib.text.TGCPair; import org.thdl.util.ThdlDebug; +import java.util.HashMap; import java.util.ArrayList; /** A list of {@link TPair TPairs}, typically corresponding to @@ -592,14 +593,58 @@ class TPairList { } } + private static HashMap unicodeExceptionsMap = null; + /** Appends legal Unicode corresponding to this stack to sb. * FIXME: which normalization form, if any? */ void getUnicode(StringBuffer sb) { + // The question is this: U+0FB1 or U+0FBB? U+0FB2 or U+0FBC? + // The answer: always the usual form, not the full form, + // except for a few known stacks (all the ones with full-form, + // non-WA subjoined consonants in TMW: [in EWTS, they are:] + // r+Y, N+D+Y, N+D+R+y, k+Sh+R). Note that wa-zur, U+0FAD, is + // never confused for U+0FBA because "V" and "W" are different + // transliterations. EWTS {r+W} thus needs no special + // treatment during ACIP->Unicode. + + StringBuffer nonVowelSB = new StringBuffer(); + int beginningIndex = sb.length(); boolean subscribed = false; - for (int i = 0; i < size(); i++) { - get(i).getUnicode(sb, subscribed); + int szz = size(); + int i; + for (i = 0; i + ((1 == szz) ? 0 : 1) < szz; i++) { + TPair p = get(i); + + // FIXME: change this to an assertion: + if ((1 != szz) && null != p.getRight() && !"+".equals(p.getRight())) + throw new Error("Oops -- this stack (i.e., " + toString() + ") is funny, so we can't generate proper Unicode for it. i is " + i + " and size is " + szz); + + p.getUnicode(nonVowelSB, subscribed); subscribed = true; } + if (szz > 1) { + TPair p = get(i); + StringBuffer vowelSB = new StringBuffer(); + p.getUnicode(nonVowelSB, vowelSB, subscribed /* which is true */); + + if (null == unicodeExceptionsMap) { + unicodeExceptionsMap = new HashMap(); + unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR (variety 1) + unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR (variety 2) + unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY + unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY + unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY + unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY + } + String mapEntry = (String)unicodeExceptionsMap.get(nonVowelSB.toString()); + if (null != mapEntry) + sb.append(mapEntry); + else + sb.append(nonVowelSB); + sb.append(vowelSB); + } else { + sb.append(nonVowelSB); + } } /** Appends the DuffCodes that correspond to this grapheme cluster @@ -629,6 +674,23 @@ class TPairList { } if (sawWazur) hashKey = "r-w"; + else + hashKey = "r+W"; // because EWTS has special handling + // for full-formed subjoined + // consonants + } else { + // Because EWTS has special handling for full-formed + // subjoined consonants, we have special handling here. + if ("r+y".equals(hashKey)) + hashKey = "r+Y"; + else if ("y+y".equals(hashKey)) + hashKey = "y+Y"; + else if ("N+D+y".equals(hashKey)) + hashKey = "N+D+Y"; + else if ("N+D+r+y".equals(hashKey)) + hashKey = "N+D+R+y"; + else if ("k+Sh+r".equals(hashKey)) + hashKey = "k+Sh+R"; } if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { hashKey = hashKey.replace('+', '-'); diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index b27c407..412a15f 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -225,37 +225,13 @@ class TStackList { return false; } - private static HashMap unicodeExceptionsMap = null; - /** Returns legal Unicode corresponding to this tsheg bar. FIXME: which normalization form, if any? */ String getUnicode() { - // The question is this: U+0FB1 or U+0FBB? U+0FB2 or - // U+0FBC? The answer: always the usual form, not the - // full form, except for a few known stacks (all the ones - // with full form subjoined consonants in TMW). Note that - // wa-zur, U+0FAD, is never confused for U+0FBA because - // "V" and "W" are different transliterations. - StringBuffer u = new StringBuffer(size()); for (int i = 0; i < size(); i++) { get(i).getUnicode(u); } - - String us = u.toString(); - if (null == unicodeExceptionsMap) { - unicodeExceptionsMap = new HashMap(); - unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR - unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR - unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY - unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY - unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY - unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY - } - String mapEntry = (String)unicodeExceptionsMap.get(us); - if (null != mapEntry) - return mapEntry; - else - return us; + return u.toString(); } /** Returns the DuffCodes and errors corresponding to this stack