Fixed ACIP->Unicode bug for YYE etc., i.e., things with full-formed
subjoined consonants and vowels.

Fixed ACIP->TMW conversion for YYA etc., i.e., things with full-formed
subjoined consonants.
This commit is contained in:
dchandler 2003-12-14 07:36:21 +00:00
parent f625c937ee
commit 76c2e969ac
4 changed files with 91 additions and 29 deletions

View File

@ -7553,6 +7553,21 @@ M+NA
uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb");
uhelp("Y+Y", "\u0f61\u0fbb");
uhelp("R+Y", "\u0f62\u0fbb");
uhelp("RVA R+VEE RWA R+WEE YYA Y+YEE ndRYA n+d+R+YEE KshR K+sh+REE ndY n+d+YEE,",
"\u0f62\u0fad\u0f0b" // RVA
+ "\u0f62\u0fad\u0f7b\u0f0b" //R+VEE
+ "\u0f62\u0fba\u0f0b" // RWA
+ "\u0f62\u0fba\u0f7b\u0f0b" // R+WEE
+ "\u0f61\u0fbb\u0f0b" // YYA
+ "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
+ "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR
+ "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
+ "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
+ "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE
); // DLC FIXME: test the TMW for these, too, it was broken once.
}
/** Tests some more tsheg bars, these from Dr. Lacey's critical

View File

@ -211,16 +211,25 @@ class TPair {
* subscribed) pair to sb. FIXME: which normalization form,
* if any? */
void getUnicode(StringBuffer sb, boolean subscribed) {
// Delegate to the two-buffer overload, passing sb as both the
// consonant buffer and the vowel buffer, so that everything this
// pair produces ends up appended to sb.
getUnicode(sb, sb, subscribed);
}
/** Appends legal Unicode corresponding to this (possibly
* subscribed) pair to two buffers: the Unicode for the left side,
* if there is one, is appended to consonantSB (the non-vowel
* part), and the Unicode for the right side is appended to vowelSB
* (the vowelish part, {'EEm:}, e.g.) unless the right side is
* absent or is one of "-", "+", or "A". FIXME: which
* normalization form, if any?
* @param consonantSB buffer that receives the left side's Unicode
* @param vowelSB buffer that receives the right side's Unicode;
* the two-argument overload passes the same buffer for both
* @param subscribed passed through to ACIPRules.getUnicodeFor for
* both sides
* @throws Error if ACIPRules.getUnicodeFor returns null for a side
* that is to be emitted */
void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
boolean subscribed) {
if (null != getLeft()) {
String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
// NOTE(review): sb is neither a parameter nor a local of this
// overload. This line looks like the pre-change statement that
// the diff replaced with the consonantSB line below -- confirm.
sb.append(x);
consonantSB.append(x);
}
// A right side of "-", "+", or "A" produces no output of its own.
if (null != getRight()
&& !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
// NOTE(review): same as above -- sb is not in scope here; this
// looks like the pre-change statement replaced by the vowelSB
// line below -- confirm.
sb.append(x);
vowelSB.append(x);
}
}

View File

@ -23,6 +23,7 @@ import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.TGCPair;
import org.thdl.util.ThdlDebug;
import java.util.HashMap;
import java.util.ArrayList;
/** A list of {@link TPair TPairs}, typically corresponding to
@ -592,14 +593,58 @@ class TPairList {
}
}
private static HashMap unicodeExceptionsMap = null;
/** Appends legal Unicode corresponding to this stack to sb. For
* a stack of more than one pair, the non-vowel part is built
* separately from the final pair's vowel and looked up in a small
* table of known stacks; when it is found there, the table's
* replacement (which uses a full-form subjoined consonant) is
* appended instead. The vowel, if any, is appended afterward.
* FIXME: which normalization form, if any?
* @param sb buffer that receives the Unicode for this stack
* @throws Error if this stack has more than one pair and a pair
* other than the last has a right side that is neither null nor
* "+" */
void getUnicode(StringBuffer sb) {
// The question is this: U+0FB1 or U+0FBB? U+0FB2 or U+0FBC?
// The answer: always the usual form, not the full form,
// except for a few known stacks (all the ones with full-form,
// non-WA subjoined consonants in TMW: [in EWTS, they are:]
// r+Y, N+D+Y, N+D+R+y, k+Sh+R). Note that wa-zur, U+0FAD, is
// never confused for U+0FBA because "V" and "W" are different
// transliterations. EWTS {r+W} thus needs no special
// treatment during ACIP->Unicode.
StringBuffer nonVowelSB = new StringBuffer();
// NOTE(review): beginningIndex is never read in the visible body.
int beginningIndex = sb.length();
boolean subscribed = false;
// NOTE(review): the next two lines look like the pre-change loop
// header and body that the diff replaced with the szz-based loop
// below; as written they leave the braces unbalanced -- confirm.
for (int i = 0; i < size(); i++) {
get(i).getUnicode(sb, subscribed);
int szz = size();
int i;
// With exactly one pair, that pair is handled here (its vowel,
// if any, goes into nonVowelSB too). With more than one pair,
// every pair but the last is handled here; the last is handled
// below so that its vowel can be kept out of the lookup key.
for (i = 0; i + ((1 == szz) ? 0 : 1) < szz; i++) {
TPair p = get(i);
// FIXME: change this to an assertion:
if ((1 != szz) && null != p.getRight() && !"+".equals(p.getRight()))
throw new Error("Oops -- this stack (i.e., " + toString() + ") is funny, so we can't generate proper Unicode for it. i is " + i + " and size is " + szz);
p.getUnicode(nonVowelSB, subscribed);
// Every pair after the first is requested in subscribed form.
subscribed = true;
}
if (szz > 1) {
// Last pair of a multi-pair stack: its left side completes
// nonVowelSB, and its right side is collected in vowelSB.
TPair p = get(i);
StringBuffer vowelSB = new StringBuffer();
p.getUnicode(nonVowelSB, vowelSB, subscribed /* which is true */);
// The table is built on first use. Each key is the usual-form
// sequence; each value is the same stack with U+0FB1 replaced
// by U+0FBB or U+0FB2 replaced by U+0FBC.
if (null == unicodeExceptionsMap) {
unicodeExceptionsMap = new HashMap();
unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR (variety 1)
unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR (variety 2)
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
}
// Use the table's replacement when the vowel-free stack is a
// known exception; otherwise use the stack as generated.
String mapEntry = (String)unicodeExceptionsMap.get(nonVowelSB.toString());
if (null != mapEntry)
sb.append(mapEntry);
else
sb.append(nonVowelSB);
sb.append(vowelSB);
} else {
// Zero or one pair: no table lookup is performed.
sb.append(nonVowelSB);
}
}
/** Appends the DuffCodes that correspond to this grapheme cluster
@ -629,6 +674,23 @@ class TPairList {
}
if (sawWazur)
hashKey = "r-w";
else
hashKey = "r+W"; // because EWTS has special handling
// for full-formed subjoined
// consonants
} else {
// Because EWTS has special handling for full-formed
// subjoined consonants, we have special handling here.
if ("r+y".equals(hashKey))
hashKey = "r+Y";
else if ("y+y".equals(hashKey))
hashKey = "y+Y";
else if ("N+D+y".equals(hashKey))
hashKey = "N+D+Y";
else if ("N+D+r+y".equals(hashKey))
hashKey = "N+D+R+y";
else if ("k+Sh+r".equals(hashKey))
hashKey = "k+Sh+R";
}
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
hashKey = hashKey.replace('+', '-');

View File

@ -225,37 +225,13 @@ class TStackList {
return false;
}
private static HashMap unicodeExceptionsMap = null;
/** Returns legal Unicode corresponding to this tsheg bar, built by
* asking each element in turn to append its Unicode to a shared
* buffer. FIXME: which normalization form, if any?
* @return the accumulated Unicode string */
String getUnicode() {
// The question is this: U+0FB1 or U+0FBB? U+0FB2 or
// U+0FBC? The answer: always the usual form, not the
// full form, except for a few known stacks (all the ones
// with full form subjoined consonants in TMW). Note that
// wa-zur, U+0FAD, is never confused for U+0FBA because
// "V" and "W" are different transliterations.
StringBuffer u = new StringBuffer(size());
for (int i = 0; i < size(); i++) {
get(i).getUnicode(u);
}
String us = u.toString();
// The table is built on first use. Each key is the usual-form
// sequence and each value is its full-form replacement.
if (null == unicodeExceptionsMap) {
unicodeExceptionsMap = new HashMap();
unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR
unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
}
// The lookup key is the Unicode for the whole list, so only a
// list whose entire output equals one of the keys is replaced.
String mapEntry = (String)unicodeExceptionsMap.get(us);
if (null != mapEntry)
return mapEntry;
else
return us;
// NOTE(review): both branches above return, so this statement
// cannot be reached as written. It looks like the post-change
// return from the diff, with the exception-table code above being
// the removed lines -- confirm.
return u.toString();
}
/** Returns the DuffCodes and errors corresponding to this stack