Fixed an ACIP->Unicode bug for YYE etc., i.e., stacks that have both full-formed
subjoined consonants and vowels. Fixed ACIP->TMW for YYA etc., i.e., stacks that have full-formed subjoined consonants.
This commit is contained in:
parent
f625c937ee
commit
76c2e969ac
4 changed files with 91 additions and 29 deletions
|
@ -7553,6 +7553,21 @@ M+NA
|
||||||
uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb");
|
uhelp("n+d+Y", "\u0f4e\u0f9c\u0fbb");
|
||||||
uhelp("Y+Y", "\u0f61\u0fbb");
|
uhelp("Y+Y", "\u0f61\u0fbb");
|
||||||
uhelp("R+Y", "\u0f62\u0fbb");
|
uhelp("R+Y", "\u0f62\u0fbb");
|
||||||
|
|
||||||
|
uhelp("RVA R+VEE RWA R+WEE YYA Y+YEE ndRYA n+d+R+YEE KshR K+sh+REE ndY n+d+YEE,",
|
||||||
|
"\u0f62\u0fad\u0f0b" // RVA
|
||||||
|
+ "\u0f62\u0fad\u0f7b\u0f0b" //R+VEE
|
||||||
|
+ "\u0f62\u0fba\u0f0b" // RWA
|
||||||
|
+ "\u0f62\u0fba\u0f7b\u0f0b" // R+WEE
|
||||||
|
+ "\u0f61\u0fbb\u0f0b" // YYA
|
||||||
|
+ "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
|
||||||
|
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
|
||||||
|
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
|
||||||
|
+ "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR
|
||||||
|
+ "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
|
||||||
|
+ "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
|
||||||
|
+ "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE
|
||||||
|
); // DLC FIXME: test the TMW for these, too, it was broken once.
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Tests some more tsheg bars, these from Dr. Lacey's critical
|
/** Tests some more tsheg bars, these from Dr. Lacey's critical
|
||||||
|
|
|
@ -211,16 +211,25 @@ class TPair {
|
||||||
* subscribed) pair to sb. FIXME: which normalization form,
|
* subscribed) pair to sb. FIXME: which normalization form,
|
||||||
* if any? */
|
* if any? */
|
||||||
void getUnicode(StringBuffer sb, boolean subscribed) {
|
void getUnicode(StringBuffer sb, boolean subscribed) {
|
||||||
|
getUnicode(sb, sb, subscribed);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Appends legal Unicode corresponding to this (possible
|
||||||
|
* subscribed) pair to consonantSB (for the non-vowel part) and
|
||||||
|
* vowelSB (for the vowelish part ({'EEm:}, e.g.). FIXME: which
|
||||||
|
* normalization form, if any? */
|
||||||
|
void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
|
||||||
|
boolean subscribed) {
|
||||||
if (null != getLeft()) {
|
if (null != getLeft()) {
|
||||||
String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
|
String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
|
||||||
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
|
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
|
||||||
sb.append(x);
|
consonantSB.append(x);
|
||||||
}
|
}
|
||||||
if (null != getRight()
|
if (null != getRight()
|
||||||
&& !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
|
&& !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
|
||||||
String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
|
String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
|
||||||
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
|
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
|
||||||
sb.append(x);
|
vowelSB.append(x);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ import org.thdl.tib.text.DuffCode;
|
||||||
import org.thdl.tib.text.TGCPair;
|
import org.thdl.tib.text.TGCPair;
|
||||||
import org.thdl.util.ThdlDebug;
|
import org.thdl.util.ThdlDebug;
|
||||||
|
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
|
||||||
/** A list of {@link TPair TPairs}, typically corresponding to
|
/** A list of {@link TPair TPairs}, typically corresponding to
|
||||||
|
@ -592,14 +593,58 @@ class TPairList {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static HashMap unicodeExceptionsMap = null;
|
||||||
|
|
||||||
/** Appends legal Unicode corresponding to this stack to sb.
|
/** Appends legal Unicode corresponding to this stack to sb.
|
||||||
* FIXME: which normalization form, if any? */
|
* FIXME: which normalization form, if any? */
|
||||||
void getUnicode(StringBuffer sb) {
|
void getUnicode(StringBuffer sb) {
|
||||||
|
// The question is this: U+0FB1 or U+0FBB? U+0FB2 or U+0FBC?
|
||||||
|
// The answer: always the usual form, not the full form,
|
||||||
|
// except for a few known stacks (all the ones with full-form,
|
||||||
|
// non-WA subjoined consonants in TMW: [in EWTS, they are:]
|
||||||
|
// r+Y, N+D+Y, N+D+R+y, k+Sh+R). Note that wa-zur, U+0FAD, is
|
||||||
|
// never confused for U+0FBA because "V" and "W" are different
|
||||||
|
// transliterations. EWTS {r+W} thus needs no special
|
||||||
|
// treatment during ACIP->Unicode.
|
||||||
|
|
||||||
|
StringBuffer nonVowelSB = new StringBuffer();
|
||||||
|
int beginningIndex = sb.length();
|
||||||
boolean subscribed = false;
|
boolean subscribed = false;
|
||||||
for (int i = 0; i < size(); i++) {
|
int szz = size();
|
||||||
get(i).getUnicode(sb, subscribed);
|
int i;
|
||||||
|
for (i = 0; i + ((1 == szz) ? 0 : 1) < szz; i++) {
|
||||||
|
TPair p = get(i);
|
||||||
|
|
||||||
|
// FIXME: change this to an assertion:
|
||||||
|
if ((1 != szz) && null != p.getRight() && !"+".equals(p.getRight()))
|
||||||
|
throw new Error("Oops -- this stack (i.e., " + toString() + ") is funny, so we can't generate proper Unicode for it. i is " + i + " and size is " + szz);
|
||||||
|
|
||||||
|
p.getUnicode(nonVowelSB, subscribed);
|
||||||
subscribed = true;
|
subscribed = true;
|
||||||
}
|
}
|
||||||
|
if (szz > 1) {
|
||||||
|
TPair p = get(i);
|
||||||
|
StringBuffer vowelSB = new StringBuffer();
|
||||||
|
p.getUnicode(nonVowelSB, vowelSB, subscribed /* which is true */);
|
||||||
|
|
||||||
|
if (null == unicodeExceptionsMap) {
|
||||||
|
unicodeExceptionsMap = new HashMap();
|
||||||
|
unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR (variety 1)
|
||||||
|
unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR (variety 2)
|
||||||
|
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
|
||||||
|
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
|
||||||
|
unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
|
||||||
|
unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
|
||||||
|
}
|
||||||
|
String mapEntry = (String)unicodeExceptionsMap.get(nonVowelSB.toString());
|
||||||
|
if (null != mapEntry)
|
||||||
|
sb.append(mapEntry);
|
||||||
|
else
|
||||||
|
sb.append(nonVowelSB);
|
||||||
|
sb.append(vowelSB);
|
||||||
|
} else {
|
||||||
|
sb.append(nonVowelSB);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Appends the DuffCodes that correspond to this grapheme cluster
|
/** Appends the DuffCodes that correspond to this grapheme cluster
|
||||||
|
@ -629,6 +674,23 @@ class TPairList {
|
||||||
}
|
}
|
||||||
if (sawWazur)
|
if (sawWazur)
|
||||||
hashKey = "r-w";
|
hashKey = "r-w";
|
||||||
|
else
|
||||||
|
hashKey = "r+W"; // because EWTS has special handling
|
||||||
|
// for full-formed subjoined
|
||||||
|
// consonants
|
||||||
|
} else {
|
||||||
|
// Because EWTS has special handling for full-formed
|
||||||
|
// subjoined consonants, we have special handling here.
|
||||||
|
if ("r+y".equals(hashKey))
|
||||||
|
hashKey = "r+Y";
|
||||||
|
else if ("y+y".equals(hashKey))
|
||||||
|
hashKey = "y+Y";
|
||||||
|
else if ("N+D+y".equals(hashKey))
|
||||||
|
hashKey = "N+D+Y";
|
||||||
|
else if ("N+D+r+y".equals(hashKey))
|
||||||
|
hashKey = "N+D+R+y";
|
||||||
|
else if ("k+Sh+r".equals(hashKey))
|
||||||
|
hashKey = "k+Sh+R";
|
||||||
}
|
}
|
||||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
||||||
hashKey = hashKey.replace('+', '-');
|
hashKey = hashKey.replace('+', '-');
|
||||||
|
|
|
@ -225,37 +225,13 @@ class TStackList {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
private static HashMap unicodeExceptionsMap = null;
|
|
||||||
|
|
||||||
/** Returns legal Unicode corresponding to this tsheg bar. FIXME: which normalization form, if any? */
|
/** Returns legal Unicode corresponding to this tsheg bar. FIXME: which normalization form, if any? */
|
||||||
String getUnicode() {
|
String getUnicode() {
|
||||||
// The question is this: U+0FB1 or U+0FBB? U+0FB2 or
|
|
||||||
// U+0FBC? The answer: always the usual form, not the
|
|
||||||
// full form, except for a few known stacks (all the ones
|
|
||||||
// with full form subjoined consonants in TMW). Note that
|
|
||||||
// wa-zur, U+0FAD, is never confused for U+0FBA because
|
|
||||||
// "V" and "W" are different transliterations.
|
|
||||||
|
|
||||||
StringBuffer u = new StringBuffer(size());
|
StringBuffer u = new StringBuffer(size());
|
||||||
for (int i = 0; i < size(); i++) {
|
for (int i = 0; i < size(); i++) {
|
||||||
get(i).getUnicode(u);
|
get(i).getUnicode(u);
|
||||||
}
|
}
|
||||||
|
return u.toString();
|
||||||
String us = u.toString();
|
|
||||||
if (null == unicodeExceptionsMap) {
|
|
||||||
unicodeExceptionsMap = new HashMap();
|
|
||||||
unicodeExceptionsMap.put("\u0f69\u0fb2", "\u0f69\u0fbc"); // KshR
|
|
||||||
unicodeExceptionsMap.put("\u0f40\u0fb5\u0fb2", "\u0f40\u0fb5\u0fbc"); // KshR
|
|
||||||
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb2\u0fb1", "\u0f4e\u0f9c\u0fbc\u0fb1"); // ndRY
|
|
||||||
unicodeExceptionsMap.put("\u0f4e\u0f9c\u0fb1", "\u0f4e\u0f9c\u0fbb"); // ndY
|
|
||||||
unicodeExceptionsMap.put("\u0f61\u0fb1", "\u0f61\u0fbb"); // YY
|
|
||||||
unicodeExceptionsMap.put("\u0f62\u0fb1", "\u0f62\u0fbb"); // RY
|
|
||||||
}
|
|
||||||
String mapEntry = (String)unicodeExceptionsMap.get(us);
|
|
||||||
if (null != mapEntry)
|
|
||||||
return mapEntry;
|
|
||||||
else
|
|
||||||
return us;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the DuffCodes and errors corresponding to this stack
|
/** Returns the DuffCodes and errors corresponding to this stack
|
||||||
|
|
Loading…
Reference in a new issue