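ACIP->Unicode now emits two characters (a base consonant plus a subjoined letter) instead of one precomposed character for the consonants DH, BH, dH, DZH, Ksh, and GH, unless the option thdl.acip.to.unicode.conversions.use.0F52.et.cetera is set. This is in keeping with the existing dislike for composite characters like U+0F77.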
ACIP->Tibetan was not giving an error for BCWA because BCWA was being parsed as if it were BCVA. Fixed.
This commit is contained in:
parent
e9f7b2dfed
commit
e7a9e7968f
4 changed files with 102 additions and 54 deletions
|
@ -24,6 +24,7 @@ import java.util.HashMap;
|
|||
import java.util.StringTokenizer;
|
||||
import java.util.List;
|
||||
|
||||
import org.thdl.util.ThdlOptions;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
import org.thdl.tib.text.THDLWylieConstants;
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
|
@ -206,7 +207,11 @@ public class ACIPRules {
|
|||
private static HashMap acipConsonant2wylie = null;
|
||||
/** Returns the EWTS corresponding to the given ACIP consonant
|
||||
* (without the "A" vowel). Returns null if there is no such
|
||||
* EWTS. */
|
||||
* EWTS.
|
||||
*
|
||||
* <p>Returns "W" for ACIP "W", "r" for ACIP "R", "y" for ACIP "Y",
|
||||
* even though sometimes the EWTS for those is "w", "R", or "Y".
|
||||
* Handle that in the caller. */
|
||||
static final String getWylieForACIPConsonant(String acip) {
|
||||
if (acipConsonant2wylie == null) {
|
||||
acipConsonant2wylie = new HashMap(37);
|
||||
|
@ -242,7 +247,15 @@ public class ACIPRules {
|
|||
putMapping(acipConsonant2wylie, "TZ", "ts");
|
||||
putMapping(acipConsonant2wylie, "TS", "tsh");
|
||||
putMapping(acipConsonant2wylie, "DZ", "dz");
|
||||
putMapping(acipConsonant2wylie, "W", "w");
|
||||
putMapping(acipConsonant2wylie, "W", "W"
|
||||
/* NOTE WELL: sometimes "w", sometimes "W".
|
||||
Handle this in the caller.
|
||||
|
||||
Reasoning for "W" instead of "w": r-w and
|
||||
r+w are both known hash keys. We sort 'em
|
||||
out this way. (They are the only things
|
||||
like this according to bug report #800166.) */
|
||||
);
|
||||
putMapping(acipConsonant2wylie, "ZH", "zh");
|
||||
putMapping(acipConsonant2wylie, "Z", "z");
|
||||
putMapping(acipConsonant2wylie, "'", "'");
|
||||
|
@ -329,24 +342,26 @@ public class ACIPRules {
|
|||
* true. Returns null if acip is unknown. */
|
||||
static String getUnicodeFor(String acip, boolean subscribed) {
|
||||
if (superACIP2unicode == null) {
|
||||
final boolean compactUnicode
|
||||
= ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
|
||||
superACIP2unicode = new HashMap(144);
|
||||
subACIP2unicode = new HashMap(42);
|
||||
|
||||
// oddball:
|
||||
subACIP2unicode.put("V", "\u0FAD");
|
||||
|
||||
superACIP2unicode.put("DH", "\u0F52");
|
||||
subACIP2unicode.put("DH", "\u0FA2");
|
||||
superACIP2unicode.put("BH", "\u0F57");
|
||||
subACIP2unicode.put("BH", "\u0FA7");
|
||||
superACIP2unicode.put("dH", "\u0F4D");
|
||||
subACIP2unicode.put("dH", "\u0F9D");
|
||||
superACIP2unicode.put("DZH", "\u0F5C");
|
||||
subACIP2unicode.put("DZH", "\u0FAC");
|
||||
superACIP2unicode.put("Ksh", "\u0F69");
|
||||
subACIP2unicode.put("Ksh", "\u0FB9");
|
||||
superACIP2unicode.put("GH", "\u0F43");
|
||||
subACIP2unicode.put("GH", "\u0F93");
|
||||
superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
|
||||
subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
|
||||
superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
|
||||
subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
|
||||
superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
|
||||
subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
|
||||
superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
|
||||
subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
|
||||
superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
|
||||
subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
|
||||
superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
|
||||
subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
|
||||
superACIP2unicode.put("K", "\u0F40");
|
||||
subACIP2unicode.put("K", "\u0F90");
|
||||
superACIP2unicode.put("KH", "\u0F41");
|
||||
|
|
|
@ -7449,13 +7449,13 @@ M+NA
|
|||
uhelp("NA+YA", "\u0f53\u0fb1"); // FIXME: warn about the extra A
|
||||
uhelp("NE+YA", "[#ERROR CONVERTING ACIP DOCUMENT: The tsheg bar (\"syllable\") NE+YA has these errors: Cannot convert ACIP NE+-YA because + is not an ACIP consonant]");
|
||||
uhelp("tRAStA", "\u0f4a\u0fb2\u0f66\u0f9a");
|
||||
uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fac"); // tricky because DZHDZA is not in TMW but DZHDZHA is
|
||||
uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fab");
|
||||
uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab\u0fb7"); // tricky because DZHDZA is not in TMW but DZHDZHA is
|
||||
uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab");
|
||||
uhelp("P+S+N+YA", "\u0f54\u0fb6\u0fa3\u0fb1");
|
||||
uhelp("P+S+NYA", "\u0f54\u0fb6\u0f99");
|
||||
uhelp("PSNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP PSNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f54\u0fb6\u0f99"); // Is this P+S+N+YA? No, it's P+S+NYA. But warn!
|
||||
uhelp("NNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP NNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f53\u0f99");
|
||||
uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f43\u0f99");
|
||||
uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f42\u0fb7\u0f99");
|
||||
|
||||
// TS+NYA and T+S+N+YA are both legal, so what is TSNYA?
|
||||
// Private correspondence with Robert Chilton says that it is
|
||||
|
@ -7471,7 +7471,7 @@ M+NA
|
|||
uhelp("KAo KHA", "\u0f40\u0f37\u0f0b\u0f41");
|
||||
uhelp("KA KAo KHA", "\u0f40\u0f0b\u0f40\u0f37\u0f0b\u0f41");
|
||||
uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot convert the ACIP {x} to Tibetan because it is unclear what the result should be.]");
|
||||
uhelp("G+DHA", "\u0f42\u0fa2");
|
||||
uhelp("G+DHA", "\u0f42\u0fa1\u0fb7");
|
||||
uhelp("P'EE", "\u0f54\u0f71\u0f7b");
|
||||
|
||||
uhelp("KA", "\u0f40");
|
||||
|
@ -7523,9 +7523,9 @@ M+NA
|
|||
|
||||
uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
|
||||
uhelp("*#HUm: G+DHOO GRO`;.,",
|
||||
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
|
||||
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa1\u0fb7\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
|
||||
uhelp("*#HUm: K+DHA GRO`;.,",
|
||||
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa2\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
|
||||
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa1\u0fb7\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
|
||||
uhelp("HA,\nHA\n\nHA", "\u0f67\u0f0d \u0f67\u0f0b\n\n\u0f67");
|
||||
uhelp("NGA,", "\u0f44\u0f0c\u0f0d");
|
||||
uhelp("NGA,\nHA\n\nHA", "\u0f44\u0f0c\u0f0d \u0f67\u0f0b\n\n\u0f67");
|
||||
|
@ -7538,6 +7538,36 @@ M+NA
|
|||
uhelp("GU, ,KHO", "\u0f42\u0f74\u0f0d \u0f0d\u0f41\u0f7c");
|
||||
uhelp("GU ,KHO", "\u0f42\u0f74\u0f0b \u0f0d\u0f41\u0f7c"); // FIXME: missing a shad after GU, warn about that.
|
||||
uhelp("GA HA", "\u0f42\u0f0b \u0f67");
|
||||
uhelp("WA", "\u0f5d");
|
||||
uhelp("W", "\u0f5d");
|
||||
uhelp("WO", "\u0f5d\u0f7c");
|
||||
uhelp("WWA", "\u0f5d\u0fba");
|
||||
uhelp("W+WA", "\u0f5d\u0fba");
|
||||
uhelp("WNA", "\u0f5d\u0fa3");
|
||||
uhelp("WN", "\u0f5d\u0fa3");
|
||||
uhelp("W+NA", "\u0f5d\u0fa3");
|
||||
uhelp("W+N", "\u0f5d\u0fa3");
|
||||
uhelp("W+YA", "\u0f5d\u0fb1");
|
||||
uhelp("W+Y", "\u0f5d\u0fb1");
|
||||
uhelp("WYA", "\u0f5d\u0fb1");
|
||||
uhelp("WY", "\u0f5d\u0fb1");
|
||||
uhelp("WR", "\u0f5d\u0fb2");
|
||||
uhelp("WRA", "\u0f5d\u0fb2");
|
||||
uhelp("W+RA", "\u0f5d\u0fb2");
|
||||
uhelp("W+R", "\u0f5d\u0fb2");
|
||||
uhelp("BCWA", "\u0f56\u0f95\u0fba");
|
||||
uhelp("BCW", "\u0f56\u0f95\u0fba");
|
||||
uhelp("BCWO", "\u0f56\u0f95\u0fba\u0f7c");
|
||||
uhelp("BCVA", "\u0f56\u0f45\u0fad");
|
||||
uhelp("BCV", "\u0f56\u0f45\u0fad");
|
||||
uhelp("BCV'O", "\u0f56\u0f45\u0fad\u0f71\u0f7c");
|
||||
uhelp("BCV'A", "\u0f56\u0f45\u0fad\u0f71");
|
||||
uhelp("BCV'", "\u0f56\u0f95\u0fad\u0fb0");
|
||||
uhelp("GYA", "\u0f42\u0fb1");
|
||||
uhelp("GY", "\u0f42\u0fb1");
|
||||
uhelp("G-YA", "\u0f42\u0f61");
|
||||
uhelp("GA-YA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YA.]\u0f42\u0f61");
|
||||
uhelp("GA-YO", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YO.]\u0f42\u0f61\u0F7c");
|
||||
}
|
||||
public void testFixedFormSubjoinedConsonants() {
|
||||
// Usual subjoined RA:
|
||||
|
@ -7563,7 +7593,7 @@ M+NA
|
|||
+ "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
|
||||
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
|
||||
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
|
||||
+ "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR
|
||||
+ "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f40\u0fb5\u0fbc\u0f0b" // KshR
|
||||
+ "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
|
||||
+ "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
|
||||
+ "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE
|
||||
|
|
|
@ -184,7 +184,11 @@ class TPair {
|
|||
|
||||
/** Returns the EWTS Wylie that corresponds to this pair if
|
||||
* justLeft is false, or the EWTS Wylie that corresponds to just
|
||||
* {@link #getLeft()} if justLeft is true. */
|
||||
* {@link #getLeft()} if justLeft is true.
|
||||
*
|
||||
* <p>Returns "W" for ACIP "W", "r" for ACIP "R", "y" for ACIP "Y",
|
||||
* even though sometimes the EWTS for those is "w", "R", or "Y".
|
||||
* Handle that in the caller. */
|
||||
String getWylie(boolean justLeft) {
|
||||
String leftWylie = null;
|
||||
if (getLeft() != null) {
|
||||
|
|
|
@ -660,25 +660,7 @@ class TPairList {
|
|||
TPair lastPair = get(size() - 1);
|
||||
wylieForConsonant.append(lastPair.getWylie(true));
|
||||
String hashKey = wylieForConsonant.toString();
|
||||
// r-w and r+w are both known hash keys. Sort 'em out. They
|
||||
// are the only things like this according to bug report
|
||||
// #800166.
|
||||
if ("r+w".equals(hashKey)) {
|
||||
boolean sawWazur = false;
|
||||
for (int x = 0; x < size(); x++) {
|
||||
TPair p = get(x);
|
||||
if ("V".equals(get(x).getLeft())) {
|
||||
sawWazur = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (sawWazur)
|
||||
hashKey = "r-w";
|
||||
else
|
||||
hashKey = "r+W"; // because EWTS has special handling
|
||||
// for full-formed subjoined
|
||||
// consonants
|
||||
} else {
|
||||
|
||||
// Because EWTS has special handling for full-formed
|
||||
// subjoined consonants, we have special handling here.
|
||||
if ("r+y".equals(hashKey))
|
||||
|
@ -691,7 +673,24 @@ class TPairList {
|
|||
hashKey = "N+D+R+y";
|
||||
else if ("k+Sh+r".equals(hashKey))
|
||||
hashKey = "k+Sh+R";
|
||||
}
|
||||
|
||||
// TPair.getWylie(..) returns "W" sometimes when "w" is what
|
||||
// really should be returned. ("V" always causes "w" to be
|
||||
// returned, which is fine.) We'll change "W" to "w" here if
|
||||
// we need to. We do it only for a few known stacks (the ones
|
||||
// in TMW).
|
||||
if ("W".equals(hashKey))
|
||||
hashKey = "w";
|
||||
else if ("W+y".equals(hashKey))
|
||||
hashKey = "w+y";
|
||||
else if ("W+r".equals(hashKey))
|
||||
hashKey = "w+r";
|
||||
else if ("W+n".equals(hashKey))
|
||||
hashKey = "w+n";
|
||||
else if ("W+W".equals(hashKey))
|
||||
hashKey = "w+W";
|
||||
// We're NOT doing it for r+W etc., on purpose.
|
||||
|
||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
||||
hashKey = hashKey.replace('+', '-');
|
||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
||||
|
|
Loading…
Reference in a new issue