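ACIP->Unicode now emits two characters (the base consonant followed by a subjoined character, e.g. U+0F51 U+0FB7 instead of U+0F52) for the consonants DH, BH, dH, DZH, GH, and Ksh, rather than one precomposed character, unless the option thdl.acip.to.unicode.conversions.use.0F52.et.cetera is set. This matches the dislike for precomposed characters like U+0F77 etc.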

ACIP->Tibetan was not giving an error for BCWA because it parsed like BCVA.  Fixed.
This commit is contained in:
dchandler 2003-12-15 07:32:14 +00:00
parent e9f7b2dfed
commit e7a9e7968f
4 changed files with 102 additions and 54 deletions

View file

@ -24,6 +24,7 @@ import java.util.HashMap;
import java.util.StringTokenizer; import java.util.StringTokenizer;
import java.util.List; import java.util.List;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.DuffCode; import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.THDLWylieConstants; import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.tib.text.TibetanMachineWeb;
@ -206,7 +207,11 @@ public class ACIPRules {
private static HashMap acipConsonant2wylie = null; private static HashMap acipConsonant2wylie = null;
/** Returns the EWTS corresponding to the given ACIP consonant /** Returns the EWTS corresponding to the given ACIP consonant
* (without the "A" vowel). Returns null if there is no such * (without the "A" vowel). Returns null if there is no such
* EWTS. */ * EWTS.
*
* <p>Returns "W" for ACIP "W", "r" for ACIP "R", and "y" for ACIP "Y",
* even though the correct EWTS for those is sometimes "w", "R", or "Y".
* The caller must handle those cases. */
static final String getWylieForACIPConsonant(String acip) { static final String getWylieForACIPConsonant(String acip) {
if (acipConsonant2wylie == null) { if (acipConsonant2wylie == null) {
acipConsonant2wylie = new HashMap(37); acipConsonant2wylie = new HashMap(37);
@ -242,7 +247,15 @@ public class ACIPRules {
putMapping(acipConsonant2wylie, "TZ", "ts"); putMapping(acipConsonant2wylie, "TZ", "ts");
putMapping(acipConsonant2wylie, "TS", "tsh"); putMapping(acipConsonant2wylie, "TS", "tsh");
putMapping(acipConsonant2wylie, "DZ", "dz"); putMapping(acipConsonant2wylie, "DZ", "dz");
putMapping(acipConsonant2wylie, "W", "w"); putMapping(acipConsonant2wylie, "W", "W"
/* NOTE WELL: sometimes "w", sometimes "W".
Handle this in the caller.
Reasoning for "W" instead of "w": r-w and
r+w are both known hash keys. We sort 'em
out this way. (They are the only things
like this according to bug report #800166.) */
);
putMapping(acipConsonant2wylie, "ZH", "zh"); putMapping(acipConsonant2wylie, "ZH", "zh");
putMapping(acipConsonant2wylie, "Z", "z"); putMapping(acipConsonant2wylie, "Z", "z");
putMapping(acipConsonant2wylie, "'", "'"); putMapping(acipConsonant2wylie, "'", "'");
@ -329,24 +342,26 @@ public class ACIPRules {
* true. Returns null if acip is unknown. */ * true. Returns null if acip is unknown. */
static String getUnicodeFor(String acip, boolean subscribed) { static String getUnicodeFor(String acip, boolean subscribed) {
if (superACIP2unicode == null) { if (superACIP2unicode == null) {
final boolean compactUnicode
= ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
superACIP2unicode = new HashMap(144); superACIP2unicode = new HashMap(144);
subACIP2unicode = new HashMap(42); subACIP2unicode = new HashMap(42);
// oddball: // oddball:
subACIP2unicode.put("V", "\u0FAD"); subACIP2unicode.put("V", "\u0FAD");
superACIP2unicode.put("DH", "\u0F52"); superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
subACIP2unicode.put("DH", "\u0FA2"); subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
superACIP2unicode.put("BH", "\u0F57"); superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
subACIP2unicode.put("BH", "\u0FA7"); subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
superACIP2unicode.put("dH", "\u0F4D"); superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
subACIP2unicode.put("dH", "\u0F9D"); subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
superACIP2unicode.put("DZH", "\u0F5C"); superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
subACIP2unicode.put("DZH", "\u0FAC"); subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
superACIP2unicode.put("Ksh", "\u0F69"); superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
subACIP2unicode.put("Ksh", "\u0FB9"); subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
superACIP2unicode.put("GH", "\u0F43"); superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
subACIP2unicode.put("GH", "\u0F93"); subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
superACIP2unicode.put("K", "\u0F40"); superACIP2unicode.put("K", "\u0F40");
subACIP2unicode.put("K", "\u0F90"); subACIP2unicode.put("K", "\u0F90");
superACIP2unicode.put("KH", "\u0F41"); superACIP2unicode.put("KH", "\u0F41");

View file

@ -7449,13 +7449,13 @@ M+NA
uhelp("NA+YA", "\u0f53\u0fb1"); // FIXME: warn about the extra A uhelp("NA+YA", "\u0f53\u0fb1"); // FIXME: warn about the extra A
uhelp("NE+YA", "[#ERROR CONVERTING ACIP DOCUMENT: The tsheg bar (\"syllable\") NE+YA has these errors: Cannot convert ACIP NE+-YA because + is not an ACIP consonant]"); uhelp("NE+YA", "[#ERROR CONVERTING ACIP DOCUMENT: The tsheg bar (\"syllable\") NE+YA has these errors: Cannot convert ACIP NE+-YA because + is not an ACIP consonant]");
uhelp("tRAStA", "\u0f4a\u0fb2\u0f66\u0f9a"); uhelp("tRAStA", "\u0f4a\u0fb2\u0f66\u0f9a");
uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fac"); // tricky because DZHDZA is not in TMW but DZHDZHA is uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab\u0fb7"); // tricky because DZHDZA is not in TMW but DZHDZHA is
uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fab"); uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab");
uhelp("P+S+N+YA", "\u0f54\u0fb6\u0fa3\u0fb1"); uhelp("P+S+N+YA", "\u0f54\u0fb6\u0fa3\u0fb1");
uhelp("P+S+NYA", "\u0f54\u0fb6\u0f99"); uhelp("P+S+NYA", "\u0f54\u0fb6\u0f99");
uhelp("PSNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP PSNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f54\u0fb6\u0f99"); // Is this P+S+N+YA? No, it's P+S+NYA. But warn! uhelp("PSNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP PSNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f54\u0fb6\u0f99"); // Is this P+S+N+YA? No, it's P+S+NYA. But warn!
uhelp("NNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP NNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f53\u0f99"); uhelp("NNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP NNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f53\u0f99");
uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f43\u0f99"); uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f42\u0fb7\u0f99");
// TS+NYA and T+S+N+YA are both legal, so what is TSNYA? // TS+NYA and T+S+N+YA are both legal, so what is TSNYA?
// Private correspondence with Robert Chilton says that it is // Private correspondence with Robert Chilton says that it is
@ -7471,7 +7471,7 @@ M+NA
uhelp("KAo KHA", "\u0f40\u0f37\u0f0b\u0f41"); uhelp("KAo KHA", "\u0f40\u0f37\u0f0b\u0f41");
uhelp("KA KAo KHA", "\u0f40\u0f0b\u0f40\u0f37\u0f0b\u0f41"); uhelp("KA KAo KHA", "\u0f40\u0f0b\u0f40\u0f37\u0f0b\u0f41");
uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot convert the ACIP {x} to Tibetan because it is unclear what the result should be.]"); uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot convert the ACIP {x} to Tibetan because it is unclear what the result should be.]");
uhelp("G+DHA", "\u0f42\u0fa2"); uhelp("G+DHA", "\u0f42\u0fa1\u0fb7");
uhelp("P'EE", "\u0f54\u0f71\u0f7b"); uhelp("P'EE", "\u0f54\u0f71\u0f7b");
uhelp("KA", "\u0f40"); uhelp("KA", "\u0f40");
@ -7523,9 +7523,9 @@ M+NA
uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D"); uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
uhelp("*#HUm: G+DHOO GRO`;.,", uhelp("*#HUm: G+DHOO GRO`;.,",
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa1\u0fb7\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
uhelp("*#HUm: K+DHA GRO`;.,", uhelp("*#HUm: K+DHA GRO`;.,",
"\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa2\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa1\u0fb7\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
uhelp("HA,\nHA\n\nHA", "\u0f67\u0f0d \u0f67\u0f0b\n\n\u0f67"); uhelp("HA,\nHA\n\nHA", "\u0f67\u0f0d \u0f67\u0f0b\n\n\u0f67");
uhelp("NGA,", "\u0f44\u0f0c\u0f0d"); uhelp("NGA,", "\u0f44\u0f0c\u0f0d");
uhelp("NGA,\nHA\n\nHA", "\u0f44\u0f0c\u0f0d \u0f67\u0f0b\n\n\u0f67"); uhelp("NGA,\nHA\n\nHA", "\u0f44\u0f0c\u0f0d \u0f67\u0f0b\n\n\u0f67");
@ -7538,6 +7538,36 @@ M+NA
uhelp("GU, ,KHO", "\u0f42\u0f74\u0f0d \u0f0d\u0f41\u0f7c"); uhelp("GU, ,KHO", "\u0f42\u0f74\u0f0d \u0f0d\u0f41\u0f7c");
uhelp("GU ,KHO", "\u0f42\u0f74\u0f0b \u0f0d\u0f41\u0f7c"); // FIXME: missing a shad after GU, warn about that. uhelp("GU ,KHO", "\u0f42\u0f74\u0f0b \u0f0d\u0f41\u0f7c"); // FIXME: missing a shad after GU, warn about that.
uhelp("GA HA", "\u0f42\u0f0b \u0f67"); uhelp("GA HA", "\u0f42\u0f0b \u0f67");
uhelp("WA", "\u0f5d");
uhelp("W", "\u0f5d");
uhelp("WO", "\u0f5d\u0f7c");
uhelp("WWA", "\u0f5d\u0fba");
uhelp("W+WA", "\u0f5d\u0fba");
uhelp("WNA", "\u0f5d\u0fa3");
uhelp("WN", "\u0f5d\u0fa3");
uhelp("W+NA", "\u0f5d\u0fa3");
uhelp("W+N", "\u0f5d\u0fa3");
uhelp("W+YA", "\u0f5d\u0fb1");
uhelp("W+Y", "\u0f5d\u0fb1");
uhelp("WYA", "\u0f5d\u0fb1");
uhelp("WY", "\u0f5d\u0fb1");
uhelp("WR", "\u0f5d\u0fb2");
uhelp("WRA", "\u0f5d\u0fb2");
uhelp("W+RA", "\u0f5d\u0fb2");
uhelp("W+R", "\u0f5d\u0fb2");
uhelp("BCWA", "\u0f56\u0f95\u0fba");
uhelp("BCW", "\u0f56\u0f95\u0fba");
uhelp("BCWO", "\u0f56\u0f95\u0fba\u0f7c");
uhelp("BCVA", "\u0f56\u0f45\u0fad");
uhelp("BCV", "\u0f56\u0f45\u0fad");
uhelp("BCV'O", "\u0f56\u0f45\u0fad\u0f71\u0f7c");
uhelp("BCV'A", "\u0f56\u0f45\u0fad\u0f71");
uhelp("BCV'", "\u0f56\u0f95\u0fad\u0fb0");
uhelp("GYA", "\u0f42\u0fb1");
uhelp("GY", "\u0f42\u0fb1");
uhelp("G-YA", "\u0f42\u0f61");
uhelp("GA-YA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YA.]\u0f42\u0f61");
uhelp("GA-YO", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YO.]\u0f42\u0f61\u0F7c");
} }
public void testFixedFormSubjoinedConsonants() { public void testFixedFormSubjoinedConsonants() {
// Usual subjoined RA: // Usual subjoined RA:
@ -7563,7 +7593,7 @@ M+NA
+ "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE + "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA
+ "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE
+ "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f40\u0fb5\u0fbc\u0f0b" // KshR
+ "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE + "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE
+ "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY + "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY
+ "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE + "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE

View file

@ -184,7 +184,11 @@ class TPair {
/** Returns the EWTS Wylie that corresponds to this pair if /** Returns the EWTS Wylie that corresponds to this pair if
* justLeft is false, or the EWTS Wylie that corresponds to just * justLeft is false, or the EWTS Wylie that corresponds to just
* {@link #getLeft()} if justLeft is true. */ * {@link #getLeft()} if justLeft is true.
*
* <p>Returns "W" for ACIP "W", "r" for ACIP "R", and "y" for ACIP "Y",
* even though the correct EWTS for those is sometimes "w", "R", or "Y".
* The caller must handle those cases. */
String getWylie(boolean justLeft) { String getWylie(boolean justLeft) {
String leftWylie = null; String leftWylie = null;
if (getLeft() != null) { if (getLeft() != null) {

View file

@ -660,25 +660,7 @@ class TPairList {
TPair lastPair = get(size() - 1); TPair lastPair = get(size() - 1);
wylieForConsonant.append(lastPair.getWylie(true)); wylieForConsonant.append(lastPair.getWylie(true));
String hashKey = wylieForConsonant.toString(); String hashKey = wylieForConsonant.toString();
// r-w and r+w are both known hash keys. Sort 'em out. They
// are the only things like this according to bug report
// #800166.
if ("r+w".equals(hashKey)) {
boolean sawWazur = false;
for (int x = 0; x < size(); x++) {
TPair p = get(x);
if ("V".equals(get(x).getLeft())) {
sawWazur = true;
break;
}
}
if (sawWazur)
hashKey = "r-w";
else
hashKey = "r+W"; // because EWTS has special handling
// for full-formed subjoined
// consonants
} else {
// Because EWTS has special handling for full-formed // Because EWTS has special handling for full-formed
// subjoined consonants, we have special handling here. // subjoined consonants, we have special handling here.
if ("r+y".equals(hashKey)) if ("r+y".equals(hashKey))
@ -691,7 +673,24 @@ class TPairList {
hashKey = "N+D+R+y"; hashKey = "N+D+R+y";
else if ("k+Sh+r".equals(hashKey)) else if ("k+Sh+r".equals(hashKey))
hashKey = "k+Sh+R"; hashKey = "k+Sh+R";
}
// TPair.getWylie(..) returns "W" sometimes when "w" is what
// really should be returned. ("V" always causes "w" to be
// returned, which is fine.) We'll change "W" to "w" here if
// we need to. We do it only for a few known stacks (the ones
// in TMW).
if ("W".equals(hashKey))
hashKey = "w";
else if ("W+y".equals(hashKey))
hashKey = "w+y";
else if ("W+r".equals(hashKey))
hashKey = "w+r";
else if ("W+n".equals(hashKey))
hashKey = "w+n";
else if ("W+W".equals(hashKey))
hashKey = "w+W";
// We're NOT doing it for r+W etc., on purpose.
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
hashKey = hashKey.replace('+', '-'); hashKey = hashKey.replace('+', '-');
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {