diff --git a/source/org/thdl/tib/text/ttt/ACIPRules.java b/source/org/thdl/tib/text/ttt/ACIPRules.java index fcb6269..1ac89a8 100644 --- a/source/org/thdl/tib/text/ttt/ACIPRules.java +++ b/source/org/thdl/tib/text/ttt/ACIPRules.java @@ -24,6 +24,7 @@ import java.util.HashMap; import java.util.StringTokenizer; import java.util.List; +import org.thdl.util.ThdlOptions; import org.thdl.tib.text.DuffCode; import org.thdl.tib.text.THDLWylieConstants; import org.thdl.tib.text.TibetanMachineWeb; @@ -206,7 +207,11 @@ public class ACIPRules { private static HashMap acipConsonant2wylie = null; /** Returns the EWTS corresponding to the given ACIP consonant * (without the "A" vowel). Returns null if there is no such - * EWTS. */ + * EWTS. + * + *
Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y", + * even though sometimes the EWTS for those is "w", "R", or "Y". + * Handle that in the caller. */ static final String getWylieForACIPConsonant(String acip) { if (acipConsonant2wylie == null) { acipConsonant2wylie = new HashMap(37); @@ -242,7 +247,15 @@ public class ACIPRules { putMapping(acipConsonant2wylie, "TZ", "ts"); putMapping(acipConsonant2wylie, "TS", "tsh"); putMapping(acipConsonant2wylie, "DZ", "dz"); - putMapping(acipConsonant2wylie, "W", "w"); + putMapping(acipConsonant2wylie, "W", "W" + /* NOTE WELL: sometimes "w", sometimes "W". + Handle this in the caller. + + Reasoning for "W" instead of "w": r-w and + r+w are both known hash keys. We sort 'em + out this way. (They are the only things + like this according to bug report #800166.) */ + ); putMapping(acipConsonant2wylie, "ZH", "zh"); putMapping(acipConsonant2wylie, "Z", "z"); putMapping(acipConsonant2wylie, "'", "'"); @@ -329,24 +342,26 @@ public class ACIPRules { * true. Returns null if acip is unknown. */ static String getUnicodeFor(String acip, boolean subscribed) { if (superACIP2unicode == null) { + final boolean compactUnicode + = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera"); superACIP2unicode = new HashMap(144); subACIP2unicode = new HashMap(42); // oddball: subACIP2unicode.put("V", "\u0FAD"); - superACIP2unicode.put("DH", "\u0F52"); - subACIP2unicode.put("DH", "\u0FA2"); - superACIP2unicode.put("BH", "\u0F57"); - subACIP2unicode.put("BH", "\u0FA7"); - superACIP2unicode.put("dH", "\u0F4D"); - subACIP2unicode.put("dH", "\u0F9D"); - superACIP2unicode.put("DZH", "\u0F5C"); - subACIP2unicode.put("DZH", "\u0FAC"); - superACIP2unicode.put("Ksh", "\u0F69"); - subACIP2unicode.put("Ksh", "\u0FB9"); - superACIP2unicode.put("GH", "\u0F43"); - subACIP2unicode.put("GH", "\u0F93"); + superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7")); + subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7")); + superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7")); + subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7")); + superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7")); + subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7")); + superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7")); + subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7")); + superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5")); + subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5")); + superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7")); + subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7")); superACIP2unicode.put("K", "\u0F40"); subACIP2unicode.put("K", "\u0F90"); superACIP2unicode.put("KH", "\u0F41"); diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index 02a5d49..242c28a 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -7449,13 +7449,13 @@ M+NA uhelp("NA+YA", "\u0f53\u0fb1"); // FIXME: warn about the extra A uhelp("NE+YA", "[#ERROR CONVERTING ACIP DOCUMENT: The tsheg bar (\"syllable\") NE+YA has these errors: Cannot convert ACIP NE+-YA because + is not an ACIP consonant]"); uhelp("tRAStA", "\u0f4a\u0fb2\u0f66\u0f9a"); - uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fac"); // tricky because DZHDZA is not in TMW but DZHDZHA is - uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5c\u0fab"); + uhelp("DZHDZHA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZHA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab\u0fb7"); // tricky because DZHDZA is not in TMW but DZHDZHA is + uhelp("DZHDZA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP DZHDZA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f5b\u0fb7\u0fab"); uhelp("P+S+N+YA", "\u0f54\u0fb6\u0fa3\u0fb1"); uhelp("P+S+NYA", "\u0f54\u0fb6\u0f99"); uhelp("PSNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP PSNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f54\u0fb6\u0f99"); // Is this P+S+N+YA? No, it's P+S+NYA. But warn! uhelp("NNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP NNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f53\u0f99"); - uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f43\u0f99"); + uhelp("GHNYA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP GHNYA was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f42\u0fb7\u0f99"); // TS+NYA and T+S+N+YA are both legal, so what is TSNYA? // Private correspondence with Robert Chilton says that it is @@ -7471,7 +7471,7 @@ M+NA uhelp("KAo KHA", "\u0f40\u0f37\u0f0b\u0f41"); uhelp("KA KAo KHA", "\u0f40\u0f0b\u0f40\u0f37\u0f0b\u0f41"); uhelp("KAx", "\u0f40[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot convert the ACIP {x} to Tibetan because it is unclear what the result should be.]"); - uhelp("G+DHA", "\u0f42\u0fa2"); + uhelp("G+DHA", "\u0f42\u0fa1\u0fb7"); uhelp("P'EE", "\u0f54\u0f71\u0f7b"); uhelp("KA", "\u0f40"); @@ -7523,9 +7523,9 @@ M+NA uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D"); uhelp("*#HUm: G+DHOO GRO`;.,", - "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); + "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa1\u0fb7\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); uhelp("*#HUm: K+DHA GRO`;.,", - "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa2\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); + "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa1\u0fb7\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); uhelp("HA,\nHA\n\nHA", "\u0f67\u0f0d \u0f67\u0f0b\n\n\u0f67"); uhelp("NGA,", "\u0f44\u0f0c\u0f0d"); uhelp("NGA,\nHA\n\nHA", "\u0f44\u0f0c\u0f0d \u0f67\u0f0b\n\n\u0f67"); @@ -7538,6 +7538,36 @@ M+NA uhelp("GU, ,KHO", "\u0f42\u0f74\u0f0d \u0f0d\u0f41\u0f7c"); uhelp("GU ,KHO", "\u0f42\u0f74\u0f0b \u0f0d\u0f41\u0f7c"); // FIXME: missing a shad after GU, warn about that. uhelp("GA HA", "\u0f42\u0f0b \u0f67"); + uhelp("WA", "\u0f5d"); + uhelp("W", "\u0f5d"); + uhelp("WO", "\u0f5d\u0f7c"); + uhelp("WWA", "\u0f5d\u0fba"); + uhelp("W+WA", "\u0f5d\u0fba"); + uhelp("WNA", "\u0f5d\u0fa3"); + uhelp("WN", "\u0f5d\u0fa3"); + uhelp("W+NA", "\u0f5d\u0fa3"); + uhelp("W+N", "\u0f5d\u0fa3"); + uhelp("W+YA", "\u0f5d\u0fb1"); + uhelp("W+Y", "\u0f5d\u0fb1"); + uhelp("WYA", "\u0f5d\u0fb1"); + uhelp("WY", "\u0f5d\u0fb1"); + uhelp("WR", "\u0f5d\u0fb2"); + uhelp("WRA", "\u0f5d\u0fb2"); + uhelp("W+RA", "\u0f5d\u0fb2"); + uhelp("W+R", "\u0f5d\u0fb2"); + uhelp("BCWA", "\u0f56\u0f95\u0fba"); + uhelp("BCW", "\u0f56\u0f95\u0fba"); + uhelp("BCWO", "\u0f56\u0f95\u0fba\u0f7c"); + uhelp("BCVA", "\u0f56\u0f45\u0fad"); + uhelp("BCV", "\u0f56\u0f45\u0fad"); + uhelp("BCV'O", "\u0f56\u0f45\u0fad\u0f71\u0f7c"); + uhelp("BCV'A", "\u0f56\u0f45\u0fad\u0f71"); + uhelp("BCV'", "\u0f56\u0f95\u0fad\u0fb0"); + uhelp("GYA", "\u0f42\u0fb1"); + uhelp("GY", "\u0f42\u0fb1"); + uhelp("G-YA", "\u0f42\u0f61"); + uhelp("GA-YA", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YA.]\u0f42\u0f61"); + uhelp("GA-YO", "[#WARNING CONVERTING ACIP DOCUMENT: There is a useless disambiguator in GA-YO.]\u0f42\u0f61\u0F7c"); } public void testFixedFormSubjoinedConsonants() { // Usual subjoined RA: @@ -7563,7 +7593,7 @@ M+NA + "\u0f61\u0fbb\u0f7b\u0f0b" // Y+YEE + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f0b" // ndRYA + "\u0f4e\u0f9c\u0fbc\u0fb1\u0f7b\u0f0b" // n+d+R+YEE - + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f69\u0fbc\u0f0b" // KshR + + "[#WARNING CONVERTING ACIP DOCUMENT: There is a chance that the ACIP KshR was intended to represent more consonants than we parsed it as representing -- NNYA, e.g., means N+NYA, but you can imagine seeing N+N+YA and typing NNYA for it too.]\u0f40\u0fb5\u0fbc\u0f0b" // KshR + "\u0f40\u0fb5\u0fbc\u0f7b\u0f0b" // K+sh+REE + "\u0f4e\u0f9c\u0fbb\u0f0b" // ndY + "\u0f4e\u0f9c\u0fbb\u0f7b\u0f0d" // n+d+YEE diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java index 318f62d..dd08025 100644 --- a/source/org/thdl/tib/text/ttt/TPair.java +++ b/source/org/thdl/tib/text/ttt/TPair.java @@ -184,7 +184,11 @@ class TPair { /** Returns the EWTS Wylie that corresponds to this pair if * justLeft is false, or the EWTS Wylie that corresponds to just - * {@link #getLeft()} if justLeft is true. */ + * {@link #getLeft()} if justLeft is true. + * + *
Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y", + * even though sometimes the EWTS for those is "w", "R", or "Y". + * Handle that in the caller. */ String getWylie(boolean justLeft) { String leftWylie = null; if (getLeft() != null) { diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index b9dcaee..3bd1c77 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -660,38 +660,37 @@ class TPairList { TPair lastPair = get(size() - 1); wylieForConsonant.append(lastPair.getWylie(true)); String hashKey = wylieForConsonant.toString(); - // r-w and r+w are both known hash keys. Sort 'em out. They - // are the only things like this according to bug report - // #800166. - if ("r+w".equals(hashKey)) { - boolean sawWazur = false; - for (int x = 0; x < size(); x++) { - TPair p = get(x); - if ("V".equals(get(x).getLeft())) { - sawWazur = true; - break; - } - } - if (sawWazur) - hashKey = "r-w"; - else - hashKey = "r+W"; // because EWTS has special handling - // for full-formed subjoined - // consonants - } else { - // Because EWTS has special handling for full-formed - // subjoined consonants, we have special handling here. - if ("r+y".equals(hashKey)) - hashKey = "r+Y"; - else if ("y+y".equals(hashKey)) - hashKey = "y+Y"; - else if ("N+D+y".equals(hashKey)) - hashKey = "N+D+Y"; - else if ("N+D+r+y".equals(hashKey)) - hashKey = "N+D+R+y"; - else if ("k+Sh+r".equals(hashKey)) - hashKey = "k+Sh+R"; - } + + // Because EWTS has special handling for full-formed + // subjoined consonants, we have special handling here. + if ("r+y".equals(hashKey)) + hashKey = "r+Y"; + else if ("y+y".equals(hashKey)) + hashKey = "y+Y"; + else if ("N+D+y".equals(hashKey)) + hashKey = "N+D+Y"; + else if ("N+D+r+y".equals(hashKey)) + hashKey = "N+D+R+y"; + else if ("k+Sh+r".equals(hashKey)) + hashKey = "k+Sh+R"; + + // TPair.getWylie(..) returns "W" sometimes when "w" is what + // really should be returned. ("V" always causes "w" to be + // returned, which is fine.) We'll change "W" to "w" here if + // we need to. We do it only for a few known stacks (the ones + // in TMW). + if ("W".equals(hashKey)) + hashKey = "w"; + else if ("W+y".equals(hashKey)) + hashKey = "w+y"; + else if ("W+r".equals(hashKey)) + hashKey = "w+r"; + else if ("W+n".equals(hashKey)) + hashKey = "w+n"; + else if ("W+W".equals(hashKey)) + hashKey = "w+W"; + // We're NOT doing it for r+W etc., on purpose. + if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { hashKey = hashKey.replace('+', '-'); if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {