Tremendously better EWTS->Unicode and EWTS->TMW conversion, though it is still not tested end-to-end and its unit tests are not yet perfect. See EWTSTest.RUN_FAILING_TESTS, for example, to find the known imperfections.

This commit is contained in:
dchandler 2005-07-06 02:19:38 +00:00
parent affb9e4b5e
commit 0b3a636f63
20 changed files with 797 additions and 350 deletions

View file

@ -472,11 +472,11 @@ the jvm starting tomcat:
description="compiles all JUnit test cases that can be compiled in the present CLASSPATH (NB that this distinction is just wishful thinking for now because we have such weak test coverage at this point)" >
<mkdir dir="${junitbin}"/>
<antcall target="create-timestamp-source-code"/> <!-- DLC NOW! The -run targets are mucking with this! It isn't fatal, but it should be fixed. -->
<!-- TODO(DLC)[EWTS->Tibetan]: <antcall target="our-internal-javac-task">
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/text/ttt/EWTSTest.java"/>
</antcall> -->
</antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"

View file

@ -73,7 +73,7 @@
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile
build.xml check-report' will fail. -->
<sysproperty key="java.awt.headless" value="true"/>
<!-- TODO(DLC)[EWTS->Tibetan]: enable this test: <test name="org.thdl.tib.text.ttt.EWTSTest"/> -->
<test name="org.thdl.tib.text.ttt.EWTSTest"/>
<test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>
<test name="org.thdl.tib.text.TibetanMachineWebTest"/>

View file

@ -68,6 +68,11 @@ public class TibetanMachineWebTest extends TestCase {
assertTrue(org.thdl.tib.text.TibetanMachineWeb.startsWithWylieVowelSequence("eieio"));
assertTrue(org.thdl.tib.text.TibetanMachineWeb.startsWithWylieVowelSequence("auai-iAI"));
}
/** Checks that the Wylie for the tsheg glyph, a single space, maps
 *  to the Unicode tsheg, U+0F0B. */
public void testTshegUnicode() {
    // JUnit's assertEquals takes (expected, actual).  Passing the
    // expected literal first keeps the failure message's "expected"
    // and "but was" values the right way round.
    assertEquals("\u0f0b",
                 TibetanMachineWeb.getUnicodeForWylieForGlyph(" "));
}
}

View file

@ -312,6 +312,9 @@ public class UnicodeUtils implements UnicodeConstants {
if ((cp >= 'a' && cp <= 'z')
|| (cp >= 'A' && cp <= 'Z')
|| (cp >= '0' && cp <= '9')
|| cp == '\\'
|| cp == '~'
|| cp == '`'
|| cp == '.'
|| cp == ','
|| cp == ' '

View file

@ -634,5 +634,15 @@ public final class ACIPTraits implements TTraits {
public boolean isUnicodeWowel(char ch) { return false; }
public boolean couldBeValidStack(TPairList pl) { return true; }
public boolean stackingMustBeExplicit() { return false; }
/** Returns ":".  NOTE(review): presumably this is the ACIP
 *  transliteration that corresponds to U+0F7F -- confirm against
 *  the TTraits interface, which is not visible here. */
public String U0F7F() { return ":"; }
/** Returns null.  Test cases show that we don't need special-case
 *  treatment of this. */
public String U0F35() { return null; }
/** Returns null.  Test cases show that we don't need special-case
 *  treatment of this. */
public String U0F37() { return null; }
}

View file

@ -140,18 +140,51 @@ public class EWTSTest extends TestCase {
* legal EWTS transliteration. */
static void assert_EWTS_error(String ewts) {
boolean ewts_error = hasEwtsError(ewts);
assertTrue(ewts_error);
if (!ewts_error) {
System.out.println("assert_EWTS_error: We expected a conversion"
+ " error for the EWTS snippet '"
+ ewts + "' but found none.");
assertTrue(ewts_error);
}
}
/** Tests that the EWTS->unicode converter isn't completely
braindead.  Covers short inputs only: a bare vowel, prefixes,
stacks written both with and without an explicit '+', the '.'
disambiguator, the X and ~X marks, trailing punctuation, and a
few inputs that must be reported as errors. */
public void testEwtsBasics() {
// '_' is expected to come out as U+00A0 and a trailing ' ' as the
// tsheg, U+0F0B.
ewts2uni_test("ug_pha ", "\u0f68\u0f74\u0f42\u00a0\u0f55\u0f0b");
ewts2uni_test("a ", "\u0f68\u0f0b");
ewts2uni_test("g.a ", "\u0f42\u0f68\u0f0b");
// A trailing H is expected to come out as U+0F7F.
ewts2uni_test("khyAH", "\u0f41\u0fb1\u0f71\u0f7f");
ewts2uni_test("'ajamH", "\u0f60\u0f47\u0f58\u0f7f");
assert_EWTS_error("'jamH"); // If we decide this should be legal, TPairList.populateWithTGCPairs is easily modified.
// ~X is expected to come out as U+0F35 and X as U+0F37, in the
// order in which they were written.
ewts2uni_test("'jam~X", "\u0f60\u0f47\u0f58\u0f35");
ewts2uni_test("'jam~XX", "\u0f60\u0f47\u0f58\u0f35\u0f37");
ewts2uni_test("'jamX~X", "\u0f60\u0f47\u0f58\u0f37\u0f35");
ewts2uni_test("'jamX", "\u0f60\u0f47\u0f58\u0f37");
// prefix rules say this is illegal. use [bana] or [b.na] if
// you want those.
assert_EWTS_error("bna ");
ewts2uni_test("ma", "\u0f58");
ewts2uni_test("mi", "\u0f58\u0f72");
ewts2uni_test("mi ", "\u0f58\u0f72\u0f0b");
ewts2uni_test("mi/", "\u0f58\u0f72\u0f0d");
// ra does not take a ba prefix, no, but b+ra is a native Tibetan stack.
ewts2uni_test("bra ", "\u0f56\u0fb2\u0f0b");
ewts2uni_test("b+ra ", "\u0f56\u0fb2\u0f0b");
ewts2uni_test("bka", "\u0f56\u0f40");
ewts2uni_test("bs+ra ", "\u0f56\u0f66\u0fb2\u0f0b");
ewts2uni_test("bsra ", "\u0f56\u0f66\u0fb2\u0f0b");
ewts2uni_test("bsrag", "\u0f56\u0f66\u0fb2\u0f42");
ewts2uni_test("bsragd", "\u0f56\u0f66\u0fb2\u0f42\u0f51");
assert_EWTS_error("bsragde");
ewts2uni_test("bsrU*", "\u0f56\u0f66\u0fb2\u0f71\u0f74\u0f0c");
// With '.' or an explicit 'a' between them, b and r are expected
// to stay two full letters (U+0F56 U+0F62) rather than stack.
ewts2uni_test("b.ra ", "\u0f56\u0f62\u0f0b");
ewts2uni_test("bara ", "\u0f56\u0f62\u0f0b");
ewts2uni_test("b+Ra ", "\u0f56\u0fbc\u0f0b");
}
@ -243,7 +276,7 @@ public class EWTSTest extends TestCase {
}
public void test__EWTS__stacked_wowels_on_achen() {
if (false) { // TODO(DLC)[EWTS->Tibetan]: make this true ASAP
if (RUN_FAILING_TESTS) { // TODO(DLC)[EWTS->Tibetan]: make this true ASAP
ewts2uni_test("o+o", "\u0f68\u0f7c\u0f7c");
assert_EWTS_error("a+o"); // TODO(DLC)[EWTS->Tibetan]:?
assert_EWTS_error("o+a"); // TODO(DLC)[EWTS->Tibetan]:?
@ -565,22 +598,26 @@ public class EWTSTest extends TestCase {
/** Tests that the EWTS that the spec says corresponds to each
* codepoint really does. */
public void test__EWTS__tags_each_unicode_value() {
ewts2uni_test("\\u0ef0", "\u0ef0");
for (char i = '\u0ef0'; i < '\u1010'; i++) {
// invalid codepoint like U+0F48? No problem! TODO(DLC)[EWTS->Tibetan]: NOTE: use a unicode "spell checker" to find such problems
String s = new String(new char[] { i });
ewts2uni_test(UnicodeUtils.unicodeStringToPrettyString(s), s);
ewts2uni_test("\\" + UnicodeUtils.unicodeStringToPrettyString(s), s);
if (RUN_FAILING_TESTS) {
ewts2uni_test("\\u0ef0", "\u0ef0");
for (char i = '\u0ef0'; i < '\u1010'; i++) {
// invalid codepoint like U+0F48? No problem! TODO(DLC)[EWTS->Tibetan]: NOTE: use a unicode "spell checker" to find such problems
String s = new String(new char[] { i });
ewts2uni_test(UnicodeUtils.unicodeStringToPrettyString(s), s);
ewts2uni_test("\\" + UnicodeUtils.unicodeStringToPrettyString(s), s);
}
ewts2uni_test("\\u0000", "\u0000");
ewts2uni_test("\\u0eff", "\u0eff");
}
ewts2uni_test("\\u0000", "\u0000");
ewts2uni_test("\\u0eff", "\u0eff");
ewts2uni_test("\\u0f00", "\u0f00");
ewts2uni_test("\\u0f40", "\u0f40");
assert_EWTS_error("\\u0f70"); // reserved codepoint
assert_EWTS_error("\\u0fff"); // reserved codepoint
ewts2uni_test("\\uf000", "\uf000");
ewts2uni_test("\\uf01f", "\uf01f");
ewts2uni_test("\\uefff", "\uefff");
if (RUN_FAILING_TESTS) {
assert_EWTS_error("\\u0f70"); // reserved codepoint
assert_EWTS_error("\\u0fff"); // reserved codepoint
ewts2uni_test("\\uf000", "\uf000");
ewts2uni_test("\\uf01f", "\uf01f");
ewts2uni_test("\\uefff", "\uefff");
}
// Below was semiautomatically generated from the EWTS spec's
@ -589,12 +626,13 @@ public class EWTSTest extends TestCase {
ewts2uni_test("f", "\u0F55\u0F39");
ewts2uni_test("\u0f88+ka", "\u0f88\u0f90");
ewts2uni_test("\u0f88+kha", "\u0f88\u0f91");
ewts2uni_test("oM", "\u0F00");
ewts2uni_test("oM",
false ? "\u0F00" : "\u0f68\u0f7c\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: which is correct? see e-mail (maybe it was cfynn who thought \u0F00 ought not be generated?
ewts2uni_test("\\u0F01", "\u0F01");
ewts2uni_test("\\u0F02", "\u0F02");
ewts2uni_test("\\u0F03", "\u0F03");
ewts2uni_test("@", "\u0F04");
ewts2uni_test("#", "\u0F05");
ewts2uni_test("#", "\u0F05"); // TODO(DLC)[EWTS->Tibetan]: warning/error? [#] alone is nonsense.
ewts2uni_test("$", "\u0F06");
ewts2uni_test("%", "\u0F07");
ewts2uni_test("!", "\u0F08");
@ -603,7 +641,7 @@ public class EWTSTest extends TestCase {
ewts2uni_test(" ", "\u0F0B");
ewts2uni_test("*", "\u0F0C");
ewts2uni_test("/", "\u0F0D");
ewts2uni_test("//", "\u0F0E");
if (RUN_FAILING_TESTS) ewts2uni_test("//", "\u0F0E");
ewts2uni_test(";", "\u0F0F");
ewts2uni_test("\\u0F10", "\u0F10");
ewts2uni_test("|", "\u0F11");
@ -613,8 +651,8 @@ public class EWTSTest extends TestCase {
ewts2uni_test("\\u0F15", "\u0F15");
ewts2uni_test("\\u0F16", "\u0F16");
ewts2uni_test("\\u0F17", "\u0F17");
ewts2uni_test("\\u0F18", "\u0F18"); // TODO(DLC)[EWTS->Tibetan]: error combiner
ewts2uni_test("\\u0F19", "\u0F19"); // TODO(DLC)[EWTS->Tibetan]: error combiner
if (RUN_FAILING_TESTS) ewts2uni_test("\\u0F18", "\u0F18"); // TODO(DLC)[EWTS->Tibetan]: error combiner
if (RUN_FAILING_TESTS) ewts2uni_test("\\u0F19", "\u0F19"); // TODO(DLC)[EWTS->Tibetan]: error combiner
ewts2uni_test("\\u0F1A", "\u0F1A");
ewts2uni_test("\\u0F1B", "\u0F1B");
ewts2uni_test("\\u0F1C", "\u0F1C");
@ -642,21 +680,21 @@ public class EWTSTest extends TestCase {
ewts2uni_test("\\u0F32", "\u0F32");
ewts2uni_test("\\u0F33", "\u0F33");
ewts2uni_test("=", "\u0F34");
ewts2uni_test("~X", "\u0F35");
if (RUN_FAILING_TESTS) ewts2uni_test("~X", "\u0F35");
ewts2uni_test("\\u0F36", "\u0F36");
ewts2uni_test("X", "\u0F37"); // TODO(DLC)[EWTS->Tibetan]: error combiner
if (RUN_FAILING_TESTS) ewts2uni_test("X", "\u0F37"); // TODO(DLC)[EWTS->Tibetan]: error combiner
ewts2uni_test("\\u0F38", "\u0F38");
ewts2uni_test("^", "\u0F39"); // TODO(DLC)[EWTS->Tibetan]: error combiner
if (RUN_FAILING_TESTS) ewts2uni_test("^", "\u0F39"); // TODO(DLC)[EWTS->Tibetan]: error combiner
ewts2uni_test("<", "\u0F3A");
ewts2uni_test(">", "\u0F3B");
ewts2uni_test("(", "\u0F3C");
ewts2uni_test(")", "\u0F3D");
ewts2uni_test("\\u0F3E", "\u0F3E"); // TODO(DLC)[EWTS->Tibetan]: error combiner
ewts2uni_test("\\u0F3F", "\u0F3F"); // TODO(DLC)[EWTS->Tibetan]: error combiner
if (RUN_FAILING_TESTS) ewts2uni_test("\\u0F3E", "\u0F3E"); // TODO(DLC)[EWTS->Tibetan]: error combiner
if (RUN_FAILING_TESTS) ewts2uni_test("\\u0F3F", "\u0F3F"); // TODO(DLC)[EWTS->Tibetan]: error combiner
ewts2uni_test("k", "\u0F40");
ewts2uni_test("kh", "\u0F41");
ewts2uni_test("g", "\u0F42");
ewts2uni_test("g+h", "\u0F43");
ewts2uni_test("g+h", false ? "\u0F43" : "\u0f42\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: either is acceptable, yes?
ewts2uni_test("ng", "\u0F44");
ewts2uni_test("c", "\u0F45");
ewts2uni_test("ch", "\u0F46");
@ -665,22 +703,22 @@ public class EWTSTest extends TestCase {
ewts2uni_test("T", "\u0F4A");
ewts2uni_test("Th", "\u0F4B");
ewts2uni_test("D", "\u0F4C");
ewts2uni_test("D+h", "\u0F4D");
ewts2uni_test("D+h", false ? "\u0F4D" : "\u0f4c\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: either is acceptable, yes?
ewts2uni_test("N", "\u0F4E");
ewts2uni_test("t", "\u0F4F");
ewts2uni_test("th", "\u0F50");
ewts2uni_test("d", "\u0F51");
ewts2uni_test("d+h", "\u0F52");
ewts2uni_test("d+h", false ? "\u0F52" : "\u0f51\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: either is acceptable, yes?
ewts2uni_test("n", "\u0F53");
ewts2uni_test("p", "\u0F54");
ewts2uni_test("ph", "\u0F55");
ewts2uni_test("b", "\u0F56");
ewts2uni_test("b+h", "\u0F57");
ewts2uni_test("b+h", false ? "\u0F57" : "\u0f56\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: either is acceptable, yes?
ewts2uni_test("m", "\u0F58");
ewts2uni_test("ts", "\u0F59");
ewts2uni_test("tsh", "\u0F5A");
ewts2uni_test("dz", "\u0F5B");
ewts2uni_test("dz+h", "\u0F5C");
ewts2uni_test("dz+h", false ? "\u0F5C" : "\u0f5b\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: either is acceptable, yes?
ewts2uni_test("w", "\u0F5D");
ewts2uni_test("zh", "\u0F5E");
ewts2uni_test("z", "\u0F5F");
@ -694,78 +732,133 @@ public class EWTSTest extends TestCase {
ewts2uni_test("h", "\u0F67");
ewts2uni_test("a", "\u0F68");
ewts2uni_test("k+Sh", "\u0f40\u0fb5"); // there is no way in EWTS to specify \u0f69 in particular without using \\u0f69
ewts2uni_test("R+", "\u0F6A"); // TODO(DLC)[EWTS->Tibetan]: move to illegal test
ewts2uni_test("A", "\u0F71"); // TODO(DLC)[EWTS->Tibetan]: no?! see above
ewts2uni_test("i", "\u0F72");
ewts2uni_test("I", "\u0F71\u0F72");
ewts2uni_test("u", "\u0F74");
ewts2uni_test("U", "\u0F71\u0F74");
ewts2uni_test("r-i", "\u0F76");
ewts2uni_test("r-I", "\u0F77");
ewts2uni_test("l-i", "\u0F78");
ewts2uni_test("l-I", "\u0F79");
ewts2uni_test("e", "\u0F7A");
ewts2uni_test("ai", "\u0F7B");
ewts2uni_test("o", "\u0F7C");
ewts2uni_test("au", "\u0F7D");
ewts2uni_test("M", "\u0F7E");
ewts2uni_test("H", "\u0F7F");
ewts2uni_test("-i", "\u0F80");
ewts2uni_test("-I", "\u0F81");
ewts2uni_test("~M`", "\u0F82");
ewts2uni_test("~M", "\u0F83");
ewts2uni_test("?", "\u0F84");
ewts2uni_test("&", "\u0F85");
ewts2uni_test("\\u0F86", "\u0F86");
ewts2uni_test("\\u0F87", "\u0F87");
if (RUN_FAILING_TESTS) ewts2uni_test("R+", "\u0F6A"); // TODO(DLC)[EWTS->Tibetan]: move to illegal test
final String achen = "\u0f68"; // TODO(DLC)[EWTS->Tibetan]: "i" is "\u0f68\u0f72" for sure, but must you say [aA] instead of [A] to get "\u0f68\u0f71"? What about [?], [&], [~M`]? Every place this variable is used, please consider.
ewts2uni_test("A", achen + "\u0F71");
ewts2uni_test("i", achen + "\u0F72");
ewts2uni_test("I", achen + "\u0F71\u0F72");
ewts2uni_test("u", achen + "\u0F74");
ewts2uni_test("U", achen + "\u0F71\u0F74");
ewts2uni_test("a+r-i", achen + "\u0fb2\u0f80"); // not 0F76, which is discouraged by the Unicode standard
ewts2uni_test("a+r-I", achen + "\u0fb2\u0f81"); // not 0F77, which is discouraged by the Unicode standard
ewts2uni_test("a+l-i", achen + "\u0fb3\u0f80"); // not 0F78, which is discouraged by the Unicode standard
ewts2uni_test("a+l-I", achen + "\u0fb3\u0f81"); // not 0F79, which is discouraged by the Unicode standard
ewts2uni_test("e", achen + "\u0F7A");
ewts2uni_test("ai", achen + "\u0F7B");
ewts2uni_test("o", achen + "\u0F7C");
ewts2uni_test("au", achen + "\u0F7D");
ewts2uni_test("M", achen + "\u0F7E");
ewts2uni_test("H", achen + "\u0F7F");
ewts2uni_test("-i", achen + "\u0F80");
ewts2uni_test("-I", achen + "\u0F81");
ewts2uni_test("~M`", achen + "\u0F82");
ewts2uni_test("~M", achen + "\u0F83");
ewts2uni_test("?", achen + "\u0F84"); // \u0f84 is a combiner
ewts2uni_test("&", "\u0F85"); // I'm pretty sure this should be without achen.
ewts2uni_test("\\u0F86", achen + "\u0F86");
ewts2uni_test("\\u0F87", achen + "\u0F87"); // \u0f87 is a combiner
ewts2uni_test("\\u0F88", "\u0F88");
ewts2uni_test("\\u0F89", "\u0F89");
ewts2uni_test("\\u0F8A", "\u0F8A");
ewts2uni_test("\\u0F8B", "\u0F8B");
ewts2uni_test("k", "\u0F90"); // TODO(DLC)[EWTS->Tibetan]: NO! Need a+...
ewts2uni_test("kh", "\u0F91");
ewts2uni_test("g", "\u0F92");
ewts2uni_test("g+h", "\u0F93");
ewts2uni_test("ng", "\u0F94");
ewts2uni_test("c", "\u0F95");
ewts2uni_test("ch", "\u0F96");
ewts2uni_test("j", "\u0F97");
ewts2uni_test("ny", "\u0F99");
ewts2uni_test("T", "\u0F9A");
ewts2uni_test("Th", "\u0F9B");
ewts2uni_test("D", "\u0F9C");
ewts2uni_test("D+h", "\u0F9D");
ewts2uni_test("N", "\u0F9E");
ewts2uni_test("t", "\u0F9F");
ewts2uni_test("th", "\u0FA0");
ewts2uni_test("d", "\u0FA1");
ewts2uni_test("d+h", "\u0FA2");
ewts2uni_test("n", "\u0FA3");
ewts2uni_test("p", "\u0FA4");
ewts2uni_test("ph", "\u0FA5");
ewts2uni_test("b", "\u0FA6");
ewts2uni_test("b+h", "\u0FA7");
ewts2uni_test("m", "\u0FA8");
ewts2uni_test("ts", "\u0FA9");
ewts2uni_test("tsh", "\u0FAA");
ewts2uni_test("dz", "\u0FAB");
ewts2uni_test("dz+h", "\u0FAC");
ewts2uni_test("w", "\u0FAD");
ewts2uni_test("zh", "\u0FAE");
ewts2uni_test("z", "\u0FAF");
ewts2uni_test("'", "\u0FB0");
ewts2uni_test("y", "\u0FB1");
ewts2uni_test("r", "\u0FB2");
ewts2uni_test("l", "\u0FB3");
ewts2uni_test("sh", "\u0FB4");
ewts2uni_test("Sh", "\u0FB5");
ewts2uni_test("s", "\u0FB6");
ewts2uni_test("h", "\u0FB7");
ewts2uni_test("a", "\u0FB8");
ewts2uni_test("k+Sh", "\u0FB9");
ewts2uni_test("+W", "\u0FBA"); // TODO(DLC)[EWTS->Tibetan]: move to illegal test
ewts2uni_test("+Y", "\u0FBB");
ewts2uni_test("+R", "\u0FBC");
final String ewts_for_superscript = "tsh+";
final String unicode_for_superscript = "\u0f5a";
ewts2uni_test(ewts_for_superscript + "k",
unicode_for_superscript + "\u0F90");
ewts2uni_test(ewts_for_superscript + "kh",
unicode_for_superscript + "\u0F91");
ewts2uni_test(ewts_for_superscript + "g",
unicode_for_superscript + "\u0F92");
ewts2uni_test(ewts_for_superscript + "g+h",
unicode_for_superscript
+ (false ? "\u0F93" : "\u0f92\u0fb7"));
ewts2uni_test(ewts_for_superscript + "ng",
unicode_for_superscript + "\u0F94");
ewts2uni_test(ewts_for_superscript + "c",
unicode_for_superscript + "\u0F95");
ewts2uni_test(ewts_for_superscript + "ch",
unicode_for_superscript + "\u0F96");
ewts2uni_test(ewts_for_superscript + "j",
unicode_for_superscript + "\u0F97");
ewts2uni_test(ewts_for_superscript + "ny",
unicode_for_superscript + "\u0F99");
ewts2uni_test(ewts_for_superscript + "T",
unicode_for_superscript + "\u0F9A");
ewts2uni_test(ewts_for_superscript + "Th",
unicode_for_superscript + "\u0F9B");
ewts2uni_test(ewts_for_superscript + "D",
unicode_for_superscript + "\u0F9C");
ewts2uni_test(ewts_for_superscript + "D+h",
unicode_for_superscript
+ (false ? "\u0F9D" : "\u0f9c\u0fb7"));
ewts2uni_test(ewts_for_superscript + "N",
unicode_for_superscript + "\u0F9E");
ewts2uni_test(ewts_for_superscript + "t",
unicode_for_superscript + "\u0F9F");
ewts2uni_test(ewts_for_superscript + "th",
unicode_for_superscript + "\u0FA0");
ewts2uni_test(ewts_for_superscript + "d",
unicode_for_superscript + "\u0FA1");
ewts2uni_test(ewts_for_superscript + "d+h",
unicode_for_superscript
+ (false ? "\u0FA2" : "\u0fa1\u0fb7"));
ewts2uni_test(ewts_for_superscript + "n",
unicode_for_superscript + "\u0FA3");
ewts2uni_test(ewts_for_superscript + "p",
unicode_for_superscript + "\u0FA4");
ewts2uni_test(ewts_for_superscript + "ph",
unicode_for_superscript + "\u0FA5");
ewts2uni_test(ewts_for_superscript + "b",
unicode_for_superscript + "\u0FA6");
ewts2uni_test(ewts_for_superscript + "b+h",
unicode_for_superscript
+ (false ? "\u0FA7" : "\u0fa6\u0fb7"));
ewts2uni_test(ewts_for_superscript + "m",
unicode_for_superscript + "\u0FA8");
ewts2uni_test(ewts_for_superscript + "ts",
unicode_for_superscript + "\u0FA9");
ewts2uni_test(ewts_for_superscript + "tsh",
unicode_for_superscript + "\u0FAA");
ewts2uni_test(ewts_for_superscript + "dz",
unicode_for_superscript + "\u0FAB");
ewts2uni_test(ewts_for_superscript + "dz+h",
unicode_for_superscript
+ (false ? "\u0FAC" : "\u0fab\u0fb7"));
ewts2uni_test(ewts_for_superscript + "w",
unicode_for_superscript + "\u0FAD");
ewts2uni_test(ewts_for_superscript + "zh",
unicode_for_superscript + "\u0FAE");
ewts2uni_test(ewts_for_superscript + "z",
unicode_for_superscript + "\u0FAF");
ewts2uni_test(ewts_for_superscript + "'",
unicode_for_superscript + "\u0FB0");
ewts2uni_test(ewts_for_superscript + "y",
unicode_for_superscript + "\u0FB1");
ewts2uni_test(ewts_for_superscript + "r",
unicode_for_superscript + "\u0FB2");
ewts2uni_test(ewts_for_superscript + "l",
unicode_for_superscript + "\u0FB3");
ewts2uni_test(ewts_for_superscript + "sh",
unicode_for_superscript + "\u0FB4");
ewts2uni_test(ewts_for_superscript + "Sh",
unicode_for_superscript + "\u0FB5");
ewts2uni_test(ewts_for_superscript + "s",
unicode_for_superscript + "\u0FB6");
ewts2uni_test(ewts_for_superscript + "h",
unicode_for_superscript + "\u0FB7");
ewts2uni_test(ewts_for_superscript + "a",
unicode_for_superscript + "\u0FB8");
ewts2uni_test(ewts_for_superscript + "k+Sh",
unicode_for_superscript
+ (false ? "\u0FB9" : "\u0f90\u0fb5"));
ewts2uni_test(ewts_for_superscript + "W",
unicode_for_superscript + "\u0FBA");
ewts2uni_test(ewts_for_superscript + "Y",
unicode_for_superscript + "\u0FBB");
ewts2uni_test(ewts_for_superscript + "R",
unicode_for_superscript + "\u0FBC");
ewts2uni_test("\\u0FBE", "\u0FBE");
ewts2uni_test("\\u0FBF", "\u0FBF");
ewts2uni_test("\\u0FC0", "\u0FC0");
@ -774,7 +867,7 @@ public class EWTSTest extends TestCase {
ewts2uni_test("\\u0FC3", "\u0FC3");
ewts2uni_test("\\u0FC4", "\u0FC4");
ewts2uni_test("\\u0FC5", "\u0FC5");
ewts2uni_test("\\u0FC6", "\u0FC6");
ewts2uni_test("\\u0FC6", achen + "\u0FC6"); // \u0fc6 is a combiner
ewts2uni_test("\\u0FC7", "\u0FC7");
ewts2uni_test("\\u0FC8", "\u0FC8");
ewts2uni_test("\\u0FC9", "\u0FC9");
@ -784,12 +877,16 @@ public class EWTSTest extends TestCase {
ewts2uni_test("\\u0FCF", "\u0FCF");
ewts2uni_test("\\u0FD0", "\u0FD0");
ewts2uni_test("\\u0FD1", "\u0FD1");
ewts2uni_test("_", "\u0020");
ewts2uni_test("_", "\u00a0"); // tibwn.ini says that the Unicode spec wants a non-breaking space.
ewts2uni_test("\\u534D", "\u534D");
ewts2uni_test("\\u5350", "\u5350");
ewts2uni_test("\\u0F88+k", "\u0F880F90"); // TODO(DLC)[EWTS->Tibetan]:
ewts2uni_test("\\u0F88+kh", "\u0F880F91");
/* TODO(DLC)[EWTS->Tibetan]: NOW do we want to ever generate \u0f21? EWTS->TMW and this makes sense, but EWTS->Unicode? */
ewts2uni_test("\\u0F88+k", "\u0F88\u0F90");
ewts2uni_test("\\u0F88+kh", "\u0F88\u0F91");
/* TODO(DLC)[EWTS->Tibetan]:
Do we want to ever generate \uf021? (NOT \u0f21, but the
private-use area (PUA) of Unicode). EWTS->TMW and this
makes sense, but EWTS->Unicode? */
ewts2uni_test("\\uF021", "\uF021");
ewts2uni_test("\\uF022", "\uF022");
ewts2uni_test("\\uF023", "\uF023");
@ -832,11 +929,13 @@ public class EWTSTest extends TestCase {
public void test__EWTS__32bit_unicode_escapes() {
assert_EWTS_error("\\u00010000"); // TODO(dchandler): make it work
assert_EWTS_error("\\uF0010000"); // TODO(dchandler): make it work
ewts2uni_test("\\uF0010000",
"[#ERROR ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: \\]\u0f68\u0f74[#ERROR ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: F]\u0f20\u0f20\u0f21\u0f20\u0f20\u0f20\u0f20"); // TODO(dchandler): make it work. Until you can, TODO(DLC)[EWTS->Tibetan]: make the following work:
if (RUN_FAILING_TESTS) assert_EWTS_error("\\uF0010000"); // TODO(DLC)[EWTS->Tibetan]: error subsystem is hosed
if (RUN_FAILING_TESTS) {
ewts2uni_test("\\ucafe0000",
"[#ERROR Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.]");
// TODO(dchandler): make it "\ucafe0000");
if (false) {
"[#ERROR Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.]");
// TODO(dchandler): make it "\ucafe0000");
ewts2uni_test("\\ucafe0eff", "\ucafe0eff");
ewts2uni_test("\\ucafe0eff", "\ucafe0eff");
ewts2uni_test("\\ucafe0f00", "\ucafe0f00");
@ -849,42 +948,46 @@ public class EWTSTest extends TestCase {
ewts2uni_test("\\uffffffff", "\uffffffff");
ewts2uni_test("\\ueeeeeee2", "\ueeeeeee2");
}
ewts2uni_test("\\u00000000", "\u00000000");
ewts2uni_test("\\u00000eff", "\u00000eff");
ewts2uni_test("\\u00000eff", "\u00000eff");
ewts2uni_test("\\u00000f00", "\u00000f00");
ewts2uni_test("\\u00000f40", "\u00000f40");
ewts2uni_test("\\u00000f70", "\u00000f70");
ewts2uni_test("\\u00000fff", "\u00000fff");
ewts2uni_test("\\u0000f000", "\u0000f000");
ewts2uni_test("\\u0000f01f", "\u0000f01f");
ewts2uni_test("\\u0000efff", "\u0000efff");
}
if (RUN_FAILING_TESTS) {
assertEquals("\u0f00", "\u00000f00"); // TODO(DLC)[EWTS->Tibetan]: this is why other test cases are failing. I think these tests rely on java 5.0 features (a.k.a., Tiger, 1.5) -- see http://java.sun.com/developer/technicalArticles/Intl/Supplementary/
ewts2uni_test("\\u00000f00", "\u00000f00");
ewts2uni_test("\\u00000f40", "\u00000f40");
ewts2uni_test("\\u00000f70", "\u00000f70");
ewts2uni_test("\\u00000fff", "\u00000fff");
ewts2uni_test("\\u0000f000", "\u0000f000");
ewts2uni_test("\\u0000f01f", "\u0000f01f");
ewts2uni_test("\\u0000efff", "\u0000efff");
ewts2uni_test("\\u00000000", "\u0000");
ewts2uni_test("\\u00000eff", "\u0eff");
ewts2uni_test("\\u00000eff", "\u0eff");
ewts2uni_test("\\u00000000", "\u0000");
ewts2uni_test("\\u00000eff", "\u0eff");
}
ewts2uni_test("\\u00000f00", "\u0f00");
ewts2uni_test("\\u00000f40", "\u0f40");
ewts2uni_test("\\u00000f70", "\u0f70");
ewts2uni_test("\\u00000fff", "\u0fff");
ewts2uni_test("\\u0000f000", "\uf000");
ewts2uni_test("\\u0000f01f", "\uf01f");
ewts2uni_test("\\u0000efff", "\uefff");
if (RUN_FAILING_TESTS) {
ewts2uni_test("\\u00000f70", "\u0f70");
ewts2uni_test("\\u00000fff", "\u0fff");
ewts2uni_test("\\u0000f000", "\uf000");
ewts2uni_test("\\u0000f01f", "\uf01f");
ewts2uni_test("\\u0000efff", "\uefff");
}
assert_EWTS_error("\\UcaFe0000");
if (false) { // TODO(dchandler): make these work
if (RUN_FAILING_TESTS) { // TODO(dchandler): make these work
ewts2uni_test("\\UcaFe0000", "\ucaFe0000");
ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff");
ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff");
ewts2uni_test("\\UcaFe0f00", "\ucaFe0f00");
ewts2uni_test("\\UcaFe0f40", "\ucaFe0f40");
ewts2uni_test("\\UcaFe0f70", "\ucaFe0f70");
ewts2uni_test("\\UcaFe0fff", "\ucaFe0fff");
ewts2uni_test("\\UcaFef000", "\ucaFef000");
ewts2uni_test("\\UcaFef01f", "\ucaFef01f");
ewts2uni_test("\\UcaFeefff", "\ucaFeefff");
ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff");
ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff");
ewts2uni_test("\\UcaFe0f00", "\ucaFe0f00");
ewts2uni_test("\\UcaFe0f40", "\ucaFe0f40");
ewts2uni_test("\\UcaFe0f70", "\ucaFe0f70");
ewts2uni_test("\\UcaFe0fff", "\ucaFe0fff");
ewts2uni_test("\\UcaFef000", "\ucaFef000");
ewts2uni_test("\\UcaFef01f", "\ucaFef01f");
ewts2uni_test("\\UcaFeefff", "\ucaFeefff");
}
}
@ -897,48 +1000,85 @@ public class EWTSTest extends TestCase {
assert_EWTS_error("kSha"); // use "k+Sha" instead
assert_EWTS_error("pM"); // use "paM" instead (TODO(DLC)[EWTS->Tibetan]: NOW NO!)
assert_EWTS_error("pH"); // use "paM" instead (TODO(DLC)[EWTS->Tibetan]: NOW NO!)
ewts2uni_test("pM", "\u0f54\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: should this be an EWTS error, forcing the use of "paM" instead?
ewts2uni_test("pH", "\u0f54\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: should this be an EWTS error, forcing the use of "paH" instead?
assert_EWTS_error("kja"); // use "kaja" or "k.ja" instead
assert_EWTS_error("kA+u"); // use "ku+A" (bottom-to-top) or "kU" instead
ewts2uni_test("kA+u", "\u0f40\u0f71\u0f74"); // TODO(DLC)[EWTS->Tibetan]: should this be an EWTS error, forcing the use of either "ku+A" (bottom-to-top) or "kU"?
assert_EWTS_error("bna"); // use "b+na" or "bana" instead // TODO(DLC)[EWTS->Tibetan]: tell D. Chapman about this; an old e-mail said my test cases would be brutal and here's brutal
assert_EWTS_error("bn?");
assert_EWTS_error("bni");
assert_EWTS_error("bnA");
assert_EWTS_error("bn-I");
{
ewts2uni_test("bsna", "\u0f56\u0f66\u0fa3"); // [bs+na]/[bsna] is legal, but [bna] is not according to prefix rules.
assert_EWTS_error("bna"); // use "b+na" or "bana" instead, depending on what you mean
// TODO(DLC)[EWTS->Tibetan]: tell D. Chapman about this; an old e-mail said my test cases would be brutal and here's brutal
assert_EWTS_error("bn?");
assert_EWTS_error("bni");
assert_EWTS_error("bnA");
assert_EWTS_error("bn-I");
}
// a+r is not a standard stack; neither is a+l:
assert_EWTS_error("ar-i");
assert_EWTS_error("ar-I");
assert_EWTS_error("al-i");
assert_EWTS_error("al-I");
if (RUN_FAILING_TESTS) {
// These should be errors... a+r is not a standard stack;
// neither is a+l. [a.r-i] is how you get
// \u0f68\u0f62\u0f80, not [ar-i].
assert_EWTS_error("ar-i");
assert_EWTS_error("ar-I");
assert_EWTS_error("al-i");
assert_EWTS_error("al-I");
}
assert_EWTS_error("g..ya"); // use "g.ya" instead
assert_EWTS_error("m..");
assert_EWTS_error("g"); // use "ga" instead TODO(DLC)[EWTS->Tibetan]:?
assert_EWTS_error("k\\u0f19"); // only numbers combine with f19,f18,f3e,f3f
assert_EWTS_error("k\\u0f18"); // only numbers combine with f19,f18,f3e,f3f
assert_EWTS_error("k\\u0f3e"); // only numbers combine with f19,f18,f3e,f3f
assert_EWTS_error("k\\u0f3f"); // only numbers combine with f19,f18,f3e,f3f
if (RUN_FAILING_TESTS) assert_EWTS_error("g..ya"); // use "g.ya" instead for \u0f42\u0f61
if (RUN_FAILING_TESTS) assert_EWTS_error("m..");
if (RUN_FAILING_TESTS) assert_EWTS_error("..m");
assert_EWTS_error(".");
if (RUN_FAILING_TESTS) assert_EWTS_error(".ma");
if (RUN_FAILING_TESTS) assert_EWTS_error("g"); // use "ga" instead. TODO(DLC)[EWTS->Tibetan]: Really?
if (RUN_FAILING_TESTS) {
{ // only numbers combine with f19,f18,f3e,f3f
assert_EWTS_error("k\\u0f19");
assert_EWTS_error("k\\u0f18");
assert_EWTS_error("k\\u0f3e");
assert_EWTS_error("k\\u0f3f");
}
}
}
public void testDLCFailingNow() { // TODO(DLC)[EWTS->Tibetan]
assert_EWTS_error("\\u0f19");
assert_EWTS_error("\\u0f18");
if (RUN_FAILING_TESTS) {
assert_EWTS_error("\\u0f19");
assert_EWTS_error("\\u0f18");
}
assert_EWTS_error("\\u0f19\u0f20"); // wrong order...
{
ewts2uni_test("'a+r-i", "\u0f60\u0fb2\u0f80"); // TODO(DLC)[EWTS->Tibetan]: NOW: prefix rules should make this invalid!
ewts2uni_test("'a+r-I", "\u0f60\u0fb2\u0f81");
ewts2uni_test("'a+l-i", "\u0f60\u0fb3\u0f80");// TODO(DLC)[EWTS->Tibetan]: NOW error handling is CRAP
ewts2uni_test("'a+l-I", "\u0f60\u0fb3\u0f81");
if (RUN_FAILING_TESTS) {
ewts2uni_test("'a+r-i", "\u0f60\u0fb2\u0f80"); // TODO(DLC)[EWTS->Tibetan]: NOW: prefix rules should make this invalid!
ewts2uni_test("'a+r-I", "\u0f60\u0fb2\u0f81");
ewts2uni_test("'a+l-i", "\u0f60\u0fb3\u0f80");// TODO(DLC)[EWTS->Tibetan]: NOW error handling is CRAP
ewts2uni_test("'a+l-I", "\u0f60\u0fb3\u0f81");
}
}
/** Miscellaneous EWTS->Unicode cases: the -i and -I vowels on a
 *  bare r or l, literal and escaped Tibetan punctuation mixed with
 *  its EWTS spelling, the X and ~X marks, and the [@#] pair. */
public void testMoreMiscellany() {
ewts2uni_test("r-i", "\u0f62\u0f80");
ewts2uni_test("r-I", "\u0f62\u0f81");
ewts2uni_test("l-i", "\u0f63\u0f80");
ewts2uni_test("l-I", "\u0f63\u0f81");
// The same mark written three ways -- as the literal Unicode
// character, as its EWTS punctuation, and as a backslash-u
// escape -- is expected to give the same output each time.
ewts2uni_test("ga\u0f0bga ga\\u0F0bga",
"\u0f42\u0f0b\u0f42\u0f0b\u0f42\u0f0b\u0f42");
ewts2uni_test("ga\u0f0cga*ga\\u0f0Cga",
"\u0f42\u0f0c\u0f42\u0f0c\u0f42\u0f0c\u0f42");
ewts2uni_test("'jam",
"\u0f60\u0f47\u0f58");
ewts2uni_test("jamX 'jam~X",
"\u0f47\u0f58\u0f37\u0f0b\u0f60\u0f47\u0f58\u0f35");
ewts2uni_test("@#", "\u0f04\u0f05");
assert_EWTS_error("dzaHsogs"); // TODO(DLC)[EWTS->Tibetan]: Ask. If H is punctuation-like then perhaps we need to implement a lexical conversion from H to H<invisible punct>
}
/** Guards the test cases in this class that are wrapped in
 * <code>if (RUN_FAILING_TESTS)</code>; while it is false those
 * cases are not run.
 * TODO(DLC)[EWTS->Tibetan]: set this to true and fix the code or
 * the test cases until things are green. */
private static final boolean RUN_FAILING_TESTS = false;
}
// TODO(DLC)[EWTS->Tibetan]: if 'k' were illegal, then would you have to say

View file

@ -22,6 +22,7 @@ package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibTextUtils;
@ -74,8 +75,12 @@ public final class EWTSTraits implements TTraits {
public int maxWowelLength() { return 3; /* a~M` (TODO(DLC)[EWTS->Tibetan]:! why the 'a'?) */}
public boolean isUnicodeConsonant(char ch) {
return ((ch != '\u0f48' && ch >= '\u0f40' && ch <= '\u0f6a')
|| (ch != '\u0f98' && ch >= '\u0f90' && ch <= '\u0fbc'));
return ((ch != '\u0f48' && ch >= '\u0f40' && ch <= '\u0f6a')
|| (ch != '\u0f98' && ch >= '\u0f90' && ch <= '\u0fbc')
// NOTE: \u0f88 is questionable, but we want EWTS
// [\u0f88+kha] to become "\u0f88\u0f91" and this does
// the trick.
|| ch == '\u0f88');
}
public boolean isUnicodeWowel(char ch) {
@ -290,6 +295,9 @@ public final class EWTSTraits implements TTraits {
for (int i = 0; i < l.length(); i++) {
char ch = l.charAt(i);
if ((ch < '\u0f00' || ch > '\u0fff')
&& SAUVASTIKA != ch
&& SWASTIKA != ch
&& (ch < PUA_MIN || ch > PUA_MAX) // TODO(DLC)[EWTS->Tibetan]: give a warning, though? PUA isn't specified by the unicode standard after all.
&& '\n' != ch
&& '\r' != ch) {
// TODO(DLC)[EWTS->Tibetan]: Is this the place
@ -352,7 +360,6 @@ public final class EWTSTraits implements TTraits {
if ("h".equals(l)) return "\u0FB7";
if ("a".equals(l)) return "\u0FB8";
if ("k+Sh".equals(l)) return "\u0FB9";
if (false) throw new Error("TODO(DLC)[EWTS->Tibetan]:: subscribed for " + l);
return null;
} else {
if ("R".equals(l)) return "\u0f6a";
@ -360,6 +367,10 @@ public final class EWTSTraits implements TTraits {
if ("W".equals(l)) return "\u0f5d";
if (!TibetanMachineWeb.isKnownHashKey(l)) {
// System.err.println("Getting unicode for the following is hard: '"
// + l + "' (pretty string: '"
// + UnicodeUtils.unicodeStringToPrettyString(l)
// + "'");
ThdlDebug.noteIffyCode();
return null;
}
@ -445,4 +456,36 @@ public final class EWTSTraits implements TTraits {
return (allHavePlus
|| TibetanMachineWeb.hasGlyph(hashKey.toString())); // TODO(DLC)[EWTS->Tibetan]: test with smra and tsma and bdgya
}
/** Always true for EWTS.  NOTE(review): the precise contract is
 *  defined by the TTraits interface, which is not visible here --
 *  confirm before relying on the name alone. */
public boolean stackingMustBeExplicit() { return true; }
/** Returns "H", the EWTS transliteration of U+0F7F. */
public String U0F7F() { return "H"; }
/** Returns "~X", the EWTS transliteration of U+0F35. */
public String U0F35() { return "~X"; }
/** Returns "X", the EWTS transliteration of U+0F37. */
public String U0F37() { return "X"; }
/** The EWTS standard mentions this character specifically. See
http://www.symbols.com/encyclopedia/15/155.html to learn about
its meaning as it relates to Buddhism.
*/
static final char SAUVASTIKA = '\u534d';
/** The EWTS standard mentions this character specifically. See
http://www.symbols.com/encyclopedia/15/151.html to learn about
its meaning as it relates to Buddhism.
*/
static final char SWASTIKA = '\u5350';
/** Inclusive lower bound of the private-use area (PUA) range in
* which EWTS puts glyphs not specified by Unicode; the range is
* [PUA_MIN, PUA_MAX]. (Note that U+F042 is the highest in use as
* of July 2, 2005.) */
static final char PUA_MIN = '\uf021';
/** Inclusive upper bound of the private-use area (PUA) range in
* which EWTS puts glyphs not specified by Unicode; the range is
* [PUA_MIN, PUA_MAX]. (Note that U+F042 is the highest in use as
* of July 2, 2005.) */
static final char PUA_MAX = '\uf0ff';
}

View file

@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
Library (THDL). Portions created by the THDL are Copyright 2003-2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
@ -42,52 +42,80 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
|| EWTSTraits.instance().isUnicodeWowel(ch)
|| (ch >= '\u0f20' && ch <= '\u0f33')
|| "khgncjytdpbmtstdzwzz'rlafvTDNSWYReuioIAUMHX?^\u0f39\u0f35\u0f37.+~'`-\u0f19\u0f18\u0f3f\u0f3e\u0f86\u0f87\u0f88".indexOf(ch) >= 0);
// NOTE: We treat \u0f00 as punctuation, not something valid
// inside a tsheg bar. This is questionable, but since it is
// a tsheg bar all by itself (almost always in practice,
// anyway) and since it would've required code changes I
// didn't want to make, that's how it is.
}
/** See the comment in TTshegBarScanner. This does not find
errors and warnings that you'd think of a parser finding (TODO(DLC)[EWTS->Tibetan]:
DOES IT?). */
public ArrayList scan(String s, StringBuffer errors, int maxErrors, // TODO(DLC)[EWTS->Tibetan]: ignored
boolean shortMessages, String warningLevel) {
// the size depends on whether it's mostly Tibetan or mostly
// Latin and a number of other factors. This is meant to be
// an underestimate, but not too much of an underestimate.
ArrayList al = new ArrayList(s.length() / 10);
// TODO(dchandler): use jflex, javacc or something similar as much
// as you can. I don't think EWTS can be perfectly parsed by
// javacc, by the way, but having several components in a pipeline
// would likely make things more maintainable.
//
// NOTE: EWTS doesn't fully specify how Unicode escapes (e.g.,
// [\\u0f20] should work). When do you evaluate them?
// Immediately like Java source files or later, say right before
// outputting? Our answer: immediately. [\\u0f88+ka] becomes
// hard to do otherwise. This means we treat actual Unicode in a
// way that a reader of the EWTS standard might not think about,
// but actual Unicode is rare in the input
// (TODO(DLC)[EWTS->Tibetan]: it's so rare that we ought to give a
// warning/error when we see it).
/** See the comment in TTshegBarScanner. This does not find
errors and warnings that you'd think of a parser finding (TODO(DLC)[EWTS->Tibetan]:
DOES IT?). */
public ArrayList scan(String s, StringBuffer errors, int maxErrors, // TODO(DLC)[EWTS->Tibetan]: ignored
boolean shortMessages, String warningLevel) {
// the size depends on whether it's mostly Tibetan or mostly
// Latin and a number of other factors. This is meant to be
// an underestimate, but not too much of an underestimate.
ArrayList al = new ArrayList(s.length() / 10);
// TODO(DLC)[EWTS->Tibetan]: use jflex, javacc or something similar
// TODO(DLC)[EWTS->Tibetan]: what about Unicode escapes like \u0f20? When do you do that? Immediately like Java source files? I think so and then we can say that oddballs like \u0f19 are valid within tsheg bars.
StringBuffer sb = new StringBuffer(s);
ExpandEscapeSequences(sb);
int sl = sb.length();
// TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working
// TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode
// TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working
for (int i = 0; i < sl; i++) {
if (isValidInsideTshegBar(sb.charAt(i))) {
StringBuffer tbsb = new StringBuffer();
for (; i < sl; i++) {
if (isValidInsideTshegBar(sb.charAt(i)))
tbsb.append(sb.charAt(i));
else {
--i;
break;
}
}
al.add(new TString("EWTS", tbsb.toString(),
TString.TIBETAN_NON_PUNCTUATION));
} else {
if (" /;|!:=_@#$%<>()\r\n\t*".indexOf(sb.charAt(i)) >= 0)
al.add(new TString("EWTS", sb.substring(i, i+1),
TString.TIBETAN_PUNCTUATION));
else
al.add(new TString("EWTS", "ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb.substring(i, i+1),
TString.ERROR));
}
StringBuffer sb = new StringBuffer(s);
ExpandEscapeSequences(sb);
int sl = sb.length();
// TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working
// TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode
// TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working
for (int i = 0; i < sl; i++) {
if (isValidInsideTshegBar(sb.charAt(i))) {
StringBuffer tbsb = new StringBuffer();
for (; i < sl; i++) {
if (isValidInsideTshegBar(sb.charAt(i)))
tbsb.append(sb.charAt(i));
else {
--i;
break;
}
}
return al;
al.add(new TString("EWTS", tbsb.toString(),
TString.TIBETAN_NON_PUNCTUATION));
} else {
// NOTE: It's questionable, but we treat
// \u0f00 like punctuation because it was
// easier coding that way.
if ((sb.charAt(i) >= EWTSTraits.PUA_MIN
&& sb.charAt(i) <= EWTSTraits.PUA_MAX)
|| (sb.charAt(i) >= '\u0f00' && sb.charAt(i) <= '\u0f17')
|| (sb.charAt(i) >= '\u0f1a' && sb.charAt(i) <= '\u0f1f')
|| (sb.charAt(i) >= '\u0fbe' && sb.charAt(i) <= '\u0fcc')
|| (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1')
|| (EWTSTraits.SAUVASTIKA == sb.charAt(i))
|| (EWTSTraits.SWASTIKA == sb.charAt(i))
|| (" /;|!:=_@#$%<>()*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b".indexOf(sb.charAt(i))
>= 0)) {
al.add(new TString("EWTS", sb.substring(i, i+1),
TString.TIBETAN_PUNCTUATION));
} else {
al.add(new TString("EWTS", "ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb.substring(i, i+1),
TString.ERROR));
}
}
}
return al;
}
/** Modifies the EWTS in sb such that Unicode escape sequences are
* expanded. */

View file

@ -792,7 +792,7 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("khkha");
assert_EWTS_error("khna");
assert_EWTS_error("khla");
special_case("gga");
assert_EWTS_error("gga");
assert_EWTS_error("ggha");
special_case("gnya");
special_case("gda");
@ -801,13 +801,13 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("gdhwa");
special_case("gna");
special_case("gnya");
special_case("gpa");
assert_EWTS_error("gpa");
assert_EWTS_error("gbha");
assert_EWTS_error("gbhya");
special_case("gma");
special_case("gmya");
assert_EWTS_error("gma");
assert_EWTS_error("gmya");
assert_EWTS_error("grya");
special_case("gha");
assert_EWTS_error("gha");
assert_EWTS_error("ghgha");
assert_EWTS_error("ghnya");
assert_EWTS_error("ghna");
@ -815,8 +815,8 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("ghma");
assert_EWTS_error("ghla");
assert_EWTS_error("ghya");
special_case("ghra");
special_case("ghwa");
assert_EWTS_error("ghra");
assert_EWTS_error("ghwa");
assert_EWTS_error("ngka");
assert_EWTS_error("ngkta");
assert_EWTS_error("ngktya");
@ -970,34 +970,34 @@ public class EWTStibwniniTest extends TestCase {
special_case("dgra");
assert_EWTS_error("dgha");
assert_EWTS_error("dghra");
special_case("ddza");
special_case("dda");
assert_EWTS_error("ddza");
assert_EWTS_error("dda");
assert_EWTS_error("ddya");
special_case("ddra");
special_case("ddwa");
assert_EWTS_error("ddra");
assert_EWTS_error("ddwa");
assert_EWTS_error("ddha");
assert_EWTS_error("ddhna");
assert_EWTS_error("ddhya");
assert_EWTS_error("ddhra");
assert_EWTS_error("ddhwa");
special_case("dna");
assert_EWTS_error("dna");
special_case("dba");
special_case("dbra");
assert_EWTS_error("dbha");
assert_EWTS_error("dbhya");
assert_EWTS_error("dbhra");
special_case("dma");
special_case("dya");
assert_EWTS_error("dya");
assert_EWTS_error("drya");
assert_EWTS_error("dwya");
special_case("dha");
assert_EWTS_error("dha");
assert_EWTS_error("dhna");
assert_EWTS_error("dhnya");
assert_EWTS_error("dhma");
assert_EWTS_error("dhya");
special_case("dhra");
assert_EWTS_error("dhra");
assert_EWTS_error("dhrya");
special_case("dhwa");
assert_EWTS_error("dhwa");
assert_EWTS_error("nka");
assert_EWTS_error("nkta");
assert_EWTS_error("ngha");
@ -1051,39 +1051,39 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("pswa");
assert_EWTS_error("psya");
assert_EWTS_error("bgha");
special_case("bdza");
assert_EWTS_error("bdza");
special_case("bda");
assert_EWTS_error("bddza");
assert_EWTS_error("bdha");
assert_EWTS_error("bdhwa");
special_case("bta");
special_case("bna");
special_case("bba");
assert_EWTS_error("bna");
assert_EWTS_error("bba");
assert_EWTS_error("bbha");
assert_EWTS_error("bbhya");
special_case("bma");
special_case("bha");
assert_EWTS_error("bma");
assert_EWTS_error("bha");
assert_EWTS_error("bhNa");
assert_EWTS_error("bhna");
assert_EWTS_error("bhma");
assert_EWTS_error("bhya");
special_case("bhra");
special_case("bhwa");
assert_EWTS_error("bhra");
assert_EWTS_error("bhwa");
special_case("mnya");
special_case("mNa"); // TODO(DLC)[EWTS->Tibetan]: do prefix rules really allow mNa? I think not.
assert_EWTS_error("mNa");
special_case("mna");
special_case("mnya");
special_case("mpa");
special_case("mpra");
special_case("mpha");
special_case("mba");
assert_EWTS_error("mpa");
assert_EWTS_error("mpra");
assert_EWTS_error("mpha");
assert_EWTS_error("mba");
assert_EWTS_error("mbha");
assert_EWTS_error("mbhya");
special_case("mma");
special_case("mla");
special_case("mwa");
special_case("msa");
special_case("mha");
assert_EWTS_error("mma");
assert_EWTS_error("mla");
assert_EWTS_error("mwa");
assert_EWTS_error("msa");
assert_EWTS_error("mha");
assert_EWTS_error("yYa");
assert_EWTS_error("yra");
assert_EWTS_error("ywa");

View file

@ -22,7 +22,9 @@ import java.util.ArrayList;
import java.util.ListIterator;
import java.util.NoSuchElementException;
/** An object that can iterate over an {@link TParseTree}.
/** An object that can iterate over a {@link TParseTree}. NOTE: The
* list over which it iterates is built when the iterator is
* constructed, so you pay up front.
*
* @author David Chandler */
class ParseIterator {

View file

@ -622,7 +622,7 @@ public class TConverter {
boolean done = false;
// what about after numbers? marks? FIXME: test
TPairList lpl = null;
if (s.getText().equals(" ")) {
if (ttraits.isACIP() && s.getText().equals(" ")) {
if (!lastGuyWasNonPunct
|| (null != lastGuy
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
@ -652,7 +652,8 @@ public class TConverter {
continue; // FIXME: if null != writer, output was just dropped.
}
}
} else if (s.getText().equals(",")
} else if (ttraits.isACIP()
&& s.getText().equals(",")
&& lastGuyWasNonPunct
&& null != lastGuy
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
@ -722,7 +723,8 @@ public class TConverter {
ThdlDebug.verify(1 == s.getText().length());
if (null != writer) {
char ch = s.getText().charAt(0);
if (ch >= '\uF021' && ch <= '\uF0FF') {
if (ch >= EWTSTraits.PUA_MIN
&& ch <= EWTSTraits.PUA_MAX) {
hasErrors = true;
String errorMessage =
"[#ERROR "

View file

@ -163,14 +163,15 @@ class TPair {
}
/** Returns a TPair that is like this pair except that it has a
* "+" on the right if this pair is empty on the right and is
* empty on the right if this pair has a disambiguator on the
* right. May return itself (but never mutates this
* instance). */
* "+" on the right if this pair is empty on the right and, when
* appropriate, is empty on the right if this pair has a
* disambiguator on the right. May return itself (but never
* mutates this instance). */
TPair insideStack() {
if (null == getRight())
return new TPair(traits, getLeft(), "+");
else if (traits.disambiguator().equals(getRight()))
else if (traits.disambiguator().equals(getRight())
&& !traits.stackingMustBeExplicit())
return new TPair(traits, getLeft(), null);
else
return this;
@ -248,11 +249,18 @@ class TPair {
}
}
// TODO(DLC)[EWTS->Tibetan]
/** Returns true if this pair is surely the last pair in an ACIP
* stack. Stacking continues through (* . ) and (* . +), but
* stops anywhere else. */
boolean endsACIPStack() {
return (getRight() != null && !"+".equals(getRight()));
/** Tells whether a stack ends with this pair.
 *
 *  <p>For ACIP (where stacking need not be explicit): returns true
 *  if this pair is surely the last pair in an ACIP stack.  Stacking
 *  continues through (* . ) and (* . +), but stops anywhere else.
 *
 *  <p>For EWTS (where stacking must be explicit): returns true if
 *  this pair is probably the last pair in an EWTS stack, i.e. if
 *  its right side is anything but "+".  For native stacks like that
 *  found in [bra], this is not really true. */
boolean endsStack() {
    final String right = getRight();
    if ("+".equals(right)) {
        // The stacking operator continues the stack in ACIP and
        // EWTS alike.
        return false;
    }
    // No "+": with explicit stacking that settles it; otherwise an
    // empty right side, (* . ), still continues the stack.
    return traits.stackingMustBeExplicit() || right != null;
}
}

View file

@ -16,8 +16,6 @@ All Rights Reserved.
Contributor(s): ______________________________________.
*/
// TODO(DLC)[EWTS->Tibetan]: a (DLC: does this become (a.) or (.a)?), ug pha, g.a, aM, etc. -- test!
package org.thdl.tib.text.ttt;
import java.util.ArrayList;
@ -146,9 +144,10 @@ class TPairList {
return original.toString();
}
/** Returns true if this list contains ( . <vowel>) or (A . ),
* which are two simple errors you encounter if you interpret DAA
* or TAA or DAI or DAE the wrong way. TODO(DLC)[EWTS->Tibetan]: ACIP vs. EWTS */
/** Returns true if this list contains an obvious error. For
* example, with ACIP this returns true if ( . <vowel>) or (A . )
* appears, which are two simple errors you encounter if you
* interpret (ACIP) DAA or TAA or DAI or DAE the wrong way. */
boolean hasSimpleError() {
int sz = size();
for (int i = 0; i < sz; i++) {
@ -192,13 +191,6 @@ class TPairList {
&& (null == p.getRight()
|| "".equals(p.getRight()))) {
return ErrorsAndWarnings.getMessage(125, shortMessages, translit, traits);
} else if (null != p.getRight()
&& !"+".equals(p.getRight())
&& !traits.disambiguator().equals(p.getRight())
&& !traits.isWowel(p.getRight())
&& false /* TODO(DLC)[EWTS->Tibetan]: think about this harder. */) {
return "ErrorNumberDLC1: We don't yet support stacking vowels, convert {" + translit + "} manually.";
// TODO(DLC)[EWTS->Tibetan]: test, i think we do support it
} else if ((null == p.getLeft()
&& (!traits.disambiguator().equals(p.getRight())
&& (!traits.vowelAloneImpliesAChen()
@ -224,7 +216,8 @@ class TPairList {
return ErrorsAndWarnings.getMessage(126, shortMessages, translit, traits);
}
// FIXME: really this is a warning, not an error:
if (traits.disambiguator().equals(get(sz - 1).getRight())) {
if (traits.disambiguator().equals(get(sz - 1).getRight())
&& !traits.stackingMustBeExplicit()) {
return ErrorsAndWarnings.getMessage(127, shortMessages, translit, traits);
}
return null;
@ -280,26 +273,28 @@ class TPairList {
if (sz < 1) return null;
// When we see a stretch of ACIP without a disambiguator or a
// vowel, that stretch is taken to be one stack unless it may
// be prefix-root or suffix-postsuffix or suffix/postsuffix-'
// -- the latter necessary because GAMS'I is GAM-S-'I, not
// GAM-S+'I. 'UR, 'US, 'ANG, 'AM, 'I, 'O, 'U -- all begin
// with '. So we can have zero, one, two, or three special
// break locations. (The kind that aren't special are the
// break after G in G-DAMS, or the break after G in GADAMS or
// GEDAMS.)
// When we see a stretch of ACIP (TODO(DLC)[EWTS->Tibetan]:
// this works for EWTS, but differently) without a
// disambiguator or a vowel, that stretch is taken to be one
// stack unless it may be prefix-root or suffix-postsuffix or
// suffix/postsuffix-' -- the latter necessary because GAMS'I
// is GAM-S-'I, not GAM-S+'I. 'UR, 'US, 'ANG, 'AM, 'I, 'O, 'U
// -- all begin with '. So we can have zero, one, two, or
// three special break locations. (The kind that aren't
// special are the break after G in G-DAMS, or the break after
// G in GADAMS or GEDAMS.)
//
// If a nonnegative number appears in breakLocations[i], it
// means that pair i may or may not be stacked with pair i+1.
int nextBreakLoc = 0;
int breakLocations[] = { -1, -1, -1 };
boolean mayHavePrefix;
boolean mayHavePrefix = get(0).isPrefix();
// Handle the first pair specially -- it could be a prefix.
if (ddebug) System.out.println("i is " + 0);
if ((mayHavePrefix = get(0).isPrefix())
if (mayHavePrefix
&& !traits.stackingMustBeExplicit()
&& sz > 1
&& null == get(0).getRight()) {
// special case: we must have a branch in the parse tree
@ -311,9 +306,9 @@ class TPairList {
}
// stack numbers start at 1.
int stackNumber = (get(0).endsACIPStack()) ? 2 : 1;
int stackNumber = (get(0).endsStack()) ? 2 : 1;
// this starts at 0.
int stackStart = (get(0).endsACIPStack()) ? 1 : 0;
int stackStart = (get(0).endsStack()) ? 1 : 0;
int numeric = get(0).isNumeric() ? 1 : (get(0).isDisambiguator() ? 0 : -1);
@ -340,7 +335,7 @@ class TPairList {
numeric = -1;
}
if (i+1==sz || p.endsACIPStack()) {
if (i+1==sz || p.endsStack()) {
if (/* the stack ending here might really be
suffix-postsuffix or
suffix-appendage or
@ -350,15 +345,17 @@ class TPairList {
if (i > stackStart) {
if (get(stackStart).isSuffix()
&& (get(stackStart+1).isPostSuffix() // suffix-postsuffix
|| "'".equals(get(stackStart+1).getLeft()))) // suffix-appendage
|| "'".equals(get(stackStart+1).getLeft()))) { // suffix-appendage
breakLocations[nextBreakLoc++] = stackStart;
}
if (i > stackStart + 1) {
// three to play with, maybe it's
// suffix-postsuffix-appendage.
if (get(stackStart).isSuffix()
&& get(stackStart+1).isPostSuffix()
&& "'".equals(get(stackStart+2).getLeft()))
&& "'".equals(get(stackStart+2).getLeft())) {
breakLocations[nextBreakLoc++] = stackStart+1;
}
}
}
// else no need to insert a breakLocation, we're
@ -370,8 +367,9 @@ class TPairList {
|| (!mayHavePrefix && (stackNumber == 3))) {
if (i == stackStart+1) { // because GDAM--S'O is illegal, and because it's 'ANG, not 'NG, 'AM, not 'M -- ' always ends the stack
if (get(stackStart).isPostSuffix()
&& "'".equals(get(stackStart+1).getLeft()))
&& "'".equals(get(stackStart+1).getLeft())) {
breakLocations[nextBreakLoc++] = stackStart;
}
}
}
++stackNumber;
@ -397,7 +395,8 @@ class TPairList {
throw new Error("breakLocations is monotonically increasing, ain't it?");
TParseTree pt = new TParseTree();
for (int i = 0; i < sz; i++) {
if (i+1 == sz || get(i).endsACIPStack()) {
if (ddebug) System.out.println("getParseTree: second loop i is " + i);
if (i+1 == sz || get(i).endsStack()) {
TStackListList sll = new TStackListList(4); // maximum is 4.
int numBreaks = 0;
@ -419,6 +418,7 @@ class TPairList {
// one, at location breakLocations[breakStart+1] if
// and only if b1 is one, etc.
for (int counter = 0; counter < (1<<numBreaks); counter++) {
if (ddebug) System.out.println("getParseTree: counter is " + counter);
TStackList sl = new TStackList();
boolean slIsInvalid = false;
TPairList currentStack = new TPairList(traits);
@ -435,7 +435,7 @@ class TPairList {
return null; // sA, for example, is illegal.
}
}
if (k == i || get(k).endsACIPStack()) {
if (k == i || get(k).endsStack()) {
if (!currentStack.isEmpty()) {
if (traits.couldBeValidStack(currentStackUnmodified)) {
sl.add(currentStack.asStack());
@ -479,45 +479,48 @@ class TPairList {
}
if (ddebug) System.out.println("getParseTree: parse tree for " + toString() + " is " + pt);
if (pt.isEmpty()) return null;
return pt;
}
private static final boolean ddebug = false;
/** Mutates this TPairList object such that the last pair is
* empty or is a vowel, but is never the stacking operator ('+')
* or a disambiguator (i.e., a '-' on the right).
/** Mutates this TPairList object such that the last pair is empty
* or is a vowel, but is never the stacking operator ('+') or (in
* ACIP, but not in EWTS) a disambiguator (i.e., an ACIP '-' or
* EWTS '.' on the right).
* @return this instance */
private TPairList asStack() {
if (!isEmpty()) {
TPair lastPair = get(size() - 1);
if ("+".equals(lastPair.getRight()))
if ("+".equals(lastPair.getRight())) {
al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
else if (traits.disambiguator().equals(lastPair.getRight()))
} else if (traits.disambiguator().equals(lastPair.getRight())
&& !traits.stackingMustBeExplicit()) {
al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
}
}
return this;
}
/** Adds the TGCPairs corresponding to this list to the end of
* pl. Some TPairs correspond to more than one TGCPair
* ({AA:}); some TGCPairs correspond to more than one TPair
* ({G+YA}). To keep track, indexList will be appended to in
* lockstep with pl. index (wrapped as an {@link
* java.lang#Integer}) will be appended to indexList once each
* time we append to pl. This assumes that this TPairList
* corresponds to exactly one Tibetan grapheme cluster (i.e.,
* stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a
* stack all on its own. */
/** Adds the TGCPairs corresponding to this list to the end of pl.
* Some TPairs correspond to more than one TGCPair ({AA:}); some
* TGCPairs correspond to more than one TPair ({G+YA}). To keep
* track, indexList will be appended to in lockstep with pl:
* index (wrapped as an {@link java.lang.Integer}) will be
* appended to indexList once each time we append to pl. This
* assumes that this TPairList corresponds to exactly one Tibetan
* grapheme cluster (i.e., stack). Note that U+0F7F, U+0F35, and
* U+0F37 get special treatment because the sole client of this
* code is TTGCList, and TTGCList's sole use is testing the
* legality of a tsheg bar. */
void populateWithTGCPairs(ArrayList pl,
ArrayList indexList, int index) {
int sz = size();
if (sz == 0) {
return;
} else {
// drop the disambiguator, if there is one.
boolean isNumeric = false;
StringBuffer lWylie = new StringBuffer();
int i;
@ -531,15 +534,42 @@ class TPairList {
// The last pair:
TPair p = get(i);
ThdlDebug.verify(!"+".equals(p.getRight()));
boolean add_U0F7F = false;
int where;
if (p.getRight() != null
&& (where = p.getRight().indexOf(':')) >= 0) { // TODO(DLC)[EWTS->Tibetan]
// this ':' guy is his own TGCPair.
add_U0F7F = true;
StringBuffer rr = new StringBuffer(p.getRight());
rr.deleteCharAt(where);
p = new TPair(traits, p.getLeft(), rr.toString());
final String specialCases[] = new String[] {
traits.U0F7F(),
traits.U0F35(),
traits.U0F37()
};
final String specialCaseEwts[] = new String[] {
EWTSTraits.instance().U0F7F(),
EWTSTraits.instance().U0F35(),
EWTSTraits.instance().U0F37()
};
final boolean ignoreSpecialCase[] = new boolean[] {
false, // Don't ignore this -- it's Sanskrit.
// ['jamH] should be illegal EWTS.
// (TODO(dchandler): ask)
true,
true,
};
boolean hasSpecialCase[] = new boolean[] { false, false, false, };
for (int j = 0; j < specialCases.length; j++) {
if (null != specialCases[j]) {
int where;
if (p.getRight() != null
&& (where = p.getRight().indexOf(specialCases[j])) >= 0) {
// this guy is his own TGCPair.
hasSpecialCase[j] = true;
StringBuffer rr = new StringBuffer(p.getRight());
rr.replace(where, where + specialCases[j].length(), "");
if (rr.length() > where && '+' == rr.charAt(where)) {
rr.deleteCharAt(where);
} else if (where > 0 && rr.length() > where - 1
&& '+' == rr.charAt(where - 1)) {
rr.deleteCharAt(where - 1);
}
p = new TPair(traits, p.getLeft(), rr.toString());
}
}
}
boolean hasNonAVowel = (!traits.aVowel().equals(p.getRight())
&& null != p.getRight());
@ -586,9 +616,12 @@ class TPairList {
? TGCPair.TYPE_TIBETAN
: TGCPair.TYPE_OTHER))));
pl.add(tp);
if (add_U0F7F) {
indexList.add(new Integer(index));
pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER)); // TODO(DLC)[EWTS->Tibetan]
for (int j = 0; j < specialCases.length; j++) {
if (hasSpecialCase[j] && !ignoreSpecialCase[j]) {
indexList.add(new Integer(index));
pl.add(new TGCPair(specialCaseEwts[j],
null, TGCPair.TYPE_OTHER));
}
}
}
}

View file

@ -20,6 +20,8 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
import org.thdl.tib.text.TibetanMachineWeb;
/** A factory for creating {@link TPairList TPairLists} from
* Strings of ACIP.
* @author David Chandler */
@ -111,12 +113,15 @@ class TPairListFactory {
return tail;
}
private static final boolean debug = false;
/** See {@link TTraits#breakTshegBarIntoChunks}. */
static TPairList[] breakEWTSIntoChunks(String ewts)
throws IllegalArgumentException
{
EWTSTraits traits = EWTSTraits.instance();
TPairList pl = breakHelperEWTS(ewts, traits);
if (debug) System.out.println("breakEWTSIntoChunks: pl is " + pl);
TPairList npl = pl;
// TODO(DLC)[EWTS->Tibetan]: this crap ain't workin' for kaHM. But kaeM and kaMe shouldn't work, right? Figure out what EWTS really says...
@ -148,14 +153,18 @@ class TPairListFactory {
}
}
}
pl = null;
if (debug) System.out.println("breakEWTSIntoChunks: npl is " + npl);
TPairList nnpl;
if (true) {
// TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it!
// Collapse ( . wowel1) ( . wowel2) into (
// . wowel1+wowel2). Then collapse (* . a) ( . x) into (*
// . x). Also, if an a-chen (\u0f68) is implied, then
// insert it.
TPairList xnnpl = new TPairList(traits, pl.size());
TPairList xnnpl = new TPairList(traits, npl.size());
for (int i = 0; i < npl.size(); ) {
TPair p = npl.get(i);
int set_i_to = i + 1;
@ -184,7 +193,7 @@ class TPairListFactory {
i = set_i_to;
}
nnpl = new TPairList(traits, pl.size());
nnpl = new TPairList(traits, xnnpl.size());
// (* . a ) ( . x) ... ( . y) -> (* . a+x+...+y)
for (int i = 0; i < xnnpl.size(); ) {
TPair p = xnnpl.get(i);
@ -221,7 +230,7 @@ class TPairListFactory {
}
} else {
// TODO(DLC)[EWTS->Tibetan]: this block is not executing. kill it after testing and thinking
nnpl = new TPairList(traits, pl.size());
nnpl = new TPairList(traits, npl.size());
for (int i = npl.size() - 1; i >= 0; i--) {
TPair p = npl.get(i);
@ -234,13 +243,91 @@ class TPairListFactory {
nnpl.prepend(p);
}
}
npl = null;
if (debug) System.out.println("breakEWTSIntoChunks: nnpl is " + nnpl);
TPairList nnnpl = transformNativeStacks(traits, nnpl);
if (debug) System.out.println("breakEWTSIntoChunks: nnnpl is " + nnnpl);
// TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it!
return new TPairList[] {
nnpl, null
nnnpl, null
};
}
/** EWTS helper function that makes native stacks explicit by
 *  putting the stacking operator on every pair of the stack but
 *  the last: [(ph . ) (y . ) (w . *)] -> [(ph . +) (y . +) (w
 *  . *)], e.g.
 *
 *  <p>A run of pairs is treated as a native stack when every pair
 *  but the last has a left side and no right side, the last pair
 *  has a left side and a right side other than "+", and the left
 *  sides joined by "-" form a hash key known to TibetanMachineWeb.
 *  At each position a stack of three pairs is tried before a stack
 *  of two.
 *  @param traits must mesh with orig
 *  @param orig the pairs to transform; this method only reads it
 *  @return a new list with as many pairs as orig
 *  @throws Error if the result does not have as many pairs as
 *  orig */
private static TPairList transformNativeStacks(TTraits traits,
                                               TPairList orig) {
    // TODO(DLC)[EWTS->Tibetan]: instead of using
    // TibetanMachineWeb's knowledge of the hash keys in tibwn.ini
    // (ph-y-w is a hash key, e.g.), we assume that 3 is the
    // maximum size of a native stack.
    final int longestNativeStack = 3;
    // [(s . *)] alone doesn't need transformation.  [(s . )
    // (k . *)] does:
    final int shortestNativeStack = 2;

    final int total = orig.size();
    TPairList transformed = new TPairList(traits, total);
    int pos = 0;
    while (pos < total) {
        // Look ahead: window[0] is the current pair, window[1] is
        // the one after that, etc.; null means "past the end".
        TPair[] window = new TPair[longestNativeStack];
        for (int w = 0; w < longestNativeStack; w++) {
            window[w] = (pos + w < total) ? orig.get(pos + w) : null;
        }

        // Size of the native stack starting at pos, or 0 if there
        // is none.  Bigger stacks win over smaller ones.
        int stackLen = 0;
        for (int len = longestNativeStack;
             0 == stackLen && len >= shortestNativeStack;
             len--) {
            final TPair last = window[len - 1];
            boolean plausible = (null != last
                                 && null != last.getLeft()
                                 && !"+".equals(last.getRight()));
            StringBuffer key = new StringBuffer();
            for (int k = 0; plausible && k < len - 1; k++) {
                final TPair inner = window[k];
                if (null != inner
                    && null != inner.getLeft()
                    && null == inner.getRight()) {
                    key.append(inner.getLeft()).append('-');
                } else {
                    plausible = false;
                }
            }
            if (plausible) {
                key.append(last.getLeft());
                if (TibetanMachineWeb.isKnownHashKey(key.toString())) {
                    stackLen = len; // for ph-y-w etc.
                }
            }
        }

        if (stackLen > 0) {
            // Every pair but the last gets the stacking operator;
            // the last pair is carried over as it is.
            for (int n = 0; n < stackLen - 1; n++) {
                transformed.append(new TPair(traits,
                                             window[n].getLeft(), "+"));
            }
            transformed.append(window[stackLen - 1]);
            pos += stackLen;
        } else {
            transformed.append(window[0]);
            pos++;
        }
    }
    if (transformed.size() != total) {
        throw new Error("orig=" + orig + "\nresult=" + transformed); // TODO(dchandler): make this an assertion.
    }
    return transformed;
}
// TODO(DLC)[EWTS->Tibetan]: doc
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) {

View file

@ -105,26 +105,33 @@ class TParseTree {
ParseIterator pi = getParseIterator();
while (pi.hasNext()) {
TStackList sl = pi.next();
if (!sl.isClearlyIllegal()) {
BoolTriple bt = sl.isLegalTshegBar(false);
if (!sl.isClearlyIllegal(bt.candidateType)) {
sll.add(sl);
}
}
return sll;
}
private static final boolean debug = false;
/** Returns the best parse, if there is a unique parse that is
* clearly preferred to other parses. Basically, if there's a
* unique legal parse, you get it. If there's not, but there is
* a unique non-illegal parse, you get it. If there's not a
* unique answer, null is returned. */
public TStackList getBestParse() {
if (debug) System.out.println("getBestParse: parse tree is " + toString());
TStackListList up = getUniqueParse(false);
if (up.size() == 1)
if (up.size() == 1) {
if (debug) System.out.println("getBestParse: unique parse");
return up.get(0);
}
up = getNonIllegalParses();
int sz = up.size();
if (sz == 1) {
if (debug) System.out.println("getBestParse: sole non-illegal parse");
return up.get(0);
} else if (sz > 1) {
// TODO(DLC)[EWTS->Tibetan]: does this still happen? If so, when?
@ -132,12 +139,14 @@ class TParseTree {
// System.out.println("SHO NUFF, >1 non-illegal parses still happens");
// {PADMA}, for example. Our technique is to go from the
// left and stack as much as we can. So {PA}{D}{MA} is
// inferior to {PA}{D+MA}, and {PA}{D+MA}{D}{MA} is
// inferior to {PA}{D+MA}{D+MA}. We do not look for the
// minimum number of glyphs, though -- {PA}{N+D}{B+H+R}
// and {PA}{N}{D+B+H+R} tie by that score, but the former
// is the clear winner.
// left and stack as much as we can (when
// !traits.stackingMustBeExplicit() only!
// TODO(DLC)[EWTS->Tibetan]: fix these comments). So
// {PA}{D}{MA} is inferior to {PA}{D+MA}, and
// {PA}{D+MA}{D}{MA} is inferior to {PA}{D+MA}{D+MA}. We
// do not look for the minimum number of glyphs, though --
// {PA}{N+D}{B+H+R} and {PA}{N}{D+B+H+R} tie by that
// score, but the former is the clear winner.
// We give a warning about these, optionally, so that
// users can produce output that even a dumb ACIP reader
@ -177,11 +186,27 @@ class TParseTree {
}
++stackNumber;
}
if (candidates.size() == 1)
if (candidates.size() == 1) {
if (debug) System.out.println("getBestParse: one candidate");
return up.get(((Integer)candidates.get(0)).intValue());
else
} else {
if (debug) {
System.out.println("getBestParse: no parse, num candidates="
+ candidates.size());
for (int i = 0; i < candidates.size(); i++) {
System.out.println("candidate " + i + " is "
+ up.get(((Integer)candidates.get(i)).intValue()));
if (i + 1 < candidates.size()) {
boolean eq = (up.get(((Integer)candidates.get(i)).intValue()).equals(up.get(((Integer)candidates.get(i + 1)).intValue())));
System.out.println("This candidate and the next are"
+ (eq ? "" : " not") + " equal.");
}
}
}
return null;
}
}
if (debug) System.out.println("getBestParse: no non-illegal parses");
return null;
}
@ -480,9 +505,10 @@ n+t+s
middle = pl.get(1).getLeft();
right = pl.get(2).getLeft();
if (pl.get(0).getRight() == null
&& !pl.get(1).endsACIPStack()
&& pl.get(2).endsACIPStack()
&& !pl.get(1).endsStack()
&& pl.get(2).endsStack()
&& null != left && null != right) {
// TODO(DLC)[EWTS->Tibetan]: This is ACIP-specific.
if (("D".equals(left) && "G".equals(middle) && "R".equals(right))
|| ("D".equals(left) && "G".equals(middle) && "Y".equals(right))) {
if (pl.size() == 3) {
@ -503,7 +529,7 @@ n+t+s
String left, right;
left = pl.get(0).getLeft();
right = pl.get(1).getLeft();
if (pl.get(0).getRight() == null && pl.get(1).endsACIPStack()
if (pl.get(0).getRight() == null && pl.get(1).endsStack()
&& null != left && null != right) {
if (("D".equals(left) && "B".equals(right))
|| ("B".equals(left) && "D".equals(right))

View file

@ -21,6 +21,7 @@ package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import java.util.ListIterator;
import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.TGCList;
import org.thdl.tib.text.TibTextUtils;
@ -136,17 +137,21 @@ class TStackList {
StringBuffer warnings = new StringBuffer();
String candidateType
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests);
if (ddebug) System.out.println("ddebug: tgclist is " + tgcList + "\n warnings is " + warnings + "\n candidateType is " + candidateType);
// preliminary answer:
boolean isLegal = (candidateType != "invalid");
if (isLegal) {
if (isClearlyIllegal())
if (isClearlyIllegal(candidateType))
isLegal = false;
TPairList firstStack = this.get(0);
// NOTE: In ewts, [([b'dgm] . ) (...] is illegal unless
// this is a legal tsheg bar featuring a prefix. (I'm not
// sure this is enforced here, though...)
if (1 == firstStack.size()
&& firstStack.get(0).isPrefix()
&& null == firstStack.get(0).getRight() // because GAM is legal
&& null == firstStack.get(0).getRight() // ACIP {GAM}/EWTS {gam} is legal
&& !(candidateType.startsWith("prefix")
|| candidateType.startsWith("appendaged-prefix"))) {
isLegal = false;
@ -163,7 +168,8 @@ class TStackList {
TPairList pl = get(pairListIndex);
TPair p = pl.get(pl.size() - 1);
isLegalAndHasAVowelOnRoot
= (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g. TODO(DLC)[EWTS->Tibetan]: ???
= (p.getRight() != null
&& p.getRight().startsWith(p.getTraits().aVowel())); // could be ACIP {A:}, e.g.
if (isLegalAndHasAVowelOnRoot)
break;
}
@ -178,7 +184,34 @@ class TStackList {
/** Returns true if and only if this stack list contains a clearly
* illegal construct. An example of such is a TPair (V . something). */
boolean isClearlyIllegal() {
boolean isClearlyIllegal(String candidateType) {
    // Anything rejected by the cheaper structural check is rejected
    // here as well.
    if (isVeryClearlyIllegal()) {
        return true;
    }

    // NOTE(review): presumably a negative entry means "no root index
    // in this slot" -- confirm against
    // TibTextUtils.getIndicesOfRootForCandidateType.
    int[] rootIndices
        = TibTextUtils.getIndicesOfRootForCandidateType(candidateType);

    // The final stack is deliberately left unexamined.
    // TODO(DLC)[EWTS->Tibetan]: optionally, use just size().  This
    // will make [g] and [bad+man] illegal, e.g.
    final int stacksToCheck = size() - 1;

    for (int stackIndex = 0; stackIndex < stacksToCheck; stackIndex++) {
        // We want EWTS [gga] to be illegal because ga does not take
        // a gao prefix, and we want EWTS [trna] to be illegal
        // because a disambiguator or wowel is required to end a
        // stack unless that stack is a prefix, suffix, or
        // postsuffix.
        boolean rootUnknownOrHere
            = (rootIndices[0] < 0 || rootIndices[0] == stackIndex);
        if (!rootUnknownOrHere || rootIndices[1] >= 0) {
            continue;
        }

        // Inspect the last pair of this stack: under explicit
        // stacking, a missing right-hand side is fatal.
        TPair finalPair = get(stackIndex).get(get(stackIndex).size() - 1);
        if (finalPair.getTraits().stackingMustBeExplicit()) {
            if (finalPair.getRight() == null) {
                return true;
            }
        }
    }
    return false;
}
private boolean isVeryClearlyIllegal() {
// check for {D}{VA} sorts of things:
for (int i = 0; i < size(); i++) {
if (get(i).getACIPError("THIS MAKES IT FASTER AND IS SAFE, DON'T WORRY",
@ -286,7 +319,7 @@ class BoolTriple implements Comparable {
}
/** True if and only if {@link #isLegal} is true and there may be
an ACIP "A" vowel on the root stack. */
a TTraits.aVowel() on the root stack. */
boolean isLegalAndHasAVowelOnRoot;
BoolTriple(boolean isLegal,
boolean isLegalAndHasAVowelOnRoot,
@ -322,4 +355,7 @@ class BoolTriple implements Comparable {
BoolTriple b = (BoolTriple)o;
return score() - b.score();
}
// NOTE: TibTextUtils.getIndicesOfRootForCandidateType(candidateType)
// is useful.
}

View file

@ -66,9 +66,8 @@ public class TString {
&& type != END_SLASH
&& (type != UNICODE_CHARACTER
|| !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
// EWTS maps some TMW glyphs to this Unicode
// private-use area (PUA):
|| (ch >= '\uF021' && ch <= '\uF0FF'))));
|| (ch >= EWTSTraits.PUA_MIN
&& ch <= EWTSTraits.PUA_MAX))));
}
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */

View file

@ -23,7 +23,10 @@ import java.util.ArrayList;
import org.thdl.tib.text.TGCList;
import org.thdl.tib.text.TGCPair;
/** A list of grapheme clusters.
/** A list of grapheme clusters. If you use this for anything other
* than testing the legality (the Tibetanness, if you will) of a
* tsheg-bar, then you'll probably fail because U+0F7F, U+0F35, and
* U+0F37 get special treatment.
*
* @author David Chandler */
class TTGCList implements TGCList {
@ -35,7 +38,9 @@ class TTGCList implements TGCList {
/** Don't use this. */
private TTGCList() { }
/** Creates a TGCList. */
/** Creates a TGCList. Note that U+0F7F, U+0F35, and U+0F37 get
* special treatment because the sole use of this class is for
* testing the legality of a tsheg bar. */
public TTGCList(TStackList sl) {
al = new ArrayList();
stackIndices = new ArrayList();

View file

@ -211,4 +211,24 @@ public interface TTraits {
in a tsheg bar. (EWTS's list of standard stacks comes into
play; ACIP always returns true.) */
boolean couldBeValidStack(TPairList pl);
/** Returns true if stacking happens only via the '+' operator.
* Otherwise, stacking is greedy: for the most part we stack up
* until we hit something that stops us, like a vowel (though
* prefixes are special). NOTE: In EWTS, native stacks (EWTS
* [phywa], e.g.) are transformed by an early pass to use '+'.
*
* <p>TStackList.isClearlyIllegal(String) consults this value: when
* it is true, a non-final stack whose last pair has no right-hand
* side can cause the tsheg bar to be deemed illegal.
*
* @return true if and only if stacks are formed solely by the '+'
* operator in this transliteration scheme */
boolean stackingMustBeExplicit();
// TODO(dchandler): If there exists more than one transliteration
// for U+0F7F or the like, do we handle both equally well? Must
// we?
/** Returns the transliteration of the character U+0F7F in this
* transliteration scheme.
*
* <p>NOTE(review): TTGCList documents that U+0F7F, U+0F35, and
* U+0F37 get special treatment when testing the legality of a
* tsheg bar; presumably this accessor supports that -- confirm.
*
* @return the transliteration of U+0F7F */
String U0F7F();
/** Returns the transliteration of the character U+0F35 in this
* transliteration scheme.
*
* <p>NOTE(review): TTGCList documents that U+0F7F, U+0F35, and
* U+0F37 get special treatment when testing the legality of a
* tsheg bar; presumably this accessor supports that -- confirm.
*
* @return the transliteration of U+0F35 */
String U0F35();
/** Returns the transliteration of the character U+0F37 in this
* transliteration scheme.
*
* <p>NOTE(review): TTGCList documents that U+0F7F, U+0F35, and
* U+0F37 get special treatment when testing the legality of a
* tsheg bar; presumably this accessor supports that -- confirm.
*
* @return the transliteration of U+0F37 */
String U0F37();
}

View file

@ -59,13 +59,13 @@ public abstract class TTshegBarScanner {
errors, maxErrors, shortMessages, warningLevel);
}
/** Scans a stream of transliteration into tsheg bars. If errors is
* non-null, error messages will be appended to it. You can
/** Scans a stream of transliteration into tsheg bars. If errors
* is non-null, error messages will be appended to it. You can
* recover both errors and (optionally) warnings (modulo offset
* information) from the result, though. They will be short
* messages iff shortMessages is true. Returns a list of
* TStrings that is the scan, or null if more than maxErrors
* occur.
* TStrings that is the scan, or null if maxErrors is nonnegative
* and more than maxErrors occur.
*
* <p>This is not so efficient: it copies the whole stream into
* memory first.