A reverter that converts Unicode to computer-friendly (but not, yet,

human-friendly) EWTS is here in alpha mode.  It probably doesn't deal
well with non-Tibetan.
This commit is contained in:
dchandler 2005-08-01 05:54:20 +00:00
parent 00afd75362
commit 5788416629
13 changed files with 496 additions and 47 deletions

View file

@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;
import junit.framework.TestCase;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.ttt.ErrorsAndWarnings;
import org.thdl.util.ThdlOptions;
/** Tests the Converter class.
*
@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
ThdlOptions.setUserPreference("thdl.debug", true);
}
/** Asserts that converting s from Unicode to EWTS yields an
* error. */
private void err(String s) {
StringBuffer sb = new StringBuffer();
String ewts = Converter.convertToEwtsForComputers(s, sb);
boolean error = (sb.length() > 0);
if (!error) {
System.out.println("expected error but got EWTS '" + ewts
+ "' for "
+ UnicodeUtils.unicodeStringToPrettyString(s));
}
assertTrue(error);
}
/** Tests Converter.convertToEwtsForHumans. */
private void hconv(String uni, String ewts) {
System.out.println("TODO(dchandler): DLC: implement me");
}
/** Tests Converter.convertToEwtsForComputers. */
private void conv(String uni, String ewts) {
StringBuffer sb = new StringBuffer();
String actualEwts = Converter.convertToEwtsForComputers(uni, sb);
assertEquals("Expected " + ewts + " but got " + actualEwts + ":\n",
ewts, actualEwts);
boolean error = (sb.length() > 0);
assertTrue(!error);
}
public ConverterTest() { }
public void testUnicodeToEwts() {
assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
conv("\u0f40", "ka");
// TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
// conv("0123456789.\u0f40", "0123456789.ka");
conv("\u0f40\u0f7b", "kai");
conv("\u0f40\u0f76", "k+r-i");
conv("\u0f40\u0020\u0f40", "ka_ka");
conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
conv("\u0f42\u0f61", "gaya");
hconv("\u0f42\u0f61", "g.ya");
conv("\u0f42\u0fb1", "g+ya");
hconv("\u0f42\u0fb1", "gya");
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f71\u0f7e", "pAM");
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f74\u0f7e", "puM");
conv("\u0f54\u0fc6", "p\\u0FC6");
conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top
conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first
conv("\u0f40\u0f73", "kI");
conv("\u0f40\u0f71\u0f72", "kI");
conv("\u0f40\u0f72\u0f71", "kI");
conv("\u0f40\u0f73\u0f74", "kU+i");
err("\u0f48");
err("\u0f32\u0f39");
err("\u0f47\u0f98");
conv("\u0fcc", "\\u0FCC");
err("\u0fcd");
err("\u0f90");
err("\u0f90\u0fc6");
conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal...
err("\u0f0b\u0f90");
err("\u0f0b\u0f74");
err("\u0f0b\u0f7f");
err("\u0f0b\u0f3e");
conv("\u0f32\u0f18", "\\u0F32\\u0F18");
conv("\u0f54\u0fa4\u0f90", "p+p+ka");
// TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
// CCCVs work for this?)
if (false) {
// 0f39 could go with any of the three, so we give an error:
err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
} else {
// TODO(dchandler): I want an error, not this:
conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
}
conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
conv("\u0f55\u0f39", "fa");
conv("\u0f55\u0f74\u0f39", "fu");
conv("\u0f56\u0f39", "va");
conv("\u0f56\u0f74\u0f39", "vu");
conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
conv("\u0f40\u0f7e", "kaM");
conv("\u0f40\u0f83", "ka~M");
conv("\u0f40\u0f82", "ka~M`");
conv("\u0f40\u0f84", "ka?");
conv("\u0f40\u0f85\u0f40", "ka&ka");
err("\u0f7f");
conv("\u0f40\u0f7f", "kaH");
conv("\u0f40\u0f7f\u0f72", "kiH");
conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
conv("\u0f40\u0f7f\u0f7e", "kaHM");
conv("\u0f40\u0f7e\u0f7f", "kaMH");
conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
conv("\u0f04\u0f05", "@#");
conv("\u0f04\u0f05\u0f05", "@##");
conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen
// alone? warn/error otherwise.
conv("\u0f05", "#"); // TODO(dchandler): warn or error
}
}
// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
// Uni->EWTS->Uni produces the same Uni.
// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
// you can throw in legally to alter boundaries