A reverter that converts Unicode to computer-friendly (but not yet
human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with non-Tibetan text.
This commit is contained in:
parent
00afd75362
commit
5788416629
13 changed files with 496 additions and 47 deletions
|
@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;
|
|||
|
||||
import junit.framework.TestCase;
|
||||
|
||||
import org.thdl.util.ThdlOptions;
|
||||
import org.thdl.tib.text.tshegbar.UnicodeUtils;
|
||||
import org.thdl.tib.text.ttt.ErrorsAndWarnings;
|
||||
import org.thdl.util.ThdlOptions;
|
||||
|
||||
/** Tests the Converter class.
|
||||
*
|
||||
|
@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
|
|||
ThdlOptions.setUserPreference("thdl.debug", true);
|
||||
}
|
||||
|
||||
/** Asserts that converting s from Unicode to EWTS yields an
|
||||
* error. */
|
||||
private void err(String s) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
String ewts = Converter.convertToEwtsForComputers(s, sb);
|
||||
boolean error = (sb.length() > 0);
|
||||
if (!error) {
|
||||
System.out.println("expected error but got EWTS '" + ewts
|
||||
+ "' for "
|
||||
+ UnicodeUtils.unicodeStringToPrettyString(s));
|
||||
}
|
||||
assertTrue(error);
|
||||
}
|
||||
|
||||
/** Tests Converter.convertToEwtsForHumans. */
|
||||
private void hconv(String uni, String ewts) {
|
||||
System.out.println("TODO(dchandler): DLC: implement me");
|
||||
}
|
||||
|
||||
/** Tests Converter.convertToEwtsForComputers. */
|
||||
private void conv(String uni, String ewts) {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
String actualEwts = Converter.convertToEwtsForComputers(uni, sb);
|
||||
assertEquals("Expected " + ewts + " but got " + actualEwts + ":\n",
|
||||
ewts, actualEwts);
|
||||
boolean error = (sb.length() > 0);
|
||||
assertTrue(!error);
|
||||
}
|
||||
|
||||
/** Creates the test case; the constructor itself does no setup. */
public ConverterTest() {
    super();
}
|
||||
|
||||
public void testUnicodeToEwts() {
|
||||
assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
|
||||
conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
|
||||
conv("\u0f40", "ka");
|
||||
// TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
|
||||
// conv("0123456789.\u0f40", "0123456789.ka");
|
||||
conv("\u0f40\u0f7b", "kai");
|
||||
conv("\u0f40\u0f76", "k+r-i");
|
||||
conv("\u0f40\u0020\u0f40", "ka_ka");
|
||||
conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
|
||||
conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
|
||||
conv("\u0f42\u0f61", "gaya");
|
||||
hconv("\u0f42\u0f61", "g.ya");
|
||||
conv("\u0f42\u0fb1", "g+ya");
|
||||
hconv("\u0f42\u0fb1", "gya");
|
||||
conv("\u0f54\u0f7e", "paM");
|
||||
conv("\u0f54\u0f71\u0f7e", "pAM");
|
||||
conv("\u0f54\u0f7e", "paM");
|
||||
conv("\u0f54\u0f74\u0f7e", "puM");
|
||||
conv("\u0f54\u0fc6", "p\\u0FC6");
|
||||
conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top
|
||||
conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first
|
||||
conv("\u0f40\u0f73", "kI");
|
||||
conv("\u0f40\u0f71\u0f72", "kI");
|
||||
conv("\u0f40\u0f72\u0f71", "kI");
|
||||
conv("\u0f40\u0f73\u0f74", "kU+i");
|
||||
err("\u0f48");
|
||||
err("\u0f32\u0f39");
|
||||
err("\u0f47\u0f98");
|
||||
conv("\u0fcc", "\\u0FCC");
|
||||
err("\u0fcd");
|
||||
err("\u0f90");
|
||||
err("\u0f90\u0fc6");
|
||||
conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal...
|
||||
err("\u0f0b\u0f90");
|
||||
err("\u0f0b\u0f74");
|
||||
err("\u0f0b\u0f7f");
|
||||
err("\u0f0b\u0f3e");
|
||||
conv("\u0f32\u0f18", "\\u0F32\\u0F18");
|
||||
conv("\u0f54\u0fa4\u0f90", "p+p+ka");
|
||||
// TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
|
||||
// CCCVs work for this?)
|
||||
if (false) {
|
||||
// 0f39 could go with any of the three, so we give an error:
|
||||
err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
|
||||
} else {
|
||||
// TODO(dchandler): I want an error, not this:
|
||||
conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
|
||||
}
|
||||
conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
|
||||
conv("\u0f55\u0f39", "fa");
|
||||
conv("\u0f55\u0f74\u0f39", "fu");
|
||||
conv("\u0f56\u0f39", "va");
|
||||
conv("\u0f56\u0f74\u0f39", "vu");
|
||||
conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
|
||||
conv("\u0f40\u0f7e", "kaM");
|
||||
conv("\u0f40\u0f83", "ka~M");
|
||||
conv("\u0f40\u0f82", "ka~M`");
|
||||
conv("\u0f40\u0f84", "ka?");
|
||||
conv("\u0f40\u0f85\u0f40", "ka&ka");
|
||||
err("\u0f7f");
|
||||
conv("\u0f40\u0f7f", "kaH");
|
||||
conv("\u0f40\u0f7f\u0f72", "kiH");
|
||||
conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
|
||||
conv("\u0f40\u0f7f\u0f7e", "kaHM");
|
||||
conv("\u0f40\u0f7e\u0f7f", "kaMH");
|
||||
conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
|
||||
conv("\u0f04\u0f05", "@#");
|
||||
conv("\u0f04\u0f05\u0f05", "@##");
|
||||
conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen
|
||||
// alone? warn/error otherwise.
|
||||
conv("\u0f05", "#"); // TODO(dchandler): warn or error
|
||||
}
|
||||
}
|
||||
// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
|
||||
// Uni->EWTS->Uni produces the same Uni.
|
||||
|
||||
// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
|
||||
// you can throw in legally to alter boundaries
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue