A reverter that converts Unicode to computer-friendly (but not yet
human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with
non-Tibetan text.
2005-08-01 05:54:20 +00:00 · 2005-08-01 05:54:20 +00:00 · 5788416629
commit 5788416629
parent 00afd75362
13 changed files with 496 additions and 47 deletions
--- a/source/org/thdl/tib/text/reverter/ConverterTest.java
+++ b/source/org/thdl/tib/text/reverter/ConverterTest.java
@ -20,8 +20,9 @@ package org.thdl.tib.text.reverter;

 import junit.framework.TestCase;

-import org.thdl.util.ThdlOptions;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
 import org.thdl.tib.text.ttt.ErrorsAndWarnings;
+import org.thdl.util.ThdlOptions;

 /** Tests the Converter class.
 *
@ -47,9 +48,112 @@ public class ConverterTest extends TestCase {
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

+    /** Asserts that converting s from Unicode to EWTS yields an
+     *  error, i.e. that the converter wrote something into the
+     *  error buffer.  On failure, the unexpected EWTS and the
+     *  pretty-printed input are reported in the failure message.
+     *  @param s the Unicode text that is expected to be rejected */
+    private void err(String s) {
+        StringBuffer errors = new StringBuffer();
+        String ewts = Converter.convertToEwtsForComputers(s, errors);
+        if (errors.length() == 0) {
+            fail("expected error but got EWTS '" + ewts + "' for "
+                 + UnicodeUtils.unicodeStringToPrettyString(s));
+        }
+    }
+
+    /** Stub for Converter.convertToEwtsForHumans tests; only prints a TODO. */
+    private void hconv(String uni, String ewts) {
+        System.out.println("TODO(dchandler): DLC: implement me");
+    }
+
+    /** Tests Converter.convertToEwtsForComputers: uni must convert to
+     *  ewts with an empty error buffer (its text is shown otherwise). */
+    private void conv(String uni, String ewts) {
+        StringBuffer sb = new StringBuffer();
+        String actualEwts = Converter.convertToEwtsForComputers(uni, sb);
+        assertEquals("Expected " + ewts + " but got " + actualEwts + ":\n"
+                     + sb, ewts, actualEwts);
+        assertTrue("Unexpected errors: " + sb, sb.length() == 0);
+    }
+
    public ConverterTest() { }

    public void testUnicodeToEwts() {
-        assertEquals(Converter.convertToEwts("\u0f40", null), "ka");
+        conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");  // trailing U+0F0B -> ' '
+        conv("\u0f40", "ka");
+        // TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
+        // conv("0123456789.\u0f40", "0123456789.ka");
+        conv("\u0f40\u0f7b", "kai");
+        conv("\u0f40\u0f76", "k+r-i");
+        conv("\u0f40\u0020\u0f40", "ka_ka");  // U+0020 -> '_'
+        conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");  // \n, \t, \r kept as-is
+        conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
+        conv("\u0f42\u0f61", "gaya");
+        hconv("\u0f42\u0f61", "g.ya");
+        conv("\u0f42\u0fb1", "g+ya");
+        hconv("\u0f42\u0fb1", "gya");
+        conv("\u0f54\u0f7e", "paM");
+        conv("\u0f54\u0f71\u0f7e", "pAM");
+        conv("\u0f54\u0f7e", "paM");  // repeats the paM case above
+        conv("\u0f54\u0f74\u0f7e", "puM");
+        conv("\u0f54\u0fc6", "p\\u0FC6");
+        conv("\u0f40\u0f72\u0f74", "ku+i");  // bottom-to-top
+        conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i");  // 0f39 first
+        conv("\u0f40\u0f73", "kI");
+        conv("\u0f40\u0f71\u0f72", "kI");
+        conv("\u0f40\u0f72\u0f71", "kI");
+        conv("\u0f40\u0f73\u0f74", "kU+i");
+        err("\u0f48");
+        err("\u0f32\u0f39");
+        err("\u0f47\u0f98");
+        conv("\u0fcc", "\\u0FCC");
+        err("\u0fcd");
+        err("\u0f90");
+        err("\u0f90\u0fc6");
+        conv("\u0f0b\u0fc6", " \\u0FC6");  // ugly but legal...
+        err("\u0f0b\u0f90");
+        err("\u0f0b\u0f74");
+        err("\u0f0b\u0f7f");
+        err("\u0f0b\u0f3e");
+        conv("\u0f32\u0f18", "\\u0F32\\u0F18");
+        conv("\u0f54\u0fa4\u0f90", "p+p+ka");
+        // TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
+        // CCCVs work for this?)
+        if (false) {  // disabled: the desired, not-yet-implemented behavior
+            // 0f39 could go with any of the three, so we give an error:
+            err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
+        } else {
+            // TODO(dchandler): I want an error, not this:
+            conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
+        }
+        conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
+        conv("\u0f55\u0f39", "fa");
+        conv("\u0f55\u0f74\u0f39", "fu");
+        conv("\u0f56\u0f39", "va");
+        conv("\u0f56\u0f74\u0f39", "vu");
+        conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
+        conv("\u0f40\u0f7e", "kaM");
+        conv("\u0f40\u0f83", "ka~M");
+        conv("\u0f40\u0f82", "ka~M`");
+        conv("\u0f40\u0f84", "ka?");
+        conv("\u0f40\u0f85\u0f40", "ka&ka");
+        err("\u0f7f");
+        conv("\u0f40\u0f7f", "kaH");
+        conv("\u0f40\u0f7f\u0f72", "kiH");
+        conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
+        conv("\u0f40\u0f7f\u0f7e", "kaHM");
+        conv("\u0f40\u0f7e\u0f7f", "kaMH");
+        conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
+        conv("\u0f04\u0f05", "@#");
+        conv("\u0f04\u0f05\u0f05", "@##");
+        conv("\u0f04", "@");  // TODO(dchandler): Is this ever seen
+                              // alone?  warn/error otherwise.
+        conv("\u0f05", "#");  // TODO(dchandler): warn or error
    }
 }
+// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
+// Uni->EWTS->Uni produces the same Uni.
+
+// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
+// you can throw in legally to alter boundaries