Jskad/source/org/thdl/tib/text/reverter/ConverterTest.java
dchandler 5788416629 A reverter that converts Unicode to computer-friendly (but not, yet,
human-friendly) EWTS is here in alpha mode.  It probably doesn't deal
well with non-Tibetan.
2005-08-01 05:54:20 +00:00

159 lines
6.2 KiB
Java

/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.reverter;
import junit.framework.TestCase;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.ttt.ErrorsAndWarnings;
import org.thdl.util.ThdlOptions;
/** Tests the Converter class.
*
* @author David Chandler */
public class ConverterTest extends TestCase {
/** Invokes a text UI and runs all this class's tests. */
public static void main(String[] args) {
junit.textui.TestRunner.run(ConverterTest.class);
}
protected void setUp() {
// We don't want to use options.txt:
ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile();
ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.and.error.severities.are.built.in.defaults", "true");
ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.severity.507", "Most");
ErrorsAndWarnings.setupSeverityMap();
// We don't want to load the TM or TMW font files ourselves:
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
ThdlOptions.setUserPreference("thdl.debug", true);
}
/** Asserts that converting s from Unicode to EWTS yields an
* error. */
private void err(String s) {
StringBuffer sb = new StringBuffer();
String ewts = Converter.convertToEwtsForComputers(s, sb);
boolean error = (sb.length() > 0);
if (!error) {
System.out.println("expected error but got EWTS '" + ewts
+ "' for "
+ UnicodeUtils.unicodeStringToPrettyString(s));
}
assertTrue(error);
}
/** Tests Converter.convertToEwtsForHumans. */
private void hconv(String uni, String ewts) {
System.out.println("TODO(dchandler): DLC: implement me");
}
/** Tests Converter.convertToEwtsForComputers. */
private void conv(String uni, String ewts) {
StringBuffer sb = new StringBuffer();
String actualEwts = Converter.convertToEwtsForComputers(uni, sb);
assertEquals("Expected " + ewts + " but got " + actualEwts + ":\n",
ewts, actualEwts);
boolean error = (sb.length() > 0);
assertTrue(!error);
}
public ConverterTest() { }
public void testUnicodeToEwts() {
conv("\u0f56\u0f62\u0f9f\u0f42\u0f66\u0f0b", "bar+tagasa ");
conv("\u0f40", "ka");
// TODO(dchandler): DLC Tibetans use Arabic numerals and English punctuation.
// conv("0123456789.\u0f40", "0123456789.ka");
conv("\u0f40\u0f7b", "kai");
conv("\u0f40\u0f76", "k+r-i");
conv("\u0f40\u0020\u0f40", "ka_ka");
conv("\u0f40\n\u0f40\t\u0f40\r\n", "ka\nka\tka\r\n");
conv("\u0f04\u0f05\u0f40\u0f0c\u00a0\u0f42", "@#ka*_ga");
conv("\u0f42\u0f61", "gaya");
hconv("\u0f42\u0f61", "g.ya");
conv("\u0f42\u0fb1", "g+ya");
hconv("\u0f42\u0fb1", "gya");
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f71\u0f7e", "pAM");
conv("\u0f54\u0f7e", "paM");
conv("\u0f54\u0f74\u0f7e", "puM");
conv("\u0f54\u0fc6", "p\\u0FC6");
conv("\u0f40\u0f72\u0f74", "ku+i"); // bottom-to-top
conv("\u0f40\u0f72\u0f74\u0f39", "k^u+i"); // 0f39 first
conv("\u0f40\u0f73", "kI");
conv("\u0f40\u0f71\u0f72", "kI");
conv("\u0f40\u0f72\u0f71", "kI");
conv("\u0f40\u0f73\u0f74", "kU+i");
err("\u0f48");
err("\u0f32\u0f39");
err("\u0f47\u0f98");
conv("\u0fcc", "\\u0FCC");
err("\u0fcd");
err("\u0f90");
err("\u0f90\u0fc6");
conv("\u0f0b\u0fc6", " \\u0FC6"); // ugly but legal...
err("\u0f0b\u0f90");
err("\u0f0b\u0f74");
err("\u0f0b\u0f7f");
err("\u0f0b\u0f3e");
conv("\u0f32\u0f18", "\\u0F32\\u0F18");
conv("\u0f54\u0fa4\u0f90", "p+p+ka");
// TODO(dchandler): warn("\u0f54\u0fa4\u0f90\u0f39"); (or do
// CCCVs work for this?)
if (false) {
// 0f39 could go with any of the three, so we give an error:
err("\u0f54\u0fa4\u0f90\u0f74\u0f39");
} else {
// TODO(dchandler): I want an error, not this:
conv("\u0f54\u0fa4\u0f90\u0f74\u0f39", "p+p+k^u");
}
conv("\u0f54\u0fa4\u0f90\u0f39", "p+p+k^a");
conv("\u0f55\u0f39", "fa");
conv("\u0f55\u0f74\u0f39", "fu");
conv("\u0f56\u0f39", "va");
conv("\u0f56\u0f74\u0f39", "vu");
conv("\u0f54\u0f39\u0fa4\u0f90", "p^+p+ka");
conv("\u0f40\u0f7e", "kaM");
conv("\u0f40\u0f83", "ka~M");
conv("\u0f40\u0f82", "ka~M`");
conv("\u0f40\u0f84", "ka?");
conv("\u0f40\u0f85\u0f40", "ka&ka");
err("\u0f7f");
conv("\u0f40\u0f7f", "kaH");
conv("\u0f40\u0f7f\u0f72", "kiH");
conv("\u0f40\u0f7f\u0f7f\u0f72\u0f7f", "kiHHH");
conv("\u0f40\u0f7f\u0f7e", "kaHM");
conv("\u0f40\u0f7e\u0f7f", "kaMH");
conv("\u0f40\u0f7f\u0f7e\u0f72", "kiHM");
conv("\u0f04\u0f05", "@#");
conv("\u0f04\u0f05\u0f05", "@##");
conv("\u0f04", "@"); // TODO(dchandler): Is this ever seen
// alone? warn/error otherwise.
conv("\u0f05", "#"); // TODO(dchandler): warn or error
}
}
// TODO(dchandler): DLC: test all these round-trip, i.e. assert that
// Uni->EWTS->Uni produces the same Uni.
// TODO(dchandler): test with ZWSP or joiners or whatever weird crap
// you can throw in legally to alter boundaries