I did this stuff back in August. It's all in support of EWTS->Tibetan

conversion.  The tag 'TODO(DLC)[EWTS->Tibetan]' exists all over the
place.  EWTS->Tibetan isn't here yet; lexing isn't here yet; this is
mainly a refactoring so that the ACIP->Tibetan code can be reused to
do EWTS->Tibetan.

I'm committing this because tests pass (it shouldn't be breaking
anything), because I want a checkpoint, and because the laptop this
sandbox was on isn't my preferred development environment.
This commit is contained in:
dchandler 2005-02-21 01:16:10 +00:00
parent 83f499b7a8
commit 37bf9a736d
26 changed files with 43219 additions and 98 deletions

View file

@ -454,6 +454,16 @@ Contributor(s): ______________________________________.
<param name="my.included.source.file" <param name="my.included.source.file"
value="org/thdl/tib/text/ttt/PackageTest.java"/> value="org/thdl/tib/text/ttt/PackageTest.java"/>
</antcall> </antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/text/ttt/EWTSTest.java"/>
</antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/text/ttt/EWTStibwniniTest.java"/>
</antcall>
<antcall target="our-internal-javac-task"> <antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/> <param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file" <param name="my.included.source.file"

View file

@ -73,10 +73,12 @@
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile <formatter type="xml"/><!-- If not XML, then 'ant -buildfile
build.xml check-report' will fail. --> build.xml check-report' will fail. -->
<sysproperty key="java.awt.headless" value="true"/> <sysproperty key="java.awt.headless" value="true"/>
<test name="org.thdl.tib.text.ttt.EWTSTest"/>
<test name="org.thdl.tib.text.ttt.EWTStibwniniTest"/>
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>
<test name="org.thdl.tib.text.TibetanMachineWebTest"/> <test name="org.thdl.tib.text.TibetanMachineWebTest"/>
<test name="org.thdl.tib.text.ttt.PackageTest"/> <test name="org.thdl.tib.text.ttt.PackageTest"/>
<test name="org.thdl.tib.text.ttt.LotsOfTshegBarsTest"/> <test name="org.thdl.tib.text.ttt.LotsOfTshegBarsTest"/>
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>
<test name="org.thdl.util.RTFFixerInputStreamTest"/> <test name="org.thdl.util.RTFFixerInputStreamTest"/>
<test name="org.thdl.util.ThdlLazyExceptionTest"/> <test name="org.thdl.util.ThdlLazyExceptionTest"/>
<test name="org.thdl.util.TrieTest"/> <test name="org.thdl.util.TrieTest"/>

View file

@ -26,7 +26,11 @@ keyboard.rtf.help.file.4 = Sambhota_keymap_one.rtf
keyboard.ini.file.4 = sambhota_keyboard_1.ini keyboard.ini.file.4 = sambhota_keyboard_1.ini
# This keyboard has many bugs. "So does the Extended Wylie keyboard," # This keyboard has many bugs. "So does the Extended Wylie keyboard,"
# you say, but this one has many many bugs. # you say, but this one has many many bugs. Want to fix it? Don't
# try the .ini file route at all. Instead, build a keyboard using the
# ACIP->TMW conversion technology in the package
# org.thdl.tib.text.ttt.
#
#keyboard.name.for.popup.5 = Asian Classics Input Project (ACIP) -- BUGGY #keyboard.name.for.popup.5 = Asian Classics Input Project (ACIP) -- BUGGY
#keyboard.rtf.help.file.5 = nil #keyboard.rtf.help.file.5 = nil
#keyboard.ini.file.5 = acip_keyboard.ini #keyboard.ini.file.5 = acip_keyboard.ini

View file

@ -32,7 +32,9 @@ import org.apache.commons.jrcs.diff.Revision;
/** /**
* @author David Chandler * @author David Chandler
* *
* Tests {@link org.thdl.tib.input.TibetanConverter} at the unit level. * Tests {@link org.thdl.tib.input.TibetanConverter} at the unit
* level. The name is a misnomer; we test more than just
* TMW.rtf->EWTS conversions.
*/ */
public class TMW_RTF_TO_THDL_WYLIETest extends TestCase { public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
/** /**
@ -52,6 +54,11 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
// We don't want to use options.txt: // We don't want to use options.txt:
ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile(); ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile();
ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.and.error.severities.are.built.in.defaults", "true");
ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.severity.507", "Most");
org.thdl.tib.text.ttt.ErrorsAndWarnings.setupSeverityMap();
// We do want debugging assertions: // We do want debugging assertions:
ThdlOptions.setUserPreference("thdl.debug", true); ThdlOptions.setUserPreference("thdl.debug", true);
} }
@ -77,9 +84,11 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
+ "tib" + File.separator + "tib" + File.separator
+ "input" + File.separator + "input" + File.separator
+ "TMW_RTF_TO_THDL_WYLIE" + testName + ".expected"; + "TMW_RTF_TO_THDL_WYLIE" + testName + ".expected";
assertTrue("The file the converter should've produced doesn't exist", assertTrue("The file the converter should've produced doesn't exist: "
+ actualFile,
new File(actualFile).exists()); new File(actualFile).exists());
assertTrue("The baseline file, the file containing the expected results, doesn't exist", assertTrue("The baseline file, the file containing the expected results, doesn't exist: "
+ expectedFile,
new File(expectedFile).exists()); new File(expectedFile).exists());
Revision rev = JDiff.getDiff(expectedFile, actualFile); Revision rev = JDiff.getDiff(expectedFile, actualFile);
assertTrue("JDiff.getDiff returned null", null != rev); assertTrue("JDiff.getDiff returned null", null != rev);
@ -100,16 +109,17 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
} }
private void helper(String testName, String mode, String extension, int erc) { private void helper(String testName, String inputExtension, String mode,
String extension, int erc, String errorMessageLength) {
String[] args = new String[] { String[] args = new String[] {
"--colors", "--colors",
"no", "no",
"--warning-level", "--warning-level",
"All", "All",
"--acip-to-tibetan-warning-and-error-messages", "--acip-to-tibetan-warning-and-error-messages",
"long", errorMessageLength,
mode, mode,
getTestFileName(testName) getTestFileName(testName, inputExtension)
}; };
boolean fileNotFound = false; boolean fileNotFound = false;
try { try {
@ -126,45 +136,53 @@ public class TMW_RTF_TO_THDL_WYLIETest extends TestCase {
testActualAndExpected(testName + "Result" + extension); testActualAndExpected(testName + "Result" + extension);
} }
private static String getTestFileName(String testName) { private static String getTestFileName(String testName,
String inputExtension) {
return "source" + File.separator return "source" + File.separator
+ "org" + File.separator + "org" + File.separator
+ "thdl" + File.separator + "thdl" + File.separator
+ "tib" + File.separator + "tib" + File.separator
+ "input" + File.separator + "input" + File.separator
// FIXME: one of the files named '.rtf' is really a text + "TMW_RTF_TO_THDL_WYLIE" + testName + inputExtension;
// file:
+ "TMW_RTF_TO_THDL_WYLIE" + testName + ".rtf";
} }
/** Tests the --find-some-non-tmw mode of {@link /** Tests the --find-some-non-tmw mode of {@link
* org.thdl.tib.input.TibetanConverter}. */ * org.thdl.tib.input.TibetanConverter}. */
public void testFindSomeNonTMWMode() { public void testFindSomeNonTMWMode() {
helper("Test1", "--find-some-non-tmw", "FindSome", 1); helper("Test1", ".rtf", "--find-some-non-tmw", "FindSome", 1, "long");
} }
/** Tests the --find-all-non-tmw mode of {@link /** Tests the --find-all-non-tmw mode of {@link
* org.thdl.tib.input.TibetanConverter}. */ * org.thdl.tib.input.TibetanConverter}. */
public void testFindAllNonTMWMode() { public void testFindAllNonTMWMode() {
helper("Test1", "--find-all-non-tmw", "FindAll", 1); helper("Test1", ".rtf", "--find-all-non-tmw", "FindAll", 1, "long");
} }
/** Tests the --to-wylie converter mode of {@link /** Tests the --to-wylie converter mode of {@link
* org.thdl.tib.input.TibetanConverter}. */ * org.thdl.tib.input.TibetanConverter}. */
public void testConverterMode() { public void testToWylieConverterMode() {
helper("Test1", "--to-wylie", "Conversion", 0); helper("Test1", ".rtf", "--to-wylie", "Conversion", 0, "long");
helper("Test2", "--to-wylie", "Conversion", 44); helper("Test2", ".rtf", "--to-wylie", "Conversion", 44, "long");
} }
/** Tests the --to-tibetan-machine, --to-tibetan-machine-web, /** Tests the --to-tibetan-machine, --to-tibetan-machine-web,
* --to-acip, and --acip-to-tmw converter modes of {@link * --to-acip, and --acip-to-tmw converter modes of {@link
* org.thdl.tib.input.TibetanConverter}. */ * org.thdl.tib.input.TibetanConverter}. */
public void testTMConverterMode() { public void testSomeConverters() {
helper("Test1", "--to-tibetan-machine", "TM", 0); /* TODO(DLC)[EWTS->Tibetan]: NOW runs out of memory
helper("Test2", "--to-tibetan-machine", "TM", 0); helper("Test4_aka_TD4222I1.INC", "", "--acip-to-tmw", "TMW", 46,
helper("Test2", "--to-tibetan-machine-web", "TMW", 0); "short"); */
helper("Test2", "--to-acip", "ACIP", 49); helper("Test4_aka_TD4222I1.INC", "", "--acip-to-unicode", "UNI", 46,
helper("Test3", "--acip-to-tmw", "TMW", 0); "short");
helper("Test1", ".rtf", "--to-tibetan-machine", "TM", 0, "long");
helper("Test2", ".rtf", "--to-tibetan-machine", "TM", 0, "long");
helper("Test2", ".rtf", "--to-tibetan-machine-web", "TMW", 0, "long");
helper("Test2", ".rtf", "--to-acip", "ACIP", 49, "long");
helper("Test3", ".acip", "--acip-to-tmw", "TMW", 0, "long");
} }
} }
// TODO(dchandler): put the line 'THIS IS ENGLISH' in
// TMW_RTF_TO_THDL_WYLIETest3.acip (the Test3 input is now read with the
// '.acip' extension); what would that mean? I did this once but didn't
// check it in...

View file

@ -0,0 +1,386 @@
[# \u0F40\u0F00: ]\u0F40\u0F00
[# \u0F40\u0F01: ]\u0F40\u0F01
[# \u0F40\u0F02: ]\u0F40\u0F02
[# \u0F40\u0F03: ]\u0F40\u0F03
[# \u0F40\u0F04: ]\u0F40\u0F04
[# \u0F40\u0F05: ]\u0F40\u0F05
[# \u0F40\u0F06: ]\u0F40\u0F06
[# \u0F40\u0F07: ]\u0F40\u0F07
[# \u0F40\u0F08: ]\u0F40\u0F08
[# \u0F40\u0F09: ]\u0F40\u0F09
[# \u0F40\u0F0A: ]\u0F40\u0F0A
[# \u0F40\u0F0B: ]\u0F40\u0F0B
[# \u0F40\u0F0C: ]\u0F40\u0F0C
[# \u0F40\u0F0D: ]\u0F40\u0F0D
[# \u0F40\u0F0E: ]\u0F40\u0F0E
[# \u0F40\u0F0F: ]\u0F40\u0F0F
[# \u0F40\u0F10: ]\u0F40\u0F10
[# \u0F40\u0F11: ]\u0F40\u0F11
[# \u0F40\u0F12: ]\u0F40\u0F12
[# \u0F40\u0F13: ]\u0F40\u0F13
[# \u0F40\u0F14: ]\u0F40\u0F14
[# \u0F40\u0F15: ]\u0F40\u0F15
[# \u0F40\u0F16: ]\u0F40\u0F16
[# \u0F40\u0F17: ]\u0F40\u0F17
[# \u0F40\u0F18: ]\u0F40\u0F18
[# \u0F40\u0F19: ]\u0F40\u0F19
[# \u0F40\u0F1A: ]\u0F40\u0F1A
[# \u0F40\u0F1B: ]\u0F40\u0F1B
[# \u0F40\u0F1C: ]\u0F40\u0F1C
[# \u0F40\u0F1D: ]\u0F40\u0F1D
[# \u0F40\u0F1E: ]\u0F40\u0F1E
[# \u0F40\u0F1F: ]\u0F40\u0F1F
[# \u0F40\u0F20: ]\u0F40\u0F20
[# \u0F40\u0F21: ]\u0F40\u0F21
[# \u0F40\u0F22: ]\u0F40\u0F22
[# \u0F40\u0F23: ]\u0F40\u0F23
[# \u0F40\u0F24: ]\u0F40\u0F24
[# \u0F40\u0F25: ]\u0F40\u0F25
[# \u0F40\u0F26: ]\u0F40\u0F26
[# \u0F40\u0F27: ]\u0F40\u0F27
[# \u0F40\u0F28: ]\u0F40\u0F28
[# \u0F40\u0F29: ]\u0F40\u0F29
[# \u0F40\u0F2A: ]\u0F40\u0F2A
[# \u0F40\u0F2B: ]\u0F40\u0F2B
[# \u0F40\u0F2C: ]\u0F40\u0F2C
[# \u0F40\u0F2D: ]\u0F40\u0F2D
[# \u0F40\u0F2E: ]\u0F40\u0F2E
[# \u0F40\u0F2F: ]\u0F40\u0F2F
[# \u0F40\u0F30: ]\u0F40\u0F30
[# \u0F40\u0F31: ]\u0F40\u0F31
[# \u0F40\u0F32: ]\u0F40\u0F32
[# \u0F40\u0F33: ]\u0F40\u0F33
[# \u0F40\u0F34: ]\u0F40\u0F34
[# \u0F40\u0F35: ]\u0F40\u0F35
[# \u0F40\u0F36: ]\u0F40\u0F36
[# \u0F40\u0F37: ]\u0F40\u0F37
[# \u0F40\u0F38: ]\u0F40\u0F38
[# \u0F40\u0F39: ]\u0F40\u0F39
[# \u0F40\u0F3A: ]\u0F40\u0F3A
[# \u0F40\u0F3B: ]\u0F40\u0F3B
[# \u0F40\u0F3C: ]\u0F40\u0F3C
[# \u0F40\u0F3D: ]\u0F40\u0F3D
[# \u0F40\u0F3E: ]\u0F40\u0F3E
[# \u0F40\u0F3F: ]\u0F40\u0F3F
[# \u0F40\u0F40: ]\u0F40\u0F40
[# \u0F40\u0F41: ]\u0F40\u0F41
[# \u0F40\u0F42: ]\u0F40\u0F42
[# \u0F40\u0F43: ]\u0F40\u0F43
[# \u0F40\u0F44: ]\u0F40\u0F44
[# \u0F40\u0F45: ]\u0F40\u0F45
[# \u0F40\u0F46: ]\u0F40\u0F46
[# \u0F40\u0F47: ]\u0F40\u0F47
[# \u0F40\u0F49: ]\u0F40\u0F49
[# \u0F40\u0F4A: ]\u0F40\u0F4A
[# \u0F40\u0F4B: ]\u0F40\u0F4B
[# \u0F40\u0F4C: ]\u0F40\u0F4C
[# \u0F40\u0F4D: ]\u0F40\u0F4D
[# \u0F40\u0F4E: ]\u0F40\u0F4E
[# \u0F40\u0F4F: ]\u0F40\u0F4F
[# \u0F40\u0F50: ]\u0F40\u0F50
[# \u0F40\u0F51: ]\u0F40\u0F51
[# \u0F40\u0F52: ]\u0F40\u0F52
[# \u0F40\u0F53: ]\u0F40\u0F53
[# \u0F40\u0F54: ]\u0F40\u0F54
[# \u0F40\u0F55: ]\u0F40\u0F55
[# \u0F40\u0F56: ]\u0F40\u0F56
[# \u0F40\u0F57: ]\u0F40\u0F57
[# \u0F40\u0F58: ]\u0F40\u0F58
[# \u0F40\u0F59: ]\u0F40\u0F59
[# \u0F40\u0F5A: ]\u0F40\u0F5A
[# \u0F40\u0F5B: ]\u0F40\u0F5B
[# \u0F40\u0F5C: ]\u0F40\u0F5C
[# \u0F40\u0F5D: ]\u0F40\u0F5D
[# \u0F40\u0F5E: ]\u0F40\u0F5E
[# \u0F40\u0F5F: ]\u0F40\u0F5F
[# \u0F40\u0F60: ]\u0F40\u0F60
[# \u0F40\u0F61: ]\u0F40\u0F61
[# \u0F40\u0F62: ]\u0F40\u0F62
[# \u0F40\u0F63: ]\u0F40\u0F63
[# \u0F40\u0F64: ]\u0F40\u0F64
[# \u0F40\u0F65: ]\u0F40\u0F65
[# \u0F40\u0F66: ]\u0F40\u0F66
[# \u0F40\u0F67: ]\u0F40\u0F67
[# \u0F40\u0F68: ]\u0F40\u0F68
[# \u0F40\u0F69: ]\u0F40\u0F69
[# \u0F40\u0F6A: ]\u0F40\u0F6A
[# \u0F40\u0F71: ]\u0F40\u0F71
[# \u0F40\u0F72: ]\u0F40\u0F72
[# \u0F40\u0F73: ]\u0F40\u0F73
[# \u0F40\u0F74: ]\u0F40\u0F74
[# \u0F40\u0F75: ]\u0F40\u0F75
[# \u0F40\u0F76: ]\u0F40\u0F76
[# \u0F40\u0F77: ]\u0F40\u0F77
[# \u0F40\u0F78: ]\u0F40\u0F78
[# \u0F40\u0F79: ]\u0F40\u0F79
[# \u0F40\u0F7A: ]\u0F40\u0F7A
[# \u0F40\u0F7B: ]\u0F40\u0F7B
[# \u0F40\u0F7C: ]\u0F40\u0F7C
[# \u0F40\u0F7D: ]\u0F40\u0F7D
[# \u0F40\u0F7E: ]\u0F40\u0F7E
[# \u0F40\u0F7F: ]\u0F40\u0F7F
[# \u0F40\u0F80: ]\u0F40\u0F80
[# \u0F40\u0F81: ]\u0F40\u0F81
[# \u0F40\u0F82: ]\u0F40\u0F82
[# \u0F40\u0F83: ]\u0F40\u0F83
[# \u0F40\u0F84: ]\u0F40\u0F84
[# \u0F40\u0F85: ]\u0F40\u0F85
[# \u0F40\u0F86: ]\u0F40\u0F86
[# \u0F40\u0F87: ]\u0F40\u0F87
[# \u0F40\u0F88: ]\u0F40\u0F88
[# \u0F40\u0F89: ]\u0F40\u0F89
[# \u0F40\u0F8A: ]\u0F40\u0F8A
[# \u0F40\u0F8B: ]\u0F40\u0F8B
[# \u0F40\u0F90: ]\u0F40\u0F90
[# \u0F40\u0F91: ]\u0F40\u0F91
[# \u0F40\u0F92: ]\u0F40\u0F92
[# \u0F40\u0F93: ]\u0F40\u0F93
[# \u0F40\u0F94: ]\u0F40\u0F94
[# \u0F40\u0F95: ]\u0F40\u0F95
[# \u0F40\u0F96: ]\u0F40\u0F96
[# \u0F40\u0F97: ]\u0F40\u0F97
[# \u0F40\u0F99: ]\u0F40\u0F99
[# \u0F40\u0F9A: ]\u0F40\u0F9A
[# \u0F40\u0F9B: ]\u0F40\u0F9B
[# \u0F40\u0F9C: ]\u0F40\u0F9C
[# \u0F40\u0F9D: ]\u0F40\u0F9D
[# \u0F40\u0F9E: ]\u0F40\u0F9E
[# \u0F40\u0F9F: ]\u0F40\u0F9F
[# \u0F40\u0FA0: ]\u0F40\u0FA0
[# \u0F40\u0FA1: ]\u0F40\u0FA1
[# \u0F40\u0FA2: ]\u0F40\u0FA2
[# \u0F40\u0FA3: ]\u0F40\u0FA3
[# \u0F40\u0FA4: ]\u0F40\u0FA4
[# \u0F40\u0FA5: ]\u0F40\u0FA5
[# \u0F40\u0FA6: ]\u0F40\u0FA6
[# \u0F40\u0FA7: ]\u0F40\u0FA7
[# \u0F40\u0FA8: ]\u0F40\u0FA8
[# \u0F40\u0FA9: ]\u0F40\u0FA9
[# \u0F40\u0FAA: ]\u0F40\u0FAA
[# \u0F40\u0FAB: ]\u0F40\u0FAB
[# \u0F40\u0FAC: ]\u0F40\u0FAC
[# \u0F40\u0FAD: ]\u0F40\u0FAD
[# \u0F40\u0FAE: ]\u0F40\u0FAE
[# \u0F40\u0FAF: ]\u0F40\u0FAF
[# \u0F40\u0FB0: ]\u0F40\u0FB0
[# \u0F40\u0FB1: ]\u0F40\u0FB1
[# \u0F40\u0FB2: ]\u0F40\u0FB2
[# \u0F40\u0FB3: ]\u0F40\u0FB3
[# \u0F40\u0FB4: ]\u0F40\u0FB4
[# \u0F40\u0FB5: ]\u0F40\u0FB5
[# \u0F40\u0FB6: ]\u0F40\u0FB6
[# \u0F40\u0FB7: ]\u0F40\u0FB7
[# \u0F40\u0FB8: ]\u0F40\u0FB8
[# \u0F40\u0FB9: ]\u0F40\u0FB9
[# \u0F40\u0FBA: ]\u0F40\u0FBA
[# \u0F40\u0FBB: ]\u0F40\u0FBB
[# \u0F40\u0FBC: ]\u0F40\u0FBC
[# \u0F40\u0FBE: ]\u0F40\u0FBE
[# \u0F40\u0FBF: ]\u0F40\u0FBF
[# \u0F40\u0FC0: ]\u0F40\u0FC0
[# \u0F40\u0FC1: ]\u0F40\u0FC1
[# \u0F40\u0FC2: ]\u0F40\u0FC2
[# \u0F40\u0FC3: ]\u0F40\u0FC3
[# \u0F40\u0FC4: ]\u0F40\u0FC4
[# \u0F40\u0FC5: ]\u0F40\u0FC5
[# \u0F40\u0FC6: ]\u0F40\u0FC6
[# \u0F40\u0FC7: ]\u0F40\u0FC7
[# \u0F40\u0FC8: ]\u0F40\u0FC8
[# \u0F40\u0FC9: ]\u0F40\u0FC9
[# \u0F40\u0FCA: ]\u0F40\u0FCA
[# \u0F40\u0FCB: ]\u0F40\u0FCB
[# \u0F40\u0FCC: ]\u0F40\u0FCC
[# \u0F40\u0FCF: ]\u0F40\u0FCF

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,6 @@
/* /*
The contents of this file are subject to the THDL Open Community License
// give B+DE to be very friendly to machines.
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/). (http://www.thdl.org/).
@ -202,7 +203,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
/** comma-delimited list of supported non-Tibetan consonants, such /** comma-delimited list of supported non-Tibetan consonants, such
* as Sanskrit consonants: */ * as Sanskrit consonants: */
private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit. private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit. // TODO(DLC)[EWTS->Tibetan]: now are v and f in EWTS?
= "T,Th,D,N,Sh,v,f"; = "T,Th,D,N,Sh,v,f";
/** comma-delimited list of supported numbers (superscribed, /** comma-delimited list of supported numbers (superscribed,

View file

@ -137,6 +137,7 @@ public class ACIPConverter {
* prefix rules in another * prefix rules in another
* @throws IOException if we cannot write to out * @throws IOException if we cannot write to out
*/ */
// TODO(DLC)[EWTS->Tibetan]: misnamed source file, this is TConverter.java nowadays
public static boolean convertToTMW(ArrayList scan, public static boolean convertToTMW(ArrayList scan,
OutputStream out, OutputStream out,
StringBuffer errors, StringBuffer errors,
@ -673,7 +674,7 @@ public class ACIPConverter {
} }
} }
} }
} } // TODO(DLC)[EWTS->Tibetan]: change this to have a "parse" phase that puts out error messages like 142 and figures out what a space means. This is a very long function that is difficult to maintain, and we want EWTS->Tibetan to be clean.
} else if (stype == TString.START_PAREN) { } else if (stype == TString.START_PAREN) {
if (null != writer) if (null != writer)
writer.write("[ERROR " writer.write("[ERROR "

View file

@ -30,6 +30,8 @@ import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.TibTextUtils; import org.thdl.tib.text.TibTextUtils;
// TODO(DLC)[EWTS->Tibetan]: this and ACIPTraits -- unify?
/** Canonizes some facts regarding the ACIP transcription system. /** Canonizes some facts regarding the ACIP transcription system.
* @author David Chandler */ * @author David Chandler */
public class ACIPRules { public class ACIPRules {
@ -37,11 +39,11 @@ public class ACIPRules {
* three. */ * three. */
public static int MAX_CONSONANT_LENGTH = 3; public static int MAX_CONSONANT_LENGTH = 3;
/** {'EEm:}, the longest "vowel", has 5 characters, so this is /** {'EEm:}, the longest wowel, has 5 characters, so this is
* five. */ * five. */
public static int MAX_VOWEL_LENGTH = 5; public static int MAX_WOWEL_LENGTH = 5;
/** For O(1) {@link #isVowel(String)} calls. */ /** For O(1) {@link #isWowel(String)} calls. */
private static HashSet acipVowels = null; private static HashSet acipVowels = null;
private static String[][] baseVowels = new String[][] { private static String[][] baseVowels = new String[][] {
@ -58,10 +60,10 @@ public class ACIPRules {
{ "i", "-i", "A-i" } { "i", "-i", "A-i" }
}; };
/** Returns true if and only if s is an ACIP "vowel". You can't /** Returns true if and only if s is an ACIP wowel. You can't
* just call this any time -- A is a consonant and a vowel in * just call this any time -- A is both a consonant and a vowel
* ACIP, so you have to call this in the right context. */ * in ACIP, so you have to call this in the right context. */
public static boolean isVowel(String s) { public static boolean isWowel(String s) {
if (null == acipVowels) { if (null == acipVowels) {
acipVowels = new HashSet(baseVowels.length * 8); acipVowels = new HashSet(baseVowels.length * 8);
for (int i = 0; i < baseVowels.length; i++) { for (int i = 0; i < baseVowels.length; i++) {

View file

@ -0,0 +1,57 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003-2004 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
/** Adapter that exposes the ACIP transliteration rules through the
 *  {@link TTraits} interface.  This is a singleton: obtain the sole
 *  instance via {@link #instance()}.  It is meant to hold every trait
 *  that makes ACIP transliteration different from other (say, EWTS)
 *  transliterations, but, due to laziness and ignorance, it probably
 *  does not hold them all. */
final class ACIPTraits implements TTraits {
    /** The sole instance, created on the first call to {@link
     *  #instance()} and null until then. */
    private static ACIPTraits singleton = null;

    /** Private so that {@link #instance()} is the only way to get an
     *  instance. */
    private ACIPTraits() { }

    /** Returns the singleton instance of this class, creating it
     *  first if it does not exist yet. */
    public static ACIPTraits instance() {
        if (singleton != null) {
            return singleton;
        }
        singleton = new ACIPTraits();
        return singleton;
    }

    /** Returns the ACIP disambiguator as a string, which is "-". */
    public String disambiguator() {
        return "-";
    }

    /** Returns the ACIP disambiguator as a character, which is
     *  '-'. */
    public char disambiguatorChar() {
        return '-';
    }

    /** Returns {@link ACIPRules#MAX_CONSONANT_LENGTH}. */
    public int maxConsonantLength() {
        return ACIPRules.MAX_CONSONANT_LENGTH;
    }

    /** Returns {@link ACIPRules#MAX_WOWEL_LENGTH}. */
    public int maxWowelLength() {
        return ACIPRules.MAX_WOWEL_LENGTH;
    }

    /** Delegates to {@link ACIPRules#isConsonant(String)}. */
    public boolean isConsonant(String s) {
        return ACIPRules.isConsonant(s);
    }

    /** Delegates to {@link ACIPRules#isWowel(String)}. */
    public boolean isWowel(String s) {
        return ACIPRules.isWowel(s);
    }

    /** Returns true if and only if the left part of p is "A" and p
     *  has no right part (its right part is null).
     *  NOTE(review): presumably this flags a bare "A" that has
     *  nothing following it -- confirm against the callers. */
    public boolean hasSimpleError(TPair p) {
        if (!"A".equals(p.getLeft())) {
            return false;
        }
        return null == p.getRight();
    }
}

View file

@ -0,0 +1,805 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2004 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import org.thdl.util.ThdlOptions;
import java.util.ArrayList;
import junit.framework.TestCase;
/** Tests this package's ability to understand EWTS and turn it into
* the appropriate TMW or Unicode.
*
* @author David Chandler */
public class EWTSTest extends TestCase {
/** Command-line entry point: hands all of this class's tests to
 *  JUnit's text-mode runner.
 *  @param args ignored */
public static void main(String[] args) {
    junit.framework.TestSuite allTests
        = new junit.framework.TestSuite(EWTSTest.class);
    junit.textui.TestRunner.run(allTests);
}
/** Puts the THDL options into a known state before each test:
 *  options.txt is bypassed, the warning/error severities are
 *  configured and the severity map is set up, and the flags for
 *  relying on system fonts and for debugging are switched on. */
protected void setUp() {
    // We don't want to use options.txt:
    ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile();

    // Severity configuration.  These are set before the severity
    // map is built; presumably setupSeverityMap() reads them --
    // TODO confirm.
    final String builtInSeveritiesKey
        = "thdl.acip.to.tibetan.warning.and.error.severities.are.built.in.defaults";
    final String warning507Key
        = "thdl.acip.to.tibetan.warning.severity.507";
    ThdlOptions.setUserPreference(builtInSeveritiesKey, "true");
    ThdlOptions.setUserPreference(warning507Key, "Most");
    ErrorsAndWarnings.setupSeverityMap();

    // Boolean switches, all turned on.  The first two are there
    // because we don't want to load the TM or TMW font files
    // ourselves; the last turns on debugging.
    final String[] enabledFlags = {
        "thdl.rely.on.system.tmw.fonts",
        "thdl.rely.on.system.tm.fonts",
        "thdl.debug"
    };
    for (int i = 0; i < enabledFlags.length; i++) {
        ThdlOptions.setUserPreference(enabledFlags[i], true);
    }
}
/** Creates the test case.  Does nothing beyond the superclass's
 *  no-argument construction. */
public EWTSTest() {
    super();
}
/** Causes a JUnit test case failure unless the EWTS document ewts
* converts to the unicode expectedUnicode.
*
* <p>NOTE: this is still an unimplemented stub.  Its body is empty,
* so every call returns without checking anything, and any test that
* relies solely on this method passes vacuously until the TODO below
* is done.
*
* @param ewts the EWTS transliteration to convert
* @param expectedUnicode the Unicode text the conversion should
* produce */
static void ewts2uni_test(String ewts, String expectedUnicode) {
// TODO(DLC)[EWTS->Tibetan]: NOW! Implement me.
}
/** Causes a JUnit test case failure iff the EWTS document ewts is
* legal EWTS transliteration.  In other words, the caller is
* asserting that ewts is erroneous.
*
* <p>NOTE: this is still an unimplemented stub.  Its body is empty,
* so no input can currently cause a failure.
*
* @param ewts the EWTS transliteration that is expected to be
* illegal */
static void assert_EWTS_error(String ewts) {
// TODO(DLC)[EWTS->Tibetan]: NOW! Implement me.
}
/** Miscellaneous tests of EWTS->Unicode conversion: the empty
 *  document, digits followed by escaped combining marks, the
 *  fixed-form letters (R, +W, +Y, +R) versus the ordinary subjoined
 *  letters (+w, +y, +r), and the reversed-i forms on r, l, gr and
 *  gl.  Every check goes through {@link
 *  #ewts2uni_test(String,String)}. */
public void test__EWTS__miscellany() {
    ewts2uni_test("", "");
    ewts2uni_test("0\\u0f19", "\u0f20\u0f19");
    ewts2uni_test("0\\u0f18", "\u0f20\u0f18");
    ewts2uni_test("0\\u0f3e", "\u0f20\u0f3e"); // TODO(DLC)[EWTS->Tibetan]: test ewts->tmw
    ewts2uni_test("0\\u0f3f", "\u0f20\u0f3f"); // TODO(DLC)[EWTS->Tibetan]: test ewts->tmw
    ewts2uni_test("R", "\u0f6A");
    ewts2uni_test("Ra", "\u0f6A");
    ewts2uni_test("R+ka", "\u0F6A\u0f90");
    ewts2uni_test("k+Wa", "\u0f40\u0FBA");
    ewts2uni_test("k+Ya", "\u0f40\u0FBB");
    ewts2uni_test("k+Ra", "\u0f40\u0FBC");
    ewts2uni_test("k+wa", "\u0f40\u0Fad");
    // Subjoined ya is U+0FB1.  U+0FB3, which this row used to
    // expect, is subjoined la.
    ewts2uni_test("k+ya", "\u0f40\u0Fb1");
    ewts2uni_test("k+ra", "\u0f40\u0Fb2");
    ewts2uni_test("r-I", "\u0f62\u0f81");
    ewts2uni_test("l-I", "\u0f63\u0f81");
    ewts2uni_test("r-i", "\u0f62\u0f80");
    ewts2uni_test("l-i", "\u0f63\u0f80");
    ewts2uni_test("gr-i", "\u0f42\u0f76"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb2\u0f80"
    ewts2uni_test("gr-I", "\u0f42\u0f77"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb2\u0f81"
    ewts2uni_test("gl-i", "\u0f42\u0f78"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb3\u0f80"
    ewts2uni_test("gl-I", "\u0f42\u0f79"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb3\u0f81"
}
/** Tests that our implementation of EWTS's wowels is correct,
 *  mostly by testing that the Unicode generated for a single wowel
 *  or set of wowels atop achen (U+0F68) is correct.  Each row of
 *  the table pairs an EWTS input with its expected Unicode; the
 *  rows are checked in order. */
public void test__EWTS__wowels_on_achen() {
    final String[][] cases = {
        { "A", "\u0f68\u0f71" },
        { "i", "\u0f68\u0f72" },
        { "I", "\u0f68\u0f73" },
        { "u", "\u0f68\u0f74" },
        { "U", "\u0f68\u0f75" },
        { "a+r-i", "\u0f68\u0f76" },
        { "a+r-I", "\u0f68\u0f77" },
        { "a+l-i", "\u0f68\u0f78" },
        { "a+l-I", "\u0f68\u0f79" },
        { "e", "\u0f68\u0f7a" },
        { "ai", "\u0f68\u0f7b" },
        { "o", "\u0f68\u0f7c" },
        { "au", "\u0f68\u0f7d" },
        { "aM", "\u0f68\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "aH", "\u0f68\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "-i", "\u0f68\u0f80" },
        { "-I", "\u0f68\u0f81" },
        { "a~M`", "\u0f68\u0f82" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "a~M", "\u0f68\u0f83" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "a?", "\u0f68\u0f84" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "a\\u0f86", "\u0f68\u0f86" },
        { "a\\U0f86", "\u0f68\u0f86" },
        { "a\\U0F86", "\u0f68\u0f86" },
        { "a\\u0F86", "\u0f68\u0f86" },
        // NOTE(review): each of the next two inputs appears twice;
        // possibly uppercase-U variants were intended -- confirm.
        { "a\\u00000f86", "\u0f68\u0f86" },
        { "a\\u00000f86", "\u0f68\u0f86" },
        { "a\\u00000F86", "\u0f68\u0f86" },
        { "a\\u00000F86", "\u0f68\u0f86" },
        { "a\\u0f87", "\u0f68\u0f87" },
        { "aMH", "\u0f68\u0f7e\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "aHM", "\u0f68\u0f7f\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say

        // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is
        // the same as I and o+o is the same as au.
        { "A+i", "\u0f68\u0f73" },
        { "o+o", "\u0f68\u0f7d" },
        { "e+e", "\u0f68\u0f7b" },
        { "e+e+e", "\u0f68\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e", "\u0f68\u0f7b\u0f7b" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e+e", "\u0f68\u0f7b\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "o+e", "\u0f68\u0f7c\u0f7a" },
        { "u+A+i+o+e", "\u0f68\u0f74\u0f72\u0f7c\u0f7a" },
        { "u+A+i+o+eHM", "\u0f68\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e" },
        { "u+A", "\u0f68\u0f75" },
        { "a", "\u0f68" }
    };
    for (int i = 0; i < cases.length; i++) {
        ewts2uni_test(cases[i][0], cases[i][1]);
    }
}
/** Tests that our implementation of EWTS's wowels is correct,
 *  mostly by testing that the Unicode generated for a single wowel
 *  or set of wowels atop ka (U+0F40) is correct.  Each row of the
 *  table pairs an EWTS input with its expected Unicode; the rows
 *  are checked in order. */
public void test__EWTS__wowels_on_ka() {
    final String[][] cases = {
        { "kA", "\u0f40\u0f71" },
        { "ki", "\u0f40\u0f72" },
        { "kI", "\u0f40\u0f73" },
        { "ku", "\u0f40\u0f74" },
        { "kU", "\u0f40\u0f75" },
        { "ka+r-i", "\u0f40\u0f76" },
        { "ka+r-I", "\u0f40\u0f77" },
        { "ka+l-i", "\u0f40\u0f78" },
        { "ka+l-I", "\u0f40\u0f79" },
        { "ke", "\u0f40\u0f7a" },
        { "kai", "\u0f40\u0f7b" },
        { "ko", "\u0f40\u0f7c" },
        { "kau", "\u0f40\u0f7d" },
        { "kaM", "\u0f40\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "kaH", "\u0f40\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "k-i", "\u0f40\u0f80" },
        { "k-I", "\u0f40\u0f81" },
        { "ka~M`", "\u0f40\u0f82" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "ka~M", "\u0f40\u0f83" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "ka?", "\u0f40\u0f84" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "ka\\u0f86", "\u0f40\u0f86" },
        { "ka\\U0f86", "\u0f40\u0f86" },
        { "ka\\U0F86", "\u0f40\u0f86" },
        { "ka\\u0F86", "\u0f40\u0f86" },
        // NOTE(review): each of the next two inputs appears twice;
        // possibly uppercase-U variants were intended -- confirm.
        { "ka\\u00000f86", "\u0f40\u0f86" },
        { "ka\\u00000f86", "\u0f40\u0f86" },
        { "ka\\u00000F86", "\u0f40\u0f86" },
        { "ka\\u00000F86", "\u0f40\u0f86" },
        { "ka\\u0f87", "\u0f40\u0f87" },
        { "kaMH", "\u0f40\u0f7e\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "kaHM", "\u0f40\u0f7f\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say

        // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is
        // the same as I and o+o is the same as au.
        { "kA+i", "\u0f40\u0f73" },
        { "ko+o", "\u0f40\u0f7d" },
        { "ke+e", "\u0f40\u0f7b" },
        { "ke+e+e", "\u0f40\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "ke+e+e+e", "\u0f40\u0f7b\u0f7b" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "ke+e+e+e+e", "\u0f40\u0f7b\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "ko+e", "\u0f40\u0f7c\u0f7a" },
        { "ku+A+i+o+e", "\u0f40\u0f74\u0f72\u0f7c\u0f7a" },
        { "ku+A+i+o+eHM", "\u0f40\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e" },
        { "ku+A", "\u0f40\u0f75" },
        { "k", "\u0f40" },
        { "ka", "\u0f40" }
    };
    for (int i = 0; i < cases.length; i++) {
        ewts2uni_test(cases[i][0], cases[i][1]);
    }
}
/** Tests that our implementation of EWTS's wowels is correct,
 *  mostly by testing that the Unicode generated for a single wowel
 *  or set of wowels atop achung (U+0F60) is correct.  Each row of
 *  the table pairs an EWTS input with its expected Unicode; the
 *  rows are checked in order. */
public void test__EWTS__wowels_on_achung() {
    final String[][] cases = {
        { "'A", "\u0f60\u0f71" },
        { "'i", "\u0f60\u0f72" },
        { "'I", "\u0f60\u0f73" },
        { "'u", "\u0f60\u0f74" },
        { "'U", "\u0f60\u0f75" },
        { "'a+r-i", "\u0f60\u0f76" },
        { "'a+r-I", "\u0f60\u0f77" },
        { "'a+l-i", "\u0f60\u0f78" },
        { "'a+l-I", "\u0f60\u0f79" },
        { "'e", "\u0f60\u0f7a" },
        { "'ai", "\u0f60\u0f7b" },
        { "'o", "\u0f60\u0f7c" },
        { "'au", "\u0f60\u0f7d" },
        { "'aM", "\u0f60\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "'aH", "\u0f60\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "'-i", "\u0f60\u0f80" },
        { "'-I", "\u0f60\u0f81" },
        { "'a~M`", "\u0f60\u0f82" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "'a~M", "\u0f60\u0f83" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "'a?", "\u0f60\u0f84" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "'a\\u0f86", "\u0f60\u0f86" },
        { "'a\\U0f86", "\u0f60\u0f86" },
        { "'a\\U0F86", "\u0f60\u0f86" },
        { "'a\\u0F86", "\u0f60\u0f86" },
        // NOTE(review): each of the next two inputs appears twice;
        // possibly uppercase-U variants were intended -- confirm.
        { "'a\\u00000f86", "\u0f60\u0f86" },
        { "'a\\u00000f86", "\u0f60\u0f86" },
        { "'a\\u00000F86", "\u0f60\u0f86" },
        { "'a\\u00000F86", "\u0f60\u0f86" },
        { "'a\\u0f87", "\u0f60\u0f87" },
        { "'aMH", "\u0f60\u0f7e\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say
        { "'aHM", "\u0f60\u0f7f\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: Than needs to say

        // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is
        // the same as I and o+o is the same as au.
        { "'A+i", "\u0f60\u0f73" },
        { "'o+o", "\u0f60\u0f7d" },
        { "'e+e", "\u0f60\u0f7b" },
        { "'e+e+e", "\u0f60\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "'e+e+e+e", "\u0f60\u0f7b\u0f7b" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "'e+e+e+e+e", "\u0f60\u0f7b\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "'o+e", "\u0f60\u0f7c\u0f7a" },
        { "'u+A+i+o+e", "\u0f60\u0f74\u0f72\u0f7c\u0f7a" },
        { "'u+A+i+o+eHM", "\u0f60\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e" },
        { "'u+A", "\u0f60\u0f75" },
        { "'", "\u0f60" },
        { "'a", "\u0f60" }
    };
    for (int i = 0; i < cases.length; i++) {
        ewts2uni_test(cases[i][0], cases[i][1]);
    }
}
/** Checks our handling of EWTS wowels when they sit on k+Sh
 *  (U+0F69).  Each table row pairs the EWTS written after "k+Sh"
 *  with the Unicode expected after U+0F69; the rows are checked in
 *  order via ewts2uni_test. */
public void test__EWTS__wowels_on_kSh() {
    final String ewtsBase = "k+Sh";
    final String uniBase = "\u0f69";
    final String[][] table = {
        // { EWTS following the stack, Unicode following the stack }
        { "A", "\u0f71" },
        { "i", "\u0f72" },
        { "I", "\u0f73" },
        { "u", "\u0f74" },
        { "U", "\u0f75" },
        { "a+r-i", "\u0f76" },
        { "a+r-I", "\u0f77" },
        { "a+l-i", "\u0f78" },
        { "a+l-I", "\u0f79" },
        { "e", "\u0f7a" },
        { "ai", "\u0f7b" },
        { "o", "\u0f7c" },
        { "au", "\u0f7d" },
        { "aM", "\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "aH", "\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "-i", "\u0f80" },
        { "-I", "\u0f81" },
        { "a~M`", "\u0f82" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "a~M", "\u0f83" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "a?", "\u0f84" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        // Explicit code point escapes in assorted spellings.
        // NOTE(review): each eight-digit spelling occurs twice; the
        // repeats were perhaps meant to use an upper-case 'U' --
        // confirm.
        { "a\\u0f86", "\u0f86" },
        { "a\\U0f86", "\u0f86" },
        { "a\\U0F86", "\u0f86" },
        { "a\\u0F86", "\u0f86" },
        { "a\\u00000f86", "\u0f86" },
        { "a\\u00000f86", "\u0f86" },
        { "a\\u00000F86", "\u0f86" },
        { "a\\u00000F86", "\u0f86" },
        { "a\\u0f87", "\u0f87" },
        { "aMH", "\u0f7e\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "aHM", "\u0f7f\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is
        // the same as I and o+o is the same as au.
        { "A+i", "\u0f73" },
        { "o+o", "\u0f7d" },
        { "e+e", "\u0f7b" },
        { "e+e+e", "\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e", "\u0f7b\u0f7b" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e+e", "\u0f7b\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "o+e", "\u0f7c\u0f7a" },
        { "u+A+i+o+e", "\u0f74\u0f72\u0f7c\u0f7a" },
        { "u+A+i+o+eHM", "\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e" },
        { "u+A", "\u0f75" },
        // The bare stack, and the stack with an explicit 'a':
        { "", "" },
        { "a", "" },
    };
    for (int k = 0; k < table.length; k++) {
        final String ewts = ewtsBase + table[k][0];
        final String expected = uniBase + table[k][1];
        ewts2uni_test(ewts, expected);
    }
}
/** Checks our handling of EWTS wowels when they sit on phyw
 *  (U+0F55,0FB1,0FAD).  Each table row pairs the EWTS written after
 *  "phyw" with the Unicode expected after those three code points;
 *  the rows are checked in order via ewts2uni_test. */
public void test__EWTS__wowels_on_phyw() {
    final String translitStack = "phyw";
    final String unicodeStack = "\u0f55\u0fb1\u0fad";
    final String[][] expectations = {
        // { EWTS following the stack, Unicode following the stack }
        { "A", "\u0f71" },
        { "i", "\u0f72" },
        { "I", "\u0f73" },
        { "u", "\u0f74" },
        { "U", "\u0f75" },
        { "a+r-i", "\u0f76" },
        { "a+r-I", "\u0f77" },
        { "a+l-i", "\u0f78" },
        { "a+l-I", "\u0f79" },
        { "e", "\u0f7a" },
        { "ai", "\u0f7b" },
        { "o", "\u0f7c" },
        { "au", "\u0f7d" },
        { "aM", "\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "aH", "\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "-i", "\u0f80" },
        { "-I", "\u0f81" },
        { "a~M`", "\u0f82" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "a~M", "\u0f83" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "a?", "\u0f84" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        // Explicit code point escapes in assorted spellings.
        // NOTE(review): each eight-digit spelling occurs twice; the
        // repeats were perhaps meant to use an upper-case 'U' --
        // confirm.
        { "a\\u0f86", "\u0f86" },
        { "a\\U0f86", "\u0f86" },
        { "a\\U0F86", "\u0f86" },
        { "a\\u0F86", "\u0f86" },
        { "a\\u00000f86", "\u0f86" },
        { "a\\u00000f86", "\u0f86" },
        { "a\\u00000F86", "\u0f86" },
        { "a\\u00000F86", "\u0f86" },
        { "a\\u0f87", "\u0f87" },
        { "aMH", "\u0f7e\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "aHM", "\u0f7f\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is
        // the same as I and o+o is the same as au.
        { "A+i", "\u0f73" },
        { "o+o", "\u0f7d" },
        { "e+e", "\u0f7b" },
        { "e+e+e", "\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e", "\u0f7b\u0f7b" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e+e", "\u0f7b\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "o+e", "\u0f7c\u0f7a" },
        { "u+A+i+o+e", "\u0f74\u0f72\u0f7c\u0f7a" },
        { "u+A+i+o+eHM", "\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e" },
        { "u+A", "\u0f75" },
        // The bare stack, and the stack with an explicit 'a':
        { "", "" },
        { "a", "" },
    };
    int idx = 0;
    while (idx < expectations.length) {
        ewts2uni_test(translitStack + expectations[idx][0],
                      unicodeStack + expectations[idx][1]);
        idx++;
    }
}
/** Checks our handling of EWTS wowels when they sit on the
 *  deliberately absurd stack k+j+j+k+k+j
 *  (U+0F40,U+0F97,U+0F97,U+0F90,U+0F90,U+0F97).  Each table row
 *  pairs the EWTS written after that stack with the Unicode expected
 *  after the stack's six code points; the rows are checked in order
 *  via ewts2uni_test. */
public void test__EWTS__wowels_on_kjjkkj() {
    final String absurdStack = "k+j+j+k+k+j";
    final String absurdStackUnicode = "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97";
    final String[][] rows = {
        // { EWTS following the stack, Unicode following the stack }
        { "A", "\u0f71" },
        { "i", "\u0f72" },
        { "I", "\u0f73" },
        { "u", "\u0f74" },
        { "U", "\u0f75" },
        { "a+r-i", "\u0f76" },
        { "a+r-I", "\u0f77" },
        { "a+l-i", "\u0f78" },
        { "a+l-I", "\u0f79" },
        { "e", "\u0f7a" },
        { "ai", "\u0f7b" },
        { "o", "\u0f7c" },
        { "au", "\u0f7d" },
        { "aM", "\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "aH", "\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "-i", "\u0f80" },
        { "-I", "\u0f81" },
        { "a~M`", "\u0f82" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "a~M", "\u0f83" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "a?", "\u0f84" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        // Explicit code point escapes in assorted spellings.
        // NOTE(review): each eight-digit spelling occurs twice; the
        // repeats were perhaps meant to use an upper-case 'U' --
        // confirm.
        { "a\\u0f86", "\u0f86" },
        { "a\\U0f86", "\u0f86" },
        { "a\\U0F86", "\u0f86" },
        { "a\\u0F86", "\u0f86" },
        { "a\\u00000f86", "\u0f86" },
        { "a\\u00000f86", "\u0f86" },
        { "a\\u00000F86", "\u0f86" },
        { "a\\u00000F86", "\u0f86" },
        { "a\\u0f87", "\u0f87" },
        { "aMH", "\u0f7e\u0f7f" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        { "aHM", "\u0f7f\u0f7e" }, // TODO(DLC)[EWTS->Tibetan]: than needs to say
        // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is
        // the same as I and o+o is the same as au.
        { "A+i", "\u0f73" },
        { "o+o", "\u0f7d" },
        { "e+e", "\u0f7b" },
        { "e+e+e", "\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e", "\u0f7b\u0f7b" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "e+e+e+e+e", "\u0f7b\u0f7b\u0f7a" }, // TODO(DLC)[EWTS->Tibetan]:?
        { "o+e", "\u0f7c\u0f7a" },
        { "u+A+i+o+e", "\u0f74\u0f72\u0f7c\u0f7a" },
        { "u+A+i+o+eHM", "\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e" },
        { "u+A", "\u0f75" },
        // The bare stack, and the stack with an explicit 'a':
        { "", "" },
        { "a", "" },
    };
    for (int r = 0; r < rows.length; r++) {
        final String[] pair = rows[r];
        ewts2uni_test(absurdStack + pair[0], absurdStackUnicode + pair[1]);
    }
}
/** Tests that the EWTS that the spec says corresponds to each
 *  codepoint really does.  The first part feeds explicit code point
 *  escapes (four or eight hex digits, either case of 'u') through
 *  ewts2uni_test; the second part walks the spec's table of
 *  transliterations.  Several expectations here are knowingly
 *  provisional -- see the TODO and NOTE(review) markers. */
public void test__EWTS__tags_each_unicode_value() {
    // Four-digit escapes:
    ewts2uni_test("\\u0000", "\u0000");
    ewts2uni_test("\\u0eff", "\u0eff");
    ewts2uni_test("\\u0eff", "\u0eff");
    ewts2uni_test("\\u0f00", "\u0f00");
    ewts2uni_test("\\u0f40", "\u0f40");
    ewts2uni_test("\\u0f70", "\u0f70");
    ewts2uni_test("\\u0fff", "\u0fff");
    ewts2uni_test("\\uf000", "\uf000");
    ewts2uni_test("\\uf01f", "\uf01f");
    ewts2uni_test("\\uefff", "\uefff");
    // Eight-digit escapes.  NOTE(review): a Java escape consumes
    // exactly four hex digits, so each expected string in the next
    // two groups is one char followed by four literal ASCII
    // characters -- confirm that this is what the converter should
    // produce.
    ewts2uni_test("\\ucafe0000", "\ucafe0000");
    ewts2uni_test("\\ucafe0eff", "\ucafe0eff");
    ewts2uni_test("\\ucafe0eff", "\ucafe0eff");
    ewts2uni_test("\\ucafe0f00", "\ucafe0f00");
    ewts2uni_test("\\ucafe0f40", "\ucafe0f40");
    ewts2uni_test("\\ucafe0f70", "\ucafe0f70");
    ewts2uni_test("\\ucafe0fff", "\ucafe0fff");
    ewts2uni_test("\\ucafef000", "\ucafef000");
    ewts2uni_test("\\ucafef01f", "\ucafef01f");
    ewts2uni_test("\\ucafeefff", "\ucafeefff");
    ewts2uni_test("\\u00000000", "\u00000000");
    ewts2uni_test("\\u00000eff", "\u00000eff");
    ewts2uni_test("\\u00000eff", "\u00000eff");
    ewts2uni_test("\\u00000f00", "\u00000f00");
    ewts2uni_test("\\u00000f40", "\u00000f40");
    ewts2uni_test("\\u00000f70", "\u00000f70");
    ewts2uni_test("\\u00000fff", "\u00000fff");
    ewts2uni_test("\\u0000f000", "\u0000f000");
    ewts2uni_test("\\u0000f01f", "\u0000f01f");
    ewts2uni_test("\\u0000efff", "\u0000efff");
    // NOTE(review): the same ten inputs as the group just above, but
    // here a single char is expected; both groups cannot hold at
    // once -- decide which reading of eight-digit escapes is right.
    ewts2uni_test("\\u00000000", "\u0000");
    ewts2uni_test("\\u00000eff", "\u0eff");
    ewts2uni_test("\\u00000eff", "\u0eff");
    ewts2uni_test("\\u00000f00", "\u0f00");
    ewts2uni_test("\\u00000f40", "\u0f40");
    ewts2uni_test("\\u00000f70", "\u0f70");
    ewts2uni_test("\\u00000fff", "\u0fff");
    ewts2uni_test("\\u0000f000", "\uf000");
    ewts2uni_test("\\u0000f01f", "\uf01f");
    ewts2uni_test("\\u0000efff", "\uefff");
    // Upper-case 'U' with mixed-case hex digits:
    ewts2uni_test("\\UcaFe0000", "\ucaFe0000");
    ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff");
    ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff");
    ewts2uni_test("\\UcaFe0f00", "\ucaFe0f00");
    ewts2uni_test("\\UcaFe0f40", "\ucaFe0f40");
    ewts2uni_test("\\UcaFe0f70", "\ucaFe0f70");
    ewts2uni_test("\\UcaFe0fff", "\ucaFe0fff");
    ewts2uni_test("\\UcaFef000", "\ucaFef000");
    ewts2uni_test("\\UcaFef01f", "\ucaFef01f");
    ewts2uni_test("\\UcaFeefff", "\ucaFeefff");
    // Below was semiautomatically generated from the EWTS spec's
    // 'ewts.xml' representation (early August 2004 edition):
    ewts2uni_test("v", "\u0F56\u0F39");
    ewts2uni_test("f", "\u0F55\u0F39");
    ewts2uni_test("oM", "\u0F00");
    ewts2uni_test("\\u0F01", "\u0F01");
    ewts2uni_test("\\u0F02", "\u0F02");
    ewts2uni_test("\\u0F03", "\u0F03");
    ewts2uni_test("@", "\u0F04");
    ewts2uni_test("#", "\u0F05");
    ewts2uni_test("$", "\u0F06");
    ewts2uni_test("%", "\u0F07");
    ewts2uni_test("!", "\u0F08");
    ewts2uni_test("\\u0F09", "\u0F09");
    ewts2uni_test("\\u0F0A", "\u0F0A");
    ewts2uni_test(" ", "\u0F0B");
    ewts2uni_test("*", "\u0F0C");
    ewts2uni_test("/", "\u0F0D");
    ewts2uni_test("//", "\u0F0E");
    ewts2uni_test(";", "\u0F0F");
    ewts2uni_test("\\u0F10", "\u0F10");
    ewts2uni_test("|", "\u0F11");
    ewts2uni_test("\\u0F12", "\u0F12");
    ewts2uni_test("\\u0F13", "\u0F13");
    ewts2uni_test(":", "\u0F14");
    ewts2uni_test("\\u0F15", "\u0F15");
    ewts2uni_test("\\u0F16", "\u0F16");
    ewts2uni_test("\\u0F17", "\u0F17");
    ewts2uni_test("\\u0F18", "\u0F18"); // TODO(DLC)[EWTS->Tibetan]: error combiner
    ewts2uni_test("\\u0F19", "\u0F19"); // TODO(DLC)[EWTS->Tibetan]: error combiner
    ewts2uni_test("\\u0F1A", "\u0F1A");
    ewts2uni_test("\\u0F1B", "\u0F1B");
    ewts2uni_test("\\u0F1C", "\u0F1C");
    ewts2uni_test("\\u0F1D", "\u0F1D");
    ewts2uni_test("\\u0F1E", "\u0F1E");
    ewts2uni_test("\\u0F1F", "\u0F1F");
    ewts2uni_test("0", "\u0F20");
    ewts2uni_test("1", "\u0F21");
    ewts2uni_test("2", "\u0F22");
    ewts2uni_test("3", "\u0F23");
    ewts2uni_test("4", "\u0F24");
    ewts2uni_test("5", "\u0F25");
    ewts2uni_test("6", "\u0F26");
    ewts2uni_test("7", "\u0F27");
    ewts2uni_test("8", "\u0F28");
    ewts2uni_test("9", "\u0F29");
    ewts2uni_test("\\u0F2A", "\u0F2A");
    ewts2uni_test("\\u0F2B", "\u0F2B");
    ewts2uni_test("\\u0F2C", "\u0F2C");
    ewts2uni_test("\\u0F2D", "\u0F2D");
    ewts2uni_test("\\u0F2E", "\u0F2E");
    ewts2uni_test("\\u0F2F", "\u0F2F");
    ewts2uni_test("\\u0F30", "\u0F30");
    ewts2uni_test("\\u0F31", "\u0F31");
    ewts2uni_test("\\u0F32", "\u0F32");
    ewts2uni_test("\\u0F33", "\u0F33");
    ewts2uni_test("=", "\u0F34");
    ewts2uni_test("~X", "\u0F35");
    ewts2uni_test("\\u0F36", "\u0F36");
    ewts2uni_test("X", "\u0F37"); // TODO(DLC)[EWTS->Tibetan]: error combiner
    ewts2uni_test("\\u0F38", "\u0F38");
    ewts2uni_test("^", "\u0F39"); // TODO(DLC)[EWTS->Tibetan]: error combiner
    ewts2uni_test("<", "\u0F3A");
    ewts2uni_test(">", "\u0F3B");
    ewts2uni_test("(", "\u0F3C");
    ewts2uni_test(")", "\u0F3D");
    ewts2uni_test("\\u0F3E", "\u0F3E"); // TODO(DLC)[EWTS->Tibetan]: error combiner
    ewts2uni_test("\\u0F3F", "\u0F3F"); // TODO(DLC)[EWTS->Tibetan]: error combiner
    ewts2uni_test("k", "\u0F40");
    ewts2uni_test("kh", "\u0F41");
    ewts2uni_test("g", "\u0F42");
    ewts2uni_test("g+h", "\u0F43");
    ewts2uni_test("ng", "\u0F44");
    ewts2uni_test("c", "\u0F45");
    ewts2uni_test("ch", "\u0F46");
    ewts2uni_test("j", "\u0F47");
    ewts2uni_test("ny", "\u0F49");
    ewts2uni_test("T", "\u0F4A");
    ewts2uni_test("Th", "\u0F4B");
    ewts2uni_test("D", "\u0F4C");
    ewts2uni_test("D+h", "\u0F4D");
    ewts2uni_test("N", "\u0F4E");
    ewts2uni_test("t", "\u0F4F");
    ewts2uni_test("th", "\u0F50");
    ewts2uni_test("d", "\u0F51");
    ewts2uni_test("d+h", "\u0F52");
    ewts2uni_test("n", "\u0F53");
    ewts2uni_test("p", "\u0F54");
    ewts2uni_test("ph", "\u0F55");
    ewts2uni_test("b", "\u0F56");
    ewts2uni_test("b+h", "\u0F57");
    ewts2uni_test("m", "\u0F58");
    ewts2uni_test("ts", "\u0F59");
    ewts2uni_test("tsh", "\u0F5A");
    ewts2uni_test("dz", "\u0F5B");
    ewts2uni_test("dz+h", "\u0F5C");
    ewts2uni_test("w", "\u0F5D");
    ewts2uni_test("zh", "\u0F5E");
    ewts2uni_test("z", "\u0F5F");
    ewts2uni_test("'", "\u0F60");
    ewts2uni_test("y", "\u0F61");
    ewts2uni_test("r", "\u0F62");
    ewts2uni_test("l", "\u0F63");
    ewts2uni_test("sh", "\u0F64");
    ewts2uni_test("Sh", "\u0F65");
    ewts2uni_test("s", "\u0F66");
    ewts2uni_test("h", "\u0F67");
    ewts2uni_test("a", "\u0F68");
    ewts2uni_test("k+Sh", "\u0F69");
    ewts2uni_test("R+", "\u0F6A"); // TODO(DLC)[EWTS->Tibetan]: move to illegal test
    ewts2uni_test("A", "\u0F71");
    ewts2uni_test("i", "\u0F72");
    ewts2uni_test("I", "\u0F73");
    ewts2uni_test("u", "\u0F74");
    ewts2uni_test("U", "\u0F75");
    ewts2uni_test("r-i", "\u0F76");
    ewts2uni_test("r-I", "\u0F77");
    ewts2uni_test("l-i", "\u0F78");
    ewts2uni_test("l-I", "\u0F79");
    ewts2uni_test("e", "\u0F7A");
    ewts2uni_test("ai", "\u0F7B");
    ewts2uni_test("o", "\u0F7C");
    ewts2uni_test("au", "\u0F7D");
    ewts2uni_test("M", "\u0F7E");
    ewts2uni_test("H", "\u0F7F");
    ewts2uni_test("-i", "\u0F80");
    ewts2uni_test("-I", "\u0F81");
    ewts2uni_test("~M`", "\u0F82");
    ewts2uni_test("~M", "\u0F83");
    ewts2uni_test("?", "\u0F84");
    ewts2uni_test("&", "\u0F85");
    ewts2uni_test("\\u0F86", "\u0F86");
    ewts2uni_test("\\u0F87", "\u0F87");
    ewts2uni_test("\\u0F88", "\u0F88");
    ewts2uni_test("\\u0F89", "\u0F89");
    ewts2uni_test("\\u0F8A", "\u0F8A");
    ewts2uni_test("\\u0F8B", "\u0F8B");
    // Subjoined consonants.  NOTE(review): the EWTS inputs below
    // repeat those used above for the non-subjoined consonants, so
    // the two sets of expectations conflict until the inputs are
    // rewritten as real stacks (see the TODO on the first line).
    ewts2uni_test("k", "\u0F90"); // TODO(DLC)[EWTS->Tibetan]: NO! Need a+...
    ewts2uni_test("kh", "\u0F91");
    ewts2uni_test("g", "\u0F92");
    ewts2uni_test("g+h", "\u0F93");
    ewts2uni_test("ng", "\u0F94");
    ewts2uni_test("c", "\u0F95");
    ewts2uni_test("ch", "\u0F96");
    ewts2uni_test("j", "\u0F97");
    ewts2uni_test("ny", "\u0F99");
    ewts2uni_test("T", "\u0F9A");
    ewts2uni_test("Th", "\u0F9B");
    ewts2uni_test("D", "\u0F9C");
    ewts2uni_test("D+h", "\u0F9D");
    ewts2uni_test("N", "\u0F9E");
    ewts2uni_test("t", "\u0F9F");
    ewts2uni_test("th", "\u0FA0");
    ewts2uni_test("d", "\u0FA1");
    ewts2uni_test("d+h", "\u0FA2");
    ewts2uni_test("n", "\u0FA3");
    ewts2uni_test("p", "\u0FA4");
    ewts2uni_test("ph", "\u0FA5");
    ewts2uni_test("b", "\u0FA6");
    ewts2uni_test("b+h", "\u0FA7");
    ewts2uni_test("m", "\u0FA8");
    ewts2uni_test("ts", "\u0FA9");
    ewts2uni_test("tsh", "\u0FAA");
    ewts2uni_test("dz", "\u0FAB");
    ewts2uni_test("dz+h", "\u0FAC");
    ewts2uni_test("w", "\u0FAD");
    ewts2uni_test("zh", "\u0FAE");
    ewts2uni_test("z", "\u0FAF");
    ewts2uni_test("'", "\u0FB0");
    ewts2uni_test("y", "\u0FB1");
    ewts2uni_test("r", "\u0FB2");
    ewts2uni_test("l", "\u0FB3");
    ewts2uni_test("sh", "\u0FB4");
    ewts2uni_test("Sh", "\u0FB5");
    ewts2uni_test("s", "\u0FB6");
    ewts2uni_test("h", "\u0FB7");
    ewts2uni_test("a", "\u0FB8");
    ewts2uni_test("k+Sh", "\u0FB9");
    ewts2uni_test("+W", "\u0FBA"); // TODO(DLC)[EWTS->Tibetan]: move to illegal test
    ewts2uni_test("+Y", "\u0FBB");
    ewts2uni_test("+R", "\u0FBC");
    ewts2uni_test("\\u0FBE", "\u0FBE");
    ewts2uni_test("\\u0FBF", "\u0FBF");
    ewts2uni_test("\\u0FC0", "\u0FC0");
    ewts2uni_test("\\u0FC1", "\u0FC1");
    ewts2uni_test("\\u0FC2", "\u0FC2");
    ewts2uni_test("\\u0FC3", "\u0FC3");
    ewts2uni_test("\\u0FC4", "\u0FC4");
    ewts2uni_test("\\u0FC5", "\u0FC5");
    ewts2uni_test("\\u0FC6", "\u0FC6");
    ewts2uni_test("\\u0FC7", "\u0FC7");
    ewts2uni_test("\\u0FC8", "\u0FC8");
    ewts2uni_test("\\u0FC9", "\u0FC9");
    ewts2uni_test("\\u0FCA", "\u0FCA");
    ewts2uni_test("\\u0FCB", "\u0FCB");
    ewts2uni_test("\\u0FCC", "\u0FCC");
    ewts2uni_test("\\u0FCF", "\u0FCF");
    ewts2uni_test("\\u0FD0", "\u0FD0");
    ewts2uni_test("\\u0FD1", "\u0FD1");
    ewts2uni_test("_", "\u0020");
    ewts2uni_test("\\u534D", "\u534D");
    ewts2uni_test("\\u5350", "\u5350");
    // Each of the next two expects two code points: U+0F88 followed
    // by a subjoined consonant (U+0F90 or U+0F91).  Each code point
    // needs its own escape in the Java literal; a single escape
    // followed by four more hex digits would yield U+0F88 plus four
    // literal ASCII characters.
    ewts2uni_test("\\u0F88+k", "\u0F88\u0F90"); // TODO(DLC)[EWTS->Tibetan]:
    ewts2uni_test("\\u0F88+kh", "\u0F88\u0F91");
    /* TODO(DLC)[EWTS->Tibetan]: NOW do we want to ever generate \u0f21? EWTS->TMW and this makes sense, but EWTS->Unicode? */
    ewts2uni_test("\\uF021", "\uF021");
    ewts2uni_test("\\uF022", "\uF022");
    ewts2uni_test("\\uF023", "\uF023");
    ewts2uni_test("\\uF024", "\uF024");
    ewts2uni_test("\\uF025", "\uF025");
    ewts2uni_test("\\uF026", "\uF026");
    ewts2uni_test("\\uF027", "\uF027");
    ewts2uni_test("\\uF028", "\uF028");
    ewts2uni_test("\\uF029", "\uF029");
    ewts2uni_test("\\uF02A", "\uF02A");
    ewts2uni_test("\\uF02B", "\uF02B");
    ewts2uni_test("\\uF02C", "\uF02C");
    ewts2uni_test("\\uF02D", "\uF02D");
    ewts2uni_test("\\uF02E", "\uF02E");
    ewts2uni_test("\\uF02F", "\uF02F");
    ewts2uni_test("\\uF030", "\uF030");
    ewts2uni_test("\\uF031", "\uF031");
    ewts2uni_test("\\uF032", "\uF032");
    ewts2uni_test("\\uF033", "\uF033");
    ewts2uni_test("\\uF034", "\uF034");
    ewts2uni_test("\\uF035", "\uF035");
    ewts2uni_test("\\uF036", "\uF036");
    ewts2uni_test("\\uF037", "\uF037");
    ewts2uni_test("\\uF038", "\uF038");
    ewts2uni_test("\\uF039", "\uF039");
    ewts2uni_test("\\uF03A", "\uF03A");
    ewts2uni_test("\\uF03B", "\uF03B");
    ewts2uni_test("\\uF03C", "\uF03C");
    ewts2uni_test("\\uF03D", "\uF03D");
    ewts2uni_test("\\uF03E", "\uF03E");
    ewts2uni_test("\\uF03F", "\uF03F");
    ewts2uni_test("\\uF040", "\uF040");
    ewts2uni_test("\\uF041", "\uF041");
    ewts2uni_test("\\uF042", "\uF042");
}
// TODO(DLC)[EWTS->Tibetan]: test that "\[JAVA_SOURCE_WILL_NOT_COMPILE_WITHOUT_ME]uxxxx " works out well
/** Verifies that each of a list of strings is rejected as illegal
 *  EWTS.  The strings are passed to assert_EWTS_error one at a
 *  time, in the order listed. */
public void test__EWTS__illegal_things() {
    final String[] illegalEwts = {
        // only numbers combine with f19,f18,f3e,f3f:
        "k\\u0f19",
        "k\\u0f18",
        "k\\u0f3e",
        "k\\u0f3f",

        "kSha", // use "k+Sha" instead
        "pM",   // use "paM" instead (TODO(DLC)[EWTS->Tibetan]: NOW NO!)
        "pH",   // presumably use "paH" instead (TODO(DLC)[EWTS->Tibetan]: NOW NO!)
        "kja",  // use "kaja" or "k.ja" instead
        "kA+u", // use "ku+A" (bottom-to-top) or "kU" instead

        // use "b+na" or "bana" instead
        // TODO(DLC)[EWTS->Tibetan]: tell D. Chapman about this; an
        // old e-mail said my test cases would be brutal and here's
        // brutal
        "bna",
        "bn?",
        "bni",
        "bnA",
        "bn-I",

        // a+r is not a standard stack; neither is a+l:
        "ar-i",
        "ar-I",
        "al-i",
        "al-I",

        "g..ya", // use "g.ya" instead
        "g",     // use "ga" instead TODO(DLC)[EWTS->Tibetan]:?
    };
    for (int i = 0; i < illegalEwts.length; i++) {
        assert_EWTS_error(illegalEwts[i]);
    }
}
}
// TODO(DLC)[EWTS->Tibetan]: if 'k' were illegal, then would you have to say
// 'ka\u0f84' or would 'k\u0f84' be legal?
// TODO(DLC)[EWTS->Tibetan]: ask than what's the order, top to bottom, of
// u,i,o,e,M,A,I,-i,-I,ai,au,etc.? TODO(DLC)[EWTS->Tibetan]: ANSWER: Basically, there are a few classes -- above, below, both.
// TODO(DLC)[EWTS->Tibetan]: NOW: write a tool that takes Tibetan Unicode and finds
// flaws in it. E.g., if Unicode 4.0 says that
// \u0f40\u0f7a\u0f74 is illegal (thus \u0f40\u0f74\u0f7a is
// what you probably intended), have it find \u0f7a\u0f74.
//
// TODO(DLC)[EWTS->Tibetan]:: and have it find \u0f7a\u0f7a and suggest \u0f7b, etc.
//
// TODO(DLC)[EWTS->Tibetan]: and \u0f7f\u0f7e is probably illegal and should be switched?
// TODO(DLC)[EWTS->Tibetan]: flesh out \[JAVA_SOURCE_WILL_NOT_COMPILE_WITHOUT_ME]u rules in lexing, is it like Java (where in Java source code, escapes are done in a pre-lexing pass)? no, right, \u0060 causes \u0060 in the output... and \u0f40a is not like ka. escapes separate tsheg bars as far as lexing is concerned, yes? But we use them (and only them, i.e. there is no other transliteration available) for some Tibetan Unicode characters, and then ka\[JAVA_SOURCE_WILL_NOT_COMPILE_WITHOUT_ME]u0fXX may need to seem Java-ish, maybe?
// TODO(DLC)[EWTS->Tibetan]: spell-check ewts spec, punctuation e.g.
// TODO(DLC)[EWTS->Tibetan]: ask than aM, not M, is legal, what else is like this? ~M`? 0f84?
// TODO(DLC)[EWTS->Tibetan]: NOW 0f84 ? not a? but ? according to rule n=7
/* TODO(DLC)[EWTS->Tibetan]: make a method that tests the unicode directly and by going from ewts/acip->tmw->unicode. */
// TODO(DLC)[EWTS->Tibetan]: s/anyways/anyway/g in ewts spec
// TODO(DLC)[EWTS->Tibetan]: s/(Spacebar)/(Space)/g
/* TODO(DLC)[EWTS->Tibetan]: in spec, inconsistency:
<code>0F880F90</code>
<code>0F880F91</code>
<code rend="U+0F55 U+0F39">\u0F55\u0F39</code>
<code rend="U+0F56 U+0F39">\u0F56\u0F39</code>
TODO(DLC)[EWTS->Tibetan]:: also, <equiv>F042</equiv> is inconsistent with <equiv></equiv> for U+0f01.
*/

View file

@ -0,0 +1,82 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2004 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
/** A singleton class that should contain (but due to laziness and
 *  ignorance probably does not contain) all the traits that make EWTS
 *  transliteration different from other (say, ACIP) transliteration
 *  schemes. */
final class EWTSTraits implements TTraits {
    /** sole instance of this class; created when the class is
     *  initialized, so {@link #instance()} never has to check for or
     *  create it */
    private static final EWTSTraits singleton = new EWTSTraits();

    /** Just a constructor.  It is private so that {@link
     *  #instance()} is the only way to get hold of an EWTSTraits. */
    private EWTSTraits() { }

    /** Returns the sole instance of this class.  Every call returns
     *  the same object. */
    public static EWTSTraits instance() {
        return singleton;
    }

    /** Returns ".". */
    public String disambiguator() { return "."; }

    /** Returns '.'. */
    public char disambiguatorChar() { return '.'; }

    /** Returns true if and only if p's left side is "a" and p has no
     *  right side (i.e., p.getRight() is null).
     *  @param p the pair to examine; must not be null */
    public boolean hasSimpleError(TPair p) {
        return ("a".equals(p.getLeft()) && null == p.getRight()); // TODO(DLC)[EWTS->Tibetan]: (a.e) is bad, one of (.a) or (a.) is bad
    }

    /** {tsh}, the longest consonant, has 3 characters, so this is
     *  three. */
    public int maxConsonantLength() { return 3; }

    /** {-i~M`}, in a tie for the longest wowel, has 5 characters, so
     *  this is five.  (No, 'l-i' and 'r-i' are not wowels, but '-i'
     *  is.) */
    public int maxWowelLength() { return 5; }

    // TODO(DLC)[EWTS->Tibetan]: u,e,i,o? If not, document the special treatment in this function's comment
    /** Returns true if and only if s is "g"; per the TODO inside,
     *  that is the only consonant recognized for now.  A null s
     *  yields false.
     *  @param s the transliteration to classify */
    public boolean isConsonant(String s) {
        // TODO(DLC)[EWTS->Tibetan]: just g for now
        return "g".equals(s);
    }

    /** Returns true if and only if s is one of the wowels listed in
     *  the body: a, e, i, o, u, ?, U, I, A, -i, -I, H, or M.  A null
     *  s yields false.
     *  @param s the transliteration to classify */
    public boolean isWowel(String s) {
        // TODO(DLC)[EWTS->Tibetan]: all non-consonant combiners? 0f71 0f87 etc.?
        return ("a".equals(s)
                || "e".equals(s)
                || "i".equals(s)
                || "o".equals(s)
                || "u".equals(s)
                || "?".equals(s) // TODO(DLC)[EWTS->Tibetan]: 0f84 virama???
                // TODO(DLC)[EWTS->Tibetan]: & ~M` ~M ???
                || "U".equals(s)
                || "I".equals(s)
                || "A".equals(s)
                || "-i".equals(s)
                || "-I".equals(s)
                || "H".equals(s)
                || "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:???
    }
}

File diff suppressed because it is too large Load diff

View file

@ -340,7 +340,7 @@ public class ErrorsAndWarnings {
user preferences and falling back on built-in defaults if user preferences and falling back on built-in defaults if
necessary (which it shouldn't be -- options.txt should be in necessary (which it shouldn't be -- options.txt should be in
the JAR with this class file. */ the JAR with this class file. */
static void setupSeverityMap() { public static void setupSeverityMap() {
// errors: // errors:
for (int i = MIN_ERROR; i <= MAX_ERROR; i++) { for (int i = MIN_ERROR; i <= MAX_ERROR; i++) {
severityMap.put(new Integer(i), "ERROR"); severityMap.put(new Integer(i), "ERROR");

View file

@ -13,7 +13,7 @@ License for the specific terms governing rights and limitations under the
License. License.
The Initial Developer of this software is the Tibetan and Himalayan Digital The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL. Library (THDL). Portions created by the THDL are Copyright 2003-2004 THDL.
All Rights Reserved. All Rights Reserved.
Contributor(s): ______________________________________. Contributor(s): ______________________________________.
@ -29,7 +29,7 @@ import junit.framework.TestCase;
/** Tests this package, especially {@link #TPairListFactory} and /** Tests this package, especially {@link #TPairListFactory} and
* {@link TPairList}. * {@link TPairList}. Tests use ACIP more than EWTS.
* *
* @author David Chandler */ * @author David Chandler */
public class PackageTest extends TestCase { public class PackageTest extends TestCase {
@ -10495,7 +10495,7 @@ tstHelper("shKA");
} }
} }
/** Tests warning/error messages 512 and 507 */
public void test512And507() { public void test512And507() {
// Plain "GNY" is interpreted as two stacks, so no 512 warning // Plain "GNY" is interpreted as two stacks, so no 512 warning
// is given. FIXME 946058 // is given. FIXME 946058

View file

@ -24,10 +24,12 @@ import org.thdl.tib.text.DuffCode;
import java.util.ArrayList; import java.util.ArrayList;
/** An ordered pair used in ACIP-to-TMW conversion. The left side is /** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The
* the consonant or empty; the right side is the vowel, '+', or '-'. * left side is the consonant or empty; the right side is either the
* vowel or '+' (indicating stacking) or a disambiguator (i.e., '-'
* in ACIP or '.' in EWTS).
* @author David Chandler */ * @author David Chandler */
/* BIG FIXME: make this package work for EWTS, not just ACIP. */ /* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */
class TPair { class TPair {
/** The left side, or null if there is no left side. That is, the /** The left side, or null if there is no left side. That is, the
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */ * non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
@ -72,13 +74,13 @@ class TPair {
+ ((r == null) ? 0 : r.length())); + ((r == null) ? 0 : r.length()));
} }
/** Returns an TPair that is like this one except that it is /** Returns a TPair that is like this one except that it is
* missing N characters. The characters are taken from r, the * missing N characters. The characters are taken from r, the
* right side, first and from l, the left side, second. The pair * right side, first and from l, the left side, second. The pair
* returned may be illegal, such as the (A . ') you can get from * returned may be illegal, such as the (A . ') you can get from
* ACIP {A'AAMA}. * ACIP {A'AAMA}.
* @throws IllegalArgumentException if N is out of range */ * @throws IllegalArgumentException if N is out of range */
TPair minusNRightmostACIPCharacters(int N) TPair minusNRightmostTransliterationCharacters(int N)
throws IllegalArgumentException throws IllegalArgumentException
{ {
int sz; int sz;
@ -107,7 +109,7 @@ class TPair {
return false; return false;
if (null != l && !ACIPRules.isConsonant(l)) if (null != l && !ACIPRules.isConsonant(l))
return false; return false;
if (null != r && !ACIPRules.isVowel(r)) if (null != r && !ACIPRules.isWowel(r))
return false; return false;
return true; return true;
} }
@ -117,7 +119,7 @@ class TPair {
boolean isPrefix() { boolean isPrefix() {
return (null != l return (null != l
&& ((null == r || "".equals(r)) && ((null == r || "".equals(r))
|| "-".equals(r) || "-".equals(r) // TODO(DLC)[EWTS->Tibetan]
|| "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common || "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
&& ACIPRules.isACIPPrefix(l)); && ACIPRules.isACIPPrefix(l));
} }
@ -158,7 +160,7 @@ class TPair {
return false; return false;
} }
/** Returns an TPair that is like this pair except that it has /** Returns a TPair that is like this pair except that it has
* a "+" on the right if this pair is empty on the right and is * a "+" on the right if this pair is empty on the right and is
* empty on the right if this pair has a disambiguator (i.e., a * empty on the right if this pair has a disambiguator (i.e., a
* '-') on the right. May return itself (but never mutates this * '-') on the right. May return itself (but never mutates this

View file

@ -16,6 +16,8 @@ All Rights Reserved.
Contributor(s): ______________________________________. Contributor(s): ______________________________________.
*/ */
// TODO(DLC)[EWTS->Tibetan]: a (DLC: does this become (a.) or (.a)?), ug pha, g.a, aM, etc. -- test!
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.tib.text.TibetanMachineWeb;
@ -27,7 +29,7 @@ import java.util.HashMap;
import java.util.ArrayList; import java.util.ArrayList;
/** A list of {@link TPair TPairs}, typically corresponding to /** A list of {@link TPair TPairs}, typically corresponding to
* one tsheg bar. <i>l</i>' in the design doc is an TPairList. * one tsheg bar. <i>l</i>' in the design doc is a TPairList.
* *
* @author David Chandler */ * @author David Chandler */
class TPairList { class TPairList {
@ -136,13 +138,13 @@ class TPairList {
/** Returns true if this list contains ( . <vowel>) or (A . ), /** Returns true if this list contains ( . <vowel>) or (A . ),
* which are two simple errors you encounter if you interpret DAA * which are two simple errors you encounter if you interpret DAA
* or TAA or DAI or DAE the wrong way. */ * or TAA or DAI or DAE the wrong way. TODO(DLC)[EWTS->Tibetan]: ACIP vs. EWTS */
boolean hasSimpleError() { boolean hasSimpleError(TTraits ttraits) {
int sz = size(); int sz = size();
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
TPair p = get(i); TPair p = get(i);
if ((null == p.getLeft() && !"-".equals(p.getRight())) if ((null == p.getLeft() && !ttraits.disambiguator().equals(p.getRight()))
|| ("A".equals(p.getLeft()) && null == p.getRight())) || ttraits.hasSimpleError(p))
return true; return true;
} }
return false; return false;
@ -205,7 +207,7 @@ class TPairList {
return null; return null;
} }
/** Returns true if and only if either x is an TPairList object /** Returns true if and only if either x is a TPairList object
* representing the same TPairs in the same order or x is a * representing the same TPairs in the same order or x is a
* String that is equals to the result of {@link #toString()}. */ * String that is equals to the result of {@link #toString()}. */
public boolean equals(Object x) { public boolean equals(Object x) {

View file

@ -16,6 +16,8 @@ All Rights Reserved.
Contributor(s): ______________________________________. Contributor(s): ______________________________________.
*/ */
// TODO(DLC)[EWTS->Tibetan]: If EWTS still has 'v', warn about it if it looks like someone thinks that ACIP's usage of it for wa-zur is how EWTS does things.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
/** A factory for creating {@link TPairList TPairLists} from /** A factory for creating {@link TPairList TPairLists} from
@ -38,7 +40,7 @@ class TPairListFactory {
* rest would be suboptimal, so we backtrack to [(T . )] and then * rest would be suboptimal, so we backtrack to [(T . )] and then
* finally become [(T . ), (A . A)]. We look for (A . ) and ( * finally become [(T . ), (A . A)]. We look for (A . ) and (
* . <vowel>) in the rest in order to say "the rest would be * . <vowel>) in the rest in order to say "the rest would be
* suboptimal", i.e. we use TPairList.hasSimpleError().</p> * suboptimal", i.e. we use TPairList.hasSimpleError(TTraits).</p>
* *
* <p>There is one case where we break things up into two pair * <p>There is one case where we break things up into two pair
* lists if and only if specialHandlingForAppendages is true -- I * lists if and only if specialHandlingForAppendages is true -- I
@ -65,19 +67,20 @@ class TPairListFactory {
* @return an array of one or two pair lists, if the former, then * @return an array of one or two pair lists, if the former, then
* the second element will be null, if the latter, the second * the second element will be null, if the latter, the second
* element will have (* . ), (' . *) instead of (* . '*) which * element will have (* . ), (' . *) instead of (* . '*) which
* the former has @throws IllegalArgumentException if acip is too * the former has
* large for us to break into chunks (we're recursive, not * @throws IllegalArgumentException if acip is too large for us
* iterative, so the boundary can be increased a lot if you care, * to break into chunks (we're recursive, not iterative, so the
* but you don't) */ * boundary can be increased a lot if you care, but you don't) */
static TPairList[] breakACIPIntoChunks(String acip, static TPairList[] breakACIPIntoChunks(String acip,
boolean specialHandlingForAppendages) boolean specialHandlingForAppendages)
throws IllegalArgumentException throws IllegalArgumentException
{ {
try { try {
TPairList a = breakHelper(acip, true, false); TTraits ttraits = ACIPTraits.instance();
TPairList a = breakHelperACIP(acip, true, false, ttraits);
TPairList b = null; TPairList b = null;
if (specialHandlingForAppendages) if (specialHandlingForAppendages)
b = breakHelper(acip, false, false); b = breakHelperACIP(acip, false, false, ttraits);
if (null != b && a.equals(b)) if (null != b && a.equals(b))
return new TPairList[] { a, null }; return new TPairList[] { a, null };
else else
@ -88,6 +91,22 @@ class TPairListFactory {
throw new IllegalArgumentException("Input too large[2]: " + acip); throw new IllegalArgumentException("Input too large[2]: " + acip);
} }
} }
/** TODO(DLC)[EWTS->Tibetan]: doc */
static TPairList[] breakEWTSIntoChunks(String ewts)
throws IllegalArgumentException
{
try {
return new TPairList[] {
breakHelperEWTS(ewts, EWTSTraits.instance()), null
};
} catch (StackOverflowError e) {
throw new IllegalArgumentException("Input too large[1]: " + ewts);
} catch (OutOfMemoryError e) {
throw new IllegalArgumentException("Input too large[2]: " + ewts);
}
}
/** Helps {@link #breakACIPIntoChunks(String,boolean)}. /** Helps {@link #breakACIPIntoChunks(String,boolean)}.
* @param tickIsVowel true if and only if you want to treat the * @param tickIsVowel true if and only if you want to treat the
* ACIP {'} as an U+0F71 vowel instead of the full-sized * ACIP {'} as an U+0F71 vowel instead of the full-sized
@ -96,7 +115,9 @@ class TPairListFactory {
* @param weHaveSeenVowelAlready true if and only if, in our * @param weHaveSeenVowelAlready true if and only if, in our
* recursion, we've already found one vowel (not a disambiguator, * recursion, we've already found one vowel (not a disambiguator,
* but a vowel like "A", "E", "Um:", "m", "'U", etc.) */ * but a vowel like "A", "E", "Um:", "m", "'U", etc.) */
private static TPairList breakHelper(String acip, boolean tickIsVowel, boolean weHaveSeenVowelAlready) { private static TPairList breakHelperACIP(String acip, boolean tickIsVowel,
boolean weHaveSeenVowelAlready,
TTraits ttraits) {
// base case for our recursion: // base case for our recursion:
if ("".equals(acip)) if ("".equals(acip))
@ -104,7 +125,7 @@ class TPairListFactory {
StringBuffer acipBuf = new StringBuffer(acip); StringBuffer acipBuf = new StringBuffer(acip);
int howMuchBuf[] = new int[1]; int howMuchBuf[] = new int[1];
TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf); TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf, ttraits);
int howMuch = howMuchBuf[0]; int howMuch = howMuchBuf[0];
if (!tickIsVowel if (!tickIsVowel
&& null != head.getLeft() && null != head.getLeft()
@ -122,26 +143,63 @@ class TPairListFactory {
TPairList tail; TPairList tail;
if ((tail if ((tail
= breakHelper(acipBuf.substring(howMuch), = breakHelperACIP(acipBuf.substring(howMuch),
tickIsVowel, tickIsVowel,
weHaveSeenVowelAlready weHaveSeenVowelAlready
|| (head.getRight() != null || (head.getRight() != null
&& !"+".equals(head.getRight()) && !"+".equals(head.getRight())
&& !"-".equals(head.getRight())))).hasSimpleError()) { && !"-".equals(head.getRight())),
ttraits)).hasSimpleError(ttraits)) {
for (int i = 1; i < howMuch; i++) { for (int i = 1; i < howMuch; i++) {
// try giving i characters back if that leaves us with // try giving i characters back if that leaves us with
// a legal head and makes the rest free of simple // a legal head and makes the rest free of simple
// errors. // errors.
TPairList newTail = null; TPairList newTail = null;
TPair newHead; TPair newHead;
if ((newHead = head.minusNRightmostACIPCharacters(i)).isLegal() if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal()
&& !(newTail && !(newTail
= breakHelper(acipBuf.substring(howMuch - i), = breakHelperACIP(acipBuf.substring(howMuch - i),
tickIsVowel, tickIsVowel,
weHaveSeenVowelAlready weHaveSeenVowelAlready
|| (newHead.getRight() != null || (newHead.getRight() != null
&& !"+".equals(newHead.getRight()) && !"+".equals(newHead.getRight())
&& !"-".equals(newHead.getRight())))).hasSimpleError()) { && !"-".equals(newHead.getRight())),
ttraits)).hasSimpleError(ttraits)) {
newTail.prepend(newHead);
return newTail;
}
}
// It didn't work. Return the first thing we'd thought
// of: head appended with tail. (I.e., fall through.)
}
tail.prepend(head);
return tail;
}
// TODO(DLC)[EWTS->Tibetan]: doc
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits /* TODO(DLC)[EWTS->Tibetan]: use */) {
// base case for our recursion:
if ("".equals(ewts))
return new TPairList();
StringBuffer ewtsBuf = new StringBuffer(ewts);
int howMuchBuf[] = new int[1];
TPair head = getFirstConsonantAndVowel(ewtsBuf, howMuchBuf, ttraits);
int howMuch = howMuchBuf[0];
TPairList tail;
if ((tail = breakHelperEWTS(ewtsBuf.substring(howMuch),
ttraits)).hasSimpleError(ttraits)) {
for (int i = 1; i < howMuch; i++) {
// try giving i characters back if that leaves us with
// a legal head and makes the rest free of simple
// errors.
TPairList newTail = null;
TPair newHead;
if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal()
&& !(newTail
= breakHelperEWTS(ewtsBuf.substring(howMuch - i), ttraits)).hasSimpleError(ttraits)) {
newTail.prepend(newHead); newTail.prepend(newHead);
return newTail; return newTail;
} }
@ -162,14 +220,16 @@ class TPairListFactory {
* A later phase will need to turn that into {N+YE} or an error * A later phase will need to turn that into {N+YE} or an error
* or whatever you like. howMuch[0] will be set to the number of * or whatever you like. howMuch[0] will be set to the number of
* characters of acip that this call has consumed. */ * characters of acip that this call has consumed. */
private static TPair getFirstConsonantAndVowel(StringBuffer acip, private static TPair getFirstConsonantAndVowel(StringBuffer acip, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it?
int howMuch[]) { int howMuch[],
TTraits ttraits) {
// Note that it is *not* the case that if acip.substring(0, N) // Note that it is *not* the case that if acip.substring(0, N)
// is legal (according to TPair.isLegal()), then // is legal (according to TPair.isLegal()), then
// acip.substring(0, N-1) is legal for all N. For example, // acip.substring(0, N-1) is legal for all N. For example,
// think of {shA} and {KshA}. However, 's' is the only tricky // think of ACIP's {shA} and {KshA}. However, 's' is the only
// fellow, so it is true that acip.substring(0, N-1) is either // tricky fellow, so it is true that acip.substring(0, N-1) is
// legal or ends with 's' if acip.substring(0, N) is legal. // either legal or ends with 's' if acip.substring(0, N) is
// legal.
// //
// We don't, however, use this approach. We just try to find // We don't, however, use this approach. We just try to find
// a consonant of length 3, and then, failing that, of length // a consonant of length 3, and then, failing that, of length
@ -180,9 +240,9 @@ class TPairListFactory {
howMuch[0] = 0; howMuch[0] = 0;
return new TPair(null, null); return new TPair(null, null);
} }
if (acip.charAt(0) == '-') { if (acip.charAt(0) == ttraits.disambiguatorChar()) {
howMuch[0] = 1; howMuch[0] = 1;
return new TPair(null, "-"); return new TPair(null, ttraits.disambiguator());
} }
char ch = acip.charAt(0); char ch = acip.charAt(0);
@ -190,32 +250,33 @@ class TPairListFactory {
// like seeing 1-2-3-4. // like seeing 1-2-3-4.
if (ch >= '0' && ch <= '9') { if (ch >= '0' && ch <= '9') {
howMuch[0] = 1; // not 2... howMuch[0] = 1; // not 2...
return new TPair(acip.substring(0, 1), (xl == 1) ? null : "-"); return new TPair(acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
} }
String l = null, r = null; String l = null, r = null;
for (i = Math.min(ACIPRules.MAX_CONSONANT_LENGTH, xl); i >= 1; i--) { for (i = Math.min(ttraits.maxConsonantLength(), xl); i >= 1; i--) {
String t = null; String t = null;
if (ACIPRules.isConsonant(t = acip.substring(0, i))) { if (ttraits.isConsonant(t = acip.substring(0, i))) {
l = t; l = t;
break; break;
} }
} }
int ll = (null == l) ? 0 : l.length(); int ll = (null == l) ? 0 : l.length();
if (null != l && xl > ll && acip.charAt(ll) == '-') { if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) {
howMuch[0] = l.length() + 1; howMuch[0] = l.length() + 1;
return new TPair(l, "-"); return new TPair(l, ttraits.disambiguator());
} }
if (null != l && xl > ll && acip.charAt(ll) == '+') { if (null != l && xl > ll && acip.charAt(ll) == '+') {
howMuch[0] = l.length() + 1; howMuch[0] = l.length() + 1;
return new TPair(l, "+"); return new TPair(l, "+");
} }
for (i = Math.min(ACIPRules.MAX_VOWEL_LENGTH, xl - ll); i >= 1; i--) { for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) {
String t = null; String t = null;
if (ACIPRules.isVowel(t = acip.substring(ll, ll + i)) if (ttraits.isWowel(t = acip.substring(ll, ll + i))
// Or these, which we massage into "Am", "Am:", and // Or these, which we massage into "Am", "Am:", and
// "A:" because I didn't think {Pm} should be treated // "A:" because I didn't think {Pm} should be treated
// like {PAm} originally: // like {PAm} originally:
// TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE
|| "m".equals(t) || "m:".equals(t) || ":".equals(t)) { || "m".equals(t) || "m:".equals(t) || ":".equals(t)) {
r = t; r = t;
break; break;
@ -224,14 +285,14 @@ class TPairListFactory {
// Treat {BATA+SA'I} like {BAT+SA'I}: // Treat {BATA+SA'I} like {BAT+SA'I}:
int z; int z;
if (null != l && "A".equals(r) && ((z = ll + "A".length()) < xl) if (null != l && /* TODO(DLC)[EWTS->Tibetan]: */"A".equals(r) && ((z = ll + /* TODO(DLC)[EWTS->Tibetan]: */"A".length()) < xl)
&& acip.charAt(z) == '+') { && acip.charAt(z) == '+') {
acip.deleteCharAt(z-1); acip.deleteCharAt(z-1);
howMuch[0] = l.length() + 1; howMuch[0] = l.length() + 1;
return new TPair(l, "+"); return new TPair(l, "+");
} }
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. // Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */
int mod = 0; int mod = 0;
if ("m".equals(r)) { r = "Am"; mod = -1; } if ("m".equals(r)) { r = "Am"; mod = -1; }
if (":".equals(r)) { r = "A:"; mod = -1; } if (":".equals(r)) { r = "A:"; mod = -1; }
@ -239,19 +300,20 @@ class TPairListFactory {
if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though... if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though...
// what if we see a character that's not part of any vowel or // what if we see a character that's not part of any wowel or
// consonant? We return it. // consonant? We return it.
if (null == l && null == r) { if (null == l && null == r) {
howMuch[0] = 1; // not 2... howMuch[0] = 1; // not 2...
// add a '-' to avoid exponentials: // add a disambiguator to avoid exponential running time:
return new TPair(acip.substring(0, 1), (xl == 1) ? null : "-"); return new TPair(acip.substring(0, 1),
(xl == 1) ? null : ttraits.disambiguator());
} }
howMuch[0] = (((l == null) ? 0 : l.length()) howMuch[0] = (((l == null) ? 0 : l.length())
+ ((r == null) ? 0 : r.length()) + ((r == null) ? 0 : r.length())
+ mod); + mod);
return new TPair(l, r); return new TPair(l, r);
} } // TODO(DLC)[EWTS->Tibetan]:
} }

View file

@ -246,7 +246,7 @@ class TParseTree {
return al.toString(); return al.toString();
} }
/** Returns true if and only if either x is an TParseTree /** Returns true if and only if either x is a TParseTree
* object representing the same TPairLists in the same order * object representing the same TPairLists in the same order
* or x is a String that is equals to the result of {@link * or x is a String that is equals to the result of {@link
* #toString()}. */ * #toString()}. */

View file

@ -100,7 +100,7 @@ class TStackList {
return al.toString(); return al.toString();
} }
/** Returns true if and only if either x is an TStackList /** Returns true if and only if either x is a TStackList
* object representing the same TPairLists in the same * object representing the same TPairLists in the same
* order or x is a String that is equals to the result of {@link * order or x is a String that is equals to the result of {@link
* #toString()}. */ * #toString()}. */

View file

@ -62,7 +62,7 @@ class TStackListList {
return al.toString(); return al.toString();
} }
/** Returns true if and only if either x is an TStackListList /** Returns true if and only if either x is a TStackListList
* object representing the same TStackList objects in the same * object representing the same TStackList objects in the same
* order or x is a String that is equals to the result of {@link * order or x is a String that is equals to the result of {@link
* #toString()}. */ * #toString()}. */

View file

@ -53,7 +53,7 @@ public class TString {
return encoding; return encoding;
} }
/** Returns true if and only if an TString with type <i>type</i> /** Returns true if and only if a TString with type <i>type</i>
* is to be converted to something other than Tibetan text. * is to be converted to something other than Tibetan text.
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */ * (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
public boolean isLatin() { public boolean isLatin() {

View file

@ -53,7 +53,7 @@ class TTGCList implements TGCList {
/** Returns the number of TGCPairs in this list. */ /** Returns the number of TGCPairs in this list. */
public int size() { return al.size(); } public int size() { return al.size(); }
/** Returns a zero-based index of an TPairList inside the stack /** Returns a zero-based index of a TPairList inside the stack
* list from which this list was constructed. This pair list is * list from which this list was constructed. This pair list is
* the one that caused the TGCPair at index tgcPairIndex to come * the one that caused the TGCPair at index tgcPairIndex to come
* into existence. */ * into existence. */

View file

@ -0,0 +1,70 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003-2004 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
/** A TTraits object encapsulates everything that makes one particular
 * Roman transliteration scheme (e.g., ACIP or EWTS) different from
 * another.  Code that converts transliteration to Tibetan asks a
 * TTraits object scheme-specific questions -- what the disambiguator
 * is, how long a consonant or wowel can be, whether a string is a
 * consonant or a wowel -- instead of hard-coding the answers for one
 * scheme.
 *
 * <p>If both EWTS and ACIP transliterations have a property in
 * common, then it's likely encoded elsewhere in a manner that's hard
 * to modify.  But if they differ in some respect, then that
 * difference should be encoded in a TTraits object.
 *
 * <p>It is very likely that classes that implement this interface
 * will choose to use the design pattern 'singleton'. */
interface TTraits {
/** Returns the disambiguator for this transliteration scheme, i.e.
 * the string that separates two bits of transliteration that would
 * otherwise be interpreted differently.  The canonical example is
 * EWTS's {gya} versus {g.ya}, which, due to the disambiguator in
 * the latter, are different sequences of Unicode.
 *
 * <p>Implementations must return a string containing exactly one
 * character; otherwise {@link #disambiguatorChar()} is
 * nonsensical.
 * @return the one-character disambiguator string */
String disambiguator();
/** Returns the sole character in the string returned by {@link
 * #disambiguator()}.  This convenience method is possible only
 * because ACIP's and EWTS's disambiguator strings both have
 * length one.
 * @return the disambiguator as a char */
char disambiguatorChar();
/** Returns the maximum number of characters of transliteration
 * required to denote a Tibetan consonant.  Callers use this as
 * the upper bound on how many characters to try when looking for
 * the longest leading consonant.
 * @return the length of the longest consonant in this scheme */
int maxConsonantLength();
/** Returns the maximum number of characters of transliteration
 * required to denote a Tibetan wowel, i.e. a vowel or one or more
 * hangers-on or both.  Callers use this as the upper bound on how
 * many characters to try when looking for the longest wowel.
 * (TODO(DLC)[EWTS->Tibetan]: NOW: cite a concrete Unicode code
 * point as an example of a hanger-on.)
 * @return the length of the longest wowel in this scheme */
int maxWowelLength();
// TODO(DLC)[EWTS->Tibetan]: use the term 'wowel' everywhere, never "vowel" unless you mean just {e,i,o,u}
/** Returns true if and only if <em>s</em> is a stretch of
 * transliteration corresponding to a Tibetan consonant (without
 * any wowel).
 * @param s the candidate transliteration
 * @return true iff s denotes exactly one consonant */
boolean isConsonant(String s);
/** Returns true if and only if <em>s</em> is a stretch of
 * transliteration corresponding to a Tibetan wowel (without any
 * [achen or other] consonant).
 * @param s the candidate transliteration
 * @return true iff s denotes exactly one wowel */
boolean isWowel(String s);
/** Returns true if and only if the pair given has a simple error
 * other than being a mere disambiguator.  What counts as a
 * simple error is scheme-specific and is defined by the
 * implementing class.  NOTE(review): for ACIP this presumably
 * covers the pair (A . ) -- confirm against ACIPTraits.
 * @param p the pair to inspect
 * @return true iff p has a scheme-specific simple error */
boolean hasSimpleError(TPair p);
}