TMW->EWTS:

Fixed part of bug 998476 and part of an undocumented bug.  Discovered a
new bug: "aM" should be generated, but only "M" is.

The undocumented bug was that laMA was generated when lAM should have been.

The part of bug 998476 that was fixed: laM, laH, etc. are now generated.

This does nothing about paN etc.

Some refactoring here; this is not a minimal diff.

Added tests of TMW->EWTS that use ACIP to get the TMW in place,
because the EWTS->TMW keyboard is faulty at present.
This commit is contained in:
dchandler 2005-02-07 03:17:40 +00:00
parent a82afad92c
commit 8dcb623382
10 changed files with 1033 additions and 699 deletions

View file

@ -444,6 +444,11 @@ Contributor(s): ______________________________________.
description="compiles all JUnit test cases that can be compiled in the present CLASSPATH (NB that this distinction is just wishful thinking for now because we have such weak test coverage at this point)" >
<mkdir dir="${junitbin}"/>
<antcall target="create-timestamp-source-code"/> <!-- DLC NOW! The -run targets are mucking with this! It isn't fatal, but it should be fixed. -->
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"
value="org/thdl/tib/text/TibetanMachineWebTest.java"/>
</antcall>
<antcall target="our-internal-javac-task">
<param name="mybin" value="${junitbin}"/>
<param name="my.included.source.file"

View file

@ -73,6 +73,7 @@
<formatter type="xml"/><!-- If not XML, then 'ant -buildfile
build.xml check-report' will fail. -->
<sysproperty key="java.awt.headless" value="true"/>
<test name="org.thdl.tib.text.TibetanMachineWebTest"/>
<test name="org.thdl.tib.text.ttt.PackageTest"/>
<test name="org.thdl.tib.text.ttt.LotsOfTshegBarsTest"/>
<test name="org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIETest"/>

Binary file not shown.

File diff suppressed because it is too large Load diff

View file

@ -20,11 +20,6 @@ package org.thdl.tib.input;
import junit.framework.TestCase;
import javax.swing.Action;
import javax.swing.KeyStroke;
import org.thdl.util.ThdlOptions;
/**
@author David Chandler
@ -45,7 +40,25 @@ public class TinyTest extends DuffPaneTestBase {
*/
public void testBug998476() {
// Regression test for bug 998476 (per the method name).
// NOTE(review): e(..) is defined outside this view.  Presumably
// e(x) enters x on the active keyboard and expects x back, while
// e(in, out) expects `out' instead -- confirm against the base class.
enableEWTSKeyboard();
// NOTE(review): this view is a diff with the +/- markers stripped;
// e("lM") here and e("lM", "laM") below look like the old and new
// versions of the same check -- confirm which one the file keeps.
e("lM");
e("M");
e("laM");
e("lM", "laM");
e("kaH");
e("gam");
e("gam?");
// Each of the marks below is exercised both alone and after "la".
e("?");
e("la?");
e("&");
e("la&");
e("H");
e("laH");
e("HM");
e("laHM");
}
}

View file

@ -18,6 +18,8 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text;
import org.thdl.util.ThdlDebug;
/** An ordered pair consisting of a Tibetan grapheme cluster's (see
{@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
@ -105,8 +107,13 @@ public class TGCPair implements THDLWylieConstants {
b.append(ch);
}
}
if (vowelWylie != null)
if (vowelWylie != null) {
if (consonantWylie != null // we need a stack to put an 'a' onto
&& !TibetanMachineWeb.startsWithWylieVowelSequence(vowelWylie)) {
b.append("a"); // we want laM, not lM -- see bug 998476
}
b.append(vowelWylie);
}
return b.toString();
}
public String getACIP() {
@ -182,8 +189,9 @@ public class TGCPair implements THDLWylieConstants {
int realClassification = -37;
if (vowelWylie == null && classification == TYPE_TIBETAN)
realClassification = CONSONANTAL_WITHOUT_VOWEL;
if (vowelWylie != null && classification == TYPE_TIBETAN)
if (vowelWylie != null && classification == TYPE_TIBETAN) {
realClassification = CONSONANTAL_WITH_VOWEL;
}
if (vowelWylie == null && classification == TYPE_SANSKRIT)
realClassification = SANSKRIT_WITHOUT_VOWEL;
if (vowelWylie != null && classification == TYPE_SANSKRIT)
@ -204,10 +212,72 @@ public class TGCPair implements THDLWylieConstants {
if (vowelWylie.equals("uA") || vowelWylie.equals("Au"))
vowelWylie = "U";
}
this.vowelWylie = vowelWylie;
// Normalize vowelWylie such that any real vowel (i.e., a
// vowel for which TibetanMachineWeb.isWylieVowel(..) returns
// true) comes before any non-vowel but vowelish combining
// character like 'M', '?', '~M', '~M`', or 'H':
this.vowelWylie = normalizedVowel(vowelWylie);
this.classification = realClassification;
}
private static final int MAX_CHARACTERS_IN_VOWELISH = 3; // ~M` has 3 characters
/** Returns v reordered so that all the true vowels in v (those for
    which TibetanMachineWeb.isWylieVowel(..) returns true) come first
    and the vowelish combining marks (see
    isWylieVowelishButNotVowel(..)) come last.  Relative order within
    each of the two groups is preserved.  Tokens are recognized
    greedily, longest candidate first.

    @param v the raw vowel string; may be null or empty
    @return null if v is null or empty; v itself if it consists
    solely of true vowels; otherwise the vowels followed by the
    marks.  Text that cannot be tokenized is appended after the marks
    and flagged via ThdlDebug.noteIffyCode(). */
private static String normalizedVowel(String v) {
    if (null == v || "".equals(v)) return null;
    StringBuffer vowel_sb = new StringBuffer();
    StringBuffer vowelish_sb = new StringBuffer();
    // Index just past the last token that was recognized; everything
    // in v before mark has already been copied to one of the buffers.
    int mark = 0;
    int maxVowelishLength
        = Math.max(TibetanMachineWeb.getMaxEwtsVowelLength(),
                   TGCPair.MAX_CHARACTERS_IN_VOWELISH);
    for (int i = 0; i < v.length(); i++) {
        // Grab ~M` if it's there because you don't want to grab
        // ~M and then have ` left over, which is not vowelish.
        for (int j = maxVowelishLength; j > 0; j--) {
            if (i + j <= v.length()) {
                String x = v.substring(mark, i + j);
                if (TibetanMachineWeb.isWylieVowel(x)) {
                    mark = i + j;
                    i += j - 1; // the outer i++ then lands on mark
                    vowel_sb.append(x);
                    // A token was consumed: restart the longest-first
                    // search at the new position rather than probing
                    // shorter candidates with the advanced indices.
                    break;
                } else if (isWylieVowelishButNotVowel(x)) {
                    mark = i + j;
                    i += j - 1; // the outer i++ then lands on mark
                    vowelish_sb.append(x);
                    break;
                }
            }
        }
    }
    if (mark < v.length()) {
        // Unrecognized remainder: keep it, after the real vowels.
        vowelish_sb.append(v.substring(mark));
        ThdlDebug.noteIffyCode();
        // FIXME(dchandler): what should I do here?  I doubt v is
        // valid.
    }
    if (vowelish_sb.length() > 0) { // just an optimization
        vowel_sb.append(vowelish_sb);
        return vowel_sb.toString();
    } else {
        // Nothing was moved, so the vowels alone must reproduce v.
        ThdlDebug.verify(vowel_sb.toString().equals(v));
        return v;
    }
}
/** Tells whether v is one of the vowelish combining marks, i.e. a
    member of the set { "M", "H", "~M", "~M`", "?" }.  Also verifies
    that no such mark is longer than MAX_CHARACTERS_IN_VOWELISH.
    @param v the candidate string; must not be null
    @return true if and only if v is one of the five marks */
private static boolean isWylieVowelishButNotVowel(String v) {
    final String[] marks = { "M", "H", "~M`", "~M", "?" };
    boolean found = false;
    for (int k = 0; k < marks.length && !found; k++) {
        found = v.equals(marks[k]);
    }
    ThdlDebug.verify(!found || TGCPair.MAX_CHARACTERS_IN_VOWELISH >= v.length());
    return found;
}
public String toString() {
return "<TGCPair wylie=" + getWylie() + " classification="
+ classification + "/>";

View file

@ -1015,7 +1015,9 @@ public class TibTextUtils implements THDLWylieConstants {
// breaks TMW->ACIP and TMW->EWTS. Test it. When it
// does, revamp TGCPair to have a set of vowels. The
// output order should be consistent with the
// Unicode-imposed order on vowels.
// Unicode-imposed order on vowels. (Maybe modulo the
// CCV bug in Unicode w.r.t. above- and below-base
// vowels?)
} else {
// number or weird thing:

View file

@ -65,6 +65,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
private static Set sanskritStackSet = null;
private static Set numberSet = null;
private static Set vowelSet = null;
private static int maxEwtsVowelLength = -1;
private static Set puncSet = null;
private static Set topSet = null;
private static Set leftSet = null;
@ -374,6 +375,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
while (sTok.hasMoreTokens()) {
String ntk;
vowelSet.add(ntk = sTok.nextToken());
if (maxEwtsVowelLength < ntk.length())
maxEwtsVowelLength = ntk.length();
validInputSequences.put(ntk, anyOldObjectWillDo);
}
@ -805,6 +808,11 @@ public static boolean setKeyboard(TibetanKeyboard kb) {
hasAVowel = true;
aVowel = WYLIE_aVOWEL;
if (!vowelSet.contains(WYLIE_aVOWEL)) {
ThdlDebug.noteIffyCode();
// iffy because vowels contains 'a' and because
// maxEwtsVowelLength better be correct if this branch is
// ever taken
vowelSet.add(WYLIE_aVOWEL);
validInputSequences.put(WYLIE_aVOWEL, anyOldObjectWillDo);
}
@ -1111,6 +1119,12 @@ public static boolean isAmbiguousWylie(String x, String y) {
);
}
/** Returns the number of characters in the longest EWTS vowel.
    Verifies (via ThdlDebug.verify) that the cached length is
    positive before returning it.
    @return the length, in characters, of the longest EWTS vowel */
public static int getMaxEwtsVowelLength() {
    final int longest = maxEwtsVowelLength;
    ThdlDebug.verify(longest > 0);
    return longest;
}
/**
* Checks to see if the passed string
* is a vowel in Extended Wylie.
@ -1122,6 +1136,22 @@ public static boolean isWylieVowel(String s) {
return vowelSet.contains(s);
}
/**
 * Checks to see if the passed string begins with an EWTS vowel,
 * i.e. if some prefix of it, at most as long as the longest EWTS
 * vowel, satisfies {@link #isWylieVowel(String)}.
 * @param s the string to be checked
 * @return true if s starts with a vowel in Extended Wylie
 * transliteration, false if not (in particular, false for the
 * empty string) */
public static boolean startsWithWylieVowelSequence(String s) {
    int prefixLen = 1;
    // Try prefixes of growing length until the string or the longest
    // possible vowel is exhausted.
    while (prefixLen <= maxEwtsVowelLength && prefixLen <= s.length()) {
        if (isWylieVowel(s.substring(0, prefixLen)))
            return true;
        ++prefixLen;
    }
    return false;
}
/** Returns true if and only if wylie is the THDL Extended Wylie for
an adornment. An adornment is something that is part of a stack
but is not a consonant, such as a Tibetan or Sanskrit vowel or a

View file

@ -0,0 +1,73 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2002-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
import junit.framework.TestCase;
import org.thdl.util.ThdlOptions;
/**
Tests a tiny part of TibetanMachineWeb's functionality.  This class
gets tested much better by other test classes, really, though it's
not exactly at the unit level.

@author David Chandler */
public class TibetanMachineWebTest extends TestCase {
    /** Creates the test case with the given name.
        @param a0 the name of the test, passed on to TestCase */
    public TibetanMachineWebTest(String a0) {
        super(a0);
    }

    /** Invokes a text UI and runs all this class's tests. */
    public static void main(String[] args) {
        junit.textui.TestRunner.run(TibetanMachineWebTest.class);
    }

    /** Sets the user preferences these tests rely on, without
        consulting any options file. */
    protected void setUp() {
        // We don't want to use options.txt:
        ThdlOptions.forTestingOnlyInitializeWithoutDefaultOptionsFile();
        ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.and.error.severities.are.built.in.defaults", "true");
        ThdlOptions.setUserPreference("thdl.acip.to.tibetan.warning.severity.507", "Most");
        // We don't want to load the TM or TMW font files ourselves:
        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

    /** Tests {@link
        TibetanMachineWeb#startsWithWylieVowelSequence(String)}. */
    public void testStartsWithWylieVowelSequence() {
        // Vowelish marks (and fragments of them) are not vowels:
        assertFalse(TibetanMachineWeb.startsWithWylieVowelSequence("M"));
        assertFalse(TibetanMachineWeb.startsWithWylieVowelSequence("HM"));
        assertFalse(TibetanMachineWeb.startsWithWylieVowelSequence("?"));
        assertFalse(TibetanMachineWeb.startsWithWylieVowelSequence("~"));
        assertFalse(TibetanMachineWeb.startsWithWylieVowelSequence("~M"));
        assertFalse(TibetanMachineWeb.startsWithWylieVowelSequence("~M`"));
        // Anything that begins with a real vowel qualifies, whatever
        // follows:
        assertTrue(TibetanMachineWeb.startsWithWylieVowelSequence("u"));
        assertTrue(TibetanMachineWeb.startsWithWylieVowelSequence("e"));
        assertTrue(TibetanMachineWeb.startsWithWylieVowelSequence("eu"));
        assertTrue(TibetanMachineWeb.startsWithWylieVowelSequence("-I"));
        assertTrue(TibetanMachineWeb.startsWithWylieVowelSequence("eieio"));
        assertTrue(TibetanMachineWeb.startsWithWylieVowelSequence("auai-iAI"));
    }
}

View file

@ -56,10 +56,152 @@ public class PackageTest extends TestCase {
public PackageTest() { }
/** Round-trips ACIP through TMW and back to ACIP.  Converts ACIP to
    TMW with no warnings; if errors occur there, returns null.
    Otherwise, returns the result of TMW->ACIP, which may be an error
    message. */
static String ACIP2TMW2ACIP(String ACIP) {
    final boolean wantEwts = false; // we want ACIP back out
    return ACIP2TMW2Translit(wantEwts, ACIP);
}
/** Converts ACIP to TMW and then transliterates the TMW as EWTS, by
    delegating to ACIP2TMW2Translit(true, ACIP). */
static String ACIP2TMW2EWTS(String ACIP) {
    final boolean wantEwts = true; // we want EWTS, not ACIP, back out
    return ACIP2TMW2Translit(wantEwts, ACIP);
}
/** Tests TMW->EWTS conversion using ACIP to input those TMW
    glyphs that are just hard to input (because of the currently
    broken EWTS->TMW converter and keyboard) without getting ACIP
    involved.  Each assertion names the expected transliteration
    first and the converter's actual output second, so that JUnit
    failure messages read the right way around. */
public void testTmw2Ewts() {
    assertEquals("oM", ACIP2TMW2EWTS("\\u0F00"));
    assertEquals("\\u0F01", ACIP2TMW2EWTS("\\u0F01"));
    // The EWTS spec would make you think that
    // ACIP("\\u0f02")->TMW->EWTS would return "\\u0F02", but
    // there is no single TMW glyph for U+0F02, so the round trip
    // is impossible:
    assertEquals("'u~M`H", ACIP2TMW2EWTS("\\u0F02"));
    // Similarly for U+0F03:
    assertEquals("'u~M`:", ACIP2TMW2EWTS("\\u0F03"));
    assertEquals("@", ACIP2TMW2EWTS("\\u0F04"));
    assertEquals("#", ACIP2TMW2EWTS("\\u0F05"));
    assertEquals("$", ACIP2TMW2EWTS("\\u0F06"));
    assertEquals("%", ACIP2TMW2EWTS("\\u0F07"));
    assertEquals("!", ACIP2TMW2EWTS("\\u0F08"));
    assertEquals("\\u0F09", ACIP2TMW2EWTS("\\u0F09"));
    assertEquals("\\u0F0A", ACIP2TMW2EWTS("\\u0F0A"));
    assertEquals(" ", ACIP2TMW2EWTS("\\u0F0B"));
    assertEquals("*", ACIP2TMW2EWTS("\\u0F0C"));
    assertEquals("/", ACIP2TMW2EWTS("\\u0F0D"));
    assertEquals("//", ACIP2TMW2EWTS("\\u0F0E"));
    assertEquals(";", ACIP2TMW2EWTS("\\u0F0F"));
    assertEquals("\\u0F10", ACIP2TMW2EWTS("\\u0F10"));
    assertEquals("|", ACIP2TMW2EWTS("\\u0F11"));
    assertEquals("\\u0F12", ACIP2TMW2EWTS("\\u0F12"));
    assertEquals("\\u0F13", ACIP2TMW2EWTS("\\u0F13"));
    assertEquals(":", ACIP2TMW2EWTS("\\u0F14"));
    assertEquals("\\u0F15", ACIP2TMW2EWTS("\\u0F15"));
    assertEquals("\\u0F16", ACIP2TMW2EWTS("\\u0F16"));
    assertEquals("\\u0F17", ACIP2TMW2EWTS("\\u0F17"));
    assertEquals("\\u0F18", ACIP2TMW2EWTS("\\u0F18"));
    assertEquals("\\u0F19", ACIP2TMW2EWTS("\\u0F19"));
    assertEquals("\\u0F1A", ACIP2TMW2EWTS("\\u0F1A"));
    assertEquals("\\u0F1B", ACIP2TMW2EWTS("\\u0F1B"));
    assertEquals("\\u0F1C", ACIP2TMW2EWTS("\\u0F1C"));
    assertEquals("\\u0F1D", ACIP2TMW2EWTS("\\u0F1D"));
    assertEquals("\\u0F1E", ACIP2TMW2EWTS("\\u0F1E"));
    assertEquals("\\u0F1F", ACIP2TMW2EWTS("\\u0F1F"));
    assertEquals("0", ACIP2TMW2EWTS("\\u0F20"));
    assertEquals("1", ACIP2TMW2EWTS("\\u0F21"));
    assertEquals("2", ACIP2TMW2EWTS("\\u0F22"));
    assertEquals("3", ACIP2TMW2EWTS("\\u0F23"));
    assertEquals("4", ACIP2TMW2EWTS("\\u0F24"));
    assertEquals("5", ACIP2TMW2EWTS("\\u0F25"));
    assertEquals("6", ACIP2TMW2EWTS("\\u0F26"));
    assertEquals("7", ACIP2TMW2EWTS("\\u0F27"));
    assertEquals("8", ACIP2TMW2EWTS("\\u0F28"));
    assertEquals("9", ACIP2TMW2EWTS("\\u0F29"));
    assertEquals("\\u0F2A", ACIP2TMW2EWTS("\\u0F2A"));
    assertEquals("\\u0F2B", ACIP2TMW2EWTS("\\u0F2B"));
    assertEquals("\\u0F2C", ACIP2TMW2EWTS("\\u0F2C"));
    assertEquals("\\u0F2D", ACIP2TMW2EWTS("\\u0F2D"));
    assertEquals("\\u0F2E", ACIP2TMW2EWTS("\\u0F2E"));
    assertEquals("\\u0F2F", ACIP2TMW2EWTS("\\u0F2F"));
    assertEquals("\\u0F30", ACIP2TMW2EWTS("\\u0F30"));
    assertEquals("\\u0F31", ACIP2TMW2EWTS("\\u0F31"));
    assertEquals("\\u0F32", ACIP2TMW2EWTS("\\u0F32"));
    assertEquals("\\u0F33", ACIP2TMW2EWTS("\\u0F33"));
    assertEquals("=", ACIP2TMW2EWTS("\\u0F34"));
    assertEquals("~X", ACIP2TMW2EWTS("\\u0F35"));
    assertEquals("\\u0F36", ACIP2TMW2EWTS("\\u0F36"));
    assertEquals("X", ACIP2TMW2EWTS("\\u0F37"));
    assertEquals("\\u0F38", ACIP2TMW2EWTS("\\u0F38"));
    assertEquals("^", ACIP2TMW2EWTS("\\u0F39"));
    assertEquals("<", ACIP2TMW2EWTS("\\u0F3A"));
    assertEquals(">", ACIP2TMW2EWTS("\\u0F3B"));
    assertEquals("(", ACIP2TMW2EWTS("\\u0F3C"));
    assertEquals(")", ACIP2TMW2EWTS("\\u0F3D"));
    assertEquals("}", ACIP2TMW2EWTS("\\u0F3E"));
    assertEquals("{", ACIP2TMW2EWTS("\\u0F3F"));
    assertEquals("ka", ACIP2TMW2EWTS("\\u0F40"));
    assertEquals("la", ACIP2TMW2EWTS("\\u0f63"));
    // The conversion is expected to yield null here:
    assertNull(ACIP2TMW2EWTS("\\u0F48"));
    assertEquals("M", ACIP2TMW2EWTS("\\u0f7e"));
    assertEquals("H", ACIP2TMW2EWTS("\\u0f7f"));
    assertEquals("~M`", ACIP2TMW2EWTS("\\u0f82"));
    assertEquals("~M", ACIP2TMW2EWTS("\\u0f83"));
    assertEquals("?", ACIP2TMW2EWTS("\\u0f84"));
    assertEquals("&", ACIP2TMW2EWTS("\\u0f85"));
    assertEquals("\\u0F86", ACIP2TMW2EWTS("\\u0F86"));
    assertEquals("\\u0F87", ACIP2TMW2EWTS("\\u0F87"));
    assertEquals("\\u0F88", ACIP2TMW2EWTS("\\u0F88"));
    assertEquals("\\u0F89", ACIP2TMW2EWTS("\\u0F89"));
    assertEquals("\\u0F8A", ACIP2TMW2EWTS("\\u0F8A"));
    assertEquals("\\u0F8B", ACIP2TMW2EWTS("\\u0F8B"));
    assertEquals("ya", ACIP2TMW2EWTS("\\u0fb1"));
    assertEquals("Ya", ACIP2TMW2EWTS("\\u0fbb"));
    assertEquals("\\u0FBE", ACIP2TMW2EWTS("\\u0FBE"));
    assertEquals("\\u0FBF", ACIP2TMW2EWTS("\\u0FBF"));
    assertEquals("\\u0FCF", ACIP2TMW2EWTS("\\u0FCF"));
    assertEquals("\\uF023", ACIP2TMW2EWTS("\\uF023")); // U+f023 is in the PUA of unicode.  See EWTS spec.
    assertEquals("la la ", ACIP2TMW2EWTS("LA \\u0020LA ")); // DLC FIXME: no '_'?
    assertEquals("la la ", ACIP2TMW2EWTS("LA \\u00A0LA ")); // DLC FIXME: no '_'?
    // TODO(dchandler): 0f00 - 0fff: test 'em all
    assertEquals("la", ACIP2TMW2EWTS("\\u0f63"));
    assertEquals("laM", ACIP2TMW2EWTS("\\u0f63\\u0f7e"));
    assertEquals("lAM", ACIP2TMW2EWTS("\\u0f63\\u0f7e\\u0f71"));
    assertEquals("la~M`", ACIP2TMW2EWTS("LA\\u0f82"));
    assertEquals("la~M", ACIP2TMW2EWTS("LA\\u0f83"));
    assertEquals("H~M`", ACIP2TMW2EWTS("\\u0f7f\\u0f82"));
    assertEquals("laH~M`", ACIP2TMW2EWTS("LA\\u0f7f\\u0f82"));
    assertEquals("H~M`?", ACIP2TMW2EWTS("\\u0f7f\\u0f82\\u0F84"));
    assertEquals("laH~M`?", ACIP2TMW2EWTS("LA\\u0f7f\\u0f82\\u0F84"));
    assertEquals("lAM", ACIP2TMW2EWTS("\\u0f63\\u0f71\\u0f7e"));
    uhelp("\\u0f63\\u0f7e\\u0f71", "\u0f63\u0f7e\u0f71");
    uhelp("\\u0f63\\u0f71\\u0f7e", "\u0f63\u0f71\u0f7e");
    assertEquals("a", ACIP2TMW2EWTS("\\u0f68"));
    assertEquals("i", ACIP2TMW2EWTS("\\u0f68\\u0f72"));
    assertEquals("M", ACIP2TMW2EWTS("\\u0f68\\u0f7e")); // DLC FIXME: should be "aM"
    assertEquals("Am", ACIP2TMW2ACIP("\\u0f68\\u0f7e"));
    assertEquals("m", ACIP2TMW2ACIP("\\u0f7e"));
    assertEquals("a+ya", ACIP2TMW2EWTS("A+YA"));
    assertEquals("a+r+yu", ACIP2TMW2EWTS("A+R+YU"));
    assertEquals("a+r+y-i", ACIP2TMW2EWTS("A+R+Yi"));
}
/** Converts ACIP to TMW with no warnings. If errors occur there,
returns null. Otherwise, returns the result of TMW->ACIP or
TMW->EWTS (depending on EWTSNotACIP), which may be an error
message. */
static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.scan(ACIP, errors, -1, false, "None");
if (null == al || errors.length() > 0)
@ -82,8 +224,10 @@ public class PackageTest extends TestCase {
} catch (java.io.IOException e) {
assertTrue("I/O exception?", false);
}
boolean noSuchACIP[] = new boolean[] { false };
return tdoc.getACIP(noSuchACIP);
if (EWTSNotACIP)
return tdoc.getWylie(new boolean[] { false });
else
return tdoc.getACIP(new boolean[] { false });
}