Numerous EWTS->Unicode and especially EWTS->TMW improvements.

Fixed ordering of Unicode wowels.  For example, [ku+A] now gives the
correct Unicode.

EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A].

EWTS->TMW should now give errors any time the full input isn't used.
Previously, wacky wowels like [kai+-i] would cause part of the input
to be silently dropped.

EWTS->TMW->Unicode testing is now in effect.  This found a ton of
EWTS->TMW bugs, most or all of which are fixed now.

TMW->Unicode is improved/fixed for {
\u5350,\u534D,\u0F88+k,\u0F88+kh,U }.  (Why U?  "\u0f75" is
discouraged in favor of "\u0f71\u0f74".)

NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly
builds' sake, but I ran it in my sandbox and it passed.
This commit is contained in:
dchandler 2005-07-11 02:51:06 +00:00
parent 36122778b4
commit 6d419fe641
19 changed files with 1014 additions and 547 deletions

View file

@ -21,6 +21,8 @@ package org.thdl.tib.text.ttt;
import java.math.BigInteger;
import java.util.ArrayList;
import org.thdl.tib.text.THDLWylieConstants;
/**
* This singleton class is able to break up Strings of EWTS text (for
* example, an entire sutra file) into tsheg bars, comments, etc.
@ -76,8 +78,11 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
StringBuffer sb = new StringBuffer(s);
ExpandEscapeSequences(sb);
int sl = sb.length();
// TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working
// TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode
// TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working (probably because)
// TODO(DLC)[EWTS->Tibetan]:: '#', in ewts->tmw, is not working
//
// TODO(DLC)[EWTS->Tibetan]:: 'jamX one is not working in ewts->tmw mode in the sense that X appears under the last glyph of the three instead of the middle glyph
//
// TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working
for (int i = 0; i < sl; i++) { // i is modified in the loop, also
if (isValidInsideTshegBar(sb.charAt(i))) {
@ -102,14 +107,14 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
al.add(new TString("EWTS", "//",
TString.TIBETAN_PUNCTUATION));
++i;
} else if ((sb.charAt(i) >= EWTSTraits.PUA_MIN
&& sb.charAt(i) <= EWTSTraits.PUA_MAX)
} else if ((sb.charAt(i) >= THDLWylieConstants.PUA_MIN
&& sb.charAt(i) <= THDLWylieConstants.PUA_MAX)
|| (sb.charAt(i) >= '\u0f00' && sb.charAt(i) <= '\u0f17')
|| (sb.charAt(i) >= '\u0f1a' && sb.charAt(i) <= '\u0f1f')
|| (sb.charAt(i) >= '\u0fbe' && sb.charAt(i) <= '\u0fcc')
|| (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1')
|| (EWTSTraits.SAUVASTIKA == sb.charAt(i))
|| (EWTSTraits.SWASTIKA == sb.charAt(i))
|| (THDLWylieConstants.SAUVASTIKA == sb.charAt(i))
|| (THDLWylieConstants.SWASTIKA == sb.charAt(i))
|| (" /;|!:=_@#$%<>()*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b".indexOf(sb.charAt(i))
>= 0)) {
al.add(new TString("EWTS", sb.substring(i, i+1),
@ -186,7 +191,31 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
// leave x == -1
}
if (x >= 0) {
sb.replace(i, i + "\\uXXXX".length(), new String(new char[] { (char)x }));
String replacement = String.valueOf((char)x);
if (false) {
// This would ruin EWTS->Unicode to
// help EWTS->TMW, so we don't do it.
// TODO(dchandler): Fix EWTS->TMW for
// \u0f02 and \u0f03.
// A nasty little HACK for you:
//
// TODO(dchandler): we may create "ga..u~M`H..ha" which may cause errors
String hack = null;
if ('\u0f02' == x) {
hack = "u~M`H"; // hard-coded EWTS
} else if ('\u0f03' == x) {
hack = "u~M`:"; // hard-coded EWTS
} else if ('\u0f00' == x) {
hack = "oM"; // hard-coded EWTS
}
if (null != hack) {
replacement = "." + hack + "."; // hard-coded EWTS disambiguators
i += replacement.length() - 1;
}
}
sb.replace(i, i + "\\uXXXX".length(), replacement);
continue;
}
}