Numerous EWTS->Unicode and especially EWTS->TMW improvements.

Fixed ordering of Unicode wowels; e.g., [ku+A] gives the correct Unicode now. EWTS->TMW looks better for some wacky wowels like, I'm guessing here, [ku+A]. EWTS->TMW should now give errors any time the full input isn't used; previously, wacky wowels like [kai+-i] would cause part of the input to be dropped. EWTS->TMW->Unicode testing is now in effect. This found a ton of EWTS->TMW bugs, most or all of which are fixed now. TMW->Unicode is improved/fixed for { \u5350,\u534D,\u0F88+k,\u0F88+kh,U }. (Why U? "\u0f75" is discouraged in favor of "\u0f71\u0f74".) NOTE: TMW_RTF_TO_THDL_WYLIETest is still disabled for the nightly builds' sake, but I ran it in my sandbox and it passed.
2005-07-11 02:51:06 +00:00 · 2005-07-11 02:51:06 +00:00 · 6d419fe641
commit 6d419fe641
parent 36122778b4
19 changed files with 1014 additions and 547 deletions
--- a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
@@ -21,6 +21,8 @@ package org.thdl.tib.text.ttt;
 import java.math.BigInteger;
 import java.util.ArrayList;

+import org.thdl.tib.text.THDLWylieConstants;
+
 /**
 * This singleton class is able to break up Strings of EWTS text (for
 * example, an entire sutra file) into tsheg bars, comments, etc.
@@ -76,8 +78,11 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
    StringBuffer sb = new StringBuffer(s);
    ExpandEscapeSequences(sb);
    int sl = sb.length();
-    // TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working
-    // TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode
+    // TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working (probably because)
+    // TODO(DLC)[EWTS->Tibetan]:: '#', in ewts->tmw, is not working
+    //
+    // TODO(DLC)[EWTS->Tibetan]:: 'jamX one is not working in ewts->tmw mode in the sense that X appears under the last glyph of the three instead of the middle glyph
+    //
    // TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working
    for (int i = 0; i < sl; i++) {  // i is modified in the loop, also
      if (isValidInsideTshegBar(sb.charAt(i))) {
@@ -102,14 +107,14 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
          al.add(new TString("EWTS", "//",
                             TString.TIBETAN_PUNCTUATION));
          ++i;
-        } else if ((sb.charAt(i) >= EWTSTraits.PUA_MIN
-                    && sb.charAt(i) <= EWTSTraits.PUA_MAX)
+        } else if ((sb.charAt(i) >= THDLWylieConstants.PUA_MIN
+                    && sb.charAt(i) <= THDLWylieConstants.PUA_MAX)
                   || (sb.charAt(i) >= '\u0f00' && sb.charAt(i) <= '\u0f17')
                   || (sb.charAt(i) >= '\u0f1a' && sb.charAt(i) <= '\u0f1f')
                   || (sb.charAt(i) >= '\u0fbe' && sb.charAt(i) <= '\u0fcc')
                   || (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1')
-                   || (EWTSTraits.SAUVASTIKA == sb.charAt(i))
-                   || (EWTSTraits.SWASTIKA == sb.charAt(i))
+                   || (THDLWylieConstants.SAUVASTIKA == sb.charAt(i))
+                   || (THDLWylieConstants.SWASTIKA == sb.charAt(i))
                   || (" /;|!:=_@#$%<>()*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b".indexOf(sb.charAt(i))
                       >= 0)) {
          al.add(new TString("EWTS", sb.substring(i, i+1),
@@ -186,7 +191,31 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
                            // leave x == -1
                        }
                        if (x >= 0) {
-                            sb.replace(i, i + "\\uXXXX".length(), new String(new char[] { (char)x }));
+                            String replacement = String.valueOf((char)x);
+
+                            if (false) {
+                                // This would ruin EWTS->Unicode to
+                                // help EWTS->TMW, so we don't do it.
+                                // TODO(dchandler): Fix EWTS->TMW for
+                                // \u0f02 and \u0f03.
+
+                                // A nasty little HACK for you:
+                                //
+                                // TODO(dchandler): we may create "ga..u~M`H..ha" which may cause errors
+                                String hack = null;
+                                if ('\u0f02' == x) {
+                                    hack = "u~M`H";  // hard-coded EWTS
+                                } else if ('\u0f03' == x) {
+                                    hack = "u~M`:";  // hard-coded EWTS
+                                } else if ('\u0f00' == x) {
+                                    hack = "oM";  // hard-coded EWTS
+                                }
+                                if (null != hack) {
+                                    replacement = "." + hack + ".";  // hard-coded EWTS disambiguators
+                                    i += replacement.length() - 1;
+                                }
+                            }
+                            sb.replace(i, i + "\\uXXXX".length(), replacement);
                            continue;
                        }
                    }