ACIP->TMW now supports EWTS PUA {\uF021}-style escapes. Our extended ACIP is thus TMW-complete and useful for testing.

2003-12-08 07:15:27 +00:00 · 2003-12-08 07:15:27 +00:00 · a39c5c12b0
commit a39c5c12b0
parent 8f7322a056
3 changed files with 36 additions and 7 deletions
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@ -84,6 +84,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
        use special formatting to get those right (FIXME: warn
        whenever they're used). */
    private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
+    /** For mapping codepoints U+F021..U+F0FF to TMW. */
+    private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1];
    private static String fileName = "tibwn.ini";
    private static final String DELIMITER = "~";
    /** vowels that appear over the glyph: */
@ -603,6 +605,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
                                    // could well be null):
                                    TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
                                        = duffCodes[TM]; // TMW->TM mapping
+
+                                    if (wylie.toLowerCase().startsWith("\\uf0")) {
+                                        int x = Integer.parseInt(wylie.substring("\\u".length()), 16);
+                                        ThdlDebug.verify((x >= 0xF000
+                                                          && x <= 0xF0FF));
+                                        NonUnicodeToTMW[x - '\uF000']
+                                            = new DuffCode[] { duffCodes[TMW] };
+                                    }
                                    break;
                                // Vowels etc. to use with this glyph:
                                case 4:
@ -628,8 +638,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
                                            String subval = uTok.nextToken();
                                            ThdlDebug.verify(subval.length() == 4 || subval.length() == 3);
                                            try {
-                                                int x;
-                                                ThdlDebug.verify(((x = Integer.parseInt(subval, 16)) >= 0x0F00
+                                                int x = Integer.parseInt(subval, 16);
+                                                ThdlDebug.verify((x >= 0x0F00
                                                                  && x <= 0x0FFF)
                                                                 || x == 0x5350
                                                                 || x == 0x534D
@ -1769,9 +1779,14 @@ private static final String Unicode_tab = "\t";
        } else if ('\u0F81' == ch) {
            return tmwFor0F81;
        } else {
-            DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
-            if (null == x[0]) return null;
-            return x;
+            if (ch >= '\u0F00' && ch <= '\u0FFF') {
+                DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
+                if (null != x[0]) return x;
+            } else if (ch >= '\uF021' && ch <= '\uF0FF') {
+                DuffCode[] x = NonUnicodeToTMW[ch - '\uF000'];
+                if (null != x[0]) return x;
+            }
+            return null;
        }
    }

--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@ -607,8 +607,18 @@ public class ACIPConverter {
                            }
                            continue;
                        } else if (stype == TString.UNICODE_CHARACTER) {
+                            ThdlDebug.verify(1 == s.getText().length());
                            if (null != writer) {
-                                unicode = s.getText();
+                                char ch = s.getText().charAt(0);
+                                if (ch >= '\uF021' && ch <= '\uF0FF') {
+                                    hasErrors = true;
+                                    String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape '" + ch + "' with ordinal " + (int)ch + " is in the private-use area (PUA) of Unicode and will thus not be written out into the output lest you think other tools will be able to understand this non-standard construction.]";
+                                    writer.write(errorMessage);
+                                    if (null != errors)
+                                        errors.append(errorMessage + "\n");
+                                    continue; // FIXME: dropping output if null != tdoc
+                                } else
+                                    unicode = s.getText();
                            }
                            if (null != tdoc) {
                                duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));
--- a/source/org/thdl/tib/text/ttt/TString.java
+++ b/source/org/thdl/tib/text/ttt/TString.java
@ -41,6 +41,7 @@ public class TString {
     *  is to be converted to something other than Tibetan text.
     *  (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
    public boolean isLatin() {
+        char ch;
        return (type != TIBETAN_NON_PUNCTUATION
                && type != TIBETAN_PUNCTUATION
                && type != TSHEG_BAR_ADORNMENT
@ -49,7 +50,10 @@ public class TString {
                && type != START_SLASH
                && type != END_SLASH
                && (type != UNICODE_CHARACTER
-                    || !UnicodeUtils.isInTibetanRange(getText().charAt(0))));
+                    || !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
+                         // EWTS maps some TMW glyphs to this Unicode
+                         // private-use area (PUA):
+                         || (ch >= '\uF021' && ch <= '\uF0FF'))));
    }

    /** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */