From a39c5c12b095cb7f519133512c9b7cc572c8a54a Mon Sep 17 00:00:00 2001
From: dchandler <dchandler>
Date: Mon, 8 Dec 2003 07:15:27 +0000
Subject: [PATCH] ACIP->TMW now supports EWTS PUA {\uF021}-style escapes.  Our
 extended ACIP is thus TMW-complete and useful for testing.

---
 .../org/thdl/tib/text/TibetanMachineWeb.java  | 25 +++++++++++++++----
 .../org/thdl/tib/text/ttt/ACIPConverter.java  | 12 ++++++++-
 source/org/thdl/tib/text/ttt/TString.java     |  6 ++++-
 3 files changed, 36 insertions(+), 7 deletions(-)
diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java
index 0752ef2..011f694 100644
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@@ -84,6 +84,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
         use special formatting to get those right (FIXME: warn
         whenever they're used). */
     private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
+    /** For mapping PUA codepoints U+F021..U+F0FF to TMW. */
+    private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1];
     private static String fileName = "tibwn.ini";
     private static final String DELIMITER = "~";
     /** vowels that appear over the glyph: */
@@ -603,6 +605,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
                                     // could well be null):
                                     TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
                                         = duffCodes[TM]; // TMW->TM mapping
+
+                                    if (wylie.toLowerCase().startsWith("\\uf0")) {
+                                        int x = Integer.parseInt(wylie.substring("\\u".length()), 16);
+                                        ThdlDebug.verify((x >= 0xF000
+                                                          && x <= 0xF0FF));
+                                        NonUnicodeToTMW[x - '\uF000']
+                                            = new DuffCode[] { duffCodes[TMW] };
+                                    }
                                     break;
                                 // Vowels etc. to use with this glyph:
                                 case 4:
@@ -628,8 +638,8 @@ public class TibetanMachineWeb implements THDLWylieConstants {
                                             String subval = uTok.nextToken();
                                             ThdlDebug.verify(subval.length() == 4 || subval.length() == 3);
                                             try {
-                                                int x;
-                                                ThdlDebug.verify(((x = Integer.parseInt(subval, 16)) >= 0x0F00
+                                                int x = Integer.parseInt(subval, 16);
+                                                ThdlDebug.verify((x >= 0x0F00
                                                                   && x <= 0x0FFF)
                                                                  || x == 0x5350
                                                                  || x == 0x534D
@@ -1769,9 +1779,14 @@ private static final String Unicode_tab = "\t";
         } else if ('\u0F81' == ch) {
             return tmwFor0F81;
         } else {
-            DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
-            if (null == x[0]) return null;
-            return x;
+            if (ch >= '\u0F00' && ch <= '\u0FFF') {
+                DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
+                if (null != x[0]) return x;
+            } else if (ch >= '\uF021' && ch <= '\uF0FF') {
+                DuffCode[] x = NonUnicodeToTMW[ch - '\uF000'];
+                if (null != x[0]) return x;
+            }
+            return null;
         }
     }
 
diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java
index 0b551b9..ac27ba0 100644
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@@ -607,8 +607,18 @@ public class ACIPConverter {
                             }
                             continue;
                         } else if (stype == TString.UNICODE_CHARACTER) {
+                            ThdlDebug.verify(1 == s.getText().length());
                             if (null != writer) {
-                                unicode = s.getText();
+                                char ch = s.getText().charAt(0);
+                                if (ch >= '\uF021' && ch <= '\uF0FF') {
+                                    hasErrors = true;
+                                    String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape '" + ch + "' with ordinal " + (int)ch + " is in the private-use area (PUA) of Unicode and will thus not be written out into the output lest you think other tools will be able to understand this non-standard construction.]";
+                                    writer.write(errorMessage);
+                                    if (null != errors)
+                                        errors.append(errorMessage + "\n");
+                                    continue; // FIXME: dropping output if null != tdoc
+                                } else
+                                    unicode = s.getText();
                             }
                             if (null != tdoc) {
                                 duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));
diff --git a/source/org/thdl/tib/text/ttt/TString.java b/source/org/thdl/tib/text/ttt/TString.java
index 632c531..22c4343 100644
--- a/source/org/thdl/tib/text/ttt/TString.java
+++ b/source/org/thdl/tib/text/ttt/TString.java
@@ -41,6 +41,7 @@ public class TString {
      *  is to be converted to something other than Tibetan text.
      *  (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
     public boolean isLatin() {
+        char ch;
         return (type != TIBETAN_NON_PUNCTUATION
                 && type != TIBETAN_PUNCTUATION
                 && type != TSHEG_BAR_ADORNMENT
@@ -49,7 +50,10 @@ public class TString {
                 && type != START_SLASH
                 && type != END_SLASH
                 && (type != UNICODE_CHARACTER
-                    || !UnicodeUtils.isInTibetanRange(getText().charAt(0))));
+                    || !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
+                         // EWTS maps some TMW glyphs to this Unicode
+                         // private-use area (PUA):
+                         || (ch >= '\uF021' && ch <= '\uF0FF'))));
     }
 
     /** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */