ACIP->TMW and ACIP->Unicode conversion now allow Unicode escapes like K\u0F84. This mitigates the lack of support for ACIP's backslash, '\\' (the Sanskrit virama), because you can rewrite ACIP {K\} as ACIP {K\u0F84}.

Support for U+F021-U+F0FF, the Private Use Area (PUA) range that the latest EWTS uses, is not provided.
2003-11-29 22:56:18 +00:00 · 2003-11-29 22:56:18 +00:00 · dfaae4be93
commit dfaae4be93
parent 946d8cbc72
6 changed files with 845 additions and 16 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@ -266,6 +266,8 @@ public class ACIPConverter {
        throws IOException
    {
        try {
+        if (null != tdoc && (toUnicode && !toRTF))
+            throw new Error("Doing both at once might work, but it's not been tested.  I bet some 'continue;' statements will need to go.");
        if (toUnicode && toRTF)
            throw new Error("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes.  See RFE 838591.");
        if (!toUnicode && !toRTF)
@ -363,7 +365,7 @@ public class ACIPConverter {
                    warnings.append('\n');
                }
            } else {
-                if (s.isLatin(stype)) {
+                if (s.isLatin()) {
                    lastGuyWasNonPunct = false;
                    lastGuy = null;
                    String text
@ -576,7 +578,7 @@ public class ACIPConverter {
                                        tdoc.appendRoman(tdocLocation[0], s.getText(),
                                                         Color.BLACK);
                                        tdocLocation[0] += s.getText().length();
-                                        continue;
+                                        continue; // FIXME: this means the unicode above doesn't go into the output if null != writer && null != tdoc?
                                    } else {
                                        String wy = ACIPRules.getWylieForACIPOther(s.getText());
                                        if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
@ -594,6 +596,24 @@ public class ACIPConverter {
                                tdoc.setTibetanFontSize(regularFontSize);
                            }
                            continue;
+                        } else if (stype == TString.UNICODE_CHARACTER) {
+                            if (null != writer) {
+                                unicode = s.getText();
+                            }
+                            if (null != tdoc) {
+                                duff = TibetanMachineWeb.mapUnicodeToTMW(s.getText().charAt(0));
+                                if (null == duff) {
+                                    hasErrors = true;
+                                    String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: The Unicode escape with ordinal " + (int)s.getText().charAt(0) + " does not match up with any TibetanMachineWeb glyph.]";
+                                    tdoc.appendRoman(tdocLocation[0],
+                                                     errorMessage,
+                                                     Color.RED);
+                                    tdocLocation[0] += errorMessage.length();
+                                    if (null != errors)
+                                        errors.append(errorMessage + "\n");
+                                    continue; // FIXME: if null != writer, we dropped some output.
+                                }
+                            }
                        } else {
                            throw new Error("forgot a case");
                        }
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@ -23,6 +23,7 @@ import java.util.ArrayList;
 import java.util.Stack;

 import org.thdl.util.ThdlDebug;
+import org.thdl.util.ThdlOptions;

 /**
 * This class is able to break up Strings of ACIP text (for example, an
@ -903,11 +904,31 @@ public class ACIPTshegBarScanner {
                            errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
                                          + "Found an illegal, unprintable character.\n");
                    } else if ('\\' == ch) {
-                        al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.",
-                                           TString.ERROR));
-                        if (null != errors)
-                            errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
-                                          + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.\n");
+                        int x = -1;
+                        if (!ThdlOptions.getBooleanOption("thdl.tib.text.disallow.unicode.character.escapes.in.acip")
+                            && i + 5 < sl && 'u' == s.charAt(i+1)) {
+                            try {
+                                if (!((x = Integer.parseInt(s.substring(i+2, i+6), 16)) >= 0x0000 && x <= 0xFFFF))
+                                    x = -1;
+                            } catch (NumberFormatException e) {
+                                // Though this is unlikely to be
+                                // legal, we allow it through.
+                                // (FIXME: warn.)
+                            }
+                        }
+                        if (x >= 0) {
+                            al.add(new TString(new String(new char[] { (char)x }),
+                                               TString.UNICODE_CHARACTER));
+                            i += "uXXXX".length();
+                            startOfString = i+1;
+                            break;
+                        } else {
+                            al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.",
+                                               TString.ERROR));
+                            if (null != errors)
+                                errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
+                                              + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.\n");
+                        }
                    } else {
                        al.add(new TString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
                                           TString.ERROR));
--- a/source/org/thdl/tib/text/ttt/TString.java
+++ b/source/org/thdl/tib/text/ttt/TString.java
@ -19,6 +19,8 @@ Contributor(s): ______________________________________.
 package org.thdl.tib.text.ttt;

 import org.thdl.util.ThdlOptions;
+import org.thdl.util.ThdlDebug;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;

 import java.util.HashSet;
 import java.io.*;
@ -35,16 +37,19 @@ public class TString {
    private int type;
    private String text;

-    /** Returns true if and only if an TString with type type is to
-     *  be converted to Latin, not Tibetan, text. */
-    public static boolean isLatin(int type) {
+    /** Returns true if and only if an TString with type <i>type</i>
+     *  is to be converted to something other than Tibetan text.
+     *  (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
+    public boolean isLatin() {
        return (type != TIBETAN_NON_PUNCTUATION
                && type != TIBETAN_PUNCTUATION
                && type != TSHEG_BAR_ADORNMENT
                && type != START_PAREN
                && type != END_PAREN
                && type != START_SLASH
-                && type != END_SLASH);
+                && type != END_SLASH
+                && (type != UNICODE_CHARACTER
+                    || !UnicodeUtils.isInTibetanRange(getText().charAt(0))));
    }

    /** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
@ -87,13 +92,15 @@ public class TString {
    public static final int WARNING = 17;
    /** For ACIP %, o, and x or EWTS (DLC FIXME: what are EWTS adornments?) */
    public static final int TSHEG_BAR_ADORNMENT = 18;
+    /** For "\\uMNOP", this TString will contain the string that has
+        just the sole character "\\uMNOP". */
+    public static final int UNICODE_CHARACTER = 19;
    /** For things that are not legal syntax, such as a file that
-     * contains just "[# HALF A COMMEN" */
-    public static final int ERROR = 19;
+     *  contains just "[# HALF A COMMEN".  THIS MUST COME LAST. */
+    public static final int ERROR = 20;

-    /** Returns true if and only if this string is Latin (usually
-     *  English).  Returns false if this string is transliteration of
-     *  Tibetan. */
+    /** Returns the type of this string, which is one of the
+        enumerated integer static final members of this class. */
    public int getType() {
        return type;
    }
@ -126,6 +133,8 @@ public class TString {
        String ftext = (TIBETAN_NON_PUNCTUATION == type)
            ? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
            : text;
+        // FIXME: assert this
+        ThdlDebug.verify(type != UNICODE_CHARACTER || text.length() == 1);
        setText(ftext);
        if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type)
            outputTshegBar(ftext);
@ -182,6 +191,7 @@ public class TString {
        if (type == END_PAREN) typeString = "END_PAREN";
        if (type == WARNING) typeString = "WARNING";
        if (type == TSHEG_BAR_ADORNMENT) typeString = "TSHEG_BAR_ADORNMENT";
+        if (type == UNICODE_CHARACTER) typeString = "UNICODE_CHARACTER";
        if (type == ERROR) typeString = "ERROR";
        return typeString + ":{" + getText() + "}";
    }