The ACIP->TMW and ACIP->Unicode converters now accept Unicode escapes such as K\u0F84. This mitigates the lack of support for ACIP's backslash, '\\' (the Sanskrit virama), because you can rewrite ACIP {K\} as ACIP {K\u0F84}.

This change does not add support for U+F021-U+F0FF, the Private Use Area (PUA) range that the latest EWTS uses.
2003-11-29 22:56:18 +00:00 · 2003-11-29 22:56:18 +00:00 · dfaae4be93
commit dfaae4be93
parent 946d8cbc72
6 changed files with 845 additions and 16 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@ -23,6 +23,7 @@ import java.util.ArrayList;
 import java.util.Stack;

 import org.thdl.util.ThdlDebug;
+import org.thdl.util.ThdlOptions;

 /**
 * This class is able to break up Strings of ACIP text (for example, an
@ -903,11 +904,31 @@ public class ACIPTshegBarScanner {
                            errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
                                          + "Found an illegal, unprintable character.\n");
                    } else if ('\\' == ch) {
-                        al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.",
-                                           TString.ERROR));
-                        if (null != errors)
-                            errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
-                                          + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.\n");
+                        int x = -1;
+                        if (!ThdlOptions.getBooleanOption("thdl.tib.text.disallow.unicode.character.escapes.in.acip")
+                            && i + 5 < sl && 'u' == s.charAt(i+1)) {
+                            try {
+                                if (!((x = Integer.parseInt(s.substring(i+2, i+6), 16)) >= 0x0000 && x <= 0xFFFF))
+                                    x = -1;
+                            } catch (NumberFormatException e) {
+                                // Though this is unlikely to be
+                                // legal, we allow it through.
+                                // (FIXME: warn.)
+                            }
+                        }
+                        if (x >= 0) {
+                            al.add(new TString(new String(new char[] { (char)x }),
+                                               TString.UNICODE_CHARACTER));
+                            i += "uXXXX".length();
+                            startOfString = i+1;
+                            break;
+                        } else {
+                            al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.",
+                                               TString.ERROR));
+                            if (null != errors)
+                                errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
+                                              + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly.  Sorry!  Please do complain to the maintainers.\n");
+                        }
                    } else {
                        al.add(new TString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
                                           TString.ERROR));