ACIP->Unicode, without going through TMW, is now possible, so long as \, the Sanskrit virama, is not used.

Of the 1370-odd ACIP texts I've got here, about 57% make it through the gauntlet (fewer if you demand a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
2003-08-18 02:38:54 +00:00 · 2003-08-18 02:38:54 +00:00 · 1afb3a0fdd
commit 1afb3a0fdd
parent 245aac4911
12 changed files with 646 additions and 40 deletions
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@@ -341,7 +341,7 @@ public final class LegalTshegBar
            EWC_ta,  EWC_tha,  EWC_da,     EWC_na,
            EWC_pa,  EWC_pha,  EWC_ba,     EWC_ma,
            EWC_tsa, EWC_tsha, EWC_dza,    EWC_wa,
-            EWC_zha, EWC_za,   EWC_achung,  EWC_ya,
+            EWC_zha, EWC_za,   EWC_achung, EWC_ya,
            EWC_ra,  EWC_la,   EWC_sha,    EWC_sa,
            EWC_ha,  EWC_a
        });
@@ -833,7 +833,7 @@ public final class LegalTshegBar
                    return internalThrowThing(throwIfIllegal,
                                              errorBuf,
                                              "Illegal suffix -- not one of the ten legal suffixes: "
-                                              + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
+                                              + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0), false));
                }
            }
        }
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -286,8 +286,41 @@ public class UnicodeUtils implements UnicodeConstants {
    }

    /** Returns a human-readable, ASCII form of the Unicode codepoint
-        cp. */
-    public static String unicodeCodepointToString(char cp) {
+        cp. If shortenIfPossible is true, then printable ASCII
+        characters will appear as themselves. */
+    public static String unicodeCodepointToString(char cp,
+                                                  boolean shortenIfPossible) {
+        if (shortenIfPossible) {
+            if ((cp >= 'a' && cp <= 'z')
+                || (cp >= 'A' && cp <= 'Z')
+                || (cp >= '0' && cp <= '9')
+                || cp == '.'
+                || cp == ','
+                || cp == ' '
+                || cp == '\''
+                || cp == '"'
+                || cp == '+'
+                || cp == '-'
+                || cp == '='
+                || cp == '_'
+                || cp == '@'
+                || cp == '!'
+                || cp == '#'
+                || cp == '$'
+                || cp == '%'
+                || cp == '^'
+                || cp == '&'
+                || cp == '*'
+                || cp == '\t'
+                || cp == ':'
+                || cp == '['
+                || cp == ']'
+                || cp == '('
+                || cp == ')'
+                || cp == '{'
+                || cp == '}')
+                return new String(new char[] { cp });
+        }
        if (cp < '\u0010')
            return "\\u000" + Integer.toHexString((int)cp);
        else if (cp < '\u0100')
@@ -304,7 +337,19 @@ public class UnicodeUtils implements UnicodeConstants {
    public static String unicodeStringToString(String s) {
        StringBuffer sb = new StringBuffer(s.length() * 6);
        for (int i = 0; i < s.length(); i++) {
-            sb.append(unicodeCodepointToString(s.charAt(i)));
+            sb.append(unicodeCodepointToString(s.charAt(i), false));
+        }
+        return sb.toString();
+    }
+
+    /**
+     * Returns the most succinct possible, human-readable, ASCII form
+     * of the String s of Unicode codepoints. */
+    public static String unicodeStringToPrettyString(String s) {
+        if (s == null) return "null";
+        StringBuffer sb = new StringBuffer(s.length() * 6);
+        for (int i = 0; i < s.length(); i++) {
+            sb.append(unicodeCodepointToString(s.charAt(i), true));
        }
        return sb.toString();
    }
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java
@@ -321,15 +321,15 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
     * Tests the {@link UnicodeUtils#unicodeCodepointToString(char)}
     * method. */
    public void testUnicodeCodepointToString() {
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000').equals("\\u0000"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001').equals("\\u0001"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F').equals("\\u000f"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F').equals("\\u001f"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF').equals("\\u00ff"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF').equals("\\u01ff"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF').equals("\\u0fff"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF').equals("\\u1fff"));
-        assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF').equals("\\uffff"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000', false).equals("\\u0000"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001', false).equals("\\u0001"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F', false).equals("\\u000f"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F', false).equals("\\u001f"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF', false).equals("\\u00ff"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF', false).equals("\\u01ff"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF', false).equals("\\u0fff"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF', false).equals("\\u1fff"));
+        assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF', false).equals("\\uffff"));
    }

    /**