A reverter that converts Unicode to computer-friendly (but not yet
human-friendly) EWTS is here in alpha mode. It probably doesn't deal well with non-Tibetan text.
2005-08-01 05:54:20 +00:00 · 2005-08-01 05:54:20 +00:00 · 5788416629
commit 5788416629
parent 00afd75362
13 changed files with 496 additions and 47 deletions
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -102,7 +102,10 @@ public class UnicodeUtils implements UnicodeConstants {
        nor NFKD breaks down <code>U+0F00</code> into its constituent
        codepoints.  NFTHDL uses a maximum of codepoints, and it never
        uses codepoints whose use has been {@link #isDiscouraged(char)
-        discouraged}.
+        discouraged}.  NFTHDL also does not screw things up by using
+        the standard-but-wrong CCCVs.  It sorts stretches of combining
+        characters wisely as per
+        {@link http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml}.

        <p>The Tibetan passages of the returned string are in the
        chosen normalized form, but codepoints outside of the {@link
@@ -136,6 +139,9 @@ public class UnicodeUtils implements UnicodeConstants {
                tibetanUnicode.insert(offset, s);
            }
        }
+        if (normForm == NORM_NFTHDL) {
+            fixSomeOrderingErrorsInTibetanUnicode(tibetanUnicode);
+        }
    }

    /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
@@ -418,7 +424,39 @@ public class UnicodeUtils implements UnicodeConstants {
     *  product.)
     */
    private static char unicode_pairs[][]
-        = { { '\u0f71', '\u0f74' },
+        = {
+            /* TODO(dchandler): use regex
+             * "[\u0f39\u0f71-\u0f84\u0f86\u0f87]{2,}" to find patches
+             * that need sorting and then sort each of those.  This
+             * cross product is ugly. */
+            
+            { '\u0f39', '\u0f71' },
+            { '\u0f39', '\u0f72' },
+            { '\u0f39', '\u0f74' },
+            { '\u0f39', '\u0f7a' },
+            { '\u0f39', '\u0f7b' },
+            { '\u0f39', '\u0f7c' },
+            { '\u0f39', '\u0f7d' },
+            { '\u0f39', '\u0f7e' },
+            { '\u0f39', '\u0f7f' },
+            { '\u0f39', '\u0f80' },
+            { '\u0f39', '\u0f82' },
+            { '\u0f39', '\u0f83' },
+
+            { '\u0f71', '\u0f7f' },
+            { '\u0f72', '\u0f7f' },
+            { '\u0f74', '\u0f7f' },
+            { '\u0f7a', '\u0f7f' },
+            { '\u0f7b', '\u0f7f' },
+            { '\u0f7c', '\u0f7f' },
+            { '\u0f7d', '\u0f7f' },
+            // but not { '\u0f7e', '\u0f7f' },
+            { '\u0f39', '\u0f7f' },
+            { '\u0f80', '\u0f7f' },
+            { '\u0f82', '\u0f7f' },
+            { '\u0f83', '\u0f7f' },
+
+            { '\u0f71', '\u0f74' },

            { '\u0f71', '\u0f72' },
            { '\u0f71', '\u0f7a' },
@@ -489,7 +527,9 @@ public class UnicodeUtils implements UnicodeConstants {
     *  the same file modulo Unicode booboos would be better.  </p>
     *
     *  @param sb the buffer to be mutated
-     *  @return true if sb was mutated */
+     *  @return true if sb was mutated
+     *  @see <a href="http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml">Tibetan Encoding Model</a>
+     */
    public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
        boolean mutated = false;
        int len = sb.length();
@@ -512,25 +552,5 @@ public class UnicodeUtils implements UnicodeConstants {
        } while (mutated_this_time_through);
        return mutated;
    }
-
-    /** Returns true iff ch is a valid Tibetan codepoint in Unicode
-     *  4.0: */
-    public boolean isTibetanUnicodeCodepoint(char ch) {
-        // NOTE: could use an array of 256 booleans for speed but I'm lazy
-        return ((ch >= '\u0f00' && ch <= '\u0fcf')
-                && !(ch == '\u0f48'
-                     || (ch > '\u0f6a' && ch < '\u0f71')
-                     || (ch > '\u0f8b' && ch < '\u0f90')
-                     || ch == '\u0f98'
-                     || ch == '\u0fbd'
-                     || ch == '\u0fcd'
-                     || ch == '\u0fce'));
-    }
-
-    /** Returns true iff ch is in 0F00-0FFF but isn't a valid Tibetan
-     *  codepoint in Unicode 4.0: */
-    public boolean isInvalidTibetanUnicode(char ch) {
-        return (isInTibetanRange(ch) && !isTibetanUnicodeCodepoint(ch));
-    }
 }