Now uses terminology from the Unicode standard. No more talk of characters, for example.

Normalization forms NFKD and NFD are supported for the Tibetan Unicode range. I don't like either, actually. I've tested NFKD, but I've not yet committed the tests.
2002-12-15 03:35:24 +00:00 · 2002-12-15 03:35:24 +00:00 · a42347b224
commit a42347b224
parent 3199ff7926
7 changed files with 210 additions and 136 deletions
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -19,15 +19,15 @@ Contributor(s): ______________________________________.
 package org.thdl.tib.text.tshegbar;

 /** <p>This non-instantiable class contains utility routines for
- *  dealing with Tibetan Unicode characters and strings of such
- *  characters.</p>
+ *  dealing with Tibetan Unicode codepoints and strings of such
+ *  codepoints.</p>
 *
 *  @author David Chandler */
-public class UnicodeUtils {
+public class UnicodeUtils implements UnicodeConstants {
    /** Do not use this, as this class is not instantiable. */
    private UnicodeUtils() { super(); }

-    /** Returns true iff x is a Unicode character that represents a
+    /** Returns true iff x is a Unicode codepoint that represents a
        consonant or two-consonant stack that has a Unicode code
        point.  Returns true only for the usual suspects (like
        <code>&#92;u0F40</code>) and for Sanskrit consonants (like
@@ -40,7 +40,7 @@ public class UnicodeUtils {
                && (x >= '\u0F40' && x <= '\u0F6A'));
    }

-    /** Returns true iff x is a Unicode character that represents a
+    /** Returns true iff x is a Unicode codepoint that represents a
        subjoined consonant or subjoined two-consonant stack that has
        a Unicode code point.  Returns true only for the usual
        suspects (like <code>&#92;u0F90</code>) and for Sanskrit
@@ -61,7 +61,7 @@ public class UnicodeUtils {
        '&#92;u0F6A'.  The new consonants (for transcribing Chinese, I
        believe) "&#92;u0F55&#92;u0F39" (which EWTS calls "fa"),
        "&#92;u0F56&#92;u0F39" ("va"), and "&#92;u0F5F&#92;u0F39" ("Dza") are
-        two-character sequences, but you should be aware of them
+        two-codepoint sequences, but you should be aware of them
        also. */
    public static boolean isPreferredFormOfConsonant(char x) {
        return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
@@ -73,16 +73,16 @@ public class UnicodeUtils {
                && (x != '\u0F5C'));
    }

-    /** Returns true iff unicodeChar is a character from the Unicode
+    /** Returns true iff unicodeCP is a codepoint from the Unicode
        range U+0F00-U+0FFF.
        @see #isEntirelyTibetanUnicode(String) */
-    public static boolean isInTibetanRange(char unicodeChar) {
-        return (unicodeChar >= '\u0F00' && unicodeChar <= '\u0FFF');
+    public static boolean isInTibetanRange(char unicodeCP) {
+        return (unicodeCP >= '\u0F00' && unicodeCP <= '\u0FFF');
    }

-    /** Returns true iff unicodeString consists only of characters
+    /** Returns true iff unicodeString consists only of codepoints
        from the Unicode range U+0F00-U+0FFF.  (Note that these
-        characters are typically not enough to represent a Tibetan
+        codepoints are typically not enough to represent a Tibetan
        text, you may need ZWSP (zero-width space) and various
        whitespace from other ranges.) */
    public static boolean isEntirelyTibetanUnicode(String unicodeString) {
@@ -93,21 +93,40 @@ public class UnicodeUtils {
        return true;
    }

-    /** Modifies tibetanUnicode so that it is equivalent, according to
-        the Unicode 3.2 standard, to the input buffer.  The Tibetan
-        passages of the returned string are in THDL-canonical form,
-        however.  This form uses a maximum of characters, in general,
-        and never uses characters whose use has been {@link
-        #isDiscouraged(char) discouraged}.  If the input contains
-        characters for which {@link #isInTibetanRange(char)} is not
-        true, then they will not be modified.
+    /** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
+        Unicode codepoints, into Normalization Form KD (NFKD) as
+        specified by Unicode 3.2.  The Tibetan passages of the
+        returned string are in NFKD, but codepoints outside of the
+        range <code>U+0F00</code>-<code>U+0FFF</code> are not
+        necessarily put into NFKD.  This form uses a maximum of
+        codepoints, and it never uses codepoints whose use has been
+        {@link #isDiscouraged(char) discouraged}.  It would be David
+        Chandler's very favorite form if not for the fact that
+        <code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
+        NFD is thus David Chandler's favorite, though it does not
+        decompose <code>U+0F77</code> and <code>U+0F79</code> (for
+        some reason, hopefully a well-thought-out one).
+
+        <p>Recall that NFKD, as it applies to Tibetan codepoints, is
+        closed under string concatenation and under substringing.
+        Note again that if the input contains codepoints for which
+        {@link #isInTibetanRange(char)} is not true, then they will
+        not be modified.</p>
    
        <p>Note well that only well-formed input guarantees
-        well-formed output.</p> */
-    public static void toCanonicalForm(StringBuffer tibetanUnicode) {
+        well-formed output.</p>
+
+        @param tibetanUnicode the codepoints to be decomposed
+        @param normForm NORM_NFKD or NORM_NFD */
+    public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
+                                                 byte normForm)
+    {
+        if (normForm != NORM_NFD && normForm != NORM_NFKD)
+            throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work");
        int offset = 0;
        while (offset < tibetanUnicode.length()) {
-            String s = toCanonicalForm(tibetanUnicode.charAt(offset));
+            String s
+                = toNormalizedForm(tibetanUnicode.charAt(offset), normForm);
            if (null == s) {
                ++offset;
            } else {
@@ -118,67 +137,88 @@ public class UnicodeUtils {
        }
    }

-    /** Like {@link #toCanonicalForm(StringBuffer)}, but does not
-        modify its input.  Instead, it returns the canonically-formed
-        version of tibetanUnicode. */
-    public static String toCanonicalForm(String tibetanUnicode) {
+    /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
+        but does not modify its input.  Instead, it returns the NFKD-
+        or NFD-normalized version of tibetanUnicode. */
+    public static String toMostlyDecomposedUnicode(String tibetanUnicode,
+                                                   byte normForm)
+    {
        StringBuffer sb = new StringBuffer(tibetanUnicode);
-        toCanonicalForm(sb);
+        toMostlyDecomposedUnicode(sb, normForm);
        return sb.toString();
    }

-    /** There are 19 characters in the Tibetan range of Unicode 3.2
-        which can be decomposed into longer strings of characters in
-        the Tibetan range of Unicode.  These 19 are said not to be in
-        THDL-canonical form.  This routine returns the canonical form
-        for such characters, and returns null for characters that are
-        already canonical or are not in the Tibetan range of Unicode.
-        @param tibetanUnicodeChar the character to canonicalize
-        @return null if tibetanUnicodeChar is canonical, or a string
-        of two or three characters otherwise */
-    public static String toCanonicalForm(char tibetanUnicodeChar) {
-        switch (tibetanUnicodeChar) {
-        case '\u0F43': return new String(new char[] { '\u0F42', '\u0FB7' });
-        case '\u0F4D': return new String(new char[] { '\u0F4C', '\u0FB7' });
-        case '\u0F52': return new String(new char[] { '\u0F51', '\u0FB7' });
-        case '\u0F57': return new String(new char[] { '\u0F56', '\u0FB7' });
-        case '\u0F5C': return new String(new char[] { '\u0F5B', '\u0FB7' });
-        case '\u0F69': return new String(new char[] { '\u0F40', '\u0FB5' });
-        case '\u0F73': return new String(new char[] { '\u0F71', '\u0F72' });
-        case '\u0F75': return new String(new char[] { '\u0F71', '\u0F74' });
-        case '\u0F76': return new String(new char[] { '\u0FB2', '\u0F80' });
-        case '\u0F77': return new String(new char[] { '\u0FB2', '\u0F71', '\u0F80' });
-        case '\u0F78': return new String(new char[] { '\u0FB3', '\u0F80' });
-        case '\u0F79': return new String(new char[] { '\u0FB3', '\u0F71', '\u0F80' });
-        case '\u0F81': return new String(new char[] { '\u0F71', '\u0F80' });
-        case '\u0F93': return new String(new char[] { '\u0F92', '\u0FB7' });
-        case '\u0F9D': return new String(new char[] { '\u0F9C', '\u0FB7' });
-        case '\u0FA2': return new String(new char[] { '\u0FA1', '\u0FB7' });
-        case '\u0FA7': return new String(new char[] { '\u0FA6', '\u0FB7' });
-        case '\u0FAC': return new String(new char[] { '\u0FAB', '\u0FB7' });
-        case '\u0FB9': return new String(new char[] { '\u0F90', '\u0FB5' });
+    /** There are 19 codepoints in the Tibetan range of Unicode 3.2
+        which can be decomposed into longer strings of codepoints in
+        the Tibetan range of Unicode.  Often one wants to manipulate
+        decomposed codepoint strings.  Also, HTML and XML are W3C
+        standards that require certain normalization forms.  This
+        routine returns a chosen normalized form for such codepoints,
+        and returns null for codepoints that are already normalized or
+        are not in the Tibetan range of Unicode.
+        @param tibetanUnicodeCP the codepoint to normalize
+        @param normalizationForm NORM_NFKD or NORM_NFD if you expect
+        something nontrivial to happen
+        @return null if tibetanUnicodeCP is already in the chosen
+        normalized form, or a string of two or three codepoints
+        otherwise */
+    public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) {
+        if (normalizationForm == NORM_NFKD
+            || normalizationForm == NORM_NFD) {
+            // Where not specified, the NFKD form is also the NFD form.
+            switch (tibetanUnicodeCP) {
+            case '\u0F0C': return ((normalizationForm == NORM_NFKD)
+                                   ? "\u0F0B" : null);
+            case '\u0F43': return "\u0F42\u0FB7";
+            case '\u0F4D': return "\u0F4C\u0FB7";
+            case '\u0F52': return "\u0F51\u0FB7";
+            case '\u0F57': return "\u0F56\u0FB7";
+            case '\u0F5C': return "\u0F5B\u0FB7";
+            case '\u0F69': return "\u0F40\u0FB5";
+            case '\u0F73': return "\u0F71\u0F72";
+            case '\u0F75': return "\u0F71\u0F74";
+            case '\u0F76': return "\u0FB2\u0F80";
+            // I do not understand why NFD does not decompose this codepoint:
+            case '\u0F77': return ((normalizationForm == NORM_NFKD)
+                                   ? "\u0FB2\u0F71\u0F80" : null);
+            case '\u0F78': return "\u0FB3\u0F80";
+            // I do not understand why NFD does not decompose this codepoint:
+            case '\u0F79': return ((normalizationForm == NORM_NFKD)
+                                   ? "\u0FB3\u0F71\u0F80" : null);

-        default:
-            return null;
+            case '\u0F81': return "\u0F71\u0F80";
+            case '\u0F93': return "\u0F92\u0FB7";
+            case '\u0F9D': return "\u0F9C\u0FB7";
+            case '\u0FA2': return "\u0FA1\u0FB7";
+            case '\u0FA7': return "\u0FA6\u0FB7";
+            case '\u0FAC': return "\u0FAB\u0FB7";
+            case '\u0FB9': return "\u0F90\u0FB5";
+
+            default:
+                return null;
+            }
        }
+        return null;
    }

-    /** Returns true iff tibetanUnicodeChar {@link
-        #isInTibetanRange(char)} and if the Unicode 3.2 standard
-        discourages the use of tibetanUnicodeChar. */
-    public static boolean isDiscouraged(char tibetanUnicodeChar) {
-        return ('\u0F73' == tibetanUnicodeChar
-                || '\u0F75' == tibetanUnicodeChar
-                || '\u0F77' == tibetanUnicodeChar
-                || '\u0F81' == tibetanUnicodeChar);
+    /** Returns true iff tibetanUnicodeCP {@link
+        #isInTibetanRange(char) is a Tibetan codepoint} and if the
+        Unicode 3.2 standard discourages the use of
+        tibetanUnicodeCP. */
+    public static boolean isDiscouraged(char tibetanUnicodeCP) {
+        return ('\u0F73' == tibetanUnicodeCP
+                || '\u0F75' == tibetanUnicodeCP
+                || '\u0F77' == tibetanUnicodeCP
+                || '\u0F79' == tibetanUnicodeCP
+                || '\u0F81' == tibetanUnicodeCP);
        /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */
    }

    /** Returns true iff ch corresponds to the Tibetan letter ra.
-        Several Unicode characters correspond to the Tibetan letter ra
+        Several Unicode codepoints correspond to the Tibetan letter ra
        (in its subscribed form or otherwise).  Oftentimes,
        <code>&#92;u0F62</code> is thought of as the nominal
-        representation.  Returns false for some characters that
+        representation.  Returns false for some codepoints that
        contain ra but are not merely ra, such as <code>&#92;u0F77</code> */
    public static boolean isRa(char ch) {
        return ('\u0F62' == ch
@@ -188,7 +228,7 @@ public class UnicodeUtils {
    }

    /** Returns true iff ch corresponds to the Tibetan letter wa.
-        Several Unicode characters correspond to the Tibetan letter
+        Several Unicode codepoints correspond to the Tibetan letter
        wa.  Oftentimes, <code>&#92;u0F5D</code> is thought of as the
        nominal representation. */
    public static boolean isWa(char ch) {
@@ -198,7 +238,7 @@ public class UnicodeUtils {
    }

    /** Returns true iff ch corresponds to the Tibetan letter ya.
-        Several Unicode characters correspond to the Tibetan letter
+        Several Unicode codepoints correspond to the Tibetan letter
        ya.  Oftentimes, <code>&#92;u0F61</code> is thought of as the
        nominal representation. */
    public static boolean isYa(char ch) {
@@ -207,14 +247,14 @@ public class UnicodeUtils {
                || '\u0FBB' == ch);
    }

-    /** Returns true iff there exists at least one character ch in
-        unicodeString such that ch {@link #isRa(char) is ra} or contains
+    /** Returns true iff there exists at least one codepoint cp in
+        unicodeString such that cp {@link #isRa(char) is ra} or contains
        ra (like <code>&#92;u0F77</code>).  This method is not implemented
        as fast as it could be.  It calls on the canonicalization code
        in order to maximize reuse and minimize the possibility of
        coder error. */
    public static boolean containsRa(String unicodeString) {
-        String canonForm = toCanonicalForm(unicodeString);
+        String canonForm = toMostlyDecomposedUnicode(unicodeString, NORM_NFKD);
        for (int i = 0; i < canonForm.length(); i++) {
            if (isRa(canonForm.charAt(i)))
                return true;
@@ -223,11 +263,13 @@ public class UnicodeUtils {
    }
    /** Inefficient shortcut.
        @see #containsRa(String) */
-    public static boolean containsRa(char unicodeChar) {
-        return containsRa(new String(new char[] { unicodeChar }));
+    public static boolean containsRa(char unicodeCP) {
+        return containsRa(new String(new char[] { unicodeCP }));
    }

-    public static String unicodeCharToString(char ch) {
+    /** Returns a human-readable, ASCII form of the Unicode codepoint
+        ch. */
+    public static String unicodeCPToString(char ch) {
        return "U+" + Integer.toHexString((int)ch);
    }
 }