More and better tests; fixed some bugs in LegalTshegBar.

2003-03-28 03:49:49 +00:00 · 2003-03-28 03:49:49 +00:00 · 2b81020b0e
commit 2b81020b0e
parent 35a9869aac
4 changed files with 317 additions and 68 deletions
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -30,11 +30,11 @@ public class UnicodeUtils implements UnicodeConstants {
    /** Returns true iff x is a Unicode codepoint that represents a
        consonant or two-consonant stack that has a Unicode code
        point.  Returns true only for the usual suspects (like
-        <code>&#92;u0F40</code>) and for Sanskrit consonants (like
-        <code>&#92;u0F71</code>) and the simple two-consonant stacks in
-        Unicode (like <code>&#92;u0F43</code>).  Returns false for, among
+        <code>U+0F40</code>) and for Sanskrit consonants (like
+        <code>U+0F4A</code>) and the simple two-consonant stacks in
+        Unicode (like <code>U+0F43</code>).  Returns false for, among
        other things, subjoined consonants like
-        <code>&#92;u0F90</code>. */
+        <code>U+0F90</code>. */
    public static boolean isNonSubjoinedConsonant(char x) {
        return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
                && (x >= '\u0F40' && x <= '\u0F6A'));
@@ -43,11 +43,11 @@ public class UnicodeUtils implements UnicodeConstants {
    /** Returns true iff x is a Unicode codepoint that represents a
        subjoined consonant or subjoined two-consonant stack that has
        a Unicode code point.  Returns true only for the usual
-        suspects (like <code>&#92;u0F90</code>) and for Sanskrit
-        consonants (like <code>&#92;u0F9C</code>) and the simple
-        two-consonant stacks in Unicode (like <code>&#92;u0FAC</code>).
+        suspects (like <code>U+0F90</code>) and for Sanskrit
+        consonants (like <code>U+0F9C</code>) and the simple
+        two-consonant stacks in Unicode (like <code>U+0FAC</code>).
        Returns false for, among other things, non-subjoined
-        consonants like <code>&#92;u0F40</code>. */
+        consonants like <code>U+0F40</code>. */
    public static boolean isSubjoinedConsonant(char x) {
        return ((x != '\u0F98' /* reserved in Unicode 3.2, but not in use */)
                && (x >= '\u0F90' && x <= '\u0FBC'));
@@ -56,13 +56,13 @@ public class UnicodeUtils implements UnicodeConstants {
    /** Returns true iff x is the preferred representation of a
        Tibetan or Sanskrit consonant and cannot be broken down any
        further.  Returns false for, among other things, subjoined
-        consonants like <code>&#92;u0F90</code>, two-component consonants
-        like <code>&#92;u0F43</code>, and fixed-form consonants like
-        '&#92;u0F6A'.  The new consonants (for transcribing Chinese, I
-        believe) "&#92;u0F55&#92;u0F39" (which EWTS calls "fa"),
-        "&#92;u0F56&#92;u0F39" ("va"), and "&#92;u0F5F&#92;u0F39" ("Dza") are
-        two-codepoint sequences, but you should be aware of them
-        also. */
+        consonants like <code>U+0F90</code>, two-component consonants
+        like <code>U+0F43</code>, and fixed-form consonants like
+        <code>U+0F6A</code>.  The new consonants (for transcribing
+        Chinese, I believe) "&#92;u0F55&#92;u0F39" (which EWTS calls
+        "fa"), "&#92;u0F56&#92;u0F39" ("va"), and
+        "&#92;u0F5F&#92;u0F39" ("Dza") are two-codepoint sequences,
+        but you should be aware of them also. */
    public static boolean isPreferredFormOfConsonant(char x) {
        return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
                && (x >= '\u0F40' && x <= '\u0F68')
@@ -97,7 +97,7 @@ public class UnicodeUtils implements UnicodeConstants {
        Unicode codepoints, into either Normalization Form KD (NFKD),
        D (NFD), or THDL (NFTHDL), depending on the value of normForm.
        NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
-        for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster}
+        for {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster}
        because NFKD normalizes <code>U+0F0C</code> and neither NFD
        nor NFKD breaks down <code>U+0F00</code> into its constituent
        codepoints.  NFTHDL uses a maximum of codepoints, and it never
@@ -247,7 +247,7 @@ public class UnicodeUtils implements UnicodeConstants {

    /** Returns true iff ch corresponds to the Tibetan letter wa.
        Several Unicode codepoints correspond to the Tibetan letter
-        wa.  Oftentimes, <code>&#92;u0F5D</code> is thought of as the
+        wa.  Oftentimes, <code>U+0F5D</code> is thought of as the
        nominal representation. */
    public static boolean isWa(char ch) {
        return ('\u0F5D' == ch
@@ -257,7 +257,7 @@ public class UnicodeUtils implements UnicodeConstants {

    /** Returns true iff ch corresponds to the Tibetan letter ya.
        Several Unicode codepoints correspond to the Tibetan letter
-        ya.  Oftentimes, <code>&#92;u0F61</code> is thought of as the
+        ya.  Oftentimes, <code>U+0F61</code> is thought of as the
        nominal representation. */
    public static boolean isYa(char ch) {
        return ('\u0F61' == ch
@@ -267,7 +267,7 @@ public class UnicodeUtils implements UnicodeConstants {

    /** Returns true iff there exists at least one codepoint cp in
        unicodeString such that cp {@link #isRa(char) is ra} or contains
-        ra (like <code>&#92;u0F77</code>).  This method is not implemented
+        ra (like <code>U+0F77</code>).  This method is not implemented
        as fast as it could be.  It calls on the canonicalization code
        in order to maximize reuse and minimize the possibility of
        coder error. */
@@ -298,6 +298,9 @@ public class UnicodeUtils implements UnicodeConstants {
            return "\\u" + Integer.toHexString((int)cp);
    }

+    /**
+     * Returns a human-readable, ASCII form of the String s of Unicode
+     * codepoints. */
    public static String unicodeStringToString(String s) {
        StringBuffer sb = new StringBuffer(s.length() * 6);
        for (int i = 0; i < s.length(); i++) {