Extended Wylie is referred to as THDL Extended Wylie or THDL Wylie

because a Japanese scholar also has an "Extended Wylie". NFKD and NFD have a new brother, NFTHDL. I wish there weren't a need, but as my yet-to-be-put-into-CVS break-unicode-into-grapheme-clusters code demonstrates, the-need-is-there. Forgive-me for the hyphens; it's late.
2002-12-15 06:57:32 +00:00 · 2002-12-15 06:57:32 +00:00 · 8e8a23c6a6
commit 8e8a23c6a6
parent a42347b224
4 changed files with 83 additions and 63 deletions
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@ -346,7 +346,7 @@ public class LegalTshegBar
        });

    /** Returns a two-codepoint string consisting of the Unicode
-     *  representation of what Extended Wylie calls
+     *  representation of what THDL Extended Wylie calls
     *  <code>'i</code>. */
    public static String getConnectiveCaseSuffix() {
        return connectiveCaseSuffix;
@ -382,8 +382,8 @@ public class LegalTshegBar


    /** Returns an array of Unicode strings, all the legal suffix
-        particles.  In Extended Wylie, these are: <ul> <li>'i</li>
-        <li>'o</li> <li>'u</li> <li>'am</li> </ul>
+        particles.  In THDL Extended Wylie, these are: <ul>
+        <li>'i</li> <li>'o</li> <li>'u</li> <li>'am</li> </ul>
    
        <p>This is not very efficient.</p> */
    public static String[] getPossibleSuffixParticles() {
@ -823,9 +823,9 @@ public class LegalTshegBar
      isTransliteratedSanskrit(), boolean isTransliteratedChinese()
      (design: contains fa or va, maybe?). */

-    /** Returns a StringBuffer that holds the extended wylie
+    /** Returns a StringBuffer that holds the THDL extended wylie
     *  representation of this syllable. */
-    public StringBuffer getExtendedWylie() {
+    public StringBuffer getThdlWylie() {
        StringBuffer sb = new StringBuffer();
        char rootLetter = getRootLetter();
        if (hasPrefix()) {
@ -837,7 +837,7 @@ public class LegalTshegBar

            boolean disambiguatorNeeded = false;
            char prefix = getPrefix();
-            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix));
+            sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
            if (!hasHeadLetter()) {
                if (EWC_ya == rootLetter) {
                    if (isConsonantThatTakesYaBtags(prefix))
@ -857,67 +857,67 @@ public class LegalTshegBar
                sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
        }
        if (hasHeadLetter())
-            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter()));
-        sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter));
+            sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
+        sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
        if (hasSubjoinedLetter())
-            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter()));
+            sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
        if (hasWaZurSubjoinedToRootLetter())
-            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur));
+            sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));

-        // a-chung is treated, in Extended Wylie, like a vowel.  I.e.,
-        // you don't have 'pAa', you have 'pA'.
+        // a-chung is treated, in THDL Extended Wylie, like a vowel.
+        // I.e., you don't have 'pAa', you have 'pA'.
        if (hasAChungOnRootLetter()) {
            if (hasExplicitVowel()) {
                if (EWV_i == getVowel()) {
-                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73'));
+                    sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
                } else if (EWV_u == getVowel()) {
-                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75'));
+                    sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
                } else if (EWV_e == getVowel() || EWV_o == getVowel()) {
                    // The exception to the rule for a-chung and vowels...

                    // DLC FIXME: are these allowed in legal Tibetan?
                    // EWTS would have special cases for them if so,
                    // I'd wager...
-                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
-                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
+                    sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
+                    sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
                } else {
                    ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
                }
            } else {
-                sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
+                sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
            }
        } else {
            if (hasExplicitVowel())
-                sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
+                sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
            else
                sb.append("a");
        }

        if (hasSuffix()) {
            String suf = getSuffix();
-            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0)));
+            sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
            if (suf.length() > 1) {
                // DLC assert, don't verify, that the length is two.
                // This could change if I learn of more suffix
                // particles.
                ThdlDebug.verify(2 == suf.length());
-                sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1)));
+                sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
            }
        }
        if (hasPostsuffix())
-            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix()));
+            sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
        return sb;
    }


-    // DLC: toXML for the dense XML
    /** Returns a <legalTibetanSyllable> element that contains only
-     *  the Extended Wylie transliteration for the whole syllable and a note that the . */
+     *  the THDL Extended Wylie transliteration for the whole syllable
+     *  and a note about the transliteration. */
    public String toConciseXML() {
        // DLC version-control the EWTS document. 0.5 is used below:
        return ("<legalTibetanSyllable "
                + "transliterationType=\"THDL Extended Wylie 0.5\" "
-                + "transliteration=\"" + getExtendedWylie() + "\"" + "/>");
+                + "transliteration=\"" + getThdlWylie() + "\"" + "/>");
    }

    /** Returns a <legalTibetanSyllable> element that contains the
@ -929,18 +929,18 @@ public class LegalTshegBar
                + "transliterationType=\"THDL Extended Wylie 0.5\" "
                + (hasPrefix()
                   ? ("prefix=\""
-                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPrefix()) + "\" ")
+                      + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
                   : "")
                + (hasHeadLetter()
                   ? ("headLetter=\""
-                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())
+                      + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
                      + "\" ")
                   : "")
                + ("rootLetter=\""
-                   + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getRootLetter()) + "\" ")
+                   + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
                + (hasSubjoinedLetter()
                   ? ("subjoinedLetter=\""
-                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())
+                      + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
                      + "\" ")
                   : "")
                + (hasWaZurSubjoinedToRootLetter()
@ -953,17 +953,17 @@ public class LegalTshegBar
                // DLC NOW: what about the root letter a, i.e. &#92;u0F68 ?  do we want the EWTS to be 'aa' ?
                + ("vowel=\""
                   + (hasExplicitVowel()
-                      ? UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())
+                      ? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
                      : "a")
                   + "\" ")
                + (hasSuffix()
                   ? ("suffix=\""
-                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeString(getSuffix())
+                      + UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
                      + "\" ")
                   : "")
                + (hasPostsuffix()
                   ? ("postsuffix=\""
-                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())
+                      + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
                      + "\" ")
                   : "")
                + "/>");
--- a/source/org/thdl/tib/text/tshegbar/TshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/TshegBar.java
@ -58,7 +58,7 @@ package org.thdl.tib.text.tshegbar;
 *  <p> This class allows for invalid tsheg bars, like those
 *  containing more than one prefix, more than two suffixes, an
 *  invalid postsuffix (secondary suffix), more than one consonant
- *  stack (excluding the special case of what we call in Extended
+ *  stack (excluding the special case of what we call in THDL Extended
 *  Wylie "'i", which is technically a consonant stack but is used in
 *  Tibetan like a suffix).</p>.
 *
--- a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java
@ -40,6 +40,12 @@ public interface UnicodeConstants {
    static final byte NORM_NFD = 3;
    /** Refers to Normalization Form KD: */
    static final byte NORM_NFKD = 4;
+    /** Refers to Normalization Form THDL, which is NFD except for
+        <code>U+0F77</code> and <code>U+0F79</code>, which are
+        normalized according to NFKD.  This is the One True
+        Normalization Form, as it leaves no precomposed codepoints and
+        does not normalize <code>U+0F0C</code>. */
+    static final byte NORM_NFTHDL = 5;


    /** for those times when you need a char to represent a
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@ -94,35 +94,34 @@ public class UnicodeUtils implements UnicodeConstants {
    }

    /** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
-        Unicode codepoints, into Normalization Form KD (NFKD) as
-        specified by Unicode 3.2.  The Tibetan passages of the
-        returned string are in NFKD, but codepoints outside of the
-        range <code>U+0F00</code>-<code>U+0FFF</code> are not
-        necessarily put into NFKD.  This form uses a maximum of
+        Unicode codepoints, into either Normalization Form KD (NFKD),
+        D (NFD), or THDL (NFTHDL), depending on the value of normForm.
+        NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
+        for {@link org.thdl.tib.text.tshegbar.GraphemeCluster} because
+        NFKD normalizes <code>U+0F0C</code>.  NFTHDL uses the maximum number of
        codepoints, and it never uses codepoints whose use has been
-        {@link #isDiscouraged(char) discouraged}.  It would be David
-        Chandler's very favorite form if not for the fact that
-        <code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
-        NFD is thus David Chandler's favorite, though it does not
-        decompose <code>U+0F77</code> and <code>U+0F79</code> (for
-        some reason, hopefully a well-thought-out one).
+        {@link #isDiscouraged(char) discouraged}.

-        <p>Recall that NFKD, as it applies to Tibetan codepoints, is
-        closed under string concatenation and under substringing.
-        Note again that if the input contains codepoints for which
-        {@link #isInTibetanRange(char)} is not true, then they will
-        not be modified.</p>
+        <p>The Tibetan passages of the returned string are in the
+        chosen normalized form, but codepoints outside of the {@link
+        #isInTibetanRange(char) range}
+        <code>U+0F00</code>-<code>U+0FFF</code> are not necessarily
+        put into normalized form.</p>
+
+        <p>Recall that normalized forms are not necessarily closed
+        under string concatenation, but are closed under
+        substringing.</p>
    
        <p>Note well that only well-formed input guarantees
        well-formed output.</p>

        @param tibetanUnicode the codepoints to be decomposed
-        @param normForm NORM_NFKD or NORM_NFD */
+        @param normForm NORM_NFKD, NORM_NFTHDL, or NORM_NFD */
    public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
                                                 byte normForm)
    {
-        if (normForm != NORM_NFD && normForm != NORM_NFKD)
-            throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work");
+        if (normForm != NORM_NFD && normForm != NORM_NFKD && normForm != NORM_NFTHDL)
+            throw new IllegalArgumentException("normForm must be NORM_NFD, NORM_NFTHDL, or NORM_NFKD for decomposition to work");
        int offset = 0;
        while (offset < tibetanUnicode.length()) {
            String s
@ -157,15 +156,19 @@ public class UnicodeUtils implements UnicodeConstants {
        and returns null for codepoints that are already normalized or
        are not in the Tibetan range of Unicode.
        @param tibetanUnicodeCP the codepoint to normalize
-        @param normalizationForm NORM_NFKD or NORM_NFD if you expect
-        something nontrivial to happen
+        @param normalizationForm NORM_NFTHDL, NORM_NFKD, or NORM_NFD
+        if you expect something nontrivial to happen
        @return null if tibetanUnicodeCP is already in the chosen
        normalized form, or a string of two or three codepoints
        otherwise */
-    public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) {
+    public static String toNormalizedForm(char tibetanUnicodeCP,
+                                          byte normalizationForm)
+    {
        if (normalizationForm == NORM_NFKD
-            || normalizationForm == NORM_NFD) {
-            // Where not specified, the NFKD form is also the NFD form.
+            || normalizationForm == NORM_NFD
+            || normalizationForm == NORM_NFTHDL) {
+            // Where not specified, the NFKD and NFTHDL forms are
+            // identical to the NFD form.
            switch (tibetanUnicodeCP) {
            case '\u0F0C': return ((normalizationForm == NORM_NFKD)
                                   ? "\u0F0B" : null);
@ -178,14 +181,25 @@ public class UnicodeUtils implements UnicodeConstants {
            case '\u0F73': return "\u0F71\u0F72";
            case '\u0F75': return "\u0F71\u0F74";
            case '\u0F76': return "\u0FB2\u0F80";
-            // I do not understand why NFD does not decompose this codepoint:
-            case '\u0F77': return ((normalizationForm == NORM_NFKD)
-                                   ? "\u0FB2\u0F71\u0F80" : null);
+            case '\u0F77': {
+                // I do not understand why NFD does not decompose this
+                // codepoint, hence NORM_NFTHDL does:
+                if (normalizationForm == NORM_NFKD
+                    || normalizationForm == NORM_NFTHDL)
+                    return "\u0FB2\u0F71\u0F80";
+                else
+                    return null;
+            }
            case '\u0F78': return "\u0FB3\u0F80";
-            // I do not understand why NFD does not decompose this codepoint:
-            case '\u0F79': return ((normalizationForm == NORM_NFKD)
-                                   ? "\u0FB3\u0F71\u0F80" : null);
-
+            case '\u0F79': {
+                // I do not understand why NFD does not decompose this
+                // codepoint, hence NORM_NFTHDL does:
+                if (normalizationForm == NORM_NFKD
+                    || normalizationForm == NORM_NFTHDL)
+                    return "\u0FB3\u0F71\u0F80";
+                else
+                    return null;
+            }
            case '\u0F81': return "\u0F71\u0F80";
            case '\u0F93': return "\u0F92\u0FB7";
            case '\u0F9D': return "\u0F9C\u0FB7";