Renamed UnicodeCharToExtendedWylie to
UnicodeCodepointToThdlWylie.java. Added a new class, UnicodeGraphemeCluster, that can tell you the components of a grapheme cluster from top to bottom. It does not yet have good error checking; it is not yet finished. Next is to parse clean Unicode into GraphemeClusters. After that comes scanning dirty Unicode into best-guess GraphemeClusters, and scanning dirty Unicode to get nice error messages.
This commit is contained in:
parent
8e8a23c6a6
commit
7ea185fa01
4 changed files with 481 additions and 69 deletions
|
@ -748,7 +748,7 @@ public class LegalTshegBar
|
||||||
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
|
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
|
||||||
return internalThrowThing(throwIfIllegal,
|
return internalThrowThing(throwIfIllegal,
|
||||||
"Illegal suffix -- not one of the ten legal suffixes: "
|
"Illegal suffix -- not one of the ten legal suffixes: "
|
||||||
+ UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
|
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -837,7 +837,7 @@ public class LegalTshegBar
|
||||||
|
|
||||||
boolean disambiguatorNeeded = false;
|
boolean disambiguatorNeeded = false;
|
||||||
char prefix = getPrefix();
|
char prefix = getPrefix();
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
|
||||||
if (!hasHeadLetter()) {
|
if (!hasHeadLetter()) {
|
||||||
if (EWC_ya == rootLetter) {
|
if (EWC_ya == rootLetter) {
|
||||||
if (isConsonantThatTakesYaBtags(prefix))
|
if (isConsonantThatTakesYaBtags(prefix))
|
||||||
|
@ -857,55 +857,55 @@ public class LegalTshegBar
|
||||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||||
}
|
}
|
||||||
if (hasHeadLetter())
|
if (hasHeadLetter())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(rootLetter));
|
||||||
if (hasSubjoinedLetter())
|
if (hasSubjoinedLetter())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter()));
|
||||||
if (hasWaZurSubjoinedToRootLetter())
|
if (hasWaZurSubjoinedToRootLetter())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EWSUB_wa_zur));
|
||||||
|
|
||||||
// a-chung is treated, in THDL Extended Wylie, like a vowel.
|
// a-chung is treated, in THDL Extended Wylie, like a vowel.
|
||||||
// I.e., you don't have 'pAa', you have 'pA'.
|
// I.e., you don't have 'pAa', you have 'pA'.
|
||||||
if (hasAChungOnRootLetter()) {
|
if (hasAChungOnRootLetter()) {
|
||||||
if (hasExplicitVowel()) {
|
if (hasExplicitVowel()) {
|
||||||
if (EWV_i == getVowel()) {
|
if (EWV_i == getVowel()) {
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F73'));
|
||||||
} else if (EWV_u == getVowel()) {
|
} else if (EWV_u == getVowel()) {
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F75'));
|
||||||
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
|
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
|
||||||
// The exception to the rule for a-chung and vowels...
|
// The exception to the rule for a-chung and vowels...
|
||||||
|
|
||||||
// DLC FIXME: are these allowed in legal Tibetan?
|
// DLC FIXME: are these allowed in legal Tibetan?
|
||||||
// EWTS would have special cases for them if so,
|
// EWTS would have special cases for them if so,
|
||||||
// I'd wager...
|
// I'd wager...
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
|
||||||
} else {
|
} else {
|
||||||
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
|
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (hasExplicitVowel())
|
if (hasExplicitVowel())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
|
||||||
else
|
else
|
||||||
sb.append("a");
|
sb.append("a");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasSuffix()) {
|
if (hasSuffix()) {
|
||||||
String suf = getSuffix();
|
String suf = getSuffix();
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
|
||||||
if (suf.length() > 1) {
|
if (suf.length() > 1) {
|
||||||
// DLC assert, don't verify, that the length is two.
|
// DLC assert, don't verify, that the length is two.
|
||||||
// This could change if I learn of more suffix
|
// This could change if I learn of more suffix
|
||||||
// particles.
|
// particles.
|
||||||
ThdlDebug.verify(2 == suf.length());
|
ThdlDebug.verify(2 == suf.length());
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (hasPostsuffix())
|
if (hasPostsuffix())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
|
||||||
return sb;
|
return sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -929,18 +929,18 @@ public class LegalTshegBar
|
||||||
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
|
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
|
||||||
+ (hasPrefix()
|
+ (hasPrefix()
|
||||||
? ("prefix=\""
|
? ("prefix=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPrefix()) + "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ (hasHeadLetter()
|
+ (hasHeadLetter()
|
||||||
? ("headLetter=\""
|
? ("headLetter=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ ("rootLetter=\""
|
+ ("rootLetter=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getRootLetter()) + "\" ")
|
||||||
+ (hasSubjoinedLetter()
|
+ (hasSubjoinedLetter()
|
||||||
? ("subjoinedLetter=\""
|
? ("subjoinedLetter=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ (hasWaZurSubjoinedToRootLetter()
|
+ (hasWaZurSubjoinedToRootLetter()
|
||||||
|
@ -953,17 +953,17 @@ public class LegalTshegBar
|
||||||
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
||||||
+ ("vowel=\""
|
+ ("vowel=\""
|
||||||
+ (hasExplicitVowel()
|
+ (hasExplicitVowel()
|
||||||
? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
|
? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
|
||||||
: "a")
|
: "a")
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
+ (hasSuffix()
|
+ (hasSuffix()
|
||||||
? ("suffix=\""
|
? ("suffix=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ (hasPostsuffix()
|
+ (hasPostsuffix()
|
||||||
? ("postsuffix=\""
|
? ("postsuffix=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ "/>");
|
+ "/>");
|
||||||
|
|
|
@ -21,7 +21,7 @@ package org.thdl.tib.text.tshegbar;
|
||||||
import org.thdl.tib.text.TibetanMachineWeb;
|
import org.thdl.tib.text.TibetanMachineWeb;
|
||||||
|
|
||||||
/** This noninstantiable class allows for converting from Unicode
|
/** This noninstantiable class allows for converting from Unicode
|
||||||
* codepoints to Extended Wylie. It cannot be used for long
|
* codepoints to THDL Extended Wylie. It cannot be used for long
|
||||||
* stretches of text, though, as it is unaware of context, which is
|
* stretches of text, though, as it is unaware of context, which is
|
||||||
* essential to understanding a non-trivial string of Tibetan
|
* essential to understanding a non-trivial string of Tibetan
|
||||||
* Unicode.
|
* Unicode.
|
||||||
|
@ -29,21 +29,22 @@ import org.thdl.tib.text.TibetanMachineWeb;
|
||||||
* <p>See the document by Nathaniel Garson and David Germano entitled
|
* <p>See the document by Nathaniel Garson and David Germano entitled
|
||||||
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
|
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
|
||||||
* a couple of issues with the November 18, 2001 revision of that
|
* a couple of issues with the November 18, 2001 revision of that
|
||||||
* document; these issues are in the Bugs tracker at our SourceForge site.</p>
|
* document; these issues are in the Bugs tracker at our SourceForge
|
||||||
|
* site.</p>
|
||||||
*
|
*
|
||||||
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
|
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
|
||||||
*
|
*
|
||||||
* @author David Chandler */
|
* @author David Chandler */
|
||||||
public class UnicodeCharToExtendedWylie {
|
public class UnicodeCodepointToThdlWylie {
|
||||||
|
|
||||||
/** Returns the extended Wylie for the very simple sequence x.
|
/** Returns the THDL extended Wylie for the very simple sequence
|
||||||
* Returns null iff some (Unicode) char in s has no extended
|
* x. Returns null iff some (Unicode) char in s has no THDL
|
||||||
* Wylie representation. This is unaware of context, so use it
|
* extended Wylie representation. This is unaware of context, so
|
||||||
* sparingly. */
|
* use it sparingly. */
|
||||||
public static StringBuffer getExtendedWylieForUnicodeString(String x) {
|
public static StringBuffer getThdlWylieForUnicodeString(String x) {
|
||||||
StringBuffer sb = new StringBuffer();
|
StringBuffer sb = new StringBuffer();
|
||||||
for (int i = 0; i < x.length(); i++) {
|
for (int i = 0; i < x.length(); i++) {
|
||||||
String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
|
String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i));
|
||||||
if (null == ew)
|
if (null == ew)
|
||||||
return null;
|
return null;
|
||||||
sb.append(ew);
|
sb.append(ew);
|
||||||
|
@ -51,12 +52,14 @@ public class UnicodeCharToExtendedWylie {
|
||||||
return sb;
|
return sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the extended Wylie for x, or null if there is none.
|
/** Returns the THDL extended Wylie for x, or null if there is
|
||||||
* Understand that multiple Unicode code points (chars) map to
|
* none. Understand that multiple Unicode code points (chars)
|
||||||
* the same Extended Wylie representation. Understand also that
|
* map to the same THDL Extended Wylie representation.
|
||||||
* the scrap of Extended Wylie returned is only valid in certain
|
* Understand also that the scrap of THDL Extended Wylie returned
|
||||||
* contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */
|
* is only valid in certain contexts. For example, not all
|
||||||
public static String getExtendedWylieForUnicodeChar(char x) {
|
* consonants take ra-btags. DLC NOW what about
|
||||||
|
* canonicalization? */
|
||||||
|
public static String getThdlWylieForUnicodeCodepoint(char x) {
|
||||||
switch (x) {
|
switch (x) {
|
||||||
|
|
||||||
case '\u0F00': return "oM";
|
case '\u0F00': return "oM";
|
||||||
|
@ -130,9 +133,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F40': return "k";
|
case '\u0F40': return "k";
|
||||||
case '\u0F41': return "kh";
|
case '\u0F41': return "kh";
|
||||||
case '\u0F42': return "g";
|
case '\u0F42': return "g";
|
||||||
case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
|
case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F44': return "ng";
|
case '\u0F44': return "ng";
|
||||||
case '\u0F45': return "c";
|
case '\u0F45': return "c";
|
||||||
case '\u0F46': return "ch";
|
case '\u0F46': return "ch";
|
||||||
|
@ -142,31 +145,31 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F4A': return "T";
|
case '\u0F4A': return "T";
|
||||||
case '\u0F4B': return "Th";
|
case '\u0F4B': return "Th";
|
||||||
case '\u0F4C': return "D";
|
case '\u0F4C': return "D";
|
||||||
case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
|
case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F4E': return "N";
|
case '\u0F4E': return "N";
|
||||||
case '\u0F4F': return "t";
|
case '\u0F4F': return "t";
|
||||||
|
|
||||||
case '\u0F50': return "th";
|
case '\u0F50': return "th";
|
||||||
case '\u0F51': return "d";
|
case '\u0F51': return "d";
|
||||||
case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
|
case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F53': return "n";
|
case '\u0F53': return "n";
|
||||||
case '\u0F54': return "p";
|
case '\u0F54': return "p";
|
||||||
case '\u0F55': return "ph";
|
case '\u0F55': return "ph";
|
||||||
case '\u0F56': return "b";
|
case '\u0F56': return "b";
|
||||||
case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
|
case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F58': return "m";
|
case '\u0F58': return "m";
|
||||||
case '\u0F59': return "ts";
|
case '\u0F59': return "ts";
|
||||||
case '\u0F5A': return "tsh";
|
case '\u0F5A': return "tsh";
|
||||||
case '\u0F5B': return "dz";
|
case '\u0F5B': return "dz";
|
||||||
case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
|
case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F5D': return "w";
|
case '\u0F5D': return "w";
|
||||||
case '\u0F5E': return "zh";
|
case '\u0F5E': return "zh";
|
||||||
case '\u0F5F': return "z";
|
case '\u0F5F': return "z";
|
||||||
|
@ -180,9 +183,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F66': return "s";
|
case '\u0F66': return "s";
|
||||||
case '\u0F67': return "h";
|
case '\u0F67': return "h";
|
||||||
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
|
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
|
||||||
case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
|
case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB5'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
||||||
case '\u0F6A': return "r";
|
case '\u0F6A': return "r";
|
||||||
case '\u0F6B': return null;
|
case '\u0F6B': return null;
|
||||||
case '\u0F6C': return null;
|
case '\u0F6C': return null;
|
||||||
|
@ -227,9 +230,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F90': return "k";
|
case '\u0F90': return "k";
|
||||||
case '\u0F91': return "kh";
|
case '\u0F91': return "kh";
|
||||||
case '\u0F92': return "g";
|
case '\u0F92': return "g";
|
||||||
case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
|
case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F94': return "ng";
|
case '\u0F94': return "ng";
|
||||||
case '\u0F95': return "c";
|
case '\u0F95': return "c";
|
||||||
case '\u0F96': return "ch";
|
case '\u0F96': return "ch";
|
||||||
|
@ -239,31 +242,31 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F9A': return "T";
|
case '\u0F9A': return "T";
|
||||||
case '\u0F9B': return "Th";
|
case '\u0F9B': return "Th";
|
||||||
case '\u0F9C': return "D";
|
case '\u0F9C': return "D";
|
||||||
case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92')
|
case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F9E': return "N";
|
case '\u0F9E': return "N";
|
||||||
case '\u0F9F': return "t";
|
case '\u0F9F': return "t";
|
||||||
|
|
||||||
case '\u0FA0': return "th";
|
case '\u0FA0': return "th";
|
||||||
case '\u0FA1': return "d";
|
case '\u0FA1': return "d";
|
||||||
case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
|
case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0FA3': return "n";
|
case '\u0FA3': return "n";
|
||||||
case '\u0FA4': return "p";
|
case '\u0FA4': return "p";
|
||||||
case '\u0FA5': return "ph";
|
case '\u0FA5': return "ph";
|
||||||
case '\u0FA6': return "b";
|
case '\u0FA6': return "b";
|
||||||
case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
|
case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0FA8': return "m";
|
case '\u0FA8': return "m";
|
||||||
case '\u0FA9': return "ts";
|
case '\u0FA9': return "ts";
|
||||||
case '\u0FAA': return "tsh";
|
case '\u0FAA': return "tsh";
|
||||||
case '\u0FAB': return "dz";
|
case '\u0FAB': return "dz";
|
||||||
case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
|
case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0FAD': return "w";
|
case '\u0FAD': return "w";
|
||||||
case '\u0FAE': return "zh";
|
case '\u0FAE': return "zh";
|
||||||
case '\u0FAF': return "z";
|
case '\u0FAF': return "z";
|
||||||
|
@ -277,9 +280,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0FB6': return "s";
|
case '\u0FB6': return "s";
|
||||||
case '\u0FB7': return "h";
|
case '\u0FB7': return "h";
|
||||||
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
|
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
|
||||||
case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
|
case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB5'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
||||||
case '\u0FBA': return "w";
|
case '\u0FBA': return "w";
|
||||||
case '\u0FBB': return "y";
|
case '\u0FBB': return "y";
|
||||||
case '\u0FBC': return "r";
|
case '\u0FBC': return "r";
|
||||||
|
@ -309,7 +312,7 @@ public class UnicodeCharToExtendedWylie {
|
||||||
|
|
||||||
// This codepoint is in the range 0FD0-0FFF or is not in
|
// This codepoint is in the range 0FD0-0FFF or is not in
|
||||||
// the Tibetan range at all. In either case, there is no
|
// the Tibetan range at all. In either case, there is no
|
||||||
// corresponding Extended Wylie.
|
// corresponding THDL Extended Wylie.
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
} // end switch
|
} // end switch
|
377
source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
Normal file
377
source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
Normal file
|
@ -0,0 +1,377 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
|
import org.thdl.util.ThdlDebug;
|
||||||
|
|
||||||
|
/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such
|
||||||
|
* as whitespace or control characters or a Latin "character"), or a
|
||||||
|
* vertically stacked set of Tibetan consonants, vowels, marks, and
|
||||||
|
* signs. The Unicode string
|
||||||
|
* <code>"\u0F40\u0F0B\u0F41\u0F0B"</code> specifies
|
||||||
|
* four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
|
||||||
|
* you might notice), while the Unicode string
|
||||||
|
* <code>"\u0F66\u0FA5\u0F39\u0F90\u0FB5\u0F71\u0F80\u0F7F"</code>
|
||||||
|
* is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
|
||||||
|
* a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
|
||||||
|
* underneath all of that. I assume the latter grapheme cluster is
|
||||||
|
* nonsense, but it is considered one grapheme cluster because all
|
||||||
|
* but the first char are combining chars. See Unicode Technical
|
||||||
|
* Report 29.
|
||||||
|
*
|
||||||
|
* <p>As the above example demonstrates, not all
|
||||||
|
* UnicodeGraphemeClusters are syntactically legal in the Tibetan
|
||||||
|
* language. Not all of them are syntactically legal in Sanskrit
|
||||||
|
* transcribed in the Tibetan alphabet, either.</p>
|
||||||
|
*
|
||||||
|
* <p>The Unicode 3.2 standard (see especially Technical Report 29)
|
||||||
|
* refers to "grapheme clusters." A UnicodeGraphemeCluster is
|
||||||
|
* precisely a grapheme cluster as described by that standard. We
|
||||||
|
* interpret the standard as saying that <code>U+0F3E</code> and
|
||||||
|
* <code>U+0F3F</code> are each grapheme clusters unto themselves,
|
||||||
|
* even though they are combining codepoints.</p>
|
||||||
|
*
|
||||||
|
* @author David Chandler */
|
||||||
|
public class UnicodeGraphemeCluster
|
||||||
|
implements UnicodeReadyThunk, UnicodeConstants
|
||||||
|
{
|
||||||
|
/** @see #getCPHeight(char) */
|
||||||
|
private static final int MIN_HEIGHT = -6;
|
||||||
|
/** @see #getCPHeight(char) */
|
||||||
|
private static final int MAX_HEIGHT = 3;
|
||||||
|
|
||||||
|
/** The Unicode codepoints that compose this grapheme cluster.
|
||||||
|
This is legal, i.e. if there is a Tibetan vowel, it is the
|
||||||
|
last codepoint. It is in Normalization Form THDL (NFTHDL). */
|
||||||
|
private String unicodeString;
|
||||||
|
|
||||||
|
/** Do not use this constructor. */
|
||||||
|
private UnicodeGraphemeCluster() { super(); }
|
||||||
|
|
||||||
|
/** Creates a new GraphemeCluster given a legal sequence of
|
||||||
|
Unicode codepoints corresponding to a single grapheme
|
||||||
|
cluster.
|
||||||
|
@exception IllegalArgumentException if unicodeString is not a
|
||||||
|
syntactically correct Unicode 3.2 sequence (if it begins with
|
||||||
|
a combining codepoint or has a Tibetan vowel before another
|
||||||
|
combining character, for example, or if it is more than one
|
||||||
|
grapheme cluster. Note that syntactical correctness for
|
||||||
|
non-Tibetan codepoints is not likely to be known by this
|
||||||
|
routine. */
|
||||||
|
public UnicodeGraphemeCluster(String unicodeString)
|
||||||
|
throws IllegalArgumentException
|
||||||
|
{
|
||||||
|
// check legality:
|
||||||
|
// DLC NOW FIXME
|
||||||
|
|
||||||
|
// convert to NFTHDL:
|
||||||
|
this.unicodeString
|
||||||
|
= UnicodeUtils.toMostlyDecomposedUnicode(unicodeString, NORM_NFTHDL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a string of codepoints in NFTHDL form. */
|
||||||
|
public String getUnicodeRepresentation() {
|
||||||
|
return unicodeString;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true. */
|
||||||
|
public boolean hasUnicodeRepresentation() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true iff this stack could occur in syntactically
|
||||||
|
* correct, run-of-the-mill Tibetan (as opposed to Tibetanized
|
||||||
|
* Sanksrit, Chinese, et cetera). sga is a legal Tibetan stack,
|
||||||
|
* but g+g is not, for example. */
|
||||||
|
public boolean isLegalTibetan() {
|
||||||
|
// DLC FIXME: for those odd head marks etc., return true even
|
||||||
|
// though hasUnicodeRepresentation() will return false.
|
||||||
|
|
||||||
|
// Note that ra-btags and wa-zur both be present in legal
|
||||||
|
// Tibetan.
|
||||||
|
|
||||||
|
throw new Error("DLC FIXME: not yet implemented.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a <unicodeGraphemeCluster> element that contains the
|
||||||
|
* THDL Extended Wylie transliteration for this cluster. */
|
||||||
|
public String toConciseXML() {
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a <unicodeGraphemeCluster> element that contains this
|
||||||
|
* cluster broken down into its constituent decomposed
|
||||||
|
* codepoints. */
|
||||||
|
public String toVerboseXML() {
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the THDL Extended Wylie transliteration of this
|
||||||
|
grapheme cluster, or null if there is none (which happens for
|
||||||
|
a few Tibetan codepoints, if you'll recall). If needsVowel is
|
||||||
|
true, then an "a" will be appended when there is no EW_achung
|
||||||
|
or explicit simple vowel. If there is an explicit vowel or
|
||||||
|
EW_achung, it will always be present. Note that needsVowel is
|
||||||
|
provided because btags is the preferred THDL Extended Wylie
|
||||||
|
for the four contiguous grapheme clusters
|
||||||
|
<code>"\u0F56\u0F4F\u0F42\u0F66"</code>, and
|
||||||
|
needsVowel must be set to false for all but the grapheme
|
||||||
|
cluster corresponding to <code>\u0F4F</code> if you wish
|
||||||
|
to get the preferred THDL Extended Wylie. */
|
||||||
|
public String getThdlWylie(boolean needsVowel) {
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Given some (possibly unnormalized) Unicode 3.2 string unicode,
|
||||||
|
appends grapheme clusters to the vector of GraphemeClusters
|
||||||
|
grcls if grcls is nonnulla. Performs good error checking if
|
||||||
|
validate is true. If an error is found, grcls may have been
|
||||||
|
modified if nonnull. Setting grcls to null and setting
|
||||||
|
validate to true is sometimes useful for testing the validity
|
||||||
|
of a Unicode string.
|
||||||
|
@return the number of grapheme clusters that were or would
|
||||||
|
have been added to grcls
|
||||||
|
@exception BadTibetanUnicodeException if the unicode is not
|
||||||
|
syntactically legal
|
||||||
|
@exception IllegalArgumentException if correctErrors and
|
||||||
|
validate are both true
|
||||||
|
@exception NullPointerException if unicode is null */
|
||||||
|
public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
|
||||||
|
String unicode,
|
||||||
|
boolean validate,
|
||||||
|
boolean correctErrors)
|
||||||
|
throws // DLC SOON: BadTibetanUnicodeException,
|
||||||
|
IllegalArgumentException, NullPointerException
|
||||||
|
{
|
||||||
|
if (validate && correctErrors) {
|
||||||
|
throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
|
||||||
|
}
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
/*
|
||||||
|
if (start == i) {
|
||||||
|
// special tests at the beginning of input.
|
||||||
|
if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
|
||||||
|
throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (height == last_height) {
|
||||||
|
if ('\u0F39' == cp) {
|
||||||
|
if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
|
||||||
|
throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test to see if this last character has ended this
|
||||||
|
// grapheme cluster:
|
||||||
|
if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
|
||||||
|
// DLC: cp ENDS A GRAPHEME CLUSTER!!!
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
/** FIXMEDOC */
|
||||||
|
public String getTopToBottomCodepoints() {
|
||||||
|
return getTopToBottomCodepoints(new StringBuffer(unicodeString),
|
||||||
|
0, unicodeString.length()).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a new StringBuffer consisting of the codepoints in
|
||||||
|
NFTHDLString at indices [start, end) sorted in top-to-bottom
|
||||||
|
order, or null on some occasions when NFTHDLString is already
|
||||||
|
sorted. A top-to-bottom ordering is a useful form for
|
||||||
|
applications wishing to render the grapheme cluster. Note
|
||||||
|
that this method is only useful if NFTHDLString is part of or
|
||||||
|
an entire grapheme cluster. Does no error checking on
|
||||||
|
NFTHDLString.
|
||||||
|
@param NFTHDLString a buffer with characters at indices i,
|
||||||
|
where start <= i < end, being the Unicode codepoints for a
|
||||||
|
single grapheme cluster or part of a grapheme cluster
|
||||||
|
@param start NFTHDLString.charAt(start) is the first codepoint
|
||||||
|
dealt with
|
||||||
|
@param end NFTHDLString.charAt(end) is the first codepoint NOT
|
||||||
|
dealt with
|
||||||
|
@return null only if (but not necessarily if) NFTHDLString is
|
||||||
|
already sorted top-to-bottom, or the sorted form of
|
||||||
|
NFTHDLString */
|
||||||
|
private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
|
||||||
|
int start, int end)
|
||||||
|
{
|
||||||
|
if (end <= start) /* 0-length string. */
|
||||||
|
return null;
|
||||||
|
if (start + 1 == end) /* 1-length string. */
|
||||||
|
return null;
|
||||||
|
// else we have a string of length >= 2.
|
||||||
|
|
||||||
|
// We'll use the world's fastest sorting algorithm. Linear
|
||||||
|
// time, baby. Here are the ten or so mailboxes for our
|
||||||
|
// postman's sort:
|
||||||
|
StringBuffer chunksAtCommonHeights[]
|
||||||
|
= new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
|
||||||
|
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
char cp = NFTHDLString.charAt(i);
|
||||||
|
int height = getCPHeight(cp);
|
||||||
|
|
||||||
|
// initialize mailbox if necessary.
|
||||||
|
if (null == chunksAtCommonHeights[height - MIN_HEIGHT]) {
|
||||||
|
chunksAtCommonHeights[height - MIN_HEIGHT]
|
||||||
|
= new StringBuffer(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// put this cp into the correct mailbox.
|
||||||
|
chunksAtCommonHeights[height - MIN_HEIGHT].append(cp);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now concatenate together the mailboxes:
|
||||||
|
StringBuffer sb = new StringBuffer(end - start);
|
||||||
|
for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--) {
|
||||||
|
if (null != chunksAtCommonHeights[h - MIN_HEIGHT]) {
|
||||||
|
sb.append(chunksAtCommonHeights[h - MIN_HEIGHT]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
    /** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
        This relative height is 0 for a base consonant, digit,
        punctuation, mark, or sign.  It is -1 for a subjoined
        consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
        EWV_gigu, and so on according to the height these codepoints
        appear relative to one another when on the same stack.  If two
        codepoints have equal height, they should not exist in the
        same grapheme cluster unless one is <code>U+0F39</code>, which
        is an integral part of a consonant when tacked on to, e.g.,
        EWC_PHA.

        <p>If x is not a Unicode 3.2 codepoint in the Tibetan range,
        or if x is not in NFTHDL form, 0 is returned.  The height code
        of <code>U+0F76</code> is not valid, and it is not an accident
        that <code>U+0F76</code> is not in NFTHDL form.</p> */
    private static int getCPHeight(char x) {
        // DLC make this an assertion:
        // NOTE(review): toNormalizedForm presumably returns null when x
        // needs no further decomposition, so this verifies x is already
        // in NFTHDL form -- confirm against UnicodeUtils.
        ThdlDebug.verify(null == UnicodeUtils.toNormalizedForm(x, NORM_NFTHDL));

        if (x >= '\u0F90' && x <= '\u0FAC'
            || x >= '\u0FAE' && x <= '\u0FBC') {
            // subjoined consonant.  Note that wa-zur is an exception.
            // (U+0FAD, wa-zur, is deliberately excluded from these two
            // ranges and handled in the switch below.)
            return -1;
        } else if (x >= '\u0F00' && x <= '\u0F17'
                   || x >= '\u0F1A' && x <= '\u0F34'
                   || x >= '\u0F3A' && x <= '\u0F3D'
                   || x >= '\u0F40' && x <= '\u0F6A' // consonants
                   || x >= '\u0F88' && x <= '\u0F8B'
                   || x >= '\u0FBE' && x <= '\u0FCF') {
            // neutral height:
            return 0;
        } else { // Oddballs.
            switch (x) {
            //
            // non-combining:
            //
            case '\u0F36':
            case '\u0F38':
            case '\u0F85':
                return 0;

            //
            // combining, but left-to-right combining:
            //
            case '\u0F3E':
            case '\u0F3F':
            case '\u0F7F':
                return 0;

            //
            // combining by coming below:
            //
            case '\u0FAD':
                return -2; // wa-zur
            case '\u0F71':
                return -3; // a-chung
            case '\u0F74':
            case '\u0F84':
                return -4; // DLC CHECKME
            case '\u0F18': // combines with digits
            case '\u0F19': // combines with digits
                return -5;
            case '\u0F35':
            case '\u0F37':
            case '\u0FC6': {
                // Sanity-check that the bottom of the height scale is
                // what the mailbox array in the caller expects.
                ThdlDebug.verify(-6 == MIN_HEIGHT);
                return -6; // min height
            }

            //
            // combining by coming above:
            //
            case '\u0F72':
            case '\u0F7A':
            case '\u0F7B':
            case '\u0F7C':
            case '\u0F7D':
            case '\u0F80':
                return 1;
            case '\u0F7E':
            case '\u0F82':
            case '\u0F83':
                return 2; // these three come above 0F7C, right? (DLC CHECKME)
            case '\u0F86':
            case '\u0F87': {
                // Sanity-check the top of the height scale likewise.
                ThdlDebug.verify(3 == MAX_HEIGHT);
                return 3; // max height
            }

            //
            // exceptional case:
            //
            // some would say +1, but then "\u0F40\u0FA5\u0F39" will
            // not have a5 combine with 39.  Unicode could well have
            // put in a single codepoint for "\u0FA5\u0F39" IMO.
            case '\u0F39': return 0;

            default: {
                if (x >= '\u0F00' && x <= '\u0FFF') {
                    // This wasn't explicitly handled?  Hmmm...  This
                    // won't ever happen for NFTHDL-formed input.
                    ThdlDebug.noteIffyCode();
                }

                // This codepoint is not in the Tibetan range.
                return 0;
            }
            } // end switch
        }
    }
|
||||||
|
    /** DLC SOON */
    public boolean isTibetan() {
        // DLC FIXME: unimplemented stub -- always throws.
        throw new Error("DLC FIXME: not yet implemented.");
    }
|
||||||
|
}
|
||||||
|
|
|
@ -97,10 +97,12 @@ public class UnicodeUtils implements UnicodeConstants {
|
||||||
Unicode codepoints, into either Normalization Form KD (NFKD),
|
Unicode codepoints, into either Normalization Form KD (NFKD),
|
||||||
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
|
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
|
||||||
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
|
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
|
||||||
for {@link org.thdl.tib.text.tshegbar#GraphemeCluster} because
|
for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster}
|
||||||
NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
|
because NFKD normalizes <code>U+0F0C</code> and neither NFD
|
||||||
codepoints, and it never uses codepoints whose use has been
|
nor NFKD breaks down <code>U+0F00</code> into its constituent
|
||||||
{@link #isDiscouraged(char) discouraged}.
|
codepoints. NFTHDL uses a maximum of codepoints, and it never
|
||||||
|
uses codepoints whose use has been {@link #isDiscouraged(char)
|
||||||
|
discouraged}.
|
||||||
|
|
||||||
<p>The Tibetan passages of the returned string are in the
|
<p>The Tibetan passages of the returned string are in the
|
||||||
chosen normalized form, but codepoints outside of the {@link
|
chosen normalized form, but codepoints outside of the {@link
|
||||||
|
@ -170,6 +172,8 @@ public class UnicodeUtils implements UnicodeConstants {
|
||||||
// Where not specified, the NFKD and NFTHDL forms are
|
// Where not specified, the NFKD and NFTHDL forms are
|
||||||
// identical to the NFD form.
|
// identical to the NFD form.
|
||||||
switch (tibetanUnicodeCP) {
|
switch (tibetanUnicodeCP) {
|
||||||
|
case '\u0F00': return ((normalizationForm == NORM_NFTHDL)
|
||||||
|
? "\u0F68\u0F7C\u0F7E" : null);
|
||||||
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
|
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
|
||||||
? "\u0F0B" : null);
|
? "\u0F0B" : null);
|
||||||
case '\u0F43': return "\u0F42\u0FB7";
|
case '\u0F43': return "\u0F42\u0FB7";
|
||||||
|
@ -282,9 +286,37 @@ public class UnicodeUtils implements UnicodeConstants {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a human-readable, ASCII form of the Unicode codepoint
|
/** Returns a human-readable, ASCII form of the Unicode codepoint
|
||||||
ch. */
|
cp. */
|
||||||
public static String unicodeCPToString(char ch) {
|
public static String unicodeCodepointToString(char cp) {
|
||||||
return "U+" + Integer.toHexString((int)ch);
|
if (cp < '\u0010')
|
||||||
|
return "\\u000" + Integer.toHexString((int)cp);
|
||||||
|
else if (cp < '\u0100')
|
||||||
|
return "\\u00" + Integer.toHexString((int)cp);
|
||||||
|
else if (cp < '\u1000')
|
||||||
|
return "\\u0" + Integer.toHexString((int)cp);
|
||||||
|
else
|
||||||
|
return "\\u" + Integer.toHexString((int)cp);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String unicodeStringToString(String s) {
|
||||||
|
StringBuffer sb = new StringBuffer(s.length() * 6);
|
||||||
|
for (int i = 0; i < s.length(); i++) {
|
||||||
|
sb.append(unicodeCodepointToString(s.charAt(i)));
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true iff cp is a Unicode 3.2 Tibetan consonant,
|
||||||
|
subjoined or not. This counts precomposed consonant stacks
|
||||||
|
like <code>U+0FA7</code> as consonants. If you don't wish to
|
||||||
|
treat such as consonants, then put the input into NORM_NFD,
|
||||||
|
NORM_NFKD, or NORM_NFTHDL first. If it changes under such a
|
||||||
|
normalization, it is a precomposed consonant. */
|
||||||
|
public static boolean isTibetanConsonant(char cp) {
|
||||||
|
return (((cp >= '\u0F40' && cp <= '\u0F6A')
|
||||||
|
|| (cp >= '\u0F90' && cp <= '\u0FBC'))
|
||||||
|
&& '\u0F48' != cp
|
||||||
|
&& '\u0F98' != cp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue