Renamed UnicodeCharToExtendedWylie.java to UnicodeCodepointToThdlWylie.java.

Added a new class, UnicodeGraphemeCluster, that can tell you
the components of a grapheme cluster from top to bottom.  It does not
yet have good error checking; it is not yet finished.

Next is to parse clean Unicode into GraphemeClusters.  After that come
scanning dirty Unicode into best-guess GraphemeClusters and scanning
dirty Unicode to get nice error messages.
Author: dchandler
Date:   2002-12-17 13:51:18 +00:00
Commit: 7ea185fa01 (parent 8e8a23c6a6)

4 changed files with 481 additions and 69 deletions
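
For orientation, here is a minimal usage sketch of the renamed converter.  The
class name, method names, and package come from the diff below; the wrapper
class, its name, and the expected output in the comments are illustrative
assumptions, not part of this commit.

    import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;

    /** Hypothetical caller, not part of this commit. */
    public class ThdlWylieSketch {
        public static void main(String[] args) {
            // One codepoint: U+0F40 (TIBETAN LETTER KA) maps to "k" in the switch below.
            String ka = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F40');
            System.out.println(ka); // k

            // A short string; the result is null iff some codepoint has no
            // THDL Extended Wylie representation.
            StringBuffer w = UnicodeCodepointToThdlWylie
                .getThdlWylieForUnicodeString("\u0F40\u0F41\u0F42");
            System.out.println(w); // kkhg
        }
    }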

LegalTshegBar.java

@@ -748,7 +748,7 @@ public class LegalTshegBar
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
return internalThrowThing(throwIfIllegal,
"Illegal suffix -- not one of the ten legal suffixes: "
- + UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
+ + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
}
}
}
@@ -837,7 +837,7 @@ public class LegalTshegBar
boolean disambiguatorNeeded = false;
char prefix = getPrefix();
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
if (!hasHeadLetter()) {
if (EWC_ya == rootLetter) {
if (isConsonantThatTakesYaBtags(prefix))
@@ -857,55 +857,55 @@ public class LegalTshegBar
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
}
if (hasHeadLetter())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(rootLetter));
if (hasSubjoinedLetter())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter()));
if (hasWaZurSubjoinedToRootLetter())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EWSUB_wa_zur));
// a-chung is treated, in THDL Extended Wylie, like a vowel.
// I.e., you don't have 'pAa', you have 'pA'.
if (hasAChungOnRootLetter()) {
if (hasExplicitVowel()) {
if (EWV_i == getVowel()) {
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F73'));
} else if (EWV_u == getVowel()) {
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F75'));
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
// The exception to the rule for a-chung and vowels...
// DLC FIXME: are these allowed in legal Tibetan?
// EWTS would have special cases for them if so,
// I'd wager...
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
} else {
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
}
} else {
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
}
} else {
if (hasExplicitVowel())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
else
sb.append("a");
}
if (hasSuffix()) {
String suf = getSuffix();
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
if (suf.length() > 1) {
// DLC assert, don't verify, that the length is two.
// This could change if I learn of more suffix
// particles.
ThdlDebug.verify(2 == suf.length());
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
}
}
if (hasPostsuffix())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
return sb;
}
@@ -929,18 +929,18 @@ public class LegalTshegBar
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
+ (hasPrefix()
? ("prefix=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPrefix()) + "\" ")
: "")
+ (hasHeadLetter()
? ("headLetter=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())
+ "\" ")
: "")
+ ("rootLetter=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getRootLetter()) + "\" ")
+ (hasSubjoinedLetter()
? ("subjoinedLetter=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter())
+ "\" ")
: "")
+ (hasWaZurSubjoinedToRootLetter()
@@ -953,17 +953,17 @@ public class LegalTshegBar
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
+ ("vowel=\""
+ (hasExplicitVowel()
- ? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
+ ? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
: "a")
+ "\" ")
+ (hasSuffix()
? ("suffix=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
+ "\" ")
: "")
+ (hasPostsuffix()
? ("postsuffix=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())
+ "\" ")
: "")
+ "/>");

UnicodeCharToExtendedWylie.java -> UnicodeCodepointToThdlWylie.java (renamed)

@@ -21,7 +21,7 @@ package org.thdl.tib.text.tshegbar;
import org.thdl.tib.text.TibetanMachineWeb;
/** This noninstantiable class allows for converting from Unicode
- * codepoints to Extended Wylie. It cannot be used for long
+ * codepoints to THDL Extended Wylie. It cannot be used for long
* stretches of text, though, as it is unaware of context, which is
* essential to understanding a non-trivial string of Tibetan
* Unicode.
@@ -29,21 +29,22 @@ import org.thdl.tib.text.TibetanMachineWeb;
* <p>See the document by Nathaniel Garson and David Germano entitled
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
* a couple of issues with the November 18, 2001 revision of that
- * document; these issues are in the Bugs tracker at our SourceForge site.</p>
+ * document; these issues are in the Bugs tracker at our SourceForge
+ * site.</p>
*
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
*
* @author David Chandler */
- public class UnicodeCharToExtendedWylie {
+ public class UnicodeCodepointToThdlWylie {
- /** Returns the extended Wylie for the very simple sequence x.
- * Returns null iff some (Unicode) char in s has no extended
- * Wylie representation. This is unaware of context, so use it
- * sparingly. */
- public static StringBuffer getExtendedWylieForUnicodeString(String x) {
+ /** Returns the THDL extended Wylie for the very simple sequence
+ * x. Returns null iff some (Unicode) char in s has no THDL
+ * extended Wylie representation. This is unaware of context, so
+ * use it sparingly. */
+ public static StringBuffer getThdlWylieForUnicodeString(String x) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < x.length(); i++) {
- String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
+ String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i));
if (null == ew)
return null;
sb.append(ew);
@@ -51,12 +52,14 @@ public class UnicodeCharToExtendedWylie {
return sb;
}
- /** Returns the extended Wylie for x, or null if there is none.
- * Understand that multiple Unicode code points (chars) map to
- * the same Extended Wylie representation. Understand also that
- * the scrap of Extended Wylie returned is only valid in certain
- * contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */
- public static String getExtendedWylieForUnicodeChar(char x) {
+ /** Returns the THDL extended Wylie for x, or null if there is
+ * none. Understand that multiple Unicode code points (chars)
+ * map to the same THDL Extended Wylie representation.
+ * Understand also that the scrap of THDL Extended Wylie returned
+ * is only valid in certain contexts. For example, not all
+ * consonants take ra-btags. DLC NOW what about
+ * canonicalization? */
+ public static String getThdlWylieForUnicodeCodepoint(char x) {
switch (x) {
case '\u0F00': return "oM";
@@ -130,9 +133,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0F40': return "k";
case '\u0F41': return "kh";
case '\u0F42': return "g";
- case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
+ case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F44': return "ng";
case '\u0F45': return "c";
case '\u0F46': return "ch";
@@ -142,31 +145,31 @@ public class UnicodeCharToExtendedWylie {
case '\u0F4A': return "T";
case '\u0F4B': return "Th";
case '\u0F4C': return "D";
- case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
+ case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F4E': return "N";
case '\u0F4F': return "t";
case '\u0F50': return "th";
case '\u0F51': return "d";
- case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
+ case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F53': return "n";
case '\u0F54': return "p";
case '\u0F55': return "ph";
case '\u0F56': return "b";
- case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
+ case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F58': return "m";
case '\u0F59': return "ts";
case '\u0F5A': return "tsh";
case '\u0F5B': return "dz";
- case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
+ case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F5D': return "w";
case '\u0F5E': return "zh";
case '\u0F5F': return "z";
@@ -180,9 +183,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0F66': return "s";
case '\u0F67': return "h";
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
- case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
+ case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB5'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB5'));
case '\u0F6A': return "r";
case '\u0F6B': return null;
case '\u0F6C': return null;
@@ -227,9 +230,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0F90': return "k";
case '\u0F91': return "kh";
case '\u0F92': return "g";
- case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
+ case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F94': return "ng";
case '\u0F95': return "c";
case '\u0F96': return "ch";
@@ -239,31 +242,31 @@ public class UnicodeCharToExtendedWylie {
case '\u0F9A': return "T";
case '\u0F9B': return "Th";
case '\u0F9C': return "D";
- case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92')
+ case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F9E': return "N";
case '\u0F9F': return "t";
case '\u0FA0': return "th";
case '\u0FA1': return "d";
- case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
+ case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0FA3': return "n";
case '\u0FA4': return "p";
case '\u0FA5': return "ph";
case '\u0FA6': return "b";
- case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
+ case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0FA8': return "m";
case '\u0FA9': return "ts";
case '\u0FAA': return "tsh";
case '\u0FAB': return "dz";
- case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
+ case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0FAD': return "w";
case '\u0FAE': return "zh";
case '\u0FAF': return "z";
@@ -277,9 +280,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0FB6': return "s";
case '\u0FB7': return "h";
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
- case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
+ case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB5'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB5'));
case '\u0FBA': return "w";
case '\u0FBB': return "y";
case '\u0FBC': return "r";
@@ -309,7 +312,7 @@ public class UnicodeCharToExtendedWylie {
// This codepoint is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no
- // corresponding Extended Wylie.
+ // corresponding THDL Extended Wylie.
return null;
}
} // end switch
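
The converter's null contract above mirrors the error reporting in
LegalTshegBar, which prints the offending codepoint.  A hypothetical caller
might combine it with the unicodeCodepointToString helper added to
UnicodeUtils in the last file of this commit.  The class below is a sketch;
the package of UnicodeUtils is an assumption based on the unqualified
references to it elsewhere in the diff.

    import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
    import org.thdl.tib.text.tshegbar.UnicodeUtils; // package assumed

    /** Hypothetical helper, not part of this commit. */
    public class LossyWylieConverter {
        /** Converts s codepoint by codepoint, flagging anything that has no
            THDL Extended Wylie representation instead of returning null. */
        public static String convert(String s) {
            StringBuffer sb = new StringBuffer();
            for (int i = 0; i < s.length(); i++) {
                char cp = s.charAt(i);
                String ew = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(cp);
                if (null == ew) {
                    // e.g. U+0F6B returns null in the switch above.
                    sb.append("[no Wylie for " + UnicodeUtils.unicodeCodepointToString(cp) + "]");
                } else {
                    sb.append(ew);
                }
            }
            return sb.toString();
        }
    }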

UnicodeGraphemeCluster.java (new file)

@@ -0,0 +1,377 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
import java.util.Vector;
import org.thdl.util.ThdlDebug;
/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such
* as whitespace or control characters or a Latin "character"), or a
* vertically stacked set of Tibetan consonants, vowels, marks, and
* signs. The Unicode string
* <code>"&#92;u0F40&#92;u0F0B&#92;u0F41&#92;u0F0B"</code> specifies
* four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
* you might notice), while the Unicode string
* <code>"&#92;u0F66&#92;u0FA5&#92;u0F39&#92;u0F90&#92;u0FB5&#92;u0F71&#92;u0F80&#92;u0F7F"</code>
* is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
* a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
* underneath all of that. I assume the latter grapheme cluster is
* nonsense, but it is considered one grapheme cluster because all
* but the first char are combining chars. See Unicode Technical
* Report 29.
*
* <p>As the above example demonstrates, not all
* UnicodeGraphemeClusters are syntactically legal in the Tibetan
* language. Not all of them are syntactically legal in Sanskrit
* transcribed in the Tibetan alphabet, either.</p>
*
* <p>The Unicode 3.2 standard (see especially Technical Report 29)
* refers to "grapheme clusters." A UnicodeGraphemeCluster is
* precisely a grapheme cluster as described by that standard. We
* interpret the standard as saying that <code>U+0F3E</code> and
* <code>U+0F3F</code> are each grapheme clusters unto themselves,
* even though they are combining codepoints.</p>
*
* @author David Chandler */
public class UnicodeGraphemeCluster
implements UnicodeReadyThunk, UnicodeConstants
{
/** @see #getCPHeight(char) */
private static final int MIN_HEIGHT = -6;
/** @see #getCPHeight(char) */
private static final int MAX_HEIGHT = 3;
/** The Unicode codepoints that compose this grapheme cluster.
This is legal, i.e. if there is a Tibetan vowel, it is the
last codepoint. It is in Normalization Form THDL (NFTHDL). */
private String unicodeString;
/** Do not use this constructor. */
private UnicodeGraphemeCluster() { super(); }
/** Creates a new GraphemeCluster given a legal sequence of
Unicode codepoints corresponding to a single grapheme
cluster.
@exception IllegalArgumentException if unicodeString is not a
syntactically correct Unicode 3.2 sequence (if it begins with
a combining codepoint or has a Tibetan vowel before another
combining character, for example, or if it is more than one
grapheme cluster). Note that syntactical correctness for
non-Tibetan codepoints is not likely to be known by this
routine. */
public UnicodeGraphemeCluster(String unicodeString)
throws IllegalArgumentException
{
// check legality:
// DLC NOW FIXME
// convert to NFTHDL:
this.unicodeString
= UnicodeUtils.toMostlyDecomposedUnicode(unicodeString, NORM_NFTHDL);
}
/** Returns a string of codepoints in NFTHDL form. */
public String getUnicodeRepresentation() {
return unicodeString;
}
/** Returns true. */
public boolean hasUnicodeRepresentation() {
return true;
}
/** Returns true iff this stack could occur in syntactically
* correct, run-of-the-mill Tibetan (as opposed to Tibetanized
* Sanskrit, Chinese, et cetera). sga is a legal Tibetan stack,
* but g+g is not, for example. */
public boolean isLegalTibetan() {
// DLC FIXME: for those odd head marks etc., return true even
// though hasUnicodeRepresentation() will return false.
// Note that ra-btags and wa-zur can both be present in legal
// Tibetan.
throw new Error("DLC FIXME: not yet implemented.");
}
/** Returns a <unicodeGraphemeCluster> element that contains the
* THDL Extended Wylie transliteration for this cluster. */
public String toConciseXML() {
throw new Error("DLC NOW unimplemented");
}
/** Returns a <unicodeGraphemeCluster> element that contains this
* cluster broken down into its constituent decomposed
* codepoints. */
public String toVerboseXML() {
throw new Error("DLC NOW unimplemented");
}
/** Returns the THDL Extended Wylie transliteration of this
grapheme cluster, or null if there is none (which happens for
a few Tibetan codepoints, if you'll recall). If needsVowel is
true, then an "a" will be appended when there is no EW_achung
or explicit simple vowel. If there is an explicit vowel or
EW_achung, it will always be present. Note that needsVowel is
provided because btags is the preferred THDL Extended Wylie
for the four contiguous grapheme clusters
<code>"&#92;u0F56&#92;u0F4F&#92;u0F42&#92;u0F66"</code>, and
needsVowel must be set to false for all but the grapheme
cluster corresponding to <code>&#92;u0F4F</code> if you wish
to get the preferred THDL Extended Wylie. */
public String getThdlWylie(boolean needsVowel) {
throw new Error("DLC NOW unimplemented");
}
/** Given some (possibly unnormalized) Unicode 3.2 string unicode,
appends grapheme clusters to the vector of GraphemeClusters
grcls if grcls is nonnull. Performs good error checking if
validate is true. If an error is found, grcls may have been
modified if nonnull. Setting grcls to null and setting
validate to true is sometimes useful for testing the validity
of a Unicode string.
@return the number of grapheme clusters that were or would
have been added to grcls
@exception BadTibetanUnicodeException if the unicode is not
syntactically legal
@exception IllegalArgumentException if correctErrors and
validate are both true
@exception NullPointerException if unicode is null */
public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
String unicode,
boolean validate,
boolean correctErrors)
throws // DLC SOON: BadTibetanUnicodeException,
IllegalArgumentException, NullPointerException
{
if (validate && correctErrors) {
throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
}
throw new Error("DLC NOW unimplemented");
/*
if (start == i) {
// special tests at the beginning of input.
if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
}
}
if (height == last_height) {
if ('\u0F39' == cp) {
if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
}
} else {
// DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
}
}
// Test to see if this last character has ended this
// grapheme cluster:
if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
// DLC: cp ENDS A GRAPHEME CLUSTER!!!
}
*/
}
/** FIXMEDOC */
public String getTopToBottomCodepoints() {
return getTopToBottomCodepoints(new StringBuffer(unicodeString),
0, unicodeString.length()).toString();
}
/** Returns a new StringBuffer consisting of the codepoints in
NFTHDLString at indices [start, end) sorted in top-to-bottom
order, or null on some occasions when NFTHDLString is already
sorted. A top-to-bottom ordering is a useful form for
applications wishing to render the grapheme cluster. Note
that this method is only useful if NFTHDLString is part of or
an entire grapheme cluster. Does no error checking on
NFTHDLString.
@param NFTHDLString a buffer with characters at indices i,
where start <= i < end, being the Unicode codepoints for a
single grapheme cluster or part of a grapheme cluster
@param start NFTHDLString.charAt(start) is the first codepoint
dealt with
@param end NFTHDLString.charAt(end) is the first codepoint NOT
dealt with
@return null only if (but not necessarily if) NFTHDLString is
already sorted top-to-bottom, or the sorted form of
NFTHDLString */
private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
int start, int end)
{
if (end <= start) /* 0-length string. */
return null;
if (start + 1 == end) /* 1-length string. */
return null;
// else we have a string of length >= 2.
// We'll use the world's fastest sorting algorithm. Linear
// time, baby. Here are the ten or so mailboxes for our
// postman's sort:
StringBuffer chunksAtCommonHeights[]
= new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
for (int i = start; i < end; i++) {
char cp = NFTHDLString.charAt(i);
int height = getCPHeight(cp);
// initialize mailbox if necessary.
if (null == chunksAtCommonHeights[height - MIN_HEIGHT]) {
chunksAtCommonHeights[height - MIN_HEIGHT]
= new StringBuffer(2);
}
// put this cp into the correct mailbox.
chunksAtCommonHeights[height - MIN_HEIGHT].append(cp);
}
// Now concatenate together the mailboxes:
StringBuffer sb = new StringBuffer(end - start);
for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--) {
if (null != chunksAtCommonHeights[h - MIN_HEIGHT]) {
sb.append(chunksAtCommonHeights[h - MIN_HEIGHT]);
}
}
return sb;
}
/** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
This relative height is 0 for a base consonant, digit,
punctuation, mark, or sign. It is -1 for a subjoined
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
EWV_gigu, and so on according to the height these codepoints
appear relative to one another when on the same stack. If two
codepoints have equal height, they should not exist in the
same grapheme cluster unless one is <code>U+0F39</code>, which
is an integral part of a consonant when tacked on to, e.g.,
EWC_PHA.
<p>If x is not a Unicode 3.2 codepoint in the Tibetan range,
or if x is not in NFTHDL form, 0 is returned. The height code
of <code>U+0F76</code> is not valid, and it is not an accident
that <code>U+0F76</code> is not in NFTHDL form.</p> */
private static int getCPHeight(char x) {
// DLC make this an assertion:
ThdlDebug.verify(null == UnicodeUtils.toNormalizedForm(x, NORM_NFTHDL));
if (x >= '\u0F90' && x <= '\u0FAC'
|| x >= '\u0FAE' && x <= '\u0FBC') {
// subjoined consonant. Note that wa-zur is an exception.
return -1;
} else if (x >= '\u0F00' && x <= '\u0F17'
|| x >= '\u0F1A' && x <= '\u0F34'
|| x >= '\u0F3A' && x <= '\u0F3D'
|| x >= '\u0F40' && x <= '\u0F6A' // consonants
|| x >= '\u0F88' && x <= '\u0F8B'
|| x >= '\u0FBE' && x <= '\u0FCF') {
// neutral height:
return 0;
} else { // Oddballs.
switch (x) {
//
// non-combining:
//
case '\u0F36':
case '\u0F38':
case '\u0F85':
return 0;
//
// combining, but left-to-right combining:
//
case '\u0F3E':
case '\u0F3F':
case '\u0F7F':
return 0;
//
// combining by coming below:
//
case '\u0FAD':
return -2; // wa-zur
case '\u0F71':
return -3; // a-chung
case '\u0F74':
case '\u0F84':
return -4; // DLC CHECKME
case '\u0F18': // combines with digits
case '\u0F19': // combines with digits
return -5;
case '\u0F35':
case '\u0F37':
case '\u0FC6': {
ThdlDebug.verify(-6 == MIN_HEIGHT);
return -6; // min height
}
//
// combining by coming above:
//
case '\u0F72':
case '\u0F7A':
case '\u0F7B':
case '\u0F7C':
case '\u0F7D':
case '\u0F80':
return 1;
case '\u0F7E':
case '\u0F82':
case '\u0F83':
return 2; // these three come above 0F7C, right? (DLC CHECKME)
case '\u0F86':
case '\u0F87': {
ThdlDebug.verify(3 == MAX_HEIGHT);
return 3; // max height
}
//
// exceptional case:
//
// some would say +1, but then "\u0F40\u0FA5\u0F39" will
// not have a5 combine with 39. Unicode could well have
// put in a single codepoint for "\u0FA5\u0F39" IMO.
case '\u0F39': return 0;
default: {
if (x >= '\u0F00' && x <= '\u0FFF') {
// This wasn't explicitly handled? Hmmm... This
// won't ever happen for NFTHDL-formed input.
ThdlDebug.noteIffyCode();
}
// This codepoint is not in the Tibetan range.
return 0;
}
} // end switch
}
}
/** DLC SOON */
public boolean isTibetan() {
throw new Error("DLC FIXME: not yet implemented.");
}
}
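
getTopToBottomCodepoints above is a linear-time bucket sort (the "postman's
sort" in the comments) keyed on the relative height that getCPHeight assigns
to each codepoint.  The stand-alone sketch below illustrates the same idea
with a deliberately simplified height table; the class name, the toy height
function, and the example input are assumptions for illustration only.

    /** Simplified illustration of the height-bucket sort above; not part of this commit. */
    public class HeightSortSketch {
        private static final int MIN_HEIGHT = -6;
        private static final int MAX_HEIGHT = 3;

        /** Toy subset of getCPHeight: base consonants 0, subjoined roughly -1,
            a-chung -3, gi-gu +1; everything else 0. */
        private static int height(char cp) {
            if (cp >= '\u0F40' && cp <= '\u0F6A') return 0;  // base consonants
            if (cp >= '\u0F90' && cp <= '\u0FBC') return -1; // subjoined consonants (simplified)
            if (cp == '\u0F71') return -3;                   // a-chung
            if (cp == '\u0F72') return 1;                    // gi-gu
            return 0;
        }

        /** Buckets cps by height, then emits the buckets from top to bottom. */
        public static String topToBottom(String cps) {
            StringBuffer[] buckets = new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
            for (int i = 0; i < cps.length(); i++) {
                char cp = cps.charAt(i);
                int h = height(cp);
                if (null == buckets[h - MIN_HEIGHT])
                    buckets[h - MIN_HEIGHT] = new StringBuffer(2);
                buckets[h - MIN_HEIGHT].append(cp);
            }
            StringBuffer out = new StringBuffer(cps.length());
            for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--)
                if (null != buckets[h - MIN_HEIGHT])
                    out.append(buckets[h - MIN_HEIGHT]);
            return out.toString();
        }

        public static void main(String[] args) {
            // ka + subjoined ya + gi-gu in storage order; top-to-bottom order
            // puts the vowel first: U+0F72, then U+0F40, then U+0FB1.
            System.out.println(topToBottom("\u0F40\u0FB1\u0F72"));
        }
    }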

UnicodeUtils.java

@@ -97,10 +97,12 @@ public class UnicodeUtils implements UnicodeConstants {
Unicode codepoints, into either Normalization Form KD (NFKD),
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
- for {@link org.thdl.tib.text.tshegbar#GraphemeCluster} because
- NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
- codepoints, and it never uses codepoints whose use has been
- {@link #isDiscouraged(char) discouraged}.
+ for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster}
+ because NFKD normalizes <code>U+0F0C</code> and neither NFD
+ nor NFKD breaks down <code>U+0F00</code> into its constituent
+ codepoints. NFTHDL uses a maximum of codepoints, and it never
+ uses codepoints whose use has been {@link #isDiscouraged(char)
+ discouraged}.
<p>The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
@@ -170,6 +172,8 @@ public class UnicodeUtils implements UnicodeConstants {
// Where not specified, the NFKD and NFTHDL forms are
// identical to the NFD form.
switch (tibetanUnicodeCP) {
+ case '\u0F00': return ((normalizationForm == NORM_NFTHDL)
+ ? "\u0F68\u0F7C\u0F7E" : null);
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
? "\u0F0B" : null);
case '\u0F43': return "\u0F42\u0FB7";
@@ -282,9 +286,37 @@ public class UnicodeUtils implements UnicodeConstants {
}
/** Returns a human-readable, ASCII form of the Unicode codepoint
- ch. */
- public static String unicodeCPToString(char ch) {
- return "U+" + Integer.toHexString((int)ch);
+ cp. */
+ public static String unicodeCodepointToString(char cp) {
+ if (cp < '\u0010')
+ return "\\u000" + Integer.toHexString((int)cp);
+ else if (cp < '\u0100')
+ return "\\u00" + Integer.toHexString((int)cp);
+ else if (cp < '\u1000')
+ return "\\u0" + Integer.toHexString((int)cp);
+ else
+ return "\\u" + Integer.toHexString((int)cp);
}
+ public static String unicodeStringToString(String s) {
+ StringBuffer sb = new StringBuffer(s.length() * 6);
+ for (int i = 0; i < s.length(); i++) {
+ sb.append(unicodeCodepointToString(s.charAt(i)));
+ }
+ return sb.toString();
+ }
+ /** Returns true iff cp is a Unicode 3.2 Tibetan consonant,
+ subjoined or not. This counts precomposed consonant stacks
+ like <code>U+0FA7</code> as consonants. If you don't wish to
+ treat such as consonants, then put the input into NORM_NFD,
+ NORM_NFKD, or NORM_NFTHDL first. If it changes under such a
+ normalization, it is a precomposed consonant. */
+ public static boolean isTibetanConsonant(char cp) {
+ return (((cp >= '\u0F40' && cp <= '\u0F6A')
+ || (cp >= '\u0F90' && cp <= '\u0FBC'))
+ && '\u0F48' != cp
+ && '\u0F98' != cp);
+ }
}
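
As a quick sanity check of the new UnicodeUtils helpers, a hypothetical driver
might look like the following.  The package of UnicodeUtils is not shown in
the hunk above and is assumed here; the expected results in the comments
follow directly from the code in this diff.

    import org.thdl.tib.text.tshegbar.UnicodeUtils; // package assumed

    /** Hypothetical driver, not part of this commit. */
    public class UnicodeUtilsSketch {
        public static void main(String[] args) {
            // Prints the six-character ASCII escape for U+0F40: a backslash, a 'u',
            // and four lowercase hex digits (Integer.toHexString is lowercase).
            System.out.println(UnicodeUtils.unicodeCodepointToString('\u0F40'));

            // Codepoints below U+1000 are zero-padded to four hex digits, so a
            // tab (U+0009) also comes out six characters long.
            System.out.println(UnicodeUtils.unicodeCodepointToString('\t'));

            // One escape per char of the input string.
            System.out.println(UnicodeUtils.unicodeStringToString("\u0F40\u0F0B"));

            System.out.println(UnicodeUtils.isTibetanConsonant('\u0F40')); // true: ka
            System.out.println(UnicodeUtils.isTibetanConsonant('\u0F48')); // false: explicitly excluded
            System.out.println(UnicodeUtils.isTibetanConsonant('\u0F72')); // false: a vowel sign
        }
    }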