From a42347b2248d447cba47341cc91824426ce7f2e0 Mon Sep 17 00:00:00 2001
From: dchandler <dchandler>
Date: Sun, 15 Dec 2002 03:35:24 +0000
Subject: [PATCH] Now uses terminology from the Unicode standard.  No more talk
 of characters, for example.

Normalization forms NFKD and NFD are supported for the Tibetan Unicode
range.  I don't like either, actually.  I've tested NFKD, but I've not yet
committed the tests.
---
 .../thdl/tib/text/tshegbar/LegalTshegBar.java |  29 +--
 .../org/thdl/tib/text/tshegbar/TshegBar.java  |  51 +++--
 .../tshegbar/UnicodeCharToExtendedWylie.java  |  10 +-
 .../tib/text/tshegbar/UnicodeConstants.java   |  24 ++-
 .../tib/text/tshegbar/UnicodeReadyThunk.java  |  32 +--
 .../thdl/tib/text/tshegbar/UnicodeUtils.java  | 196 +++++++++++-------
 .../org/thdl/tib/text/tshegbar/package.html   |   4 +-
 7 files changed, 210 insertions(+), 136 deletions(-)
diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
index aabc790..ce782a4 100644
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@@ -103,7 +103,7 @@ And also there are cases where they combine. For ex you can have
  *  consonants and vowels.  In some situations, you should use {@link
  *  #EWSUB_wa_zur} to represent the consonant wa, while in others you
  *  should use {@link #EWC_wa}, even though you mean to subscribe a
- *  fixed-form wa.  Basically, stick to the characters for which
+ *  fixed-form wa.  Basically, stick to the codepoints for which
  *  enumerations exist in {@link
  *  org.thdl.tib.text.tshegbar.UnicodeConstants} and use your common
  *  sense.</p>
@@ -131,7 +131,7 @@ public class LegalTshegBar
     private boolean hasWaZur;
     /** true iff EW_wa_zur is under the root syllable. */
     private boolean hasAChung;
-    /** If this is a string, it is of a single character or is equal
+    /** If this is a string, it is of a single codepoint or is equal
      *  to {@link #getConnectiveCaseSuffix()} */
     private String suffix;
     /** EW_da, EW_sa, or EW_ABSENT */
@@ -237,7 +237,7 @@ public class LegalTshegBar
 
     /** Returns null if there is no suffix, or a string containing the
      *  one consonant or a string <code>"&#92;u0F60&#92;u0F72"</code>
-     *  containing two characters in the special case that the suffix
+     *  containing two codepoints in the special case that the suffix
      *  is that connective case marker {@link
      *  #getConnectiveCaseSuffix()}. */
     public String getSuffix() {
@@ -317,7 +317,7 @@ public class LegalTshegBar
     }
 
 
-    /** Returns a string of two characters, da and sa. */
+    /** Returns a string of two codepoints, da and sa. */
     public static String getPossiblePostsuffixes() {
         return new String(new char[] { EWC_da, EWC_sa });
     }
@@ -328,7 +328,7 @@ public class LegalTshegBar
             EWC_ra, EWC_la, EWC_sa
         });
 
-    /** Returns a string of ten characters, each of which can be a
+    /** Returns a string of ten codepoints, each of which can be a
      *  suffix in Tibetan. */
     public static String getPossibleSuffixes() {
         return possibleSuffixes;
@@ -345,7 +345,7 @@ public class LegalTshegBar
             EWC_achen, EWV_i
         });
 
-    /** Returns a two-character string consisting of the Unicode
+    /** Returns a two-codepoint string consisting of the Unicode
      *  representation of what Extended Wylie calls
      *  <code>'i</code>. */
     public static String getConnectiveCaseSuffix() {
@@ -594,9 +594,9 @@ public class LegalTshegBar
      *  @param rootLetter the mandatory root consonant
      *  @param subjoinedLetter the optional, subscribed consonant
      *  @param suffix the optional suffix, which is null, a String
-     *  consisting of a single consonant (i.e. a single character)
-     *  except in the special case that this is {@link
-     *  #getConnectiveCaseSuffix()}
+     *  consisting of a single consonant (i.e. a single,
+     *  nondecomposable codepoint) except in the special case that
+     *  this is {@link #getConnectiveCaseSuffix()}
      *  @param postsuffix the optional postsuffix, which should be
      *  EWC_sa or EWC_da
      *  @param vowel the optional vowel */
@@ -748,7 +748,7 @@ public class LegalTshegBar
                 if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
                     return internalThrowThing(throwIfIllegal,
                                               "Illegal suffix -- not one of the ten legal suffixes: "
-                                              + UnicodeUtils.unicodeCharToString(suffix.charAt(0)));
+                                              + UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
                 }
             }
         }
@@ -971,10 +971,11 @@ public class LegalTshegBar
 
 
     /** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk}
-        method to return {@link UnicodeUtils#toCanonicalForm(String)
-        canonically-formed Unicode}.
+        method to return {@link
+        UnicodeUtils#toMostlyDecomposedUnicode(String, byte)
+        NFKD-normalized Unicode}.
         @exception UnsupportedOperationException is never thrown */
-    public String getEquivalentUnicode() {
+    public String getUnicodeRepresentation() {
         StringBuffer sb = new StringBuffer();
         if (hasPrefix()) {
             ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
@@ -1017,7 +1018,7 @@ public class LegalTshegBar
 
     /** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk}
         method to return true. */
-    public boolean hasEquivalentUnicode() {
+    public boolean hasUnicodeRepresentation() {
         return true;
     }
 
diff --git a/source/org/thdl/tib/text/tshegbar/TshegBar.java b/source/org/thdl/tib/text/tshegbar/TshegBar.java
index 769a104..4eefed6 100644
--- a/source/org/thdl/tib/text/tshegbar/TshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/TshegBar.java
@@ -23,26 +23,37 @@ package org.thdl.tib.text.tshegbar;
  *
  *  <p> First, some terminology.</p>
  *
- *  <ul> <li>When we talk about a <i>glyph</i>, we mean a picture
- *  found in a font.  A single glyph may have one or more
- *  representations by sequences of Unicode characters, or it may not
- *  be representable becuase it is only part of one Unicode character
- *  or pictures a nonstandard character.</li> <li>When we talk about a
- *  <i>stack</i>, we mean either a number (or half-number), a mark or
- *  sign, a bit of punctuation, or a consonant stack.</li> <li>A
- *  <i>consonant stack</i> is or one or more consonants stacked
- *  vertically, plus an optional vocalic modification such as an
- *  anusvara (DLC what do we call a bindu?) or visarga, plus zero or
- *  more signs like <code>&#92;u0F35</code>, plus an optional a-chung
- *  (<code>&#92;u0F71</code>), plus an optional simple vowel.</li> <li>By
- *  <i>simple vowel</i>, we mean any of <code>&#92;u0F72</code>,
- *  <code>&#92;u0F74</code>, <code>&#92;u0F7A</code>, <code>&#92;u0F7B</code>,
+ *  <ul> <li>When we talk about a <i>grapheme cluster</i> (or
+ *  <i>grcl</i>), we mean what the Unicode standard calls a "grapheme
+ *  cluster".  Most glyphs (i.e., pictures) found in a font are
+ *  grapheme clusters, but the picture corresponding to the Unicode
+ *  codepoint <code>&#92;u0F74</code> is not a grapheme cluster.  In
+ *  addition, in English, many fonts have a single glyph (a
+ *  "ligature") for the combination of two grapheme clusters,
+ *  e.g. "fi".  A single grapheme cluster may have one or more
+ *  representations by sequences of Unicode codepoints, or it may not
+ *  be representable because it is only part of one Unicode codepoint
+ *  or pictures a nonstandard character.</li> <li>We will attempt to
+ *  avoid using the word "character", as it sometimes refers to a
+ *  codepoint and sometimes refers to a glyph in a font and yet other
+ *  times refers to a grapheme cluster.</li> <li>We'll try to avoid
+ *  using the word "stack" because it sometimes refers to a sequence
+ *  of stacked Tibetan consonants and sometimes refers to an entire
+ *  grapheme cluster.</li> <li>A <i>Tibetan stack</i> is one or
+ *  more consonants stacked vertically, plus an optional vocalic
+ *  modification such as an anusvara (DLC what do we call a bindu?) or
+ *  visarga, plus zero or more signs like <code>&#92;u0F35</code>,
+ *  plus an optional a-chung (<code>&#92;u0F71</code>), plus an
+ *  optional simple vowel.</li> <li>By <i>simple vowel</i>, we mean
+ *  any of <code>&#92;u0F72</code>, <code>&#92;u0F74</code>,
+ *  <code>&#92;u0F7A</code>, <code>&#92;u0F7B</code>,
  *  <code>&#92;u0F7C</code>, <code>&#92;u0F7D</code>, or
  *  <code>&#92;u0F80</code>.</li> </ul>
  *
- *  (Note: The string <code>"&#92;u0F68&#92;u0F7E&#92;u0F7C"</code> seems to equal
- *  <code>"&#92;u0F00"</code>, though the Unicode standard does not
- *  indicate that it is so.  This code treats it that way.)</p>
+ *  <p>(Note: The string <code>"&#92;u0F68&#92;u0F7E&#92;u0F7C"</code>
+ *  seems to equal <code>"&#92;u0F00"</code>, though the Unicode
+ *  standard does not indicate that it is so.  This code treats it
+ *  that way.)</p>
  *
  *  <p> This class allows for invalid tsheg bars, like those
  *  containing more than one prefix, more than two suffixes, an
@@ -55,10 +66,10 @@ package org.thdl.tib.text.tshegbar;
  *  and for invalid tsheg bars.  Note that correctness is at the tsheg
  *  bar level only; it may be grammatically incorrect to concatenate
  *  two valid tsheg bars.  Some subclasses can be represented in
- *  Unicode, but others contain nonstandard glyphs and cannot be.</p>
+ *  Unicode, but others contain nonstandard glyphs/characters and
+ *  cannot be.</p>
  *
- *  @author David Chandler
- */
+ *  @author David Chandler */
 public abstract class TshegBar implements UnicodeReadyThunk {
     /** Returns true, as we consider a transliteration in the Tibetan
      *  alphabet of a non-Tibetan language, say Chinese, as being
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java
index a454278..955ca59 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java
@@ -21,10 +21,10 @@ package org.thdl.tib.text.tshegbar;
 import org.thdl.tib.text.TibetanMachineWeb;
 
 /** This noninstantiable class allows for converting from Unicode
- *  characters (i.e., code points) to Extended Wylie.  It cannot be
- *  used for long stretches of text, though, as it is unaware of
- *  context, which is essential to understanding a non-trivial string
- *  of Tibetan Unicode.
+ *  codepoints to Extended Wylie.  It cannot be used for long
+ *  stretches of text, though, as it is unaware of context, which is
+ *  essential to understanding a non-trivial string of Tibetan
+ *  Unicode.
  *
  *  <p>See the document by Nathaniel Garson and David Germano entitled
  *  <i>Extended Wylie Transliteration Scheme</i>.  Note that there are
@@ -307,7 +307,7 @@ public class UnicodeCharToExtendedWylie {
         default: {
             // DLC handle space (EW's "_")
 
-            // This character is in the range 0FD0-0FFF or is not in
+            // This codepoint is in the range 0FD0-0FFF or is not in
             // the Tibetan range at all.  In either case, there is no
             // corresponding Extended Wylie.
             return null;
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java
index 195b5bc..611abcd 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java
@@ -19,7 +19,7 @@ Contributor(s): ______________________________________.
 package org.thdl.tib.text.tshegbar;
 
 /** Provides handy Extended Wylie-inspired names for Unicode
- *  characters commonly used to represent Tibetan.  The consonant that
+ *  codepoints commonly used to represent Tibetan.  The consonant that
  *  the Extended Wylie text "ka" refers to is named EWC_ka as in "The
  *  Extended Wylie Consonant ka", the vowel represented in Extended
  *  Wylie by "i" is EWV_i, and so on.  There is at least one exception
@@ -30,10 +30,26 @@ package org.thdl.tib.text.tshegbar;
  *  @author David Chandler */
 public interface UnicodeConstants {
 
-    /** for those times when you need a char to represent a non-existent character */
+    /** Refers to unnormalized Unicode: */
+    static final byte NORM_UNNORMALIZED = 0;
+    /** Refers to Normalization Form C: */
+    static final byte NORM_NFC = 1;
+    /** Refers to Normalization Form KC: */
+    static final byte NORM_NFKC = 2;
+    /** Refers to Normalization Form D: */
+    static final byte NORM_NFD = 3;
+    /** Refers to Normalization Form KD: */
+    static final byte NORM_NFKD = 4;
+
+
+    /** for those times when you need a char to represent a
+        non-existent codepoint */
     static final char EW_ABSENT = '\u0000';
 
+
+    //
     // the thirty consonants, in alphabetical order:
+    //
 
     /** first letter of the alphabet: */
     static final char EWC_ka = '\u0F40';
@@ -70,11 +86,13 @@ public interface UnicodeConstants {
     static final char EWC_ha = '\u0F67';
     static final char EWC_a = '\u0F68';
 
+
     /** In the word for father, "pA lags", there is an a-chung (i.e.,
         <code>\u0F71</code>).  This is the constant for that little
         guy. */
     static final char EW_achung = '\u0F71';
 
+
     /* Four of the five vowels, some say, or, others say, "the four
        vowels": */
     /** "gi gu", the 'i' sound in the English word keep: */
@@ -86,7 +104,7 @@ public interface UnicodeConstants {
     /** "na ro", the 'o' sound in the English word bone: */
     static final char EWV_o = '\u0F7C';
 
-    
+
     /** subscribed form of EWC_wa, also known as wa-btags */
     static final char EWSUB_wa_zur = '\u0FAD';
     /** subscribed form of EWC_ya */
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java b/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java
index e85a42d..7a454dd 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java
@@ -18,14 +18,14 @@ Contributor(s): ______________________________________.
 
 package org.thdl.tib.text.tshegbar;
 
-/** A UnicodeReadyThunk represents a string of characters.  While
- *  there are ways to turn a string of Unicode characters into a list
+/** A UnicodeReadyThunk represents a string of codepoints.  While
+ *  there are ways to turn a string of Unicode codepoints into a list
  *  of UnicodeReadyThunks (DLC reference it), you cannot
- *  necessarily recover the exact sequence of Unicode characters from
- *  a UnicodeReadyThunk.  For characters that are not Tibetan
- *  Unicode and are not one of a handful of other known characters,
+ *  necessarily recover the exact sequence of Unicode codepoints from
+ *  a UnicodeReadyThunk.  For codepoints that are not Tibetan
+ *  Unicode and are not one of a handful of other known codepoints,
  *  only the most primitive operations are available.  Generally in
- *  this case you can recover the exact string of Unicode characters,
+ *  this case you can recover the exact string of Unicode codepoints,
  *  but don't bank on it.
  *
  *  @author David Chandler
@@ -33,23 +33,25 @@ package org.thdl.tib.text.tshegbar;
 public interface UnicodeReadyThunk {
 
     /** Returns true iff this thunk is entirely Tibetan (regardless of
-        whether or not all characters come from the Tibetan range of
-        Unicode 3, i.e. <code>0x0F00</code>-<code>0x0FFF</code>). */
+        whether or not all codepoints come from the Tibetan range of
+        Unicode 3, i.e. <code>U+0F00</code>-<code>U+0FFF</code>, and
+        regardless of whether or not this thunk is syntactically legal
+        Tibetan). */
     public boolean isTibetan();
     
-    /** Returns a sequence of Unicode characters that is equivalent to
+    /** Returns a sequence of Unicode codepoints that is equivalent to
      *  this thunk if possible.  It is only possible if {@link
-     *  #hasEquivalentUnicode()} is true.  Unicode has more than one
+     *  #hasUnicodeRepresentation()} is true.  Unicode has more than one
      *  way to refer to the same language element, so this is just one
      *  method.  When more than one Unicode sequence exists, and when
      *  the thunk {@link #isTibetan() is Tibetan}, this method returns
      *  sequences that the Unicode 3.2 standard does not discourage.
      *  @exception UnsupportedOperationException if {@link
-     *  #hasEquivalentUnicode()} is false
-     *  @return a String of Unicode characters */
-    public String getEquivalentUnicode() throws UnsupportedOperationException;
+     *  #hasUnicodeRepresentation()} is false
+     *  @return a String of Unicode codepoints */
+    public String getUnicodeRepresentation() throws UnsupportedOperationException;
     
-    /** Returns true iff there exists a sequence of Unicode characters
+    /** Returns true iff there exists a sequence of Unicode codepoints
      *  that correctly represents this thunk.  This will not be the
      *  case if the thunk contains Tibetan characters for which the
      *  Unicode standard does not provide.  See the Extended Wylie
@@ -58,6 +60,6 @@ public interface UnicodeReadyThunk {
      *  standard section 9.13.  The presence of head marks or multiple
      *  vowels in the thunk would cause this to return false, for
      *  example.  */
-    public boolean hasEquivalentUnicode();
+    public boolean hasUnicodeRepresentation();
 }
 
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
index 3cd7d7b..f527438 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -19,15 +19,15 @@ Contributor(s): ______________________________________.
 package org.thdl.tib.text.tshegbar;
 
 /** <p>This non-instantiable class contains utility routines for
- *  dealing with Tibetan Unicode characters and strings of such
- *  characters.</p>
+ *  dealing with Tibetan Unicode codepoints and strings of such
+ *  codepoints.</p>
  *
  *  @author David Chandler */
-public class UnicodeUtils {
+public class UnicodeUtils implements UnicodeConstants {
     /** Do not use this, as this class is not instantiable. */
     private UnicodeUtils() { super(); }
 
-    /** Returns true iff x is a Unicode character that represents a
+    /** Returns true iff x is a Unicode codepoint that represents a
         consonant or two-consonant stack that has a Unicode code
         point.  Returns true only for the usual suspects (like
         <code>&#92;u0F40</code>) and for Sanskrit consonants (like
@@ -40,7 +40,7 @@ public class UnicodeUtils {
                 && (x >= '\u0F40' && x <= '\u0F6A'));
     }
 
-    /** Returns true iff x is a Unicode character that represents a
+    /** Returns true iff x is a Unicode codepoint that represents a
         subjoined consonant or subjoined two-consonant stack that has
         a Unicode code point.  Returns true only for the usual
         suspects (like <code>&#92;u0F90</code>) and for Sanskrit
@@ -61,7 +61,7 @@ public class UnicodeUtils {
         '&#92;u0F6A'.  The new consonants (for transcribing Chinese, I
         believe) "&#92;u0F55&#92;u0F39" (which EWTS calls "fa"),
         "&#92;u0F56&#92;u0F39" ("va"), and "&#92;u0F5F&#92;u0F39" ("Dza") are
-        two-character sequences, but you should be aware of them
+        two-codepoint sequences, but you should be aware of them
         also. */
     public static boolean isPreferredFormOfConsonant(char x) {
         return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
@@ -73,16 +73,16 @@ public class UnicodeUtils {
                 && (x != '\u0F5C'));
     }
 
-    /** Returns true iff unicodeChar is a character from the Unicode
+    /** Returns true iff unicodeCP is a codepoint from the Unicode
         range U+0F00-U+0FFF.
         @see #isEntirelyTibetanUnicode(String) */
-    public static boolean isInTibetanRange(char unicodeChar) {
-        return (unicodeChar >= '\u0F00' && unicodeChar <= '\u0FFF');
+    public static boolean isInTibetanRange(char unicodeCP) {
+        return (unicodeCP >= '\u0F00' && unicodeCP <= '\u0FFF');
     }
 
-    /** Returns true iff unicodeString consists only of characters
+    /** Returns true iff unicodeString consists only of codepoints
         from the Unicode range U+0F00-U+0FFF.  (Note that these
-        characters are typically not enough to represent a Tibetan
+        codepoints are typically not enough to represent a Tibetan
         text, you may need ZWSP (zero-width space) and various
         whitespace from other ranges.) */
     public static boolean isEntirelyTibetanUnicode(String unicodeString) {
@@ -93,21 +93,40 @@ public class UnicodeUtils {
         return true;
     }
 
-    /** Modifies tibetanUnicode so that it is equivalent, according to
-        the Unicode 3.2 standard, to the input buffer.  The Tibetan
-        passages of the returned string are in THDL-canonical form,
-        however.  This form uses a maximum of characters, in general,
-        and never uses characters whose use has been {@link
-        #isDiscouraged(char) discouraged}.  If the input contains
-        characters for which {@link #isInTibetanRange(char)} is not
-        true, then they will not be modified.
+    /** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
+        Unicode codepoints, into Normalization Form KD (NFKD) or D (NFD),
+        as selected by normForm and specified by Unicode 3.2.  Tibetan
+        passages end up in the chosen form, but codepoints outside of
+        <code>U+0F00</code>-<code>U+0FFF</code> are not necessarily
+        normalized.  NFKD uses a maximum of codepoints, and it
+        never uses codepoints whose use has been
+        {@link #isDiscouraged(char) discouraged}.  It would be David
+        Chandler's very favorite form if not for the fact that
+        <code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
+        NFD is thus David Chandler's favorite, though it does not
+        decompose <code>U+0F77</code> and <code>U+0F79</code> (for
+        some reason, hopefully a well-thought-out one).
+
+        <p>Recall that NFKD, as it applies to Tibetan codepoints, is
+        closed under string concatenation and under substringing.
+        Note again that if the input contains codepoints for which
+        {@link #isInTibetanRange(char)} is not true, then they will
+        not be modified.</p>
     
         <p>Note well that only well-formed input guarantees
-        well-formed output.</p> */
-    public static void toCanonicalForm(StringBuffer tibetanUnicode) {
+        well-formed output.</p>
+
+        @param tibetanUnicode the codepoints to be decomposed
+        @param normForm NORM_NFKD or NORM_NFD */
+    public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
+                                                 byte normForm)
+    {
+        if (normForm != NORM_NFD && normForm != NORM_NFKD)
+            throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work");
         int offset = 0;
         while (offset < tibetanUnicode.length()) {
-            String s = toCanonicalForm(tibetanUnicode.charAt(offset));
+            String s
+                = toNormalizedForm(tibetanUnicode.charAt(offset), normForm);
             if (null == s) {
                 ++offset;
             } else {
@@ -118,67 +137,88 @@ public class UnicodeUtils {
         }
     }
 
-    /** Like {@link #toCanonicalForm(StringBuffer)}, but does not
-        modify its input.  Instead, it returns the canonically-formed
-        version of tibetanUnicode. */
-    public static String toCanonicalForm(String tibetanUnicode) {
+    /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
+        but does not modify its input.  Instead, it returns the NFKD-
+        or NFD-normalized version of tibetanUnicode. */
+    public static String toMostlyDecomposedUnicode(String tibetanUnicode,
+                                                   byte normForm)
+    {
         StringBuffer sb = new StringBuffer(tibetanUnicode);
-        toCanonicalForm(sb);
+        toMostlyDecomposedUnicode(sb, normForm);
         return sb.toString();
     }
 
-    /** There are 19 characters in the Tibetan range of Unicode 3.2
-        which can be decomposed into longer strings of characters in
-        the Tibetan range of Unicode.  These 19 are said not to be in
-        THDL-canonical form.  This routine returns the canonical form
-        for such characters, and returns null for characters that are
-        already canonical or are not in the Tibetan range of Unicode.
-        @param tibetanUnicodeChar the character to canonicalize
-        @return null if tibetanUnicodeChar is canonical, or a string
-        of two or three characters otherwise */
-    public static String toCanonicalForm(char tibetanUnicodeChar) {
-        switch (tibetanUnicodeChar) {
-        case '\u0F43': return new String(new char[] { '\u0F42', '\u0FB7' });
-        case '\u0F4D': return new String(new char[] { '\u0F4C', '\u0FB7' });
-        case '\u0F52': return new String(new char[] { '\u0F51', '\u0FB7' });
-        case '\u0F57': return new String(new char[] { '\u0F56', '\u0FB7' });
-        case '\u0F5C': return new String(new char[] { '\u0F5B', '\u0FB7' });
-        case '\u0F69': return new String(new char[] { '\u0F40', '\u0FB5' });
-        case '\u0F73': return new String(new char[] { '\u0F71', '\u0F72' });
-        case '\u0F75': return new String(new char[] { '\u0F71', '\u0F74' });
-        case '\u0F76': return new String(new char[] { '\u0FB2', '\u0F80' });
-        case '\u0F77': return new String(new char[] { '\u0FB2', '\u0F71', '\u0F80' });
-        case '\u0F78': return new String(new char[] { '\u0FB3', '\u0F80' });
-        case '\u0F79': return new String(new char[] { '\u0FB3', '\u0F71', '\u0F80' });
-        case '\u0F81': return new String(new char[] { '\u0F71', '\u0F80' });
-        case '\u0F93': return new String(new char[] { '\u0F92', '\u0FB7' });
-        case '\u0F9D': return new String(new char[] { '\u0F9C', '\u0FB7' });
-        case '\u0FA2': return new String(new char[] { '\u0FA1', '\u0FB7' });
-        case '\u0FA7': return new String(new char[] { '\u0FA6', '\u0FB7' });
-        case '\u0FAC': return new String(new char[] { '\u0FAB', '\u0FB7' });
-        case '\u0FB9': return new String(new char[] { '\u0F90', '\u0FB5' });
+    /** There are 19 codepoints in the Tibetan range of Unicode 3.2
+        which can be decomposed into longer strings of codepoints in
+        the Tibetan range of Unicode.  Often one wants to manipulate
+        decomposed codepoint strings.  Also, HTML and XML are W3C
+        standards that require certain normalization forms.  This
+        routine returns a chosen normalized form for such codepoints,
+        and returns null for codepoints that are already normalized or
+        are not in the Tibetan range of Unicode.
+        @param tibetanUnicodeCP the codepoint to normalize
+        @param normalizationForm NORM_NFKD or NORM_NFD if you expect
+        something nontrivial to happen
+        @return null if tibetanUnicodeCP is already in the chosen
+        normalized form, or a string of one to three codepoints
+        otherwise */
+    public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) {
+        if (normalizationForm == NORM_NFKD
+            || normalizationForm == NORM_NFD) {
+            // Where not specified, the NFKD form is also the NFD form.
+            switch (tibetanUnicodeCP) {
+            case '\u0F0C': return ((normalizationForm == NORM_NFKD)
+                                   ? "\u0F0B" : null);
+            case '\u0F43': return "\u0F42\u0FB7";
+            case '\u0F4D': return "\u0F4C\u0FB7";
+            case '\u0F52': return "\u0F51\u0FB7";
+            case '\u0F57': return "\u0F56\u0FB7";
+            case '\u0F5C': return "\u0F5B\u0FB7";
+            case '\u0F69': return "\u0F40\u0FB5";
+            case '\u0F73': return "\u0F71\u0F72";
+            case '\u0F75': return "\u0F71\u0F74";
+            case '\u0F76': return "\u0FB2\u0F80";
+            // I do not understand why NFD does not decompose this codepoint:
+            case '\u0F77': return ((normalizationForm == NORM_NFKD)
+                                   ? "\u0FB2\u0F71\u0F80" : null);
+            case '\u0F78': return "\u0FB3\u0F80";
+            // I do not understand why NFD does not decompose this codepoint:
+            case '\u0F79': return ((normalizationForm == NORM_NFKD)
+                                   ? "\u0FB3\u0F71\u0F80" : null);
 
-        default:
-            return null;
+            case '\u0F81': return "\u0F71\u0F80";
+            case '\u0F93': return "\u0F92\u0FB7";
+            case '\u0F9D': return "\u0F9C\u0FB7";
+            case '\u0FA2': return "\u0FA1\u0FB7";
+            case '\u0FA7': return "\u0FA6\u0FB7";
+            case '\u0FAC': return "\u0FAB\u0FB7";
+            case '\u0FB9': return "\u0F90\u0FB5";
+
+            default:
+                return null;
+            }
         }
+        return null;
     }
 
-    /** Returns true iff tibetanUnicodeChar {@link
-        #isInTibetanRange(char)} and if the Unicode 3.2 standard
-        discourages the use of tibetanUnicodeChar. */
-    public static boolean isDiscouraged(char tibetanUnicodeChar) {
-        return ('\u0F73' == tibetanUnicodeChar
-                || '\u0F75' == tibetanUnicodeChar
-                || '\u0F77' == tibetanUnicodeChar
-                || '\u0F81' == tibetanUnicodeChar);
+    /** Returns true iff tibetanUnicodeCP {@link
+        #isInTibetanRange(char) is a Tibetan codepoint} and if the
+        Unicode 3.2 standard discourages the use of
+        tibetanUnicodeCP. */
+    public static boolean isDiscouraged(char tibetanUnicodeCP) {
+        return ('\u0F73' == tibetanUnicodeCP
+                || '\u0F75' == tibetanUnicodeCP
+                || '\u0F77' == tibetanUnicodeCP
+                || '\u0F79' == tibetanUnicodeCP
+                || '\u0F81' == tibetanUnicodeCP);
         /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */
     }
 
     /** Returns true iff ch corresponds to the Tibetan letter ra.
-        Several Unicode characters correspond to the Tibetan letter ra
+        Several Unicode codepoints correspond to the Tibetan letter ra
         (in its subscribed form or otherwise).  Oftentimes,
         <code>&#92;u0F62</code> is thought of as the nominal
-        representation.  Returns false for some characters that
+        representation.  Returns false for some codepoints that
         contain ra but are not merely ra, such as <code>&#92;u0F77</code> */
     public static boolean isRa(char ch) {
         return ('\u0F62' == ch
@@ -188,7 +228,7 @@ public class UnicodeUtils {
     }
 
     /** Returns true iff ch corresponds to the Tibetan letter wa.
-        Several Unicode characters correspond to the Tibetan letter
+        Several Unicode codepoints correspond to the Tibetan letter
         wa.  Oftentimes, <code>&#92;u0F5D</code> is thought of as the
         nominal representation. */
     public static boolean isWa(char ch) {
@@ -198,7 +238,7 @@ public class UnicodeUtils {
     }
 
     /** Returns true iff ch corresponds to the Tibetan letter ya.
-        Several Unicode characters correspond to the Tibetan letter
+        Several Unicode codepoints correspond to the Tibetan letter
         ya.  Oftentimes, <code>&#92;u0F61</code> is thought of as the
         nominal representation. */
     public static boolean isYa(char ch) {
@@ -207,14 +247,14 @@ public class UnicodeUtils {
                 || '\u0FBB' == ch);
     }
 
-    /** Returns true iff there exists at least one character ch in
-        unicodeString such that ch {@link #isRa(char) is ra} or contains
+    /** Returns true iff there exists at least one codepoint cp in
+        unicodeString such that cp {@link #isRa(char) is ra} or contains
         ra (like <code>&#92;u0F77</code>).  This method is not implemented
         as fast as it could be.  It calls on the canonicalization code
         in order to maximize reuse and minimize the possibility of
         coder error. */
     public static boolean containsRa(String unicodeString) {
-        String canonForm = toCanonicalForm(unicodeString);
+        String canonForm = toMostlyDecomposedUnicode(unicodeString, NORM_NFKD);
         for (int i = 0; i < canonForm.length(); i++) {
             if (isRa(canonForm.charAt(i)))
                 return true;
@@ -223,11 +263,13 @@ public class UnicodeUtils {
     }
     /** Inefficient shortcut.
         @see #containsRa(String) */
-    public static boolean containsRa(char unicodeChar) {
-        return containsRa(new String(new char[] { unicodeChar }));
+    public static boolean containsRa(char unicodeCP) {
+        return containsRa(new String(new char[] { unicodeCP }));
     }
 
-    public static String unicodeCharToString(char ch) {
+    /** Returns a human-readable, ASCII form of the Unicode codepoint
+        ch. */
+    public static String unicodeCPToString(char ch) {
         return "U+" + Integer.toHexString((int)ch);
     }
 }
diff --git a/source/org/thdl/tib/text/tshegbar/package.html b/source/org/thdl/tib/text/tshegbar/package.html
index 4de8dfa..7656c3e 100644
--- a/source/org/thdl/tib/text/tshegbar/package.html
+++ b/source/org/thdl/tib/text/tshegbar/package.html
@@ -21,9 +21,9 @@
   syllable.
 
 <p>
-  This package allows for turning a string of Unicode characters into
+  This package allows for turning a string of Unicode codepoints into
   our <i>TTBIR</i>, our Tibetan Tsheg Bar Internal Representation.
-  Said Unicode document may contain non-Tibetan characters also.
+  Said Unicode document may contain non-Tibetan codepoints also.
 </p>
 
 </body>