diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
new file mode 100644
index 0000000..e72a510
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@@ -0,0 +1,1027 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+import org.thdl.tib.text.TibetanMachineWeb;
+import org.thdl.util.ThdlDebug;
+
+/** <p>A LegalTshegBar is a simple Tibetan syllable or a syllable with
+ *  syntactically legal {@link #getPossibleSuffixParticles() suffix
+ *  particles}.  A legal tsheg-bar is not a transliteration of Chinese
+ *  or some other language.  It obeys the following properties:</p>
+ *
+ *  <ul>
+ *
+ *  <li>It contains at most one prefix, which must be one of {EWC_ga,
+ *  EWC_da, EWC_ba, EWC_ma, EWC_achen} and must be prefixable to the
+ *  root letter.</li>
+ *
+ *  <li>It contains no vocalic modifications</li>
+ *
+ *  <li>It may or may not contain an a-chung
+ *  (<code>\u0F71</code>)</li>
+ *
+ *  <li>It contains at most one vowel from the set {EWV_i, EWV_u,
+ *  EWV_e, EWV_o}, and that vowel is on the root stack.  The one
+ *  exception is that a 'i suffix is permitted (this is a connective
+ *  case marker).</li>
+ *
+ *  <li>It has at most one suffix, which is a single consonant or the
+ *  special connective case marker 'i (i.e.,
+ *  <code>"\u0F60\u0F72"</code>).</li>
+ *
+ *
+DLC FIXME: we must allow many suffixes.  See Andres' e-mail below:
+<pre>
+David,
+
+It is a particle that means "or" as opposed to "dang" that means and.
+
+"sgom pa'am" would mean "... or meditation"
+
+You can also have "'ang" which would be equivalent to "yang" (also)
+
+"sgom pa'ang" : even/also meditation.
+
+And also there are cases where they combine. For ex you can have
+
+"le'u'i'o". "le'u" means chapter. "le'u'i" means "of this chapter".
+'o would mark the end of the sentence.
+
+	Andres 
+</pre>
+ *
+ *
+ *  <li>It may contain a EWC_sa or EWC_da postsuffix only if there
+ *  exists a suffix, and that suffix is not the special connective
+ *  case marker 'i (i.e., <code>"\u0F60\u0F72"</code>).  (DLC FIXME:
+ *  'o and 'am maybe?  I asked in the "Embarrasing error in wylie
+ *  conversion" bug report.)</li>
+ *
+ *  <li>The root stack follows the rules of Tibetan syntax, meaning
+ *  that the following holds:
+ *
+ *    <ul>
+ *
+ *       <li>the ra-mgo, sa-mgo, la-mgo head letters appear only over
+ *       root consonants (root letters) that take them, if they
+ *       appear</li>
+ *
+ *       <li>the wa-zur, ra-btags, ya-btags, and la-btags subjoined
+ *       letters appear only under root consonants (root letters) that
+ *       take them</li>
+ *
+ *       <li>at most one subscribed letter, except for the special
+ *       case that ra-btags and wa-zur or ya-btags and wa-zur
+ *       sometimes appear together.</li>
+ *
+ *       <li>the root stack may contain at most one head letter</li>
+ *
+ *    </ul>
+ *
+ *  </li>
+ *
+ *  </ul>
+ *
+ *  <p>Note that this class uses only a subset of Unicode to represent
+ *  consonants and vowels.  In some situations, you should use {@link
+ *  #EWSUB_wa_zur} to represent the consonant wa, while in others
+ *  you should use {@link #EWC_wa}, even though you mean to subscribe
+ *  a fixed-form wa.  Basically, stick to the characters for which
+ *  enumerations exist in {@link UnicodeConstants} and use your
+ *  common sense.</p>
+ *
+ *  <p>For a pretty good, concise summary of the rules this class
+ *  knows about, see Joe B. Wilson's <i>Translating Buddhism from
+ *  Tibetan</i> from <a href="http://snowlionpubs.com/">Snow Lion
+ *  Publications</a>, Appendix 1, e.g. p. 548.</p>
+ *
+ *  @author David Chandler */
+public class LegalTshegBar
+    extends TshegBar
+    implements UnicodeConstants
+{
+    /** the prefixed consonant or EW_ABSENT */
+    private char prefix;
+    /** the consonant superscribed over the {@link #rootLetter} or
+     *  EW_ABSENT */
+    private char headLetter;
+    /** the root consonant, never EW_ABSENT */
+    private char rootLetter;
+    /** subscribed letter, or EW_ABSENT */
+    private char subjoinedLetter;
+    /** true iff EWSUB_wa_zur is under the root syllable. */
+    private boolean hasWaZur;
+    /** true iff the root letter carries an a-chung. */
+    private boolean hasAChung;
+    /** null if there is no suffix; otherwise a single character or
+     *  equal to {@link #getConnectiveCaseSuffix()} */
+    private String suffix;
+    /** EWC_da, EWC_sa, or EW_ABSENT */
+    private char postsuffix;
+    /** EWV_i, EWV_u, EWV_e, EWV_o, or EW_ABSENT */
+    private char vowel;
+
+    /** Do not use this constructor. */
+    private LegalTshegBar() { super(); }
+
+    // DLC FIXME: do we want to accept EWC_ra or EWSUB_ra_btags for
+    // the root letter, even if there is no head letter?  Etc.
+    /** Constructs a valid Tibetan syllable or throws an exception.
+     *  Use EW_ABSENT (or null in the case of <code>suffix</code>) for
+     *  those parts of the syllable that are absent.  The root letter
+     *  must not be absent.  To learn about the arguments, and to be
+     *  sure that your input won't cause an exception to be thrown,
+     *  see {@link
+     *  #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}.
+     *
+     *  @exception IllegalArgumentException if the rootLetter is not
+     *  one of the thirty consonants (and represented nominally, at
+     *  that), or if one of the other arguments is not valid, or if
+     *  postsuffix is present but suffix is absent, etc. */
+    public LegalTshegBar(char prefix, char headLetter, char rootLetter,
+                         char subjoinedLetter,
+                         boolean hasWaZur,
+                         boolean hasAChung,
+                         String suffix, char postsuffix, char vowel)
+        throws IllegalArgumentException
+    {
+        super();
+
+        throwIfNotLegalTshegBar(prefix, headLetter, rootLetter,
+                                subjoinedLetter, hasWaZur, hasAChung,
+                                suffix, postsuffix, vowel);
+
+        this.prefix = prefix;
+        this.headLetter = headLetter;
+        this.rootLetter = rootLetter;
+        this.subjoinedLetter = subjoinedLetter;
+
+        this.hasWaZur = hasWaZur;
+        this.hasAChung = hasAChung;
+
+        // A null suffix means "no suffix" and must stay null: copying
+        // it would throw a NullPointerException.  (The copy itself is
+        // only kept to ease translating this code to C++.)
+        this.suffix = (null == suffix) ? null : new String(suffix);
+
+        this.postsuffix = postsuffix;
+        this.vowel = vowel;
+    }
+
+    /** Convenience form of {@link
+     *  #LegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}
+     *  for the common case where the suffix is simply a consonant.
+     *  (DLC FIXME: handle hasWaZur and hasAChung.) */
+    public LegalTshegBar(char prefix, char headLetter, char rootLetter,
+                         char subjoinedLetter,
+                         boolean hasWaZur,
+                         boolean hasAChung,
+                         char suffix, char postsuffix, char vowel)
+        throws IllegalArgumentException
+    {
+        this(prefix, headLetter, rootLetter, subjoinedLetter,
+             hasWaZur, hasAChung, String.valueOf(suffix),
+             postsuffix, vowel);
+    }
+
+
+    /** Gives the consonant prefixed to the root stack; the result is
+     *  EW_ABSENT when this syllable has no prefix. */
+    public char getPrefix() {
+        return this.prefix;
+    }
+
+    /** Tells whether a prefixed consonant is present, i.e. whether
+     *  the prefix is something other than EW_ABSENT. */
+    public boolean hasPrefix() {
+        return (prefix != EW_ABSENT);
+    }
+
+    /** Gives the non-EWSUB_wa_zur consonant subscribed to the root consonant; the result is
+     *  EW_ABSENT when there is none.  To ask about a wa-zur, use {@link #hasWaZurSubjoinedToRootLetter()}. */
+    public char getSubjoinedLetter() {
+        return this.subjoinedLetter;
+    }
+
+    /** Tells whether {@link #getSubjoinedLetter()} is something other than
+     *  EW_ABSENT.  For a wa-zur, see {@link #hasWaZurSubjoinedToRootLetter()}. */
+    public boolean hasSubjoinedLetter() {
+        return (subjoinedLetter != EW_ABSENT);
+    }
+
+    /** Tells whether a wa-zur is subjoined to the root letter. */
+    public boolean hasWaZurSubjoinedToRootLetter() {
+        return this.hasWaZur;
+    }
+    /** Tells whether the root letter carries an a-chung. */
+    public boolean hasAChungOnRootLetter() {
+        return this.hasAChung;
+    }
+    /** Gives the suffix.  The result is null when there is no suffix;
+     *  otherwise it is either a one-character string holding the
+     *  suffixed consonant or the two-character string
+     *  <code>"\u0F60\u0F72"</code>, the connective case marker
+     *  returned by {@link #getConnectiveCaseSuffix()}. */
+    public String getSuffix() {
+        return this.suffix;
+    }
+
+    /** Tells whether a suffix is present, be it a suffixed consonant
+     *  or a suffixed <code>'i</code> (DLC FIXME). */
+    public boolean hasSuffix() {
+        return (suffix != null);
+    }
+
+    /** Tells whether the suffix is exactly one character long, i.e.
+        a single suffixed consonant rather than a particle such as
+        <code>'am</code>, <code>'i</code>, <code>'u</code>, or
+        <code>'o</code>.  A postsuffix may or may not be present. */
+    public boolean hasSimpleSuffix() {
+        return (suffix != null) && (suffix.length() == 1);
+    }
+
+    /** Gives the single suffixed consonant of a syllable that {@link
+        #hasSimpleSuffix() has a simple suffix}.
+        @exception Exception if {@link #hasSimpleSuffix()} is not true */
+    public char getSimpleSuffix() throws Exception {
+        if (hasSimpleSuffix())
+            return getSuffix().charAt(0);
+        throw new Exception("there isn't a simple suffix");
+    }
+
+    /** Gives the postsuffix (the secondary suffix), which is
+     *  EWC_da or EWC_sa; the result is EW_ABSENT when this
+     *  syllable has no postsuffix. */
+    public char getPostsuffix() {
+        return this.postsuffix;
+    }
+
+    /** Tells whether a secondary suffix (EWC_da or EWC_sa) is
+     *  present, i.e. whether the postsuffix is not EW_ABSENT. */
+    public boolean hasPostsuffix() {
+        return (postsuffix != EW_ABSENT);
+    }
+
+    /** Returns true iff this syllable has a <code>'i</code> suffix.
+     *  A syllable with no suffix (a null suffix) yields false. */
+    public boolean hasConnectiveCaseMarkerSuffix() {
+        return getConnectiveCaseSuffix().equals(getSuffix()); // receiver is never null
+    }
+
+    /** Gives the root consonant of this syllable. */
+    public char getRootLetter() {
+        return this.rootLetter;
+    }
+
+    /** Gives the head letter of the root stack; the result is
+     *  EW_ABSENT when the root stack has no head letter. */
+    public char getHeadLetter() {
+        return this.headLetter;
+    }
+    
+    /** Tells whether this syllable has a head letter. */
+    public boolean hasHeadLetter() {
+        return (headLetter != EW_ABSENT);
+    }
+
+    /** Gives the explicit vowel.  The result is EW_ABSENT when there
+     *  is no {@link #hasExplicitVowel() explicit vowel}, in which
+     *  case the syllable has the built-in "ah" sound. */
+    public char getVowel() {
+        // DLC assert this is one of { EWV_i, EWV_u, EWV_e, EWV_o }
+        return this.vowel;
+    }
+
+    /** Tells whether an explicit vowel is present.  The answer is false
+        iff the implicit, built-in "ah" sound is the only vowel. */
+    public boolean hasExplicitVowel() {
+        return (vowel != EW_ABSENT);
+    }
+
+
+    /** Builds a two-character string: EWC_da followed by EWC_sa. */
+    public static String getPossiblePostsuffixes() {
+        return String.valueOf(new char[] { EWC_da, EWC_sa });
+    }
+
+    // DLC unit test that each EWC is a nominal form of a consonant
+    //
+    // you could use either \u0F62 or \u0F6A, but we won't confuse
+    // this ra for a ra-mgo, so we use \u0F62, EWC_ra, not
+    // EWSUB_ra_btags.
+    private static final String possibleSuffixes
+        = String.valueOf(new char[] {
+            EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba,
+            EWC_ma, EWC_achen, EWC_ra, EWC_la, EWC_sa
+        });
+
+    /** Gives the ten consonants that can serve as a suffix in
+     *  Tibetan, as a single ten-character string.  The same shared
+     *  string is returned on every call. */
+    public static String getPossibleSuffixes() {
+        return possibleSuffixes;
+    }
+
+    private static final String connectiveCaseSuffix
+        = String.valueOf(new char[] { EWC_achen, EWV_i });
+
+    /** Gives the Unicode form of the connective case marker that
+     *  Extended Wylie writes as <code>'i</code>.  The result is a
+     *  two-character string, EWC_achen followed by EWV_i, and the
+     *  same shared string is returned on every
+     *  call. */
+    public static String getConnectiveCaseSuffix() {
+        return connectiveCaseSuffix;
+    }
+
+    private final static String thirtyConsonants
+        = new String(new char[] {
+            EWC_ka,  EWC_kha,  EWC_ga,     EWC_nga, // row one starts with ka, not a second ga
+            EWC_ca,  EWC_cha,  EWC_ja,     EWC_nya,
+            EWC_ta,  EWC_tha,  EWC_da,     EWC_na,
+            EWC_pa,  EWC_pha,  EWC_ba,     EWC_ma,
+            EWC_tsa, EWC_tsha, EWC_dza,    EWC_wa,
+            EWC_zha, EWC_za,   EWC_achen,  EWC_ya,
+            EWC_ra,  EWC_la,   EWC_sha,    EWC_sa,
+            EWC_ha,  EWC_a
+        });
+
+    /** Returns a String containing the nominal Unicode
+     *  representations of the thirty consonants.  The consonants are
+     *  in the usual order you find them in the 8 row by 4 column
+     *  table that students of the language memorize.
+     *  @see org.thdl.tib.text.tshegbar.UnicodeConstants */
+    public static String getTheThirtyConsonants() {
+        ThdlDebug.verify(thirtyConsonants.length() == 30); // DLC put this into a JUnit test to avoid the slow-down.
+        return thirtyConsonants;
+    }
+
+    /** Tells whether x is the preferred, nominal Unicode
+     *  representation of one of the thirty consonants. */
+    public static boolean isNominalRepresentationOfConsonant(char x) {
+        return (getTheThirtyConsonants().indexOf(x) >= 0);
+    }
+
+
+    /** Builds a fresh array holding the Unicode form of each legal
+        suffix particle.  In Extended Wylie, and in array order, they
+        are: <ul> <li>'i</li> <li>'o</li> <li>'u</li> <li>'am</li> </ul>
+
+        <p>This is not very efficient.</p> */
+    public static String[] getPossibleSuffixParticles() {
+        final String achenI = String.valueOf(new char[] { EWC_achen, EWV_i });
+        final String achenO = String.valueOf(new char[] { EWC_achen, EWV_o });
+        final String achenU = String.valueOf(new char[] { EWC_achen, EWV_u });
+        final String achenMa = String.valueOf(new char[] { EWC_achen, EWC_ma });
+
+        return new String[] { achenI, achenO, achenU, achenMa };
+    }
+
+
+    /** Builds a five-character String holding the nominal Unicode
+     *  representations of the five prefixes, in dictionary order.
+     *  @see org.thdl.tib.text.tshegbar.UnicodeConstants */
+    public static String getTheFivePrefixes() {
+        final char[] prefixes = {
+            EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achen
+        };
+        final String s = new String(prefixes);
+        ThdlDebug.verify(5 == s.length()); // DLC put this into a JUnit test to avoid the slow-down.
+        return s;
+    }
+
+    /** Tells whether x is the preferred, nominal Unicode
+     *  representation of one of the five prefixes. */
+    public static boolean isNominalRepresentationOfPrefix(char x) {
+        return (getTheFivePrefixes().indexOf(x) >= 0);
+    }
+
+    /** Builds a ten-character String holding the nominal Unicode
+     *  representations of the ten suffixes, in dictionary order.
+     *  @see #getConnectiveCaseSuffix()
+     *  @see org.thdl.tib.text.tshegbar.UnicodeConstants */
+    public static String getTheTenSuffixes() {
+        final char[] suffixes = {
+            EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba,
+            EWC_ma, EWC_achen, EWC_ra, EWC_la, EWC_sa
+        };
+        final String s = new String(suffixes);
+        ThdlDebug.verify(10 == s.length()); // DLC put this into a JUnit test to avoid the slow-down.
+        return s;
+    }
+
+    /** Tells whether x is the preferred, nominal Unicode
+     *  representation of one of the ten suffixes.
+     *  @see #getConnectiveCaseSuffix()
+     */
+    public static boolean isNominalRepresentationOfSimpleSuffix(char x) {
+        return (getTheTenSuffixes().indexOf(x) >= 0);
+    }
+
+
+    /** Tells whether the given (rootLetter, subjoinedLetter)
+        combination can accept an additional wa-zur.  With a letter
+        already subjoined, only g-r-w, d-r-w, and ph-y-w qualify according
+        to tibwn.ini. (DLC FIXME: are these all legal?  are any others?)
+
+        @param rootLetter the root consonant (in {@link
+        UnicodeUtils#isPreferredFormOfConsonant() preferred form} if
+        you expect true to be returned)
+        @param subjoinedLetter the letter subscribed to rootLetter, or
+        EW_ABSENT; it should not {@link UnicodeUtils#isWa(char) be wa}
+        if you expect true to be returned
+        @return true iff (rootLetter, subjoinedLetter, wa-zur) is a
+        legal stack. */
+    public static boolean takesWaZur(char rootLetter,
+                                     char subjoinedLetter) {
+        // DLC NOW use this test
+
+        // Nothing else is subjoined: the root letter alone decides.
+        if (EW_ABSENT == subjoinedLetter) {
+            return isConsonantThatTakesWaZur(rootLetter);
+        }
+        // g-r-w and d-r-w:
+        if (EWSUB_ra_btags == subjoinedLetter) {
+            return (EWC_ga == rootLetter || EWC_da == rootLetter);
+        }
+        // ph-y-w:
+        if (EWSUB_ya_btags == subjoinedLetter) {
+            return (EWC_pha == rootLetter);
+        }
+        return false;
+    }
+
+    /** Tells whether rootLetter is a consonant to which wa-zur can
+     *  be subjoined (perhaps in addition to another subjoined
+     *  ra-btags or ya-btags). */
+    public static boolean isConsonantThatTakesWaZur(char rootLetter) {
+        return (rootLetter == EWC_ka
+                || rootLetter == EWC_kha
+                || rootLetter == EWC_ga
+                || rootLetter == EWC_nya
+                || rootLetter == EWC_da
+                || rootLetter == EWC_tsa
+                || rootLetter == EWC_tsha
+                || rootLetter == EWC_zha
+                || rootLetter == EWC_za
+                || rootLetter == EWC_ra
+                || rootLetter == EWC_la
+                || rootLetter == EWC_sha
+                || rootLetter == EWC_pha /* ph-y-w is legal. */
+                || rootLetter == EWC_ha);
+    }
+
+    /** Tells whether rootLetter is one of the consonants to which
+     *  ya-btags can be subjoined. */
+    public static boolean isConsonantThatTakesYaBtags(char rootLetter) {
+        return (rootLetter == EWC_ka
+                || rootLetter == EWC_kha
+                || rootLetter == EWC_ga
+                || rootLetter == EWC_pa
+                || rootLetter == EWC_pha
+                || rootLetter == EWC_ba
+                || rootLetter == EWC_ma
+                || rootLetter == EWC_ha);
+    }
+
+    /** Tells whether rootLetter is one of the consonants to which
+     *  la-btags can be subjoined. */
+    public static boolean isConsonantThatTakesLaBtags(char rootLetter) {
+        return (rootLetter == EWC_ka
+                || rootLetter == EWC_ga
+                || rootLetter == EWC_ba
+                || rootLetter == EWC_ra
+                || rootLetter == EWC_sa
+
+                // this combination is pronounced as a
+                // prenasaling, low-tone <i>da</i> in my opinion:
+                || rootLetter == EWC_za);
+    }
+
+
+    /** Tells whether rootLetter is one of the consonants to which
+     *  ra-btags can be subjoined. */
+    public static boolean isConsonantThatTakesRaBtags(char rootLetter) {
+        return (rootLetter == EWC_ka
+                || rootLetter == EWC_kha
+                || rootLetter == EWC_ga
+                || rootLetter == EWC_ta
+                || rootLetter == EWC_tha
+                || rootLetter == EWC_da
+                || rootLetter == EWC_na
+                || rootLetter == EWC_pa
+                || rootLetter == EWC_pha
+                || rootLetter == EWC_ba
+                || rootLetter == EWC_ma
+                || rootLetter == EWC_sa
+                || rootLetter == EWC_ha);
+    }
+
+    /** Tells whether rootLetter is a consonant that takes a ra-mgo
+     *  (pronounced <i>rango</i> because ma is a prenasaling prefix)
+     *  head letter. */
+    public static boolean isConsonantThatTakesRaMgo(char rootLetter) {
+        return (rootLetter == EWC_ka
+                || rootLetter == EWC_ga
+                || rootLetter == EWC_nga
+                || rootLetter == EWC_ja
+                || rootLetter == EWC_nya
+                || rootLetter == EWC_ta
+                || rootLetter == EWC_da
+                || rootLetter == EWC_na
+                || rootLetter == EWC_ba
+                || rootLetter == EWC_ma
+                || rootLetter == EWC_tsa
+                || rootLetter == EWC_dza);
+    }
+
+    /** Tells whether rootLetter is a consonant that takes a la-mgo
+     *  (pronounced <i>lango</i> because ma is a prenasaling prefix)
+     *  head letter. */
+    public static boolean isConsonantThatTakesLaMgo(char rootLetter) {
+        return (rootLetter == EWC_ka
+                || rootLetter == EWC_ga
+                || rootLetter == EWC_nga
+                || rootLetter == EWC_ca
+                || rootLetter == EWC_ja
+                || rootLetter == EWC_ta
+                || rootLetter == EWC_da
+                || rootLetter == EWC_pa
+                || rootLetter == EWC_ba
+                || rootLetter == EWC_ha); // pronunciation exception, btw
+    }
+
+    /** Tells whether rootLetter is a consonant that takes a sa-mgo
+     *  (pronounced <i>sango</i> because ma is a prenasaling prefix)
+     *  head letter. */
+    public static boolean isConsonantThatTakesSaMgo(char rootLetter) {
+        return (rootLetter == EWC_ka
+                || rootLetter == EWC_ga
+                || rootLetter == EWC_nga
+                || rootLetter == EWC_nya
+                || rootLetter == EWC_ta
+                || rootLetter == EWC_da
+                || rootLetter == EWC_na
+                || rootLetter == EWC_pa
+                || rootLetter == EWC_ba
+                || rootLetter == EWC_ma
+                || rootLetter == EWC_tsa);
+    }
+
+    /** Tells whether the given arguments form a legal Tibetan
+     *  syllable, answering false instead of throwing for an illegal
+     *  combination.
+     *
+     *  @param prefix the optional, prefixed consonant
+     *  @param headLetter the optional superscribed consonant
+     *  @param rootLetter the mandatory root consonant
+     *  @param subjoinedLetter the optional, subscribed consonant
+     *  @param hasWaZur whether there is a wa-zur (DLC FIXME handle this)
+     *  @param hasAChung whether there is an a-chung (DLC FIXME handle this)
+     *  @param suffix the optional suffix, which is null, a String
+     *  consisting of a single consonant (i.e. a single character),
+     *  or, in the special case, {@link #getConnectiveCaseSuffix()}
+     *  @param postsuffix the optional postsuffix, which should be
+     *  EWC_sa or EWC_da
+     *  @param vowel the optional vowel
+     *  @return true iff the combination passes the legality test */
+    public static boolean formsLegalTshegBar(char prefix, char headLetter,
+                                             char rootLetter, char subjoinedLetter,
+                                             boolean hasWaZur, boolean hasAChung,
+                                             String suffix, char postsuffix, char vowel)
+    {
+        boolean legal;
+        try {
+            legal = internalLegalityTest(prefix, headLetter, rootLetter,
+                                         subjoinedLetter, hasWaZur, hasAChung,
+                                         suffix, postsuffix, vowel, false);
+        } catch (IllegalArgumentException e) {
+            throw new Error("This simply cannot happen, but it did.");
+        }
+        return legal;
+    }
+
+    /** Like {@link
+     *  #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)},
+     *  but for the common case where the suffix is simply a
+     *  consonant, passed here as a single character.  (DLC FIXME:
+     *  handle hasWaZur and hasAChung.) */
+    public static boolean formsLegalTshegBar(char prefix, char headLetter,
+                                             char rootLetter,
+                                             char subjoinedLetter,
+                                             boolean hasWaZur, boolean hasAChung,
+                                             char suffix, char postsuffix, char vowel)
+    {
+        // Wrap the lone consonant in a String, then defer to the
+        // String-suffix overload.
+        final String suffixString = String.valueOf(suffix);
+        return formsLegalTshegBar(prefix, headLetter, rootLetter,
+                                  subjoinedLetter, hasWaZur, hasAChung,
+                                  suffixString, postsuffix, vowel);
+    }
+
+
+    /** If you get through this gauntlet without having an exception
+     *  thrown, then this combination makes a legal Tibetan syllable.
+     *  @exception IllegalArgumentException if the syllable does not
+     *  follow the rules of a Tibetan syllable.  To learn about the
+     *  arguments, see {@link
+     *  #formsLegalTshegBar(char,char,char,char,String,char,char)}. */
+    private static void throwIfNotLegalTshegBar(char prefix,
+                                                char headLetter,
+                                                char rootLetter,
+                                                char subjoinedLetter,
+                                                boolean hasWaZur, // DLC FIXME handle this
+                                                boolean hasAChung, // DLC FIXME handle this
+                                                String suffix,
+                                                char postsuffix,
+                                                char vowel)
+        throws IllegalArgumentException
+    {
+        internalLegalityTest(prefix, headLetter, rootLetter,
+                             subjoinedLetter, hasWaZur, hasAChung,
+                             suffix, postsuffix, vowel, true);
+    }
+
+    /** Throws IllegalArgumentException(msg) if doThrow; otherwise returns false, so callers can return it. */
+    private static boolean internalThrowThing(boolean doThrow, String msg)
+    {
+        if (!doThrow)
+            return false;
+        throw new IllegalArgumentException(msg);
+    }
+
+    /** If you get through this gauntlet without having an exception
+     *  thrown, then this combination makes a legal Tibetan syllable.
+     *  To learn about the arguments, see {@link
+     *  #formsLegalTshegBar(char,char,char,char,String,char,char)}.
+     *  @return true if this syllable is legal, false if this syllable
+     *  is illegal and throwIfIllegal is false, does not return if
+     *  this syllable is illegal and throwIfIllegal is true
+     *  @exception IllegalArgumentException if the syllable does not
+     *  follow the rules of a Tibetan syllable and throwIfIllegal is
+     *  true */
+    private static boolean internalLegalityTest(char prefix,
+                                                char headLetter,
+                                                char rootLetter,
+                                                char subjoinedLetter,
+                                                boolean hasWaZur, // DLC FIXME handle this
+                                                boolean hasAChung, // DLC FIXME handle this
+                                                String suffix,
+                                                char postsuffix,
+                                                char vowel,
+                                                boolean throwIfIllegal)
+        throws IllegalArgumentException
+    {
+        if (!isNominalRepresentationOfConsonant(rootLetter))
+            return internalThrowThing(throwIfIllegal,
+                                      "The root letter must be one of the standard thirty Tibetan consonants, and must be represented nominally, not, for example, by FIXED-FORM RA (\\u0F6A)");
+
+        if (EW_ABSENT != prefix) {
+            // Ensure that this prefix is one of the five prefixes,
+            // and that it can go with this root letter:
+            if (!isNominalRepresentationOfPrefix(prefix))
+                return internalThrowThing(throwIfIllegal,
+                                          "The prefix is not absent, so it must be one of the five possible prefixes.");
+            // DLC test that it can go with the root letter.
+        }
+
+        if (EW_ABSENT != subjoinedLetter) {
+            if (EWSUB_ya_btags == subjoinedLetter) {
+                if (!isConsonantThatTakesYaBtags(rootLetter)) {
+                    return internalThrowThing(throwIfIllegal,
+                                              "Cannot subscribe ya-btags to that root letter.");
+                }
+            } else if (EWSUB_ra_btags == subjoinedLetter) {
+                if (!isConsonantThatTakesRaBtags(rootLetter)) {
+                    return internalThrowThing(throwIfIllegal,
+                                              "Cannot subscribe ra-btags to that root letter.");
+                }
+            } else if (EWSUB_la_btags == subjoinedLetter) {
+                if (!isConsonantThatTakesLaBtags(rootLetter)) {
+                    return internalThrowThing(throwIfIllegal,
+                                              "Cannot subscribe la-btags to that root letter.");
+                }
+            } else if (EWSUB_wa_zur == subjoinedLetter) {
+                throw new Error("DLC FIXME: can this happen?  wa-zur comes in via the boolean argument hasWaZur, not via subjoinedLetter.");
+            } else {
+                // check for a common mistake:
+                if ('\u0FBA' == subjoinedLetter
+                    || '\u0FBB' == subjoinedLetter
+                    || '\u0FBC' == subjoinedLetter)
+                    {
+                        return internalThrowThing(throwIfIllegal,
+                                                  "The subjoined letter given is subjoinable, but you gave the fixed-form variant, which is not used in Tibetan syllables but is sometimes used in Tibetan transliteration of Sanskrit, Chinese, or some non-Tibetan language.");
+                    }
+                return internalThrowThing(throwIfIllegal,
+                                          "The subjoined letter given is not one of the four consonants that may be subscribed.");
+            }
+        } // subjoinedLetter tests
+
+        // Suffix tests:
+        // DLC NOW -- allow 'o, 'u, 'am, etc.
+        if (null != suffix) {
+            if (!getConnectiveCaseSuffix().equals(suffix)) {
+                if (suffix.length() != 1) {
+                    return internalThrowThing(throwIfIllegal,
+                                              "Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am.");
+                }
+                if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
+                    return internalThrowThing(throwIfIllegal,
+                                              "Illegal suffix -- not one of the ten legal suffixes: "
+                                              + UnicodeUtils.unicodeCharToString(suffix.charAt(0)));
+                }
+            }
+        }
+        if (EW_ABSENT != postsuffix) {
+            if (null == suffix)
+                return internalThrowThing(throwIfIllegal,
+                                          "You cannot have a postsuffix unless you also have a suffix.");
+        }
+
+        if (EW_ABSENT != headLetter) {
+            if (EWC_ra == headLetter) {
+                if (!isConsonantThatTakesRaMgo(rootLetter)) {
+                    return internalThrowThing(throwIfIllegal,
+                                              "The head letter ra cannot be used with that root letter.");
+                }
+            } else if (EWC_la == headLetter) {
+                if (!isConsonantThatTakesLaMgo(rootLetter)) {
+                    return internalThrowThing(throwIfIllegal,
+                                              "The head letter la cannot be used with that root letter.");
+                }
+            } else if (EWC_sa == headLetter) {
+                if (!isConsonantThatTakesSaMgo(rootLetter)) {
+                    // handle a common error specially:
+                    if (EWC_la == rootLetter)
+                        return internalThrowThing(throwIfIllegal,
+                                                  "sa cannot be a head letter atop the root letter la.  You probably meant to have sa the root letter and la the subjoined letter.");
+
+                    return internalThrowThing(throwIfIllegal,
+                                              "The head letter sa cannot be used with that root letter.");
+                }
+            } else {
+                // '\u0F6A' is not a valid head letter, even for
+                // "rnya".  Use EWC_ra instead.
+                return internalThrowThing(throwIfIllegal,
+                                          "The head letter given is not valid.");
+            }
+        } // headLetter tests
+
+        // Now see if the vowel is valid:
+        if (EW_ABSENT /* built-in "ah" sound */ != vowel) {
+            if (EWV_i != vowel
+                && EWV_u != vowel
+                && EWV_e != vowel
+                && EWV_o != vowel)
+                {
+                    if (EWC_achen == vowel)
+                        return internalThrowThing(throwIfIllegal,
+                                                  "The vowel given is not valid.  Use EW_ABSENT for the EWC_achen sound.");
+                    if ('\u0F71' == vowel)
+                        return internalThrowThing(throwIfIllegal,
+                                                  "a-chung cannot be used in a simple Tibetan syllable.");
+                    return internalThrowThing(throwIfIllegal,
+                                              "The vowel given is not valid.");
+                }
+        }
+
+        // Phew.  We got here, so this combination of inputs is valid.
+        return true;
+    }
+
+
+    /*
+      DLC add a method giving the correct connective case thingy or
+      throwing error if the 'i suffix already appears.
+
+      DLC put in a method that gets pronunciation using Unicode
+      diacritical marks.  And another using just US Roman.  Note that
+      pronunciation is contextual, so have these methods return all
+      valid pronunciations, such as both "pa" and "wa" for EWC_ba.
+
+      DLC would be nice in the appropriate class: boolean
+      isTransliteratedSanskrit(), boolean isTransliteratedChinese()
+      (design: contains fa or va, maybe?). */
+
+    /** Returns a StringBuffer that holds the extended wylie
+     *  representation of this syllable. */
+    public StringBuffer getExtendedWylie() {
+        StringBuffer sb = new StringBuffer();
+        char rootLetter = getRootLetter();
+        if (hasPrefix()) {
+            // if there is a prefix but no head letter and (prefix,
+            // rootLetter) is ambiguous, i.e. if it could be mistaken
+            // for a legal (rootLetter, subjoinedLetter) combination,
+            // then put out prefix,disambiguator.  else just put out
+            // prefix.
+
+            boolean disambiguatorNeeded = false;
+            char prefix = getPrefix();
+            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix));
+            if (!hasHeadLetter()) {
+                if (EWC_ya == rootLetter) {
+                    if (isConsonantThatTakesYaBtags(prefix))
+                        disambiguatorNeeded = true;
+                } else if (EWC_ra == rootLetter) {
+                    if (isConsonantThatTakesRaBtags(prefix))
+                        disambiguatorNeeded = true;
+                } else if (EWC_la == rootLetter) {
+                    if (isConsonantThatTakesLaBtags(prefix))
+                        disambiguatorNeeded = true;
+                } else if (EWC_wa == rootLetter) {
+                    if (isConsonantThatTakesWaZur(prefix))
+                        disambiguatorNeeded = true;
+                }
+            }
+            if (disambiguatorNeeded)
+                sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
+        }
+        if (hasHeadLetter())
+            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter()));
+        sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter));
+        if (hasSubjoinedLetter())
+            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter()));
+        if (hasWaZurSubjoinedToRootLetter())
+            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur));
+
+        // a-chung is treated, in Extended Wylie, like a vowel.  I.e.,
+        // you don't have 'pAa', you have 'pA'.
+        if (hasAChungOnRootLetter()) {
+            if (hasExplicitVowel()) {
+                if (EWV_i == getVowel()) {
+                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73'));
+                } else if (EWV_u == getVowel()) {
+                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75'));
+                } else if (EWV_e == getVowel() || EWV_o == getVowel()) {
+                    // The exception to the rule for a-chung and vowels...
+
+                    // DLC FIXME: are these allowed in legal Tibetan?
+                    // EWTS would have special cases for them if so,
+                    // I'd wager...
+                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
+                    sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
+                } else {
+                    ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
+                }
+            } else {
+                sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
+            }
+        } else {
+            if (hasExplicitVowel())
+                sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
+            else
+                sb.append("a");
+        }
+
+        if (hasSuffix()) {
+            String suf = getSuffix();
+            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0)));
+            if (suf.length() > 1) {
+                // DLC assert, don't verify, that the length is two.
+                // This could change if I learn of more suffix
+                // particles.
+                ThdlDebug.verify(2 == suf.length());
+                sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1)));
+            }
+        }
+        if (hasPostsuffix())
+            sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix()));
+        return sb;
+    }
+
+
+    // DLC: toXML for the dense XML
+    /** Returns a <legalTibetanSyllable> element that contains only
+     *  the Extended Wylie transliteration for the whole syllable and a note that the . */
+    public String toConciseXML() {
+        // DLC version-control the EWTS document. 0.5 is used below:
+        return ("<legalTibetanSyllable "
+                + "transliterationType=\"THDL Extended Wylie 0.5\" "
+                + "transliteration=\"" + getExtendedWylie() + "\"" + "/>");
+    }
+
+    /** Returns a <legalTibetanSyllable> element that contains the
+     *  syllable broken down into its constituent vowel and consonants,
+     *  one attribute each, every attribute followed by a single space. */
+    public String toVerboseXML() {
+        // DLC version-control the EWTS document. 0.5 is used below:
+        return ("<legalTibetanSyllable "
+                + "transliterationType=\"THDL Extended Wylie 0.5\" "
+                + (hasPrefix()
+                   ? ("prefix=\""
+                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPrefix()) + "\" ")
+                   : "")
+                + (hasHeadLetter()
+                   ? ("headLetter=\""
+                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())
+                      + "\" ")
+                   : "")
+                + ("rootLetter=\""
+                   + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getRootLetter()) + "\" ")
+                + (hasSubjoinedLetter()
+                   ? ("subjoinedLetter=\""
+                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())
+                      + "\" ")
+                   : "")
+                + (hasWaZurSubjoinedToRootLetter()
+                   ? "hasWaZurSubjoinedToRootLetter=\"true\" "
+                   : "")
+                + (hasAChungOnRootLetter()
+                   ? "hasAChungOnRootLetter=\"true\" "
+                   : "")
+                // (the trailing spaces above keep the next attribute from being glued on)
+                // DLC NOW: what about the root letter a, i.e. \u0F68 ?  do we want the EWTS to be 'aa' ?
+                + ("vowel=\""
+                   + (hasExplicitVowel()
+                      ? UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())
+                      : "a")
+                   + "\" ")
+                + (hasSuffix()
+                   ? ("suffix=\""
+                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeString(getSuffix())
+                      + "\" ")
+                   : "")
+                + (hasPostsuffix()
+                   ? ("postsuffix=\""
+                      + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())
+                      + "\" ")
+                   : "")
+                + "/>");
+    }
+
+
+    /** Overrides {@link org.thdl.tib.text.tshegbar#UnicodeReadyThunk
+        method to return {@link UnicodeUtils#toCanonicalForm(String)
+        canonically-formed Unicode}.
+        @exception UnsupportedOperationException is never thrown */
+    public String getEquivalentUnicode() {
+        StringBuffer sb = new StringBuffer();
+        if (hasPrefix()) {
+            ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
+            sb.append(getPrefix());
+        }
+        if (hasHeadLetter()) {
+            // DLC FIXME this crap won't be true...
+            ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
+            ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getRootLetter()));
+            sb.append(getHeadLetter());
+        } else {
+            ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getRootLetter()));
+        }
+        sb.append(getRootLetter());
+        if (hasSubjoinedLetter()) {
+            ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getSubjoinedLetter()));
+            sb.append(getSubjoinedLetter());
+        }
+        if (hasWaZurSubjoinedToRootLetter()) {
+            ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(EWSUB_wa_zur));
+            sb.append(EWSUB_wa_zur);
+        }
+        if (hasAChungOnRootLetter()) {
+            ThdlDebug.verify('\u0F71' == EW_achung);
+            sb.append(EW_achung);
+        }
+        if (hasExplicitVowel()) {
+            sb.append(getVowel());
+        }
+        if (hasSuffix()) {
+            ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getSuffix().charAt(0)));
+            sb.append(getSuffix());
+        }
+        if (hasPostsuffix()) {
+            ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPostsuffix()));
+            sb.append(getPostsuffix());
+        }
+        return sb.toString();
+    }
+
+    /** Overrides the {@link UnicodeReadyThunk} method to return true
+        unconditionally. */
+    public boolean hasEquivalentUnicode() {
+        return true;
+    }
+
+
+    /** Returns the descriptive XML element built by {@link #toConciseXML()}. */
+    public String toString() {
+        return toConciseXML();
+    }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/TshegBar.java b/source/org/thdl/tib/text/tshegbar/TshegBar.java
new file mode 100644
index 0000000..5a560f6
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/TshegBar.java
@@ -0,0 +1,68 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** A TshegBar (pronounced <i>tsek bar</i>) is roughly a Tibetan
+ *  syllable.  In truth, it is the stuff between two <i>tsek</i>s.
+ *
+ *  <p> First, some terminology.</p>
+ *
+ *  <ul> <li>When we talk about a <i>glyph</i>, we mean a picture
+ *  found in a font.  A single glyph may have one or more
+ *  representations by sequences of Unicode characters, or it may not
+ *  be representable because it is only part of one Unicode character
+ *  or pictures a nonstandard character.</li> <li>When we talk about a
+ *  <i>stack</i>, we mean either a number (or half-number), a mark or
+ *  sign, a bit of punctuation, or a consonant stack.</li> <li>A
+ *  <i>consonant stack</i> is one or more consonants stacked
+ *  vertically, plus an optional vocalic modification such as an
+ *  anusvara (DLC what do we call a bindu?) or visarga, plus zero or
+ *  more signs like <code>\u0F35</code>, plus an optional a-chung
+ *  (<code>\u0F71</code>), plus an optional simple vowel.</li> <li>By
+ *  <i>simple vowel</i>, we mean any of <code>\u0F72</code>,
+ *  <code>\u0F74</code>, <code>\u0F7A</code>, <code>\u0F7B</code>,
+ *  <code>\u0F7C</code>, <code>\u0F7D</code>, or
+ *  <code>\u0F80</code>.</li> </ul>
+ *
+ *  <p>(Note: The string <code>"\u0F68\u0F7E\u0F7C"</code> seems to equal
+ *  <code>"\u0F00"</code>, though the Unicode standard does not
+ *  indicate that it is so.  This code treats it that way.)</p>
+ *
+ *  <p> This class allows for invalid tsheg bars, like those
+ *  containing more than one prefix, more than two suffixes, an
+ *  invalid postsuffix (secondary suffix), more than one consonant
+ *  stack (excluding the special case of what we call in Extended
+ *  Wylie "'i", which is technically a consonant stack but is used in
+ *  Tibetan like a suffix).</p>
+ *
+ *  <p>Subclasses exist for valid, grammatically correct tsheg bars,
+ *  and for invalid tsheg bars.  Note that correctness is at the tsheg
+ *  bar level only; it may be grammatically incorrect to concatenate
+ *  two valid tsheg bars.  Some subclasses can be represented in
+ *  Unicode, but others contain nonstandard glyphs and cannot be.</p>
+ *
+ *  @author David Chandler
+ */
+public abstract class TshegBar implements UnicodeReadyThunk {
+    /** Returns true, as we consider a transliteration in the Tibetan
+     *  alphabet of a non-Tibetan language, say Chinese, as being
+     *  Tibetan.
+     *  @return true */
+    public boolean isTibetan() { return true; }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java
new file mode 100644
index 0000000..bac731c
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java
@@ -0,0 +1,317 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+import org.thdl.tib.text.TibetanMachineWeb;
+
+/** This noninstantiable class allows for converting from Unicode
+ *  characters (i.e., code points) to Extended Wylie.  It cannot be
+ *  used for long stretches of text, though, as it is unaware of
+ *  context, which is essential to understanding a non-trivial string
+ *  of Tibetan Unicode.
+ *
+ *  <p>See the document by Nathaniel Garson and David Germano entitled
+ *  <i>Extended Wylie Transliteration Scheme</i>.  Note that there are
+ *  a couple of issues with the November 18, 2001 revision of that
+ *  document; these issues are in the Bugs tracker at
+ *  <a href="http://sourceforge.net/projects/thdltools">http://sourceforge.net/projects/thdltools</a>.</p>
+ *
+ *  @author David Chandler */
+public class UnicodeCharToExtendedWylie {
+
+    /** Returns the extended Wylie for the very simple sequence x.
+     *  Returns null iff some (Unicode) char in x has no extended
+     *  Wylie representation.  This is unaware of context, so use it
+     *  sparingly. */
+    public static StringBuffer getExtendedWylieForUnicodeString(String x) {
+        StringBuffer sb = new StringBuffer();
+        for (int i = 0; i < x.length(); i++) {
+            String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
+            if (null == ew)
+                return null;
+            sb.append(ew);
+        }
+        return sb;
+    }
+
+    /** Returns the extended Wylie for x, or null if there is none.
+     *  Understand that multiple Unicode code points (chars) map to
+     *  the same Extended Wylie representation.  Understand also that
+     *  the scrap of Extended Wylie returned is only valid in certain
+     *  contexts.  For example, not all consonants take ra-btags.  DLC NOW what about canonicalization? */
+    public static String getExtendedWylieForUnicodeChar(char x) {
+        switch (x) {
+
+        case '\u0F00': return "oM";
+        case '\u0F01': return null;
+        case '\u0F02': return null;
+        case '\u0F03': return null;
+        case '\u0F04': return "@";
+        case '\u0F05': return "#";
+        case '\u0F06': return "$";
+        case '\u0F07': return "%";
+        case '\u0F08': return "!";
+        case '\u0F09': return null;
+        case '\u0F0A': return null;
+        case '\u0F0B': return " ";
+        case '\u0F0C': return "*"; // DLC NOW: Jskad does not support this!
+        case '\u0F0D': return "/";
+        case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D"
+        case '\u0F0F': return ";";
+
+        case '\u0F10': return "[";
+        case '\u0F11': return "|";
+        case '\u0F12': return "]";
+        case '\u0F13': return "`";
+        case '\u0F14': return ":";
+        case '\u0F15': return null;
+        case '\u0F16': return null;
+        case '\u0F17': return null;
+        case '\u0F18': return null;
+        case '\u0F19': return null;
+        case '\u0F1A': return null;
+        case '\u0F1B': return null;
+        case '\u0F1C': return null;
+        case '\u0F1D': return null;
+        case '\u0F1E': return null;
+        case '\u0F1F': return null;
+
+        case '\u0F20': return "0";
+        case '\u0F21': return "1";
+        case '\u0F22': return "2";
+        case '\u0F23': return "3";
+        case '\u0F24': return "4";
+        case '\u0F25': return "5";
+        case '\u0F26': return "6";
+        case '\u0F27': return "7";
+        case '\u0F28': return "8";
+        case '\u0F29': return "9";
+        case '\u0F2A': return null;
+        case '\u0F2B': return null;
+        case '\u0F2C': return null;
+        case '\u0F2D': return null;
+        case '\u0F2E': return null;
+        case '\u0F2F': return null;
+
+        case '\u0F30': return null;
+        case '\u0F31': return null;
+        case '\u0F32': return null;
+        case '\u0F33': return null;
+        case '\u0F34': return "=";
+        case '\u0F35': return null;
+        case '\u0F36': return null;
+        case '\u0F37': return null;
+        case '\u0F38': return null;
+        case '\u0F39': return null;
+        case '\u0F3A': return "<";
+        case '\u0F3B': return ">";
+        case '\u0F3C': return "(";
+        case '\u0F3D': return ")";
+        case '\u0F3E': return "{";
+        case '\u0F3F': return "}";
+
+        case '\u0F40': return "k";
+        case '\u0F41': return "kh";
+        case '\u0F42': return "g";
+        case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F44': return "ng";
+        case '\u0F45': return "c";
+        case '\u0F46': return "ch";
+        case '\u0F47': return "j";
+        case '\u0F48': return null;
+        case '\u0F49': return "ny";
+        case '\u0F4A': return "T";
+        case '\u0F4B': return "Th";
+        case '\u0F4C': return "D";
+        case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F4E': return "N";
+        case '\u0F4F': return "t";
+
+        case '\u0F50': return "th";
+        case '\u0F51': return "d";
+        case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F53': return "n";
+        case '\u0F54': return "p";
+        case '\u0F55': return "ph";
+        case '\u0F56': return "b";
+        case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F58': return "m";
+        case '\u0F59': return "ts";
+        case '\u0F5A': return "tsh";
+        case '\u0F5B': return "dz";
+        case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F5D': return "w";
+        case '\u0F5E': return "zh";
+        case '\u0F5F': return "z";
+
+        case '\u0F60': return "'";
+        case '\u0F61': return "y";
+        case '\u0F62': return "r";
+        case '\u0F63': return "l";
+        case '\u0F64': return "sh";
+        case '\u0F65': return "Sh";
+        case '\u0F66': return "s";
+        case '\u0F67': return "h";
+        case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
+        case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB5'));
+        case '\u0F6A': return "r";
+        case '\u0F6B': return null;
+        case '\u0F6C': return null;
+        case '\u0F6D': return null;
+        case '\u0F6E': return null;
+        case '\u0F6F': return null;
+
+        case '\u0F70': return null;
+        case '\u0F71': return "A";
+        case '\u0F72': return "i";
+        case '\u0F73': return "I";
+        case '\u0F74': return "u";
+        case '\u0F75': return "U";
+        case '\u0F76': return "r-i"; // DLC Ri or r-i?  I put in a bug report.
+        case '\u0F77': return "r-I"; // DLC or RI?
+        case '\u0F78': return "l-i";
+        case '\u0F79': return "l-I";
+        case '\u0F7A': return "e";
+        case '\u0F7B': return "ai";
+        case '\u0F7C': return "o";
+        case '\u0F7D': return "au";
+        case '\u0F7E': return "M";
+        case '\u0F7F': return "H";
+
+        case '\u0F80': return "-i";
+        case '\u0F81': return "-I";
+        case '\u0F82': return "~^";// DLC unsupported in Jskad
+        case '\u0F83': return "~"; // DLC unsupported in Jskad
+        case '\u0F84': return "?";
+        case '\u0F85': return "&";
+        case '\u0F86': return null;
+        case '\u0F87': return null;
+        case '\u0F88': return null;
+        case '\u0F89': return null;
+        case '\u0F8A': return null;
+        case '\u0F8B': return null;
+        case '\u0F8C': return null;
+        case '\u0F8D': return null;
+        case '\u0F8E': return null;
+        case '\u0F8F': return null;
+
+        case '\u0F90': return "k";
+        case '\u0F91': return "kh";
+        case '\u0F92': return "g";
+        case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F94': return "ng";
+        case '\u0F95': return "c";
+        case '\u0F96': return "ch";
+        case '\u0F97': return "j";
+        case '\u0F98': return null;
+        case '\u0F99': return "ny";
+        case '\u0F9A': return "T";
+        case '\u0F9B': return "Th";
+        case '\u0F9C': return "D";
+        case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F9C')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F9E': return "N";
+        case '\u0F9F': return "t";
+
+        case '\u0FA0': return "th";
+        case '\u0FA1': return "d";
+        case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0FA3': return "n";
+        case '\u0FA4': return "p";
+        case '\u0FA5': return "ph";
+        case '\u0FA6': return "b";
+        case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0FA8': return "m";
+        case '\u0FA9': return "ts";
+        case '\u0FAA': return "tsh";
+        case '\u0FAB': return "dz";
+        case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0FAD': return "w";
+        case '\u0FAE': return "zh";
+        case '\u0FAF': return "z";
+
+        case '\u0FB0': return "'";
+        case '\u0FB1': return "y";
+        case '\u0FB2': return "r";
+        case '\u0FB3': return "l";
+        case '\u0FB4': return "sh";
+        case '\u0FB5': return "Sh";
+        case '\u0FB6': return "s";
+        case '\u0FB7': return "h";
+        case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
+        case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
+                               + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB5'));
+        case '\u0FBA': return "w";
+        case '\u0FBB': return "y";
+        case '\u0FBC': return "r";
+        case '\u0FBD': return null;
+        case '\u0FBE': return null;
+        case '\u0FBF': return null;
+
+        case '\u0FC0': return null;
+        case '\u0FC1': return null;
+        case '\u0FC2': return null;
+        case '\u0FC3': return null;
+        case '\u0FC4': return null;
+        case '\u0FC5': return null;
+        case '\u0FC6': return null;
+        case '\u0FC7': return null;
+        case '\u0FC8': return null;
+        case '\u0FC9': return null;
+        case '\u0FCA': return null;
+        case '\u0FCB': return null;
+        case '\u0FCC': return null;
+        case '\u0FCD': return null;
+        case '\u0FCE': return null;
+        case '\u0FCF': return ""; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think...
+
+        default: {
+            // DLC handle space (EW's "_")
+
+            // This character is in the range 0FD0-0FFF or is not in
+            // the Tibetan range at all.  In either case, there is no
+            // corresponding Extended Wylie.
+            return null;
+        }
+        } // end switch
+    }
+}
+
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java
new file mode 100644
index 0000000..7c8a315
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java
@@ -0,0 +1,98 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Gives Extended Wylie-flavored names to the Unicode characters
+ *  most commonly used to represent Tibetan.  The consonant that
+ *  Extended Wylie writes as "ka" is named EWC_ka, as in "The
+ *  Extended Wylie Consonant ka"; the vowel that Extended Wylie
+ *  writes as "i" is EWV_i; and so on.  There is at least one
+ *  exception to the naming scheme, but exceptions are commented.
+ *
+ *  @see LegalTshegBar
+ *
+ *  @author David Chandler */
+public interface UnicodeConstants {
+
+    /** stands in for a character that does not exist, when a char is required */
+    char EW_ABSENT = '\u0000';
+
+    // The thirty consonants, in alphabetical order (interface fields are implicitly public static final):
+
+    /** ka, the first letter of the alphabet */
+    char EWC_ka = '\u0F40';
+
+    char EWC_kha = '\u0F41';
+    char EWC_ga = '\u0F42';
+    char EWC_nga = '\u0F44';
+    char EWC_ca = '\u0F45';
+    char EWC_cha = '\u0F46';
+    char EWC_ja = '\u0F47';
+    char EWC_nya = '\u0F49';
+    char EWC_ta = '\u0F4F';
+    char EWC_tha = '\u0F50';
+    char EWC_da = '\u0F51';
+    char EWC_na = '\u0F53';
+    char EWC_pa = '\u0F54';
+    char EWC_pha = '\u0F55';
+    char EWC_ba = '\u0F56';
+    char EWC_ma = '\u0F58';
+    char EWC_tsa = '\u0F59';
+    char EWC_tsha = '\u0F5A';
+    char EWC_dza = '\u0F5B';
+    char EWC_wa = '\u0F5D';
+    char EWC_zha = '\u0F5E';
+    char EWC_za = '\u0F5F';
+    /** Irregularly named: the Extended Wylie representation of this
+        letter is <code>'a</code>. */
+    char EWC_achen = '\u0F60'; /* DLC NOW is this achen or achung? achen is EWC_a, right?  NOTE(review): Unicode names U+0F60 "TIBETAN LETTER -A" and U+0F68 "TIBETAN LETTER A"; if this is renamed, replace EWC_achen everywhere. */
+    char EWC_ya = '\u0F61';
+    char EWC_ra = '\u0F62';
+    char EWC_la = '\u0F63';
+    char EWC_sha = '\u0F64';
+    char EWC_sa = '\u0F66';
+    char EWC_ha = '\u0F67';
+    char EWC_a = '\u0F68';
+
+    /** The a-chung (i.e., <code>\u0F71</code>), as found in the word
+        for father, "pA lags".  Note that this constant is prefixed
+        EW_, not EWC_ or EWV_. */
+    char EW_achung = '\u0F71';
+
+    /* Four of the five vowels, according to some; according to
+       others, simply "the four vowels": */
+    /** "gi gu" (DLC: confirm the name), the 'i' sound in the English word keep */
+    char EWV_i = '\u0F72';
+    /** "zhabs kyu", the 'u' sound in the English word tune */
+    char EWV_u = '\u0F74';
+    /** "'greng bu" (also known as "'greng po", and pronounced <i>dang-bo</i>), the 'a' sound in the English word gate */
+    char EWV_e = '\u0F7A';
+    /** "na ro" (DLC: confirm the name), the 'o' sound in the English word bone */
+    char EWV_o = '\u0F7C';
+
+
+    /** the subscribed form of EWC_wa, a.k.a. wa-btags */
+    char EWSUB_wa_zur = '\u0FAD';
+    /** the subscribed form of EWC_ya */
+    char EWSUB_ya_btags = '\u0FB1';
+    /** the subscribed form of EWC_ra */
+    char EWSUB_ra_btags = '\u0FB2';
+    /** the subscribed form of EWC_la */
+    char EWSUB_la_btags = '\u0FB3';
+}
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java b/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java
new file mode 100644
index 0000000..e85a42d
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java
@@ -0,0 +1,63 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** A UnicodeReadyThunk stands for a string of characters.  Although
+ *  a string of Unicode characters can be turned into a list of
+ *  UnicodeReadyThunks (DLC reference it), the exact sequence of
+ *  Unicode characters cannot necessarily be recovered from a
+ *  UnicodeReadyThunk.  When the characters are neither Tibetan
+ *  Unicode nor one of a handful of other known characters, only the
+ *  most primitive operations are available.  In that case the exact
+ *  string of Unicode characters can generally be recovered, but do
+ *  not rely on it.
+ *
+ *  @author David Chandler
+ */
+public interface UnicodeReadyThunk {
+
+    /** Returns true iff this thunk is entirely Tibetan, even if some
+        of its characters lie outside the Tibetan range of Unicode 3
+        (i.e., <code>0x0F00</code>-<code>0x0FFF</code>). */
+    boolean isTibetan();
+
+    /** Returns, if possible, a sequence of Unicode characters that
+     *  is equivalent to this thunk.  That is possible only when
+     *  {@link #hasEquivalentUnicode()} is true.  Because Unicode
+     *  offers more than one way to refer to the same language
+     *  element, the result is just one of the candidates; when this
+     *  thunk {@link #isTibetan() is Tibetan} and several sequences
+     *  exist, one that Unicode 3.2 does not discourage is returned.
+     *  @exception UnsupportedOperationException if {@link
+     *  #hasEquivalentUnicode()} is false
+     *  @return a String of Unicode characters */
+    String getEquivalentUnicode() throws UnsupportedOperationException;
+
+    /** Returns true iff some sequence of Unicode characters
+     *  correctly represents this thunk.  No such sequence exists
+     *  when the thunk contains Tibetan characters for which the
+     *  Unicode standard does not provide; head marks or multiple
+     *  vowels in the thunk would cause this to return false, for
+     *  example.  For more info, see the Extended Wylie
+     *  Transliteration System (EWTS) document (DLC ref, DLC mention
+     *  Dza,fa,va doc bug) and section 9.13 of the Unicode 3
+     *  standard.  */
+    boolean hasEquivalentUnicode();
+}
+
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
new file mode 100644
index 0000000..413cb4a
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -0,0 +1,234 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** <p>Static utility routines for classifying, canonicalizing, and
+ *  describing Tibetan Unicode characters and strings of such
+ *  characters.  This class cannot be instantiated.</p>
+ *
+ *  @author David Chandler */
+public class UnicodeUtils {
+    /** Do not use this, as this class is not instantiable. */
+    private UnicodeUtils() { super(); }
+
+    /** Returns true iff x is a Unicode character that represents a
+        non-subjoined consonant or two-consonant stack that has its
+        own Unicode code point, i.e. iff x is in U+0F40-U+0F6A and is
+        not the unassigned U+0F48.  That covers the usual suspects
+        (like <code>\u0F40</code>), Sanskrit consonants (like
+        <code>\u0F4C</code>), and the simple two-consonant stacks in
+        Unicode (like <code>\u0F43</code>).  Returns false for, among
+        other things, subjoined consonants like <code>\u0F90</code>. */
+    public static boolean isNonSubjoinedConsonant(char x) {
+        // U+0F48 is reserved in Unicode 3.2, but not in use.
+        return (x >= '\u0F40' && x <= '\u0F6A' && x != '\u0F48');
+    }
+
+    /** Returns true iff x is a Unicode character that represents a
+        subjoined consonant or subjoined two-consonant stack that has
+        its own Unicode code point, i.e. iff x is in U+0F90-U+0FBC
+        and is not the unassigned U+0F98.  That covers the usual
+        suspects (like <code>\u0F90</code>), Sanskrit consonants (like
+        <code>\u0F9C</code>), and the simple two-consonant stacks in
+        Unicode (like <code>\u0FAC</code>).  Returns false for, among
+        other things, non-subjoined consonants like <code>\u0F40</code>. */
+    public static boolean isSubjoinedConsonant(char x) {
+        // U+0F98 is reserved in Unicode 3.2, but not in use.
+        return (x >= '\u0F90' && x <= '\u0FBC' && x != '\u0F98');
+    }
+
+    /** Returns true iff x is the preferred representation of a
+        Tibetan or Sanskrit consonant and cannot be broken down any
+        further.  Returns false for, among other things, subjoined
+        consonants like <code>\u0F90</code>, two-component consonants
+        like <code>\u0F43</code>, and fixed-form consonants like
+        <code>\u0F6A</code>.  The new consonants (for transcribing
+        Chinese, I believe) "\u0F55\u0F39" (which EWTS calls "fa"),
+        "\u0F56\u0F39" ("va"), and "\u0F5F\u0F39" ("Dza") are
+        two-character sequences, so this method cannot recognize
+        them, but you should be aware of them also. */
+    public static boolean isPreferredFormOfConsonant(char x) {
+        switch (x) {
+        case '\u0F48': // reserved in Unicode 3.2, but not in use
+        case '\u0F43': case '\u0F4D': case '\u0F52': case '\u0F57': case '\u0F5C':
+            return false; // unassigned, or decomposable into two characters
+        default:
+            return (x >= '\u0F40' && x <= '\u0F68');
+        }
+    }
+
+    /** Returns true iff unicodeChar is a character from the Unicode
+        range U+0F00-U+0FFF.
+        @see #isEntirelyTibetanUnicode(String) */
+    public static boolean isInTibetanRange(char unicodeChar) {
+        return (unicodeChar >= '\u0F00' && unicodeChar <= '\u0FFF');
+    }
+
+    /** Returns true iff unicodeString consists only of characters
+        from the Unicode range U+0F00-U+0FFF; the empty string
+        therefore yields true.  (Note that these characters are
+        typically not enough to represent a Tibetan text; you may need
+        ZWSP (zero-width space) and whitespace from other ranges.) */
+    public static boolean isEntirelyTibetanUnicode(String unicodeString) {
+        for (int i = 0; i < unicodeString.length(); i++) {
+            if (!isInTibetanRange(unicodeString.charAt(i)))
+                return false;
+        }
+        return true;
+    }
+
+    /** Rewrites tibetanUnicode in place so that its Tibetan passages
+        are in THDL-canonical form; the result is equivalent,
+        according to the Unicode 3.2 standard, to the original
+        contents.  THDL-canonical form uses a maximum of characters,
+        in general, and never uses characters whose use has been
+        {@link #isDiscouraged(char) discouraged}.  Characters for
+        which {@link #toCanonicalForm(char)} returns null, including
+        all those outside the Tibetan range, are left untouched.
+        <p>Note well that only well-formed input guarantees
+        well-formed output.</p> */
+    public static void toCanonicalForm(StringBuffer tibetanUnicode) {
+        int offset = 0;
+        while (offset < tibetanUnicode.length()) {
+            String s = toCanonicalForm(tibetanUnicode.charAt(offset));
+            if (null == s) {
+                ++offset;
+            } else {
+                // A decomposition is itself canonical, so step past it.
+                tibetanUnicode.replace(offset, offset + 1, s);
+                offset += s.length();
+            }
+        }
+    }
+
+    /** Like {@link #toCanonicalForm(StringBuffer)}, but does not
+        modify its input.  Instead, it returns the canonically-formed
+        version of tibetanUnicode. */
+    public static String toCanonicalForm(String tibetanUnicode) {
+        StringBuffer sb = new StringBuffer(tibetanUnicode);
+        toCanonicalForm(sb);
+        return sb.toString();
+    }
+
+    /** There are 19 characters in the Tibetan range of Unicode 3.2
+        which can be decomposed into longer strings of characters in
+        the Tibetan range of Unicode.  These 19 are said not to be in
+        THDL-canonical form.  This routine returns the canonical form
+        for such characters, and returns null for characters that are
+        already canonical or are not in the Tibetan range of Unicode.
+        @param tibetanUnicodeChar the character to canonicalize
+        @return null if tibetanUnicodeChar is canonical, or a string
+        of two or three characters otherwise */
+    public static String toCanonicalForm(char tibetanUnicodeChar) {
+        switch (tibetanUnicodeChar) {
+        case '\u0F43': return "\u0F42\u0FB7";
+        case '\u0F4D': return "\u0F4C\u0FB7";
+        case '\u0F52': return "\u0F51\u0FB7";
+        case '\u0F57': return "\u0F56\u0FB7";
+        case '\u0F5C': return "\u0F5B\u0FB7";
+        case '\u0F69': return "\u0F40\u0FB5";
+        case '\u0F73': return "\u0F71\u0F72";
+        case '\u0F75': return "\u0F71\u0F74";
+        case '\u0F76': return "\u0FB2\u0F80";
+        case '\u0F77': return "\u0FB2\u0F71\u0F80";
+        case '\u0F78': return "\u0FB3\u0F80";
+        case '\u0F79': return "\u0FB3\u0F71\u0F80";
+        case '\u0F81': return "\u0F71\u0F80";
+        case '\u0F93': return "\u0F92\u0FB7";
+        case '\u0F9D': return "\u0F9C\u0FB7";
+        case '\u0FA2': return "\u0FA1\u0FB7";
+        case '\u0FA7': return "\u0FA6\u0FB7";
+        case '\u0FAC': return "\u0FAB\u0FB7";
+        case '\u0FB9': return "\u0F90\u0FB5";
+
+        default:
+            return null;
+        }
+    }
+
+    /** Returns true iff tibetanUnicodeChar is one of the composite
+        vowel signs whose use the Unicode standard discourages.  DLC
+        FIXME: based on the 3.0 charts (p.437-440); recheck against 3.2. */
+    public static boolean isDiscouraged(char tibetanUnicodeChar) {
+        return ('\u0F73' == tibetanUnicodeChar
+                || '\u0F75' == tibetanUnicodeChar
+                || '\u0F77' == tibetanUnicodeChar
+                || '\u0F79' == tibetanUnicodeChar // NOTE(review): long counterpart of U+0F77; confirm in charts
+                || '\u0F81' == tibetanUnicodeChar);
+    }
+
+    /** Returns true iff ch is one of the several Unicode characters
+        that are merely the Tibetan letter ra, subscribed or not;
+        <code>\u0F62</code> is often thought of as the nominal one.
+        Returns false for characters that contain ra but are not
+        merely ra, such as <code>\u0F77</code>. */
+    public static boolean isRa(char ch) {
+        return ('\u0F62' == ch
+                || '\u0F6A' == ch
+                || '\u0FB2' == ch
+                || '\u0FBC' == ch);
+    }
+
+    /** Returns true iff ch corresponds to the Tibetan letter wa.
+        Several Unicode characters correspond to the Tibetan letter
+        wa.  Oftentimes, <code>\u0F5D</code> is thought of as the
+        nominal representation. */
+    public static boolean isWa(char ch) {
+        return ('\u0F5D' == ch
+                || '\u0FAD' == ch
+                || '\u0FBA' == ch);
+    }
+
+    /** Returns true iff ch corresponds to the Tibetan letter ya.
+        Several Unicode characters correspond to the Tibetan letter
+        ya.  Oftentimes, <code>\u0F61</code> is thought of as the
+        nominal representation. */
+    public static boolean isYa(char ch) {
+        return ('\u0F61' == ch
+                || '\u0FB1' == ch
+                || '\u0FBB' == ch);
+    }
+
+    /** Returns true iff there exists at least one character ch in
+        unicodeString such that ch {@link #isRa(char) is ra} or
+        contains ra (like <code>\u0F77</code>).  This is not as fast
+        as it could be: it reuses the canonicalization code in order
+        to minimize the possibility of coder error. */
+    public static boolean containsRa(String unicodeString) {
+        String canonForm = toCanonicalForm(unicodeString);
+        for (int i = 0; i < canonForm.length(); i++) {
+            if (isRa(canonForm.charAt(i)))
+                return true;
+        }
+        return false;
+    }
+    /** Inefficient shortcut.
+        @see #containsRa(String) */
+    public static boolean containsRa(char unicodeChar) {
+        return containsRa(String.valueOf(unicodeChar));
+    }
+
+    /** Returns the conventional "U+XXXX" notation for ch: uppercase
+        hexadecimal, zero-padded to four digits (e.g., "U+0F40"). */
+    public static String unicodeCharToString(char ch) {
+        String hex = Integer.toHexString((int)ch).toUpperCase();
+        return "U+" + "0000".substring(hex.length()) + hex;
+    }
+}
+
diff --git a/source/org/thdl/tib/text/tshegbar/package.html b/source/org/thdl/tib/text/tshegbar/package.html
new file mode 100644
index 0000000..4de8dfa
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/package.html
@@ -0,0 +1,30 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
+<html>
+<head>
+<!--
+
+  @(#)package.html
+
+  Copyright 2002 Tibetan and Himalayan Digital Library
+
+  This software is the confidential and proprietary information of
+  the Tibetan and Himalayan Digital Library. You shall use such
+  information only in accordance with the terms of the license
+  agreement you entered into with the THDL.
+
+-->
+</head>
+<body bgcolor="white">
+
+  Provides classes for manipulating Tibetan text at the <i>tsheg bar</i>
+  level.  Roughly speaking, a "tsheg bar" (pronounced <i>tsek bar</i>)
+  is a syllable.
+
+<p>
+  This package allows for turning a string of Unicode characters into
+  our <i>TTBIR</i>, our Tibetan Tsheg Bar Internal Representation.
+  The input string may also contain non-Tibetan characters.
+</p>
+
+</body>
+</html>