From daacf6ee3b51b76df01ed23479f0bc23a50608c4 Mon Sep 17 00:00:00 2001
From: dchandler <dchandler>
Date: Sat, 12 Apr 2003 20:56:20 +0000
Subject: [PATCH] I've got too many sandboxes, so I'm committing these changes,
 half-done, from one sandbox so as to consolidate my sandboxes.

---
 .../thdl/tib/text/tshegbar/LegalTshegBar.java | 363 +++++++++++++++++-
 .../tib/text/tshegbar/LegalTshegBarTest.java  |  73 ++++
 .../text/tshegbar/TibetanSyntaxException.java |  51 +++
 .../text/tshegbar/TransitionInstruction.java  |  58 +++
 .../UnicodeReadingStateMachineConstants.java  | 174 +++++++++
 .../tshegbar/ValidatingUnicodeReader.java     | 345 +++++++++++++++++
 .../tshegbar/ValidatingUnicodeReaderTest.java | 195 ++++++++++
 7 files changed, 1252 insertions(+), 7 deletions(-)
 create mode 100644 source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java
 create mode 100644 source/org/thdl/tib/text/tshegbar/TransitionInstruction.java
 create mode 100644 source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java
 create mode 100644 source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
 create mode 100644 source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java
diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
index e119b7d..658c631 100644
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@@ -18,6 +18,8 @@ Contributor(s): ______________________________________.
 
 package org.thdl.tib.text.tshegbar;
 
+import java.util.Vector;
+
 import org.thdl.tib.text.THDLWylieConstants;
 import org.thdl.util.ThdlDebug;
 
@@ -42,15 +44,15 @@ import org.thdl.util.ThdlDebug;
  *  exception is that 'i (i.e., the connective case marker), 'u, and
  *  'o suffixes are permitted.</li>
  *
- *  <li>It has at most one suffix, which is a single consonant or a
- *  string consisting of 'i, 'u, 'o, 'am, and 'ang.</li>
- *
- *
-DLC FIXME: we must allow many suffixes.  See Andres' e-mail below:
+ *  <li>It has at most one suffix, which is a single consonant (the
+ *  common case) or a string consisting of 'i, 'u, 'o, 'am, and
+ *  'ang.
+
+<p>See Andres' e-mail below:</p>
 <pre>
 David,
 
-It is a particle that means "or" as opposed to "dang" that means and.
+['am] is a particle that means "or" as opposed to "dang" that means and.
 
 "sgom pa'am" would mean "... or meditation"
 
@@ -65,6 +67,7 @@ And also there are cases where they combine. For ex you can have
 
 	Andres 
 </pre>
+</li>
  *
  *
  *  <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists
@@ -681,7 +684,7 @@ public final class LegalTshegBar
     }
 
     /** Like {@link
-     *  #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}
+     *  #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)}
      *  but geared for the common case where the suffix is simply a
      *  consonant. */
     public static boolean formsLegalTshegBar(char prefix,
@@ -1138,4 +1141,350 @@ public final class LegalTshegBar
     public String toString() {
         return toConciseXML();
     }
+
+    /** Shortcut that tests all of grcls, i.e. the index range [0, grcls.size()). */
+    private static boolean formsLegalTshegBar(Vector grcls) {
+        return formsLegalTshegBar(grcls, 0, grcls.size());
+    }
+
+    /** Tests whether up to four grapheme clusters form a tsheg bar.
+     *
+     *  Returns true iff the given UnicodeGraphemeClusters form a
+     *  syntactically legal Tibetan syllable.  If one is null, it
+     *  means that it is not present.
+     *  Suffix handling is unfinished: a non-null suffix throws Error.
+     *  @exception IllegalArgumentException if root is null, or if
+     *  postsuffix is non-null and suffix is null (these being clearly
+     *  illegal)
+     */
+    private static boolean formsLegalTshegBar(UnicodeGraphemeCluster prefix,
+                                              UnicodeGraphemeCluster root,
+                                              UnicodeGraphemeCluster suffix,
+                                              UnicodeGraphemeCluster postsuffix)
+        throws IllegalArgumentException
+    {
+        // reality checks:
+        if (null == root)
+            throw new IllegalArgumentException("root letter is not present");
+        if (null != postsuffix && null == suffix)
+            throw new IllegalArgumentException("a postsuffix cannot occur without a suffix");
+
+        // handle root:
+        if (!root.isLegalTibetan())
+            return false;
+        char headLetter = root.getSuperscribedLetter();
+        char rootLetter = root.getRootCP();
+        char subjoinedLetter = root.getSoleNonWazurSubjoinedLetter();
+        char vowel = root.getVowel();
+        boolean hasAchung = root.hasAchung();
+        boolean hasWazur = root.hasWazur();
+
+        // handle prefix; null means absent, so map it to EW_ABSENT instead of dereferencing:
+        char prefixLetter = (null == prefix) ? EW_ABSENT : prefix.getSoleTibetanUnicodeCP();
+
+        // handle suffix:
+        String suffixString = null;
+        if (null != suffix) {
+            // DLC FIXME            suffixString = suffix.getUnicodeInUsualOrder();
+            throw new Error("DLC FIXME");
+        }
+
+        // handle postsuffix; null means absent, so map it to EW_ABSENT instead of dereferencing:
+        char postsuffixLetter = (null == postsuffix) ? EW_ABSENT : postsuffix.getSoleTibetanUnicodeCP();
+
+        return formsLegalTshegBar(prefixLetter, headLetter, rootLetter,
+                                  subjoinedLetter, hasWazur, hasAchung,
+                                  suffixString, postsuffixLetter, vowel, null);
+    }
+
+    /** Returns true iff the UnicodeGraphemeClusters in grcls with
+     *  indices in the range [start, end) form a syntactically legal
+     *  syllable.  If start is as large as end, false is returned. */
+    private static boolean formsLegalTshegBar(Vector grcls,
+                                              int start,
+                                              int end)
+    {
+        int numGrcls = end - start; // size of the half-open range [start, end)
+        if (numGrcls <= 0)
+            return false;
+        if (numGrcls == 1) {
+            // Option 1: (root)
+            // else: return false;
+
+            return formsLegalTshegBar(null,
+                                      (UnicodeGraphemeCluster)grcls.elementAt(start),
+                                      null, null);
+        } else if (numGrcls == 2) {
+            // Option 1: (prefix, root)
+            // Option 2: (root, suffix)
+            // else: return false;
+
+            return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                       null,
+                                       null)
+                    || formsLegalTshegBar(null,
+                                          (UnicodeGraphemeCluster)grcls.elementAt(start),
+                                          (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                          null));
+        } else if (numGrcls == 3) {
+            // Option 1: (prefix, root, suffix)
+            // Option 2: (root, suffix, postsuffix)
+            // else: return false;
+
+            return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 2),
+                                       null)
+                    || formsLegalTshegBar(null,
+                                          (UnicodeGraphemeCluster)grcls.elementAt(start),
+                                          (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                          (UnicodeGraphemeCluster)grcls.elementAt(start + 2)));
+        } else if (numGrcls == 4) {
+            return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 2),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 3)));
+        } else {
+            // the largest has 'i ... DLC FIXME rethink -- even the case where numGrcls == 3 could be pa'am
+            return false;
+        }
+    }
+
+
+
+    /** Tells whether an EWC_ga prefix may precede the given Tibetan
+     *  consonant stack (superscribed, root, and subscribed letters).
+     *  Only a root letter standing alone can qualify.
+     *  @param head the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the superscribed letter, or EW_ABSENT if
+     *  not present
+     *  @param root the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the root letter
+     *  @param sub the {@link isNominalRepresentationOfConsonant(char)
+     *  nominal representation} of the subjoined letter, or EW_ABSENT
+     *  if not present */
+    static boolean takesGao(char head, char root, char sub) {
+        // Any superscribed or subjoined letter rules out the ga prefix.
+        boolean bareRoot = (EW_ABSENT == head && EW_ABSENT == sub);
+        if (!bareRoot) {
+            return false;
+        }
+        return (EWC_ca == root
+                || EWC_nya == root
+                || EWC_ta == root
+                || EWC_da == root
+                || EWC_na == root
+                || EWC_tsa == root
+                || EWC_zha == root
+                || EWC_za == root
+                || EWC_ya == root
+                || EWC_sha == root
+                || EWC_sa == root);
+    }
+
+    /** Returns true if the given Tibetan consonant stack (i.e., the
+     *  combination of superscribed, root, and subscribed letters)
+     *  takes an EWC_da prefix.
+     *  @param head the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the superscribed letter, or EW_ABSENT if
+     *  not present
+     *  @param root the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the root letter
+     *  @param sub the {@link isNominalRepresentationOfConsonant(char)
+     *  nominal representation} of the subjoined letter, or EW_ABSENT
+     *  if not present */
+    static boolean takesDao(char head, char root, char sub) {
+        if (EW_ABSENT == head) {
+            if (EW_ABSENT == sub) {
+                return (EWC_ka == root
+                        || EWC_ga == root
+                        || EWC_nga == root
+                        || EWC_pa == root
+                        || EWC_ba == root
+                        || EWC_ma == root);
+            } else {
+                return ((EWC_ka == root && EWC_ya == sub) // dkyil, e.g.; was missing
+                        || (EWC_ga == root && EWC_ya == sub)
+                        || (EWC_pa == root && EWC_ya == sub)
+                        || (EWC_ba == root && EWC_ya == sub)
+                        || (EWC_ma == root && EWC_ya == sub)
+                        || (EWC_ka == root && EWC_ra == sub)
+                        || (EWC_ga == root && EWC_ra == sub)
+                        || (EWC_ba == root && EWC_ra == sub)
+                        || (EWC_pa == root && EWC_ra == sub));
+            }
+        } else {
+            return false;
+        }
+    }
+
+    /** Tells whether an EWC_achung prefix may precede the given
+     *  Tibetan consonant stack (superscribed, root, and subscribed
+     *  letters).
+     *  @param head the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the superscribed letter, or EW_ABSENT if
+     *  not present
+     *  @param root the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the root letter
+     *  @param sub the {@link isNominalRepresentationOfConsonant(char)
+     *  nominal representation} of the subjoined letter, or EW_ABSENT
+     *  if not present */
+    static boolean takesAchungPrefix(char head, char root, char sub) {
+        // No stack with a superscribed letter takes this prefix.
+        if (EW_ABSENT != head) {
+            return false;
+        }
+        // Root letter standing alone:
+        if (EW_ABSENT == sub) {
+            return (EWC_kha == root
+                    || EWC_ga == root
+                    || EWC_cha == root
+                    || EWC_ja == root
+                    || EWC_tha == root
+                    || EWC_da == root
+                    || EWC_pha == root
+                    || EWC_ba == root
+                    || EWC_tsha == root
+                    || EWC_dza == root);
+        }
+        // Root letter carrying a subjoined ya or ra:
+        return ((EWC_kha == root && EWC_ya == sub)
+                || (EWC_ga == root && EWC_ya == sub)
+                || (EWC_pha == root && EWC_ya == sub)
+                || (EWC_ba == root && EWC_ya == sub)
+                || (EWC_kha == root && EWC_ra == sub)
+                || (EWC_ga == root && EWC_ra == sub)
+                || (EWC_da == root && EWC_ra == sub)
+                || (EWC_pha == root && EWC_ra == sub)
+                || (EWC_ba == root && EWC_ra == sub));
+    }
+
+    /** Tells whether an EWC_ma prefix may precede the given Tibetan
+     *  consonant stack (superscribed, root, and subscribed
+     *  letters).
+     *  @param head the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the superscribed letter, or EW_ABSENT if
+     *  not present
+     *  @param root the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the root letter
+     *  @param sub the {@link isNominalRepresentationOfConsonant(char)
+     *  nominal representation} of the subjoined letter, or EW_ABSENT
+     *  if not present */
+    static boolean takesMao(char head, char root, char sub) {
+        // No stack with a superscribed letter takes this prefix.
+        if (EW_ABSENT != head) {
+            return false;
+        }
+        // Root letter standing alone:
+        if (EW_ABSENT == sub) {
+            return (EWC_kha == root
+                    || EWC_ga == root
+                    || EWC_nga == root
+                    || EWC_cha == root
+                    || EWC_ja == root
+                    || EWC_nya == root
+                    || EWC_tha == root
+                    || EWC_da == root
+                    || EWC_na == root
+                    || EWC_tsha == root
+                    || EWC_dza == root);
+        }
+        // With a subjoined letter, the accepted stacks are exactly
+        // kha or ga as the root, each carrying a subjoined ya or ra,
+        // which is the cross product written out below.
+        return ((EWC_kha == root || EWC_ga == root)
+                && (EWC_ya == sub || EWC_ra == sub));
+    }
+
+    /** Returns true if the given Tibetan consonant stack (i.e., the
+     *  combination of superscribed, root, and subscribed letters)
+     *  takes an EWC_ba prefix.
+     *  @param head the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the superscribed letter, or EW_ABSENT if
+     *  not present
+     *  @param root the {@link
+     *  isNominalRepresentationOfConsonant(char) nominal
+     *  representation} of the root letter
+     *  @param sub the {@link isNominalRepresentationOfConsonant(char)
+     *  nominal representation} of the subjoined letter, or EW_ABSENT
+     *  if not present */
+    static boolean takesBao(char head, char root, char sub) {
+        // DLC ask Ten-lo la about Wazur.
+        if (EW_ABSENT == head) {
+            if (EW_ABSENT == sub) {
+                return (EWC_ka == root
+                        || EWC_ca == root
+                        || EWC_ta == root
+                        || EWC_tsa == root
+                        || EWC_ga == root
+                        || EWC_nga == root
+                        || EWC_ja == root
+                        || EWC_nya == root
+                        || EWC_da == root
+                        || EWC_na == root
+                        || EWC_dza == root
+                        || EWC_zha == root
+                        || EWC_za == root
+                        || EWC_ra == root
+                        || EWC_la == root
+                        || EWC_sha == root || EWC_sa == root); // bare sa was missing: bsam, bsod
+            } else {
+                // kra, e.g.
+                return ((EWC_ka == root && EWC_ya == sub)
+                        || (EWC_ga == root && EWC_ya == sub)
+
+                        || (EWC_ka == root && EWC_ra == sub)
+                        || (EWC_ga == root && EWC_ra == sub)
+                        || (EWC_sa == root && EWC_ra == sub)
+
+                        || (EWC_ka == root && EWC_la == sub)
+                        || (EWC_za == root && EWC_la == sub)
+                        || (EWC_ra == root && EWC_la == sub)
+                        || (EWC_sa == root && EWC_la == sub));
+            }
+        } else {
+            if (EW_ABSENT == sub) {
+                // ska, e.g.
+                return ((EWC_sa == head && EWC_ka == root)
+                        || (EWC_sa == head && EWC_ga == root)
+                        || (EWC_sa == head && EWC_nga == root)
+                        || (EWC_sa == head && EWC_nya == root)
+                        || (EWC_sa == head && EWC_ta == root)
+                        || (EWC_sa == head && EWC_da == root)
+                        || (EWC_sa == head && EWC_na == root)
+                        || (EWC_sa == head && EWC_tsa == root)
+
+                        || (EWC_ra == head && EWC_ka == root)
+                        || (EWC_ra == head && EWC_ga == root)
+                        || (EWC_ra == head && EWC_nga == root)
+                        || (EWC_ra == head && EWC_ja == root)
+                        || (EWC_ra == head && EWC_nya == root)
+                        || (EWC_ra == head && EWC_ta == root)
+                        || (EWC_ra == head && EWC_da == root)
+                        || (EWC_ra == head && EWC_na == root)
+                        || (EWC_ra == head && EWC_tsa == root)
+                        || (EWC_ra == head && EWC_dza == root)
+
+                        || (EWC_la == head && EWC_ta == root)
+                        || (EWC_la == head && EWC_da == root));
+            } else {
+                return ((EWC_ra == head && EWC_ka == root && EWC_ya == sub)
+                        || (EWC_ra == head && EWC_ga == root && EWC_ya == sub)
+                        || (EWC_sa == head && EWC_ka == root && EWC_ya == sub)
+                        || (EWC_sa == head && EWC_ga == root && EWC_ya == sub)
+                        || (EWC_sa == head && EWC_ka == root && EWC_ra == sub)
+                        || (EWC_sa == head && EWC_ga == root && EWC_ra == sub));
+            }
+        }
+    }
 }
diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java
index c747304..88a7b01 100644
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java
@@ -279,4 +279,77 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
 
         assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
     }
+
+    /** Tests that the rules concerning "which root letters take which
+     * prefixes?" are accurate.  I got a list of such rules from a
+     * native Tibetan who has been kind enough to teach me the
+     * fundamentals of the Tibetan language, but I'm not sure where he
+     * got the list.
+     */
+    public void testPrefixRules() {
+        // DLC FIXME how can we say that 0Fb2 is ok but 0fBc is not?
+        assertTrue(LegalTshegBar.takesBao(EWC_sa, EWC_ka, EWC_ra));
+        assertTrue(!LegalTshegBar.takesBao('\u0FB6', EWC_ka, EWC_ra));
+        assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', EWC_ra));
+        assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', '\u0FB2'));
+        assertTrue(!LegalTshegBar.takesBao('\u0FB6', '\u0F90', EWC_ra));
+        assertTrue(!LegalTshegBar.takesBao(EWC_sa, EWC_ka, '\u0FB2'));
+        // Each rejection above passes at least one codepoint from the subjoined range U+0F90-U+0FBC.
+        // ba prefix:
+        {
+            assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ka, EW_ABSENT));
+            assertTrue(LegalTshegBar.takesBao(EWC_la, EWC_da, EW_ABSENT));
+            assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_sa, EWC_ra));
+            assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ga, EWC_ra));
+            assertTrue(LegalTshegBar.takesBao(EWC_ra, EWC_ga, EWC_ya));
+
+            assertTrue(!LegalTshegBar.takesBao(EWC_ra, EWC_da, EWC_ya));
+            assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_ba, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesBao(EWC_la, EWC_nga, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_nga, EWC_ra));
+        }
+        // ga prefix:
+        {
+            assertTrue(LegalTshegBar.takesGao(EW_ABSENT, EWC_ca, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EWC_ya));
+            assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EWC_ya));
+        }
+
+        // da prefix:
+        {
+            assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_wa, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_nga, EWC_ya));
+            assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EWC_ya));
+
+            assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ga, EWC_ya));
+            assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EWC_ra));
+        }
+        // ma prefix:
+        {
+            assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_ja, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_wa, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_nga, EWC_ya));
+            assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EWC_ya));
+
+            assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ya));
+            assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ra));
+        }
+        // achung prefix:
+        {
+            assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ga, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ka, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_wa, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_nga, EWC_ya));
+            assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EW_ABSENT));
+            assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EWC_ya));
+
+            assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ba, EWC_ya));
+            assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_pha, EWC_ra));
+        }
+    }
 }
diff --git a/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java b/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java
new file mode 100644
index 0000000..db7561b
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java
@@ -0,0 +1,51 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Signals "this isn't legal Tibetan", as opposed to "this isn't a valid sequence of Unicode". */
+class TibetanSyntaxException extends Exception {
+    /** This constructor creates an exception with a less than helpful
+     *  message for the end user.  Please don't use this constructor
+     *  for production code. */
+    TibetanSyntaxException() {
+        super("A Unicode input stream had a syntactically incorrect run of Tibetan.  For example, kha, i.e., U+0F41, is not an allowed prefix.  This run of Tibetan was not expected.");
+        // we can tell it wasn't expected, because this error message
+        // isn't very helpful, and one of the other constructors
+        // should've been used.
+    }
+
+    /** Creates an exception whose detail message is x. */
+    TibetanSyntaxException(String x) {
+        super(x);
+    }
+
+    /** Creates an exception whose message names the index range [start, end); grcls is not yet inspected.
+
+        @param grcls a Vector whose elements x are GraphemeClusters
+        where x is in the range [start, end)
+        @param start grcls.elementAt(start) is the first
+        GraphemeCluster in the syntactically incorrect stretch of
+        Tibetan.
+        @param end grcls.elementAt(end - 1) is the last
+        GraphemeCluster in the syntactically incorrect stretch of
+        Tibetan. */
+    TibetanSyntaxException(java.util.Vector grcls, int start, int end) {
+        super("A syntactically incorrect run of Tibetan spans the grapheme clusters with indices in [" + start + ", " + end + ").");
+    }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java b/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java
new file mode 100644
index 0000000..5da8fe4
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java
@@ -0,0 +1,58 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+
+/** One entry of a state-transition table: the state to enter next
+ *  and the action to perform on the way there. */
+class TransitionInstruction implements UnicodeReadingStateMachineConstants {
+    /** the state (e.g., {@link #STATE_READY}) entered by this
+     *  transition */
+    private int nextState;
+
+    /** the action performed by this transition: one of {@link
+     *  #ACTION_CONTINUES_GRAPHEME_CLUSTER}, {@link
+     *  #ACTION_BEGINS_NEW_GRAPHEME_CLUSTER}, or {@link
+     *  #ACTION_PREPEND_WITH_0F68} */
+    private int action;
+
+    /** Not for use; leaves both fields at their default of zero. */
+    private TransitionInstruction() { }
+
+    /** Creates an instruction; with assertions enabled, rejects an
+     *  unknown action, an unknown state, and STATE_START. */
+    TransitionInstruction(int nextState, int action) {
+        assert(ACTION_CONTINUES_GRAPHEME_CLUSTER == action
+               || ACTION_BEGINS_NEW_GRAPHEME_CLUSTER == action
+               || ACTION_PREPEND_WITH_0F68 == action);
+
+        // STATE_START is deliberately not accepted: we start in the
+        // start state, but we can never return to it.
+        assert(STATE_READY == nextState
+               || STATE_DIGIT == nextState
+               || STATE_STACKING == nextState
+               || STATE_STACKPLUSACHUNG == nextState
+               || STATE_PARTIALMARK == nextState);
+        this.action = action;
+        this.nextState = nextState;
+    }
+
+    int getNextState() { return nextState; }
+    int getAction() { return action; }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java
new file mode 100644
index 0000000..e4a7418
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java
@@ -0,0 +1,174 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Constants and static routines (DLC still?) useful in writing state
+ *  machines for transforming Unicode input into other forms.
+ *
+ *  @author David Chandler
+ */
+interface UnicodeReadingStateMachineConstants {
+
+    /** Returns the codepoint class for cp, e.g. {@link #CC_SJC}.
+     *  @param cp a Unicode codepoint, which MUST be nondecomposable
+     *  if it is in the Tibetan range but can be from outside the
+     *  Tibetan range of Unicode */
+    static int getCCForCP(char cp) {
+        assert(getNFTHDL(cp) == null);
+        if ('\u0F82' == cp) {
+            return CC_0F82;
+        } else if ('\u0F8A' == cp) {
+            return CC_0F8A;
+        } else if ('\u0F39' == cp) {
+            return CC_0F39;
+        } else if ('\u0F71' == cp) {
+            return CC_ACHUNG;
+        } else if ('\u0F40' <= cp && cp <= '\u0F6A') {
+            assert(cp != '\u0F48');
+            return CC_CON;
+        } else if ('\u0F90' <= cp && cp <= '\u0FBC') {
+            assert(cp != '\u0F98');
+            return CC_SJC;
+        } else if ('\u0F20' <= cp && cp <= '\u0F33') {
+            return CC_DIGIT;
+        } else if (/* DLC NOW do these combine ONLY with digits, or do CC_CM just NOT combine with digits? */
+                   '\u0F3E' == cp
+                   || '\u0F3F' == cp
+                   || '\u0F18' == cp
+                   || '\u0F19' == cp) {
+            return CC_MCWD;
+        } else if ('\u0FC6' == cp
+                   || '\u0F87' == cp
+                   || '\u0F86' == cp
+                   || '\u0F84' == cp
+                   || '\u0F83' == cp
+                   /* U+0F82 is not tested here: it is CC_0F82, returned by the first branch */
+                   || '\u0F7F' == cp
+                   || '\u0F7E' == cp
+                   || '\u0F37' == cp /* DLC NOW NORMALIZATION OF 0F10, 11 to 0F0F ??? */
+                   || '\u0F35' == cp) {
+            return CC_CM;
+        } else if ('\u0F72' == cp
+                   || '\u0F74' == cp
+                   || '\u0F7A' == cp
+                   || '\u0F7B' == cp
+                   || '\u0F7C' == cp
+                   || '\u0F7D' == cp
+                   || '\u0F80' == cp) {
+            // DLC what about U+0F84 ??? CC_V or CC_CM ?
+            return CC_V;
+        } else {
+            return CC_SIN;
+        }
+    }
+
+    // codepoint classes (CC_...) follow.  These are mutually
+    // exclusive, and their union is the whole of Unicode.
+
+    /** for everything else, i.e. non-Tibetan characters like U+0E00
+     *  and also Tibetan characters like U+0FCF and U+0F05 (DLC rename
+     *  SIN[GLETON] to OTHER as combining marks from outside the
+     *  Tibetan range count as this) but not U+0F8A */
+    static final int CC_SIN = 0;
+
+    /** for combining marks in the Tibetan range of Unicode that
+     *  combine with digits alone */
+    static final int CC_MCWD = 1;
+
+    /** for combining marks in the Tibetan range of Unicode, minus
+     *  CC_MCWD, U+0F82, and U+0F39 */
+    static final int CC_CM = 2;
+
+    /** for combining consonants, i.e. U+0F90-U+0FBC minus U+0F98
+     *  minus the decomposable entries like U+0F93, U+0F9D, U+0FA2,
+     *  etc. */
+    static final int CC_SJC = 3;
+
+    /** for noncombining consonants, i.e. U+0F40-U+0F6A minus U+0F48
+     *  minus the decomposable entries like U+0F43, U+0F4D, U+0F52,
+     *  etc. */
+    static final int CC_CON = 4;
+
+    /** for simple, nondecomposable vowels, i.e. U+0F72, U+0F74,
+     *  U+0F7A, U+0F7B, U+0F7C, U+0F7D, U+0F80 */
+    static final int CC_V = 5;
+
+    /** for U+0F8A */
+    static final int CC_0F8A = 6;
+
+    /** for U+0F82, which is treated like {@link #CC_CM} except after
+     *  U+0F8A */
+    static final int CC_0F82 = 7;
+
+    /** for U+0F39, an integral part of a consonant when it directly
+     *  follows a member of {@link #CC_CON} or {@link #CC_SJC} */
+    static final int CC_0F39 = 8;
+
+    /** for U+0F71 */
+    static final int CC_ACHUNG = 9;
+
+    /** for digits, i.e. U+0F20-U+0F33 */
+    static final int CC_DIGIT = 10;
+
+
+
+    // states STATE_...:
+
+    /** initial state */
+    static final int STATE_START = 0;
+
+    /** ready state, i.e. the state in which some non-empty Unicode
+     *  String is in the holding area, <i>ready</i> to receive
+     *  combining marks like U+0F35 */
+    static final int STATE_READY = 1;
+
+    /** digit state, i.e. the state in which some non-empty Unicode
+     *  String consisting entirely of digits is in the holding area,
+     *  ready to receive marks that combine only with digits */
+    static final int STATE_DIGIT = 2;
+
+    /** state in which CC_SJC are welcomed and treated as consonants
+     *  to be subscribed to the GraphemeCluster in holding. */
+    static final int STATE_STACKING = 3;
+
+    /** state in which one or more consonants have been seen and also
+     *  an achung (U+0F71) has been seen */
+    static final int STATE_STACKPLUSACHUNG = 4;
+
+    /** state that seeing U+0F8A (when that's not an error) puts you
+     *  in.  Needed because U+0F8A is always followed by U+0F82, and
+     *  we check for the exceptional case that U+0F8A is followed by
+     *  something else. */
+    static final int STATE_PARTIALMARK = 5;
+
+    /* DLC we should have many error states or none. */
+
+
+    /** the present codepoint marks the start of a new
+     *  GraphemeCluster */
+    static final int ACTION_BEGINS_NEW_GRAPHEME_CLUSTER = 0;
+    /** the present codepoint is a continuation of the current
+     *  GraphemeCluster */
+    static final int ACTION_CONTINUES_GRAPHEME_CLUSTER = 1;
+    /** there is an error in the input stream, which we are correcting
+     *  (as we are in error-correcting mode) by starting a new
+     *  GraphemeCluster with U+0F68 as the first codepoint and the
+     *  current codepoint as the second */
+    static final int ACTION_PREPEND_WITH_0F68 = 2;
+}
diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
new file mode 100644
index 0000000..4084444
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
@@ -0,0 +1,345 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+package org.thdl.tib.text.tshegbar;
+
+import java.util.Vector;
+class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
+    /** Not instantiable: this class exposes only static members. */
+    private ValidatingUnicodeReader() { super(); }
+
+    /** transitionTable[state][cc] gives the transition to take upon seeing character class cc in the given state; null marks an error. */
+    private static final TransitionInstruction
+        transitionTable[/* 6, the number of STATEs */]
+                       [/* 11, the number of CC classes */]
+        = {
+            // STATE_START:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                null,
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK, // U+0F82 must follow
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                null,
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+
+            // STATE_READY:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY, // self
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null, // because 0F71 comes after SJCs, before Vs, and
+                      // before CMs.
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+            // STATE_DIGIT:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
+            },
+            // STATE_STACKING:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_ACHUNG in this state: */
+                new TransitionInstruction(STATE_STACKPLUSACHUNG,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+            // STATE_STACKPLUSACHUNG:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+            // STATE_PARTIALMARK:
+            {
+                /* upon seeing CC_SIN in this state: */
+                null,
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                null,
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                null,
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state: */
+                null,
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                null
+            }
+        };
+
+    // DLC NOW -- clearly, we need LegalSyllable to be convertible to and from GraphemeClusters.
+
+    /** Splits a run of GraphemeClusters into LegalSyllables,
+        insisting that every syllable be legal.
+        @param grcls a sequence of nonnull GraphemeClusters
+        @return a sequence of nonnull LegalSyllables
+        @exception TibetanSyntaxException if grcls does not consist
+        entirely of legal Tibetan syllables
+        @see GraphemeCluster
+        @see LegalSyllable */
+    private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
+        throws TibetanSyntaxException
+    {
+        final boolean validate = true; // illegal syllables raise an exception
+        return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls, validate);
+    }
+
+    /** Non-validating variant: never throws; illegal runs stay GraphemeClusters. */
+    private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls) {
+        try {
+            return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls, false);
+        } catch (TibetanSyntaxException ex) {
+            throw new Error("This can never happen, because the second parameter, validating, was false.");
+        }
+    }
+
+    /** Walks grcls, grouping each run of non-tsheg-like
+     GraphemeClusters with the tsheg-like cluster that ends it.
+     @param grcls a Vector consisting entirely of GraphemeClusters
+     @param validate true iff you wish to have a
+     TibetanSyntaxException thrown upon encountering a sequence of
+     GraphemeClusters that is syntactically incorrect Tibetan
+     @return if validate is true, a Vector consisting entirely of
+     LegalSyllables, else a vector of LegalSyllables and
+     GraphemeClusters
+     @exception TibetanSyntaxException if validate is true and some
+     run does not form a legal tsheg bar */
+    private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
+                                                                                boolean validate)
+        throws TibetanSyntaxException
+    {
+        Vector syllables = new Vector();
+        int grcls_len = grcls.size();
+        int beginning_of_cluster = 0;
+        for (int i = 0; i < grcls_len; i++) {
+            GraphemeCluster current_grcl
+                = (GraphemeCluster)grcls.elementAt(i);
+            if (current_grcl.isTshegLike()) {
+                if (beginning_of_cluster < i) {
+                    // One or more non-tsheg-like grapheme clusters is
+                    // here between tsheg-like grapheme clusters.  Is
+                    // it a legal syllable?
+                    if (LegalTshegBar.formsLegalTshegBar(grcls,
+                                                         beginning_of_cluster,
+                                                         i)) {
+                        // The tsheg-like cluster ending the run is the fourth argument.
+                        syllables.add(new LegalSyllable(grcls,
+                                                        beginning_of_cluster,
+                                                        i, current_grcl));
+                    } else if (validate) {
+                        // DLC: return an int -1 for "all good" or
+                        // 3 for "the fourth element is the first
+                        // bad one" but then you don't know that
+                        // 3-6 were the bad ones
+                        throw new TibetanSyntaxException(grcls,
+                                                         beginning_of_cluster,
+                                                         i);
+                    } else {
+                        // Not validating: pass the run through
+                        // unchanged, tsheg-like cluster included.
+                        for (int j = beginning_of_cluster; j <= i; j++) {
+                            syllables.add(grcls.elementAt(j));
+                        }
+                    }
+                }
+                beginning_of_cluster = i + 1;
+            } // else add current_grcl to the waiting list, in a sense
+        }
+        // NOTE(review): clusters after the final tsheg-like one are never emitted -- confirm.
+        return syllables;
+    }
+
+    /** Breaks a string of perfectly-formed Unicode into
+        GraphemeClusters by running each char through transitionTable.
+        @param nfthdl_unicode a String of NFTHDL-normalized Unicode
+        codepoints
+        @exception Exception if the input is not perfectly formed
+        @return a vector of GraphemeClusters
+        @see GraphemeCluster
+    */
+    private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
+        throws Exception
+    {
+        // the GraphemeClusters completed so far:
+        Vector grcls = new Vector();
+        int currentState = STATE_START;
+        StringBuffer holdingPen = new StringBuffer();
+        int ilen = nfthdl_unicode.length();
+        for (int i = 0; i < ilen; i++) {
+            char current_cp = nfthdl_unicode.charAt(i);
+            final TransitionInstruction ti
+                = transitionTable[currentState][getCCForCP(current_cp)];
+            if (null == ti)
+                throw new Exception("Bad Unicode.  DLC improve these messages");
+            switch (ti.getAction()) {
+            case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
+                if (holdingPen.length() > 0) // nothing to flush at the start
+                    grcls.add(new GraphemeCluster(holdingPen));
+                holdingPen = new StringBuffer();
+                // fall through: current_cp opens the new cluster
+            case ACTION_CONTINUES_GRAPHEME_CLUSTER:
+                holdingPen.append(current_cp);
+                break;
+            case ACTION_PREPEND_WITH_0F68:
+                throw new Error("This never happens inside the validating scanner.");
+            default:
+                throw new Error("Famous last words: This won't happen.");
+            }
+            currentState = ti.getNextState();
+        }
+        if (STATE_PARTIALMARK == currentState) // U+0F8A lacks its U+0F82
+            throw new Exception("Bad Unicode.  DLC improve these messages");
+        if (holdingPen.length() > 0) // flush the last cluster
+            grcls.add(new GraphemeCluster(holdingPen));
+        return grcls;
+    }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java
new file mode 100644
index 0000000..bf6ae3d
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java
@@ -0,0 +1,195 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Tests ValidatingUnicodeReader.
+ *  @author David Chandler */
+class ValidatingUnicodeReaderTest {
+    private static String skyagd = "\u0F66\u0F90\u0FB1\u0F42\u0F51";
+    private static String bskyagd = "\u0F56" + skyagd;
+
+    /** Checks the validity predicates against known-good and known-bad codepoint sequences. */
+    void testValidatingUnicodeReader() {
+        // DLC these routines can be slow.
+        assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                      bskyagd + "\u0F0C"));
+        assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                      "\u0F42" + skyagd + "\u0F0C"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      bskyagd + "\u0F0C"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F42" + skyagd + "\u0F0C"));
+
+        assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                      bskyagd + "\u0F0C\u0F62\u0F0B" + bskyagd + "\u0F0F"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F6A\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F62\u0F0B"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                      "\u0F6A\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                      "\u0F62\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F6A\u0F90\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F62\u0F90\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                      "\u0F62\u0F90\u0F0B"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                      "\u0F6A\u0F90\u0F0B"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F43"));
+        assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                      "\u0F43"));
+
+        // The Unicode standard states that U+0F8A is always followed
+        // by U+0F82.
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F8A\u0F82"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F8A"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F8A\u0F40"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F8A\u0F83"));
+
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F74"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                      "\u0F40\u0F74"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F90\u0F74"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F40\u0F77"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                      "\u0F40\u0F77"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F90\u0F77"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F40\u0F90\u0F7F"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F40\u0F90\u0F7F\u0F35"));
+
+        // Test that each singleton (except U+0F8A) in the Tibetan
+        // range is legal, and that each combining char and empty
+        // codepoint (and also U+0F8A) is illegal alone.
+        {
+            for (char cp = '\u0F00'; cp <= '\u0F17'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F1a'; cp <= '\u0F34'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F3a'; cp <= '\u0F3d'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F40'; cp <= '\u0F47'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F49'; cp <= '\u0F6a'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F88'; cp <= '\u0F89'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0Fbe'; cp <= '\u0Fc5'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0Fc7'; cp <= '\u0Fcc'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F36"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F38"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F85"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F8b"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcf"));
+
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F48"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6b"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6c"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6d"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6e"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6f"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F70"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8c"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8d"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8e"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8f"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F98"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fbd"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcd"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fce"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fd0"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fe4"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Ff0"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fff"));
+        }
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                      "\u0F40\u0Fc6"));
+
+        // Test that combining characters that combine with both
+        // consonants and digits work.
+        {
+            String combiningMarks[] = new String[] {
+                "\u0F71",
+                "\u0F72",
+                "\u0F73",
+                "\u0F74",
+                "\u0F75",
+                "\u0F76",
+                "\u0F77",
+                "\u0F78",
+                "\u0F79",
+                "\u0F7a",
+                "\u0F7b",
+                "\u0F7c",
+                "\u0F7d",
+                "\u0F7e",
+                "\u0F7f",
+                "\u0F80",
+                "\u0F81",
+                "\u0F82",
+                "\u0F83",
+                "\u0F84",
+                "\u0F86",
+                "\u0F87"
+            };
+            for (int i = 0; i < combiningMarks.length; i++) {
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F40" + combiningMarks[i]));
+                // DLC have a group that works with both digits and consonants, cuz vowels plus digits is a no go, right?
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F20" + combiningMarks[i]));
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F30" + combiningMarks[i]));
+            }
+        }
+
+        // DLC: placeholder -- add an isFullyValidUnicode assertion here
+        // once the five intended U+0Fxx codepoints have been chosen.
+    }
+
+    /** Checks conversion of syntactically legal Tibetan Unicode to THDL Wylie. */
+    void testSyntacticallyLegalUnicodeToThdlWylie() {
+        assertTrue("bskyagd".equals(
+            ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(bskyagd)));
+
+        assertTrue("bskyagd bskyagd/".equals(
+            ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
+                bskyagd + "\u0F0B" + bskyagd + "\u0F0D")));
+    }
+}