diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
index e119b7d..658c631 100644
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@@ -18,6 +18,8 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.tshegbar;
+import java.util.Vector;
+
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.util.ThdlDebug;
@@ -42,15 +44,15 @@ import org.thdl.util.ThdlDebug;
* exception is that 'i (i.e., the connective case marker), 'u, and
* 'o suffixes are permitted.
*
- *
It has at most one suffix, which is a single consonant or a
- * string consisting of 'i, 'u, 'o, 'am, and 'ang.
- *
- *
-DLC FIXME: we must allow many suffixes. See Andres' e-mail below:
+ * It has at most one suffix, which is a single consonant (the
+ * common case) or a string consisting of 'i, 'u, 'o, 'am, and
+ * 'ang.
+
+See Andres' e-mail below:
David,
-It is a particle that means "or" as opposed to "dang" that means and.
+['am] is a particle that means "or" as opposed to "dang" that means and.
"sgom pa'am" would mean "... or meditation"
@@ -65,6 +67,7 @@ And also there are cases where they combine. For ex you can have
Andres
+
*
*
* It may contain a EWC_sa or EWC_da postsuffix iff there exists
@@ -681,7 +684,7 @@ public final class LegalTshegBar
}
/** Like {@link
- * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}
+ * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)}
* but geared for the common case where the suffix is simply a
* consonant. */
public static boolean formsLegalTshegBar(char prefix,
@@ -1138,4 +1141,350 @@ public final class LegalTshegBar
public String toString() {
return toConciseXML();
}
+
+ /** Convenience overload: tests whether all of the
+  * UnicodeGraphemeClusters in grcls, taken together and in order,
+  * form a syntactically legal syllable.  This simply delegates to
+  * {@link #formsLegalTshegBar(Vector,int,int)} with the half-open
+  * index range [0, grcls.size()).
+  * @param grcls the grapheme clusters to test; must not be null
+  * @return whatever the three-argument overload returns for the
+  * whole of grcls */
+ private static boolean formsLegalTshegBar(Vector grcls) {
+     final int clusterCount = grcls.size();
+     return formsLegalTshegBar(grcls, 0, clusterCount);
+ }
+
+ /** Returns true iff the given UnicodeGraphemeClusters form a
+  * syntactically legal Tibetan syllable.  If one is null, it
+  * means that it is not present.
+  *
+  * <p>DLC FIXME: suffix handling is unfinished.  Passing a non-null
+  * suffix currently throws an Error once the root has been found
+  * to be legal Tibetan.</p>
+  *
+  * @param prefix the prefix, or null if there is none
+  * @param root the root stack; must not be null
+  * @param suffix the suffix, or null if there is none
+  * @param postsuffix the postsuffix, or null if there is none
+  * @return false if root is not legal Tibetan; otherwise the
+  * verdict of the char-based formsLegalTshegBar overload
+  * @exception IllegalArgumentException if root is null, or if
+  * postsuffix is non-null and suffix is null (these being clearly
+  * illegal)
+  */
+ private static boolean formsLegalTshegBar(UnicodeGraphemeCluster prefix,
+                                           UnicodeGraphemeCluster root,
+                                           UnicodeGraphemeCluster suffix,
+                                           UnicodeGraphemeCluster postsuffix)
+     throws IllegalArgumentException
+ {
+     // reality checks:
+     if (null == root)
+         throw new IllegalArgumentException("root letter is not present");
+     if (null != postsuffix && null == suffix)
+         throw new IllegalArgumentException("a postsuffix cannot occur without a suffix");
+
+     // handle root:
+     if (!root.isLegalTibetan())
+         return false;
+     char headLetter = root.getSuperscribedLetter();
+     char rootLetter = root.getRootCP();
+     char subjoinedLetter = root.getSoleNonWazurSubjoinedLetter();
+     char vowel = root.getVowel();
+     boolean hasAchung = root.hasAchung();
+     boolean hasWazur = root.hasWazur();
+
+     // handle prefix.  A null prefix means "not present", so it must
+     // not be dereferenced.  EW_ABSENT is the "no letter here"
+     // sentinel that takesGao() and friends use.
+     // NOTE(review): confirm that the char-based overload expects
+     // EW_ABSENT for an absent prefix and postsuffix.
+     char prefixLetter
+         = (null == prefix) ? EW_ABSENT : prefix.getSoleTibetanUnicodeCP();
+
+     // handle suffix:
+     String suffixString = null;
+     if (null != suffix) {
+         // DLC FIXME suffixString = suffix.getUnicodeInUsualOrder();
+         throw new Error("DLC FIXME");
+     }
+
+     // handle postsuffix (null means "not present", as for prefix):
+     char postsuffixLetter
+         = (null == postsuffix) ? EW_ABSENT : postsuffix.getSoleTibetanUnicodeCP();
+
+     return formsLegalTshegBar(prefixLetter, headLetter, rootLetter,
+                               subjoinedLetter, hasWazur, hasAchung,
+                               suffixString, postsuffixLetter, vowel, null);
+ }
+
+ /** Returns true iff the UnicodeGraphemeClusters in grcls with
+  * indices in the range [start, end) form a syntactically legal
+  * syllable.  If start is as large as end, false is returned.
+  *
+  * <p>The role of each cluster is not known in advance, so each
+  * assignment of roles (prefix, root, suffix, postsuffix) that is
+  * possible for the given number of clusters is tried in turn.
+  * Runs of more than four clusters are rejected outright.</p>
+  *
+  * @param grcls a Vector whose elements in [start, end) are
+  * UnicodeGraphemeClusters
+  * @param start index of the first cluster to consider
+  * @param end one more than the index of the last cluster to
+  * consider
+  * @return true iff some assignment of roles yields a legal
+  * syllable */
+ private static boolean formsLegalTshegBar(Vector grcls,
+                                           int start,
+                                           int end)
+ {
+     // The range is half-open, so its length is end - start.  (The
+     // operands used to be reversed, which made the count
+     // nonpositive and the result false for every valid range.)
+     int numGrcls = end - start;
+     if (numGrcls <= 0)
+         return false;
+     if (numGrcls == 1) {
+         // Option 1: (root)
+         // else: return false;
+
+         return formsLegalTshegBar(null,
+                                   (UnicodeGraphemeCluster)grcls.elementAt(start),
+                                   null, null);
+     } else if (numGrcls == 2) {
+         // Option 1: (prefix, root)
+         // Option 2: (root, suffix)
+         // else: return false;
+
+         return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
+                                    (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                    null,
+                                    null)
+                 || formsLegalTshegBar(null,
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                       null));
+     } else if (numGrcls == 3) {
+         // Option 1: (prefix, root, suffix)
+         // Option 2: (root, suffix, postsuffix)
+         // else: return false;
+
+         return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
+                                    (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                    (UnicodeGraphemeCluster)grcls.elementAt(start + 2),
+                                    null)
+                 || formsLegalTshegBar(null,
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                       (UnicodeGraphemeCluster)grcls.elementAt(start + 2)));
+     } else if (numGrcls == 4) {
+         // Only option: (prefix, root, suffix, postsuffix)
+         return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
+                                    (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
+                                    (UnicodeGraphemeCluster)grcls.elementAt(start + 2),
+                                    (UnicodeGraphemeCluster)grcls.elementAt(start + 3)));
+     } else {
+         // the largest has 'i ... DLC FIXME rethink -- even the case where numGrcls == 3 could be pa'am
+         return false;
+     }
+ }
+
+
+
+ /** Tells whether an EWC_ga prefix may precede the given Tibetan
+  * consonant stack (i.e., the combination of superscribed, root,
+  * and subscribed letters).  Only a bare root letter, with nothing
+  * superscribed and nothing subjoined, can qualify.
+  * @param head the nominal representation (see
+  * {@code isNominalRepresentationOfConsonant(char)}) of the
+  * superscribed letter, or EW_ABSENT if not present
+  * @param root the nominal representation of the root letter
+  * @param sub the nominal representation of the subjoined letter,
+  * or EW_ABSENT if not present
+  * @return true iff head and sub are both EW_ABSENT and root is
+  * one of the letters listed below */
+ static boolean takesGao(char head, char root, char sub) {
+     // Any superscribed or subjoined letter rules the prefix out.
+     if (head != EW_ABSENT || sub != EW_ABSENT) {
+         return false;
+     }
+     return (root == EWC_ca
+             || root == EWC_ta
+             || root == EWC_da
+             || root == EWC_tsa
+             || root == EWC_zha
+             || root == EWC_za
+             || root == EWC_ya
+             || root == EWC_sha
+             || root == EWC_sa
+             || root == EWC_nya
+             || root == EWC_na);
+ }
+
+ /** Tells whether an EWC_da prefix may precede the given Tibetan
+  * consonant stack (i.e., the combination of superscribed, root,
+  * and subscribed letters).  A stack with a superscribed letter
+  * never qualifies.
+  * @param head the nominal representation (see
+  * {@code isNominalRepresentationOfConsonant(char)}) of the
+  * superscribed letter, or EW_ABSENT if not present
+  * @param root the nominal representation of the root letter
+  * @param sub the nominal representation of the subjoined letter,
+  * or EW_ABSENT if not present
+  * @return true iff the (root, sub) combination is one of those
+  * listed below and head is EW_ABSENT */
+ static boolean takesDao(char head, char root, char sub) {
+     if (head != EW_ABSENT) {
+         return false;
+     }
+     if (sub == EW_ABSENT) {
+         // bare root letters
+         return (root == EWC_ka
+                 || root == EWC_ga
+                 || root == EWC_nga
+                 || root == EWC_pa
+                 || root == EWC_ba
+                 || root == EWC_ma);
+     }
+     // roots with a subjoined ya
+     final boolean okWithYa = (sub == EWC_ya)
+         && (root == EWC_ga
+             || root == EWC_pa
+             || root == EWC_ba
+             || root == EWC_ma);
+     // roots with a subjoined ra
+     final boolean okWithRa = (sub == EWC_ra)
+         && (root == EWC_ka
+             || root == EWC_ga
+             || root == EWC_ba
+             || root == EWC_pa);
+     return okWithYa || okWithRa;
+ }
+
+ /** Tells whether an EWC_achung prefix may precede the given
+  * Tibetan consonant stack (i.e., the combination of superscribed,
+  * root, and subscribed letters).  A stack with a superscribed
+  * letter never qualifies.
+  * @param head the nominal representation (see
+  * {@code isNominalRepresentationOfConsonant(char)}) of the
+  * superscribed letter, or EW_ABSENT if not present
+  * @param root the nominal representation of the root letter
+  * @param sub the nominal representation of the subjoined letter,
+  * or EW_ABSENT if not present
+  * @return true iff the (root, sub) combination is one of those
+  * listed below and head is EW_ABSENT */
+ static boolean takesAchungPrefix(char head, char root, char sub) {
+     if (head != EW_ABSENT) {
+         return false;
+     }
+     if (sub == EW_ABSENT) {
+         // bare root letters
+         return (root == EWC_ga
+                 || root == EWC_ja
+                 || root == EWC_da
+                 || root == EWC_ba
+                 || root == EWC_dza
+                 || root == EWC_kha
+                 || root == EWC_cha
+                 || root == EWC_tha
+                 || root == EWC_pha
+                 || root == EWC_tsha);
+     }
+     // roots with a subjoined ya
+     final boolean okWithYa = (sub == EWC_ya)
+         && (root == EWC_pha
+             || root == EWC_ba
+             || root == EWC_kha
+             || root == EWC_ga);
+     // roots with a subjoined ra
+     final boolean okWithRa = (sub == EWC_ra)
+         && (root == EWC_ba
+             || root == EWC_kha
+             || root == EWC_ga
+             || root == EWC_da
+             || root == EWC_pha);
+     return okWithYa || okWithRa;
+ }
+
+ /** Tells whether an EWC_ma prefix may precede the given Tibetan
+  * consonant stack (i.e., the combination of superscribed, root,
+  * and subscribed letters).  A stack with a superscribed letter
+  * never qualifies.
+  * @param head the nominal representation (see
+  * {@code isNominalRepresentationOfConsonant(char)}) of the
+  * superscribed letter, or EW_ABSENT if not present
+  * @param root the nominal representation of the root letter
+  * @param sub the nominal representation of the subjoined letter,
+  * or EW_ABSENT if not present
+  * @return true iff the (root, sub) combination is one of those
+  * listed below and head is EW_ABSENT */
+ static boolean takesMao(char head, char root, char sub) {
+     if (head != EW_ABSENT) {
+         return false;
+     }
+     if (sub == EW_ABSENT) {
+         // bare root letters
+         return (root == EWC_kha
+                 || root == EWC_ga
+                 || root == EWC_cha
+                 || root == EWC_ja
+                 || root == EWC_tha
+                 || root == EWC_tsha
+                 || root == EWC_da
+                 || root == EWC_dza
+                 || root == EWC_nga
+                 || root == EWC_nya
+                 || root == EWC_na);
+     }
+     // With a subjoined ya or ra, only kha and ga roots qualify.
+     final boolean subIsYaOrRa = (sub == EWC_ya || sub == EWC_ra);
+     final boolean rootIsKhaOrGa = (root == EWC_kha || root == EWC_ga);
+     return subIsYaOrRa && rootIsKhaOrGa;
+ }
+
+ /** Tells whether an EWC_ba prefix may precede the given Tibetan
+  * consonant stack (i.e., the combination of superscribed, root,
+  * and subscribed letters).  Unlike the other prefixes handled in
+  * this class, EWC_ba is accepted before some stacks that have a
+  * superscribed letter.
+  * @param head the nominal representation (see
+  * {@code isNominalRepresentationOfConsonant(char)}) of the
+  * superscribed letter, or EW_ABSENT if not present
+  * @param root the nominal representation of the root letter
+  * @param sub the nominal representation of the subjoined letter,
+  * or EW_ABSENT if not present
+  * @return true iff the (head, root, sub) combination is one of
+  * those listed below */
+ static boolean takesBao(char head, char root, char sub) {
+     // DLC ask Ten-lo la about Wazur.
+     final boolean noHead = (head == EW_ABSENT);
+     final boolean noSub = (sub == EW_ABSENT);
+
+     if (noHead && noSub) {
+         // bare root letters
+         return (root == EWC_ka
+                 || root == EWC_ca
+                 || root == EWC_ta
+                 || root == EWC_tsa
+                 || root == EWC_ga
+                 || root == EWC_nga
+                 || root == EWC_ja
+                 || root == EWC_nya
+                 || root == EWC_da
+                 || root == EWC_na
+                 || root == EWC_dza
+                 || root == EWC_zha
+                 || root == EWC_za
+                 || root == EWC_ra
+                 || root == EWC_la
+                 || root == EWC_sha);
+     }
+
+     if (noHead) {
+         // root plus subjoined letter, kra, e.g.
+         final boolean okWithYa = (sub == EWC_ya)
+             && (root == EWC_ka || root == EWC_ga);
+         final boolean okWithRa = (sub == EWC_ra)
+             && (root == EWC_ka || root == EWC_ga || root == EWC_sa);
+         final boolean okWithLa = (sub == EWC_la)
+             && (root == EWC_ka
+                 || root == EWC_za
+                 || root == EWC_ra
+                 || root == EWC_sa);
+         return okWithYa || okWithRa || okWithLa;
+     }
+
+     if (noSub) {
+         // superscribed letter plus root, ska, e.g.
+         final boolean okUnderSa = (head == EWC_sa)
+             && (root == EWC_ka
+                 || root == EWC_ga
+                 || root == EWC_nga
+                 || root == EWC_nya
+                 || root == EWC_ta
+                 || root == EWC_da
+                 || root == EWC_na
+                 || root == EWC_tsa);
+         final boolean okUnderRa = (head == EWC_ra)
+             && (root == EWC_ka
+                 || root == EWC_ga
+                 || root == EWC_nga
+                 || root == EWC_ja
+                 || root == EWC_nya
+                 || root == EWC_ta
+                 || root == EWC_da
+                 || root == EWC_na
+                 || root == EWC_tsa
+                 || root == EWC_dza);
+         final boolean okUnderLa = (head == EWC_la)
+             && (root == EWC_ta || root == EWC_da);
+         return okUnderSa || okUnderRa || okUnderLa;
+     }
+
+     // Superscribed, root, and subjoined letters all present: only
+     // ka and ga roots qualify, with a subjoined ya under a ra or sa
+     // head, or with a subjoined ra under a sa head.
+     final boolean rootIsKaOrGa = (root == EWC_ka || root == EWC_ga);
+     final boolean yaUnderRaOrSa = (sub == EWC_ya)
+         && (head == EWC_ra || head == EWC_sa);
+     final boolean raUnderSa = (sub == EWC_ra) && (head == EWC_sa);
+     return rootIsKaOrGa && (yaUnderRaOrSa || raUnderSa);
+ }
}
diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java
index c747304..88a7b01 100644
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java
@@ -279,4 +279,77 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
}
+
+ /** Tests that the rules concerning "which root letters take which
+ * prefixes?" are accurate. I got a list of such rules from a
+ * native Tibetan who has been kind enough to teach me the
+ * fundamentals of the Tibetan language, but I'm not sure where he
+ * got the list.
+ *
+ * Each of takesBao, takesGao, takesDao, takesMao, and
+ * takesAchungPrefix is spot-checked with (head, root, sub)
+ * triples, some expected to be accepted and some expected to be
+ * rejected.
+ */
+ public void testPrefixRules() {
+ // DLC FIXME how can we say that 0Fb2 is ok but 0fBc is not?
+ // The first assertion is the accepted sa-ka-ra stack.  The next
+ // five swap one or more of its letters for codepoints from the
+ // subjoined-consonant range (U+0FB6, U+0F90, U+0FB2) and must all
+ // be rejected.
+ assertTrue(LegalTshegBar.takesBao(EWC_sa, EWC_ka, EWC_ra));
+ assertTrue(!LegalTshegBar.takesBao('\u0FB6', EWC_ka, EWC_ra));
+ assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', EWC_ra));
+ assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', '\u0FB2'));
+ assertTrue(!LegalTshegBar.takesBao('\u0FB6', '\u0F90', EWC_ra));
+ assertTrue(!LegalTshegBar.takesBao(EWC_sa, EWC_ka, '\u0FB2'));
+
+
+ // takesBao: bare root, head + root, root + sub, and
+ // head + root + sub stacks, accepted ones first.
+ {
+ assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ka, EW_ABSENT));
+ assertTrue(LegalTshegBar.takesBao(EWC_la, EWC_da, EW_ABSENT));
+ assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_sa, EWC_ra));
+ assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ga, EWC_ra));
+ assertTrue(LegalTshegBar.takesBao(EWC_ra, EWC_ga, EWC_ya));
+
+ assertTrue(!LegalTshegBar.takesBao(EWC_ra, EWC_da, EWC_ya));
+ assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_ba, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesBao(EWC_la, EWC_nga, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_nga, EWC_ra));
+ }
+
+ // takesGao: only a bare root letter is expected to be accepted.
+ {
+ assertTrue(LegalTshegBar.takesGao(EW_ABSENT, EWC_ca, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EWC_ya));
+ assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EWC_ya));
+ }
+
+
+ // takesDao: stacks with a superscribed letter are expected to be
+ // rejected; some root + sub stacks are accepted.
+ {
+ assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_wa, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_nga, EWC_ya));
+ assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EWC_ya));
+
+ assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ga, EWC_ya));
+ assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EWC_ra));
+ }
+
+ // takesMao: same pattern of checks as for takesDao.
+ {
+ assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_ja, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_wa, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_nga, EWC_ya));
+ assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EWC_ya));
+
+ assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ya));
+ assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ra));
+ }
+
+ // takesAchungPrefix: same pattern of checks again.
+ {
+ assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ga, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ka, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_wa, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_nga, EWC_ya));
+ assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EW_ABSENT));
+ assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EWC_ya));
+
+ assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ba, EWC_ya));
+ assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_pha, EWC_ra));
+ }
+ }
}
diff --git a/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java b/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java
new file mode 100644
index 0000000..db7561b
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java
@@ -0,0 +1,51 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Thrown to say "this isn't legal Tibetan", not "this isn't a valid
+ *  sequence of Unicode".  DLC FIXMEDOC */
+class TibetanSyntaxException extends Exception {
+    /** This constructor creates an exception with a less than helpful
+     *  message for the end user.  Please don't use this constructor
+     *  for production code. */
+    TibetanSyntaxException() {
+        super("A Unicode input stream had a syntactically incorrect run of Tibetan.  For example, kha, i.e., U+0F41, is not an allowed prefix.  This run of Tibetan was not expected.");
+        // we can tell it wasn't expected, because this error message
+        // isn't very helpful, and one of the other constructors
+        // should've been used.
+    }
+
+    /** Creates an exception whose detail message is x.
+        @param x the detail message */
+    TibetanSyntaxException(String x) {
+        super(x);
+    }
+
+    /** Creates an exception whose detail message lists the
+        offending grapheme clusters.
+
+        @param grcls a Vector whose elements x are GraphemeClusters
+        where x is in the range [start, end)
+        @param start grcls.elementAt(start) is the first
+        GraphemeCluster in the syntactically incorrect stretch of
+        Tibetan.
+        @param end grcls.elementAt(end - 1) is the last
+        GraphemeCluster in the syntactically incorrect stretch of
+        Tibetan. */
+    TibetanSyntaxException(java.util.Vector grcls, int start, int end) {
+        // Vector is spelled out in full because this file has no
+        // import section.
+        super(describeRun(grcls, start, end));
+    }
+
+    /** Builds the detail message for the Vector-based constructor.
+     *  This never throws: a null Vector or an out-of-bounds range
+     *  just yields a less detailed message, so that a bad argument
+     *  cannot mask the syntax error being reported.
+     *  @return the detail message; never null */
+    private static String describeRun(java.util.Vector grcls,
+                                      int start, int end) {
+        StringBuffer msg
+            = new StringBuffer("A Unicode input stream had a syntactically incorrect run of Tibetan");
+        if (null == grcls) {
+            return msg.append('.').toString();
+        }
+        // Clamp to the elements that really exist.
+        int from = Math.max(0, start);
+        int to = Math.min(end, grcls.size());
+        if (from >= to) {
+            return msg.append('.').toString();
+        }
+        msg.append(" in grapheme clusters [").append(start)
+            .append(", ").append(end).append("): ");
+        for (int i = from; i < to; i++) {
+            if (i > from) {
+                msg.append(", ");
+            }
+            msg.append(String.valueOf(grcls.elementAt(i)));
+        }
+        return msg.toString();
+    }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java b/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java
new file mode 100644
index 0000000..5da8fe4
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java
@@ -0,0 +1,58 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+
+/** An immutable pair of (next state, action), used as an entry in a
+ *  state machine's transition table.  DLC FIXMEDOC */
+class TransitionInstruction implements UnicodeReadingStateMachineConstants {
+    /** the state (e.g., {@link #STATE_READY}) to which to transition
+     *  next */
+    private final int nextState;
+
+    /** the action to perform upon transition, either {@link
+     *  #ACTION_CONTINUES_GRAPHEME_CLUSTER}, {@link
+     *  #ACTION_BEGINS_NEW_GRAPHEME_CLUSTER}, or {@link
+     *  #ACTION_PREPEND_WITH_0F68} */
+    private final int action;
+
+    /** Creates an instruction.  The arguments are validated only by
+     *  assertions, so nothing is checked when assertions are
+     *  disabled.
+     *  @param nextState the state to enter; any STATE_ constant
+     *  except STATE_START
+     *  @param action one of the three ACTION_ constants */
+    TransitionInstruction(int nextState, int action) {
+        super();
+
+        assert(action == ACTION_CONTINUES_GRAPHEME_CLUSTER
+               || action == ACTION_BEGINS_NEW_GRAPHEME_CLUSTER
+               || action == ACTION_PREPEND_WITH_0F68);
+
+        // we start in the start state, but we can never return to
+        // it, so STATE_START is deliberately not a legal target.
+        assert(nextState == STATE_READY
+               || nextState == STATE_DIGIT
+               || nextState == STATE_STACKING
+               || nextState == STATE_STACKPLUSACHUNG
+               || nextState == STATE_PARTIALMARK);
+
+        this.nextState = nextState;
+        this.action = action;
+    }
+
+    /** Returns the action to perform upon transition. */
+    int getAction() { return action; }
+
+    /** Returns the state to which to transition next. */
+    int getNextState() { return nextState; }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java
new file mode 100644
index 0000000..e4a7418
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java
@@ -0,0 +1,174 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Constants and static routines (DLC still?) useful in writing state
+ * machines for transforming Unicode input into other forms.
+ *
+ * @author David Chandler
+ */
+interface UnicodeReadingStateMachineConstants {
+
+    /** Returns the codepoint class for cp, e.g. {@link #CC_SJC}.
+     *  The tests are made in order, so the single-codepoint classes
+     *  (U+0F82, U+0F8A, U+0F39, U+0F71) win over the range-based
+     *  ones.
+     *
+     *  <p>NOTE(review): a static method with a body is only legal
+     *  in an interface from Java 8 on, and getNFTHDL is not
+     *  declared in this interface -- confirm where this routine is
+     *  meant to live.</p>
+     *
+     *  @param cp a Unicode codepoint, which MUST be nondecomposable
+     *  if it is in the Tibetan range but can be from outside the
+     *  Tibetan range of Unicode
+     *  @return one of the CC_ constants declared below */
+    static int getCCForCP(char cp) {
+        assert(getNFTHDL(cp) == null);
+        if ('\u0F82' == cp) {
+            return CC_0F82;
+        } else if ('\u0F8A' == cp) {
+            return CC_0F8A;
+        } else if ('\u0F39' == cp) {
+            return CC_0F39;
+        } else if ('\u0F71' == cp) {
+            return CC_ACHUNG;
+        } else if ('\u0F40' <= cp && cp <= '\u0F6A') {
+            assert(cp != '\u0F48');
+            return CC_CON;
+        } else if ('\u0F90' <= cp && cp <= '\u0FBC') {
+            assert(cp != '\u0F98');
+            return CC_SJC;
+        } else if ('\u0F20' <= cp && cp <= '\u0F33') {
+            return CC_DIGIT;
+        } else if (/* DLC NOW do these combine ONLY with digits, or do CC_CM just NOT combine with digits? */
+                   '\u0F3E' == cp
+                   || '\u0F3F' == cp
+                   || '\u0F18' == cp
+                   || '\u0F19' == cp) {
+            return CC_MCWD;
+        } else if ('\u0FC6' == cp
+                   || '\u0F87' == cp
+                   || '\u0F86' == cp
+                   || '\u0F84' == cp
+                   || '\u0F83' == cp
+                   // U+0F82 is not tested here: it has its own
+                   // class, CC_0F82, and was handled above.
+                   || '\u0F7F' == cp
+                   || '\u0F7E' == cp
+                   || '\u0F37' == cp /* DLC NOW NORMALIZATION OF 0F10, 11 to 0F0F ??? */
+                   || '\u0F35' == cp) {
+            return CC_CM;
+        } else if ('\u0F72' == cp
+                   || '\u0F74' == cp
+                   || '\u0F7A' == cp
+                   || '\u0F7B' == cp
+                   || '\u0F7C' == cp
+                   || '\u0F7D' == cp
+                   || '\u0F80' == cp) {
+            // DLC what about U+0F84 ??? CC_V or CC_CM ?
+            return CC_V;
+        } else {
+            return CC_SIN;
+        }
+    }
+
+    // codepoint classes (CC_...) follow.  These are mutually
+    // exclusive, and their union is the whole of Unicode.
+
+    /** for everything else, i.e. non-Tibetan characters like U+0E00
+     *  and also Tibetan characters like U+0FCF and U+0F05 (DLC rename
+     *  SIN[GLETON] to OTHER as combining marks from outside the
+     *  Tibetan range count as this) but not U+0F8A */
+    static final int CC_SIN = 0;
+
+    /** for combining marks in the Tibetan range of Unicode that
+     *  combine with digits alone */
+    static final int CC_MCWD = 1;
+
+    /** for combining marks in the Tibetan range of Unicode, minus
+     *  CC_MCWD, U+0F82, and U+0F39 */
+    static final int CC_CM = 2;
+
+    /** for combining consonants, i.e. U+0F90-U+0FBC minus U+0F98
+     *  minus the decomposable entries like U+0F93, U+0F9D, U+0FA2,
+     *  etc. */
+    static final int CC_SJC = 3;
+
+    /** for noncombining consonants, i.e. U+0F40-U+0F6A minus U+0F48
+     *  minus the decomposable entries like U+0F43, U+0F4D, U+0F52,
+     *  etc. */
+    static final int CC_CON = 4;
+
+    /** for simple, nondecomposable vowels, i.e. U+0F72, U+0F74,
+     *  U+0F7A, U+0F7B, U+0F7C, U+0F7D, U+0F80 */
+    static final int CC_V = 5;
+
+    /** for U+0F8A */
+    static final int CC_0F8A = 6;
+
+    /** for U+0F82, which is treated like {@link #CC_CM} except after
+     *  U+0F8A */
+    static final int CC_0F82 = 7;
+
+    /** for U+0F39, an integral part of a consonant when it directly
+     *  follows a member of CC_CON or CC_SJC */
+    static final int CC_0F39 = 8;
+
+    /** for U+0F71 */
+    static final int CC_ACHUNG = 9;
+
+    /** for digits, i.e. U+0F20-U+0F33 */
+    static final int CC_DIGIT = 10;
+
+
+
+    // states STATE_...:
+
+    /** initial state */
+    static final int STATE_START = 0;
+
+    /** ready state, i.e. the state in which some non-empty Unicode
+     *  String is in the holding area, ready to receive
+     *  combining marks like U+0F35 */
+    static final int STATE_READY = 1;
+
+    /** digit state, i.e. the state in which some non-empty Unicode
+     *  String consisting entirely of digits is in the holding area,
+     *  ready to receive marks that combine only with digits */
+    static final int STATE_DIGIT = 2;
+
+    /** state in which CC_SJC are welcomed and treated as consonants
+     *  to be subscribed to the GraphemeCluster in holding. */
+    static final int STATE_STACKING = 3;
+
+    /** state in which one or more consonants have been seen and also
+     *  an achung (U+0F71) has been seen */
+    static final int STATE_STACKPLUSACHUNG = 4;
+
+    /** state that seeing U+0F8A (when that's not an error) puts you
+     *  in.  Needed because U+0F8A is always followed by U+0F82, and
+     *  we check for the exceptional case that U+0F8A is followed by
+     *  something else. */
+    static final int STATE_PARTIALMARK = 5;
+
+    /* DLC we should have many error states or none. */
+
+
+    /** the present codepoint marks the start of a new
+     *  GraphemeCluster */
+    static final int ACTION_BEGINS_NEW_GRAPHEME_CLUSTER = 0;
+    /** the present codepoint is a continuation of the current
+     *  GraphemeCluster */
+    static final int ACTION_CONTINUES_GRAPHEME_CLUSTER = 1;
+    /** there is an error in the input stream, which we are correcting
+     *  (as we are in error-correcting mode) by starting a new
+     *  GraphemeCluster with U+0F68 as the first codepoint and the
+     *  current codepoint as the second */
+    static final int ACTION_PREPEND_WITH_0F68 = 2;
+}
diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
new file mode 100644
index 0000000..4084444
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
@@ -0,0 +1,345 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
+    /** Private so that nobody instantiates this class of static members. */
+    private ValidatingUnicodeReader() { super(); }
+
+    /** Indexed [STATE_...][CC_...]; a null entry marks an illegal codepoint. */
+    private static final TransitionInstruction[][]
+        transitionTable /* 6 rows, one per STATE_... */
+                        /* 11 columns, one per CC_... class */
+        = {
+            // STATE_START:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                null,
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state (stack, as in all rows): */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state (then await U+0F82): */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                null,
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+
+            // STATE_READY:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY, // self
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null, // because 0F71 comes after SJCs, before Vs, and
+                      // before CMs.
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+            // STATE_DIGIT:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
+            },
+            // STATE_STACKING:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_ACHUNG in this state: */
+                new TransitionInstruction(STATE_STACKPLUSACHUNG,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+            // STATE_STACKPLUSACHUNG:
+            {
+                /* upon seeing CC_SIN in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                new TransitionInstruction(STATE_STACKING,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_V in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F8A in this state: */
+                new TransitionInstruction(STATE_PARTIALMARK,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                new TransitionInstruction(STATE_DIGIT,
+                                          ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
+            },
+            // STATE_PARTIALMARK:
+            {
+                /* upon seeing CC_SIN in this state: */
+                null,
+                /* upon seeing CC_MCWD in this state: */
+                null,
+                /* upon seeing CC_CM in this state: */
+                null,
+                /* upon seeing CC_SJC in this state: */
+                null,
+                /* upon seeing CC_CON in this state: */
+                null,
+                /* upon seeing CC_V in this state: */
+                null,
+                /* upon seeing CC_0F8A in this state: */
+                null,
+                /* upon seeing CC_0F82 in this state: */
+                new TransitionInstruction(STATE_READY,
+                                          ACTION_CONTINUES_GRAPHEME_CLUSTER),
+                /* upon seeing CC_0F39 in this state: */
+                null,
+                /* upon seeing CC_ACHUNG in this state: */
+                null,
+                /* upon seeing CC_DIGIT in this state: */
+                null
+            }
+        };
+
+    // DLC NOW -- clearly, we need LegalSyllable to be convertible to and from GraphemeClusters.
+
+ /** Breaks a sequence of GraphemeClusters into LegalSyllables.
+ @param grcls a sequence of nonnull GraphemeClusters
+ @return a sequence of nonnull LegalSyllables
+ @exception TibetanSyntaxException if grcls does not consist
+ entirely of legal Tibetan syllables
+ @see #GraphemeCluster
+ @see #LegalSyllable
+ */
+ private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
+ throws TibetanSyntaxException
+ {
+ return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
+ true);
+ }
+
+    /** Non-validating variant: runs that are not legal stay GraphemeClusters. */
+    private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls) {
+        try {
+            return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls, false);
+        } catch (TibetanSyntaxException ex) {
+            throw new Error("This can never happen, because the second parameter, validating, was false.");
+        }
+    }
+
+    /** Cuts grcls into runs that each end at a tsheg-like GraphemeCluster.
+        @param grcls a Vector consisting entirely of GraphemeClusters
+        @param validate true iff you wish to have a
+        TibetanSyntaxException thrown upon encountering a sequence of
+        GraphemeClusters that is syntactically incorrect Tibetan
+        @return LegalSyllables only if validate is true, else a mix of
+        LegalSyllables and GraphemeClusters
+        @exception TibetanSyntaxException if validate and a run is illegal */
+    private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
+                                                                                boolean validate)
+        throws TibetanSyntaxException
+    {
+        Vector syllables = new Vector();
+        int grcls_len = grcls.size();
+        int beginning_of_cluster = 0;
+        for (int i = 0; i < grcls_len; i++) {
+            GraphemeCluster current_grcl = (GraphemeCluster)grcls.elementAt(i);
+            if (current_grcl.isTshegLike()) {
+                if (beginning_of_cluster < i) {
+                    // One or more non-tsheg-like grapheme clusters is
+                    // here between tsheg-like grapheme clusters.  Is
+                    // it a legal syllable?
+                    if (LegalTshegBar.formsLegalTshegBar(grcls,
+                                                         beginning_of_cluster,
+                                                         i))
+                    {
+                        syllables.add(new LegalSyllable(grcls,
+                                                        beginning_of_cluster,
+                                                        i, current_grcl));
+                    }
+                    else
+                    {
+                        if (validate) {
+                            TibetanSyntaxException ex
+                                = new TibetanSyntaxException(grcls,
+                                                             beginning_of_cluster,
+                                                             i);
+                            // DLC: return an int -1 for "all good" or
+                            // 3 for "the fourth element is the first
+                            // bad one" but then you don't know that
+                            // 3-6 were the bad ones
+                            throw ex;
+                        } else {
+                            for (int j = beginning_of_cluster; j <= i; j++) {
+                                syllables.add(grcls.elementAt(j));
+                            }
+                        }
+                    }
+                }
+                beginning_of_cluster = i + 1;
+            } // else add current_grcl to the waiting list, in a sense
+        }
+        // NOTE(review): clusters after the last tsheg-like one are never emitted.
+        return syllables;
+    }
+
+    /** Breaks a string of perfectly-formed Unicode into
+        GraphemeClusters by stepping through transitionTable.
+        @param nfthdl_unicode a String of NFTHDL-normalized Unicode
+        codepoints
+        @exception Exception if the input is not perfectly formed
+        @return a vector of GraphemeClusters
+        @see GraphemeCluster */
+    private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
+        throws Exception
+    {
+        // a vector of GraphemeClusters that we build up little by little:
+        Vector grcls = new Vector();
+        int currentState = STATE_START;
+        StringBuffer holdingPen = new StringBuffer();
+        int ilen = nfthdl_unicode.length();
+        for (int i = 0; i < ilen; i++) {
+            char current_cp = nfthdl_unicode.charAt(i);
+            int cc_of_current_cp = getCCForCP(current_cp);
+            final TransitionInstruction ti
+                = transitionTable[currentState][cc_of_current_cp];
+            if (null == ti)
+                throw new Exception("Bad Unicode. DLC improve these messages");
+            switch (ti.getAction()) {
+            case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
+                if (holdingPen.length() > 0) // empty only before the first codepoint
+                    grcls.add(new GraphemeCluster(holdingPen));
+                holdingPen = new StringBuffer().append(current_cp);
+                break;
+            case ACTION_CONTINUES_GRAPHEME_CLUSTER:
+                holdingPen.append(current_cp);
+                break;
+            case ACTION_PREPEND_WITH_0F68:
+                throw new Error("This never happens inside the validating scanner.");
+            default:
+                throw new Error("Famous last words: This won't happen.");
+            }
+            currentState = ti.getNextState();
+        }
+        if (STATE_PARTIALMARK == currentState) // U+0F8A never got its U+0F82
+            throw new Exception("Bad Unicode. DLC improve these messages");
+        if (holdingPen.length() > 0) // flush the cluster still being held
+            grcls.add(new GraphemeCluster(holdingPen));
+        return grcls;
+    }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java
new file mode 100644
index 0000000..bf6ae3d
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java
@@ -0,0 +1,195 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Tests ValidatingUnicodeReader.
+ * @author David Chandler */
+class ValidatingUnicodeReaderTest {
+ private static String skyagd = "\u0F66\u0F90\u0FB1\u0F42\u0F51";
+ private static String bskyagd = "\u0F56" + skyagd;
+
+    /** Exercises the validity predicates on hand-picked Tibetan strings. */
+    void testValidatingUnicodeReader() {
+        // DLC these routines can be slow.
+        assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       bskyagd + "\u0F0C"));
+        assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       "\u0F42" + skyagd + "\u0F0C"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(bskyagd + "\u0F0C"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F42" + skyagd + "\u0F0C"));
+
+        assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       bskyagd + "\u0F0C\u0F62\u0F0B" + bskyagd + "\u0F0F"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F6A\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F62\u0F0B"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F6A\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F62\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F6A\u0F90\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F62\u0F90\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F62\u0F90\u0F0B"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F6A\u0F90\u0F0B"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F43"));
+        assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       "\u0F43"));
+
+        // The Unicode standard states that U+0F8A is always followed
+        // by U+0F82.
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A\u0F82"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A\u0F40"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A\u0F83"));
+
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F74"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F40\u0F74"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F90\u0F74"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0F77"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F40\u0F77"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F90\u0F77"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0F90\u0F7F"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0F90\u0F7F\u0F35"));
+
+        // Test that each singleton (except U+0F8A) in the Tibetan
+        // range is legal, and that each combining char and empty
+        // codepoint (and also U+0F8A) is illegal alone.
+        {
+            for (char cp = '\u0F00'; cp <= '\u0F17'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            for (char cp = '\u0F1a'; cp <= '\u0F34'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            for (char cp = '\u0F3a'; cp <= '\u0F3d'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            for (char cp = '\u0F40'; cp <= '\u0F47'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            for (char cp = '\u0F49'; cp <= '\u0F6a'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            for (char cp = '\u0F88'; cp <= '\u0F89'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            for (char cp = '\u0Fbe'; cp <= '\u0Fc5'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            for (char cp = '\u0Fc7'; cp <= '\u0Fcc'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F36"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F38"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F85"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F8b"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcf"));
+
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F48"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6b"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6c"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6d"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6e"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6f"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F70"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8c"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8d"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8e"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8f"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F98"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fbd"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcd"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fce"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fd0"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fe4"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Ff0"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fff"));
+        }
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0Fc6"));
+
+        // Test that combining characters that combine with both
+        // consonants and digits work.
+        {
+            String combiningMarks[] = new String[] {
+                "\u0F71",
+                "\u0F72",
+                "\u0F73",
+                "\u0F74",
+                "\u0F75",
+                "\u0F76",
+                "\u0F77",
+                "\u0F78",
+                "\u0F79",
+                "\u0F7a",
+                "\u0F7b",
+                "\u0F7c",
+                "\u0F7d",
+                "\u0F7e",
+                "\u0F7f",
+                "\u0F80",
+                "\u0F81",
+                "\u0F82",
+                "\u0F83",
+                "\u0F84",
+                "\u0F86",
+                "\u0F87"
+            };
+            for (int i = 0; i < combiningMarks.length; i++) {
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F40" + combiningMarks[i]));
+                // DLC have a group that works with both digits and consonants, cuz vowels plus digits is a no go, right?
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F20" + combiningMarks[i]));
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F30" + combiningMarks[i]));
+            }
+        }
+
+        // DLC: finish this placeholder -- its incomplete Unicode escapes
+        // cannot compile, even inside a comment, so they are spelled out:
+        // assertTrue(isFullyValidUnicode(<five U+0F?? codepoints>));
+    }
+
+    void testSyntacticallyLegalUnicodeToThdlWylie() {
+        // One bare tsheg bar; then two, with U+0F0B between and U+0F0D after.
+        String wylie = ValidatingUnicodeReader
+            .syntacticallyLegalTibetanUnicodeToThdlWylie(bskyagd);
+        assertTrue("bskyagd".equals(wylie));
+        wylie = ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
+            bskyagd + "\u0F0B" + bskyagd + "\u0F0D");
+        assertTrue("bskyagd bskyagd/".equals(wylie));
+    }
+}