From daacf6ee3b51b76df01ed23479f0bc23a50608c4 Mon Sep 17 00:00:00 2001 From: dchandler Date: Sat, 12 Apr 2003 20:56:20 +0000 Subject: [PATCH] I've got too many sandboxes, so I'm committing these changes, half-done, from one sandbox so as to consolidate my sandboxes. --- .../thdl/tib/text/tshegbar/LegalTshegBar.java | 363 +++++++++++++++++- .../tib/text/tshegbar/LegalTshegBarTest.java | 73 ++++ .../text/tshegbar/TibetanSyntaxException.java | 51 +++ .../text/tshegbar/TransitionInstruction.java | 58 +++ .../UnicodeReadingStateMachineConstants.java | 174 +++++++++ .../tshegbar/ValidatingUnicodeReader.java | 345 +++++++++++++++++ .../tshegbar/ValidatingUnicodeReaderTest.java | 195 ++++++++++ 7 files changed, 1252 insertions(+), 7 deletions(-) create mode 100644 source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java create mode 100644 source/org/thdl/tib/text/tshegbar/TransitionInstruction.java create mode 100644 source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java create mode 100644 source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java create mode 100644 source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index e119b7d..658c631 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -18,6 +18,8 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.tshegbar; +import java.util.Vector; + import org.thdl.tib.text.THDLWylieConstants; import org.thdl.util.ThdlDebug; @@ -42,15 +44,15 @@ import org.thdl.util.ThdlDebug; * exception is that 'i (i.e., the connective case marker), 'u, and * 'o suffixes are permitted. * - *
  • It has at most one suffix, which is a single consonant or a - * string consisting of 'i, 'u, 'o, 'am, and 'ang.
  • - * - * -DLC FIXME: we must allow many suffixes. See Andres' e-mail below: + *
  • It has at most one suffix, which is a single consonant (the + * common case) or a string consisting of 'i, 'u, 'o, 'am, and + * 'ang. + +

    See Andres' e-mail below:

     David,
     
    -It is a particle that means "or" as opposed to "dang" that means and.
    +['am] is a particle that means "or" as opposed to "dang" that means and.
     
     "sgom pa'am" would mean "... or meditation"
     
    @@ -65,6 +67,7 @@ And also there are cases where they combine. For ex you can have
     
     	Andres 
     
    +
  • * * *
  • It may contain a EWC_sa or EWC_da postsuffix iff there exists @@ -681,7 +684,7 @@ public final class LegalTshegBar } /** Like {@link - * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)} + * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)} * but geared for the common case where the suffix is simply a * consonant. */ public static boolean formsLegalTshegBar(char prefix, @@ -1138,4 +1141,350 @@ public final class LegalTshegBar public String toString() { return toConciseXML(); } + + /** Shortcut for {@link #formsLegalTshegBar(Vector,int,int)} applied to all of grcls. */ + private static boolean formsLegalTshegBar(Vector grcls) { + return formsLegalTshegBar(grcls, 0, grcls.size()); + } + + /** FIXMEDOC DLC + * + * Returns true iff the given UnicodeGraphemeClusters form a + * syntactically legal Tibetan syllable. If one is null, it + * means that it is not present. + * + * @exception IllegalArgumentException if root is null, or if + * postsuffix is non-null and suffix is null (these being clearly + * illegal) + */ + private static boolean formsLegalTshegBar(UnicodeGraphemeCluster prefix, + UnicodeGraphemeCluster root, + UnicodeGraphemeCluster suffix, + UnicodeGraphemeCluster postsuffix) + throws IllegalArgumentException + { + // reality checks: + if (null == root) + throw new IllegalArgumentException("root letter is not present"); + if (null != postsuffix && null == suffix) + throw new IllegalArgumentException("a postsuffix cannot occur without a suffix"); + + // handle root: + if (!root.isLegalTibetan()) + return false; + char headLetter = root.getSuperscribedLetter(); + char rootLetter = root.getRootCP(); + char subjoinedLetter = root.getSoleNonWazurSubjoinedLetter(); + char vowel = root.getVowel(); + boolean hasAchung = root.hasAchung(); + boolean hasWazur = root.hasWazur(); + + // handle prefix; null means not present (assumes EW_ABSENT is the absent marker the char-based overload expects -- confirm): + char prefixLetter = (null == prefix) ? EW_ABSENT : prefix.getSoleTibetanUnicodeCP(); + + // handle suffix: + String suffixString = null; + if (null != suffix) { + // DLC FIXME suffixString = 
suffix.getUnicodeInUsualOrder(); + throw new Error("DLC FIXME"); + } + + // handle postsuffix; null means not present (assumes EW_ABSENT is the absent marker the char-based overload expects -- confirm): + char postsuffixLetter = (null == postsuffix) ? EW_ABSENT : postsuffix.getSoleTibetanUnicodeCP(); + + return formsLegalTshegBar(prefixLetter, headLetter, rootLetter, + subjoinedLetter, hasWazur, hasAchung, + suffixString, postsuffixLetter, vowel, null); + } + + /** Returns true iff the UnicodeGraphemeClusters in grcls with + * indices in the range [start, end) form a syntactically legal + * syllable. If start is as large as end, false is returned. */ + private static boolean formsLegalTshegBar(Vector grcls, + int start, + int end) + { + int numGrcls = end - start; // size of the half-open range [start, end) + if (numGrcls <= 0) + return false; + if (numGrcls == 1) { + // Option 1: (root) + // else: return false; + + return formsLegalTshegBar(null, + (UnicodeGraphemeCluster)grcls.elementAt(start), + null, null); + } else if (numGrcls == 2) { + // Option 1: (prefix, root) + // Option 2: (root, suffix) + // else: return false; + + return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start), + (UnicodeGraphemeCluster)grcls.elementAt(start + 1), + null, + null) + || formsLegalTshegBar(null, + (UnicodeGraphemeCluster)grcls.elementAt(start), + (UnicodeGraphemeCluster)grcls.elementAt(start + 1), + null)); + } else if (numGrcls == 3) { + // Option 1: (prefix, root, suffix) + // Option 2: (root, suffix, postsuffix) + // else: return false; + + return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start), + (UnicodeGraphemeCluster)grcls.elementAt(start + 1), + (UnicodeGraphemeCluster)grcls.elementAt(start + 2), + null) + || formsLegalTshegBar(null, + (UnicodeGraphemeCluster)grcls.elementAt(start), + (UnicodeGraphemeCluster)grcls.elementAt(start + 1), + (UnicodeGraphemeCluster)grcls.elementAt(start + 2))); + } else if (numGrcls == 4) { + return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start), + (UnicodeGraphemeCluster)grcls.elementAt(start + 1), + (UnicodeGraphemeCluster)grcls.elementAt(start + 2), + 
(UnicodeGraphemeCluster)grcls.elementAt(start + 3))); + } else { + // the largest has 'i ... DLC FIXME rethink -- even the case where numGrcls == 3 could be pa'am + return false; + } + } + + + + /** Returns true if the given Tibetan consonant stack (i.e., the + * combination of superscribed, root, and subscribed letters) + * takes an EWC_ga prefix. + * @param head the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the superscribed letter, or EW_ABSENT if + * not present + * @param root the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the root letter + * @param sub the {@link isNominalRepresentationOfConsonant(char) + * nominal representation} of the subjoined letter, or EW_ABSENT + * if not present */ + static boolean takesGao(char head, char root, char sub) { + if (EW_ABSENT == head) { + if (EW_ABSENT == sub) { + return (EWC_ca == root + || EWC_ta == root + || EWC_da == root + || EWC_tsa == root + || EWC_zha == root + || EWC_za == root + || EWC_ya == root + || EWC_sha == root + || EWC_sa == root + || EWC_nya == root + || EWC_na == root); + } + } + return false; + } + + /** Returns true if the given Tibetan consonant stack (i.e., the + * combination of superscribed, root, and subscribed letters) + * takes an EWC_da prefix. 
+ * @param head the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the superscribed letter, or EW_ABSENT if + * not present + * @param root the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the root letter + * @param sub the {@link isNominalRepresentationOfConsonant(char) + * nominal representation} of the subjoined letter, or EW_ABSENT + * if not present */ + static boolean takesDao(char head, char root, char sub) { + if (EW_ABSENT == head) { + if (EW_ABSENT == sub) { + return (EWC_ka == root + || EWC_ga == root + || EWC_nga == root + || EWC_pa == root + || EWC_ba == root + || EWC_ma == root); + } else { + return ((EWC_ga == root && EWC_ya == sub) + || (EWC_pa == root && EWC_ya == sub) + || (EWC_ba == root && EWC_ya == sub) + || (EWC_ma == root && EWC_ya == sub) + + || (EWC_ka == root && EWC_ra == sub) + || (EWC_ga == root && EWC_ra == sub) + || (EWC_ba == root && EWC_ra == sub) + || (EWC_pa == root && EWC_ra == sub)); + } + } else { + return false; + } + } + + /** Returns true if the given Tibetan consonant stack (i.e., the + * combination of superscribed, root, and subscribed letters) + * takes an EWC_achung prefix. 
+ * @param head the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the superscribed letter, or EW_ABSENT if + * not present + * @param root the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the root letter + * @param sub the {@link isNominalRepresentationOfConsonant(char) + * nominal representation} of the subjoined letter, or EW_ABSENT + * if not present */ + static boolean takesAchungPrefix(char head, char root, char sub) { + if (EW_ABSENT == head) { + if (EW_ABSENT == sub) { + return (EWC_ga == root + || EWC_ja == root + || EWC_da == root + || EWC_ba == root + || EWC_dza == root + || EWC_kha == root + || EWC_cha == root + || EWC_tha == root + || EWC_pha == root + || EWC_tsha == root); + } else { + return ((EWC_pha == root && EWC_ya == sub) + || (EWC_ba == root && EWC_ya == sub) + || (EWC_kha == root && EWC_ya == sub) + || (EWC_ga == root && EWC_ya == sub) + + || (EWC_ba == root && EWC_ra == sub) + || (EWC_kha == root && EWC_ra == sub) + || (EWC_ga == root && EWC_ra == sub) + || (EWC_da == root && EWC_ra == sub) + || (EWC_pha == root && EWC_ra == sub)); + } + } else { + return false; + } + } + + /** Returns true if the given Tibetan consonant stack (i.e., the + * combination of superscribed, root, and subscribed letters) + * takes an EWC_ma prefix. 
+ * @param head the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the superscribed letter, or EW_ABSENT if + * not present + * @param root the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the root letter + * @param sub the {@link isNominalRepresentationOfConsonant(char) + * nominal representation} of the subjoined letter, or EW_ABSENT + * if not present */ + static boolean takesMao(char head, char root, char sub) { + if (EW_ABSENT == head) { + if (EW_ABSENT == sub) { + return (EWC_kha == root + || EWC_ga == root + || EWC_cha == root + || EWC_ja == root + || EWC_tha == root + || EWC_tsha == root + || EWC_da == root + || EWC_dza == root + || EWC_nga == root + || EWC_nya == root + || EWC_na == root); + } else { + return ((EWC_kha == root && EWC_ya == sub) + || (EWC_ga == root && EWC_ya == sub) + + || (EWC_kha == root && EWC_ra == sub) + || (EWC_ga == root && EWC_ra == sub)); + } + } else { + return false; + } + } + + /** Returns true if the given Tibetan consonant stack (i.e., the + * combination of superscribed, root, and subscribed letters) + * takes an EWC_ba prefix. + * @param head the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the superscribed letter, or EW_ABSENT if + * not present + * @param root the {@link + * isNominalRepresentationOfConsonant(char) nominal + * representation} of the root letter + * @param sub the {@link isNominalRepresentationOfConsonant(char) + * nominal representation} of the subjoined letter, or EW_ABSENT + * if not present */ + static boolean takesBao(char head, char root, char sub) { + // DLC ask Ten-lo la about Wazur. 
+ if (EW_ABSENT == head) { + if (EW_ABSENT == sub) { + return (EWC_ka == root + || EWC_ca == root + || EWC_ta == root + || EWC_tsa == root + || EWC_ga == root + || EWC_nga == root + || EWC_ja == root + || EWC_nya == root + || EWC_da == root + || EWC_na == root + || EWC_dza == root + || EWC_zha == root + || EWC_za == root + || EWC_ra == root + || EWC_la == root + || EWC_sha == root); + } else { + // kra, e.g. + return ((EWC_ka == root && EWC_ya == sub) + || (EWC_ga == root && EWC_ya == sub) + + || (EWC_ka == root && EWC_ra == sub) + || (EWC_ga == root && EWC_ra == sub) + || (EWC_sa == root && EWC_ra == sub) + + || (EWC_ka == root && EWC_la == sub) + || (EWC_za == root && EWC_la == sub) + || (EWC_ra == root && EWC_la == sub) + || (EWC_sa == root && EWC_la == sub)); + } + } else { + if (EW_ABSENT == sub) { + // ska, e.g. + return ((EWC_sa == head && EWC_ka == root) + || (EWC_sa == head && EWC_ga == root) + || (EWC_sa == head && EWC_nga == root) + || (EWC_sa == head && EWC_nya == root) + || (EWC_sa == head && EWC_ta == root) + || (EWC_sa == head && EWC_da == root) + || (EWC_sa == head && EWC_na == root) + || (EWC_sa == head && EWC_tsa == root) + + || (EWC_ra == head && EWC_ka == root) + || (EWC_ra == head && EWC_ga == root) + || (EWC_ra == head && EWC_nga == root) + || (EWC_ra == head && EWC_ja == root) + || (EWC_ra == head && EWC_nya == root) + || (EWC_ra == head && EWC_ta == root) + || (EWC_ra == head && EWC_da == root) + || (EWC_ra == head && EWC_na == root) + || (EWC_ra == head && EWC_tsa == root) + || (EWC_ra == head && EWC_dza == root) + + || (EWC_la == head && EWC_ta == root) + || (EWC_la == head && EWC_da == root)); + } else { + return ((EWC_ra == head && EWC_ka == root && EWC_ya == sub) + || (EWC_ra == head && EWC_ga == root && EWC_ya == sub) + || (EWC_sa == head && EWC_ka == root && EWC_ya == sub) + || (EWC_sa == head && EWC_ga == root && EWC_ya == sub) + || (EWC_sa == head && EWC_ka == root && EWC_ra == sub) + || (EWC_sa == head && EWC_ga == root && EWC_ra 
== sub)); + } + } + } } diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java index c747304..88a7b01 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBarTest.java @@ -279,4 +279,77 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants { assertTrue(!LegalTshegBar.isAchungBasedSuffix("")); } + + /** Tests that the rules concerning "which root letters take which + * prefixes?" are accurate. I got a list of such rules from a + * native Tibetan who has been kind enough to teach me the + * fundamentals of the Tibetan language, but I'm not sure where he + * got the list. + */ + public void testPrefixRules() { + // DLC FIXME how can we say that 0Fb2 is ok but 0fBc is not? + assertTrue(LegalTshegBar.takesBao(EWC_sa, EWC_ka, EWC_ra)); + assertTrue(!LegalTshegBar.takesBao('\u0FB6', EWC_ka, EWC_ra)); + assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', EWC_ra)); + assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', '\u0FB2')); + assertTrue(!LegalTshegBar.takesBao('\u0FB6', '\u0F90', EWC_ra)); + assertTrue(!LegalTshegBar.takesBao(EWC_sa, EWC_ka, '\u0FB2')); + + + { + assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ka, EW_ABSENT)); + assertTrue(LegalTshegBar.takesBao(EWC_la, EWC_da, EW_ABSENT)); + assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_sa, EWC_ra)); + assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ga, EWC_ra)); + assertTrue(LegalTshegBar.takesBao(EWC_ra, EWC_ga, EWC_ya)); + + assertTrue(!LegalTshegBar.takesBao(EWC_ra, EWC_da, EWC_ya)); + assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_ba, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesBao(EWC_la, EWC_nga, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_nga, EWC_ra)); + } + + { + assertTrue(LegalTshegBar.takesGao(EW_ABSENT, EWC_ca, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EW_ABSENT)); + 
assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EWC_ya)); + assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EWC_ya)); + } + + + { + assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_wa, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_nga, EWC_ya)); + assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EWC_ya)); + + assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ga, EWC_ya)); + assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EWC_ra)); + } + + { + assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_ja, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_wa, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_nga, EWC_ya)); + assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EWC_ya)); + + assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ya)); + assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ra)); + } + + { + assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ga, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ka, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_wa, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_nga, EWC_ya)); + assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EW_ABSENT)); + assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EWC_ya)); + + assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ba, EWC_ya)); + assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_pha, EWC_ra)); + } + } } diff --git a/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java b/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java new file mode 100644 index 0000000..db7561b --- /dev/null +++ 
b/source/org/thdl/tib/text/tshegbar/TibetanSyntaxException.java @@ -0,0 +1,51 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +/** DLC FIXMEDOC: says "this isn't legal Tibetan", not "this isn't a valid sequence of Unicode" */ +class TibetanSyntaxException extends Exception { + /** This constructor creates an exception with a less than helpful + * message for the end user. Please don't use this constructor + * for production code. */ + TibetanSyntaxException() { + super("A Unicode input stream had a syntactically incorrect run of Tibetan. For example, kha, i.e., U+0F41, is not an allowed prefix. This run of Tibetan was not expected."); + // we can tell it wasn't expected, because this error message + // isn't very helpful, and one of the other constructors + // should've been used. + } + + /** DLC FIXMEDOC */ + TibetanSyntaxException(String x) { + super(x); + } + + /** DLC FIXMEDOC + + @param grcls a Vector whose elements x are GraphemeClusters + where x is in the range [start, end) + @param start grcls.elementAt(start) is the first + GraphemeCluster in the syntactically incorrect stretch of + Tibetan. + @param end grcls.elementAt(end - 1) is the last + GraphemeCluster in the syntactically incorrect stretch of + Tibetan. 
*/ + TibetanSyntaxException(Vector grcls, int start, int end) { + DLC NOW; + } +} diff --git a/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java b/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java new file mode 100644 index 0000000..5da8fe4 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/TransitionInstruction.java @@ -0,0 +1,58 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + + +/** DLC FIXMEDOC */ +class TransitionInstruction implements UnicodeReadingStateMachineConstants { + private TransitionInstruction() { super(); } + TransitionInstruction(int nextState, int action) { + super(); + + assert(action == ACTION_CONTINUES_GRAPHEME_CLUSTER + || action == ACTION_BEGINS_NEW_GRAPHEME_CLUSTER + || action == ACTION_PREPEND_WITH_0F68); + + assert(nextState == STATE_START + || nextState == STATE_READY + || nextState == STATE_DIGIT + || nextState == STATE_STACKING + || nextState == STATE_STACKPLUSACHUNG + || nextState == STATE_PARTIALMARK); + + // we start in the start state, but we can never return to it. 
+ assert(nextState != STATE_START); + + this.nextState = nextState; + this.action = action; + } + + /** the state (e.g., {@link #STATE_READY}) to which to transition + * next */ + private int nextState; + + /** the action to perform upon transition, either {@link + * #ACTION_CONTINUES_GRAPHEME_CLUSTER}, {@link + * #ACTION_BEGINS_NEW_GRAPHEME_CLUSTER}, or {@link + * #ACTION_PREPEND_WITH_0F68} */ + private int action; + + int getAction() { return action; } + int getNextState() { return nextState; } +} diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java new file mode 100644 index 0000000..e4a7418 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/UnicodeReadingStateMachineConstants.java @@ -0,0 +1,174 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +/** Constants and static routines (DLC still?) useful in writing state + * machines for transforming Unicode input into other forms. + * + * @author David Chandler + */ +interface UnicodeReadingStateMachineConstants { + + /** Returns the codepoint class for cp, e.g. {@link #CC_SJC}. 
+ * @param cp a Unicode codepoint, which MUST be nondecomposable + * if it is in the Tibetan range but can be from outside the + * Tibetan range of Unicode */ + static int getCCForCP(char cp) { + assert(getNFTHDL(cp) == null); + if ('\u0F82' == cp) { + return CC_0F82; + } else if ('\u0F8A' == cp) { + return CC_0F8A; + } else if ('\u0F39' == cp) { + return CC_0F39; + } else if ('\u0F71' == cp) { + return CC_ACHUNG; + } else if ('\u0F40' <= cp && cp <= '\u0F6A') { + assert(cp != '\u0F48'); + return CC_CON; + } else if ('\u0F90' <= cp && cp <= '\u0FBC') { + assert(cp != '\u0F98'); + return CC_SJC; + } else if ('\u0F20' <= cp && cp <= '\u0F33') { + return CC_DIGIT; + } else if (/* DLC NOW do these combine ONLY with digits, or do CC_CM just NOT combine with digits? */ + '\u0F3E' == cp + || '\u0F3F' == cp + || '\u0F18' == cp + || '\u0F19' == cp) { + return CC_MCWD; + } else if ('\u0FC6' == cp + || '\u0F87' == cp + || '\u0F86' == cp + || '\u0F84' == cp + || '\u0F83' == cp + || '\u0F82' == cp + || '\u0F7F' == cp + || '\u0F7E' == cp + || '\u0F37' == cp /* DLC NOW NORMALIZATION OF 0F10, 11 to 0F0F ??? */ + || '\u0F35' == cp) { + return CC_CM; + } else if ('\u0F72' == cp + || '\u0F74' == cp + || '\u0F7A' == cp + || '\u0F7B' == cp + || '\u0F7C' == cp + || '\u0F7D' == cp + || '\u0F80' == cp) { + // DLC what about U+0F84 ??? CC_V or CC_CM ? + return CC_V; + } else { + return CC_SIN; + } + } + + // codepoint classes (CC_...) follow. These are mutually + // exclusive, and their union is the whole of Unicode. + + /** for everything else, i.e. 
non-Tibetan characters like U+0E00 + * and also Tibetan characters like U+0FCF and U+0F05 (DLC rename + * SIN[GLETON] to OTHER as combining marks from outside the + * Tibetan range count as this) but not U+0F8A */ + static final int CC_SIN = 0; + + /** for combining marks in the Tibetan range of Unicode that + * combine with digits alone */ + static final int CC_MCWD = 1; + + /** for combining marks in the Tibetan range of Unicode, minus + * CC_MCWD, U+0F82, and U+0F39 */ + static final int CC_CM = 2; + + /** for combining consonants, i.e. U+0F90-U+0FBC minus U+0F98 + * minus the decomposable entries like U+0F93, U+0F9D, U+0FA2, + * etc. */ + static final int CC_SJC = 3; + + /** for noncombining consonants, i.e. U+0F40-U+0F6A minus U+0F48 + * minus the decomposable entries like U+0F43, U+0F4D, U+0F52, + * etc. */ + static final int CC_CON = 4; + + /** for simple, nondecomposable vowels, i.e. U+0F72, U+0F74, + * U+0F7A, U+0F7B, U+0F7C, U+0F7D, U+0F80 */ + static final int CC_V = 5; + + /** for U+0F8A */ + static final int CC_0F8A = 6; + + /** for U+0F82, which is treated like {@link #CC_CM} except after + * U+0F8A */ + static final int CC_0F82 = 7; + + /** for U+0F39, an integral part of a consonant when it directly + * follows a member of {@link #CC_CON} or {@link #CC_SJC} */ + static final int CC_0F39 = 8; + + /** for U+0F71, the achung */ + static final int CC_ACHUNG = 9; + + /** for digits, i.e. U+0F20-U+0F33 */ + static final int CC_DIGIT = 10; + + + + // states STATE_...: + + /** initial state */ + static final int STATE_START = 0; + + /** ready state, i.e. the state in which some non-empty Unicode + * String is in the holding area, ready to receive + * combining marks like U+0F35 */ + static final int STATE_READY = 1; + + /** digit state, i.e. 
the state in which some non-empty Unicode + * String consisting entirely of digits is in the holding area, + * ready to receive marks that combine only with digits */ + static final int STATE_DIGIT = 2; + + /** state in which CC_SJC are welcomed and treated as consonants + * to be subscribed to the GraphemeCluster in holding. */ + static final int STATE_STACKING = 3; + + /** state in which one or more consonants have been seen and also + * an achung (U+0F71) has been seen */ + static final int STATE_STACKPLUSACHUNG = 4; + + /** state that seeing U+0F8A (when that's not an error) puts you + * in. Needed because U+0F8A is always followed by U+0F82, and + * we check for the exceptional case that U+0F8A is followed by + * something else. */ + static final int STATE_PARTIALMARK = 5; + + /* DLC we should have many error states or none. */ + + + /** the present codepoint marks the start of a new + * GraphemeCluster */ + static final int ACTION_BEGINS_NEW_GRAPHEME_CLUSTER = 0; + /** the present codepoint is a continuation of the current + * GraphemeCluster */ + static final int ACTION_CONTINUES_GRAPHEME_CLUSTER = 1; + /** there is an error in the input stream, which we are correcting + * (as we are in error-correcting mode) by starting a new + * GraphemeCluster with U+0F68 as the first codepoint and the + * current codepoint as the second */ + static final int ACTION_PREPEND_WITH_0F68 = 2; +} diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java new file mode 100644 index 0000000..4084444 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java @@ -0,0 +1,345 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). 
+ +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants { + /** Don't instantiate this class. */ + private ValidatingUnicodeReader() { super(); } + + /** This table tells how to transition from state a 6 states + error state */ + private static final TransitionInstruction + transitionTable[/* 6 == number of STATEs */] + [/* 11 == number of CC classes */] + = { + // STATE_START: + { + /* upon seeing CC_SIN in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_MCWD in this state: */ + null, + /* upon seeing CC_CM in this state: */ + null, + /* upon seeing CC_SJC in this state: */ + null, + /* upon seeing CC_CON in this state: */ + new TransitionInstruction(STATE_READY, // NOTE(review): every other row sends CC_CON to STATE_STACKING -- confirm + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_V in this state: */ + null, + /* upon seeing CC_0F8A in this state: */ + new TransitionInstruction(STATE_READY, // NOTE(review): every other non-null row sends CC_0F8A to STATE_PARTIALMARK -- confirm + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_0F82 in this state: */ + null, + /* upon seeing CC_0F39 in this state: */ + null, + /* upon seeing CC_ACHUNG in this state: */ + null, + /* upon seeing CC_DIGIT in this state: */ + new TransitionInstruction(STATE_DIGIT, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) + }, + + // STATE_READY: + { + /* upon seeing CC_SIN in this state: */ + new TransitionInstruction(STATE_READY, // self + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_MCWD in this state: */ + null, + /* upon seeing CC_CM in this state: */ + new 
TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_SJC in this state: */ + null, + /* upon seeing CC_CON in this state: */ + new TransitionInstruction(STATE_STACKING, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_V in this state: */ + null, + /* upon seeing CC_0F8A in this state: */ + new TransitionInstruction(STATE_PARTIALMARK, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_0F82 in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_0F39 in this state: */ + null, + /* upon seeing CC_ACHUNG in this state: */ + null, // because 0F71 comes after SJCs, before Vs, and + // before CMs. + /* upon seeing CC_DIGIT in this state: */ + new TransitionInstruction(STATE_DIGIT, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) + }, + // STATE_DIGIT: + { + /* upon seeing CC_SIN in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_MCWD in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_CM in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_SJC in this state: */ + null, + /* upon seeing CC_CON in this state: */ + new TransitionInstruction(STATE_STACKING, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_V in this state: */ + null, + /* upon seeing CC_0F8A in this state: */ + new TransitionInstruction(STATE_PARTIALMARK, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_0F82 in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_0F39 in this state: */ + null, + /* upon seeing CC_ACHUNG in this state: */ + null, + /* upon seeing CC_DIGIT in this state: */ + new TransitionInstruction(STATE_DIGIT, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */ + }, + // 
STATE_STACKING: + { + /* upon seeing CC_SIN in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_MCWD in this state: */ + null, + /* upon seeing CC_CM in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_SJC in this state: */ + new TransitionInstruction(STATE_STACKING, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_CON in this state: */ + new TransitionInstruction(STATE_STACKING, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_V in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_0F8A in this state: */ + new TransitionInstruction(STATE_PARTIALMARK, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_0F82 in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_0F39 in this state: */ + new TransitionInstruction(STATE_STACKING, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_ACHUNG in this state: */ + new TransitionInstruction(STATE_STACKPLUSACHUNG, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_DIGIT in this state: */ + new TransitionInstruction(STATE_DIGIT, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) + }, + // STATE_STACKPLUSACHUNG: + { + /* upon seeing CC_SIN in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_MCWD in this state: */ + null, + /* upon seeing CC_CM in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_SJC in this state: */ + null, + /* upon seeing CC_CON in this state: */ + new TransitionInstruction(STATE_STACKING, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_V in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_0F8A in this state: */ + new 
TransitionInstruction(STATE_PARTIALMARK, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER), + /* upon seeing CC_0F82 in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_0F39 in this state: */ + null, + /* upon seeing CC_ACHUNG in this state: */ + null, + /* upon seeing CC_DIGIT in this state: */ + new TransitionInstruction(STATE_DIGIT, + ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) + }, + // STATE_PARTIALMARK: + { + /* upon seeing CC_SIN in this state: */ + null, + /* upon seeing CC_MCWD in this state: */ + null, + /* upon seeing CC_CM in this state: */ + null, + /* upon seeing CC_SJC in this state: */ + null, + /* upon seeing CC_CON in this state: */ + null, + /* upon seeing CC_V in this state: */ + null, + /* upon seeing CC_0F8A in this state: */ + null, + /* upon seeing CC_0F82 in this state: */ + new TransitionInstruction(STATE_READY, + ACTION_CONTINUES_GRAPHEME_CLUSTER), + /* upon seeing CC_0F39 in this state: */ + null, + /* upon seeing CC_ACHUNG in this state: */ + null, + /* upon seeing CC_DIGIT in this state: */ + null + } + }; + + DLC NOW -- clearly, we need LegalSyllable to be convertable to and from GraphemeClusters; + + /** Breaks a sequence of GraphemeClusters into LegalSyllables. 
+        @param grcls a sequence of nonnull GraphemeClusters
+        @return a sequence of nonnull LegalSyllables
+        @exception TibetanSyntaxException if grcls does not consist
+        entirely of legal Tibetan syllables
+        @see GraphemeCluster
+        @see LegalSyllable
+    */
+    private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
+        throws TibetanSyntaxException
+    {
+        return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
+                                                                     true);
+    }
+
+    /** Non-validating counterpart of
+        breakGraphemeClustersIntoOnlySyllables: delegates to the
+        two-argument form with validate set to false, so no
+        TibetanSyntaxException can escape.  (This method used to
+        share its name and signature with the validating variant,
+        which cannot compile; it is now an overload of the helper it
+        wraps.)
+        @param grcls a Vector consisting entirely of GraphemeClusters
+        @return a Vector of LegalSyllables and GraphemeClusters
+        @see GraphemeCluster
+        @see LegalSyllable
+    */
+    private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls) {
+        try {
+            return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
+                                                                         false);
+        } catch (TibetanSyntaxException ex) {
+            throw new Error("This can never happen, because the second parameter, validating, was false.");
+        }
+    }
+
+    /** Walks grcls from left to right, treating each tsheg-like
+        GraphemeCluster as the end of a run, and turns each nonempty
+        run that forms a legal tsheg bar into a LegalSyllable.
+        @param grcls a Vector consisting entirely of GraphemeClusters
+        @param validate true iff you wish to have a
+        TibetanSyntaxException thrown upon encountering a sequence of
+        GraphemeClusters that is syntactically incorrect Tibetan
+        @return if validate is true, a Vector consisting entirely of
+        LegalSyllables, else a vector of LegalSyllables and
+        GraphemeClusters
+        @exception TibetanSyntaxException if validate is true and a
+        run of GraphemeClusters does not form a legal tsheg bar */
+    private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
+                                                                                boolean validate)
+        throws TibetanSyntaxException
+    {
+        Vector syllables = new Vector();
+        // java.util.Vector reports its element count via size(); it
+        // has no length() method.
+        int grcls_len = grcls.size();
+        int beginning_of_cluster = 0;
+        for (int i = 0; i < grcls_len; i++) {
+            GraphemeCluster current_grcl
+                = (GraphemeCluster)grcls.elementAt(i);
+            if (current_grcl.isTshegLike()) {
+                if (beginning_of_cluster < i) {
+                    // One or more non-tsheg-like grapheme clusters is
+                    // here between tsheg-like grapheme clusters.  Is
+                    // it a legal syllable?
+                    if (LegalTshegBar.formsLegalTshegBar(grcls,
+                                                         beginning_of_cluster,
+                                                         i))
+                    {
+                        // The tsheg-like cluster that ended the run
+                        // is handed over as the final argument.
+                        syllables.add(new LegalSyllable(grcls,
+                                                        beginning_of_cluster,
+                                                        i, /* tsheg: */ current_grcl));
+                    }
+                    else
+                    {
+                        if (validate) {
+                            TibetanSyntaxException ex
+                                = new TibetanSyntaxException(grcls,
+                                                             beginning_of_cluster,
+                                                             i);
+                            // DLC: return an int -1 for "all good" or
+                            // 3 for "the fourth element is the first
+                            // bad one" but then you don't know that
+                            // 3-6 were the bad ones
+                            throw ex;
+                        } else {
+                            // Pass the illegal run through untouched,
+                            // including the tsheg-like cluster at
+                            // index i that ended it.
+                            for (int j = beginning_of_cluster; j <= i; j++) {
+                                syllables.add(grcls.elementAt(j));
+                            }
+                        }
+                    }
+                }
+                // NOTE(review): a tsheg-like cluster with no run in
+                // front of it (beginning_of_cluster == i) is not
+                // added to the result at all -- confirm that is
+                // intended.
+                beginning_of_cluster = i + 1;
+            } // else add current_grcl to the waiting list, in a sense
+        }
+        // NOTE(review): clusters that follow the last tsheg-like
+        // cluster are never examined or returned; presumably a
+        // trailing run still needs handling -- confirm.
+        return syllables;
+    }
+
+    /** Breaks a string of perfectly-formed Unicode into
+        GraphemeClusters.  Each code point is classified with
+        getCCForCP and looked up in transitionTable; the resulting
+        TransitionInstruction says whether the code point starts a
+        new GraphemeCluster or extends the current one.
+        @param nfthdl_unicode a String of NFTHDL-normalized Unicode
+        codepoints
+        @exception Exception if the input is not perfectly formed
+        @return a vector of GraphemeClusters
+        @see GraphemeCluster
+    */
+    private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
+        throws Exception
+    {
+        // a vector of GraphemeClusters that we build up little by
+        // little:
+        Vector grcls = new Vector();
+        int currentState = STATE_START;
+        // the code points of the grapheme cluster now being read:
+        StringBuffer holdingPen = new StringBuffer();
+
+        int ilen = nfthdl_unicode.length();
+        for (int i = 0; i < ilen; i++) {
+            char current_cp = nfthdl_unicode.charAt(i);
+            int cc_of_current_cp = getCCForCP(current_cp);
+            final TransitionInstruction ti
+                = transitionTable[currentState][cc_of_current_cp];
+            if (null == ti) {
+                // A null table entry means this code point may not
+                // appear in the current state.
+                throw new Exception("Bad Unicode. DLC improve these messages");
+            } else {
+                switch (ti.getAction()) {
+                case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
+                    // Close the cluster read so far.  The pen is
+                    // still empty on the very first code point, and
+                    // an empty cluster must not be emitted.
+                    if (holdingPen.length() > 0) {
+                        grcls.add(new GraphemeCluster(holdingPen));
+                    }
+                    // The current code point is the first member of
+                    // the new cluster, so it must not be dropped.
+                    holdingPen = new StringBuffer();
+                    holdingPen.append(current_cp);
+                    break;
+                case ACTION_CONTINUES_GRAPHEME_CLUSTER:
+                    holdingPen.append(current_cp);
+                    break;
+                case ACTION_PREPEND_WITH_0F68:
+                    throw new Error("This never happens inside the validating scanner.");
+                default:
+                    throw new Error("Famous last words: This won't happen.");
+                }
+                currentState = ti.getNextState();
+            }
+        }
+        // The last cluster has no successor to push it out, so emit
+        // it here.
+        // NOTE(review): input that ends in a state that still expects
+        // more (e.g. STATE_PARTIALMARK) is not rejected here --
+        // confirm whether end-of-input needs its own check.
+        if (holdingPen.length() > 0) {
+            grcls.add(new GraphemeCluster(holdingPen));
+        }
+        return grcls;
+    }
+}
diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java
new file mode 100644
index 0000000..bf6ae3d
--- /dev/null
+++ b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java
@@ -0,0 +1,195 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.tshegbar;
+
+/** Tests ValidatingUnicodeReader.
+ * @author David Chandler */
+class ValidatingUnicodeReaderTest {
+    private static String skyagd = "\u0F66\u0F90\u0FB1\u0F42\u0F51";
+    private static String bskyagd = "\u0F56" + skyagd;
+
+    void testValidatingUnicodeReader() {
+        // DLC these routines can be slow.
+        assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       bskyagd + "\u0F0C"));
+        assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       "\u0F42" + skyagd + "\u0F0C"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       bskyagd + "\u0F0C"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F42" + skyagd + "\u0F0C"));
+
+        assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       bskyagd + "\u0F0C\u0F62\u0F0B" + bskyagd + "\u0F0F"));
+
+        // U+0F6A and U+0F62 are both expected to be fully valid, but
+        // only the U+0F62 forms are expected to be "perfect".
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F6A\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F62\u0F0B"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F6A\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F62\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F6A\u0F90\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F62\u0F90\u0F0B"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F62\u0F90\u0F0B"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F6A\u0F90\u0F0B"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F43"));
+        assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
+                       "\u0F43"));
+
+        // The Unicode standard states that U+0F8A is always followed
+        // by U+0F82.
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A\u0F82"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A\u0F40"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F8A\u0F83"));
+
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F74"));
+        assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F40\u0F74"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F90\u0F74"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0F77"));
+        assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
+                       "\u0F40\u0F77"));
+        assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F90\u0F77"));
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0F90\u0F7F"));
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0F90\u0F7F\u0F35"));
+
+        // Test that each singleton (except U+0F8A) in the Tibetan
+        // range is legal, and that each combining char and empty
+        // codepoint (and also U+0F8A) is illegal alone.
+        {
+            // Every other call in this test hands isFullyValidUnicode
+            // a String, so each char is converted with
+            // String.valueOf before it is passed.
+            for (char cp = '\u0F00'; cp <= '\u0F17'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F1a'; cp <= '\u0F34'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F3a'; cp <= '\u0F3d'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F40'; cp <= '\u0F47'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F49'; cp <= '\u0F6a'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0F88'; cp <= '\u0F89'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0Fbe'; cp <= '\u0Fc5'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            for (char cp = '\u0Fc7'; cp <= '\u0Fcc'; cp++)
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(String.valueOf(cp)));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F36"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F38"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F85"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F8b"));
+            assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcf"));
+
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F48"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6b"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6c"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6d"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6e"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6f"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F70"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8c"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8d"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8e"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8f"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F98"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fbd"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcd"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fce"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fd0"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fe4"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Ff0"));
+            assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fff"));
+        }
+
+        assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                       "\u0F40\u0Fc6"));
+
+        // Test that combining characters that combine with both
+        // consonants and digits work.
+        {
+            String combiningMarks[] = new String[] {
+                "\u0F71",
+                "\u0F72",
+                "\u0F73",
+                "\u0F74",
+                "\u0F75",
+                "\u0F76",
+                "\u0F77",
+                "\u0F78",
+                "\u0F79",
+                "\u0F7a",
+                "\u0F7b",
+                "\u0F7c",
+                "\u0F7d",
+                "\u0F7e",
+                "\u0F7f",
+                "\u0F80",
+                "\u0F81",
+                "\u0F82",
+                "\u0F83",
+                "\u0F84",
+                "\u0F86",
+                "\u0F87"
+            };
+            // An array exposes its size as the field length, not as
+            // a method.
+            for (int i = 0; i < combiningMarks.length; i++) {
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F40" + combiningMarks[i]));
+                // DLC have a group that works with both digits and consonants, cuz vowels plus digits is a no go, right?
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F20" + combiningMarks[i]));
+                assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
+                               "\u0F30" + combiningMarks[i]));
+            }
+        }
+
+        // DLC: unfinished.  An assertTrue(isFullyValidUnicode(...))
+        // over a five-code-point string belongs here, but the code
+        // points were never filled in: each one was left as the bare
+        // prefix "0F", which is not a legal Unicode escape and cannot
+        // compile.  Fill in the intended code points and restore the
+        // assertion.
+    }
+
+    /** Tests that
+     *  ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie
+     *  returns the expected THDL Wylie for one tsheg bar and for two
+     *  tsheg bars separated by U+0F0B and ended by U+0F0D. */
+    void testSyntacticallyLegalUnicodeToThdlWylie() {
+        assertTrue("bskyagd"
+                   .equals(ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
+                               bskyagd)));
+
+        assertTrue("bskyagd bskyagd/"
+                   .equals(ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
+                               bskyagd + "\u0F0B" + bskyagd + "\u0F0D")));
+    }
+}