I've got too many sandboxes, so I'm committing these changes,

half-done, from one sandbox so as to consolidate my sandboxes.
This commit is contained in:
dchandler 2003-04-12 20:56:20 +00:00
parent 6e05b60cff
commit daacf6ee3b
7 changed files with 1252 additions and 7 deletions

View file

@ -18,6 +18,8 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.tshegbar;
import java.util.Vector;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.util.ThdlDebug;
@ -42,15 +44,15 @@ import org.thdl.util.ThdlDebug;
* exception is that 'i (i.e., the connective case marker), 'u, and
* 'o suffixes are permitted.</li>
*
* <li>It has at most one suffix, which is a single consonant or a
* string consisting of 'i, 'u, 'o, 'am, and 'ang.</li>
*
*
DLC FIXME: we must allow many suffixes. See Andres' e-mail below:
* <li>It has at most one suffix, which is a single consonant (the
* common case) or a string consisting of 'i, 'u, 'o, 'am, and
* 'ang.
<p>See Andres' e-mail below:</p>
<pre>
David,
It is a particle that means "or" as opposed to "dang" that means and.
['am] is a particle that means "or" as opposed to "dang" that means and.
"sgom pa'am" would mean "... or meditation"
@ -65,6 +67,7 @@ And also there are cases where they combine. For ex you can have
Andres
</pre>
</li>
*
*
* <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists
@ -681,7 +684,7 @@ public final class LegalTshegBar
}
/** Like {@link
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)}
* but geared for the common case where the suffix is simply a
* consonant. */
public static boolean formsLegalTshegBar(char prefix,
@ -1138,4 +1141,350 @@ public final class LegalTshegBar
public String toString() {
    // The string form of this object is whatever toConciseXML() produces.
    final String conciseXml = toConciseXML();
    return conciseXml;
}
/** Convenience overload that checks every element of grcls at once by
 *  delegating to {@link #formsLegalTshegBar(Vector,int,int)} with the
 *  range [0, grcls.size()).
 *  @param grcls a Vector whose elements are UnicodeGraphemeClusters
 *  @return whatever the three-argument overload returns for the
 *  whole Vector */
private static boolean formsLegalTshegBar(Vector grcls) {
    final int count = grcls.size();
    return formsLegalTshegBar(grcls, 0, count);
}
/** FIXMEDOC DLC
 *
 *  Returns true iff the given UnicodeGraphemeClusters form a
 *  syntactically legal Tibetan syllable.  If one is null, it
 *  means that it is not present.
 *
 *  <p>The root cluster is broken into its superscribed letter, root
 *  codepoint, subjoined letter, vowel, achung, and wazur; those
 *  pieces, together with the prefix and postsuffix letters, are then
 *  handed to the char-based formsLegalTshegBar overload.</p>
 *
 *  <p>Suffix handling is unfinished: a non-null suffix currently
 *  causes an Error("DLC FIXME") to be thrown.</p>
 *
 *  @param prefix the prefix cluster, or null if not present
 *  @param root the root cluster, which must not be null
 *  @param suffix the suffix cluster, or null if not present
 *  @param postsuffix the postsuffix cluster, or null if not present
 *  @return false if root is not legal Tibetan, else the result of
 *  the char-based overload
 *  @exception IllegalArgumentException if root is null, or if
 *  postsuffix is non-null and suffix is null (these being clearly
 *  illegal)
 */
private static boolean formsLegalTshegBar(UnicodeGraphemeCluster prefix,
                                          UnicodeGraphemeCluster root,
                                          UnicodeGraphemeCluster suffix,
                                          UnicodeGraphemeCluster postsuffix)
    throws IllegalArgumentException
{
    // reality checks:
    if (null == root)
        throw new IllegalArgumentException("root letter is not present");
    if (null != postsuffix && null == suffix)
        throw new IllegalArgumentException("a postsuffix cannot occur without a suffix");

    // handle root:
    if (!root.isLegalTibetan())
        return false;
    char headLetter = root.getSuperscribedLetter();
    char rootLetter = root.getRootCP();
    char subjoinedLetter = root.getSoleNonWazurSubjoinedLetter();
    char vowel = root.getVowel();
    boolean hasAchung = root.hasAchung();
    boolean hasWazur = root.hasWazur();

    // handle prefix: null means "not present", so it must not be
    // dereferenced.
    // NOTE(review): assumes the char-based overload uses EW_ABSENT to
    // mean "no prefix", as the takes*() helpers do for head and sub
    // -- confirm.
    char prefixLetter
        = (null == prefix) ? EW_ABSENT : prefix.getSoleTibetanUnicodeCP();

    // handle suffix:
    String suffixString = null;
    if (null != suffix) {
        // DLC FIXME suffixString = suffix.getUnicodeInUsualOrder();
        throw new Error("DLC FIXME");
    }

    // handle postsuffix: null means "not present" here, too.
    // NOTE(review): same EW_ABSENT assumption as for the prefix.
    char postsuffixLetter
        = (null == postsuffix) ? EW_ABSENT : postsuffix.getSoleTibetanUnicodeCP();

    return formsLegalTshegBar(prefixLetter, headLetter, rootLetter,
                              subjoinedLetter, hasWazur, hasAchung,
                              suffixString, postsuffixLetter, vowel, null);
}
/** Returns true iff the UnicodeGraphemeClusters in grcls with
 *  indices in the range [start, end) form a syntactically legal
 *  syllable.  If start is as large as end, false is returned.
 *
 *  <p>Because the role of each cluster is not known in advance,
 *  every way of assigning the clusters to (prefix, root, suffix,
 *  postsuffix) that fits the cluster count is tried in turn.</p>
 *
 *  @param grcls a Vector whose elements in [start, end) are
 *  UnicodeGraphemeClusters
 *  @param start index of the first cluster to consider
 *  @param end one past the index of the last cluster to consider
 *  @return true iff some assignment of roles yields a legal
 *  syllable; false for an empty range or for more than four
 *  clusters */
private static boolean formsLegalTshegBar(Vector grcls,
                                          int start,
                                          int end)
{
    // The range is half-open, so its length is end - start (the
    // original computed start - end, which is never positive for a
    // non-empty range and so made this method always return false).
    int numGrcls = end - start;
    if (numGrcls <= 0)
        return false;
    if (numGrcls == 1) {
        // Option 1: (root)
        // else: return false;
        return formsLegalTshegBar(null,
                                  (UnicodeGraphemeCluster)grcls.elementAt(start),
                                  null, null);
    } else if (numGrcls == 2) {
        // Option 1: (prefix, root)
        // Option 2: (root, suffix)
        // else: return false;
        return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
                                   (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
                                   null,
                                   null)
                || formsLegalTshegBar(null,
                                      (UnicodeGraphemeCluster)grcls.elementAt(start),
                                      (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
                                      null));
    } else if (numGrcls == 3) {
        // Option 1: (prefix, root, suffix)
        // Option 2: (root, suffix, postsuffix)
        // else: return false;
        return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
                                   (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
                                   (UnicodeGraphemeCluster)grcls.elementAt(start + 2),
                                   null)
                || formsLegalTshegBar(null,
                                      (UnicodeGraphemeCluster)grcls.elementAt(start),
                                      (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
                                      (UnicodeGraphemeCluster)grcls.elementAt(start + 2)));
    } else if (numGrcls == 4) {
        // Only option: (prefix, root, suffix, postsuffix)
        return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
                                   (UnicodeGraphemeCluster)grcls.elementAt(start + 1),
                                   (UnicodeGraphemeCluster)grcls.elementAt(start + 2),
                                   (UnicodeGraphemeCluster)grcls.elementAt(start + 3)));
    } else {
        // the largest has 'i ... DLC FIXME rethink -- even the case where numGrcls == 3 could be pa'am
        return false;
    }
}
/** Tells whether a Tibetan consonant stack (superscribed letter,
 *  root letter, and subscribed letter taken together) accepts an
 *  EWC_ga prefix.  Only a bare root letter, i.e. one with neither a
 *  superscribed nor a subjoined letter, can qualify.
 *  @param head the nominal representation (see
 *  isNominalRepresentationOfConsonant(char)) of the superscribed
 *  letter, or EW_ABSENT if there is none
 *  @param root the nominal representation of the root letter
 *  @param sub the nominal representation of the subjoined letter,
 *  or EW_ABSENT if there is none
 *  @return true if the stack takes an EWC_ga prefix */
static boolean takesGao(char head, char root, char sub) {
    // Any superscribed or subjoined letter rules the prefix out.
    if (EW_ABSENT != head || EW_ABSENT != sub) {
        return false;
    }
    final boolean rootAllowsGa
        = (EWC_ca == root
           || EWC_nya == root
           || EWC_ta == root
           || EWC_da == root
           || EWC_na == root
           || EWC_tsa == root
           || EWC_zha == root
           || EWC_za == root
           || EWC_ya == root
           || EWC_sha == root
           || EWC_sa == root);
    return rootAllowsGa;
}
/** Tells whether a Tibetan consonant stack (superscribed letter,
 *  root letter, and subscribed letter taken together) accepts an
 *  EWC_da prefix.  A stack with a superscribed letter never does.
 *  @param head the nominal representation (see
 *  isNominalRepresentationOfConsonant(char)) of the superscribed
 *  letter, or EW_ABSENT if there is none
 *  @param root the nominal representation of the root letter
 *  @param sub the nominal representation of the subjoined letter,
 *  or EW_ABSENT if there is none
 *  @return true if the stack takes an EWC_da prefix */
static boolean takesDao(char head, char root, char sub) {
    if (EW_ABSENT != head) {
        return false;
    }
    if (EW_ABSENT == sub) {
        // a bare root letter
        return (EWC_ka == root
                || EWC_ga == root
                || EWC_nga == root
                || EWC_pa == root
                || EWC_ba == root
                || EWC_ma == root);
    }
    if (EWC_ya == sub) {
        // root with a subjoined ya
        return (EWC_ga == root
                || EWC_pa == root
                || EWC_ba == root
                || EWC_ma == root);
    }
    if (EWC_ra == sub) {
        // root with a subjoined ra
        return (EWC_ka == root
                || EWC_ga == root
                || EWC_ba == root
                || EWC_pa == root);
    }
    return false;
}
/** Tells whether a Tibetan consonant stack (superscribed letter,
 *  root letter, and subscribed letter taken together) accepts an
 *  EWC_achung prefix.  A stack with a superscribed letter never
 *  does.
 *  @param head the nominal representation (see
 *  isNominalRepresentationOfConsonant(char)) of the superscribed
 *  letter, or EW_ABSENT if there is none
 *  @param root the nominal representation of the root letter
 *  @param sub the nominal representation of the subjoined letter,
 *  or EW_ABSENT if there is none
 *  @return true if the stack takes an EWC_achung prefix */
static boolean takesAchungPrefix(char head, char root, char sub) {
    if (EW_ABSENT != head) {
        return false;
    }
    if (EW_ABSENT == sub) {
        // a bare root letter
        return (EWC_kha == root
                || EWC_ga == root
                || EWC_cha == root
                || EWC_ja == root
                || EWC_tha == root
                || EWC_da == root
                || EWC_pha == root
                || EWC_ba == root
                || EWC_tsha == root
                || EWC_dza == root);
    }
    if (EWC_ya == sub) {
        // root with a subjoined ya
        return (EWC_pha == root
                || EWC_ba == root
                || EWC_kha == root
                || EWC_ga == root);
    }
    if (EWC_ra == sub) {
        // root with a subjoined ra
        return (EWC_ba == root
                || EWC_kha == root
                || EWC_ga == root
                || EWC_da == root
                || EWC_pha == root);
    }
    return false;
}
/** Tells whether a Tibetan consonant stack (superscribed letter,
 *  root letter, and subscribed letter taken together) accepts an
 *  EWC_ma prefix.  A stack with a superscribed letter never does.
 *  @param head the nominal representation (see
 *  isNominalRepresentationOfConsonant(char)) of the superscribed
 *  letter, or EW_ABSENT if there is none
 *  @param root the nominal representation of the root letter
 *  @param sub the nominal representation of the subjoined letter,
 *  or EW_ABSENT if there is none
 *  @return true if the stack takes an EWC_ma prefix */
static boolean takesMao(char head, char root, char sub) {
    if (EW_ABSENT != head) {
        return false;
    }
    if (EW_ABSENT == sub) {
        // a bare root letter
        return (EWC_kha == root
                || EWC_ga == root
                || EWC_nga == root
                || EWC_cha == root
                || EWC_ja == root
                || EWC_nya == root
                || EWC_tha == root
                || EWC_da == root
                || EWC_na == root
                || EWC_tsha == root
                || EWC_dza == root);
    }
    // With a subjoined letter, only kha and ga qualify, and only
    // under a subjoined ya or ra.
    final boolean subIsYaOrRa = (EWC_ya == sub || EWC_ra == sub);
    final boolean rootIsKhaOrGa = (EWC_kha == root || EWC_ga == root);
    return subIsYaOrRa && rootIsKhaOrGa;
}
/** Tells whether a Tibetan consonant stack (superscribed letter,
 *  root letter, and subscribed letter taken together) accepts an
 *  EWC_ba prefix.  Unlike the other prefixes handled in this class,
 *  EWC_ba is also accepted by some stacks that have a superscribed
 *  letter.
 *  @param head the nominal representation (see
 *  isNominalRepresentationOfConsonant(char)) of the superscribed
 *  letter, or EW_ABSENT if there is none
 *  @param root the nominal representation of the root letter
 *  @param sub the nominal representation of the subjoined letter,
 *  or EW_ABSENT if there is none
 *  @return true if the stack takes an EWC_ba prefix */
static boolean takesBao(char head, char root, char sub) {
    // DLC ask Ten-lo la about Wazur.
    final boolean noHead = (EW_ABSENT == head);
    final boolean noSub = (EW_ABSENT == sub);

    if (noHead && noSub) {
        // a bare root letter
        return (EWC_ka == root
                || EWC_ga == root
                || EWC_nga == root
                || EWC_ca == root
                || EWC_ja == root
                || EWC_nya == root
                || EWC_ta == root
                || EWC_da == root
                || EWC_na == root
                || EWC_tsa == root
                || EWC_dza == root
                || EWC_zha == root
                || EWC_za == root
                || EWC_ra == root
                || EWC_la == root
                || EWC_sha == root);
    }

    if (noHead) {
        // root plus subjoined letter -- kra, e.g.
        if (EWC_ya == sub) {
            return (EWC_ka == root || EWC_ga == root);
        }
        if (EWC_ra == sub) {
            return (EWC_ka == root || EWC_ga == root || EWC_sa == root);
        }
        if (EWC_la == sub) {
            return (EWC_ka == root
                    || EWC_za == root
                    || EWC_ra == root
                    || EWC_sa == root);
        }
        return false;
    }

    if (noSub) {
        // superscribed letter plus root -- ska, e.g.
        if (EWC_sa == head) {
            return (EWC_ka == root
                    || EWC_ga == root
                    || EWC_nga == root
                    || EWC_nya == root
                    || EWC_ta == root
                    || EWC_da == root
                    || EWC_na == root
                    || EWC_tsa == root);
        }
        if (EWC_ra == head) {
            return (EWC_ka == root
                    || EWC_ga == root
                    || EWC_nga == root
                    || EWC_ja == root
                    || EWC_nya == root
                    || EWC_ta == root
                    || EWC_da == root
                    || EWC_na == root
                    || EWC_tsa == root
                    || EWC_dza == root);
        }
        if (EWC_la == head) {
            return (EWC_ta == root || EWC_da == root);
        }
        return false;
    }

    // superscribed letter, root, and subjoined letter all present:
    // only ka and ga roots qualify.
    if (EWC_ka != root && EWC_ga != root) {
        return false;
    }
    if (EWC_ya == sub) {
        return (EWC_ra == head || EWC_sa == head);
    }
    if (EWC_ra == sub) {
        return (EWC_sa == head);
    }
    return false;
}
}

View file

@ -279,4 +279,77 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
}
/** Exercises the "which root letters take which prefixes?" rules
 *  implemented by LegalTshegBar's takesBao, takesGao, takesDao,
 *  takesMao, and takesAchungPrefix.  The list of rules came from a
 *  native Tibetan who has been kind enough to teach me the
 *  fundamentals of the Tibetan language, but I'm not sure where he
 *  got the list.
 */
public void testPrefixRules() {
    // DLC FIXME how can we say that 0Fb2 is ok but 0fBc is not?

    // Only nominal representations are accepted; subjoined-form
    // codepoints passed in any position must be rejected.
    assertTrue(LegalTshegBar.takesBao(EWC_sa, EWC_ka, EWC_ra));
    assertTrue(!LegalTshegBar.takesBao('\u0FB6', EWC_ka, EWC_ra));
    assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', EWC_ra));
    assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', '\u0FB2'));
    assertTrue(!LegalTshegBar.takesBao('\u0FB6', '\u0F90', EWC_ra));
    assertTrue(!LegalTshegBar.takesBao(EWC_sa, EWC_ka, '\u0FB2'));

    // EWC_ba prefix:
    assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ka, EW_ABSENT));
    assertTrue(LegalTshegBar.takesBao(EWC_la, EWC_da, EW_ABSENT));
    assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_sa, EWC_ra));
    assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ga, EWC_ra));
    assertTrue(LegalTshegBar.takesBao(EWC_ra, EWC_ga, EWC_ya));
    assertTrue(!LegalTshegBar.takesBao(EWC_ra, EWC_da, EWC_ya));
    assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_ba, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesBao(EWC_la, EWC_nga, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_nga, EWC_ra));

    // EWC_ga prefix:
    assertTrue(LegalTshegBar.takesGao(EW_ABSENT, EWC_ca, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EWC_ya));
    assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EWC_ya));

    // EWC_da prefix:
    assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_wa, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_nga, EWC_ya));
    assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EWC_ya));
    assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ga, EWC_ya));
    assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EWC_ra));

    // EWC_ma prefix:
    assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_ja, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_wa, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_nga, EWC_ya));
    assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EWC_ya));
    assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ya));
    assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ra));

    // EWC_achung prefix:
    assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ga, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ka, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_wa, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_nga, EWC_ya));
    assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EW_ABSENT));
    assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EWC_ya));
    assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ba, EWC_ya));
    assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_pha, EWC_ra));
}
}

View file

@ -0,0 +1,51 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
/** Thrown to say "this isn't legal Tibetan", as opposed to "this
 *  isn't a valid sequence of Unicode". */
class TibetanSyntaxException extends Exception {
    /** This constructor creates an exception with a less than helpful
     *  message for the end user.  Please don't use this constructor
     *  for production code. */
    TibetanSyntaxException() {
        super("A Unicode input stream had a syntactically incorrect run of Tibetan.  For example, kha, i.e., U+0F41, is not an allowed prefix.  This run of Tibetan was not expected.");
        // we can tell it wasn't expected, because this error message
        // isn't very helpful, and one of the other constructors
        // should've been used.
    }

    /** Creates an exception carrying the given detail message.
     *  @param x the detail message */
    TibetanSyntaxException(String x) {
        super(x);
    }

    /** Creates an exception whose message names the offending stretch
     *  of grapheme clusters.
     *  @param grcls a Vector whose elements x are GraphemeClusters
     *  where x is in the range [start, end)
     *  @param start grcls.elementAt(start) is the first
     *  GraphemeCluster in the syntactically incorrect stretch of
     *  Tibetan.
     *  @param end grcls.elementAt(end - 1) is the last
     *  GraphemeCluster in the syntactically incorrect stretch of
     *  Tibetan. */
    TibetanSyntaxException(java.util.Vector grcls, int start, int end) {
        // Vector is fully qualified because this file has no import
        // of java.util.Vector.
        super(describeRange(grcls, start, end));
    }

    /** Builds the detail message for the range-based constructor.
     *  Never throws: a null Vector or an out-of-bounds range simply
     *  yields a message without the element listing (or with the
     *  in-bounds part of it), because failing while constructing an
     *  exception would hide the real error. */
    private static String describeRange(java.util.Vector grcls,
                                        int start, int end) {
        StringBuffer sb
            = new StringBuffer("A syntactically incorrect stretch of Tibetan was found in grapheme clusters [");
        sb.append(start).append(", ").append(end).append(")");
        if (null != grcls) {
            int lo = Math.max(start, 0);
            int hi = Math.min(end, grcls.size());
            if (lo < hi) {
                sb.append(": ");
                for (int i = lo; i < hi; i++) {
                    if (i > lo) {
                        sb.append(", ");
                    }
                    sb.append(String.valueOf(grcls.elementAt(i)));
                }
            }
        }
        sb.append('.');
        return sb.toString();
    }
}

View file

@ -0,0 +1,58 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
/** One cell of a state-machine transition table: the state to move
 *  to next, paired with the action to perform while doing so.  Both
 *  values are fixed by the constructor; no setters are provided. */
class TransitionInstruction implements UnicodeReadingStateMachineConstants {
    /** the state (e.g., {@link #STATE_READY}) to which to transition
     *  next */
    private int nextState;

    /** the action to perform upon transition, either {@link
     *  #ACTION_CONTINUES_GRAPHEME_CLUSTER}, {@link
     *  #ACTION_BEGINS_NEW_GRAPHEME_CLUSTER}, or {@link
     *  #ACTION_PREPEND_WITH_0F68} */
    private int action;

    /** Private, so an instruction cannot be created without a state
     *  and an action. */
    private TransitionInstruction() { super(); }

    /** Creates an instruction.  When assertions are enabled, both
     *  arguments are checked against the known constants.
     *  @param nextState the state to transition to, one of the
     *  STATE_ constants other than STATE_START
     *  @param action the action to perform, one of the ACTION_
     *  constants */
    TransitionInstruction(int nextState, int action) {
        super();
        assert(isKnownAction(action));
        assert(isKnownState(nextState));
        // we start in the start state, but we can never return to it.
        assert(nextState != STATE_START);
        this.action = action;
        this.nextState = nextState;
    }

    /** Returns true iff candidate is one of the ACTION_ constants. */
    private static boolean isKnownAction(int candidate) {
        return (candidate == ACTION_CONTINUES_GRAPHEME_CLUSTER
                || candidate == ACTION_BEGINS_NEW_GRAPHEME_CLUSTER
                || candidate == ACTION_PREPEND_WITH_0F68);
    }

    /** Returns true iff candidate is one of the STATE_ constants. */
    private static boolean isKnownState(int candidate) {
        return (candidate == STATE_START
                || candidate == STATE_READY
                || candidate == STATE_DIGIT
                || candidate == STATE_STACKING
                || candidate == STATE_STACKPLUSACHUNG
                || candidate == STATE_PARTIALMARK);
    }

    /** Returns the action to perform upon transition. */
    int getAction() { return action; }

    /** Returns the state to which to transition next. */
    int getNextState() { return nextState; }
}

View file

@ -0,0 +1,174 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
/** Constants and static routines (DLC still?) useful in writing state
 *  machines for transforming Unicode input into other forms.
 *
 *  <p>Three families of int constants are defined: codepoint classes
 *  (CC_...), machine states (STATE_...), and transition actions
 *  (ACTION_...).</p>
 *
 *  @author David Chandler
 */
interface UnicodeReadingStateMachineConstants {
    /** Returns the codepoint class for cp, e.g. {@link #CC_SJC}.
     *  The specific single-codepoint classes are tested before the
     *  ranges and lists, so the order of the tests below matters.
     *  @param cp a Unicode codepoint, which MUST be nondecomposable
     *  if it is in the Tibetan range but can be from outside the
     *  Tibetan range of Unicode
     *  @return one of the CC_ constants; {@link #CC_SIN} for
     *  anything not otherwise classified */
    static int getCCForCP(char cp) {
        // NOTE(review): getNFTHDL is not declared in this interface;
        // presumably it is provided elsewhere -- confirm.
        assert(getNFTHDL(cp) == null);
        if ('\u0F82' == cp) {
            return CC_0F82;
        } else if ('\u0F8A' == cp) {
            return CC_0F8A;
        } else if ('\u0F39' == cp) {
            return CC_0F39;
        } else if ('\u0F71' == cp) {
            return CC_ACHUNG;
        } else if ('\u0F40' <= cp && cp <= '\u0F6A') {
            assert(cp != '\u0F48');
            return CC_CON;
        } else if ('\u0F90' <= cp && cp <= '\u0FBC') {
            assert(cp != '\u0F98');
            return CC_SJC;
        } else if ('\u0F20' <= cp && cp <= '\u0F33') {
            return CC_DIGIT;
        } else if (/* DLC NOW do these combine ONLY with digits, or do CC_CM just NOT combine with digits? */
                   '\u0F3E' == cp
                   || '\u0F3F' == cp
                   || '\u0F18' == cp
                   || '\u0F19' == cp) {
            return CC_MCWD;
        } else if ('\u0FC6' == cp
                   || '\u0F87' == cp
                   || '\u0F86' == cp
                   || '\u0F84' == cp
                   || '\u0F83' == cp
                   // U+0F82 is deliberately not listed here: it has
                   // its own class, CC_0F82, and was already
                   // returned by the first test above.
                   || '\u0F7F' == cp
                   || '\u0F7E' == cp
                   || '\u0F37' == cp /* DLC NOW NORMALIZATION OF 0F10, 11 to 0F0F ??? */
                   || '\u0F35' == cp) {
            return CC_CM;
        } else if ('\u0F72' == cp
                   || '\u0F74' == cp
                   || '\u0F7A' == cp
                   || '\u0F7B' == cp
                   || '\u0F7C' == cp
                   || '\u0F7D' == cp
                   || '\u0F80' == cp) {
            // DLC what about U+0F84 ??? CC_V or CC_CM ?
            return CC_V;
        } else {
            return CC_SIN;
        }
    }

    // codepoint classes (CC_...) follow.  These are mutually
    // exclusive, and their union is the whole of Unicode.

    /** for everything else, i.e. non-Tibetan characters like U+0E00
     *  and also Tibetan characters like U+0FCF and U+0F05 (DLC rename
     *  SIN[GLETON] to OTHER as combining marks from outside the
     *  Tibetan range count as this) but not U+0F8A */
    static final int CC_SIN = 0;

    /** for combining marks in the Tibetan range of Unicode that
     *  combine with digits alone */
    static final int CC_MCWD = 1;

    /** for combining marks in the Tibetan range of Unicode, minus
     *  CC_MCWD, U+0F82, and U+0F39 */
    static final int CC_CM = 2;

    /** for combining consonants, i.e. U+0F90-U+0FBC minus U+0F98
     *  minus the decomposable entries like U+0F93, U+0F9D, U+0FA2,
     *  etc. */
    static final int CC_SJC = 3;

    /** for noncombining consonants, i.e. U+0F40-U+0F6A minus U+0F48
     *  minus the decomposable entries like U+0F43, U+0F4D, U+0F52,
     *  etc. */
    static final int CC_CON = 4;

    /** for simple, nondecomposable vowels, i.e. U+0F72, U+0F74,
     *  U+0F7A, U+0F7B, U+0F7C, U+0F7D, U+0F80 */
    static final int CC_V = 5;

    /** for U+0F8A */
    static final int CC_0F8A = 6;

    /** for U+0F82, which is treated like {@link #CC_CM} except after
     *  U+0F8A */
    static final int CC_0F82 = 7;

    /** for U+0F39, an integral part of a consonant when it directly
     *  follows a member of {@link #CC_CON} or {@link #CC_SJC} */
    static final int CC_0F39 = 8;

    /** for U+0F71 */
    static final int CC_ACHUNG = 9;

    /** for digits, i.e. U+0F20-U+0F33 */
    static final int CC_DIGIT = 10;

    // states STATE_...:

    /** initial state */
    static final int STATE_START = 0;

    /** ready state, i.e. the state in which some non-empty Unicode
     *  String is in the holding area, <i>ready</i> to receive
     *  combining marks like U+0F35 */
    static final int STATE_READY = 1;

    /** digit state, i.e. the state in which some non-empty Unicode
     *  String consisting entirely of digits is in the holding area,
     *  ready to receive marks that combine only with digits */
    static final int STATE_DIGIT = 2;

    /** state in which CC_SJC are welcomed and treated as consonants
     *  to be subscribed to the GraphemeCluster in holding. */
    static final int STATE_STACKING = 3;

    /** state in which one or more consonants have been seen and also
     *  an achung (U+0F71) has been seen */
    static final int STATE_STACKPLUSACHUNG = 4;

    /** state that seeing U+0F8A (when that's not an error) puts you
     *  in.  Needed because U+0F8A is always followed by U+0F82, and
     *  we check for the exceptional case that U+0F8A is followed by
     *  something else. */
    static final int STATE_PARTIALMARK = 5;

    /* DLC we should have many error states or none. */

    // actions ACTION_...:

    /** the present codepoint marks the start of a new
     *  GraphemeCluster */
    static final int ACTION_BEGINS_NEW_GRAPHEME_CLUSTER = 0;

    /** the present codepoint is a continuation of the current
     *  GraphemeCluster */
    static final int ACTION_CONTINUES_GRAPHEME_CLUSTER = 1;

    /** there is an error in the input stream, which we are correcting
     *  (as we are in error-correcting mode) by starting a new
     *  GraphemeCluster with U+0F68 as the first codepoint and the
     *  current codepoint as the second */
    static final int ACTION_PREPEND_WITH_0F68 = 2;
}

View file

@ -0,0 +1,345 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
/** Don't instantiate this class.  (The constructor must carry the
 *  name of the enclosing class, ValidatingUnicodeReader; it was
 *  previously declared as Foo, which is not a legal constructor
 *  here.) */
private ValidatingUnicodeReader() { super(); }
/** This table tells how to transition from state a 6 states + error state */
private static final TransitionInstruction
transitionTable[6 /* number of STATEs */]
[11 /* number of CC classes */]
= {
// STATE_START:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
null,
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
null,
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_READY:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY, // self
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null, // because 0F71 comes after SJCs, before Vs, and
// before CMs.
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_DIGIT:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
},
// STATE_STACKING:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_ACHUNG in this state: */
new TransitionInstruction(STATE_STACKPLUSACHUNG,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_STACKPLUSACHUNG:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_PARTIALMARK:
{
/* upon seeing CC_SIN in this state: */
null,
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
null,
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
null,
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
null,
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
null
}
};
// DLC NOW -- clearly, we need LegalSyllable to be convertible to and from GraphemeClusters.
/** Splits a sequence of GraphemeClusters into LegalSyllables,
 *  insisting that the whole input be legal Tibetan.  This is the
 *  validating form: it delegates to
 *  breakGraphemeClustersIntoSyllablesAndGraphemeClusters with
 *  validation switched on.
 *  @param grcls a sequence of nonnull GraphemeClusters
 *  @return a sequence of nonnull LegalSyllables
 *  @exception TibetanSyntaxException if grcls does not consist
 *  entirely of legal Tibetan syllables
 *  @see GraphemeCluster
 *  @see LegalSyllable
 */
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
    throws TibetanSyntaxException
{
    final boolean validate = true;
    return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
                                                                 validate);
}
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls) {
try {
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
false);
} catch (TibetanSyntaxException) {
throw new Error("This can never happen, because the second parameter, validating, was false.");
}
}
/** Breaks a sequence of GraphemeClusters into syllables, treating
    each tsheg-like GraphemeCluster as the terminator of the run of
    GraphemeClusters that precedes it.
    @param grcls a Vector consisting entirely of GraphemeClusters
    @param validate true iff you wish to have a
    TibetanSyntaxException thrown upon encountering a sequence of
    GraphemeClusters that is syntactically incorrect Tibetan
    @return if validate is true, a Vector consisting entirely of
    LegalSyllables, else a vector of LegalSyllables and
    GraphemeClusters
    @exception TibetanSyntaxException if validate is true and some
    run of GraphemeClusters terminated by a tsheg-like
    GraphemeCluster does not form a legal tsheg bar */
private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
                                                                            boolean validate)
    throws TibetanSyntaxException
{
    Vector syllables = new Vector();
    // Vector reports its element count via size(); it has no
    // length() method.
    int grcls_len = grcls.size();
    // Index of the first GraphemeCluster of the run currently being
    // accumulated, i.e. one past the most recent tsheg-like one.
    int beginning_of_cluster = 0;
    for (int i = 0; i < grcls_len; i++) {
        GraphemeCluster current_grcl
            = (GraphemeCluster)grcls.elementAt(i);
        if (current_grcl.isTshegLike()) {
            if (beginning_of_cluster < i) {
                // One or more non-tsheg-like grapheme clusters is
                // here between tsheg-like grapheme clusters.  Is
                // it a legal syllable?
                if (LegalTshegBar.formsLegalTshegBar(grcls,
                                                     beginning_of_cluster,
                                                     i))
                {
                    // The terminating tsheg-like cluster is handed
                    // to the syllable as its fourth argument.
                    syllables.add(new LegalSyllable(grcls,
                                                    beginning_of_cluster,
                                                    i, current_grcl));
                }
                else
                {
                    if (validate) {
                        TibetanSyntaxException ex
                            = new TibetanSyntaxException(grcls,
                                                         beginning_of_cluster,
                                                         i);
                        // DLC: return an int -1 for "all good" or
                        // 3 for "the fourth element is the first
                        // bad one" but then you don't know that
                        // 3-6 were the bad ones
                        throw ex;
                    } else {
                        // Pass the illegal run through untouched,
                        // including its terminating tsheg-like
                        // cluster (hence <= rather than <).
                        for (int j = beginning_of_cluster; j <= i; j++) {
                            syllables.add(grcls.elementAt(j));
                        }
                    }
                }
            }
            // NOTE(review): when beginning_of_cluster == i (a
            // tsheg-like cluster at the very start, or directly
            // after another one), current_grcl is not added to the
            // result at all -- confirm that dropping it is intended.
            beginning_of_cluster = i + 1;
        } // else add current_grcl to the waiting list, in a sense
    }
    // NOTE(review): GraphemeClusters after the last tsheg-like one
    // (indices beginning_of_cluster through grcls_len - 1) are never
    // examined or added to the result -- confirm whether a trailing
    // run without a terminator should be handled here.
    return syllables;
}
/** Breaks a string of perfectly-formed Unicode into
    GraphemeClusters by running each code point through the
    transition table and acting on the resulting instruction.
    @param nfthdl_unicode a String of NFTHDL-normalized Unicode
    codepoints
    @exception Exception if the input is not perfectly formed, i.e.
    if the transition table has no entry for some code point in the
    state in which it is seen, or if the input ends while the
    scanner is still in STATE_PARTIALMARK
    @return a vector of GraphemeClusters
    @see GraphemeCluster
*/
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
    throws Exception
{
    // a vector of GraphemeClusters that we build up little by
    // little:
    Vector grcls = new Vector();
    int currentState = STATE_START;
    // The code points of the grapheme cluster currently being
    // assembled.
    StringBuffer holdingPen = new StringBuffer();
    int ilen = nfthdl_unicode.length();
    for (int i = 0; i < ilen; i++) {
        char current_cp = nfthdl_unicode.charAt(i);
        int cc_of_current_cp = getCCForCP(current_cp);
        final TransitionInstruction ti
            = transitionTable[currentState][cc_of_current_cp];
        if (null == ti) {
            throw new Exception("Bad Unicode. DLC improve these messages");
        } else {
            switch (ti.getAction()) {
            case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
                // The cluster in the holding pen is complete.  Do
                // not emit an empty cluster for the very first code
                // point of the input.
                if (holdingPen.length() > 0) {
                    grcls.add(new GraphemeCluster(holdingPen));
                }
                // The current code point is the first member of the
                // new cluster, so it must go into the fresh pen.
                holdingPen = new StringBuffer();
                holdingPen.append(current_cp);
                break;
            case ACTION_CONTINUES_GRAPHEME_CLUSTER:
                holdingPen.append(current_cp);
                break;
            case ACTION_PREPEND_WITH_0F68:
                throw new Error("This never happens inside the validating scanner.");
            default:
                throw new Error("Famous last words: This won't happen.");
            }
            currentState = ti.getNextState();
        }
    }
    // In STATE_PARTIALMARK the transition table accepts only
    // CC_0F82, so input that stops here is an unfinished sequence.
    if (STATE_PARTIALMARK == currentState) {
        throw new Exception("Bad Unicode: the input ends in the middle of a grapheme cluster.");
    }
    // Flush the final cluster, which no later code point terminated.
    if (holdingPen.length() > 0) {
        grcls.add(new GraphemeCluster(holdingPen));
    }
    return grcls;
}
}

View file

@ -0,0 +1,195 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
/** Tests ValidatingUnicodeReader.
* @author David Chandler */
class ValidatingUnicodeReaderTest {
private static String skyagd = "\u0F66\u0F90\u0FB1\u0F42\u0F51";
private static String bskyagd = "\u0F56" + skyagd;
void testValidatingUnicodeReader() {
// DLC these routines can be slow.
assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
bskyagd + "\u0F0C"));
assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
"\u0F42" + skyagd + "\u0F0C"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
bskyagd + "\u0F0C"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F42" + skyagd + "\u0F0C"));
assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
bskyagd + "\u0F0C\u0F62\u0F0B" + bskyagd + "\u0F0F"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F6A\u0F0B"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F62\u0F0B"));
assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
"\u0F6A\u0F0B"));
assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
"\u0F62\u0F0B"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F6A\u0F90\u0F0B"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F62\u0F90\u0F0B"));
assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
"\u0F62\u0F90\u0F0B"));
assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
"\u0F6A\u0F90\u0F0B"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F43"));
assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
"\u0F43"));
// The Unicode standard states that U+0F8A is always followed
// by U+0F82.
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F8A\u0F82"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F8A"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F8A\u0F40"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F8A\u0F83"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F74"));
assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
"\u0F40\u0F74"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F90\u0F74"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F40\u0F77"));
assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
"\u0F40\u0F77"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F90\u0F77"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F40\u0F90\u0F7F"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F40\u0F90\u0F7F\u0F35"));
// Test that each singleton (except U+0F8A) in the Tibetan
// range is legal, and that each combining char and empty
// codepoint (and also U+0F8A) is illegal alone.
{
for (char cp = '\u0F00'; cp <= '\u0F17'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
for (char cp = '\u0F1a'; cp <= '\u0F34'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
for (char cp = '\u0F3a'; cp <= '\u0F3d'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
for (char cp = '\u0F40'; cp <= '\u0F47'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
for (char cp = '\u0F49'; cp <= '\u0F6a'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
for (char cp = '\u0F88'; cp <= '\u0F89'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
for (char cp = '\u0Fbe'; cp <= '\u0Fc5'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
for (char cp = '\u0Fc7'; cp <= '\u0Fcc'; cp++)
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F36"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F38"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F85"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F8b"));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcf"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F48"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6b"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6c"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6d"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6e"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6f"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F70"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8c"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8d"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8e"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8f"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F98"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fbd"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcd"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fce"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fd0"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fe4"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Ff0"));
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fff"));
}
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F40\u0Fc6"));
// Test that combining characters that combine with both
// consonants and digits work.
{
String combiningMarks[] = new String[] {
"\u0F71",
"\u0F72",
"\u0F73",
"\u0F74",
"\u0F75",
"\u0F76",
"\u0F77",
"\u0F78",
"\u0F79",
"\u0F7a",
"\u0F7b",
"\u0F7c",
"\u0F7d",
"\u0F7e",
"\u0F7f",
"\u0F80",
"\u0F81",
"\u0F82",
"\u0F83",
"\u0F84",
"\u0F86",
"\u0F87"
};
for (int i = 0; i < combiningMarks.length(); i++) {
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F40" + combiningMarks[i]));
// DLC have a group that works with both digits and consonants, cuz vowels plus digits is a no go, right?
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F20" + combiningMarks[i]));
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F30" + combiningMarks[i]));
}
}
DLC;
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
"\u0F\u0F\u0F\u0F\u0F"));
}
void testSyntacticallyLegalUnicodeToThdlWylie() {
assertTrue("bskyagd"
.equals(ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
bskyagd)));
assertTrue("bskyagd bskyagd/"
.equals(ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
bskyagd + "\u0F0B" + bskyagd + "\u0F0D")));
}
}