I've got too many sandboxes, so I'm committing these changes,
half-done, from one sandbox so as to consolidate my sandboxes.
This commit is contained in:
parent
6e05b60cff
commit
daacf6ee3b
7 changed files with 1252 additions and 7 deletions
|
@ -18,6 +18,8 @@ Contributor(s): ______________________________________.
|
||||||
|
|
||||||
package org.thdl.tib.text.tshegbar;
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
import org.thdl.tib.text.THDLWylieConstants;
|
import org.thdl.tib.text.THDLWylieConstants;
|
||||||
import org.thdl.util.ThdlDebug;
|
import org.thdl.util.ThdlDebug;
|
||||||
|
|
||||||
|
@ -42,15 +44,15 @@ import org.thdl.util.ThdlDebug;
|
||||||
* exception is that 'i (i.e., the connective case marker), 'u, and
|
* exception is that 'i (i.e., the connective case marker), 'u, and
|
||||||
* 'o suffixes are permitted.</li>
|
* 'o suffixes are permitted.</li>
|
||||||
*
|
*
|
||||||
* <li>It has at most one suffix, which is a single consonant or a
|
* <li>It has at most one suffix, which is a single consonant (the
|
||||||
* string consisting of 'i, 'u, 'o, 'am, and 'ang.</li>
|
* common case) or a string consisting of 'i, 'u, 'o, 'am, and
|
||||||
*
|
* 'ang.
|
||||||
*
|
|
||||||
DLC FIXME: we must allow many suffixes. See Andres' e-mail below:
|
<p>See Andres' e-mail below:</p>
|
||||||
<pre>
|
<pre>
|
||||||
David,
|
David,
|
||||||
|
|
||||||
It is a particle that means "or" as opposed to "dang" that means and.
|
['am] is a particle that means "or" as opposed to "dang" that means and.
|
||||||
|
|
||||||
"sgom pa'am" would mean "... or meditation"
|
"sgom pa'am" would mean "... or meditation"
|
||||||
|
|
||||||
|
@ -65,6 +67,7 @@ And also there are cases where they combine. For ex you can have
|
||||||
|
|
||||||
Andres
|
Andres
|
||||||
</pre>
|
</pre>
|
||||||
|
</li>
|
||||||
*
|
*
|
||||||
*
|
*
|
||||||
* <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists
|
* <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists
|
||||||
|
@ -681,7 +684,7 @@ public final class LegalTshegBar
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Like {@link
|
/** Like {@link
|
||||||
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}
|
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)}
|
||||||
* but geared for the common case where the suffix is simply a
|
* but geared for the common case where the suffix is simply a
|
||||||
* consonant. */
|
* consonant. */
|
||||||
public static boolean formsLegalTshegBar(char prefix,
|
public static boolean formsLegalTshegBar(char prefix,
|
||||||
|
@ -1138,4 +1141,350 @@ public final class LegalTshegBar
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return toConciseXML();
|
return toConciseXML();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** FIXMEDOC a shortcut */
|
||||||
|
private static boolean formsLegalTshegBar(Vector grcls) {
|
||||||
|
return formsLegalTshegBar(grcls, 0, grcls.size());
|
||||||
|
}
|
||||||
|
|
||||||
|
/** FIXMEDOC DLC
|
||||||
|
*
|
||||||
|
* Returns true iff the given UnicodeGraphemeClusters form a
|
||||||
|
* syntactically legal Tibetan syllable. If one is null, it
|
||||||
|
* means that it is not present.
|
||||||
|
*
|
||||||
|
* @exception IllegalArgumentException if root is null, or if
|
||||||
|
* postsuffix is non-null and suffix is null (these being clearly
|
||||||
|
* illegal)
|
||||||
|
*/
|
||||||
|
private static boolean formsLegalTshegBar(UnicodeGraphemeCluster prefix,
|
||||||
|
UnicodeGraphemeCluster root,
|
||||||
|
UnicodeGraphemeCluster suffix,
|
||||||
|
UnicodeGraphemeCluster postsuffix)
|
||||||
|
throws IllegalArgumentException
|
||||||
|
{
|
||||||
|
// reality checks:
|
||||||
|
if (null == root)
|
||||||
|
throw new IllegalArgumentException("root letter is not present");
|
||||||
|
if (null != postsuffix && null == suffix)
|
||||||
|
throw new IllegalArgumentException("a postsuffix cannot occur without a suffix");
|
||||||
|
|
||||||
|
// handle root:
|
||||||
|
if (!root.isLegalTibetan())
|
||||||
|
return false;
|
||||||
|
char headLetter = root.getSuperscribedLetter();
|
||||||
|
char rootLetter = root.getRootCP();
|
||||||
|
char subjoinedLetter = root.getSoleNonWazurSubjoinedLetter();
|
||||||
|
char vowel = root.getVowel();
|
||||||
|
boolean hasAchung = root.hasAchung();
|
||||||
|
boolean hasWazur = root.hasWazur();
|
||||||
|
|
||||||
|
// handle prefix:
|
||||||
|
char prefixLetter = prefix.getSoleTibetanUnicodeCP();
|
||||||
|
|
||||||
|
// handle suffix:
|
||||||
|
String suffixString = null;
|
||||||
|
if (null != suffix) {
|
||||||
|
// DLC FIXME suffixString = suffix.getUnicodeInUsualOrder();
|
||||||
|
throw new Error("DLC FIXME");
|
||||||
|
}
|
||||||
|
|
||||||
|
// handle postsuffix:
|
||||||
|
char postsuffixLetter = postsuffix.getSoleTibetanUnicodeCP();
|
||||||
|
|
||||||
|
return formsLegalTshegBar(prefixLetter, headLetter, rootLetter,
|
||||||
|
subjoinedLetter, hasWazur, hasAchung,
|
||||||
|
suffixString, postsuffixLetter, vowel, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true iff the UnicodeGraphemeClusters in grcls with
|
||||||
|
* indices in the range [start, end) form a syntactically legal
|
||||||
|
* syllable. If start is as large as end, false is returned. */
|
||||||
|
private static boolean formsLegalTshegBar(Vector grcls,
|
||||||
|
int start,
|
||||||
|
int end)
|
||||||
|
{
|
||||||
|
int numGrcls = start - end;
|
||||||
|
if (numGrcls <= 0)
|
||||||
|
return false;
|
||||||
|
if (numGrcls == 1) {
|
||||||
|
// Option 1: (root)
|
||||||
|
// else: return false;
|
||||||
|
|
||||||
|
return formsLegalTshegBar(null,
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start),
|
||||||
|
null, null);
|
||||||
|
} else if (numGrcls == 2) {
|
||||||
|
// Option 1: (prefix, root)
|
||||||
|
// Option 2: (root, suffix)
|
||||||
|
// else: return false;
|
||||||
|
|
||||||
|
return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 1),
|
||||||
|
null,
|
||||||
|
null)
|
||||||
|
|| formsLegalTshegBar(null,
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 1),
|
||||||
|
null));
|
||||||
|
} else if (numGrcls == 3) {
|
||||||
|
// Option 1: (prefix, root, suffix)
|
||||||
|
// Option 2: (root, suffix, postsuffix)
|
||||||
|
// else: return false;
|
||||||
|
|
||||||
|
return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 1),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 2),
|
||||||
|
null)
|
||||||
|
|| formsLegalTshegBar(null,
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 1),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 2)));
|
||||||
|
} else if (numGrcls == 4) {
|
||||||
|
return (formsLegalTshegBar((UnicodeGraphemeCluster)grcls.elementAt(start),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 1),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 2),
|
||||||
|
(UnicodeGraphemeCluster)grcls.elementAt(start + 3)));
|
||||||
|
} else {
|
||||||
|
// the largest has 'i ... DLC FIXME rethink -- even the case where numGrcls == 3 could be pa'am
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
/** Returns true if the given Tibetan consonant stack (i.e., the
|
||||||
|
* combination of superscribed, root, and subscribed letters)
|
||||||
|
* takes an EWC_ga prefix.
|
||||||
|
* @param head the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
|
* not present
|
||||||
|
* @param root the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the root letter
|
||||||
|
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
||||||
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
|
* if not present */
|
||||||
|
static boolean takesGao(char head, char root, char sub) {
|
||||||
|
if (EW_ABSENT == head) {
|
||||||
|
if (EW_ABSENT == sub) {
|
||||||
|
return (EWC_ca == root
|
||||||
|
|| EWC_ta == root
|
||||||
|
|| EWC_da == root
|
||||||
|
|| EWC_tsa == root
|
||||||
|
|| EWC_zha == root
|
||||||
|
|| EWC_za == root
|
||||||
|
|| EWC_ya == root
|
||||||
|
|| EWC_sha == root
|
||||||
|
|| EWC_sa == root
|
||||||
|
|| EWC_nya == root
|
||||||
|
|| EWC_na == root);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true if the given Tibetan consonant stack (i.e., the
|
||||||
|
* combination of superscribed, root, and subscribed letters)
|
||||||
|
* takes an EWC_da prefix.
|
||||||
|
* @param head the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
|
* not present
|
||||||
|
* @param root the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the root letter
|
||||||
|
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
||||||
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
|
* if not present */
|
||||||
|
static boolean takesDao(char head, char root, char sub) {
|
||||||
|
if (EW_ABSENT == head) {
|
||||||
|
if (EW_ABSENT == sub) {
|
||||||
|
return (EWC_ka == root
|
||||||
|
|| EWC_ga == root
|
||||||
|
|| EWC_nga == root
|
||||||
|
|| EWC_pa == root
|
||||||
|
|| EWC_ba == root
|
||||||
|
|| EWC_ma == root);
|
||||||
|
} else {
|
||||||
|
return ((EWC_ga == root && EWC_ya == sub)
|
||||||
|
|| (EWC_pa == root && EWC_ya == sub)
|
||||||
|
|| (EWC_ba == root && EWC_ya == sub)
|
||||||
|
|| (EWC_ma == root && EWC_ya == sub)
|
||||||
|
|
||||||
|
|| (EWC_ka == root && EWC_ra == sub)
|
||||||
|
|| (EWC_ga == root && EWC_ra == sub)
|
||||||
|
|| (EWC_ba == root && EWC_ra == sub)
|
||||||
|
|| (EWC_pa == root && EWC_ra == sub));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true if the given Tibetan consonant stack (i.e., the
|
||||||
|
* combination of superscribed, root, and subscribed letters)
|
||||||
|
* takes an EWC_achung prefix.
|
||||||
|
* @param head the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
|
* not present
|
||||||
|
* @param root the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the root letter
|
||||||
|
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
||||||
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
|
* if not present */
|
||||||
|
static boolean takesAchungPrefix(char head, char root, char sub) {
|
||||||
|
if (EW_ABSENT == head) {
|
||||||
|
if (EW_ABSENT == sub) {
|
||||||
|
return (EWC_ga == root
|
||||||
|
|| EWC_ja == root
|
||||||
|
|| EWC_da == root
|
||||||
|
|| EWC_ba == root
|
||||||
|
|| EWC_dza == root
|
||||||
|
|| EWC_kha == root
|
||||||
|
|| EWC_cha == root
|
||||||
|
|| EWC_tha == root
|
||||||
|
|| EWC_pha == root
|
||||||
|
|| EWC_tsha == root);
|
||||||
|
} else {
|
||||||
|
return ((EWC_pha == root && EWC_ya == sub)
|
||||||
|
|| (EWC_ba == root && EWC_ya == sub)
|
||||||
|
|| (EWC_kha == root && EWC_ya == sub)
|
||||||
|
|| (EWC_ga == root && EWC_ya == sub)
|
||||||
|
|
||||||
|
|| (EWC_ba == root && EWC_ra == sub)
|
||||||
|
|| (EWC_kha == root && EWC_ra == sub)
|
||||||
|
|| (EWC_ga == root && EWC_ra == sub)
|
||||||
|
|| (EWC_da == root && EWC_ra == sub)
|
||||||
|
|| (EWC_pha == root && EWC_ra == sub));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true if the given Tibetan consonant stack (i.e., the
|
||||||
|
* combination of superscribed, root, and subscribed letters)
|
||||||
|
* takes an EWC_ma prefix.
|
||||||
|
* @param head the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
|
* not present
|
||||||
|
* @param root the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the root letter
|
||||||
|
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
||||||
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
|
* if not present */
|
||||||
|
static boolean takesMao(char head, char root, char sub) {
|
||||||
|
if (EW_ABSENT == head) {
|
||||||
|
if (EW_ABSENT == sub) {
|
||||||
|
return (EWC_kha == root
|
||||||
|
|| EWC_ga == root
|
||||||
|
|| EWC_cha == root
|
||||||
|
|| EWC_ja == root
|
||||||
|
|| EWC_tha == root
|
||||||
|
|| EWC_tsha == root
|
||||||
|
|| EWC_da == root
|
||||||
|
|| EWC_dza == root
|
||||||
|
|| EWC_nga == root
|
||||||
|
|| EWC_nya == root
|
||||||
|
|| EWC_na == root);
|
||||||
|
} else {
|
||||||
|
return ((EWC_kha == root && EWC_ya == sub)
|
||||||
|
|| (EWC_ga == root && EWC_ya == sub)
|
||||||
|
|
||||||
|
|| (EWC_kha == root && EWC_ra == sub)
|
||||||
|
|| (EWC_ga == root && EWC_ra == sub));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true if the given Tibetan consonant stack (i.e., the
|
||||||
|
* combination of superscribed, root, and subscribed letters)
|
||||||
|
* takes an EWC_ba prefix.
|
||||||
|
* @param head the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
|
* not present
|
||||||
|
* @param root the {@link
|
||||||
|
* isNominalRepresentationOfConsonant(char) nominal
|
||||||
|
* representation} of the root letter
|
||||||
|
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
||||||
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
|
* if not present */
|
||||||
|
static boolean takesBao(char head, char root, char sub) {
|
||||||
|
// DLC ask Ten-lo la about Wazur.
|
||||||
|
if (EW_ABSENT == head) {
|
||||||
|
if (EW_ABSENT == sub) {
|
||||||
|
return (EWC_ka == root
|
||||||
|
|| EWC_ca == root
|
||||||
|
|| EWC_ta == root
|
||||||
|
|| EWC_tsa == root
|
||||||
|
|| EWC_ga == root
|
||||||
|
|| EWC_nga == root
|
||||||
|
|| EWC_ja == root
|
||||||
|
|| EWC_nya == root
|
||||||
|
|| EWC_da == root
|
||||||
|
|| EWC_na == root
|
||||||
|
|| EWC_dza == root
|
||||||
|
|| EWC_zha == root
|
||||||
|
|| EWC_za == root
|
||||||
|
|| EWC_ra == root
|
||||||
|
|| EWC_la == root
|
||||||
|
|| EWC_sha == root);
|
||||||
|
} else {
|
||||||
|
// kra, e.g.
|
||||||
|
return ((EWC_ka == root && EWC_ya == sub)
|
||||||
|
|| (EWC_ga == root && EWC_ya == sub)
|
||||||
|
|
||||||
|
|| (EWC_ka == root && EWC_ra == sub)
|
||||||
|
|| (EWC_ga == root && EWC_ra == sub)
|
||||||
|
|| (EWC_sa == root && EWC_ra == sub)
|
||||||
|
|
||||||
|
|| (EWC_ka == root && EWC_la == sub)
|
||||||
|
|| (EWC_za == root && EWC_la == sub)
|
||||||
|
|| (EWC_ra == root && EWC_la == sub)
|
||||||
|
|| (EWC_sa == root && EWC_la == sub));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (EW_ABSENT == sub) {
|
||||||
|
// ska, e.g.
|
||||||
|
return ((EWC_sa == head && EWC_ka == root)
|
||||||
|
|| (EWC_sa == head && EWC_ga == root)
|
||||||
|
|| (EWC_sa == head && EWC_nga == root)
|
||||||
|
|| (EWC_sa == head && EWC_nya == root)
|
||||||
|
|| (EWC_sa == head && EWC_ta == root)
|
||||||
|
|| (EWC_sa == head && EWC_da == root)
|
||||||
|
|| (EWC_sa == head && EWC_na == root)
|
||||||
|
|| (EWC_sa == head && EWC_tsa == root)
|
||||||
|
|
||||||
|
|| (EWC_ra == head && EWC_ka == root)
|
||||||
|
|| (EWC_ra == head && EWC_ga == root)
|
||||||
|
|| (EWC_ra == head && EWC_nga == root)
|
||||||
|
|| (EWC_ra == head && EWC_ja == root)
|
||||||
|
|| (EWC_ra == head && EWC_nya == root)
|
||||||
|
|| (EWC_ra == head && EWC_ta == root)
|
||||||
|
|| (EWC_ra == head && EWC_da == root)
|
||||||
|
|| (EWC_ra == head && EWC_na == root)
|
||||||
|
|| (EWC_ra == head && EWC_tsa == root)
|
||||||
|
|| (EWC_ra == head && EWC_dza == root)
|
||||||
|
|
||||||
|
|| (EWC_la == head && EWC_ta == root)
|
||||||
|
|| (EWC_la == head && EWC_da == root));
|
||||||
|
} else {
|
||||||
|
return ((EWC_ra == head && EWC_ka == root && EWC_ya == sub)
|
||||||
|
|| (EWC_ra == head && EWC_ga == root && EWC_ya == sub)
|
||||||
|
|| (EWC_sa == head && EWC_ka == root && EWC_ya == sub)
|
||||||
|
|| (EWC_sa == head && EWC_ga == root && EWC_ya == sub)
|
||||||
|
|| (EWC_sa == head && EWC_ka == root && EWC_ra == sub)
|
||||||
|
|| (EWC_sa == head && EWC_ga == root && EWC_ra == sub));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -279,4 +279,77 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
|
||||||
|
|
||||||
assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
|
assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Tests that the rules concerning "which root letters take which
|
||||||
|
* prefixes?" are accurate. I got a list of such rules from a
|
||||||
|
* native Tibetan who has been kind enough to teach me the
|
||||||
|
* fundamentals of the Tibetan language, but I'm not sure where he
|
||||||
|
* got the list.
|
||||||
|
*/
|
||||||
|
public void testPrefixRules() {
|
||||||
|
// DLC FIXME how can we say that 0Fb2 is ok but 0fBc is not?
|
||||||
|
assertTrue(LegalTshegBar.takesBao(EWC_sa, EWC_ka, EWC_ra));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao('\u0FB6', EWC_ka, EWC_ra));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', EWC_ra));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao(EWC_sa, '\u0F90', '\u0FB2'));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao('\u0FB6', '\u0F90', EWC_ra));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao(EWC_sa, EWC_ka, '\u0FB2'));
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ka, EW_ABSENT));
|
||||||
|
assertTrue(LegalTshegBar.takesBao(EWC_la, EWC_da, EW_ABSENT));
|
||||||
|
assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_sa, EWC_ra));
|
||||||
|
assertTrue(LegalTshegBar.takesBao(EW_ABSENT, EWC_ga, EWC_ra));
|
||||||
|
assertTrue(LegalTshegBar.takesBao(EWC_ra, EWC_ga, EWC_ya));
|
||||||
|
|
||||||
|
assertTrue(!LegalTshegBar.takesBao(EWC_ra, EWC_da, EWC_ya));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_ba, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao(EWC_la, EWC_nga, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesBao(EW_ABSENT, EWC_nga, EWC_ra));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
assertTrue(LegalTshegBar.takesGao(EW_ABSENT, EWC_ca, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesGao(EW_ABSENT, EWC_ka, EWC_ya));
|
||||||
|
assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesGao(EWC_ra, EWC_ka, EWC_ya));
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
{
|
||||||
|
assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_wa, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesDao(EW_ABSENT, EWC_nga, EWC_ya));
|
||||||
|
assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesDao(EWC_ra, EWC_ga, EWC_ya));
|
||||||
|
|
||||||
|
assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ga, EWC_ya));
|
||||||
|
assertTrue(LegalTshegBar.takesDao(EW_ABSENT, EWC_ka, EWC_ra));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_ja, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_wa, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesMao(EW_ABSENT, EWC_nga, EWC_ya));
|
||||||
|
assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesMao(EWC_ra, EWC_ga, EWC_ya));
|
||||||
|
|
||||||
|
assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ya));
|
||||||
|
assertTrue(LegalTshegBar.takesMao(EW_ABSENT, EWC_kha, EWC_ra));
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ga, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ka, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_wa, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_nga, EWC_ya));
|
||||||
|
assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EW_ABSENT));
|
||||||
|
assertTrue(!LegalTshegBar.takesAchungPrefix(EWC_ra, EWC_ga, EWC_ya));
|
||||||
|
|
||||||
|
assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_ba, EWC_ya));
|
||||||
|
assertTrue(LegalTshegBar.takesAchungPrefix(EW_ABSENT, EWC_pha, EWC_ra));
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,51 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
/** Thrown to say "this isn't legal Tibetan", as opposed to "this
 *  isn't a valid sequence of Unicode".  (DLC FIXMEDOC) */
class TibetanSyntaxException extends Exception {
    /** This constructor creates an exception with a less than helpful
     *  message for the end user.  Please don't use this constructor
     *  for production code. */
    TibetanSyntaxException() {
        super("A Unicode input stream had a syntactically incorrect run of Tibetan. For example, kha, i.e., U+0F41, is not an allowed prefix. This run of Tibetan was not expected.");
        // we can tell it wasn't expected, because this error message
        // isn't very helpful, and one of the other constructors
        // should've been used.
    }

    /** Creates an exception carrying the given detail message.
     *  @param x the detail message */
    TibetanSyntaxException(String x) {
        super(x);
    }

    /** Creates an exception whose message names the offending range
     *  and lists the clusters in it.

        @param grcls a Vector whose elements x are GraphemeClusters
        where x is in the range [start, end)
        @param start grcls.elementAt(start) is the first
        GraphemeCluster in the syntactically incorrect stretch of
        Tibetan.
        @param end grcls.elementAt(end - 1) is the last
        GraphemeCluster in the syntactically incorrect stretch of
        Tibetan. */
    TibetanSyntaxException(java.util.Vector grcls, int start, int end) {
        // Vector is spelled out in full because this file has no
        // import for it.
        super(describeRun(grcls, start, end));
    }

    /** Builds the detail message for the range-based constructor.
     *  Never throws: a null Vector or indices outside the Vector
     *  simply result in fewer (or no) clusters being listed, so
     *  that constructing the exception cannot itself fail. */
    private static String describeRun(java.util.Vector grcls,
                                      int start, int end) {
        StringBuffer msg
            = new StringBuffer("A Unicode input stream had a syntactically incorrect run of Tibetan in grapheme clusters [");
        msg.append(start).append(", ").append(end).append(")");
        if (null != grcls) {
            int from = Math.max(start, 0);
            int to = Math.min(end, grcls.size());
            if (from < to)
                msg.append(":");
            for (int i = from; i < to; i++) {
                // relies on the element's toString()
                msg.append(' ').append(grcls.elementAt(i));
            }
        }
        return msg.toString();
    }
}
|
58
source/org/thdl/tib/text/tshegbar/TransitionInstruction.java
Normal file
58
source/org/thdl/tib/text/tshegbar/TransitionInstruction.java
Normal file
|
@ -0,0 +1,58 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
|
||||||
|
/** DLC FIXMEDOC */
|
||||||
|
class TransitionInstruction implements UnicodeReadingStateMachineConstants {
|
||||||
|
private TransitionInstruction() { super(); }
|
||||||
|
TransitionInstruction(int nextState, int action) {
|
||||||
|
super();
|
||||||
|
|
||||||
|
assert(action == ACTION_CONTINUES_GRAPHEME_CLUSTER
|
||||||
|
|| action == ACTION_BEGINS_NEW_GRAPHEME_CLUSTER
|
||||||
|
|| action == ACTION_PREPEND_WITH_0F68);
|
||||||
|
|
||||||
|
assert(nextState == STATE_START
|
||||||
|
|| nextState == STATE_READY
|
||||||
|
|| nextState == STATE_DIGIT
|
||||||
|
|| nextState == STATE_STACKING
|
||||||
|
|| nextState == STATE_STACKPLUSACHUNG
|
||||||
|
|| nextState == STATE_PARTIALMARK);
|
||||||
|
|
||||||
|
// we start in the start state, but we can never return to it.
|
||||||
|
assert(nextState != STATE_START);
|
||||||
|
|
||||||
|
this.nextState = nextState;
|
||||||
|
this.action = action;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** the state (e.g., {@link #STATE_READY}) to which to transition
|
||||||
|
* next */
|
||||||
|
private int nextState;
|
||||||
|
|
||||||
|
/** the action to perform upon transition, either {@link
|
||||||
|
* #ACTION_CONTINUES_GRAPHEME_CLUSTER}, {@link
|
||||||
|
* #ACTION_BEGINS_NEW_GRAPHEME_CLUSTER}, or {@link
|
||||||
|
* #ACTION_PREPEND_WITH_0F68} */
|
||||||
|
private int action;
|
||||||
|
|
||||||
|
int getAction() { return action; }
|
||||||
|
int getNextState() { return nextState; }
|
||||||
|
}
|
|
@ -0,0 +1,174 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
/** Constants and static routines (DLC still?) useful in writing state
|
||||||
|
* machines for transforming Unicode input into other forms.
|
||||||
|
*
|
||||||
|
* @author David Chandler
|
||||||
|
*/
|
||||||
|
interface UnicodeReadingStateMachineConstants {
|
||||||
|
|
||||||
|
/** Returns the codepoint class for cp, e.g. {@link #CC_SJC}.
|
||||||
|
* @param cp a Unicode codepoint, which MUST be nondecomposable
|
||||||
|
* if it is in the Tibetan range but can be from outside the
|
||||||
|
* Tibetan range of Unicode */
|
||||||
|
static int getCCForCP(char cp) {
|
||||||
|
assert(getNFTHDL(cp) == null);
|
||||||
|
if ('\u0F82' == cp) {
|
||||||
|
return CC_0F82;
|
||||||
|
} else if ('\u0F8A' == cp) {
|
||||||
|
return CC_0F8A;
|
||||||
|
} else if ('\u0F39' == cp) {
|
||||||
|
return CC_0F39;
|
||||||
|
} else if ('\u0F71' == cp) {
|
||||||
|
return CC_ACHUNG;
|
||||||
|
} else if ('\u0F40' <= cp && cp <= '\u0F6A') {
|
||||||
|
assert(cp != '\u0F48');
|
||||||
|
return CC_CON;
|
||||||
|
} else if ('\u0F90' <= cp && cp <= '\u0FBC') {
|
||||||
|
assert(cp != '\u0F98');
|
||||||
|
return CC_SJC;
|
||||||
|
} else if ('\u0F20' <= cp && cp <= '\u0F33') {
|
||||||
|
return CC_DIGIT;
|
||||||
|
} else if (/* DLC NOW do these combine ONLY with digits, or do CC_CM just NOT combine with digits? */
|
||||||
|
'\u0F3E' == cp
|
||||||
|
|| '\u0F3F' == cp
|
||||||
|
|| '\u0F18' == cp
|
||||||
|
|| '\u0F19' == cp) {
|
||||||
|
return CC_MCWD;
|
||||||
|
} else if ('\u0FC6' == cp
|
||||||
|
|| '\u0F87' == cp
|
||||||
|
|| '\u0F86' == cp
|
||||||
|
|| '\u0F84' == cp
|
||||||
|
|| '\u0F83' == cp
|
||||||
|
|| '\u0F82' == cp
|
||||||
|
|| '\u0F7F' == cp
|
||||||
|
|| '\u0F7E' == cp
|
||||||
|
|| '\u0F37' == cp /* DLC NOW NORMALIZATION OF 0F10, 11 to 0F0F ??? */
|
||||||
|
|| '\u0F35' == cp) {
|
||||||
|
return CC_CM;
|
||||||
|
} else if ('\u0F72' == cp
|
||||||
|
|| '\u0F74' == cp
|
||||||
|
|| '\u0F7A' == cp
|
||||||
|
|| '\u0F7B' == cp
|
||||||
|
|| '\u0F7C' == cp
|
||||||
|
|| '\u0F7D' == cp
|
||||||
|
|| '\u0F80' == cp) {
|
||||||
|
// DLC what about U+0F84 ??? CC_V or CC_CM ?
|
||||||
|
return CC_V;
|
||||||
|
} else {
|
||||||
|
return CC_SIN;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// codepoint classes (CC_...) follow. These are mutually
|
||||||
|
// exclusive, and their union is the whole of Unicode.
|
||||||
|
|
||||||
|
/** for everything else, i.e. non-Tibetan characters like U+0E00
|
||||||
|
* and also Tibetan characters like U+0FCF and U+0F05 (DLC rename
|
||||||
|
* SIN[GLETON] to OTHER as combining marks from outside the
|
||||||
|
* Tibetan range count as this) but not U+0F8A */
|
||||||
|
static final int CC_SIN = 0;
|
||||||
|
|
||||||
|
/** for combining marks in the Tibetan range of Unicode that
|
||||||
|
* combine with digits alone */
|
||||||
|
static final int CC_MCWD = 1;
|
||||||
|
|
||||||
|
/** for combining marks in the Tibetan range of Unicode, minus
|
||||||
|
* CC_MCWD, U+0F82, and U+0F39 */
|
||||||
|
static final int CC_CM = 2;
|
||||||
|
|
||||||
|
/** for combining consonants, i.e. U+0F90-U+0FBC minus U+0F98
|
||||||
|
* minus the decomposable entries like U+0F93, U+0F9D, U+0FA2,
|
||||||
|
* etc. */
|
||||||
|
static final int CC_SJC = 3;
|
||||||
|
|
||||||
|
/** for noncombining consonants, i.e. U+0F40-U+0F6A minus U+0F48
|
||||||
|
* minus the decomposable entries like U+0F43, U+0F4D, U+0F52,
|
||||||
|
* etc. */
|
||||||
|
static final int CC_CON = 4;
|
||||||
|
|
||||||
|
/** for simple, nondecomposable vowels, i.e. U+0F72, U+0F74,
|
||||||
|
* U+0F7A, U+0F7B, U+0F7C, U+0F7D, U+0F80 */
|
||||||
|
static final int CC_V = 5;
|
||||||
|
|
||||||
|
/** for U+0F8A */
|
||||||
|
static final int CC_0F8A = 6;
|
||||||
|
|
||||||
|
/** for U+0F82, which is treated like {@link #CC_CM} except after
|
||||||
|
* U+0F8A */
|
||||||
|
static final int CC_0F82 = 7;
|
||||||
|
|
||||||
|
/** for U+0F39, an integral part of a consonant when it directly
|
||||||
|
* follows a member of CM_CONS or CM_SJC */
|
||||||
|
static final int CC_0F39 = 8;
|
||||||
|
|
||||||
|
/** for U+0F71 */
|
||||||
|
static final int CC_ACHUNG = 9;
|
||||||
|
|
||||||
|
/** for digits, i.e. U+0F20-U+0F33 */
|
||||||
|
static final int CC_DIGIT = 10;
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// states STATE_...:
|
||||||
|
|
||||||
|
/** initial state */
|
||||||
|
static final int STATE_START = 0;
|
||||||
|
|
||||||
|
/** ready state, i.e. the state in which some non-empty Unicode
|
||||||
|
* String is in the holding area, <i>ready</i> to receive
|
||||||
|
* combining marks like U+0F35 */
|
||||||
|
static final int STATE_READY = 1;
|
||||||
|
|
||||||
|
/** digit state, i.e. the state in which some non-empty Unicode
|
||||||
|
* String consisting entirely of digits is in the holding area,
|
||||||
|
* ready to receive marks that combine only with digits */
|
||||||
|
static final int STATE_DIGIT = 2;
|
||||||
|
|
||||||
|
/** state in which CC_SJC are welcomed and treated as consonants
|
||||||
|
* to be subscribed to the GraphemeCluster in holding. */
|
||||||
|
static final int STATE_STACKING = 3;
|
||||||
|
|
||||||
|
/** state in which one or more consonants have been seen and also
|
||||||
|
* an achung (U+0F71) has been seen */
|
||||||
|
static final int STATE_STACKPLUSACHUNG = 4;
|
||||||
|
|
||||||
|
/** state that seeing U+0F8A (when that's not an error) puts you
|
||||||
|
* in. Needed because U+0F8A is always followed by U+0F82, and
|
||||||
|
* we check for the exceptional case that U+0F8A is followed by
|
||||||
|
* something else. */
|
||||||
|
static final int STATE_PARTIALMARK = 5;
|
||||||
|
|
||||||
|
/* DLC we should have many error states or none. */
|
||||||
|
|
||||||
|
|
||||||
|
/** the present codepoint marks the start of a new
|
||||||
|
* GraphemeCluster */
|
||||||
|
static final int ACTION_BEGINS_NEW_GRAPHEME_CLUSTER = 0;
|
||||||
|
/** the present codepoint is a continuation of the current
|
||||||
|
* GraphemeCluster */
|
||||||
|
static final int ACTION_CONTINUES_GRAPHEME_CLUSTER = 1;
|
||||||
|
/** there is an error in the input stream, which we are correcting
|
||||||
|
* (as we are in error-correcting mode) by starting a new
|
||||||
|
* GraphemeCluster with U+0F68 as the first codepoint and the
|
||||||
|
* current codepoint as the second */
|
||||||
|
static final int ACTION_PREPEND_WITH_0F68 = 2;
|
||||||
|
}
|
345
source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
Normal file
345
source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
Normal file
|
@ -0,0 +1,345 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.tshegbar;

import java.util.Vector;
|
||||||
|
|
||||||
|
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
|
/** Don't instantiate this class; it only has static members. */
private ValidatingUnicodeReader() { super(); }
|
||||||
|
|
||||||
|
/** This table tells how to transition from state a 6 states + error state */
|
||||||
|
private static final TransitionInstruction
|
||||||
|
transitionTable[6 /* number of STATEs */]
|
||||||
|
[11 /* number of CC classes */]
|
||||||
|
= {
|
||||||
|
// STATE_START:
|
||||||
|
{
|
||||||
|
/* upon seeing CC_SIN in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_MCWD in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CM in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_SJC in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CON in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_V in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_0F8A in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F82 in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_0F39 in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_ACHUNG in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||||
|
},
|
||||||
|
|
||||||
|
// STATE_READY:
|
||||||
|
{
|
||||||
|
/* upon seeing CC_SIN in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY, // self
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_MCWD in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CM in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_SJC in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CON in this state: */
|
||||||
|
new TransitionInstruction(STATE_STACKING,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_V in this state: */
|
||||||
|
null
|
||||||
|
/* upon seeing CC_0F8A in this state: */
|
||||||
|
new TransitionInstruction(STATE_PARTIALMARK,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F82 in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F39 in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_ACHUNG in this state: */
|
||||||
|
null, // because 0F71 comes after SJCs, before Vs, and
|
||||||
|
// before CMs.
|
||||||
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||||
|
},
|
||||||
|
// STATE_DIGIT:
|
||||||
|
{
|
||||||
|
/* upon seeing CC_SIN in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_MCWD in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_CM in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_SJC in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CON in this state: */
|
||||||
|
new TransitionInstruction(STATE_STACKING,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_V in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_0F8A in this state: */
|
||||||
|
new TransitionInstruction(STATE_PARTIALMARK,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F82 in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F39 in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_ACHUNG in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
|
||||||
|
},
|
||||||
|
// STATE_STACKING:
|
||||||
|
{
|
||||||
|
/* upon seeing CC_SIN in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_MCWD in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CM in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_SJC in this state: */
|
||||||
|
new TransitionInstruction(STATE_STACKING,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_CON in this state: */
|
||||||
|
new TransitionInstruction(STATE_STACKING,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_V in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F8A in this state: */
|
||||||
|
new TransitionInstruction(STATE_PARTIALMARK,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F82 in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F39 in this state: */
|
||||||
|
new TransitionInstruction(STATE_STACKING,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_ACHUNG in this state: */
|
||||||
|
new TransitionInstruction(STATE_STACKPLUSACHUNG,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||||
|
},
|
||||||
|
// STATE_STACKPLUSACHUNG:
|
||||||
|
{
|
||||||
|
/* upon seeing CC_SIN in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_MCWD in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CM in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_SJC in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CON in this state: */
|
||||||
|
new TransitionInstruction(STATE_STACKING,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_V in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F8A in this state: */
|
||||||
|
new TransitionInstruction(STATE_PARTIALMARK,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F82 in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F39 in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_ACHUNG in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||||
|
},
|
||||||
|
// STATE_PARTIALMARK:
|
||||||
|
{
|
||||||
|
/* upon seeing CC_SIN in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_MCWD in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CM in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_SJC in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_CON in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_V in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_0F8A in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_0F82 in this state: */
|
||||||
|
new TransitionInstruction(STATE_READY,
|
||||||
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
|
/* upon seeing CC_0F39 in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_ACHUNG in this state: */
|
||||||
|
null,
|
||||||
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
null
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// DLC NOW -- clearly, we need LegalSyllable to be convertible to and from GraphemeClusters.
|
||||||
|
|
||||||
|
/** Breaks a sequence of GraphemeClusters into LegalSyllables.
|
||||||
|
@param grcls a sequence of nonnull GraphemeClusters
|
||||||
|
@return a sequence of nonnull LegalSyllables
|
||||||
|
@exception TibetanSyntaxException if grcls does not consist
|
||||||
|
entirely of legal Tibetan syllables
|
||||||
|
@see #GraphemeCluster
|
||||||
|
@see #LegalSyllable
|
||||||
|
*/
|
||||||
|
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
|
||||||
|
throws TibetanSyntaxException
|
||||||
|
{
|
||||||
|
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
|
||||||
|
true);
|
||||||
|
}
|
||||||
|
|
||||||
|
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls) {
|
||||||
|
try {
|
||||||
|
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
|
||||||
|
false);
|
||||||
|
} catch (TibetanSyntaxException) {
|
||||||
|
throw new Error("This can never happen, because the second parameter, validating, was false.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
@param grcls a Vector consisting entirely of GraphemeClusters
|
||||||
|
@param validate true iff you wish to have a
|
||||||
|
TibetanSyntaxException thrown upon encountering a sequence of
|
||||||
|
GraphemeClusters that is syntactically incorrect Tibetan
|
||||||
|
@return if validate is true, a Vector consisting entirely of
|
||||||
|
LegalSyllables, else a vector of LegalSyllables and
|
||||||
|
GraphemeClusters */
|
||||||
|
private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
|
||||||
|
boolean validate)
|
||||||
|
throws TibetanSyntaxException
|
||||||
|
{
|
||||||
|
Vector syllables = new Vector();
|
||||||
|
int grcls_len = grcls.length();
|
||||||
|
int beginning_of_cluster = 0;
|
||||||
|
for (int i = 0; i < grcls_len; i++) {
|
||||||
|
GraphemeCluster current_grcl
|
||||||
|
= (GraphemeCluster)grcls.elementAt(i);
|
||||||
|
if (current_grcl.isTshegLike()) {
|
||||||
|
if (beginning_of_cluster < i) {
|
||||||
|
// One or more non-tsheg-like grapheme clusters is
|
||||||
|
// here between tsheg-like grapheme clusters. Is
|
||||||
|
// it a legal syllable?
|
||||||
|
if (LegalTshegBar.formsLegalTshegBar(grcls,
|
||||||
|
beginning_of_cluster,
|
||||||
|
i))
|
||||||
|
{
|
||||||
|
syllables.add(new LegalSyllable(grcls,
|
||||||
|
beginning_of_cluster,
|
||||||
|
i, tsheg=current_grcl));
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if (validating) {
|
||||||
|
TibetanSyntaxException ex
|
||||||
|
= new TibetanSyntaxException(grcls,
|
||||||
|
beginning_of_cluster,
|
||||||
|
i);
|
||||||
|
// DLC: return an int -1 for "all good" or
|
||||||
|
// 3 for "the fourth element is the first
|
||||||
|
// bad one" but then you don't know that
|
||||||
|
// 3-6 were the bad ones
|
||||||
|
throw ex;
|
||||||
|
} else {
|
||||||
|
for (int j = beginning_of_cluster; j <= i; j++) {
|
||||||
|
syllables.add(grcls.elementAt(j));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
beginning_of_cluster = i + 1;
|
||||||
|
} // else add current_grcl to the waiting list, in a sense
|
||||||
|
}
|
||||||
|
return syllables;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Breaks a string of perfectly-formed Unicode into
|
||||||
|
GraphemeClusters.
|
||||||
|
@param nfthdl_unicode a String of NFTHDL-normalized Unicode
|
||||||
|
codepoints
|
||||||
|
@exception Exception if the input is not perfectly formed
|
||||||
|
@return a vector of GraphemeClusters
|
||||||
|
@see #GraphemeCluster
|
||||||
|
*/
|
||||||
|
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
|
||||||
|
throws Exception
|
||||||
|
{
|
||||||
|
// a vector of GraphemeClusters that we build up little by
|
||||||
|
// little:
|
||||||
|
Vector grcls = new Vector();
|
||||||
|
int currentState = STATE_START;
|
||||||
|
StringBuffer holdingPen = new StringBuffer();
|
||||||
|
|
||||||
|
int ilen = nfthdl_unicode.length();
|
||||||
|
for (int i = 0; i < ilen; i++) {
|
||||||
|
char current_cp = nfthdl_unicode.charAt(i);
|
||||||
|
int cc_of_current_cp = getCCForCP(current_cp);
|
||||||
|
final TransitionInstruction ti
|
||||||
|
= transitionTable[currentState][cc_of_current_cp];
|
||||||
|
if (null == ti) {
|
||||||
|
throw new Exception("Bad Unicode. DLC improve these messages");
|
||||||
|
} else {
|
||||||
|
switch (ti.getAction()) {
|
||||||
|
case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
|
||||||
|
grcls.add(new GraphemeCluster(holdingPen));
|
||||||
|
holdingPen = new StringBuffer();
|
||||||
|
break;
|
||||||
|
case ACTION_CONTINUES_GRAPHEME_CLUSTER:
|
||||||
|
holdingString.append(current_cp);
|
||||||
|
break;
|
||||||
|
case ACTION_PREPEND_WITH_0F68:
|
||||||
|
throw new Error("This never happens inside the validating scanner.");
|
||||||
|
default:
|
||||||
|
throw new Error("Famous last words: This won't happen.");
|
||||||
|
}
|
||||||
|
currentState = ti.getNextState();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return grcls;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,195 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
/** Tests ValidatingUnicodeReader.
|
||||||
|
* @author David Chandler */
|
||||||
|
class ValidatingUnicodeReaderTest {
|
||||||
|
private static String skyagd = "\u0F66\u0F90\u0FB1\u0F42\u0F51";
|
||||||
|
private static String bskyagd = "\u0F56" + skyagd;
|
||||||
|
|
||||||
|
void testValidatingUnicodeReader() {
|
||||||
|
// DLC these routines can be slow.
|
||||||
|
assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
|
||||||
|
bskyagd + "\u0F0C"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
|
||||||
|
"\u0F42" + skyagd + "\u0F0C"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
bskyagd + "\u0F0C"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F42" + skyagd + "\u0F0C"));
|
||||||
|
|
||||||
|
assertTrue(ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
|
||||||
|
bskyagd + "\u0F0C\u0F62\u0F0B" + bskyagd + "\u0F0F"));
|
||||||
|
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F6A\u0F0B"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F62\u0F0B"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
|
||||||
|
"\u0F6A\u0F0B"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
|
||||||
|
"\u0F62\u0F0B"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F6A\u0F90\u0F0B"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F62\u0F90\u0F0B"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
|
||||||
|
"\u0F62\u0F90\u0F0B"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
|
||||||
|
"\u0F6A\u0F90\u0F0B"));
|
||||||
|
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F43"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isSyntacticallyLegalTibetanUnicode(
|
||||||
|
"\u0F43"));
|
||||||
|
|
||||||
|
// The Unicode standard states that U+0F8A is always followed
|
||||||
|
// by U+0F82.
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F8A\u0F82"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F8A"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F8A\u0F40"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F8A\u0F83"));
|
||||||
|
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F74"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isPerfectUnicode(
|
||||||
|
"\u0F40\u0F74"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F90\u0F74"));
|
||||||
|
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F40\u0F77"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isPerfectUnicode(
|
||||||
|
"\u0F40\u0F77"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F90\u0F77"));
|
||||||
|
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F40\u0F90\u0F7F"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F40\u0F90\u0F7F\u0F35"));
|
||||||
|
|
||||||
|
// Test that each singleton (except U+0F8A) in the Tibetan
|
||||||
|
// range is legal, and that each combining char and empty
|
||||||
|
// codepoint (and also U+0F8A) is illegal alone.
|
||||||
|
{
|
||||||
|
for (char cp = '\u0F00'; cp <= '\u0F17'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
for (char cp = '\u0F1a'; cp <= '\u0F34'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
for (char cp = '\u0F3a'; cp <= '\u0F3d'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
for (char cp = '\u0F40'; cp <= '\u0F47'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
for (char cp = '\u0F49'; cp <= '\u0F6a'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
for (char cp = '\u0F88'; cp <= '\u0F89'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
for (char cp = '\u0Fbe'; cp <= '\u0Fc5'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
for (char cp = '\u0Fc7'; cp <= '\u0Fcc'; cp++)
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(cp));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F36"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F38"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F85"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0F8b"));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcf"));
|
||||||
|
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F48"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6b"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6c"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6d"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6e"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F6f"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F70"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8c"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8d"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8e"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F8f"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0F98"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fbd"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fcd"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fce"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fd0"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fe4"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Ff0"));
|
||||||
|
assertTrue(!ValidatingUnicodeReader.isFullyValidUnicode("\u0Fff"));
|
||||||
|
}
|
||||||
|
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F40\u0Fc6"));
|
||||||
|
|
||||||
|
// Test that combining characters that combine with both
|
||||||
|
// consonants and digits work.
|
||||||
|
{
|
||||||
|
String combiningMarks[] = new String[] {
|
||||||
|
"\u0F71",
|
||||||
|
"\u0F72",
|
||||||
|
"\u0F73",
|
||||||
|
"\u0F74",
|
||||||
|
"\u0F75",
|
||||||
|
"\u0F76",
|
||||||
|
"\u0F77",
|
||||||
|
"\u0F78",
|
||||||
|
"\u0F79",
|
||||||
|
"\u0F7a",
|
||||||
|
"\u0F7b",
|
||||||
|
"\u0F7c",
|
||||||
|
"\u0F7d",
|
||||||
|
"\u0F7e",
|
||||||
|
"\u0F7f",
|
||||||
|
"\u0F80",
|
||||||
|
"\u0F81",
|
||||||
|
"\u0F82",
|
||||||
|
"\u0F83",
|
||||||
|
"\u0F84",
|
||||||
|
"\u0F86",
|
||||||
|
"\u0F87"
|
||||||
|
};
|
||||||
|
for (int i = 0; i < combiningMarks.length(); i++) {
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F40" + combiningMarks[i]));
|
||||||
|
// DLC have a group that works with both digits and consonants, cuz vowels plus digits is a no go, right?
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F20" + combiningMarks[i]));
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F30" + combiningMarks[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
DLC;
|
||||||
|
assertTrue(ValidatingUnicodeReader.isFullyValidUnicode(
|
||||||
|
"\u0F\u0F\u0F\u0F\u0F"));
|
||||||
|
}
|
||||||
|
|
||||||
|
void testSyntacticallyLegalUnicodeToThdlWylie() {
|
||||||
|
assertTrue("bskyagd"
|
||||||
|
.equals(ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
|
||||||
|
bskyagd)));
|
||||||
|
|
||||||
|
assertTrue("bskyagd bskyagd/"
|
||||||
|
.equals(ValidatingUnicodeReader.syntacticallyLegalTibetanUnicodeToThdlWylie(
|
||||||
|
bskyagd + "\u0F0B" + bskyagd + "\u0F0D")));
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue