/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/
|
|
|
|
|
|
|
|
package org.thdl.tib.text.tshegbar;
|
|
|
|
|
2003-04-13 01:46:20 +00:00
|
|
|
import java.util.Vector;
|
|
|
|
|
2003-04-12 20:56:20 +00:00
|
|
|
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
|
|
|
/** Don't instantiate this class. */
|
2003-04-13 01:46:20 +00:00
|
|
|
private ValidatingUnicodeReader() { super(); }
|
2003-04-12 20:56:20 +00:00
|
|
|
|
2003-04-13 01:46:20 +00:00
|
|
|
/** This table tells how to transition from state to state upon
|
|
|
|
* encountering certain classes of Unicode codepoints. There are
|
|
|
|
* 6 legal states + an error state. */
|
2003-04-12 20:56:20 +00:00
|
|
|
private static final TransitionInstruction
|
2003-04-13 01:46:20 +00:00
|
|
|
transitionTable[/* 6 is the number of STATEs */][/* 11 is the number of CC classes */]
|
2003-04-12 20:56:20 +00:00
|
|
|
= {
|
|
|
|
// STATE_START:
|
|
|
|
{
|
|
|
|
/* upon seeing CC_SIN in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_MCWD in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CM in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_SJC in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CON in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_V in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_0F8A in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F82 in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_0F39 in this state: */
|
|
|
|
null,
|
2003-04-13 01:46:20 +00:00
|
|
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
2003-04-12 20:56:20 +00:00
|
|
|
null,
|
|
|
|
/* upon seeing CC_DIGIT in this state: */
|
|
|
|
new TransitionInstruction(STATE_DIGIT,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
|
|
|
},
|
|
|
|
|
|
|
|
// STATE_READY:
|
|
|
|
{
|
|
|
|
/* upon seeing CC_SIN in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY, // self
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_MCWD in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CM in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_SJC in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CON in this state: */
|
|
|
|
new TransitionInstruction(STATE_STACKING,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_V in this state: */
|
2003-04-13 01:46:20 +00:00
|
|
|
null,
|
2003-04-12 20:56:20 +00:00
|
|
|
/* upon seeing CC_0F8A in this state: */
|
|
|
|
new TransitionInstruction(STATE_PARTIALMARK,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F82 in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F39 in this state: */
|
|
|
|
null,
|
2003-04-13 01:46:20 +00:00
|
|
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
2003-04-12 20:56:20 +00:00
|
|
|
null, // because 0F71 comes after SJCs, before Vs, and
|
|
|
|
// before CMs.
|
|
|
|
/* upon seeing CC_DIGIT in this state: */
|
|
|
|
new TransitionInstruction(STATE_DIGIT,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
|
|
|
},
|
|
|
|
// STATE_DIGIT:
|
|
|
|
{
|
|
|
|
/* upon seeing CC_SIN in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_MCWD in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_CM in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_SJC in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CON in this state: */
|
|
|
|
new TransitionInstruction(STATE_STACKING,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_V in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_0F8A in this state: */
|
|
|
|
new TransitionInstruction(STATE_PARTIALMARK,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F82 in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F39 in this state: */
|
|
|
|
null,
|
2003-04-13 01:46:20 +00:00
|
|
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
2003-04-12 20:56:20 +00:00
|
|
|
null,
|
|
|
|
/* upon seeing CC_DIGIT in this state: */
|
|
|
|
new TransitionInstruction(STATE_DIGIT,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
|
|
|
|
},
|
|
|
|
// STATE_STACKING:
|
|
|
|
{
|
|
|
|
/* upon seeing CC_SIN in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_MCWD in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CM in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_SJC in this state: */
|
|
|
|
new TransitionInstruction(STATE_STACKING,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_CON in this state: */
|
|
|
|
new TransitionInstruction(STATE_STACKING,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_V in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F8A in this state: */
|
|
|
|
new TransitionInstruction(STATE_PARTIALMARK,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F82 in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F39 in this state: */
|
|
|
|
new TransitionInstruction(STATE_STACKING,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
2003-04-13 01:46:20 +00:00
|
|
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
2003-04-12 20:56:20 +00:00
|
|
|
new TransitionInstruction(STATE_STACKPLUSACHUNG,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_DIGIT in this state: */
|
|
|
|
new TransitionInstruction(STATE_DIGIT,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
|
|
|
},
|
|
|
|
// STATE_STACKPLUSACHUNG:
|
|
|
|
{
|
|
|
|
/* upon seeing CC_SIN in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_MCWD in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CM in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_SJC in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CON in this state: */
|
|
|
|
new TransitionInstruction(STATE_STACKING,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_V in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F8A in this state: */
|
|
|
|
new TransitionInstruction(STATE_PARTIALMARK,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F82 in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F39 in this state: */
|
|
|
|
null,
|
2003-04-13 01:46:20 +00:00
|
|
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
2003-04-12 20:56:20 +00:00
|
|
|
null,
|
|
|
|
/* upon seeing CC_DIGIT in this state: */
|
|
|
|
new TransitionInstruction(STATE_DIGIT,
|
|
|
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
|
|
|
},
|
|
|
|
// STATE_PARTIALMARK:
|
|
|
|
{
|
|
|
|
/* upon seeing CC_SIN in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_MCWD in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CM in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_SJC in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_CON in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_V in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_0F8A in this state: */
|
|
|
|
null,
|
|
|
|
/* upon seeing CC_0F82 in this state: */
|
|
|
|
new TransitionInstruction(STATE_READY,
|
|
|
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
|
|
|
/* upon seeing CC_0F39 in this state: */
|
|
|
|
null,
|
2003-04-13 01:46:20 +00:00
|
|
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
2003-04-12 20:56:20 +00:00
|
|
|
null,
|
|
|
|
/* upon seeing CC_DIGIT in this state: */
|
|
|
|
null
|
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2003-04-13 01:46:20 +00:00
|
|
|
/* DLC NOW FIXME -- clearly, we need LegalTshegBar to be convertable to and from UnicodeGraphemeClusters; */
|
2003-04-12 20:56:20 +00:00
|
|
|
|
2003-04-13 01:46:20 +00:00
|
|
|
/** Breaks a sequence of UnicodeGraphemeClusters into LegalTshegBars.
|
|
|
|
@param grcls a sequence of nonnull UnicodeGraphemeClusters
|
|
|
|
@return a sequence of nonnull LegalTshegBars
|
2003-04-12 20:56:20 +00:00
|
|
|
@exception TibetanSyntaxException if grcls does not consist
|
|
|
|
entirely of legal Tibetan syllables
|
2003-04-13 01:46:20 +00:00
|
|
|
@see UnicodeGraphemeCluster
|
|
|
|
@see LegalTshegBar
|
2003-04-12 20:56:20 +00:00
|
|
|
*/
|
2003-04-13 01:46:20 +00:00
|
|
|
private static Vector breakGraphemeClustersIntoOnlyTshegBars(Vector grcls)
|
2003-04-12 20:56:20 +00:00
|
|
|
throws TibetanSyntaxException
|
|
|
|
{
|
2003-04-13 01:46:20 +00:00
|
|
|
return breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(grcls,
|
2003-04-12 20:56:20 +00:00
|
|
|
true);
|
|
|
|
}
|
|
|
|
|
2003-04-13 01:46:20 +00:00
|
|
|
private static Vector breakLegalGraphemeClustersIntoOnlyTshegBars(Vector grcls) {
|
2003-04-12 20:56:20 +00:00
|
|
|
try {
|
2003-04-13 01:46:20 +00:00
|
|
|
return breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(grcls,
|
2003-04-12 20:56:20 +00:00
|
|
|
false);
|
2003-04-13 01:46:20 +00:00
|
|
|
} catch (TibetanSyntaxException ex) {
|
2003-04-12 20:56:20 +00:00
|
|
|
throw new Error("This can never happen, because the second parameter, validating, was false.");
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/**
|
2003-04-13 01:46:20 +00:00
|
|
|
@param grcls a Vector consisting entirely of UnicodeGraphemeClusters
|
2003-04-12 20:56:20 +00:00
|
|
|
@param validate true iff you wish to have a
|
|
|
|
TibetanSyntaxException thrown upon encountering a sequence of
|
2003-04-13 01:46:20 +00:00
|
|
|
UnicodeGraphemeClusters that is syntactically incorrect Tibetan
|
2003-04-12 20:56:20 +00:00
|
|
|
@return if validate is true, a Vector consisting entirely of
|
2003-04-13 01:46:20 +00:00
|
|
|
LegalTshegBars, else a vector of LegalTshegBars and
|
|
|
|
UnicodeGraphemeClusters */
|
|
|
|
private static Vector breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(Vector grcls,
|
2003-04-12 20:56:20 +00:00
|
|
|
boolean validate)
|
|
|
|
throws TibetanSyntaxException
|
|
|
|
{
|
|
|
|
Vector syllables = new Vector();
|
|
|
|
int grcls_len = grcls.length();
|
|
|
|
int beginning_of_cluster = 0;
|
|
|
|
for (int i = 0; i < grcls_len; i++) {
|
2003-04-13 01:46:20 +00:00
|
|
|
UnicodeGraphemeCluster current_grcl
|
|
|
|
= (UnicodeGraphemeCluster)grcls.elementAt(i);
|
2003-04-12 20:56:20 +00:00
|
|
|
if (current_grcl.isTshegLike()) {
|
|
|
|
if (beginning_of_cluster < i) {
|
|
|
|
// One or more non-tsheg-like grapheme clusters is
|
|
|
|
// here between tsheg-like grapheme clusters. Is
|
|
|
|
// it a legal syllable?
|
|
|
|
if (LegalTshegBar.formsLegalTshegBar(grcls,
|
|
|
|
beginning_of_cluster,
|
|
|
|
i))
|
|
|
|
{
|
2003-04-13 01:46:20 +00:00
|
|
|
syllables.add(new LegalTshegBar(grcls,
|
2003-04-12 20:56:20 +00:00
|
|
|
beginning_of_cluster,
|
|
|
|
i, tsheg=current_grcl));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (validating) {
|
|
|
|
TibetanSyntaxException ex
|
|
|
|
= new TibetanSyntaxException(grcls,
|
|
|
|
beginning_of_cluster,
|
|
|
|
i);
|
|
|
|
// DLC: return an int -1 for "all good" or
|
|
|
|
// 3 for "the fourth element is the first
|
|
|
|
// bad one" but then you don't know that
|
|
|
|
// 3-6 were the bad ones
|
|
|
|
throw ex;
|
|
|
|
} else {
|
|
|
|
for (int j = beginning_of_cluster; j <= i; j++) {
|
|
|
|
syllables.add(grcls.elementAt(j));
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
beginning_of_cluster = i + 1;
|
|
|
|
} // else add current_grcl to the waiting list, in a sense
|
|
|
|
}
|
|
|
|
return syllables;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Breaks a string of perfectly-formed Unicode into
|
2003-04-13 01:46:20 +00:00
|
|
|
UnicodeGraphemeClusters.
|
2003-04-12 20:56:20 +00:00
|
|
|
@param nfthdl_unicode a String of NFTHDL-normalized Unicode
|
|
|
|
codepoints
|
|
|
|
@exception Exception if the input is not perfectly formed
|
2003-04-13 01:46:20 +00:00
|
|
|
@return a vector of UnicodeGraphemeClusters
|
|
|
|
@see UnicodeGraphemeCluster
|
2003-04-12 20:56:20 +00:00
|
|
|
*/
|
|
|
|
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
|
|
|
|
throws Exception
|
|
|
|
{
|
2003-04-13 01:46:20 +00:00
|
|
|
// a vector of UnicodeGraphemeClusters that we build up little by
|
2003-04-12 20:56:20 +00:00
|
|
|
// little:
|
|
|
|
Vector grcls = new Vector();
|
|
|
|
int currentState = STATE_START;
|
|
|
|
StringBuffer holdingPen = new StringBuffer();
|
|
|
|
|
|
|
|
int ilen = nfthdl_unicode.length();
|
|
|
|
for (int i = 0; i < ilen; i++) {
|
|
|
|
char current_cp = nfthdl_unicode.charAt(i);
|
|
|
|
int cc_of_current_cp = getCCForCP(current_cp);
|
|
|
|
final TransitionInstruction ti
|
|
|
|
= transitionTable[currentState][cc_of_current_cp];
|
|
|
|
if (null == ti) {
|
|
|
|
throw new Exception("Bad Unicode. DLC improve these messages");
|
|
|
|
} else {
|
|
|
|
switch (ti.getAction()) {
|
|
|
|
case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
|
2003-04-13 01:46:20 +00:00
|
|
|
grcls.add(new UnicodeGraphemeCluster(holdingPen));
|
2003-04-12 20:56:20 +00:00
|
|
|
holdingPen = new StringBuffer();
|
|
|
|
break;
|
|
|
|
case ACTION_CONTINUES_GRAPHEME_CLUSTER:
|
|
|
|
holdingString.append(current_cp);
|
|
|
|
break;
|
|
|
|
case ACTION_PREPEND_WITH_0F68:
|
|
|
|
throw new Error("This never happens inside the validating scanner.");
|
|
|
|
default:
|
|
|
|
throw new Error("Famous last words: This won't happen.");
|
|
|
|
}
|
|
|
|
currentState = ti.getNextState();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return grcls;
|
|
|
|
}
|
|
|
|
}
|