Jskad/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
dchandler daacf6ee3b I've got too many sandboxes, so I'm committing these changes,
half-done, from one sandbox so as to consolidate my sandboxes.
2003-04-12 20:56:20 +00:00

345 lines
16 KiB
Java

/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;

import java.util.Vector;
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
/** Don't instantiate this class.  The constructor is private so that
    code outside this class cannot create an instance. */
private ValidatingUnicodeReader() { super(); }
/** This table tells how to transition from state a 6 states + error state */
private static final TransitionInstruction
transitionTable[6 /* number of STATEs */]
[11 /* number of CC classes */]
= {
// STATE_START:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
null,
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
null,
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_READY:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY, // self
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null, // because 0F71 comes after SJCs, before Vs, and
// before CMs.
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_DIGIT:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
},
// STATE_STACKING:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_ACHUNG in this state: */
new TransitionInstruction(STATE_STACKPLUSACHUNG,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_STACKPLUSACHUNG:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_PARTIALMARK:
{
/* upon seeing CC_SIN in this state: */
null,
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
null,
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
null,
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
null,
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
null
}
};
// DLC NOW -- clearly, we need LegalSyllable to be convertible to and
// from GraphemeClusters.
/** Splits a sequence of GraphemeClusters into LegalSyllables,
    insisting that the whole sequence be legal Tibetan.  This simply
    delegates to the two-argument worker with validation switched on.
    @param grcls a sequence of nonnull GraphemeClusters
    @return a sequence of nonnull LegalSyllables
    @exception TibetanSyntaxException if grcls does not consist
    entirely of legal Tibetan syllables
    @see GraphemeCluster
    @see LegalSyllable
*/
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
    throws TibetanSyntaxException
{
    // With validation on, an illegal run of grapheme clusters surfaces
    // as a TibetanSyntaxException instead of being passed through.
    final boolean validate = true;
    final Vector legalSyllables
        = breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
                                                                validate);
    return legalSyllables;
}
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls) {
try {
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
false);
} catch (TibetanSyntaxException) {
throw new Error("This can never happen, because the second parameter, validating, was false.");
}
}
/** Walks grcls from left to right and, each time a tsheg-like
    grapheme cluster is met, turns the run of grapheme clusters since
    the previous tsheg-like one into a LegalSyllable if
    LegalTshegBar.formsLegalTshegBar accepts that run.

    <p>NOTE(review): a tsheg-like grapheme cluster with nothing before
    it, and any grapheme clusters after the last tsheg-like one, are
    not added to the result at all.  Confirm whether that is
    intended.</p>

    @param grcls a Vector consisting entirely of GraphemeClusters
    @param validate true iff you wish to have a
    TibetanSyntaxException thrown upon encountering a sequence of
    GraphemeClusters that is syntactically incorrect Tibetan
    @return if validate is true, a Vector consisting entirely of
    LegalSyllables, else a vector of LegalSyllables and
    GraphemeClusters
    @exception TibetanSyntaxException if validate is true and a run of
    grapheme clusters does not form a legal tsheg-bar */
private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
                                                                            boolean validate)
    throws TibetanSyntaxException
{
    Vector syllables = new Vector();
    int grcls_len = grcls.size();
    // Index of the first grapheme cluster of the run that is waiting
    // for its terminating tsheg-like grapheme cluster.
    int beginning_of_cluster = 0;
    for (int i = 0; i < grcls_len; i++) {
        GraphemeCluster current_grcl
            = (GraphemeCluster)grcls.elementAt(i);
        if (current_grcl.isTshegLike()) {
            if (beginning_of_cluster < i) {
                // One or more non-tsheg-like grapheme clusters is
                // here between tsheg-like grapheme clusters.  Is
                // it a legal syllable?
                if (LegalTshegBar.formsLegalTshegBar(grcls,
                                                     beginning_of_cluster,
                                                     i))
                {
                    // The terminating tsheg-like cluster is handed
                    // over as the syllable's fourth argument.
                    syllables.add(new LegalSyllable(grcls,
                                                    beginning_of_cluster,
                                                    i, current_grcl));
                }
                else if (validate)
                {
                    // DLC: return an int -1 for "all good" or
                    // 3 for "the fourth element is the first
                    // bad one" but then you don't know that
                    // 3-6 were the bad ones
                    throw new TibetanSyntaxException(grcls,
                                                     beginning_of_cluster,
                                                     i);
                }
                else
                {
                    // Not validating: pass the offending run through
                    // untouched, including the tsheg-like cluster at
                    // index i.
                    for (int j = beginning_of_cluster; j <= i; j++) {
                        syllables.add(grcls.elementAt(j));
                    }
                }
            }
            beginning_of_cluster = i + 1;
        } // else add current_grcl to the waiting list, in a sense
    }
    return syllables;
}
/** Breaks a string of perfectly-formed Unicode into
    GraphemeClusters by running each codepoint through
    transitionTable.  A codepoint whose action is
    ACTION_BEGINS_NEW_GRAPHEME_CLUSTER closes the cluster collected so
    far and becomes the first codepoint of the next one; a codepoint
    whose action is ACTION_CONTINUES_GRAPHEME_CLUSTER is appended to
    the cluster being collected.  The last cluster is emitted once the
    input is exhausted.

    <p>NOTE(review): the state the machine is in when the input ends
    is not checked, so input that stops in the middle of a construct
    (for example in STATE_PARTIALMARK) is not rejected.  Confirm
    whether that should be an error.</p>

    @param nfthdl_unicode a String of NFTHDL-normalized Unicode
    codepoints
    @exception Exception if the input is not perfectly formed
    @return a vector of GraphemeClusters
    @see GraphemeCluster
*/
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
    throws Exception
{
    // a vector of GraphemeClusters that we build up little by
    // little:
    Vector grcls = new Vector();
    int currentState = STATE_START;
    // the codepoints of the grapheme cluster now being collected:
    StringBuffer holdingPen = new StringBuffer();
    int ilen = nfthdl_unicode.length();
    for (int i = 0; i < ilen; i++) {
        char current_cp = nfthdl_unicode.charAt(i);
        int cc_of_current_cp = getCCForCP(current_cp);
        final TransitionInstruction ti
            = transitionTable[currentState][cc_of_current_cp];
        if (null == ti) {
            throw new Exception("Bad Unicode: codepoint 0x"
                                + Integer.toHexString(current_cp)
                                + " at index " + i
                                + " is not legal in state "
                                + currentState + ".");
        }
        switch (ti.getAction()) {
        case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
            // Close the previous cluster, if there is one; the pen
            // is still empty when the very first codepoint arrives.
            if (holdingPen.length() > 0) {
                grcls.add(new GraphemeCluster(holdingPen));
            }
            // The current codepoint is the first member of the new
            // cluster, so it must go into the fresh pen.
            holdingPen = new StringBuffer();
            holdingPen.append(current_cp);
            break;
        case ACTION_CONTINUES_GRAPHEME_CLUSTER:
            holdingPen.append(current_cp);
            break;
        case ACTION_PREPEND_WITH_0F68:
            throw new Error("This never happens inside the validating scanner.");
        default:
            throw new Error("Famous last words: This won't happen.");
        }
        currentState = ti.getNextState();
    }
    // Emit the cluster that was still being collected when the input
    // ran out.
    if (holdingPen.length() > 0) {
        grcls.add(new GraphemeCluster(holdingPen));
    }
    return grcls;
}
}