I've got too many sandboxes, so I'm committing these changes,
half-done, from one sandbox so as to consolidate my sandboxes.
This commit is contained in:
parent
6e05b60cff
commit
daacf6ee3b
7 changed files with 1252 additions and 7 deletions
345
source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
Normal file
345
source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
Normal file
|
@ -0,0 +1,345 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.tshegbar;
|
||||
|
||||
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||
/** Don't instantiate this class. */
|
||||
private Foo() { super(); }
|
||||
|
||||
/** This table tells how to transition from state a 6 states + error state */
|
||||
private static final TransitionInstruction
|
||||
transitionTable[6 /* number of STATEs */]
|
||||
[11 /* number of CC classes */]
|
||||
= {
|
||||
// STATE_START:
|
||||
{
|
||||
/* upon seeing CC_SIN in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_MCWD in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CM in this state: */
|
||||
null,
|
||||
/* upon seeing CC_SJC in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CON in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_V in this state: */
|
||||
null,
|
||||
/* upon seeing CC_0F8A in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F82 in this state: */
|
||||
null,
|
||||
/* upon seeing CC_0F39 in this state: */
|
||||
null,
|
||||
/* upon seeing CC_ACHUNG in this state: */
|
||||
null,
|
||||
/* upon seeing CC_DIGIT in this state: */
|
||||
new TransitionInstruction(STATE_DIGIT,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||
},
|
||||
|
||||
// STATE_READY:
|
||||
{
|
||||
/* upon seeing CC_SIN in this state: */
|
||||
new TransitionInstruction(STATE_READY, // self
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_MCWD in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CM in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_SJC in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CON in this state: */
|
||||
new TransitionInstruction(STATE_STACKING,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_V in this state: */
|
||||
null
|
||||
/* upon seeing CC_0F8A in this state: */
|
||||
new TransitionInstruction(STATE_PARTIALMARK,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F82 in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F39 in this state: */
|
||||
null,
|
||||
/* upon seeing CC_ACHUNG in this state: */
|
||||
null, // because 0F71 comes after SJCs, before Vs, and
|
||||
// before CMs.
|
||||
/* upon seeing CC_DIGIT in this state: */
|
||||
new TransitionInstruction(STATE_DIGIT,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||
},
|
||||
// STATE_DIGIT:
|
||||
{
|
||||
/* upon seeing CC_SIN in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_MCWD in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_CM in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_SJC in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CON in this state: */
|
||||
new TransitionInstruction(STATE_STACKING,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_V in this state: */
|
||||
null,
|
||||
/* upon seeing CC_0F8A in this state: */
|
||||
new TransitionInstruction(STATE_PARTIALMARK,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F82 in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F39 in this state: */
|
||||
null,
|
||||
/* upon seeing CC_ACHUNG in this state: */
|
||||
null,
|
||||
/* upon seeing CC_DIGIT in this state: */
|
||||
new TransitionInstruction(STATE_DIGIT,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
|
||||
},
|
||||
// STATE_STACKING:
|
||||
{
|
||||
/* upon seeing CC_SIN in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_MCWD in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CM in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_SJC in this state: */
|
||||
new TransitionInstruction(STATE_STACKING,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_CON in this state: */
|
||||
new TransitionInstruction(STATE_STACKING,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_V in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F8A in this state: */
|
||||
new TransitionInstruction(STATE_PARTIALMARK,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F82 in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F39 in this state: */
|
||||
new TransitionInstruction(STATE_STACKING,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_ACHUNG in this state: */
|
||||
new TransitionInstruction(STATE_STACKPLUSACHUNG,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_DIGIT in this state: */
|
||||
new TransitionInstruction(STATE_DIGIT,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||
},
|
||||
// STATE_STACKPLUSACHUNG:
|
||||
{
|
||||
/* upon seeing CC_SIN in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_MCWD in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CM in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_SJC in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CON in this state: */
|
||||
new TransitionInstruction(STATE_STACKING,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_V in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F8A in this state: */
|
||||
new TransitionInstruction(STATE_PARTIALMARK,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F82 in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F39 in this state: */
|
||||
null,
|
||||
/* upon seeing CC_ACHUNG in this state: */
|
||||
null,
|
||||
/* upon seeing CC_DIGIT in this state: */
|
||||
new TransitionInstruction(STATE_DIGIT,
|
||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
|
||||
},
|
||||
// STATE_PARTIALMARK:
|
||||
{
|
||||
/* upon seeing CC_SIN in this state: */
|
||||
null,
|
||||
/* upon seeing CC_MCWD in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CM in this state: */
|
||||
null,
|
||||
/* upon seeing CC_SJC in this state: */
|
||||
null,
|
||||
/* upon seeing CC_CON in this state: */
|
||||
null,
|
||||
/* upon seeing CC_V in this state: */
|
||||
null,
|
||||
/* upon seeing CC_0F8A in this state: */
|
||||
null,
|
||||
/* upon seeing CC_0F82 in this state: */
|
||||
new TransitionInstruction(STATE_READY,
|
||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||
/* upon seeing CC_0F39 in this state: */
|
||||
null,
|
||||
/* upon seeing CC_ACHUNG in this state: */
|
||||
null,
|
||||
/* upon seeing CC_DIGIT in this state: */
|
||||
null
|
||||
}
|
||||
};
|
||||
|
||||
DLC NOW -- clearly, we need LegalSyllable to be convertable to and from GraphemeClusters;
|
||||
|
||||
/** Breaks a sequence of GraphemeClusters into LegalSyllables.
|
||||
@param grcls a sequence of nonnull GraphemeClusters
|
||||
@return a sequence of nonnull LegalSyllables
|
||||
@exception TibetanSyntaxException if grcls does not consist
|
||||
entirely of legal Tibetan syllables
|
||||
@see #GraphemeCluster
|
||||
@see #LegalSyllable
|
||||
*/
|
||||
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
|
||||
throws TibetanSyntaxException
|
||||
{
|
||||
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
|
||||
true);
|
||||
}
|
||||
|
||||
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls) {
|
||||
try {
|
||||
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
|
||||
false);
|
||||
} catch (TibetanSyntaxException) {
|
||||
throw new Error("This can never happen, because the second parameter, validating, was false.");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
@param grcls a Vector consisting entirely of GraphemeClusters
|
||||
@param validate true iff you wish to have a
|
||||
TibetanSyntaxException thrown upon encountering a sequence of
|
||||
GraphemeClusters that is syntactically incorrect Tibetan
|
||||
@return if validate is true, a Vector consisting entirely of
|
||||
LegalSyllables, else a vector of LegalSyllables and
|
||||
GraphemeClusters */
|
||||
private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
|
||||
boolean validate)
|
||||
throws TibetanSyntaxException
|
||||
{
|
||||
Vector syllables = new Vector();
|
||||
int grcls_len = grcls.length();
|
||||
int beginning_of_cluster = 0;
|
||||
for (int i = 0; i < grcls_len; i++) {
|
||||
GraphemeCluster current_grcl
|
||||
= (GraphemeCluster)grcls.elementAt(i);
|
||||
if (current_grcl.isTshegLike()) {
|
||||
if (beginning_of_cluster < i) {
|
||||
// One or more non-tsheg-like grapheme clusters is
|
||||
// here between tsheg-like grapheme clusters. Is
|
||||
// it a legal syllable?
|
||||
if (LegalTshegBar.formsLegalTshegBar(grcls,
|
||||
beginning_of_cluster,
|
||||
i))
|
||||
{
|
||||
syllables.add(new LegalSyllable(grcls,
|
||||
beginning_of_cluster,
|
||||
i, tsheg=current_grcl));
|
||||
}
|
||||
else
|
||||
{
|
||||
if (validating) {
|
||||
TibetanSyntaxException ex
|
||||
= new TibetanSyntaxException(grcls,
|
||||
beginning_of_cluster,
|
||||
i);
|
||||
// DLC: return an int -1 for "all good" or
|
||||
// 3 for "the fourth element is the first
|
||||
// bad one" but then you don't know that
|
||||
// 3-6 were the bad ones
|
||||
throw ex;
|
||||
} else {
|
||||
for (int j = beginning_of_cluster; j <= i; j++) {
|
||||
syllables.add(grcls.elementAt(j));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
beginning_of_cluster = i + 1;
|
||||
} // else add current_grcl to the waiting list, in a sense
|
||||
}
|
||||
return syllables;
|
||||
}
|
||||
|
||||
/** Breaks a string of perfectly-formed Unicode into
|
||||
GraphemeClusters.
|
||||
@param nfthdl_unicode a String of NFTHDL-normalized Unicode
|
||||
codepoints
|
||||
@exception Exception if the input is not perfectly formed
|
||||
@return a vector of GraphemeClusters
|
||||
@see #GraphemeCluster
|
||||
*/
|
||||
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
|
||||
throws Exception
|
||||
{
|
||||
// a vector of GraphemeClusters that we build up little by
|
||||
// little:
|
||||
Vector grcls = new Vector();
|
||||
int currentState = STATE_START;
|
||||
StringBuffer holdingPen = new StringBuffer();
|
||||
|
||||
int ilen = nfthdl_unicode.length();
|
||||
for (int i = 0; i < ilen; i++) {
|
||||
char current_cp = nfthdl_unicode.charAt(i);
|
||||
int cc_of_current_cp = getCCForCP(current_cp);
|
||||
final TransitionInstruction ti
|
||||
= transitionTable[currentState][cc_of_current_cp];
|
||||
if (null == ti) {
|
||||
throw new Exception("Bad Unicode. DLC improve these messages");
|
||||
} else {
|
||||
switch (ti.getAction()) {
|
||||
case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
|
||||
grcls.add(new GraphemeCluster(holdingPen));
|
||||
holdingPen = new StringBuffer();
|
||||
break;
|
||||
case ACTION_CONTINUES_GRAPHEME_CLUSTER:
|
||||
holdingString.append(current_cp);
|
||||
break;
|
||||
case ACTION_PREPEND_WITH_0F68:
|
||||
throw new Error("This never happens inside the validating scanner.");
|
||||
default:
|
||||
throw new Error("Famous last words: This won't happen.");
|
||||
}
|
||||
currentState = ti.getNextState();
|
||||
}
|
||||
}
|
||||
return grcls;
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue