Jskad/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java
dchandler 7198f23361 I really hesitate to commit this because I'm not sure what it brings to the
table exactly and I fear that it makes the ACIP->Tibetan converter code
a lot uglier.  The TODO(DLC)[EWTS->Tibetan] comments littered throughout
are part of the ugliness; they point to the ugliness.  If each were addressed,
cleanliness could perhaps be achieved.

I've largely forgotten exactly what this change does, but it attempts to
improve EWTS->Tibetan conversion.  The lexer is probably really, really
primitive.  I concentrate here on converting a single tsheg bar rather than
a whole document.

Eclipse was used during part of my journey here and some imports were
reorganized merely because I could.  :)

(Eclipse was needed when the usual ant build failed to run a new test
EWTSTest.  And I wanted its debugger.)

Next steps: end-to-end EWTS tests should bring many problems to light.  Fix
those.  Triage all the TODO comments.

I don't know that I'll ever really trust the implementation.  The tests are
valuable, though.  A clean implementation of EWTS->Tibetan in Jython
might hold enough interest for me; I'd like to learn Python.
2005-06-20 06:18:00 +00:00

348 lines
16 KiB
Java

/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
import java.util.Vector;
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
/** Don't instantiate this class. */
private ValidatingUnicodeReader() { super(); }
/** This table tells how to transition from state to state upon
* encountering certain classes of Unicode codepoints. There are
* 6 legal states + an error state. */
private static final TransitionInstruction
transitionTable[/* 6 is the number of STATEs */][/* 11 is the number of CC classes */]
= {
// STATE_START:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
null,
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
null,
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_READY:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY, // self
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
null, // because 0F71 comes after SJCs, before Vs, and
// before CMs.
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_DIGIT:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER) /* DLC although consider the meaning of 0F22,0F22,0F3F */
},
// STATE_STACKING:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
new TransitionInstruction(STATE_STACKPLUSACHUNG,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_STACKPLUSACHUNG:
{
/* upon seeing CC_SIN in this state: */
new TransitionInstruction(STATE_READY,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
new TransitionInstruction(STATE_STACKING,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_V in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F8A in this state: */
new TransitionInstruction(STATE_PARTIALMARK,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
new TransitionInstruction(STATE_DIGIT,
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER)
},
// STATE_PARTIALMARK:
{
/* upon seeing CC_SIN in this state: */
null,
/* upon seeing CC_MCWD in this state: */
null,
/* upon seeing CC_CM in this state: */
null,
/* upon seeing CC_SJC in this state: */
null,
/* upon seeing CC_CON in this state: */
null,
/* upon seeing CC_V in this state: */
null,
/* upon seeing CC_0F8A in this state: */
null,
/* upon seeing CC_0F82 in this state: */
new TransitionInstruction(STATE_READY,
ACTION_CONTINUES_GRAPHEME_CLUSTER),
/* upon seeing CC_0F39 in this state: */
null,
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
null,
/* upon seeing CC_DIGIT in this state: */
null
}
};
/* DLC NOW FIXME -- clearly, we need LegalTshegBar to be convertible to and from UnicodeGraphemeClusters. */
/** Splits a sequence of UnicodeGraphemeClusters into LegalTshegBars
 *  by delegating to
 *  breakGraphemeClustersIntoTshegBarsAndGraphemeClusters with
 *  validation switched on.
 *  @param grcls a sequence of nonnull UnicodeGraphemeClusters
 *  @return a sequence of nonnull LegalTshegBars
 *  @exception TibetanSyntaxException if grcls does not consist
 *  entirely of legal Tibetan syllables
 *  @see UnicodeGraphemeCluster
 *  @see LegalTshegBar
 */
private static Vector breakGraphemeClustersIntoOnlyTshegBars(Vector grcls)
    throws TibetanSyntaxException
{
    // Validation is what guarantees that only tsheg bars come back.
    final boolean validate = true;
    final Vector tshegBars
        = breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(grcls, validate);
    return tshegBars;
}
/** Splits a sequence of UnicodeGraphemeClusters into tsheg bars
 *  without validating, by delegating to
 *  breakGraphemeClustersIntoTshegBarsAndGraphemeClusters with
 *  validation switched off.  Unlike the validating variant, this
 *  method declares no checked exception.
 *  NOTE(review): the name suggests that callers pass only clusters
 *  already known to form legal tsheg bars; with validation off, any
 *  run that is not a legal tsheg bar comes back as raw
 *  UnicodeGraphemeClusters instead -- confirm against callers.
 *  @param grcls a Vector consisting entirely of
 *  UnicodeGraphemeClusters
 *  @return a Vector of LegalTshegBars (and of
 *  UnicodeGraphemeClusters for any run that is not a legal tsheg
 *  bar)
 *  @see UnicodeGraphemeCluster
 *  @see LegalTshegBar
 */
private static Vector breakLegalGraphemeClustersIntoOnlyTshegBars(Vector grcls) {
    try {
        return breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(grcls,
                                                                     false);
    } catch (TibetanSyntaxException ex) {
        // Keep the original exception as the cause so that, if the
        // "impossible" ever does happen, the stack trace shows why.
        throw new Error("This can never happen, because the second parameter, validating, was false.",
                        ex);
    }
}
/** Walks grcls from left to right, treating each tsheg-like
 *  grapheme cluster as the terminator of the run of non-tsheg-like
 *  clusters that precedes it.  Each non-empty run that forms a legal
 *  tsheg bar is turned into a LegalTshegBar (which is handed the
 *  terminating cluster too).  A non-empty run that does not form a
 *  legal tsheg bar either causes a TibetanSyntaxException (if
 *  validate is true) or is copied to the result verbatim, together
 *  with its terminating cluster (if validate is false).
 *
 *  <p>A tsheg-like cluster with no run in front of it contributes
 *  nothing to the result, and clusters that follow the last
 *  tsheg-like cluster are neither checked nor returned.
 *  NOTE(review): confirm that dropping an unterminated trailing run
 *  is intended.
 *
 *  @param grcls a Vector consisting entirely of UnicodeGraphemeClusters
 *  @param validate true iff you wish to have a
 *  TibetanSyntaxException thrown upon encountering a sequence of
 *  UnicodeGraphemeClusters that is syntactically incorrect Tibetan
 *  @return if validate is true, a Vector consisting entirely of
 *  LegalTshegBars, else a vector of LegalTshegBars and
 *  UnicodeGraphemeClusters
 *  @exception TibetanSyntaxException if validate is true and some
 *  run of clusters does not form a legal tsheg bar */
private static Vector breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(Vector grcls,
                                                                            boolean validate)
    throws TibetanSyntaxException
{
    Vector syllables = new Vector();
    int grcls_len = grcls.size();
    // Index of the first cluster of the run currently being collected.
    int beginning_of_cluster = 0;
    for (int i = 0; i < grcls_len; i++) {
        UnicodeGraphemeCluster current_grcl
            = (UnicodeGraphemeCluster)grcls.elementAt(i);
        if (!current_grcl.isTshegLike()) {
            // Not a terminator: current_grcl simply joins the
            // pending run, which starts at beginning_of_cluster.
            continue;
        }
        if (beginning_of_cluster < i) {
            // One or more non-tsheg-like grapheme clusters is here
            // between tsheg-like grapheme clusters.  Is it a legal
            // syllable?
            if (LegalTshegBar.formsLegalTshegBar(grcls,
                                                 beginning_of_cluster,
                                                 i))
            {
                // The fourth argument is the terminating tsheg-like
                // cluster itself.
                syllables.add(new LegalTshegBar(grcls,
                                                beginning_of_cluster,
                                                i, current_grcl));
            }
            else if (validate)
            {
                // DLC: return an int -1 for "all good" or 3 for "the
                // fourth element is the first bad one" but then you
                // don't know that 3-6 were the bad ones
                throw new TibetanSyntaxException(grcls,
                                                 beginning_of_cluster,
                                                 i);
            }
            else
            {
                // Pass the offending run through untouched, including
                // the tsheg-like cluster at index i that ends it.
                for (int j = beginning_of_cluster; j <= i; j++) {
                    syllables.add(grcls.elementAt(j));
                }
            }
        }
        // The next run, if any, starts right after this terminator.
        beginning_of_cluster = i + 1;
    }
    return syllables;
}
/** Breaks a string of perfectly-formed Unicode into
 *  UnicodeGraphemeClusters by running each char of the input through
 *  transitionTable, starting in STATE_START.  A codepoint whose
 *  transition says ACTION_BEGINS_NEW_GRAPHEME_CLUSTER closes the
 *  cluster collected so far (if any) and becomes the first codepoint
 *  of the next one; a codepoint whose transition says
 *  ACTION_CONTINUES_GRAPHEME_CLUSTER is appended to the cluster in
 *  progress.  The cluster still in progress when the input ends is
 *  emitted as well, so an empty input yields an empty vector and no
 *  empty cluster is ever produced.
 *  @param nfthdl_unicode a String of NFTHDL-normalized Unicode
 *  codepoints
 *  @exception Exception if the input is not perfectly formed, i.e.
 *  if the transition table has no entry for some codepoint in the
 *  state in which it is encountered
 *  @return a vector of UnicodeGraphemeClusters
 *  @see UnicodeGraphemeCluster
 */
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
    throws Exception
{
    // a vector of UnicodeGraphemeClusters that we build up little by
    // little:
    Vector grcls = new Vector();
    int currentState = STATE_START;
    // The codepoints of the grapheme cluster currently being built.
    StringBuffer holdingPen = new StringBuffer();
    int ilen = nfthdl_unicode.length();
    for (int i = 0; i < ilen; i++) {
        char current_cp = nfthdl_unicode.charAt(i);
        int cc_of_current_cp = getCCForCP(current_cp);
        final TransitionInstruction ti
            = transitionTable[currentState][cc_of_current_cp];
        if (null == ti) {
            // OR-ing in 0x10000 and dropping the leading '1' pads the
            // hex form to four digits.
            String hex = Integer.toHexString(0x10000 | current_cp)
                .substring(1).toUpperCase();
            throw new Exception("Bad Unicode: codepoint U+" + hex
                                + " at index " + i
                                + " is not legal at this point in the input.");
        }
        switch (ti.getAction()) {
        case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
            // Flush the finished cluster, but never emit an empty
            // one (the pen is empty for the very first codepoint).
            if (holdingPen.length() > 0) {
                grcls.add(new UnicodeGraphemeCluster(holdingPen));
                holdingPen = new StringBuffer();
            }
            // The current codepoint is the first member of the new
            // cluster.
            holdingPen.append(current_cp);
            break;
        case ACTION_CONTINUES_GRAPHEME_CLUSTER:
            holdingPen.append(current_cp);
            break;
        case ACTION_PREPEND_WITH_0F68:
            throw new Error("This never happens inside the validating scanner.");
        default:
            throw new Error("Famous last words: This won't happen.");
        }
        currentState = ti.getNextState();
    }
    // Emit the cluster that was still being built when the input
    // ran out.
    if (holdingPen.length() > 0) {
        grcls.add(new UnicodeGraphemeCluster(holdingPen));
    }
    return grcls;
}
}