ant private-javadocs runs without warnings; cleaned up some
as-yet-unused code.
This commit is contained in:
parent
644c0d3801
commit
6636d03a41
9 changed files with 158 additions and 145 deletions
|
@ -19,7 +19,7 @@ Contributor(s): ______________________________________.
|
||||||
package org.thdl.tib.text;
|
package org.thdl.tib.text;
|
||||||
|
|
||||||
/** This is where basic, static knowledge of THDL's Extended Wylie is housed.
|
/** This is where basic, static knowledge of THDL's Extended Wylie is housed.
|
||||||
* @see org.thdl.tib.text#TibetanMachineWeb */
|
* @see TibetanMachineWeb */
|
||||||
public interface THDLWylieConstants {
|
public interface THDLWylieConstants {
|
||||||
/**
|
/**
|
||||||
* the Wylie for bindu/anusvara
|
* the Wylie for bindu/anusvara
|
||||||
|
|
|
@ -146,7 +146,7 @@ public final class LegalTshegBar
|
||||||
* must not be absent. To learn about the arguments, and to be
|
* must not be absent. To learn about the arguments, and to be
|
||||||
* sure that your input won't cause an exception to be thrown,
|
* sure that your input won't cause an exception to be thrown,
|
||||||
* see {@link
|
* see {@link
|
||||||
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}.
|
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)}.
|
||||||
*
|
*
|
||||||
* @exception IllegalArgumentException if the rootLetter is not
|
* @exception IllegalArgumentException if the rootLetter is not
|
||||||
* one of the thirty consonants (and represented nominally, at
|
* one of the thirty consonants (and represented nominally, at
|
||||||
|
@ -712,7 +712,7 @@ public final class LegalTshegBar
|
||||||
* @exception IllegalArgumentException if the syllable does not
|
* @exception IllegalArgumentException if the syllable does not
|
||||||
* follow the rules of a Tibetan syllable. To learn about the
|
* follow the rules of a Tibetan syllable. To learn about the
|
||||||
* arguments, see {@link
|
* arguments, see {@link
|
||||||
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}. */
|
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)}. */
|
||||||
private static void throwIfNotLegalTshegBar(char prefix,
|
private static void throwIfNotLegalTshegBar(char prefix,
|
||||||
char headLetter,
|
char headLetter,
|
||||||
char rootLetter,
|
char rootLetter,
|
||||||
|
@ -745,7 +745,7 @@ public final class LegalTshegBar
|
||||||
/** If you get through this gauntlet without having an exception
|
/** If you get through this gauntlet without having an exception
|
||||||
* thrown, then this combination makes a legal Tibetan syllable.
|
* thrown, then this combination makes a legal Tibetan syllable.
|
||||||
* To learn about the arguments, see {@link
|
* To learn about the arguments, see {@link
|
||||||
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}.
|
* #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char,StringBuffer)}.
|
||||||
* @param errorBuf if non-null, the reason this is illegal will
|
* @param errorBuf if non-null, the reason this is illegal will
|
||||||
* be written here, if this is illegal
|
* be written here, if this is illegal
|
||||||
* @return true if this syllable is legal, false if this syllable
|
* @return true if this syllable is legal, false if this syllable
|
||||||
|
@ -1257,13 +1257,13 @@ public final class LegalTshegBar
|
||||||
* combination of superscribed, root, and subscribed letters)
|
* combination of superscribed, root, and subscribed letters)
|
||||||
* takes an EWC_ga prefix.
|
* takes an EWC_ga prefix.
|
||||||
* @param head the {@link
|
* @param head the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the superscribed letter, or EW_ABSENT if
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
* not present
|
* not present
|
||||||
* @param root the {@link
|
* @param root the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the root letter
|
* representation} of the root letter
|
||||||
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
|
||||||
* nominal representation} of the subjoined letter, or EW_ABSENT
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
* if not present */
|
* if not present */
|
||||||
static boolean takesGao(char head, char root, char sub) {
|
static boolean takesGao(char head, char root, char sub) {
|
||||||
|
@ -1289,13 +1289,13 @@ public final class LegalTshegBar
|
||||||
* combination of superscribed, root, and subscribed letters)
|
* combination of superscribed, root, and subscribed letters)
|
||||||
* takes an EWC_da prefix.
|
* takes an EWC_da prefix.
|
||||||
* @param head the {@link
|
* @param head the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the superscribed letter, or EW_ABSENT if
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
* not present
|
* not present
|
||||||
* @param root the {@link
|
* @param root the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the root letter
|
* representation} of the root letter
|
||||||
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
|
||||||
* nominal representation} of the subjoined letter, or EW_ABSENT
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
* if not present */
|
* if not present */
|
||||||
static boolean takesDao(char head, char root, char sub) {
|
static boolean takesDao(char head, char root, char sub) {
|
||||||
|
@ -1327,13 +1327,13 @@ public final class LegalTshegBar
|
||||||
* combination of superscribed, root, and subscribed letters)
|
* combination of superscribed, root, and subscribed letters)
|
||||||
* takes an EWC_achung prefix.
|
* takes an EWC_achung prefix.
|
||||||
* @param head the {@link
|
* @param head the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the superscribed letter, or EW_ABSENT if
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
* not present
|
* not present
|
||||||
* @param root the {@link
|
* @param root the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the root letter
|
* representation} of the root letter
|
||||||
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
|
||||||
* nominal representation} of the subjoined letter, or EW_ABSENT
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
* if not present */
|
* if not present */
|
||||||
static boolean takesAchungPrefix(char head, char root, char sub) {
|
static boolean takesAchungPrefix(char head, char root, char sub) {
|
||||||
|
@ -1370,13 +1370,13 @@ public final class LegalTshegBar
|
||||||
* combination of superscribed, root, and subscribed letters)
|
* combination of superscribed, root, and subscribed letters)
|
||||||
* takes an EWC_ma prefix.
|
* takes an EWC_ma prefix.
|
||||||
* @param head the {@link
|
* @param head the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the superscribed letter, or EW_ABSENT if
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
* not present
|
* not present
|
||||||
* @param root the {@link
|
* @param root the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the root letter
|
* representation} of the root letter
|
||||||
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
|
||||||
* nominal representation} of the subjoined letter, or EW_ABSENT
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
* if not present */
|
* if not present */
|
||||||
static boolean takesMao(char head, char root, char sub) {
|
static boolean takesMao(char head, char root, char sub) {
|
||||||
|
@ -1409,13 +1409,13 @@ public final class LegalTshegBar
|
||||||
* combination of superscribed, root, and subscribed letters)
|
* combination of superscribed, root, and subscribed letters)
|
||||||
* takes an EWC_ba prefix.
|
* takes an EWC_ba prefix.
|
||||||
* @param head the {@link
|
* @param head the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the superscribed letter, or EW_ABSENT if
|
* representation} of the superscribed letter, or EW_ABSENT if
|
||||||
* not present
|
* not present
|
||||||
* @param root the {@link
|
* @param root the {@link
|
||||||
* isNominalRepresentationOfConsonant(char) nominal
|
* #isNominalRepresentationOfConsonant(char) nominal
|
||||||
* representation} of the root letter
|
* representation} of the root letter
|
||||||
* @param sub the {@link isNominalRepresentationOfConsonant(char)
|
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
|
||||||
* nominal representation} of the subjoined letter, or EW_ABSENT
|
* nominal representation} of the subjoined letter, or EW_ABSENT
|
||||||
* if not present */
|
* if not present */
|
||||||
static boolean takesBao(char head, char root, char sub) {
|
static boolean takesBao(char head, char root, char sub) {
|
||||||
|
|
|
@ -18,6 +18,8 @@ Contributor(s): ______________________________________.
|
||||||
|
|
||||||
package org.thdl.tib.text.tshegbar;
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
/** DLC FIXMEDOC: says "this isn't legal Tibetan", not "this isn't a valid sequence of Unicode" */
|
/** DLC FIXMEDOC: says "this isn't legal Tibetan", not "this isn't a valid sequence of Unicode" */
|
||||||
class TibetanSyntaxException extends Exception {
|
class TibetanSyntaxException extends Exception {
|
||||||
/** This constructor creates an exception with a less than helpful
|
/** This constructor creates an exception with a less than helpful
|
||||||
|
@ -46,6 +48,6 @@ class TibetanSyntaxException extends Exception {
|
||||||
GraphemeCluster in the syntactically incorrect stretch of
|
GraphemeCluster in the syntactically incorrect stretch of
|
||||||
Tibetan. */
|
Tibetan. */
|
||||||
TibetanSyntaxException(Vector grcls, int start, int end) {
|
TibetanSyntaxException(Vector grcls, int start, int end) {
|
||||||
DLC NOW;
|
throw new Error("DLC NOW");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,8 @@ Contributor(s): ______________________________________.
|
||||||
|
|
||||||
package org.thdl.tib.text.tshegbar;
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
import org.thdl.util.ThdlDebug;
|
||||||
|
|
||||||
|
|
||||||
/** DLC FIXMEDOC */
|
/** DLC FIXMEDOC */
|
||||||
class TransitionInstruction implements UnicodeReadingStateMachineConstants {
|
class TransitionInstruction implements UnicodeReadingStateMachineConstants {
|
||||||
|
@ -25,19 +27,19 @@ class TransitionInstruction implements UnicodeReadingStateMachineConstants {
|
||||||
TransitionInstruction(int nextState, int action) {
|
TransitionInstruction(int nextState, int action) {
|
||||||
super();
|
super();
|
||||||
|
|
||||||
assert(action == ACTION_CONTINUES_GRAPHEME_CLUSTER
|
ThdlDebug.verify(action == ACTION_CONTINUES_GRAPHEME_CLUSTER
|
||||||
|| action == ACTION_BEGINS_NEW_GRAPHEME_CLUSTER
|
|| action == ACTION_BEGINS_NEW_GRAPHEME_CLUSTER
|
||||||
|| action == ACTION_PREPEND_WITH_0F68);
|
|| action == ACTION_PREPEND_WITH_0F68); // DLC FIXME: assert this.
|
||||||
|
|
||||||
assert(nextState == STATE_START
|
ThdlDebug.verify(nextState == STATE_START
|
||||||
|| nextState == STATE_READY
|
|| nextState == STATE_READY
|
||||||
|| nextState == STATE_DIGIT
|
|| nextState == STATE_DIGIT
|
||||||
|| nextState == STATE_STACKING
|
|| nextState == STATE_STACKING
|
||||||
|| nextState == STATE_STACKPLUSACHUNG
|
|| nextState == STATE_STACKPLUSACHUNG
|
||||||
|| nextState == STATE_PARTIALMARK);
|
|| nextState == STATE_PARTIALMARK); // DLC FIXME: assert this.
|
||||||
|
|
||||||
// we start in the start state, but we can never return to it.
|
// we start in the start state, but we can never return to it.
|
||||||
assert(nextState != STATE_START);
|
ThdlDebug.verify(nextState != STATE_START); // DLC FIXME: assert this.
|
||||||
|
|
||||||
this.nextState = nextState;
|
this.nextState = nextState;
|
||||||
this.action = action;
|
this.action = action;
|
||||||
|
@ -55,4 +57,60 @@ class TransitionInstruction implements UnicodeReadingStateMachineConstants {
|
||||||
|
|
||||||
int getAction() { return action; }
|
int getAction() { return action; }
|
||||||
int getNextState() { return nextState; }
|
int getNextState() { return nextState; }
|
||||||
|
|
||||||
|
|
||||||
|
/** Returns the codepoint class for cp, e.g. {@link
|
||||||
|
* UnicodeReadingStateMachineConstants#CC_SJC}.
|
||||||
|
* @param cp a Unicode codepoint, which MUST be nondecomposable
|
||||||
|
* if it is in the Tibetan range but can be from outside the
|
||||||
|
* Tibetan range of Unicode */
|
||||||
|
static int getCCForCP(char cp) {
|
||||||
|
ThdlDebug.verify(getNFTHDL(cp) == null); // DLC FIXME: assert this
|
||||||
|
if ('\u0F82' == cp) {
|
||||||
|
return CC_0F82;
|
||||||
|
} else if ('\u0F8A' == cp) {
|
||||||
|
return CC_0F8A;
|
||||||
|
} else if ('\u0F39' == cp) {
|
||||||
|
return CC_0F39;
|
||||||
|
} else if ('\u0F71' == cp) {
|
||||||
|
return CC_SUBSCRIBED_ACHUNG;
|
||||||
|
} else if ('\u0F40' <= cp && cp <= '\u0F6A') {
|
||||||
|
ThdlDebug.verify(cp != '\u0F48'); // DLC FIXME: assert this
|
||||||
|
return CC_CON;
|
||||||
|
} else if ('\u0F90' <= cp && cp <= '\u0FBC') {
|
||||||
|
ThdlDebug.verify(cp != '\u0F98'); // DLC FIXME: assert this
|
||||||
|
return CC_SJC;
|
||||||
|
} else if ('\u0F20' <= cp && cp <= '\u0F33') {
|
||||||
|
return CC_DIGIT;
|
||||||
|
} else if (/* DLC NOW do these combine ONLY with digits, or do CC_CM just NOT combine with digits? */
|
||||||
|
'\u0F3E' == cp
|
||||||
|
|| '\u0F3F' == cp
|
||||||
|
|| '\u0F18' == cp
|
||||||
|
|| '\u0F19' == cp) {
|
||||||
|
return CC_MCWD;
|
||||||
|
} else if ('\u0FC6' == cp
|
||||||
|
|| '\u0F87' == cp
|
||||||
|
|| '\u0F86' == cp
|
||||||
|
|| '\u0F84' == cp
|
||||||
|
|| '\u0F83' == cp
|
||||||
|
|| '\u0F82' == cp
|
||||||
|
|| '\u0F7F' == cp
|
||||||
|
|| '\u0F7E' == cp
|
||||||
|
|| '\u0F37' == cp /* DLC NOW NORMALIZATION OF 0F10, 11 to 0F0F ??? */
|
||||||
|
|| '\u0F35' == cp) {
|
||||||
|
return CC_CM;
|
||||||
|
} else if ('\u0F72' == cp
|
||||||
|
|| '\u0F74' == cp
|
||||||
|
|| '\u0F7A' == cp
|
||||||
|
|| '\u0F7B' == cp
|
||||||
|
|| '\u0F7C' == cp
|
||||||
|
|| '\u0F7D' == cp
|
||||||
|
|| '\u0F80' == cp) {
|
||||||
|
// DLC what about U+0F84 ??? CC_V or CC_CM ?
|
||||||
|
return CC_V;
|
||||||
|
} else {
|
||||||
|
return CC_SIN;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -387,7 +387,7 @@ public class UnicodeGraphemeCluster
|
||||||
throw new Error("DLC FIXME");
|
throw new Error("DLC FIXME");
|
||||||
}
|
}
|
||||||
|
|
||||||
// DLC NOW -- LegalSyllable doesn't handle digits w/ underlining, etc.
|
// DLC NOW -- LegalTshegBar doesn't handle digits w/ underlining, etc.
|
||||||
|
|
||||||
/** If this is a Tibetan consonant stack, this returns the root
|
/** If this is a Tibetan consonant stack, this returns the root
|
||||||
* letter. If this is a Tibetan digit (perhaps with other
|
* letter. If this is a Tibetan digit (perhaps with other
|
||||||
|
|
|
@ -32,6 +32,7 @@ import java.io.InputStream;
|
||||||
public class UnicodeReader {
|
public class UnicodeReader {
|
||||||
/** You cannot instantiate this class. */
|
/** You cannot instantiate this class. */
|
||||||
private UnicodeReader() { }
|
private UnicodeReader() { }
|
||||||
|
// DLC NOW
|
||||||
|
|
||||||
// public static TTBIR parsePerfectUnicode() {
|
// public static TTBIR parsePerfectUnicode() {
|
||||||
// }
|
// }
|
||||||
|
|
|
@ -18,73 +18,22 @@ Contributor(s): ______________________________________.
|
||||||
|
|
||||||
package org.thdl.tib.text.tshegbar;
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
/** Constants and static routines (DLC still?) useful in writing state
|
/** Constants useful in writing state machines for transforming
|
||||||
* machines for transforming Unicode input into other forms.
|
* Unicode input into other forms.
|
||||||
|
*
|
||||||
|
* @see TransitionInstruction#getCCForCP(char)
|
||||||
*
|
*
|
||||||
* @author David Chandler
|
* @author David Chandler
|
||||||
*/
|
*/
|
||||||
interface UnicodeReadingStateMachineConstants {
|
interface UnicodeReadingStateMachineConstants {
|
||||||
|
|
||||||
/** Returns the codepoint class for cp, e.g. {@link #CC_SJC}.
|
// Codepoint classes (CC_...) follow. These are mutually
|
||||||
* @param cp a Unicode codepoint, which MUST be nondecomposable
|
|
||||||
* if it is in the Tibetan range but can be from outside the
|
|
||||||
* Tibetan range of Unicode */
|
|
||||||
static int getCCForCP(char cp) {
|
|
||||||
assert(getNFTHDL(cp) == null);
|
|
||||||
if ('\u0F82' == cp) {
|
|
||||||
return CC_0F82;
|
|
||||||
} else if ('\u0F8A' == cp) {
|
|
||||||
return CC_0F8A;
|
|
||||||
} else if ('\u0F39' == cp) {
|
|
||||||
return CC_0F39;
|
|
||||||
} else if ('\u0F71' == cp) {
|
|
||||||
return CC_ACHUNG;
|
|
||||||
} else if ('\u0F40' <= cp && cp <= '\u0F6A') {
|
|
||||||
assert(cp != '\u0F48');
|
|
||||||
return CC_CON;
|
|
||||||
} else if ('\u0F90' <= cp && cp <= '\u0FBC') {
|
|
||||||
assert(cp != '\u0F98');
|
|
||||||
return CC_SJC;
|
|
||||||
} else if ('\u0F20' <= cp && cp <= '\u0F33') {
|
|
||||||
return CC_DIGIT;
|
|
||||||
} else if (/* DLC NOW do these combine ONLY with digits, or do CC_CM just NOT combine with digits? */
|
|
||||||
'\u0F3E' == cp
|
|
||||||
|| '\u0F3F' == cp
|
|
||||||
|| '\u0F18' == cp
|
|
||||||
|| '\u0F19' == cp) {
|
|
||||||
return CC_MCWD;
|
|
||||||
} else if ('\u0FC6' == cp
|
|
||||||
|| '\u0F87' == cp
|
|
||||||
|| '\u0F86' == cp
|
|
||||||
|| '\u0F84' == cp
|
|
||||||
|| '\u0F83' == cp
|
|
||||||
|| '\u0F82' == cp
|
|
||||||
|| '\u0F7F' == cp
|
|
||||||
|| '\u0F7E' == cp
|
|
||||||
|| '\u0F37' == cp /* DLC NOW NORMALIZATION OF 0F10, 11 to 0F0F ??? */
|
|
||||||
|| '\u0F35' == cp) {
|
|
||||||
return CC_CM;
|
|
||||||
} else if ('\u0F72' == cp
|
|
||||||
|| '\u0F74' == cp
|
|
||||||
|| '\u0F7A' == cp
|
|
||||||
|| '\u0F7B' == cp
|
|
||||||
|| '\u0F7C' == cp
|
|
||||||
|| '\u0F7D' == cp
|
|
||||||
|| '\u0F80' == cp) {
|
|
||||||
// DLC what about U+0F84 ??? CC_V or CC_CM ?
|
|
||||||
return CC_V;
|
|
||||||
} else {
|
|
||||||
return CC_SIN;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// codepoint classes (CC_...) follow. These are mutually
|
|
||||||
// exclusive, and their union is the whole of Unicode.
|
// exclusive, and their union is the whole of Unicode.
|
||||||
|
|
||||||
/** for everything else, i.e. non-Tibetan characters like U+0E00
|
/** for everything else, that is to say non-Tibetan characters
|
||||||
* and also Tibetan characters like U+0FCF and U+0F05 (DLC rename
|
* like U+0E00 and also Tibetan characters like U+0FCF and U+0F05
|
||||||
* SIN[GLETON] to OTHER as combining marks from outside the
|
* (DLC rename SIN[GLETON] to OTHER as combining marks from
|
||||||
* Tibetan range count as this) but not U+0F8A */
|
* outside the Tibetan range count as this) but not U+0F8A */
|
||||||
static final int CC_SIN = 0;
|
static final int CC_SIN = 0;
|
||||||
|
|
||||||
/** for combining marks in the Tibetan range of Unicode that
|
/** for combining marks in the Tibetan range of Unicode that
|
||||||
|
@ -95,18 +44,18 @@ interface UnicodeReadingStateMachineConstants {
|
||||||
* CC_MCWD, U+0F82, and U+0F39 */
|
* CC_MCWD, U+0F82, and U+0F39 */
|
||||||
static final int CC_CM = 2;
|
static final int CC_CM = 2;
|
||||||
|
|
||||||
/** for combining consonants, i.e. U+0F90-U+0FBC minus U+0F98
|
/** for combining consonants, that is to say U+0F90-U+0FBC minus
|
||||||
* minus the decomposable entries like U+0F93, U+0F9D, U+0FA2,
|
* U+0F98 minus the decomposable entries like U+0F93, U+0F9D,
|
||||||
* etc. */
|
* U+0FA2, etc. */
|
||||||
static final int CC_SJC = 3;
|
static final int CC_SJC = 3;
|
||||||
|
|
||||||
/** for noncombining consonants, i.e. U+0F40-U+0F6A minus U+0F48
|
/** for noncombining consonants, that is to say U+0F40-U+0F6A
|
||||||
* minus the decomposable entries like U+0F43, U+0F4D, U+0F52,
|
* minus U+0F48 minus the decomposable entries like U+0F43,
|
||||||
* etc. */
|
* U+0F4D, U+0F52, etc. */
|
||||||
static final int CC_CON = 4;
|
static final int CC_CON = 4;
|
||||||
|
|
||||||
/** for simple, nondecomposable vowels, i.e. U+0F72, U+0F74,
|
/** for simple, nondecomposable vowels, that is to say U+0F72,
|
||||||
* U+0F7A, U+0F7B, U+0F7C, U+0F7D, U+0F80 */
|
* U+0F74, U+0F7A, U+0F7B, U+0F7C, U+0F7D, U+0F80 */
|
||||||
static final int CC_V = 5;
|
static final int CC_V = 5;
|
||||||
|
|
||||||
/** for U+0F8A */
|
/** for U+0F8A */
|
||||||
|
@ -121,9 +70,9 @@ interface UnicodeReadingStateMachineConstants {
|
||||||
static final int CC_0F39 = 8;
|
static final int CC_0F39 = 8;
|
||||||
|
|
||||||
/** for U+0F71 */
|
/** for U+0F71 */
|
||||||
static final int CC_ACHUNG = 9;
|
static final int CC_SUBSCRIBED_ACHUNG = 9;
|
||||||
|
|
||||||
/** for digits, i.e. U+0F20-U+0F33 */
|
/** for digits, that is to say U+0F20-U+0F33 */
|
||||||
static final int CC_DIGIT = 10;
|
static final int CC_DIGIT = 10;
|
||||||
|
|
||||||
|
|
||||||
|
@ -133,14 +82,14 @@ interface UnicodeReadingStateMachineConstants {
|
||||||
/** initial state */
|
/** initial state */
|
||||||
static final int STATE_START = 0;
|
static final int STATE_START = 0;
|
||||||
|
|
||||||
/** ready state, i.e. the state in which some non-empty Unicode
|
/** ready state, that is to say the state in which some non-empty
|
||||||
* String is in the holding area, <i>ready</i> to receive
|
* Unicode String is in the holding area, <i>ready</i> to receive
|
||||||
* combining marks like U+0F35 */
|
* combining marks like U+0F35 */
|
||||||
static final int STATE_READY = 1;
|
static final int STATE_READY = 1;
|
||||||
|
|
||||||
/** digit state, i.e. the state in which some non-empty Unicode
|
/** digit state, that is to say the state in which some non-empty
|
||||||
* String consisting entirely of digits is in the holding area,
|
* Unicode String consisting entirely of digits is in the holding
|
||||||
* ready to receive marks that combine only with digits */
|
* area, ready to receive marks that combine only with digits */
|
||||||
static final int STATE_DIGIT = 2;
|
static final int STATE_DIGIT = 2;
|
||||||
|
|
||||||
/** state in which CC_SJC are welcomed and treated as consonants
|
/** state in which CC_SJC are welcomed and treated as consonants
|
||||||
|
|
|
@ -18,14 +18,17 @@ Contributor(s): ______________________________________.
|
||||||
|
|
||||||
package org.thdl.tib.text.tshegbar;
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
/** Don't instantiate this class. */
|
/** Don't instantiate this class. */
|
||||||
private Foo() { super(); }
|
private ValidatingUnicodeReader() { super(); }
|
||||||
|
|
||||||
/** This table tells how to transition from state a 6 states + error state */
|
/** This table tells how to transition from state to state upon
|
||||||
|
* encountering certain classes of Unicode codepoints. There are
|
||||||
|
* 6 legal states + an error state. */
|
||||||
private static final TransitionInstruction
|
private static final TransitionInstruction
|
||||||
transitionTable[6 /* number of STATEs */]
|
transitionTable[/* 6 is the number of STATEs */][/* 11 is the number of CC classes */]
|
||||||
[11 /* number of CC classes */]
|
|
||||||
= {
|
= {
|
||||||
// STATE_START:
|
// STATE_START:
|
||||||
{
|
{
|
||||||
|
@ -50,7 +53,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_0F39 in this state: */
|
/* upon seeing CC_0F39 in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_ACHUNG in this state: */
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_DIGIT in this state: */
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
new TransitionInstruction(STATE_DIGIT,
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
@ -73,7 +76,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
new TransitionInstruction(STATE_STACKING,
|
new TransitionInstruction(STATE_STACKING,
|
||||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
/* upon seeing CC_V in this state: */
|
/* upon seeing CC_V in this state: */
|
||||||
null
|
null,
|
||||||
/* upon seeing CC_0F8A in this state: */
|
/* upon seeing CC_0F8A in this state: */
|
||||||
new TransitionInstruction(STATE_PARTIALMARK,
|
new TransitionInstruction(STATE_PARTIALMARK,
|
||||||
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
ACTION_BEGINS_NEW_GRAPHEME_CLUSTER),
|
||||||
|
@ -82,7 +85,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
/* upon seeing CC_0F39 in this state: */
|
/* upon seeing CC_0F39 in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_ACHUNG in this state: */
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
||||||
null, // because 0F71 comes after SJCs, before Vs, and
|
null, // because 0F71 comes after SJCs, before Vs, and
|
||||||
// before CMs.
|
// before CMs.
|
||||||
/* upon seeing CC_DIGIT in this state: */
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
@ -115,7 +118,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
/* upon seeing CC_0F39 in this state: */
|
/* upon seeing CC_0F39 in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_ACHUNG in this state: */
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_DIGIT in this state: */
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
new TransitionInstruction(STATE_DIGIT,
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
@ -149,7 +152,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
/* upon seeing CC_0F39 in this state: */
|
/* upon seeing CC_0F39 in this state: */
|
||||||
new TransitionInstruction(STATE_STACKING,
|
new TransitionInstruction(STATE_STACKING,
|
||||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
/* upon seeing CC_ACHUNG in this state: */
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
||||||
new TransitionInstruction(STATE_STACKPLUSACHUNG,
|
new TransitionInstruction(STATE_STACKPLUSACHUNG,
|
||||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
/* upon seeing CC_DIGIT in this state: */
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
|
@ -182,7 +185,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
/* upon seeing CC_0F39 in this state: */
|
/* upon seeing CC_0F39 in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_ACHUNG in this state: */
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_DIGIT in this state: */
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
new TransitionInstruction(STATE_DIGIT,
|
new TransitionInstruction(STATE_DIGIT,
|
||||||
|
@ -209,48 +212,48 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
ACTION_CONTINUES_GRAPHEME_CLUSTER),
|
||||||
/* upon seeing CC_0F39 in this state: */
|
/* upon seeing CC_0F39 in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_ACHUNG in this state: */
|
/* upon seeing CC_SUBSCRIBED_ACHUNG in this state: */
|
||||||
null,
|
null,
|
||||||
/* upon seeing CC_DIGIT in this state: */
|
/* upon seeing CC_DIGIT in this state: */
|
||||||
null
|
null
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
DLC NOW -- clearly, we need LegalSyllable to be convertable to and from GraphemeClusters;
|
/* DLC NOW FIXME -- clearly, we need LegalTshegBar to be convertible to and from UnicodeGraphemeClusters; */
|
||||||
|
|
||||||
/** Breaks a sequence of GraphemeClusters into LegalSyllables.
|
/** Breaks a sequence of UnicodeGraphemeClusters into LegalTshegBars.
|
||||||
@param grcls a sequence of nonnull GraphemeClusters
|
@param grcls a sequence of nonnull UnicodeGraphemeClusters
|
||||||
@return a sequence of nonnull LegalSyllables
|
@return a sequence of nonnull LegalTshegBars
|
||||||
@exception TibetanSyntaxException if grcls does not consist
|
@exception TibetanSyntaxException if grcls does not consist
|
||||||
entirely of legal Tibetan syllables
|
entirely of legal Tibetan syllables
|
||||||
@see #GraphemeCluster
|
@see UnicodeGraphemeCluster
|
||||||
@see #LegalSyllable
|
@see LegalTshegBar
|
||||||
*/
|
*/
|
||||||
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls)
|
private static Vector breakGraphemeClustersIntoOnlyTshegBars(Vector grcls)
|
||||||
throws TibetanSyntaxException
|
throws TibetanSyntaxException
|
||||||
{
|
{
|
||||||
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
|
return breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(grcls,
|
||||||
true);
|
true);
|
||||||
}
|
}
|
||||||
|
|
||||||
private static Vector breakGraphemeClustersIntoOnlySyllables(Vector grcls) {
|
private static Vector breakLegalGraphemeClustersIntoOnlyTshegBars(Vector grcls) {
|
||||||
try {
|
try {
|
||||||
return breakGraphemeClustersIntoSyllablesAndGraphemeClusters(grcls,
|
return breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(grcls,
|
||||||
false);
|
false);
|
||||||
} catch (TibetanSyntaxException) {
|
} catch (TibetanSyntaxException ex) {
|
||||||
throw new Error("This can never happen, because the second parameter, validating, was false.");
|
throw new Error("This can never happen, because the second parameter, validating, was false.");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
@param grcls a Vector consisting entirely of GraphemeClusters
|
@param grcls a Vector consisting entirely of UnicodeGraphemeClusters
|
||||||
@param validate true iff you wish to have a
|
@param validate true iff you wish to have a
|
||||||
TibetanSyntaxException thrown upon encountering a sequence of
|
TibetanSyntaxException thrown upon encountering a sequence of
|
||||||
GraphemeClusters that is syntactically incorrect Tibetan
|
UnicodeGraphemeClusters that is syntactically incorrect Tibetan
|
||||||
@return if validate is true, a Vector consisting entirely of
|
@return if validate is true, a Vector consisting entirely of
|
||||||
LegalSyllables, else a vector of LegalSyllables and
|
LegalTshegBars, else a vector of LegalTshegBars and
|
||||||
GraphemeClusters */
|
UnicodeGraphemeClusters */
|
||||||
private static Vector breakGraphemeClustersIntoSyllablesAndGraphemeClusters(Vector grcls,
|
private static Vector breakGraphemeClustersIntoTshegBarsAndGraphemeClusters(Vector grcls,
|
||||||
boolean validate)
|
boolean validate)
|
||||||
throws TibetanSyntaxException
|
throws TibetanSyntaxException
|
||||||
{
|
{
|
||||||
|
@ -258,8 +261,8 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
int grcls_len = grcls.length();
|
int grcls_len = grcls.length();
|
||||||
int beginning_of_cluster = 0;
|
int beginning_of_cluster = 0;
|
||||||
for (int i = 0; i < grcls_len; i++) {
|
for (int i = 0; i < grcls_len; i++) {
|
||||||
GraphemeCluster current_grcl
|
UnicodeGraphemeCluster current_grcl
|
||||||
= (GraphemeCluster)grcls.elementAt(i);
|
= (UnicodeGraphemeCluster)grcls.elementAt(i);
|
||||||
if (current_grcl.isTshegLike()) {
|
if (current_grcl.isTshegLike()) {
|
||||||
if (beginning_of_cluster < i) {
|
if (beginning_of_cluster < i) {
|
||||||
// One or more non-tsheg-like grapheme clusters is
|
// One or more non-tsheg-like grapheme clusters is
|
||||||
|
@ -269,7 +272,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
beginning_of_cluster,
|
beginning_of_cluster,
|
||||||
i))
|
i))
|
||||||
{
|
{
|
||||||
syllables.add(new LegalSyllable(grcls,
|
syllables.add(new LegalTshegBar(grcls,
|
||||||
beginning_of_cluster,
|
beginning_of_cluster,
|
||||||
i, tsheg=current_grcl));
|
i, tsheg=current_grcl));
|
||||||
}
|
}
|
||||||
|
@ -299,17 +302,17 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Breaks a string of perfectly-formed Unicode into
|
/** Breaks a string of perfectly-formed Unicode into
|
||||||
GraphemeClusters.
|
UnicodeGraphemeClusters.
|
||||||
@param nfthdl_unicode a String of NFTHDL-normalized Unicode
|
@param nfthdl_unicode a String of NFTHDL-normalized Unicode
|
||||||
codepoints
|
codepoints
|
||||||
@exception Exception if the input is not perfectly formed
|
@exception Exception if the input is not perfectly formed
|
||||||
@return a vector of GraphemeClusters
|
@return a vector of UnicodeGraphemeClusters
|
||||||
@see #GraphemeCluster
|
@see UnicodeGraphemeCluster
|
||||||
*/
|
*/
|
||||||
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
|
private static Vector nonErrorCorrectingReader(String nfthdl_unicode)
|
||||||
throws Exception
|
throws Exception
|
||||||
{
|
{
|
||||||
// a vector of GraphemeClusters that we build up little by
|
// a vector of UnicodeGraphemeClusters that we build up little by
|
||||||
// little:
|
// little:
|
||||||
Vector grcls = new Vector();
|
Vector grcls = new Vector();
|
||||||
int currentState = STATE_START;
|
int currentState = STATE_START;
|
||||||
|
@ -326,7 +329,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants {
|
||||||
} else {
|
} else {
|
||||||
switch (ti.getAction()) {
|
switch (ti.getAction()) {
|
||||||
case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
|
case ACTION_BEGINS_NEW_GRAPHEME_CLUSTER:
|
||||||
grcls.add(new GraphemeCluster(holdingPen));
|
grcls.add(new UnicodeGraphemeCluster(holdingPen));
|
||||||
holdingPen = new StringBuffer();
|
holdingPen = new StringBuffer();
|
||||||
break;
|
break;
|
||||||
case ACTION_CONTINUES_GRAPHEME_CLUSTER:
|
case ACTION_CONTINUES_GRAPHEME_CLUSTER:
|
||||||
|
|
|
@ -19,12 +19,12 @@ Contributor(s): ______________________________________.
|
||||||
package org.thdl.util;
|
package org.thdl.util;
|
||||||
import java.io.*;
|
import java.io.*;
|
||||||
|
|
||||||
/** Used by {@link LinkedList} to provide the implementation of a
|
/** Used by {@link SimplifiedLinkedList} to provide the implementation of a
|
||||||
simple dynamic link list.
|
simple dynamic link list.
|
||||||
|
|
||||||
@author Andrés Montano Pellegrini
|
@author Andrés Montano Pellegrini
|
||||||
@see LinkedList
|
@see SimplifiedLinkedList
|
||||||
@see ListIterator
|
@see SimplifiedListIterator
|
||||||
*/
|
*/
|
||||||
|
|
||||||
public class Link
|
public class Link
|
||||||
|
@ -111,4 +111,4 @@ public class Link
|
||||||
else siguiente.insertSorted(link);
|
else siguiente.insertSorted(link);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue