ACIP now stacks greedily. TTTTTA is T+T+T+T+TA, even though that stack doesn't exist in TM or TMW. Robert Chilton, in personal correspondence, agreed that this is the way to do things.
ACIP handles the appendages 'AM, 'ANG, 'US, 'UR, 'I, 'O, and 'U correctly.
This commit is contained in:
parent
5f4fbfab7c
commit
5e18feb47d
10 changed files with 576 additions and 348 deletions
|
@ -236,13 +236,25 @@ public class TibetanDocument extends DefaultStyledDocument {
|
|||
// PERFORMANCE FIXME: this isn't so speedy, but it reuses
|
||||
// existing code.
|
||||
for (int i = 0; i < glyphs.length; i++) {
|
||||
insertDuff(getLength(),
|
||||
new DuffData[] { new DuffData(new String(new char[] { glyphs[i].getCharacter() }),
|
||||
glyphs[i].getFontNum()) },
|
||||
color);
|
||||
appendDuffCode(glyphs[i], color);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends glyph to the end of this document.
|
||||
* @param glyph the Tibetan glyph you want to insert
|
||||
* @param color the color in which to insert, which is used if and only
|
||||
* if {@link #colorsEnabled() colors are enabled}
|
||||
*/
|
||||
public void appendDuffCode(DuffCode glyph, Color color) {
|
||||
// PERFORMANCE FIXME: this isn't so speedy, but it reuses
|
||||
// existing code.
|
||||
insertDuff(getLength(),
|
||||
new DuffData[] { new DuffData(new String(new char[] { glyph.getCharacter() }),
|
||||
glyph.getFontNum()) },
|
||||
color);
|
||||
}
|
||||
|
||||
|
||||
/** Replacing can be more efficient than inserting and then
|
||||
removing. This replaces the glyph at position pos with glyph,
|
||||
|
|
|
@ -1,3 +1,6 @@
|
|||
// DLC NOW: 'US etc. -- do we handle them all?
|
||||
// DLC NOW WARN ON NNYA and DBA
|
||||
// DLC NOW: implement Robert Chilton-supplied prefix rules
|
||||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
|
@ -348,13 +351,14 @@ public class ACIPConverter {
|
|||
if (null != tdoc) tdoc.appendRoman(text, Color.BLACK);
|
||||
} else {
|
||||
String unicode = null;
|
||||
DuffCode[] duff = null;
|
||||
Object[] duff = null;
|
||||
if (stype == TString.TIBETAN_NON_PUNCTUATION) {
|
||||
lastGuyWasNonPunct = true;
|
||||
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
|
||||
TPairList pls[] = TPairListFactory.breakACIPIntoChunks(s.getText());
|
||||
String acipError;
|
||||
|
||||
if ((acipError = pl.getACIPError()) != null) {
|
||||
if ((acipError = pls[0].getACIPError()) != null
|
||||
&& (null == pls[1] || pls[1].getACIPError() != null)) {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]";
|
||||
if (null != writer) writer.write(errorMessage);
|
||||
|
@ -362,8 +366,10 @@ public class ACIPConverter {
|
|||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
TParseTree pt = pl.getParseTree();
|
||||
if (null == pt) {
|
||||
TParseTree pt0 = pls[0].getParseTree();
|
||||
TParseTree pt1 = ((null == pls[1])
|
||||
? null : pls[1].getParseTree());
|
||||
if (null == pt0 && null == pt1) {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]";
|
||||
if (null != writer) writer.write(errorMessage);
|
||||
|
@ -371,8 +377,10 @@ public class ACIPConverter {
|
|||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
TStackList sl = pt.getBestParse();
|
||||
if (null == sl) {
|
||||
TStackList sl0 = pt0.getBestParse();
|
||||
TStackList sl1 = ((null == pt1)
|
||||
? null : pt1.getBestParse());
|
||||
if (null == sl0 && null == sl1) {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]";
|
||||
if (null != writer) writer.write(errorMessage);
|
||||
|
@ -380,6 +388,25 @@ public class ACIPConverter {
|
|||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
TStackList sl = sl0;
|
||||
TPairList pl = pls[0];
|
||||
TParseTree pt = pt0;
|
||||
// set sl equal to the best choice of sl0 and sl1.
|
||||
if (null != sl1) {
|
||||
BoolTriple sl0bt = sl0.isLegalTshegBar(false);
|
||||
BoolTriple sl1bt = sl1.isLegalTshegBar(false);
|
||||
int ct;
|
||||
if ((ct = sl0bt.compareTo(sl1bt)) < 0) {
|
||||
sl = sl1;
|
||||
pl = pls[1];
|
||||
pt = pt1;
|
||||
} else if (0 == ct) {
|
||||
// sl remains sl0 -- '* is
|
||||
// a vowel unless it's
|
||||
// clearly part of an
|
||||
// appendage like 'AM.
|
||||
}
|
||||
}
|
||||
lastGuy = sl;
|
||||
String warning = null;
|
||||
if ("None" != warningLevel) {
|
||||
|
@ -428,10 +455,10 @@ public class ACIPConverter {
|
|||
color = Color.BLACK;
|
||||
if (stype == TString.START_SLASH) {
|
||||
if (null != writer) unicode = "\u0F3C";
|
||||
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") };
|
||||
if (null != tdoc) duff = new Object[] { TibetanMachineWeb.getGlyph("(") };
|
||||
} else if (stype == TString.END_SLASH) {
|
||||
if (null != writer) unicode = "\u0F3D";
|
||||
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
|
||||
if (null != tdoc) duff = new Object[] { TibetanMachineWeb.getGlyph(")") };
|
||||
} else if (stype == TString.TIBETAN_PUNCTUATION) {
|
||||
// For ACIP, tshegs are used as both
|
||||
// tshegs and whitespace. We treat a
|
||||
|
@ -499,7 +526,7 @@ public class ACIPConverter {
|
|||
} else {
|
||||
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
||||
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
||||
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
|
||||
duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -526,7 +553,18 @@ public class ACIPConverter {
|
|||
if (null != writer && null != unicode) writer.write(unicode);
|
||||
if (null != tdoc) {
|
||||
if (null != duff && 0 != duff.length) {
|
||||
tdoc.appendDuffCodes(duff, color);
|
||||
for (int j = 0; j < duff.length; j++) {
|
||||
if (duff[j] instanceof DuffCode)
|
||||
tdoc.appendDuffCode((DuffCode)duff[j],
|
||||
color);
|
||||
else {
|
||||
hasErrors = true;
|
||||
if (null != errors)
|
||||
errors.append((String)duff[j] + "\n");
|
||||
tdoc.appendRoman((String)duff[j],
|
||||
Color.RED);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// this happens when you have an
|
||||
// [#ERROR]-producing tsheg bar.
|
||||
|
|
|
@ -179,6 +179,11 @@ public class ACIPRules {
|
|||
wylieToACIP.put(EWTS, ACIP);
|
||||
}
|
||||
|
||||
/** Returns true if and only if s is an ACIP consonant. */
|
||||
static final boolean isACIPConsonant(String s) {
|
||||
return (null != ACIPRules.getWylieForACIPConsonant(s));
|
||||
}
|
||||
|
||||
private static HashMap acipConsonant2wylie = null;
|
||||
/** Returns the EWTS corresponding to the given ACIP consonant
|
||||
* (without the "A" vowel). Returns null if there is no such
|
||||
|
|
File diff suppressed because one or more lines are too long
|
@ -118,7 +118,7 @@ class TPair {
|
|||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r)
|
||||
|| "A".equals(r)) // DLC though check for BASKYABS and warn because BSKYABS is more common
|
||||
|| "A".equals(r)) // DLC FIXME: though check for BASKYABS and warn because BSKYABS is more common
|
||||
&& ("'".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "B".equals(l)
|
||||
|
@ -126,12 +126,52 @@ class TPair {
|
|||
|| "G".equals(l)));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair could be a Tibetan
|
||||
* secondary sufffix. */
|
||||
boolean isPostSuffix() {
|
||||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r)
|
||||
|| "A".equals(r)) // DLC FIXME: though warn about GAMASA vs. GAMS
|
||||
&& ("S".equals(l)
|
||||
|| "D".equals(l)));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair could be a Tibetan
|
||||
* sufffix. DLC FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */
|
||||
boolean isSuffix() {
|
||||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r)
|
||||
|| "A".equals(r))
|
||||
&& ("S".equals(l)
|
||||
|| "G".equals(l)
|
||||
|| "D".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "'".equals(l)
|
||||
|| "B".equals(l)
|
||||
|| "NG".equals(l)
|
||||
|| "N".equals(l)
|
||||
|| "L".equals(l)
|
||||
|| "R".equals(l)));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair is merely a
|
||||
* disambiguator. */
|
||||
boolean isDisambiguator() {
|
||||
return ("-".equals(r) && getLeft() == null);
|
||||
}
|
||||
|
||||
/** Yep, this works for TPairs. */
|
||||
public boolean equals(Object x) {
|
||||
if (x instanceof TPair) {
|
||||
TPair p = (TPair)x;
|
||||
return ((getLeft() == p.getLeft() || (getLeft() != null && getLeft().equals(p.getLeft())))
|
||||
|| (getRight() == p.getRight() || (getRight() != null && getRight().equals(p.getRight()))));
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
/** Returns an TPair that is like this pair except that it has
|
||||
* a "+" on the right if this pair is empty on the right and is
|
||||
* empty on the right if this pair has a disambiguator (i.e., a
|
||||
|
@ -195,4 +235,11 @@ class TPair {
|
|||
if (null != x) sb.append(x);
|
||||
}
|
||||
}
|
||||
|
||||
/** Returns true if this pair is surely the last pair in an ACIP
|
||||
* stack. Stacking continues through (* . ) and (* . +), but
|
||||
* stops anywhere else. */
|
||||
boolean endsACIPStack() {
|
||||
return (getRight() != null && !"+".equals(getRight()));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -284,216 +284,216 @@ class TPairList {
|
|||
* syntax) to do so. If this list of pairs has something clearly
|
||||
* illegal in it, or is empty, or is merely a list of
|
||||
* disambiguators etc., then this returns null. Never returns an
|
||||
* empty parse tree. */
|
||||
* empty parse tree.
|
||||
*/
|
||||
public TParseTree getParseTree() {
|
||||
TParseTree pt = new TParseTree();
|
||||
// We treat [(B . ), (G . +), (K . ), (T . A)] as if it could
|
||||
// be {B+G+K+T} or {B}{G+K+T}; we handle prefixes specially
|
||||
// this way. [(T . ), (G . +), (K . ), (T . A)] is clearly
|
||||
// {T+G+K+TA} (and, DLC FIXME, we should warn that there are
|
||||
// some pluses but not all)
|
||||
//
|
||||
// We don't care if T+G+K+T is in TMW or not -- there is no
|
||||
// master list of stacks.
|
||||
|
||||
int sz = size();
|
||||
int firstPair = 0;
|
||||
for (int i = 0; i < sz; i++) {
|
||||
|
||||
// We treat [(B . ), (G . +), (K . ), (T . A)] as if it
|
||||
// could be {B+G+K+T} or {B}{G+K}{T} or {B+G+K}{T} or
|
||||
// {B}{G+K+T} (modulo stack legality); we're conservative.
|
||||
// (Though some stacks won't be legal.)
|
||||
TPair p = get(i);
|
||||
if (p.getLeft() == null && !"-".equals(p.getRight()))
|
||||
return null; // clearly illegal.
|
||||
if ("+".equals(p.getLeft()))
|
||||
return null; // clearly illegal.
|
||||
if (":".equals(p.getLeft()))
|
||||
return null; // clearly illegal.
|
||||
if ("m".equals(p.getLeft()))
|
||||
return null; // clearly illegal.
|
||||
if ("m:".equals(p.getLeft()))
|
||||
return null; // clearly illegal.
|
||||
}
|
||||
|
||||
|
||||
TParseTree pt = new TParseTree();
|
||||
if (sz < 1) return null;
|
||||
|
||||
// When we see a stretch of ACIP without a disambiguator or a
|
||||
// vowel, that stretch is taken to be one stack unless it may
|
||||
// be prefix-root or suffix-postsuffix or suffix/postsuffix-'
|
||||
// -- the latter necessary because GAMS'I is GAM-S-'I, not
|
||||
// GAM-S+'I. 'UR, 'US, 'ANG, 'AM, 'I, 'O, 'U -- all begin
|
||||
// with '. So we can have zero, one, two, or three special
|
||||
// break locations. (The kind that aren't special are the
|
||||
// break after G in G-DAMS, or the break after G in GADAMS or
|
||||
// GEDAMS.)
|
||||
//
|
||||
// If a nonnegative number appears in breakLocations[i], it
|
||||
// means that pair i may or may not be stacked with pair i+1.
|
||||
int nextBreakLoc = 0;
|
||||
int breakLocations[] = { -1, -1, -1 };
|
||||
|
||||
boolean mayHavePrefix;
|
||||
|
||||
// Handle the first pair specially -- it could be a prefix.
|
||||
if (ddebug) System.out.println("i is " + 0);
|
||||
if ((mayHavePrefix = get(0).isPrefix()) && null == get(0).getRight()) {
|
||||
// special case: we must have a branch in the parse tree
|
||||
// for the initial part of this pair list. For example,
|
||||
// is DKHYA D+KH+YA or D-KH+YA? It depends on prefix
|
||||
// rules (can KH+YA take a DA prefix?), so the parse tree
|
||||
// includes both.
|
||||
breakLocations[nextBreakLoc++] = 0;
|
||||
}
|
||||
|
||||
// stack numbers start at 1.
|
||||
int stackNumber = (get(0).endsACIPStack()) ? 2 : 1;
|
||||
// this starts at 0.
|
||||
int stackStart = (get(0).endsACIPStack()) ? 1 : 0;
|
||||
|
||||
int numeric = 0; // 1 means surely, 0 means we don't know yet, -1 means surely not
|
||||
|
||||
for (int i = 1; i < sz; i++) {
|
||||
if (ddebug) System.out.println("i is " + i);
|
||||
TPair p = get(i);
|
||||
if (p.getRight() == null && firstPair + 1 < sz) {
|
||||
// Here's the ambiguity. Let's fill up sl. (B . ) (G
|
||||
// . +) (K . A) could be {B+G+KA} or {BA}{G+KA}, so we
|
||||
// go until we hit a vowel and then break into
|
||||
// TPairLists.
|
||||
int start = firstPair;
|
||||
int blanks[] = new int[sz - start]; // we may not use all of this.
|
||||
int j;
|
||||
for (j = start; j < sz; j++) {
|
||||
TPair pj = get(j);
|
||||
boolean isBlank;
|
||||
if (ddebug) System.out.println("right guy is " + pj.getRight());
|
||||
if (pj.isDisambiguator())
|
||||
blanks[j-start] = ALWAYS_STOP_STACKING;
|
||||
else {
|
||||
if (!(isBlank = (pj.getRight() == null)) && !"+".equals(pj.getRight())) {
|
||||
if (ddebug) System.out.println("breaker breaker at j=" + j);
|
||||
break;
|
||||
}
|
||||
blanks[j-start] = isBlank ? STOP_STACK : ALWAYS_KEEP_STACKING;
|
||||
}
|
||||
}
|
||||
if (j >= sz) j = sz - 1;
|
||||
|
||||
blanks[j-start] = ALWAYS_STOP_STACKING;
|
||||
|
||||
// get(j) [corresponding to blanks[j-i]] is
|
||||
// the last pair in the ambiguous stretch; get(i)
|
||||
// [corresponding to blanks[0]] is the first.
|
||||
|
||||
// We'll end up doing 2**(j-i+1) (i.e., (1 <<
|
||||
// (j-i+1))) iterations. If that's going to be too
|
||||
// many, let's just say there's no legal parse. FIXME:
|
||||
// give a nice error message in this case.
|
||||
if (ddebug) System.out.println("ddebug: we're going to do 2^" + (j-i+1) + " [or " + (1 << (j-i+1)) + "] wacky iterations!");
|
||||
if ((j-i+1) > 13) // if you don't use 13, then change PackageTest.testSlowestTshegBar().
|
||||
return null;
|
||||
|
||||
boolean keepGoing = true;
|
||||
TStackListList sll = new TStackListList();
|
||||
do {
|
||||
// Add the stack list currently specified by
|
||||
// blanks if all the stacks in it are legal.
|
||||
// DLC DELETE {
|
||||
// ArrayList x = new ArrayList((j-start+1));
|
||||
// for (int ii = 0; ii < (j-start+1); ii++)
|
||||
// x.add(new Integer(blanks[ii]));
|
||||
// }
|
||||
TStackList sl = new TStackList(sz - start);
|
||||
boolean illegal = false;
|
||||
TPairList currentStack = new TPairList();
|
||||
for (int k = 0; k < j-start+1; k++) {
|
||||
TPair pk = get(start + k);
|
||||
if (!pk.isDisambiguator()) {
|
||||
currentStack.add(pk.insideStack());
|
||||
if (blanks[k] == STOP_STACK) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack())
|
||||
sl.add(currentStack.asStack());
|
||||
else {
|
||||
illegal = true;
|
||||
break;
|
||||
}
|
||||
currentStack = new TPairList();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!illegal && !currentStack.isEmpty()) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack()) {
|
||||
TPairList stack = currentStack.asStack();
|
||||
if (ddebug) System.out.println("adding currentStack " + stack + " to sl " + sl);
|
||||
sl.add(stack);
|
||||
boolean nn;
|
||||
if ((nn = p.isNumeric()) && ("+".equals(get(i-1).getRight())
|
||||
|| "+".equals(p.getRight())))
|
||||
return null; // clearly illegal. You can't stack numbers.
|
||||
if (nn) {
|
||||
if (-1 == numeric)
|
||||
return null; // you can't mix numbers and letters.
|
||||
else if (0 == numeric)
|
||||
numeric = 1;
|
||||
} else {
|
||||
illegal = true;
|
||||
if (numeric == 1)
|
||||
return null; // you can't mix numbers and letters.
|
||||
else if (0 == numeric && !p.isDisambiguator())
|
||||
numeric = -1;
|
||||
}
|
||||
|
||||
if (i+1==sz || p.endsACIPStack()) {
|
||||
if (/* the stack ending here might really be
|
||||
suffix-postsuffix or
|
||||
suffix-appendage or
|
||||
suffix-postsuffix-appendage */
|
||||
(mayHavePrefix && (stackNumber == 2 || stackNumber == 3))
|
||||
|| (!mayHavePrefix && (stackNumber == 2))) {
|
||||
if (i > stackStart) {
|
||||
if (get(stackStart).isSuffix()
|
||||
&& (get(stackStart+1).isPostSuffix() // suffix-postsuffix
|
||||
|| "'".equals(get(stackStart+1).getLeft()))) // suffix-appendage
|
||||
breakLocations[nextBreakLoc++] = stackStart;
|
||||
if (i > stackStart + 1) {
|
||||
// three to play with, maybe it's
|
||||
// suffix-postsuffix-appendage.
|
||||
if (get(stackStart).isSuffix()
|
||||
&& get(stackStart+1).isPostSuffix()
|
||||
&& "'".equals(get(stackStart+2).getLeft()))
|
||||
breakLocations[nextBreakLoc++] = stackStart+1;
|
||||
}
|
||||
}
|
||||
if (!illegal) {
|
||||
if (ddebug) System.out.println("adding sl " + sl + " to sll " + sll);
|
||||
// else no need to insert a breakLocation, we're
|
||||
// breaking hard.
|
||||
}
|
||||
if (/* the stack ending here might really be
|
||||
postsuffix-appendage (e.g., GDAM-S'O) */
|
||||
(mayHavePrefix && (stackNumber == 3 || stackNumber == 4))
|
||||
|| (!mayHavePrefix && (stackNumber == 3))) {
|
||||
if (i == stackStart+1) { // because GDAM--S'O is illegal, and because it's 'ANG, not 'NG, 'AM, not 'M -- ' always ends the stack
|
||||
if (get(stackStart).isPostSuffix()
|
||||
&& "'".equals(get(stackStart+1).getLeft()))
|
||||
breakLocations[nextBreakLoc++] = stackStart;
|
||||
}
|
||||
}
|
||||
++stackNumber;
|
||||
stackStart = i+1;
|
||||
}
|
||||
}
|
||||
// DLC FIXME: we no longer need all these breakLocations -- we can handle SAM'AM'ANG
|
||||
|
||||
// Now go from hard break (i.e., (* . VOWEL or -)) to hard
|
||||
// break (and there's a hard break after the last pair, of
|
||||
// course, even if it is (G . ) or (G . +) [the latter being
|
||||
// hideously illegal]). Between the hard breaks, there will
|
||||
// be 1, 2, or 4 (can you see why 8 isn't possible, though
|
||||
// numBreaks can be 3?) possible parses. There are two of DGA
|
||||
// in DGAMS'O -- D-GA and D+GA. There are 4 of MS'O in
|
||||
// DGAMS'O -- M-S-'O, M-S+'O, M+S-'O, and M+S+'O. Add one
|
||||
// TStackListList per hard break to pt, the parse tree.
|
||||
int startLoc = 0; // which pair starts this hard break?
|
||||
|
||||
// DLC FIXME: assert this
|
||||
if ((breakLocations[1] >= 0 && breakLocations[1] <= breakLocations[0])
|
||||
|| (breakLocations[2] >= 0 && breakLocations[2] <= breakLocations[1]))
|
||||
throw new Error("breakLocations is monotonically increasing, ain't it?");
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
if (i+1 == sz || get(i).endsACIPStack()) {
|
||||
TStackListList sll = new TStackListList(4); // maximum is 4.
|
||||
|
||||
int numBreaks = 0;
|
||||
int breakStart = -1;
|
||||
for (int jj = 0; jj < breakLocations.length; jj++) {
|
||||
if (breakLocations[jj] >= startLoc
|
||||
&& breakLocations[jj] <= i) {
|
||||
if (breakStart < 0)
|
||||
breakStart = jj;
|
||||
++numBreaks;
|
||||
}
|
||||
}
|
||||
|
||||
// Count from [0, 1<<numBreaks). At each point,
|
||||
// counter equals b2b1b0 in binary. 1<<numBreaks is
|
||||
// the number of stack lists there are in this stack
|
||||
// list list of the parse tree. Break at location
|
||||
// breakLocations[breakStart+0] if and only if b0 is
|
||||
// one, at location breakLocations[breakStart+1] if
|
||||
// and only if b1 is one, etc.
|
||||
for (int counter = 0; counter < (1<<numBreaks); counter++) {
|
||||
TStackList sl = new TStackList();
|
||||
TPairList currentStack = new TPairList();
|
||||
for (int k = startLoc; k <= i; k++) {
|
||||
if (!get(k).isDisambiguator()) {
|
||||
if (get(k).isNumeric()
|
||||
|| (get(k).getLeft() != null
|
||||
&& ACIPRules.isConsonant(get(k).getLeft())))
|
||||
currentStack.add(get(k).insideStack());
|
||||
else
|
||||
return null; // sA, for example, is illegal.
|
||||
}
|
||||
if (k == i || get(k).endsACIPStack()) {
|
||||
if (!currentStack.isEmpty())
|
||||
sl.add(currentStack.asStack());
|
||||
currentStack = new TPairList();
|
||||
} else {
|
||||
if (numBreaks > 0) {
|
||||
for (int j = 0; breakStart+j < 3; j++) {
|
||||
if (k == breakLocations[breakStart+j]
|
||||
&& 1 == ((counter >> j) & 1)) {
|
||||
if (!currentStack.isEmpty())
|
||||
sl.add(currentStack.asStack());
|
||||
currentStack = new TPairList();
|
||||
break; // shouldn't matter, but you never know
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!sl.isEmpty()) {
|
||||
sll.add(sl);
|
||||
}
|
||||
}
|
||||
|
||||
// Update blanks. Think of this as doing base 2
|
||||
// arithmetic where STOP_STACK is zero,
|
||||
// KEEP_STACKING is one, and ALWAYS_KEEP_STACKING
|
||||
// and ALWAYS_STOP_STACKING are digits we cannot
|
||||
// modify. We'll end up doing 2^M iterations,
|
||||
// where M is the number of fields in blanks that
|
||||
// are not equal to ALWAYS_KEEP_STACKING or
|
||||
// ALWAYS_STOP_STACKING.
|
||||
keepGoing = false;
|
||||
for (int k = j-start; k >= 0; k--) {
|
||||
if (blanks[k] == STOP_STACK) {
|
||||
keepGoing = true;
|
||||
blanks[k] = KEEP_STACKING;
|
||||
// reset all digits to the right of k to
|
||||
// "zero":
|
||||
for (int m = k + 1; m < j-start+1; m++) {
|
||||
if (blanks[m] == KEEP_STACKING)
|
||||
blanks[m] = STOP_STACK;
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
} while (keepGoing);
|
||||
if (sll.isEmpty())
|
||||
return null; // STXAL or shT+ZNAGN, e.g.
|
||||
else {
|
||||
if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt);
|
||||
if (!sll.isEmpty())
|
||||
pt.add(sll);
|
||||
startLoc = i+1;
|
||||
}
|
||||
}
|
||||
|
||||
if (ddebug) System.out.println("i is " + i + " and j is " + j + " and we are resetting so that i==j+1 next time.");
|
||||
i = j;
|
||||
firstPair = j + 1;
|
||||
} else if ("+".equals(p.getRight())) {
|
||||
// Keep firstPair where it is.
|
||||
} else {
|
||||
// Add all pairs in the range [firstPair, i]. Some
|
||||
// pairs are stacks all by themselves, some pairs have
|
||||
// '+' on the right and are thus just part of a stack.
|
||||
// We'll add a whole number of stacks, though.
|
||||
|
||||
// this is initialized to hold the max we might use:
|
||||
TStackListList sll
|
||||
= new TStackListList(i - firstPair + 1);
|
||||
|
||||
TPairList currentStack = new TPairList();
|
||||
for (int j = firstPair; j <= i; j++) {
|
||||
TPair pj = get(j);
|
||||
if (!pj.isDisambiguator()) {
|
||||
currentStack.add(pj.insideStack());
|
||||
if (!"+".equals(pj.getRight())) {
|
||||
if (currentStack.isLegalTibetanOrSanskritStack())
|
||||
sll.add(new TStackList(currentStack.asStack()));
|
||||
else {
|
||||
return null;
|
||||
}
|
||||
currentStack = new TPairList();
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!currentStack.isEmpty())
|
||||
throw new Error("how can this happen? currentStack is " + currentStack);
|
||||
|
||||
if (!sll.isEmpty()) {
|
||||
if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt);
|
||||
pt.add(sll);
|
||||
firstPair = i + 1;
|
||||
} // else you probably have {G--YA} or something as
|
||||
// your tsheg bar.
|
||||
}
|
||||
}
|
||||
if (pt.isEmpty()) return null;
|
||||
return pt;
|
||||
}
|
||||
|
||||
/** Returns true if and only if this list of TPairs can be
|
||||
* interpreted as a legal Tibetan stack or a legal Tibetanized
|
||||
* Sanskrit stack. This is private because a precondition is
|
||||
* that no vowels or disambiguators appear except possibly in the
|
||||
* final pair. */
|
||||
private boolean isLegalTibetanOrSanskritStack() {
|
||||
StringBuffer tibetan = new StringBuffer();
|
||||
StringBuffer sanskrit = new StringBuffer();
|
||||
int sz = size();
|
||||
|
||||
// Special case because otherwise wa-zur alone would be seen
|
||||
// as legal.
|
||||
if (sz == 1 && "V".equals(get(0).getLeft()))
|
||||
return false;
|
||||
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TPair p = get(i);
|
||||
String ewts_form
|
||||
= ACIPRules.getWylieForACIPConsonant(p.getLeft());
|
||||
if (null == ewts_form) {
|
||||
if (p.isNumeric())
|
||||
ewts_form = p.getLeft();
|
||||
}
|
||||
if (null == ewts_form) {
|
||||
if (ddebug) System.out.println("testing " + toString2() + " for legality said false. numeric?" + p.isNumeric() + "[1]");
|
||||
return false;
|
||||
}
|
||||
tibetan.append(ewts_form);
|
||||
sanskrit.append(ewts_form);
|
||||
if (i + 1 < sz) {
|
||||
tibetan.append('-');
|
||||
sanskrit.append('+');
|
||||
}
|
||||
}
|
||||
boolean ans =
|
||||
(TibetanMachineWeb.hasGlyph(tibetan.toString())
|
||||
|| TibetanMachineWeb.hasGlyph(sanskrit.toString()));
|
||||
if (ddebug) System.out.println("testing " + toString2() + " for legality said " + ans + " [2]; san is " + sanskrit + " tib is " + tibetan + ".");
|
||||
return ans;
|
||||
}
|
||||
private static final boolean ddebug = false;
|
||||
|
||||
/** Mutates this TPairList object such that the last pair is
|
||||
|
@ -611,9 +611,11 @@ class TPairList {
|
|||
}
|
||||
|
||||
/** Appends the DuffCodes that correspond to this grapheme cluster
|
||||
* to duff. Assumes this is one grapheme cluster. */
|
||||
void getDuff(ArrayList duff) {
|
||||
int previousSize = duff.size();
|
||||
* to duffsAndErrors, or appends a String that is an error
|
||||
* message saying that TMW cannot represent this grapheme
|
||||
* cluster. */
|
||||
void getDuff(ArrayList duffsAndErrors) {
|
||||
int previousSize = duffsAndErrors.size();
|
||||
StringBuffer wylieForConsonant = new StringBuffer();
|
||||
for (int x = 0; x + 1 < size(); x++) {
|
||||
wylieForConsonant.append(get(x).getWylie(false));
|
||||
|
@ -624,17 +626,18 @@ class TPairList {
|
|||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
||||
hashKey = hashKey.replace('+', '-');
|
||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
||||
throw new Error("How did this happen?");
|
||||
duffsAndErrors.add("[#ERROR The ACIP {" + recoverACIP() + "} cannot be represented with the TibetanMachine or TibetanMachineWeb fonts because no such glyph exists in these fonts.]");
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (lastPair.getRight() == null || lastPair.equals("-")) {
|
||||
duff.add(TibetanMachineWeb.getGlyph(hashKey));
|
||||
duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey));
|
||||
} else {
|
||||
ACIPRules.getDuffForACIPVowel(duff,
|
||||
ACIPRules.getDuffForACIPVowel(duffsAndErrors,
|
||||
TibetanMachineWeb.getGlyph(hashKey),
|
||||
lastPair.getRight());
|
||||
}
|
||||
if (previousSize == duff.size())
|
||||
if (previousSize == duffsAndErrors.size())
|
||||
throw new Error("TPairList with no duffs? " + toString()); // DLC FIXME: change to assertion.
|
||||
}
|
||||
}
|
||||
|
|
|
@ -38,9 +38,42 @@ class TPairListFactory {
|
|||
* rest would be suboptimal, so we backtrack to [(T . )] and then
|
||||
* finally become [(T . ), (A . A)]. We look for (A . ) and (
|
||||
* . <vowel>) in the rest in order to say "the rest would be
|
||||
* suboptimal", i.e. we use TPairList.hasSimpleError()
|
||||
* @param acip a string of ACIP with no punctuation in it */
|
||||
static TPairList breakACIPIntoChunks(String acip) {
|
||||
* suboptimal", i.e. we use TPairList.hasSimpleError().</p>
|
||||
*
|
||||
* <p>There is one case where we break things up into two pair
|
||||
* lists -- I found out about this case too late to do anything
|
||||
* clean about it. SNYAM'AM, e.g., breaks up into [(S . ), (NY
|
||||
* . A), (M . 'A), (M . )], which is incorrect -- [(S . ), (NY
|
||||
* . A), (M . ), (' . A), (M . )] is correct. But we don't know
|
||||
* which is correct without parsing, so both are returned. The
|
||||
* clean treatment (low-priority FIXME) would be to lex into a
|
||||
* form that didn't insist 'A was either a vowel or a consonant.
|
||||
* Then the parser would figure it out.</p>
|
||||
*
|
||||
* @param acip a string of ACIP with no punctuation in it
|
||||
* @return an array of one or two pair lists, if the former, then
|
||||
* the second element will be null, if the latter, the second
|
||||
* element will have (* . ), (' . *) instead of (* . '*) which
|
||||
* the former has @throws IllegalArgumentException if acip is too
|
||||
* large for us to break into chunks (we're recursive, not
|
||||
* iterative, so the boundary can be increased a lot if you care,
|
||||
* but you don't) */
|
||||
static TPairList[] breakACIPIntoChunks(String acip) throws IllegalArgumentException {
|
||||
try {
|
||||
TPairList a = breakHelper(acip, true);
|
||||
TPairList b = breakHelper(acip, false);
|
||||
if (a.equals(b))
|
||||
return new TPairList[] { a, null };
|
||||
else
|
||||
return new TPairList[] { a, b };
|
||||
} catch (StackOverflowError e) {
|
||||
throw new IllegalArgumentException("Input too large[1]: " + acip);
|
||||
} catch (OutOfMemoryError e) {
|
||||
throw new IllegalArgumentException("Input too large[2]: " + acip);
|
||||
}
|
||||
}
|
||||
/** Helps {@link #breakACIPIntoChunks(String)}. */
|
||||
private static TPairList breakHelper(String acip, boolean tickIsVowel) {
|
||||
|
||||
// base case for our recursion:
|
||||
if ("".equals(acip))
|
||||
|
@ -50,9 +83,21 @@ class TPairListFactory {
|
|||
int howMuchBuf[] = new int[1];
|
||||
TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf);
|
||||
int howMuch = howMuchBuf[0];
|
||||
if (!tickIsVowel
|
||||
&& null != head.getLeft()
|
||||
&& null != head.getRight()
|
||||
&& head.getRight().startsWith("'")) {
|
||||
head = new TPair(head.getLeft(),
|
||||
// Without this disambiguator, we are
|
||||
// less efficient (8 parses, not 4) and
|
||||
// we can't handle PA'AM'ANG etc.
|
||||
"-");
|
||||
howMuch = head.getLeft().length();
|
||||
}
|
||||
|
||||
TPairList tail;
|
||||
if ((tail
|
||||
= breakACIPIntoChunks(acipBuf.substring(howMuch))).hasSimpleError()) {
|
||||
= breakHelper(acipBuf.substring(howMuch), tickIsVowel)).hasSimpleError()) {
|
||||
for (int i = 1; i < howMuch; i++) {
|
||||
// try giving i characters back if that leaves us with
|
||||
// a legal head and makes the rest free of simple
|
||||
|
@ -61,7 +106,7 @@ class TPairListFactory {
|
|||
TPair newHead;
|
||||
if ((newHead = head.minusNRightmostACIPCharacters(i)).isLegal()
|
||||
&& !(newTail
|
||||
= breakACIPIntoChunks(acipBuf.substring(howMuch - i))).hasSimpleError()) {
|
||||
= breakHelper(acipBuf.substring(howMuch - i), tickIsVowel)).hasSimpleError()) {
|
||||
newTail.prepend(newHead);
|
||||
return newTail;
|
||||
}
|
||||
|
|
|
@ -184,10 +184,7 @@ class TParseTree {
|
|||
}
|
||||
|
||||
/** Returns a list containing the unique legal parse of this parse
|
||||
* tree if there is a unique legal parse. Note that {SRAS} has a
|
||||
* unique legal parse, though {SRS} has two equally good parses;
|
||||
* i.e., note that the {A} vowel is treated specially here
|
||||
* (unlike in {@link #getLegalParses()}). Returns an empty list
|
||||
* tree if there is a unique legal parse. Returns an empty list
|
||||
* if there are no legal parses. Returns a list containing all
|
||||
* legal parses if there two or more equally good parses. By
|
||||
* "legal", we mean a sequence of stacks that is legal
|
||||
|
@ -223,13 +220,21 @@ class TParseTree {
|
|||
if (allStrictlyLegalParses.size() > 2)
|
||||
throw new Error("can this happen?");
|
||||
if (legalParsesWithVowelOnRoot.size() == 2) {
|
||||
if (legalParsesWithVowelOnRoot.get(0).size() != 1 + legalParsesWithVowelOnRoot.get(1).size())
|
||||
throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! " + legalParsesWithVowelOnRoot.get(0) + " ;; " + legalParsesWithVowelOnRoot.get(1));
|
||||
if (legalParsesWithVowelOnRoot.get(0).size()
|
||||
!= 1 + legalParsesWithVowelOnRoot.get(1).size()) {
|
||||
// MARDA is MAR+DA or MA-R-DA -- both are legal if
|
||||
// noPrefixTests.
|
||||
return new TStackListList();
|
||||
} else {
|
||||
// G-YA vs. GYA.
|
||||
return new TStackListList(legalParsesWithVowelOnRoot.get(1));
|
||||
}
|
||||
}
|
||||
if (allNonillegalParses.size() == 2) {
|
||||
if (allNonillegalParses.get(0).size() != 1 + allNonillegalParses.get(1).size())
|
||||
throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! " + allNonillegalParses.get(0) + " ;; " + allNonillegalParses.get(1));
|
||||
if (allNonillegalParses.get(0).size() != 1 + allNonillegalParses.get(1).size()) {
|
||||
// BDREN, e.g., if noPrefixTests:
|
||||
return new TStackListList();
|
||||
}
|
||||
return new TStackListList(allNonillegalParses.get(1));
|
||||
}
|
||||
return allNonillegalParses;
|
||||
|
|
|
@ -131,7 +131,7 @@ class TStackList {
|
|||
* stack can take every prefix, which is not the case in
|
||||
* reality */
|
||||
public BoolTriple isLegalTshegBar(boolean noPrefixTests) {
|
||||
// DLC handle PADMA and other Tibetanized Sanskrit fellows consistently. Right now we only treat single-stack Sanskrit guys as legal.
|
||||
// DLC Should we handle PADMA and other Tibetanized Sanskrit fellows consistently? Right now we only treat single-stack Sanskrit guys as legal.
|
||||
|
||||
TTGCList tgcList = new TTGCList(this);
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
|
@ -191,8 +191,10 @@ class TStackList {
|
|||
* @param isLastStack if non-null, then isLastStack[0] will be
|
||||
* set to true if and only if the very last stack is the only
|
||||
* stack not to have a vowel or disambiguator on it */
|
||||
// DLC FIXME: DELETE THIS WARNING and this code unless EWTS will need it...
|
||||
boolean hasStackWithoutVowel(TPairList opl, boolean[] isLastStack) {
|
||||
int runningSize = 0;
|
||||
// DLC FIXME: MARDA is MARD==MAR-D to us, but is probably MAR+DA, warn
|
||||
for (int i = 0; i < size(); i++) {
|
||||
TPairList pl = get(i);
|
||||
String l;
|
||||
|
@ -207,7 +209,7 @@ class TStackList {
|
|||
}
|
||||
}
|
||||
if (runningSize != opl.sizeMinusDisambiguators())
|
||||
throw new IllegalArgumentException("opl (" + opl + ") is bad for this stack list (" + toString() + ")");
|
||||
throw new IllegalArgumentException("runningSize = " + runningSize + "; opl.sizeMinusDisambiguators = " + opl.sizeMinusDisambiguators() + "; opl (" + opl + ") is bad for this stack list (" + toString() + ")");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -219,8 +221,11 @@ class TStackList {
|
|||
}
|
||||
return u.toString();
|
||||
}
|
||||
/** Returns the DuffCodes corresponding to this stack list. */
|
||||
DuffCode[] getDuff() {
|
||||
/** Returns the DuffCodes and errors corresponding to this stack
|
||||
list. Each element of the array is a DuffCode or a String, the
|
||||
latter if and only if the TMW font cannot represent the
|
||||
corresponding stack in this list. */
|
||||
Object[] getDuff() {
|
||||
ArrayList al = new ArrayList(size()*2); // rough estimate
|
||||
int count = 0;
|
||||
for (int i = 0; i < size(); i++) {
|
||||
|
@ -229,20 +234,40 @@ class TStackList {
|
|||
if (size() > 0 && al.size() == 0) {
|
||||
throw new Error("But this stack list, " + this + ", contains " + size() + " stacks! How can it not have DuffCodes associated with it?");
|
||||
}
|
||||
return (DuffCode[])al.toArray(new DuffCode[] { });
|
||||
return al.toArray();
|
||||
}
|
||||
}
|
||||
|
||||
/** A triple of booleans describing how legal something is: whether it
 *  is legal at all, whether it is legal only as Sanskrit, and whether
 *  it is legal with a vowel on the root.  Instances are ordered by
 *  {@link #compareTo(Object)} so that the most legal triple compares
 *  highest.  For every combination the constructor accepts, the
 *  ordering is consistent with {@link #equals(Object)}. */
class BoolTriple implements Comparable {
    /** True if and only if this is legal. */
    boolean isLegal;

    /** True if this is legal, but only as Sanskrit.  Single Sanskrit
     *  stacks, such as B+DE, are this way. */
    boolean isLegalButSanskrit;

    /** True if this is legal and has a vowel on the root. */
    boolean isLegalAndHasAVowelOnRoot;

    /** Creates a new triple.
     *  @param isLegal true if legal
     *  @param isLegalButSanskrit true if legal but Sanskrit
     *  @param isLegalAndHasAVowelOnRoot true if legal with a vowel on
     *  the root
     *  @throws IllegalArgumentException if isLegal is false but either
     *  of the other two flags, each of which implies legality, is
     *  true */
    BoolTriple(boolean isLegal,
               boolean isLegalButSanskrit,
               boolean isLegalAndHasAVowelOnRoot) {
        if (!isLegal && (isLegalButSanskrit || isLegalAndHasAVowelOnRoot))
            throw new IllegalArgumentException("isLegal is false, so isLegalButSanskrit (" + isLegalButSanskrit + ") and isLegalAndHasAVowelOnRoot (" + isLegalAndHasAVowelOnRoot + ") must both be false");
        this.isLegal = isLegal;
        this.isLegalButSanskrit = isLegalButSanskrit;
        this.isLegalAndHasAVowelOnRoot = isLegalAndHasAVowelOnRoot;
    }

    /** Returns a number that is higher the more legal this triple is.
     *  Being legal and having a vowel on the root each add five;
     *  being Sanskrit subtracts three. */
    private int score() {
        int score = 0;
        if (isLegalAndHasAVowelOnRoot) {
            score += 5;
        }
        if (isLegal) {
            score += 5;
        }
        if (isLegalButSanskrit) {
            score -= 3;
        }
        return score;
    }

    /** The most legal BoolTriple compares higher.
     *  @param o the BoolTriple to compare against
     *  @return a negative number, zero, or a positive number as this
     *  triple is less legal than, as legal as, or more legal than o
     *  @throws ClassCastException if o is not a BoolTriple */
    public int compareTo(Object o) {
        BoolTriple b = (BoolTriple)o;
        // Scores lie between -3 and 10, so this subtraction cannot
        // overflow.
        return score() - b.score();
    }

    /** Returns true if and only if o is a BoolTriple with the same
     *  three flags as this one.  Every combination of flags that the
     *  constructor accepts has a distinct score, so for such triples
     *  this agrees with {@link #compareTo(Object)} returning zero. */
    public boolean equals(Object o) {
        if (this == o)
            return true;
        if (!(o instanceof BoolTriple))
            return false;
        BoolTriple b = (BoolTriple)o;
        return (isLegal == b.isLegal
                && isLegalButSanskrit == b.isLegalButSanskrit
                && isLegalAndHasAVowelOnRoot == b.isLegalAndHasAVowelOnRoot);
    }

    /** Returns a hash code consistent with {@link #equals(Object)}:
     *  the three flags packed into the low three bits. */
    public int hashCode() {
        return ((isLegal ? 4 : 0)
                | (isLegalButSanskrit ? 2 : 0)
                | (isLegalAndHasAVowelOnRoot ? 1 : 0));
    }
}
|
||||
|
|
|
@ -25,6 +25,12 @@ Machine Web and methods for converting EWTS transliteration into
|
|||
Tibetan Machine Web. It has extensive tests, though probably not
|
||||
mentioned in these Javadoc documents.
|
||||
</p>
|
||||
<p>
|
||||
When you see the term "Sanskrit" used here, it often means
|
||||
non-native (not native Tibetan, in other words) rather than truly
|
||||
Tibetanized Sanskrit. It is overloaded to refer to Tibetanized
|
||||
Chinese, Tibetanized Sanskrit, etc.
|
||||
</p>
|
||||
<h2>Related Documentation</h2>
|
||||
@see <a href="../package-summary.html">org.thdl.tib.text</a>
|
||||
</body>
|
||||
|
|
Loading…
Reference in a new issue