323 lines
14 KiB
Java
323 lines
14 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: If EWTS still has 'v', warn about it if it looks like someone thinks that ACIP's usage of it for wa-zur is how EWTS does things.
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
/** A factory for creating {@link TPairList TPairLists} from
|
|
* Strings of ACIP.
|
|
* @author David Chandler */
|
|
class TPairListFactory {
|
|
/** This class is not instantiable. */
|
|
private TPairListFactory() { }
|
|
|
|
/** Returns one or two new TPairList instances. Breaks an ACIP
|
|
* tsheg bar (roughly a "syllable") into chunks; this
|
|
* computes l' (for you design doc enthusiasts).
|
|
*
|
|
* <p>Here's a rough sketch of the algorithm: run along getting
|
|
* the current TPair as big as you can. If you get it very
|
|
* big, but there's something illegal afterward that wouldn't
|
|
* otherwise be illegal, undo as little as possible to correct.
|
|
* For example, G'A'I becomes [(G . 'A), (' . I)], and TAA
|
|
* becomes [(T . A)] in a first pass but then we see that the
|
|
* rest would be suboptimal, so we backtrack to [(T . )] and then
|
|
* finally become [(T . ), (A . A)]. We look for (A . ) and (
|
|
* . <vowel>) in the rest in order to say "the rest would be
|
|
* suboptimal", i.e. we use TPairList.hasSimpleError(TTraits).</p>
|
|
*
|
|
* <p>There is one case where we break things up into two pair
|
|
* lists if and only if specialHandlingForAppendages is true -- I
|
|
* thought the converter had a bug because I saw SNYAM'AM in
|
|
* KD0003I2.ACT. I asked Robert Chilton, though, and he said
|
|
* "SNYAM'AM " was likely a typo for "SNYAM 'AM", so leave
|
|
* specialHandlingForAppendages false.</p>
|
|
*
|
|
* <p>I found out about (OK, as it turns out, imagined) this case
|
|
* too late to do anything clean about it. SNYAM'AM, e.g.,
|
|
* breaks up into [(S . ), (NY . A), (M . 'A), (M . )], which is
|
|
* incorrect -- [(S . ), (NY . A), (M . ), (' . A), (M . )] is
|
|
* correct. But we don't know which is correct without parsing,
|
|
* so both are returned. The clean treatment would be to lex
|
|
* into a form that didn't insist 'A was either a vowel or a
|
|
* consonant. Then the parser would figure it out. But don't
|
|
* bother, because specialHandlingForAppendages should be false
|
|
* always.</p>
|
|
*
|
|
* @param acip a string of ACIP with no punctuation in it
|
|
* @param specialHandlingForAppendages true if and only if you
|
|
* want SNYAM'AM to ultimately parse as {S+NYA}{M}{'A}{M} instead
|
|
* of {S+NYA}{M'A}{M}
|
|
* @return an array of one or two pair lists, if the former, then
|
|
* the second element will be null, if the latter, the second
|
|
* element will have (* . ), (' . *) instead of (* . '*) which
|
|
* the former has
|
|
* @throws IllegalArgumentException if acip is too large for us
|
|
* to break into chunks (we're recursive, not iterative, so the
|
|
* boundary can be increased a lot if you care, but you don't) */
|
|
static TPairList[] breakACIPIntoChunks(String acip,
|
|
boolean specialHandlingForAppendages)
|
|
throws IllegalArgumentException
|
|
{
|
|
try {
|
|
TTraits ttraits = ACIPTraits.instance();
|
|
TPairList a = breakHelperACIP(acip, true, false, ttraits);
|
|
TPairList b = null;
|
|
if (specialHandlingForAppendages)
|
|
b = breakHelperACIP(acip, false, false, ttraits);
|
|
if (null != b && a.equals(b))
|
|
return new TPairList[] { a, null };
|
|
else
|
|
return new TPairList[] { a, b };
|
|
} catch (StackOverflowError e) {
|
|
throw new IllegalArgumentException("Input too large[1]: " + acip);
|
|
} catch (OutOfMemoryError e) {
|
|
throw new IllegalArgumentException("Input too large[2]: " + acip);
|
|
}
|
|
}
|
|
|
|
/** TODO(DLC)[EWTS->Tibetan]: doc */
|
|
static TPairList[] breakEWTSIntoChunks(String ewts)
|
|
throws IllegalArgumentException
|
|
{
|
|
try {
|
|
return new TPairList[] {
|
|
breakHelperEWTS(ewts, EWTSTraits.instance()), null
|
|
};
|
|
} catch (StackOverflowError e) {
|
|
throw new IllegalArgumentException("Input too large[1]: " + ewts);
|
|
} catch (OutOfMemoryError e) {
|
|
throw new IllegalArgumentException("Input too large[2]: " + ewts);
|
|
}
|
|
}
|
|
|
|
/** Helps {@link #breakACIPIntoChunks(String,boolean)}.
|
|
* @param tickIsVowel true if and only if you want to treat the
|
|
* ACIP {'} as an U+0F71 vowel instead of the full-sized
|
|
* consonant in special, "this might be an appendage like 'AM or
|
|
* 'ANG" circumstances
|
|
* @param weHaveSeenVowelAlready true if and only if, in our
|
|
* recursion, we've already found one vowel (not a disambiguator,
|
|
* but a vowel like "A", "E", "Um:", "m", "'U", etc.) */
|
|
private static TPairList breakHelperACIP(String acip, boolean tickIsVowel,
|
|
boolean weHaveSeenVowelAlready,
|
|
TTraits ttraits) {
|
|
|
|
// base case for our recursion:
|
|
if ("".equals(acip))
|
|
return new TPairList();
|
|
|
|
StringBuffer acipBuf = new StringBuffer(acip);
|
|
int howMuchBuf[] = new int[1];
|
|
TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf, ttraits);
|
|
int howMuch = howMuchBuf[0];
|
|
if (!tickIsVowel
|
|
&& null != head.getLeft()
|
|
&& null != head.getRight()
|
|
&& weHaveSeenVowelAlready
|
|
&& ACIPRules.isACIPSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
|
|
&& head.getRight().startsWith("'")) {
|
|
head = new TPair(head.getLeft(),
|
|
// Without this disambiguator, we are
|
|
// less efficient (8 parses, not 4) and
|
|
// we can't handle PA'AM'ANG etc.
|
|
"-");
|
|
howMuch = head.getLeft().length();
|
|
}
|
|
|
|
TPairList tail;
|
|
if ((tail
|
|
= breakHelperACIP(acipBuf.substring(howMuch),
|
|
tickIsVowel,
|
|
weHaveSeenVowelAlready
|
|
|| (head.getRight() != null
|
|
&& !"+".equals(head.getRight())
|
|
&& !"-".equals(head.getRight())),
|
|
ttraits)).hasSimpleError(ttraits)) {
|
|
for (int i = 1; i < howMuch; i++) {
|
|
// try giving i characters back if that leaves us with
|
|
// a legal head and makes the rest free of simple
|
|
// errors.
|
|
TPairList newTail = null;
|
|
TPair newHead;
|
|
if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal()
|
|
&& !(newTail
|
|
= breakHelperACIP(acipBuf.substring(howMuch - i),
|
|
tickIsVowel,
|
|
weHaveSeenVowelAlready
|
|
|| (newHead.getRight() != null
|
|
&& !"+".equals(newHead.getRight())
|
|
&& !"-".equals(newHead.getRight())),
|
|
ttraits)).hasSimpleError(ttraits)) {
|
|
newTail.prepend(newHead);
|
|
return newTail;
|
|
}
|
|
}
|
|
// It didn't work. Return the first thing we'd thought
|
|
// of: head appended with tail. (I.e., fall through.)
|
|
}
|
|
tail.prepend(head);
|
|
return tail;
|
|
}
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: doc
|
|
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits /* TODO(DLC)[EWTS->Tibetan]: use */) {
|
|
|
|
// base case for our recursion:
|
|
if ("".equals(ewts))
|
|
return new TPairList();
|
|
|
|
StringBuffer ewtsBuf = new StringBuffer(ewts);
|
|
int howMuchBuf[] = new int[1];
|
|
TPair head = getFirstConsonantAndVowel(ewtsBuf, howMuchBuf, ttraits);
|
|
int howMuch = howMuchBuf[0];
|
|
|
|
TPairList tail;
|
|
if ((tail = breakHelperEWTS(ewtsBuf.substring(howMuch),
|
|
ttraits)).hasSimpleError(ttraits)) {
|
|
for (int i = 1; i < howMuch; i++) {
|
|
// try giving i characters back if that leaves us with
|
|
// a legal head and makes the rest free of simple
|
|
// errors.
|
|
TPairList newTail = null;
|
|
TPair newHead;
|
|
if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal()
|
|
&& !(newTail
|
|
= breakHelperEWTS(ewtsBuf.substring(howMuch - i), ttraits)).hasSimpleError(ttraits)) {
|
|
newTail.prepend(newHead);
|
|
return newTail;
|
|
}
|
|
}
|
|
// It didn't work. Return the first thing we'd thought
|
|
// of: head appended with tail. (I.e., fall through.)
|
|
}
|
|
tail.prepend(head);
|
|
return tail;
|
|
}
|
|
|
|
/** Returns the largest TPair we can make from the acip starting
|
|
* from the left. This will return a size zero pair if and only
|
|
* if acip is the empty string; otherwise, it may return a pair
|
|
* with either the left or right component empty. This mutates
|
|
* acip when we run into {NA+YA}; it mutates acip into {N+YA}.
|
|
* For {NE+YA}, it does not mutate acip or behave intelligently.
|
|
* A later phase will need to turn that into {N+YE} or an error
|
|
* or whatever you like. howMuch[0] will be set to the number of
|
|
* characters of acip that this call has consumed. */
|
|
private static TPair getFirstConsonantAndVowel(StringBuffer acip, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it?
|
|
int howMuch[],
|
|
TTraits ttraits) {
|
|
// Note that it is *not* the case that if acip.substring(0, N)
|
|
// is legal (according to TPair.isLegal()), then
|
|
// acip.substring(0, N-1) is legal for all N. For example,
|
|
// think of ACIP's {shA} and {KshA}. However, 's' is the only
|
|
// tricky fellow, so it is true that acip.substring(0, N-1) is
|
|
// either legal or ends with 's' if acip.substring(0, N) is
|
|
// legal.
|
|
//
|
|
// We don't, however, use this approach. We just try to find
|
|
// a consonant of length 3, and then, failing that, of length
|
|
// 2, etc. Likewise with vowels. This avoids the issue.
|
|
|
|
int i, xl = acip.length();
|
|
if (0 == xl) {
|
|
howMuch[0] = 0;
|
|
return new TPair(null, null);
|
|
}
|
|
if (acip.charAt(0) == ttraits.disambiguatorChar()) {
|
|
howMuch[0] = 1;
|
|
return new TPair(null, ttraits.disambiguator());
|
|
}
|
|
char ch = acip.charAt(0);
|
|
|
|
// Numbers never appear in stacks, so if you see 1234, that's
|
|
// like seeing 1-2-3-4.
|
|
if (ch >= '0' && ch <= '9') {
|
|
howMuch[0] = 1; // not 2...
|
|
return new TPair(acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
|
|
}
|
|
|
|
String l = null, r = null;
|
|
for (i = Math.min(ttraits.maxConsonantLength(), xl); i >= 1; i--) {
|
|
String t = null;
|
|
if (ttraits.isConsonant(t = acip.substring(0, i))) {
|
|
l = t;
|
|
break;
|
|
}
|
|
}
|
|
int ll = (null == l) ? 0 : l.length();
|
|
if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) {
|
|
howMuch[0] = l.length() + 1;
|
|
return new TPair(l, ttraits.disambiguator());
|
|
}
|
|
if (null != l && xl > ll && acip.charAt(ll) == '+') {
|
|
howMuch[0] = l.length() + 1;
|
|
return new TPair(l, "+");
|
|
}
|
|
for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) {
|
|
String t = null;
|
|
if (ttraits.isWowel(t = acip.substring(ll, ll + i))
|
|
// Or these, which we massage into "Am", "Am:", and
|
|
// "A:" because I didn't think {Pm} should be treated
|
|
// like {PAm} originally:
|
|
// TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE
|
|
|| "m".equals(t) || "m:".equals(t) || ":".equals(t)) {
|
|
r = t;
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Treat {BATA+SA'I} like {BAT+SA'I}:
|
|
int z;
|
|
if (null != l && /* TODO(DLC)[EWTS->Tibetan]: */"A".equals(r) && ((z = ll + /* TODO(DLC)[EWTS->Tibetan]: */"A".length()) < xl)
|
|
&& acip.charAt(z) == '+') {
|
|
acip.deleteCharAt(z-1);
|
|
howMuch[0] = l.length() + 1;
|
|
return new TPair(l, "+");
|
|
}
|
|
|
|
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */
|
|
int mod = 0;
|
|
if ("m".equals(r)) { r = "Am"; mod = -1; }
|
|
if (":".equals(r)) { r = "A:"; mod = -1; }
|
|
if ("m:".equals(r)) { r = "Am:"; mod = -1; }
|
|
if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though...
|
|
|
|
|
|
// what if we see a character that's not part of any wowel or
|
|
// consonant? We return it.
|
|
if (null == l && null == r) {
|
|
howMuch[0] = 1; // not 2...
|
|
// add a disambiguator to avoid exponential running time:
|
|
return new TPair(acip.substring(0, 1),
|
|
(xl == 1) ? null : ttraits.disambiguator());
|
|
}
|
|
|
|
howMuch[0] = (((l == null) ? 0 : l.length())
|
|
+ ((r == null) ? 0 : r.length())
|
|
+ mod);
|
|
return new TPair(l, r);
|
|
} // TODO(DLC)[EWTS->Tibetan]:
|
|
}
|
|
|
|
|
|
// FIXME: test for nested comments
|
|
|
|
// FIXME: see Translit directory on ACIP v4 CD-ROM
|