712 lines
32 KiB
Java
712 lines
32 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: If EWTS still has 'v', warn about it if it looks like someone thinks that ACIP's usage of it for wa-zur is how EWTS does things.
|
|
|
|
package org.thdl.tib.text.ttt;
|
|
|
|
import java.util.Arrays;
|
|
import java.util.Comparator;
|
|
import java.util.List;
|
|
|
|
import org.thdl.tib.text.THDLWylieConstants;
|
|
import org.thdl.tib.text.TibetanMachineWeb;
|
|
|
|
/** A factory for creating {@link TPairList TPairLists} from
 *  Strings of ACIP or EWTS transliteration.
 *  @author David Chandler */
|
|
// TODO(DLC)[EWTS->Tibetan]: kill this class; put it all in TTraits.
|
|
class TPairListFactory {
|
|
/** Private and empty: every member of this class is static, so no
 *  instance is ever needed. */
private TPairListFactory() {
    // intentionally empty
}
|
|
|
|
/** See {@link TTraits#breakTshegBarIntoChunks}. */
|
|
static TPairList[] breakACIPIntoChunks(String tt,
|
|
boolean specialHandlingForAppendages) {
|
|
TTraits ttraits = ACIPTraits.instance();
|
|
TPairList a = breakHelperACIP(tt, true, false, ttraits);
|
|
TPairList b = null;
|
|
if (specialHandlingForAppendages)
|
|
b = breakHelperACIP(tt, false, false, ttraits);
|
|
if (null != b && a.equals(b))
|
|
return new TPairList[] { a, null };
|
|
else
|
|
return new TPairList[] { a, b };
|
|
}
|
|
|
|
/** Helps {@link #breakACIPIntoChunks(String,boolean)}.
|
|
* @param tickIsVowel true if and only if you want to treat the
|
|
* ACIP {'} as an U+0F71 vowel instead of the full-sized
|
|
* consonant in special, "this might be an appendage like 'AM or
|
|
* 'ANG" circumstances
|
|
* @param weHaveSeenVowelAlready true if and only if, in our
|
|
* recursion, we've already found one vowel (not a disambiguator,
|
|
* but a vowel like "A", "E", "Um:", "m", "'U", etc.) */
|
|
private static TPairList breakHelperACIP(String acip, boolean tickIsVowel,
|
|
boolean weHaveSeenVowelAlready,
|
|
TTraits ttraits) {
|
|
|
|
// base case for our recursion:
|
|
if ("".equals(acip))
|
|
return new TPairList(ttraits);
|
|
|
|
StringBuffer acipBuf = new StringBuffer(acip);
|
|
int howMuchBuf[] = new int[1];
|
|
TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf, ttraits);
|
|
int howMuch = howMuchBuf[0];
|
|
if (!tickIsVowel
|
|
&& null != head.getLeft()
|
|
&& null != head.getRight()
|
|
&& weHaveSeenVowelAlready
|
|
&& ttraits.isSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
|
|
&& head.getRight().startsWith("'")) {
|
|
head = new TPair(ttraits, head.getLeft(),
|
|
// Without this disambiguator, we are
|
|
// less efficient (8 parses, not 4) and
|
|
// we can't handle PA'AM'ANG etc.
|
|
"-");
|
|
howMuch = head.getLeft().length();
|
|
}
|
|
|
|
TPairList tail;
|
|
if ((tail
|
|
= breakHelperACIP(acipBuf.substring(howMuch),
|
|
tickIsVowel,
|
|
weHaveSeenVowelAlready
|
|
|| (head.getRight() != null
|
|
&& !"+".equals(head.getRight())
|
|
&& !"-".equals(head.getRight())),
|
|
ttraits)).hasSimpleError()) {
|
|
for (int i = 1; i < howMuch; i++) {
|
|
// try giving i characters back if that leaves us with
|
|
// a legal head and makes the rest free of simple
|
|
// errors.
|
|
TPairList newTail = null;
|
|
TPair newHead;
|
|
if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal()
|
|
&& !(newTail
|
|
= breakHelperACIP(acipBuf.substring(howMuch - i),
|
|
tickIsVowel,
|
|
weHaveSeenVowelAlready
|
|
|| (newHead.getRight() != null
|
|
&& !"+".equals(newHead.getRight())
|
|
&& !"-".equals(newHead.getRight())),
|
|
ttraits)).hasSimpleError()) {
|
|
newTail.prepend(newHead);
|
|
return newTail;
|
|
}
|
|
}
|
|
// It didn't work. Return the first thing we'd thought
|
|
// of: head appended with tail. (I.e., fall through.)
|
|
}
|
|
tail.prepend(head);
|
|
return tail;
|
|
}
|
|
|
|
/** When true, {@link #breakEWTSIntoChunks(String)} prints each of
 *  its intermediate pair lists to System.out. */
private static final boolean debug = false;
|
|
|
|
/** See {@link TTraits#breakTshegBarIntoChunks}. */
|
|
static TPairList[] breakEWTSIntoChunks(String ewts)
|
|
throws IllegalArgumentException
|
|
{
|
|
EWTSTraits traits = EWTSTraits.instance();
|
|
TPairList pl = breakHelperEWTS(ewts, traits);
|
|
if (debug) System.out.println("breakEWTSIntoChunks: pl is " + pl);
|
|
TPairList npl = pl;
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: this crap ain't workin' for kaHM. But kaeM and kaMe shouldn't work, right? Figure out what EWTS really says...
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: for "a\\0f86" e.g.:
|
|
if (pl.size() > 1) {
|
|
npl = new TPairList(traits, pl.size());
|
|
|
|
for (int i = pl.size() - 1; i >= 1; i--) {
|
|
TPair left = pl.get(i - 1);
|
|
TPair right = pl.get(i);
|
|
if (traits.aVowel().equals(left.getRight())
|
|
&& left.getLeft() == null
|
|
&& right.getLeft() == null
|
|
&& traits.isWowelThatRequiresAChen(right.getRight())) {
|
|
npl.prepend(new TPair(traits, traits.aVowel(), right.getRight()));
|
|
--i;
|
|
} else if (traits.aVowel().equals(left.getRight())
|
|
&& left.getLeft() != null
|
|
&& right.getLeft() == null
|
|
&& traits.isWowelThatRequiresAChen(right.getRight())
|
|
&& false /* TODO(DLC)[EWTS->Tibetan]: ewts kaM is bothersome now */) {
|
|
npl.prepend(new TPair(traits, left.getLeft(), right.getRight()));
|
|
--i;
|
|
} else {
|
|
npl.prepend(right);
|
|
if (i == 1)
|
|
npl.prepend(left);
|
|
}
|
|
}
|
|
}
|
|
pl = null;
|
|
if (debug) System.out.println("breakEWTSIntoChunks: npl is " + npl);
|
|
|
|
TPairList nnpl;
|
|
if (true) {
|
|
// TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it!
|
|
|
|
// Collapse ( . wowel1) ( . wowel2) into (
|
|
// . wowel1+wowel2). Then collapse (* . a) ( . x) into (*
|
|
// . x). Also, if an a-chen (\u0f68) is implied, then
|
|
// insert it.
|
|
TPairList xnnpl = new TPairList(traits, npl.size());
|
|
for (int i = 0; i < npl.size(); ) {
|
|
TPair p = npl.get(i);
|
|
int set_i_to = i + 1;
|
|
if (p.getLeft() == null
|
|
&& p.getRight() != null
|
|
&& !traits.disambiguator().equals(p.getRight())
|
|
&& !"+".equals(p.getRight())) {
|
|
StringBuffer sb = new StringBuffer(p.getRight());
|
|
for (int j = i + 1; j < npl.size(); j++) {
|
|
TPair p2 = npl.get(j);
|
|
if (p2.getLeft() == null
|
|
&& p2.getRight() != null
|
|
&& !traits.disambiguator().equals(p2.getRight())
|
|
&& !"+".equals(p2.getRight()))
|
|
{
|
|
sb.append("+" + p2.getRight());
|
|
set_i_to = j + 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
p = new TPair(traits, traits.aVowel(), sb.toString());
|
|
}
|
|
// TODO(DLC)[EWTS->Tibetan]: Do we still have "ai" converting to the wrong thing. "ae"?
|
|
xnnpl.append(p);
|
|
i = set_i_to;
|
|
}
|
|
|
|
nnpl = new TPairList(traits, xnnpl.size());
|
|
// (* . a ) ( . x) ... ( . y) -> (* . a+x+...+y)
|
|
for (int i = 0; i < xnnpl.size(); ) {
|
|
TPair p = xnnpl.get(i);
|
|
int set_i_to = i + 1;
|
|
if (traits.aVowel().equals(p.getRight())) {
|
|
StringBuffer sb = new StringBuffer(p.getRight());
|
|
for (int j = i + 1; j < xnnpl.size(); j++) {
|
|
TPair p2 = xnnpl.get(j);
|
|
if (p2.getLeft() == null
|
|
&& p2.getRight() != null
|
|
&& !traits.disambiguator().equals(p2.getRight())
|
|
&& !"+".equals(p2.getRight()))
|
|
{
|
|
// TODO(DLC)[EWTS->Tibetan] a+o+e is what we'll get.. maybe we want just o+e?
|
|
sb.append("+" + p2.getRight());
|
|
set_i_to = j + 1;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
p = new TPair(traits, p.getLeft(), sb.toString());
|
|
}
|
|
|
|
if (false) { // TODO(DLC)[EWTS->Tibetan]: bra is screwed up, do in it stacklist?
|
|
// EWTS does not think that kra is k+ra. Replace
|
|
// (consonant . ) with (consonant . DISAMBIGUATOR):
|
|
if (p.getRight() == null && p.getLeft() != null
|
|
&& i + 1 < xnnpl.size())
|
|
p = new TPair(traits, p.getLeft(), traits.disambiguator());
|
|
}
|
|
|
|
nnpl.append(p);
|
|
i = set_i_to;
|
|
}
|
|
} else {
|
|
// TODO(DLC)[EWTS->Tibetan]: this block is not executing. kill it after testing and thinking
|
|
nnpl = new TPairList(traits, npl.size());
|
|
|
|
for (int i = npl.size() - 1; i >= 0; i--) {
|
|
TPair p = npl.get(i);
|
|
if (p.getLeft() == null
|
|
&& p.getRight() != null
|
|
&& !traits.disambiguator().equals(p.getRight())
|
|
&& !"+".equals(p.getRight())) /* TODO(DLC)[EWTS->Tibetan] this should be equivalent to isWowel(p.getRight()) but o+o shows that's not true yet */
|
|
p = new TPair(traits, traits.aVowel(), p.getRight());
|
|
// TODO(DLC)[EWTS->Tibetan]: do you still have "ai" converting to the wrong thing? ("ae" also?)
|
|
nnpl.prepend(p);
|
|
}
|
|
}
|
|
npl = null;
|
|
if (debug) System.out.println("breakEWTSIntoChunks: nnpl is " + nnpl);
|
|
|
|
TPairList nnnpl = transformNativeStacks(traits, nnpl);
|
|
if (debug) System.out.println("breakEWTSIntoChunks: nnnpl is " + nnnpl);
|
|
|
|
return new TPairList[] {
|
|
nnnpl, null
|
|
};
|
|
}
|
|
|
|
/** EWTS helper function that transforms native stacks to include
|
|
* pluses: [(ph . ) (y . ) (w . *)] -> [(ph . +) (y . +) (w
|
|
* . *)], e.g. The tricky case is something like [brgyad] or
|
|
* [brjod] because b+r is a native stack and so is r+g+y (and in
|
|
* fact r+g+y accepts a bao prefix). It's not quite safe to
|
|
* always grab the rightmost native stack from a stretch, as
|
|
* [drwa] proves. You must grab the longest, rightmost stack.
|
|
* In most cases, either way you did it it'd be illegal. In the
|
|
* rest, the only way it can be legal is if there's a prefix and
|
|
* the rightmost stack.
|
|
* @param traits must mesh with orig */
|
|
private static TPairList transformNativeStacks(TTraits traits,
|
|
TPairList orig) {
|
|
// TODO(DLC)[EWTS->Tibetan]: instead of using
|
|
// TibetanMachineWeb's knowledge of the hash keys in tibwn.ini
|
|
// (ph-y-w is a hash key, e.g.), we assume that 3 is the
|
|
// maximum size of a native stack.
|
|
final int maxNativeStackSize = 3;
|
|
// [(s . *)] alone doesn't need transformation. [(s . )
|
|
// (k . *)] does:
|
|
final int minNativeStackSize = 2;
|
|
|
|
TPairList result = new TPairList(traits, orig.size());
|
|
for (int i = 0; i < orig.size();
|
|
) { // we increment i inside the loop
|
|
// If, upon looking ahead, we see a native stack of
|
|
// size 3, we transform three pairs. Failing that, if
|
|
// we see a native stack of size 2, we transform it.
|
|
|
|
boolean found_something = false;
|
|
TPair p[]
|
|
= new TPair[maxNativeStackSize + 1]; // plus one for [brgyad]
|
|
for (int j = 0; j < maxNativeStackSize + 1; j++) {
|
|
if (i + j < orig.size())
|
|
p[j] = orig.get(i + j);
|
|
else
|
|
p[j] = null;
|
|
}
|
|
// Now p[0] is current pair, p[1] is the one after that, etc.
|
|
|
|
if (null != p[0].getLeft()
|
|
&& null == p[0].getRight()) {
|
|
// TODO(dchandler): The way I do this [drwa] case,
|
|
// does it rely on the fact that maxNativeStackSize ==
|
|
// 3? Let's have it not rely on that...
|
|
int h;
|
|
if (0 == (h = helper(traits, 0, maxNativeStackSize, p, result))) { // [drwa]
|
|
// [brgyad] makes us go from right to left.
|
|
// (TODO(dchandler): It's a shame we're doing this
|
|
// stuff when we have the code to figure out, for
|
|
// ACIP, that [BRGYAD] is what it is.)
|
|
for (int offset = 1; offset >= 0; offset--) {
|
|
if (found_something) break;
|
|
for (int nss = maxNativeStackSize;
|
|
nss >= minNativeStackSize;
|
|
nss--) {
|
|
if (0 != (h = helper(traits, offset, nss, p, result))) {
|
|
found_something = true;
|
|
i += h;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
} else {
|
|
i += h;
|
|
found_something = true;
|
|
}
|
|
}
|
|
if (!found_something) {
|
|
++i;
|
|
result.append(p[0]);
|
|
}
|
|
}
|
|
if (result.size() != orig.size()) {
|
|
throw new Error("orig=" + orig + "\nresult=" + result); // TODO(dchandler): make this an assertion.
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/** We mutate result and return the number of TPairs we scarfed if
|
|
* we find a native stack of size nss at p[offset], p[offset +
|
|
* 1], ..., p[offset + nss - 1]. */
|
|
private static int helper(TTraits traits, int offset, int nss, TPair p[],
|
|
TPairList result) {
|
|
String hashKey = "";
|
|
int good = 0;
|
|
for (int k = 0; k < nss - 1; k++) {
|
|
if (null != p[k + offset]
|
|
&& null != p[k + offset].getLeft()
|
|
&& null == p[k + offset].getRight()) {
|
|
hashKey += p[k + offset].getLeft() + "-";
|
|
++good;
|
|
}
|
|
}
|
|
if (null != p[nss - 1 + offset]
|
|
&& null != p[nss - 1 + offset].getLeft()
|
|
&& !"+".equals(p[nss - 1 + offset].getRight())) {
|
|
hashKey += p[nss - 1 + offset].getLeft();
|
|
++good;
|
|
}
|
|
if (nss == good
|
|
&& TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
|
int i = 0;
|
|
if (1 == offset) {
|
|
++i;
|
|
result.append(p[0]);
|
|
}
|
|
for (int n = 0; n < nss - 1; n++) {
|
|
++i;
|
|
result.append(new TPair(traits,
|
|
p[n + offset].getLeft(),
|
|
"+"));
|
|
}
|
|
++i;
|
|
result.append(p[nss - 1 + offset]);
|
|
return i;
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
/** Returns a TPair just like tp (sometimes the very same,
|
|
* unchanged instance) except that the wowel, if present, is in
|
|
* the order that Section 9.11 of the Unicode Standard, version
|
|
* 4.0.1, would have us use. */
|
|
private static TPair ewtsSortWowels(TPair tp) {
|
|
if (tp.getRight() != null
|
|
&& tp.getRight().length() > 0
|
|
&& !"+".equals(tp.getRight())) {
|
|
class WowelComparator implements Comparator {
|
|
/** @see
|
|
* org.thdl.tib.text.tshegbar.UnicodeUtils#fixSomeOrderingErrorsInTibetanUnicode(StringBuffer) */
|
|
private List order = Arrays.asList(new String[] {
|
|
// equivalence class:
|
|
"\u0f39", THDLWylieConstants.WYLIE_TSA_PHRU,
|
|
|
|
// equivalence class:
|
|
THDLWylieConstants.WYLIE_aVOWEL,
|
|
|
|
// equivalence class:
|
|
"\u0f71", THDLWylieConstants.A_VOWEL,
|
|
"\u0f73", THDLWylieConstants.I_VOWEL, // TODO(dchandler): in a perfect world, we'd decompose and sort the components.
|
|
"\u0f75", THDLWylieConstants.U_VOWEL, // TODO(dchandler): in a perfect world, we'd decompose and sort the components.
|
|
"\u0f81", THDLWylieConstants.reverse_I_VOWEL, // TODO(dchandler): in a perfect world, we'd decompose and sort the components.
|
|
|
|
"\u0f74", THDLWylieConstants.u_VOWEL,
|
|
|
|
// TODO(dchandler): equivalence classes I'm not
|
|
// sure.
|
|
// http://iris.lib.virginia.edu/tibet/xml/showEssay.php?xml=/tools/encodingTib.xml
|
|
// says to go above base and then upwards. Think
|
|
// it over.
|
|
|
|
// equivalence class:
|
|
"\u0f72", THDLWylieConstants.i_VOWEL,
|
|
"\u0f7a", THDLWylieConstants.e_VOWEL,
|
|
"\u0f7b", THDLWylieConstants.ai_VOWEL,
|
|
"\u0f7c", THDLWylieConstants.o_VOWEL,
|
|
"\u0f7d", THDLWylieConstants.au_VOWEL,
|
|
"\u0f80", THDLWylieConstants.reverse_i_VOWEL,
|
|
|
|
// equivalence class:
|
|
"\u0f7e", THDLWylieConstants.BINDU,
|
|
"\u0f82", THDLWylieConstants.U0F82,
|
|
"\u0f83", THDLWylieConstants.U0F83,
|
|
"\u0f86", THDLWylieConstants.U0F86,
|
|
"\u0f87", THDLWylieConstants.U0F87,
|
|
|
|
// NOTE: we always say "e" comes before "o" but
|
|
// either order would work.
|
|
|
|
/* TODO(dchandler): should these go with other
|
|
* under-line wowels like \u0f74? They're for the
|
|
* whole tsheg-bar, so they're oddballs...
|
|
*
|
|
* bestEwtsMap.put("\u0f35", THDLWylieConstants.U0F35);
|
|
*
|
|
* bestEwtsMap.put("\u0f37", THDLWylieConstants.U0F37);
|
|
*
|
|
* bestEwtsMap.put("\u0f84", THDLWylieConstants.U0F84);
|
|
*
|
|
* bestEwtsMap.put("\u0fc6", THDLWylieConstants.U0FC6);
|
|
*/
|
|
});
|
|
public int compare(Object o1, Object o2) {
|
|
int i1 = order.indexOf(o1);
|
|
int i2 = order.indexOf(o2);
|
|
if (i1 < 0) i1 = order.size();
|
|
if (i2 < 0) i2 = order.size();
|
|
return i1 - i2;
|
|
}
|
|
}
|
|
String wowels[] = tp.getRight().split("\\+");
|
|
java.util.Arrays.sort(wowels, new WowelComparator());
|
|
StringBuffer sb = new StringBuffer();
|
|
for (int i = 0; i < wowels.length; i++) {
|
|
sb.append(wowels[i]);
|
|
if (i + 1 < wowels.length)
|
|
sb.append('+');
|
|
}
|
|
return new TPair(tp.getTraits(), tp.getLeft(), sb.toString());
|
|
} else {
|
|
return tp;
|
|
}
|
|
}
|
|
|
|
// TODO(DLC)[EWTS->Tibetan]: doc
|
|
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) {
|
|
|
|
// base case for our recursion:
|
|
if ("".equals(ewts))
|
|
return new TPairList(ttraits);
|
|
|
|
StringBuffer ewtsBuf = new StringBuffer(ewts);
|
|
int howMuchBuf[] = new int[1];
|
|
TPair head = ewtsSortWowels(getFirstConsonantAndVowel(ewtsBuf,
|
|
howMuchBuf,
|
|
ttraits));
|
|
int howMuch = howMuchBuf[0];
|
|
|
|
TPairList tail;
|
|
if ((tail = breakHelperEWTS(ewtsBuf.substring(howMuch),
|
|
ttraits)).hasSimpleError()) {
|
|
for (int i = 1; i < howMuch; i++) {
|
|
// try giving i characters back if that leaves us with
|
|
// a legal head and makes the rest free of simple
|
|
// errors.
|
|
TPairList newTail = null;
|
|
TPair newHead;
|
|
if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal()
|
|
&& !(newTail
|
|
= breakHelperEWTS(ewtsBuf.substring(howMuch - i), ttraits)).hasSimpleError()) {
|
|
newTail.prepend(newHead);
|
|
return newTail;
|
|
}
|
|
}
|
|
// It didn't work. Return the first thing we'd thought
|
|
// of: head appended with tail. (I.e., fall through.)
|
|
}
|
|
tail.prepend(head);
|
|
return tail;
|
|
}
|
|
|
|
private static String GetInitialVowel(TTraits ttraits, String tx,
|
|
String startOfVowel) {
|
|
if (null == startOfVowel) startOfVowel = "";
|
|
boolean startsWithPlus = false;
|
|
if (!"".equals(startOfVowel)
|
|
&& (!ttraits.vowelsMayStack()
|
|
|| (tx.length() < 1 || !(startsWithPlus = tx.substring(0, 1).equals("+")))))
|
|
return ("".equals(startOfVowel) ? null : startOfVowel);
|
|
if (startsWithPlus)
|
|
tx = tx.substring(1);
|
|
for (int i = Math.min(ttraits.maxWowelLength(), tx.length()); i >= 1; i--) {
|
|
String t = tx.substring(0, i);
|
|
if (ttraits.isWowel(t)
|
|
|| (ttraits.isACIP()
|
|
// Or these, which we massage into "Am", "Am:", and
|
|
// "A:" because I didn't think {Pm} should be treated
|
|
// like {PAm} originally:
|
|
// TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE
|
|
&& ("m".equals(t) || "m:".equals(t) || ":".equals(t)))) {
|
|
// If this is followed by +wowel[+wowel[+wowel... in EWTS then that's part of the vowel also:
|
|
return GetInitialVowel(ttraits,
|
|
tx.substring(i),
|
|
startOfVowel + (startsWithPlus ? "+" : "") + t);
|
|
}
|
|
}
|
|
return null;
|
|
}
|
|
|
|
|
|
/** Returns the largest TPair we can make from the transliteration
|
|
* starting from the left. This will return a size zero pair if
|
|
* and only if tx is the empty string; otherwise, it may return a
|
|
* pair with either the left or right component empty. [FOR
|
|
* ACIP:] This mutates tx when we run into {NA+YA}; it mutates tx
|
|
* into {N+YA}. For {NE+YA}, it does not mutate tx or behave
|
|
* intelligently. A later phase will need to turn that into
|
|
* {N+YE} or an error or whatever you like. howMuch[0] will be
|
|
* set to the number of characters of tx that this call has
|
|
* consumed. */
|
|
private static TPair getFirstConsonantAndVowel(StringBuffer tx,
|
|
int howMuch[],
|
|
TTraits ttraits) {
|
|
// To handle EWTS "phywa\\u0f84\u0f86" [yes that's two slashes
|
|
// and then one slash], for example, we need to make the wowel
|
|
// (the getRight() field of the returned TPair) contain
|
|
// everything that it should.
|
|
//
|
|
// It can't hurt in ACIP, though I don't recall if ACIP's lexer
|
|
// allows Unicode characters.
|
|
TPair og = helpGetFirstConsonantAndVowel(tx, howMuch, ttraits);
|
|
int len = tx.length();
|
|
StringBuffer x = null;
|
|
while (howMuch[0] < len) {
|
|
if (isUnicodeWowelChar(tx.charAt(howMuch[0]))) {
|
|
if (null == x) x = new StringBuffer(); // rarely happens
|
|
if (x.length() > 0) x.append('+');
|
|
x.append(tx.charAt(howMuch[0]++));
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
// In EWTS, deal with M, ~M`, etc. They're much like
|
|
// UnicodeWowelCharacters.
|
|
if (ttraits instanceof EWTSTraits) {
|
|
EWTSTraits tt = (EWTSTraits)ttraits;
|
|
while (howMuch[0] < len) {
|
|
int howMuchExtra[] = new int[] { 0 };
|
|
TPair p
|
|
= helpGetFirstConsonantAndVowel(new StringBuffer(tx.substring(howMuch[0])),
|
|
howMuchExtra,
|
|
ttraits);
|
|
if (p.getLeft() == null
|
|
&& p.getRight() != null
|
|
&& tt.isWowelThatRequiresAChen(p.getRight())) {
|
|
if (null == x) x = new StringBuffer(); // rarely happens
|
|
String extra;
|
|
if (x.length() > 0) x.append('+');
|
|
x.append(extra = tx.substring(howMuch[0], howMuch[0] + howMuchExtra[0]));
|
|
// System.out.println("extra is " + extra); TODO(DLC)[EWTS->Tibetan]
|
|
howMuch[0] += howMuchExtra[0];
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
if (null != x)
|
|
return new TPair(ttraits, og.getLeft(),
|
|
(null == og.getRight() || ttraits.aVowel().equals(og.getRight()))
|
|
? x.toString()
|
|
: (og.getRight() + "+" + x.toString()));
|
|
else
|
|
return og;
|
|
}
|
|
private static TPair helpGetFirstConsonantAndVowel(StringBuffer tx, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it?
|
|
int howMuch[],
|
|
TTraits ttraits) {
|
|
// Note that it is *not* the case that if tx.substring(0, N)
|
|
// is legal (according to TPair.isLegal()), then
|
|
// tx.substring(0, N-1) is legal for all N. For example,
|
|
// think of ACIP's {shA} and {KshA}. However, 's' is the only
|
|
// tricky fellow in ACIP, so in ACIP it is true that
|
|
// tx.substring(0, N-1) is either legal or ends with 's' if
|
|
// tx.substring(0, N) is legal.
|
|
//
|
|
// We don't, however, use this approach. We just try to find
|
|
// a consonant of length 3, and then, failing that, of length
|
|
// 2, etc. Likewise with vowels. This avoids the issue.
|
|
|
|
int i, xl = tx.length();
|
|
// TODO(DLC)[EWTS->Tibetan]: nasty special case!
|
|
if (false && !ttraits.isACIP() /* TODO(DLC)[EWTS->Tibetan]: isEWTS! */
|
|
&& xl >= 2 && tx.charAt(0) == 'a' && (tx.charAt(1) == 'i' || tx.charAt(1) == 'u')) {
|
|
howMuch[0] = 2;
|
|
return new TPair(ttraits, null, tx.substring(0, 2));
|
|
// TODO(DLC)[EWTS->Tibetan]: test that "au" alone is \u0f68\u0f7d, "ai" alone is \u0f68\u0f7b in EWTS.
|
|
}
|
|
if (0 == xl) {
|
|
howMuch[0] = 0;
|
|
return new TPair(ttraits, null, null);
|
|
}
|
|
if (tx.charAt(0) == ttraits.disambiguatorChar()) {
|
|
howMuch[0] = 1;
|
|
return new TPair(ttraits, null, ttraits.disambiguator());
|
|
}
|
|
char ch = tx.charAt(0);
|
|
|
|
// Numbers never appear in stacks, so if you see 1234, that's
|
|
// like seeing 1-2-3-4. Though in EWTS you can have '0\u0f19'
|
|
if (ch >= '0' && ch <= '9') {
|
|
// TODO(DLC)[EWTS->Tibetan]: test case: 0e should have a-chen and 0\u0f74 should go through without errors.
|
|
if (xl > 1 && ttraits.isUnicodeWowel(tx.charAt(1))) {
|
|
howMuch[0] = 2;
|
|
return new TPair(ttraits, tx.substring(0, 1), tx.substring(1, 2));
|
|
}
|
|
|
|
howMuch[0] = 1; // not 2...
|
|
return new TPair(ttraits, tx.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
|
|
}
|
|
|
|
String l = null, r = null;
|
|
for (i = Math.min(ttraits.maxConsonantLength(), xl); i >= 1; i--) {
|
|
String t = null;
|
|
if (ttraits.isConsonant(t = tx.substring(0, i))
|
|
|| (ttraits.vowelAloneImpliesAChen() // handle EWTS {a+yo}
|
|
&& ttraits.aVowel().equals(tx.substring(0, i))
|
|
&& i < xl && tx.substring(i, i + i).equals("+"))) {
|
|
l = t;
|
|
break;
|
|
}
|
|
}
|
|
int ll = (null == l) ? 0 : l.length();
|
|
if (null != l && xl > ll && tx.charAt(ll) == ttraits.disambiguatorChar()) {
|
|
howMuch[0] = l.length() + 1;
|
|
return new TPair(ttraits, l, ttraits.disambiguator());
|
|
}
|
|
if (null != l && xl > ll && tx.charAt(ll) == '+') {
|
|
howMuch[0] = l.length() + 1;
|
|
return new TPair(ttraits, l, "+");
|
|
}
|
|
int mod = 0;
|
|
|
|
r = GetInitialVowel(ttraits, tx.substring(ll), null);
|
|
if (ttraits.isACIP()) {
|
|
// Treat {BATA+SA'I} like {BAT+SA'I}: // TODO(DLC)[EWTS->Tibetan]: in EWTS???
|
|
int z;
|
|
if (null != l
|
|
&& ttraits.aVowel().equals(r)
|
|
&& ((z = ll + ttraits.aVowel().length()) < xl)
|
|
&& tx.charAt(z) == '+') {
|
|
tx.deleteCharAt(z-1);
|
|
howMuch[0] = l.length() + 1;
|
|
return new TPair(ttraits, l, "+");
|
|
}
|
|
|
|
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: in EWTS? */
|
|
if ("m".equals(r)) { r = "Am"; mod = -1; }
|
|
if (":".equals(r)) { r = "A:"; mod = -1; }
|
|
if ("m:".equals(r)) { r = "Am:"; mod = -1; }
|
|
if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though...
|
|
}
|
|
|
|
// what if we see a character that's not part of any wowel or
|
|
// consonant? We return it.
|
|
if (null == l && null == r) {
|
|
howMuch[0] = 1; // not 2...
|
|
// add a disambiguator to avoid exponential running time:
|
|
return new TPair(ttraits, tx.substring(0, 1),
|
|
(xl == 1) ? null : ttraits.disambiguator());
|
|
}
|
|
|
|
howMuch[0] = (((l == null) ? 0 : l.length())
|
|
+ ((r == null) ? 0 : r.length())
|
|
+ mod);
|
|
return new TPair(ttraits, l, r);
|
|
} // TODO(DLC)[EWTS->Tibetan]:
|
|
|
|
private static boolean isUnicodeWowelChar(char ch) {
|
|
return ((ch >= '\u0f71' && ch <= '\u0f84')
|
|
|| "\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0);
|
|
// TODO(dchandler): should we really allow "phywa\\u0f18", or
|
|
// does \u0f18 only combine with digits?
|
|
}
|
|
}
|
|
|
|
|
|
// FIXME: test for nested comments
|
|
|
|
// FIXME: see Translit directory on ACIP v4 CD-ROM
|