Here's a rough sketch of the algorithm: run along getting
- * the current TPair as big as you can. If you get it very
- * big, but there's something illegal afterward that wouldn't
- * otherwise be illegal, undo as little as possible to correct.
- * For example, G'A'I becomes [(G . 'A), (' . I)], and TAA
- * becomes [(T . A)] in a first pass but then we see that the
- * rest would be suboptimal, so we backtrack to [(T . )] and then
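- * finally become [(T . ), (A . A)]. We look for (A . ) and (
- * . vowel) in the rest in order to say "the rest would be
- * suboptimal", i.e. we use TPairList.hasSimpleError().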
There is one case where we break things up into two pair - * lists if and only if specialHandlingForAppendages is true -- I - * thought the converter had a bug because I saw SNYAM'AM in - * KD0003I2.ACT. I asked Robert Chilton, though, and he said - * "SNYAM'AM " was likely a typo for "SNYAM 'AM", so leave - * specialHandlingForAppendages false.
- * - *I found out about (OK, as it turns out, imagined) this case - * too late to do anything clean about it. SNYAM'AM, e.g., - * breaks up into [(S . ), (NY . A), (M . 'A), (M . )], which is - * incorrect -- [(S . ), (NY . A), (M . ), (' . A), (M . )] is - * correct. But we don't know which is correct without parsing, - * so both are returned. The clean treatment would be to lex - * into a form that didn't insist 'A was either a vowel or a - * consonant. Then the parser would figure it out. But don't - * bother, because specialHandlingForAppendages should be false - * always.
- * - * @param acip a string of ACIP with no punctuation in it - * @param specialHandlingForAppendages true if and only if you - * want SNYAM'AM to ultimately parse as {S+NYA}{M}{'A}{M} instead - * of {S+NYA}{M'A}{M} - * @return an array of one or two pair lists, if the former, then - * the second element will be null, if the latter, the second - * element will have (* . ), (' . *) instead of (* . '*) which - * the former has - * @throws IllegalArgumentException if acip is too large for us - * to break into chunks (we're recursive, not iterative, so the - * boundary can be increased a lot if you care, but you don't) */ - static TPairList[] breakACIPIntoChunks(String acip, - boolean specialHandlingForAppendages) - throws IllegalArgumentException - { - try { - TTraits ttraits = ACIPTraits.instance(); - TPairList a = breakHelperACIP(acip, true, false, ttraits); - TPairList b = null; - if (specialHandlingForAppendages) - b = breakHelperACIP(acip, false, false, ttraits); - if (null != b && a.equals(b)) - return new TPairList[] { a, null }; - else - return new TPairList[] { a, b }; - } catch (StackOverflowError e) { - throw new IllegalArgumentException("Input too large[1]: " + acip); - } catch (OutOfMemoryError e) { - throw new IllegalArgumentException("Input too large[2]: " + acip); - } - } - - /** TODO(DLC)[EWTS->Tibetan]: doc */ - static TPairList[] breakEWTSIntoChunks(String ewts) - throws IllegalArgumentException - { - try { - return new TPairList[] { - breakHelperEWTS(ewts, EWTSTraits.instance()), null - }; - } catch (StackOverflowError e) { - throw new IllegalArgumentException("Input too large[1]: " + ewts); - } catch (OutOfMemoryError e) { - throw new IllegalArgumentException("Input too large[2]: " + ewts); - } + /** See {@link TTraits#breakTshegBarIntoChunks}. 
*/ + static TPairList[] breakACIPIntoChunks(String tt, + boolean specialHandlingForAppendages) { + TTraits ttraits = ACIPTraits.instance(); + TPairList a = breakHelperACIP(tt, true, false, ttraits); + TPairList b = null; + if (specialHandlingForAppendages) + b = breakHelperACIP(tt, false, false, ttraits); + if (null != b && a.equals(b)) + return new TPairList[] { a, null }; + else + return new TPairList[] { a, b }; } /** Helps {@link #breakACIPIntoChunks(String,boolean)}. @@ -149,7 +84,7 @@ class TPairListFactory { || (head.getRight() != null && !"+".equals(head.getRight()) && !"-".equals(head.getRight())), - ttraits)).hasSimpleError(ttraits)) { + ttraits)).hasSimpleError()) { for (int i = 1; i < howMuch; i++) { // try giving i characters back if that leaves us with // a legal head and makes the rest free of simple @@ -164,7 +99,7 @@ class TPairListFactory { || (newHead.getRight() != null && !"+".equals(newHead.getRight()) && !"-".equals(newHead.getRight())), - ttraits)).hasSimpleError(ttraits)) { + ttraits)).hasSimpleError()) { newTail.prepend(newHead); return newTail; } @@ -176,6 +111,136 @@ class TPairListFactory { return tail; } + /** See {@link TTraits#breakTshegBarIntoChunks}. */ + static TPairList[] breakEWTSIntoChunks(String ewts) + throws IllegalArgumentException + { + EWTSTraits traits = EWTSTraits.instance(); + TPairList pl = breakHelperEWTS(ewts, traits); + TPairList npl = pl; + + // TODO(DLC)[EWTS->Tibetan]: this crap ain't workin' for kaHM. But kaeM and kaMe shouldn't work, right? Figure out what EWTS really says... 
+ + // TODO(DLC)[EWTS->Tibetan]: for "a\\0f86" e.g.: + if (pl.size() > 1) { + npl = new TPairList(traits, pl.size()); + + for (int i = pl.size() - 1; i >= 1; i--) { + TPair left = pl.get(i - 1); + TPair right = pl.get(i); + if (traits.aVowel().equals(left.getRight()) + && left.getLeft() == null + && right.getLeft() == null + && traits.isWowelThatRequiresAChen(right.getRight())) { + npl.prepend(new TPair(traits, traits.aVowel(), right.getRight())); + --i; + } else if (traits.aVowel().equals(left.getRight()) + && left.getLeft() != null + && right.getLeft() == null + && traits.isWowelThatRequiresAChen(right.getRight()) + && false /* TODO(DLC)[EWTS->Tibetan]: ewts kaM is bothersome now */) { + npl.prepend(new TPair(traits, left.getLeft(), right.getRight())); + --i; + } else { + npl.prepend(right); + if (i == 1) + npl.prepend(left); + } + } + } + + TPairList nnpl; + if (true) { + // Collapse ( . wowel1) ( . wowel2) into ( + // . wowel1+wowel2). Then collapse (* . a) ( . x) into (* + // . x). Also, if an a-chen (\u0f68) is implied, then + // insert it. + TPairList xnnpl = new TPairList(traits, pl.size()); + for (int i = 0; i < npl.size(); ) { + TPair p = npl.get(i); + int set_i_to = i + 1; + if (p.getLeft() == null + && p.getRight() != null + && !traits.disambiguator().equals(p.getRight()) + && !"+".equals(p.getRight())) { + StringBuffer sb = new StringBuffer(p.getRight()); + for (int j = i + 1; j < npl.size(); j++) { + TPair p2 = npl.get(j); + if (p2.getLeft() == null + && p2.getRight() != null + && !traits.disambiguator().equals(p2.getRight()) + && !"+".equals(p2.getRight())) + { + sb.append("+" + p2.getRight()); + set_i_to = j + 1; + } else { + break; + } + } + p = new TPair(traits, traits.aVowel(), sb.toString()); + } + // TODO(DLC)[EWTS->Tibetan]: Do we still have "ai" converting to the wrong thing. "ae"? + xnnpl.append(p); + i = set_i_to; + } + + nnpl = new TPairList(traits, pl.size()); + // (* . a ) ( . x) ... ( . y) -> (* . 
a+x+...+y) + for (int i = 0; i < xnnpl.size(); ) { + TPair p = xnnpl.get(i); + int set_i_to = i + 1; + if (traits.aVowel().equals(p.getRight())) { + StringBuffer sb = new StringBuffer(p.getRight()); + for (int j = i + 1; j < xnnpl.size(); j++) { + TPair p2 = xnnpl.get(j); + if (p2.getLeft() == null + && p2.getRight() != null + && !traits.disambiguator().equals(p2.getRight()) + && !"+".equals(p2.getRight())) + { + // TODO(DLC)[EWTS->Tibetan] a+o+e is what we'll get.. maybe we want just o+e? + sb.append("+" + p2.getRight()); + set_i_to = j + 1; + } else { + break; + } + } + p = new TPair(traits, p.getLeft(), sb.toString()); + } + + if (false) { // TODO(DLC)[EWTS->Tibetan]: bra is screwed up, do in it stacklist? + // EWTS does not think that kra is k+ra. Replace + // (consonant . ) with (consonant . DISAMBIGUATOR): + if (p.getRight() == null && p.getLeft() != null + && i + 1 < xnnpl.size()) + p = new TPair(traits, p.getLeft(), traits.disambiguator()); + } + + nnpl.append(p); + i = set_i_to; + } + } else { + // TODO(DLC)[EWTS->Tibetan]: this block is not executing. kill it after testing and thinking + nnpl = new TPairList(traits, pl.size()); + + for (int i = npl.size() - 1; i >= 0; i--) { + TPair p = npl.get(i); + if (p.getLeft() == null + && p.getRight() != null + && !traits.disambiguator().equals(p.getRight()) + && !"+".equals(p.getRight())) /* TODO(DLC)[EWTS->Tibetan] this should be equivalent to isWowel(p.getRight()) but o+o shows that's not true yet */ + p = new TPair(traits, traits.aVowel(), p.getRight()); + // TODO(DLC)[EWTS->Tibetan]: do you still have "ai" converting to the wrong thing? ("ae" also?) + nnpl.prepend(p); + } + } + + // TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it! 
+ return new TPairList[] { + nnpl, null + }; + } + // TODO(DLC)[EWTS->Tibetan]: doc private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) { @@ -190,7 +255,7 @@ class TPairListFactory { TPairList tail; if ((tail = breakHelperEWTS(ewtsBuf.substring(howMuch), - ttraits)).hasSimpleError(ttraits)) { + ttraits)).hasSimpleError()) { for (int i = 1; i < howMuch; i++) { // try giving i characters back if that leaves us with // a legal head and makes the rest free of simple @@ -199,7 +264,7 @@ class TPairListFactory { TPair newHead; if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal() && !(newTail - = breakHelperEWTS(ewtsBuf.substring(howMuch - i), ttraits)).hasSimpleError(ttraits)) { + = breakHelperEWTS(ewtsBuf.substring(howMuch - i), ttraits)).hasSimpleError()) { newTail.prepend(newHead); return newTail; } @@ -211,101 +276,193 @@ class TPairListFactory { return tail; } - /** Returns the largest TPair we can make from the acip starting - * from the left. This will return a size zero pair if and only - * if acip is the empty string; otherwise, it may return a pair - * with either the left or right component empty. This mutates - * acip when we run into {NA+YA}; it mutates acip into {N+YA}. - * For {NE+YA}, it does not mutate acip or behave intelligently. - * A later phase will need to turn that into {N+YE} or an error - * or whatever you like. howMuch[0] will be set to the number of - * characters of acip that this call has consumed. */ - private static TPair getFirstConsonantAndVowel(StringBuffer acip, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it? + private static String GetInitialVowel(TTraits ttraits, String tx, + String startOfVowel) { + if (null == startOfVowel) startOfVowel = ""; + boolean startsWithPlus = false; + if (!"".equals(startOfVowel) + && (!ttraits.vowelsMayStack() + || (tx.length() < 1 || !(startsWithPlus = tx.substring(0, 1).equals("+"))))) + return ("".equals(startOfVowel) ? 
null : startOfVowel); + if (startsWithPlus) + tx = tx.substring(1); + for (int i = Math.min(ttraits.maxWowelLength(), tx.length()); i >= 1; i--) { + String t = tx.substring(0, i); + if (ttraits.isWowel(t) + || (ttraits.isACIP() + // Or these, which we massage into "Am", "Am:", and + // "A:" because I didn't think {Pm} should be treated + // like {PAm} originally: + // TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE + && ("m".equals(t) || "m:".equals(t) || ":".equals(t)))) { + // If this is followed by +wowel[+wowel[+wowel... in EWTS then that's part of the vowel also: + return GetInitialVowel(ttraits, + tx.substring(i), + startOfVowel + (startsWithPlus ? "+" : "") + t); + } + } + return null; + } + + + /** Returns the largest TPair we can make from the transliteration + * starting from the left. This will return a size zero pair if + * and only if tx is the empty string; otherwise, it may return a + * pair with either the left or right component empty. [FOR + * ACIP:] This mutates tx when we run into {NA+YA}; it mutates tx + * into {N+YA}. For {NE+YA}, it does not mutate tx or behave + * intelligently. A later phase will need to turn that into + * {N+YE} or an error or whatever you like. howMuch[0] will be + * set to the number of characters of tx that this call has + * consumed. */ + private static TPair getFirstConsonantAndVowel(StringBuffer tx, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it? int howMuch[], TTraits ttraits) { - // Note that it is *not* the case that if acip.substring(0, N) + // To handle EWTS "phywa\\u0f84\u0f86" [yes that's two slashes + // and then one slash], for example, we need to make the wowel + // (the getRight() field of the returned TPair) contain + // everything that it should. + // + // It can't hurt in ACIP, though I don't recall if ACIP's lexer + // allows Unicode characters. 
+ TPair og = helpGetFirstConsonantAndVowel(tx, howMuch, ttraits); + int len = tx.length(); + StringBuffer x = null; + while (howMuch[0] < len) { + if (isUnicodeWowelChar(tx.charAt(howMuch[0]))) { + if (null == x) x = new StringBuffer(); // rarely happens + if (x.length() > 0) x.append('+'); + x.append(tx.charAt(howMuch[0]++)); + } else { + break; + } + } + // In EWTS, deal with M, ~M`, etc. They're much like + // UnicodeWowelCharacters. + if (ttraits instanceof EWTSTraits) { + EWTSTraits tt = (EWTSTraits)ttraits; + while (howMuch[0] < len) { + int howMuchExtra[] = new int[] { 0 }; + TPair p + = helpGetFirstConsonantAndVowel(new StringBuffer(tx.substring(howMuch[0])), + howMuchExtra, + ttraits); + if (p.getLeft() == null + && p.getRight() != null + && tt.isWowelThatRequiresAChen(p.getRight())) { + if (null == x) x = new StringBuffer(); // rarely happens + String extra; + if (x.length() > 0) x.append('+'); + x.append(extra = tx.substring(howMuch[0], howMuch[0] + howMuchExtra[0])); + // System.out.println("extra is " + extra); TODO(DLC)[EWTS->Tibetan] + howMuch[0] += howMuchExtra[0]; + } else { + break; + } + } + } + if (null != x) + return new TPair(ttraits, og.getLeft(), + (null == og.getRight() || ttraits.aVowel().equals(og.getRight())) + ? x.toString() + : (og.getRight() + "+" + x.toString())); + else + return og; + } + private static TPair helpGetFirstConsonantAndVowel(StringBuffer tx, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it? + int howMuch[], + TTraits ttraits) { + // Note that it is *not* the case that if tx.substring(0, N) // is legal (according to TPair.isLegal()), then - // acip.substring(0, N-1) is legal for all N. For example, + // tx.substring(0, N-1) is legal for all N. For example, // think of ACIP's {shA} and {KshA}. However, 's' is the only - // tricky fellow, so it is true that acip.substring(0, N-1) is - // either legal or ends with 's' if acip.substring(0, N) is - // legal. 
+ // tricky fellow in ACIP, so in ACIP it is true that + // tx.substring(0, N-1) is either legal or ends with 's' if + // tx.substring(0, N) is legal. // // We don't, however, use this approach. We just try to find // a consonant of length 3, and then, failing that, of length // 2, etc. Likewise with vowels. This avoids the issue. - int i, xl = acip.length(); + int i, xl = tx.length(); + // TODO(DLC)[EWTS->Tibetan]: nasty special case! + if (false && !ttraits.isACIP() /* TODO(DLC)[EWTS->Tibetan]: isEWTS! */ + && xl >= 2 && tx.charAt(0) == 'a' && (tx.charAt(1) == 'i' || tx.charAt(1) == 'u')) { + howMuch[0] = 2; + return new TPair(ttraits, null, tx.substring(0, 2)); + // TODO(DLC)[EWTS->Tibetan]: test that "au" alone is \u0f68\u0f7d, "ai" alone is \u0f68\u0f7b in EWTS. + } if (0 == xl) { howMuch[0] = 0; return new TPair(ttraits, null, null); } - if (acip.charAt(0) == ttraits.disambiguatorChar()) { + if (tx.charAt(0) == ttraits.disambiguatorChar()) { howMuch[0] = 1; return new TPair(ttraits, null, ttraits.disambiguator()); } - char ch = acip.charAt(0); + char ch = tx.charAt(0); // Numbers never appear in stacks, so if you see 1234, that's - // like seeing 1-2-3-4. + // like seeing 1-2-3-4. Though in EWTS you can have '0\u0f19' if (ch >= '0' && ch <= '9') { + // TODO(DLC)[EWTS->Tibetan]: test case: 0e should have a-chen and 0\u0f74 should go through without errors. + if (xl > 1 && ttraits.isUnicodeWowel(tx.charAt(1))) { + howMuch[0] = 2; + return new TPair(ttraits, tx.substring(0, 1), tx.substring(1, 2)); + } + howMuch[0] = 1; // not 2... - return new TPair(ttraits, acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); + return new TPair(ttraits, tx.substring(0, 1), (xl == 1) ? 
null : ttraits.disambiguator()); } String l = null, r = null; for (i = Math.min(ttraits.maxConsonantLength(), xl); i >= 1; i--) { String t = null; - if (ttraits.isConsonant(t = acip.substring(0, i))) { + if (ttraits.isConsonant(t = tx.substring(0, i)) + || (ttraits.vowelAloneImpliesAChen() // handle EWTS {a+yo} + && ttraits.aVowel().equals(tx.substring(0, i)) + && i < xl && tx.substring(i, i + i).equals("+"))) { l = t; break; } } int ll = (null == l) ? 0 : l.length(); - if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) { + if (null != l && xl > ll && tx.charAt(ll) == ttraits.disambiguatorChar()) { howMuch[0] = l.length() + 1; return new TPair(ttraits, l, ttraits.disambiguator()); } - if (null != l && xl > ll && acip.charAt(ll) == '+') { + if (null != l && xl > ll && tx.charAt(ll) == '+') { howMuch[0] = l.length() + 1; return new TPair(ttraits, l, "+"); } - for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) { - String t = null; - if (ttraits.isWowel(t = acip.substring(ll, ll + i)) - // Or these, which we massage into "Am", "Am:", and - // "A:" because I didn't think {Pm} should be treated - // like {PAm} originally: - // TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE - || "m".equals(t) || "m:".equals(t) || ":".equals(t)) { - r = t; - break; - } - } - - // Treat {BATA+SA'I} like {BAT+SA'I}: - int z; - if (null != l && /* TODO(DLC)[EWTS->Tibetan]: */"A".equals(r) && ((z = ll + /* TODO(DLC)[EWTS->Tibetan]: */"A".length()) < xl) - && acip.charAt(z) == '+') { - acip.deleteCharAt(z-1); - howMuch[0] = l.length() + 1; - return new TPair(ttraits, l, "+"); - } - - // Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */ int mod = 0; - if ("m".equals(r)) { r = "Am"; mod = -1; } - if (":".equals(r)) { r = "A:"; mod = -1; } - if ("m:".equals(r)) { r = "Am:"; mod = -1; } - if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though... 
+ r = GetInitialVowel(ttraits, tx.substring(ll), null); + if (ttraits.isACIP()) { + // Treat {BATA+SA'I} like {BAT+SA'I}: // TODO(DLC)[EWTS->Tibetan]: in EWTS??? + int z; + if (null != l + && ttraits.aVowel().equals(r) + && ((z = ll + ttraits.aVowel().length()) < xl) + && tx.charAt(z) == '+') { + tx.deleteCharAt(z-1); + howMuch[0] = l.length() + 1; + return new TPair(ttraits, l, "+"); + } + + // Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: in EWTS? */ + if ("m".equals(r)) { r = "Am"; mod = -1; } + if (":".equals(r)) { r = "A:"; mod = -1; } + if ("m:".equals(r)) { r = "Am:"; mod = -1; } + if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though... + } // what if we see a character that's not part of any wowel or // consonant? We return it. if (null == l && null == r) { howMuch[0] = 1; // not 2... // add a disambiguator to avoid exponential running time: - return new TPair(ttraits, acip.substring(0, 1), + return new TPair(ttraits, tx.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); } @@ -314,6 +471,13 @@ class TPairListFactory { + mod); return new TPair(ttraits, l, r); } // TODO(DLC)[EWTS->Tibetan]: + + private static boolean isUnicodeWowelChar(char ch) { + return ((ch >= '\u0f71' && ch <= '\u0f84') + || "\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0); + // TODO(dchandler): should we really allow "phywa\\u0f18", or + // does \u0f18 only combine with digits? + } } diff --git a/source/org/thdl/tib/text/ttt/TParseTree.java b/source/org/thdl/tib/text/ttt/TParseTree.java index 2dba84a..f81b433 100644 --- a/source/org/thdl/tib/text/ttt/TParseTree.java +++ b/source/org/thdl/tib/text/ttt/TParseTree.java @@ -18,8 +18,6 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlDebug; - import java.util.ArrayList; /** A list of non-empty list of {@link TStackListList @@ -129,6 +127,10 @@ class TParseTree { if (sz == 1) { return up.get(0); } else if (sz > 1) { + // TODO(DLC)[EWTS->Tibetan]: does this still happen? If so, when? + // + // System.out.println("SHO NUFF, >1 non-illegal parses still happens"); + // {PADMA}, for example. Our technique is to go from the // left and stack as much as we can. So {PA}{D}{MA} is // inferior to {PA}{D+MA}, and {PA}{D+MA}{D}{MA} is @@ -279,7 +281,8 @@ class TParseTree { public String getWarning(String warningLevel, TPairList pl, String originalACIP, - boolean shortMessages) { + boolean shortMessages, + TTraits traits) { // ROOM_FOR_IMPROVEMENT: Allow one tsheg bar to have multiple // warnings/errors associated with it. Make this a private // subroutine, and have the public getWarning(..) call on this @@ -301,7 +304,7 @@ class TParseTree { if (shortMessages) return "501: Using " + bestParse + ", not " + noPrefixTestsUniqueParse.get(0); else - return "501: Using " + bestParse + ((null != originalACIP) ? (" for the ACIP {" + originalACIP + "}") : "") + ", but only because the tool's knowledge of prefix rules (see the documentation) says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")"; + return "501: Using " + bestParse + ((null != originalACIP) ? (" for the " + traits.shortTranslitName() + " {" + originalACIP + "}") : "") + ", but only because the tool's knowledge of prefix rules (see the documentation) says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")"; } } @@ -321,27 +324,31 @@ class TParseTree { // FIXME: The caller will prepend "WARNING " to this error! 
if (ErrorsAndWarnings.isEnabled(101, warningLevel)) return ErrorsAndWarnings.getMessage(101, shortMessages, - translit); + translit, + traits); } else { if (bestParse.hasStackWithoutVowel(pl, isLastStack)) { if (isLastStack[0]) { if (ErrorsAndWarnings.isEnabled(502, warningLevel)) return ErrorsAndWarnings.getMessage(502, shortMessages, - translit); + translit, + traits); } else { throw new Error("Can't happen now that we stack greedily"); } } if (ErrorsAndWarnings.isEnabled(503, warningLevel)) return ErrorsAndWarnings.getMessage(503, shortMessages, - translit); + translit, + traits); } } else { if (nip.get(0).hasStackWithoutVowel(pl, isLastStack)) { if (isLastStack[0]) { if (ErrorsAndWarnings.isEnabled(502, warningLevel)) return ErrorsAndWarnings.getMessage(502, shortMessages, - translit); + translit, + traits); } else { throw new Error("Can't happen now that we stack greedily [2]"); } @@ -362,7 +369,8 @@ class TParseTree { ++plnum; if (ErrorsAndWarnings.isEnabled(505, warningLevel)) return ErrorsAndWarnings.getMessage(505, shortMessages, - translit); + translit, + traits); } plnum = 0; for (int stackNum = 0; stackNum < bestParse.size(); stackNum++) { @@ -380,14 +388,16 @@ class TParseTree { else if (type == 1) if (ErrorsAndWarnings.isEnabled(506, warningLevel)) return ErrorsAndWarnings.getMessage(506, shortMessages, - translit); + translit, + traits); } else { if (type == 0) type = 1; else if (type == -1) if (ErrorsAndWarnings.isEnabled(506, warningLevel)) return ErrorsAndWarnings.getMessage(506, shortMessages, - translit); + translit, + traits); } } if (stackSize > 1 && tp.getLeft() != null && tp.getLeft().length() > 1) { @@ -445,14 +455,16 @@ n+t+s if (ErrorsAndWarnings.isEnabled(warningNum, warningLevel)) return ErrorsAndWarnings.getMessage(warningNum, shortMessages, - translit); + translit, + traits); } while (plnum < pl.size() && pl.get(plnum).isDisambiguator()) { ++plnum; if (ErrorsAndWarnings.isEnabled(505, warningLevel)) return 
ErrorsAndWarnings.getMessage(505, shortMessages, - translit); + translit, + traits); } } } @@ -472,11 +484,13 @@ n+t+s if (pl.size() == 3) { if (ErrorsAndWarnings.isEnabled(508, warningLevel)) return ErrorsAndWarnings.getMessage(508, shortMessages, - translit); + translit, + traits); } else { if (ErrorsAndWarnings.isEnabled(509, warningLevel)) return ErrorsAndWarnings.getMessage(509, shortMessages, - translit); + translit, + traits); } } } @@ -497,11 +511,13 @@ n+t+s if (pl.size() == 2) { if (ErrorsAndWarnings.isEnabled(508, warningLevel)) return ErrorsAndWarnings.getMessage(508, shortMessages, - translit); + translit, + traits); } else { if (ErrorsAndWarnings.isEnabled(509, warningLevel)) return ErrorsAndWarnings.getMessage(509, shortMessages, - translit); + translit, + traits); } } } @@ -513,7 +529,7 @@ n+t+s /** Returns something akin to the ACIP input (okay, maybe 1-2-3-4 * instead of 1234, and maybe AUTPA instead of AUT-PA) * corresponding to this parse tree. */ - public String recoverACIP() { + public String recoverACIP() { // TODO(DLC)[EWTS->Tibetan]: acip-specific ParseIterator pi = getParseIterator(); if (pi.hasNext()) { return pi.next().recoverACIP(); diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index 6007acf..e02a152 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -18,14 +18,12 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import org.thdl.tib.text.TibTextUtils; -import org.thdl.tib.text.TGCList; -import org.thdl.tib.text.DuffCode; - import java.util.ArrayList; -import java.util.HashMap; import java.util.ListIterator; +import org.thdl.tib.text.TGCList; +import org.thdl.tib.text.TibTextUtils; + /** A list of {@link TPairList TPairLists}, each of which is for * a stack (a grapheme cluster), typically corresponding to one tsheg * bar. 
@@ -165,7 +163,7 @@ class TStackList { TPairList pl = get(pairListIndex); TPair p = pl.get(pl.size() - 1); isLegalAndHasAVowelOnRoot - = (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g. + = (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g. TODO(DLC)[EWTS->Tibetan]: ??? if (isLegalAndHasAVowelOnRoot) break; } diff --git a/source/org/thdl/tib/text/ttt/TString.java b/source/org/thdl/tib/text/ttt/TString.java index 17c1656..90fb9d1 100644 --- a/source/org/thdl/tib/text/ttt/TString.java +++ b/source/org/thdl/tib/text/ttt/TString.java @@ -18,12 +18,11 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlOptions; -import org.thdl.util.ThdlDebug; -import org.thdl.tib.text.tshegbar.UnicodeUtils; - import java.util.HashSet; -import java.io.*; + +import org.thdl.tib.text.tshegbar.UnicodeUtils; +import org.thdl.util.ThdlDebug; +import org.thdl.util.ThdlOptions; /** * An TString is some Latin text and a type, the type stating whether diff --git a/source/org/thdl/tib/text/ttt/TTGCList.java b/source/org/thdl/tib/text/ttt/TTGCList.java index 0a97971..6eca573 100644 --- a/source/org/thdl/tib/text/ttt/TTGCList.java +++ b/source/org/thdl/tib/text/ttt/TTGCList.java @@ -18,11 +18,11 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; +import java.util.ArrayList; + import org.thdl.tib.text.TGCList; import org.thdl.tib.text.TGCPair; -import java.util.ArrayList; - /** A list of grapheme clusters. * * @author David Chandler */ diff --git a/source/org/thdl/tib/text/ttt/TTraits.java b/source/org/thdl/tib/text/ttt/TTraits.java index d6eac0a..790d847 100644 --- a/source/org/thdl/tib/text/ttt/TTraits.java +++ b/source/org/thdl/tib/text/ttt/TTraits.java @@ -19,6 +19,7 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; import java.util.ArrayList; + import org.thdl.tib.text.DuffCode; /** A TTraits object encapsulates all the things that make a @@ -65,6 +66,11 @@ interface TTraits { * any wowel) */ boolean isConsonant(String s); + /** Returns true if and only if this transliteration scheme supports + * Tibetan Unicode characters and if ch is such a character and is a + * wowel. */ + boolean isUnicodeWowel(char ch); + /** Returns true if and only if s is a stretch of * transliteration corresponding to a Tibetan wowel (without any * [achen or other] consonant) */ @@ -120,6 +126,10 @@ interface TTraits { * null if l is unknown. */ String getUnicodeFor(String l, boolean subscribed); + /** Returns the unicode for a wowel. Returns null if l is + * unknown. */ + String getUnicodeForWowel(String wowel); + /** Returns a scanner that can break up a string of transliteration. */ TTshegBarScanner scanner(); @@ -127,4 +137,78 @@ interface TTraits { /** Gets the duffcodes for wowel, such that they look good with * the preceding glyph, and appends them to duff. */ void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel); + + /** Human-readable name of this transliteration for short error + strings. */ + String shortTranslitName(); + + /** Returns true if and only pair is clearly not valid + transliteration. */ + boolean isClearlyIllegal(TPair pair); + + /** Returns one or two new TPairList instances. Breaks a + * transliterated tsheg bar (roughly a "syllable") into + * chunks; this computes l' (for you design doc enthusiasts). + * + *Here's a rough sketch of the algorithm: run along getting
+ * the current TPair as big as you can. If you get it very big,
+ * but there's something illegal afterward that wouldn't
+ * otherwise be illegal, undo as little as possible to correct.
+ * For example, ACIP {G'A'I} becomes [(G . 'A), (' . I)], and
+ * ACIP {TAA} becomes [(T . A)] in a first pass but then we see
+ * that the rest would be suboptimal, so we backtrack to [(T . )]
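+ * and then finally become [(T . ), (A . A)]. We look for (A . )
+ * and ( . vowel) in the rest in order to say "the rest would be
+ * suboptimal", i.e. we use TPairList.hasSimpleError().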
There is one case where we break things up into two pair + * lists if and only if specialHandlingForAppendages is true -- I + * thought the converter had a bug because I saw ACIP {SNYAM'AM} + * in KD0003I2.ACT. I asked Robert Chilton, though, and he said + * "SNYAM'AM " was likely a typo for "SNYAM 'AM", so leave + * specialHandlingForAppendages false.
+ * + *I found out about (OK, as it turns out, imagined) this case + * too late to do anything clean about it. ACIP {SNYAM'AM}, + * e.g., breaks up into [(S . ), (NY . A), (M . 'A), (M . )], + * which is incorrect -- [(S . ), (NY . A), (M . ), (' . A), (M + * . )] is correct. But we don't know which is correct without + * parsing, so both are returned. The clean treatment would be + * to lex into a form that didn't insist ACIP {'A} was either a + * vowel or a consonant. Then the parser would figure it out. + * But don't bother, because specialHandlingForAppendages should + * be false always.
+ * + * @param tt a string of transliteration corresponding to a tsheg + * bar (i.e., it has no punctuation in it) + * @param specialHandlingForAppendages true if and only if you + * want ACIP {SNYAM'AM} to ultimately parse as {S+NYA}{M}{'A}{M} + * instead of {S+NYA}{M'A}{M} + * @return an array of length two consisting of one or two pair + * lists. If the former, then the second element will be null, + * if the latter, the second element will have (* . ), (' . *) + * instead of (* . '*) which the former has. */ + TPairList[] breakTshegBarIntoChunks(String tt, + boolean specialHandlingForAppendages); + + /** Returns true if and only if these are ACIP transliteration's + traits. TODO(dchandler): get rid of this function. Any + caller is employing a hack. */ + boolean isACIP(); + + /** Returns true if and only if a vowel all by its lonesome has an + * implied a-chen (U+0F68) with it. (ACIP requires "AI" to + * represent a-chen with gigu, but EWTS requires "i".)*/ + boolean vowelAloneImpliesAChen(); + + /** Returns true if and only if multiple vowels (TODO(dchandler): + * wowels?) may appear on a single consonant stack via the + * stacking operator, '+'. */ + boolean vowelsMayStack(); + + /** Returns true if and only if pl could represent one TPairList + in a tsheg bar. (EWTS's list of standard stacks comes into + play; ACIP always returns true.) */ + boolean couldBeValidStack(TPairList pl); } diff --git a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java index 0835a3b..fcbdab8 100644 --- a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java @@ -18,16 +18,12 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import java.io.IOException; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.io.InputStream; import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.ArrayList; -import java.util.Stack; - -import org.thdl.util.ThdlDebug; -import org.thdl.util.ThdlOptions; /** * A TTshegBarScanner is able to break up Strings of transliterated diff --git a/source/org/thdl/util/HTMLPane.java b/source/org/thdl/util/HTMLPane.java index bfd9051..884110b 100644 --- a/source/org/thdl/util/HTMLPane.java +++ b/source/org/thdl/util/HTMLPane.java @@ -21,8 +21,9 @@ package org.thdl.util; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import javax.swing.JScrollPane; + import javax.swing.JEditorPane; +import javax.swing.JScrollPane; /** An HTMLPane is a JScrollPane displaying the contents of an HTML * file. DLC FIXME: at present, neither internal nor external diff --git a/source/org/thdl/util/Link.java b/source/org/thdl/util/Link.java index 04f5d53..65396b7 100644 --- a/source/org/thdl/util/Link.java +++ b/source/org/thdl/util/Link.java @@ -17,7 +17,6 @@ Contributor(s): ______________________________________. */ package org.thdl.util; -import java.io.*; /** Used by {@link SimplifiedLinkedList} to provide the implementation of a simple dynamic link list. diff --git a/source/org/thdl/util/RTFFixerInputStream.java b/source/org/thdl/util/RTFFixerInputStream.java index a2744b0..99923e0 100644 --- a/source/org/thdl/util/RTFFixerInputStream.java +++ b/source/org/thdl/util/RTFFixerInputStream.java @@ -18,13 +18,11 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import org.thdl.util.ThdlDebug; - -import java.util.ArrayList; -import java.io.IOException; -import java.io.FilterInputStream; import java.io.BufferedInputStream; +import java.io.FilterInputStream; +import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; /** Provides an input stream that fixes another RTF input stream so diff --git a/source/org/thdl/util/RTFFixerInputStreamTest.java b/source/org/thdl/util/RTFFixerInputStreamTest.java index 55057e3..c979fb7 100644 --- a/source/org/thdl/util/RTFFixerInputStreamTest.java +++ b/source/org/thdl/util/RTFFixerInputStreamTest.java @@ -18,11 +18,12 @@ Contributor(s): ______________________________________. package org.thdl.util; -import junit.framework.TestCase; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; +import junit.framework.TestCase; + /** * @author David Chandler * diff --git a/source/org/thdl/util/RTFPane.java b/source/org/thdl/util/RTFPane.java index 7d2e1a6..283bdf2 100644 --- a/source/org/thdl/util/RTFPane.java +++ b/source/org/thdl/util/RTFPane.java @@ -21,11 +21,12 @@ package org.thdl.util; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; + import javax.swing.JScrollPane; import javax.swing.JTextPane; +import javax.swing.text.BadLocationException; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; -import javax.swing.text.BadLocationException; /** An RTFPane is a JScrollPane displaying the contents of a rich text file (an RTF file). */ diff --git a/source/org/thdl/util/SimpleFrame.java b/source/org/thdl/util/SimpleFrame.java index aee7f97..84a56cf 100644 --- a/source/org/thdl/util/SimpleFrame.java +++ b/source/org/thdl/util/SimpleFrame.java @@ -18,13 +18,12 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import javax.swing.JFrame; -import java.awt.Container; import java.awt.Component; +import java.awt.Container; import java.awt.event.ComponentAdapter; import java.awt.event.ComponentEvent; -import org.thdl.util.RTFPane; +import javax.swing.JFrame; /** An SimpleFrame is a top-level window displaying a JScrollPane. */ public class SimpleFrame extends JFrame { diff --git a/source/org/thdl/util/SimplifiedLinkedList.java b/source/org/thdl/util/SimplifiedLinkedList.java index b9527ac..72261ff 100644 --- a/source/org/thdl/util/SimplifiedLinkedList.java +++ b/source/org/thdl/util/SimplifiedLinkedList.java @@ -18,7 +18,7 @@ Contributor(s): ______________________________________. package org.thdl.util; -import java.io.*; +import java.io.PrintWriter; /** Implementation of a simple dynamic link list. Be careful with word order! Why not just use java.util.LinkedList? It is not supported for the diff --git a/source/org/thdl/util/SimplifiedListIterator.java b/source/org/thdl/util/SimplifiedListIterator.java index 2d7a559..8a0a90d 100644 --- a/source/org/thdl/util/SimplifiedListIterator.java +++ b/source/org/thdl/util/SimplifiedListIterator.java @@ -17,7 +17,7 @@ Contributor(s): ______________________________________. */ package org.thdl.util; -import java.util.*; +import java.util.LinkedList; /** Used by {@link LinkedList} to provide the implementation of a simple dynamic link list. diff --git a/source/org/thdl/util/StatusBar.java b/source/org/thdl/util/StatusBar.java index 262240c..d100f8e 100644 --- a/source/org/thdl/util/StatusBar.java +++ b/source/org/thdl/util/StatusBar.java @@ -18,11 +18,13 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import java.awt.*; -import java.awt.event.*; -import javax.swing.*; import java.util.Stack; +import javax.swing.BoxLayout; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.SwingConstants; + /** A StatusBar can be added to a component, typically to the bottom of it, in order to show the user the status of the program. There are methods to change the status, and there are actually a LIFO diff --git a/source/org/thdl/util/ThdlAbstractAction.java b/source/org/thdl/util/ThdlAbstractAction.java index 82b421e..8d2b411 100644 --- a/source/org/thdl/util/ThdlAbstractAction.java +++ b/source/org/thdl/util/ThdlAbstractAction.java @@ -18,11 +18,10 @@ Contributor(s): ______________________________________. package org.thdl.util; -import javax.swing.AbstractAction; -import javax.swing.Icon; import java.awt.event.ActionEvent; -import org.thdl.util.ThdlDebug; +import javax.swing.AbstractAction; +import javax.swing.Icon; /** * This ActionListener is like any other except in the way that it diff --git a/source/org/thdl/util/ThdlActionListener.java b/source/org/thdl/util/ThdlActionListener.java index 349a098..4847f7b 100644 --- a/source/org/thdl/util/ThdlActionListener.java +++ b/source/org/thdl/util/ThdlActionListener.java @@ -18,10 +18,8 @@ Contributor(s): ______________________________________. package org.thdl.util; -import java.awt.event.ActionListener; import java.awt.event.ActionEvent; - -import org.thdl.util.ThdlDebug; +import java.awt.event.ActionListener; /** * This ActionListener is like any other except in the way that it diff --git a/source/org/thdl/util/ThdlDebug.java b/source/org/thdl/util/ThdlDebug.java index 8cee4e4..c954cf0 100644 --- a/source/org/thdl/util/ThdlDebug.java +++ b/source/org/thdl/util/ThdlDebug.java @@ -18,12 +18,9 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import java.io.PrintStream; -import java.io.FileOutputStream; import java.io.File; - -import org.thdl.util.TeeStream; -import org.thdl.util.ThdlOptions; +import java.io.FileOutputStream; +import java.io.PrintStream; /** * This uninstantiable class provides assertions and the like in a diff --git a/source/org/thdl/util/ThdlI18n.java b/source/org/thdl/util/ThdlI18n.java index b78f3d6..56d98a5 100644 --- a/source/org/thdl/util/ThdlI18n.java +++ b/source/org/thdl/util/ThdlI18n.java @@ -2,6 +2,7 @@ package org.thdl.util; import java.util.Locale; import java.util.ResourceBundle; + import javax.swing.JComponent; public class ThdlI18n { diff --git a/source/org/thdl/util/ThdlLazyExceptionTest.java b/source/org/thdl/util/ThdlLazyExceptionTest.java index 386acc0..ba8bdf7 100644 --- a/source/org/thdl/util/ThdlLazyExceptionTest.java +++ b/source/org/thdl/util/ThdlLazyExceptionTest.java @@ -18,9 +18,9 @@ Contributor(s): ______________________________________. package org.thdl.util; -import junit.framework.TestCase; +import java.io.IOException; -import java.io.IOException; /* a checked exception */ +import junit.framework.TestCase; /** * @author David Chandler diff --git a/source/org/thdl/util/ThdlOptions.java b/source/org/thdl/util/ThdlOptions.java index 8261290..015a0c7 100644 --- a/source/org/thdl/util/ThdlOptions.java +++ b/source/org/thdl/util/ThdlOptions.java @@ -18,17 +18,14 @@ Contributor(s): ______________________________________. package org.thdl.util; -import java.io.InputStream; +import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.io.File; -import java.io.FileNotFoundException; +import java.io.InputStream; import java.util.Properties; -import org.thdl.util.ThdlLazyException; -import org.thdl.util.OperatingSystemUtils; - /** * Provides a clean interface to the multi-tiered system of user * preferences (also known as options). 
diff --git a/source/org/thdl/util/Trie.java b/source/org/thdl/util/Trie.java index 760382e..64f02bc 100644 --- a/source/org/thdl/util/Trie.java +++ b/source/org/thdl/util/Trie.java @@ -81,7 +81,6 @@ Contributor(s): ______________________________________. package org.thdl.util; -import org.thdl.util.ThdlDebug; /** * A digital search trie for 7-bit ASCII text. The API is a subset of