+ }
+
+ /** TODO(DLC)[EWTS->Tibetan]: set this to true and fix the code or
+ * the test cases until things are green. */
+ private static final boolean RUN_FAILING_TESTS = false;
}
// TODO(DLC)[EWTS->Tibetan]: if 'k' were illegal, then would you have to say
diff --git a/source/org/thdl/tib/text/ttt/EWTSTraits.java b/source/org/thdl/tib/text/ttt/EWTSTraits.java
index 9c42b7d..a6cf6d0 100644
--- a/source/org/thdl/tib/text/ttt/EWTSTraits.java
+++ b/source/org/thdl/tib/text/ttt/EWTSTraits.java
@@ -22,6 +22,7 @@ package org.thdl.tib.text.ttt;
import java.util.ArrayList;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibTextUtils;
@@ -74,8 +75,12 @@ public final class EWTSTraits implements TTraits {
public int maxWowelLength() { return 3; /* a~M` (TODO(DLC)[EWTS->Tibetan]:! why the 'a'?) */}
public boolean isUnicodeConsonant(char ch) {
- return ((ch != '\u0f48' && ch >= '\u0f40' && ch <= '\u0f6a')
- || (ch != '\u0f98' && ch >= '\u0f90' && ch <= '\u0fbc'));
+ return ((ch != '\u0f48' && ch >= '\u0f40' && ch <= '\u0f6a')
+ || (ch != '\u0f98' && ch >= '\u0f90' && ch <= '\u0fbc')
+ // NOTE: \u0f88 is questionable, but we want EWTS
+ // [\u0f88+kha] to become "\u0f88\u0f91" and this does
+ // the trick.
+ || ch == '\u0f88');
}
public boolean isUnicodeWowel(char ch) {
@@ -290,6 +295,9 @@ public final class EWTSTraits implements TTraits {
for (int i = 0; i < l.length(); i++) {
char ch = l.charAt(i);
if ((ch < '\u0f00' || ch > '\u0fff')
+ && SAUVASTIKA != ch
+ && SWASTIKA != ch
+ && (ch < PUA_MIN || ch > PUA_MAX) // TODO(DLC)[EWTS->Tibetan]: give a warning, though? PUA isn't specified by the unicode standard after all.
&& '\n' != ch
&& '\r' != ch) {
// TODO(DLC)[EWTS->Tibetan]: Is this the place
@@ -352,7 +360,6 @@ public final class EWTSTraits implements TTraits {
if ("h".equals(l)) return "\u0FB7";
if ("a".equals(l)) return "\u0FB8";
if ("k+Sh".equals(l)) return "\u0FB9";
- if (false) throw new Error("TODO(DLC)[EWTS->Tibetan]:: subscribed for " + l);
return null;
} else {
if ("R".equals(l)) return "\u0f6a";
@@ -360,6 +367,10 @@ public final class EWTSTraits implements TTraits {
if ("W".equals(l)) return "\u0f5d";
if (!TibetanMachineWeb.isKnownHashKey(l)) {
+// System.err.println("Getting unicode for the following is hard: '"
+// + l + "' (pretty string: '"
+// + UnicodeUtils.unicodeStringToPrettyString(l)
+// + "'");
ThdlDebug.noteIffyCode();
return null;
}
@@ -445,4 +456,36 @@ public final class EWTSTraits implements TTraits {
return (allHavePlus
|| TibetanMachineWeb.hasGlyph(hashKey.toString())); // TODO(DLC)[EWTS->Tibetan]: test with smra and tsma and bdgya
}
+
+ public boolean stackingMustBeExplicit() { return true; }
+
+ public String U0F7F() { return "H"; }
+
+ public String U0F35() { return "~X"; }
+
+ public String U0F37() { return "X"; }
+
+ /** The EWTS standard mentions this character specifically. See
+ http://www.symbols.com/encyclopedia/15/155.html to learn about
+ its meaning as it relates to Buddhism.
+ */
+ static final char SAUVASTIKA = '\u534d';
+
+ /** The EWTS standard mentions this character specifically. See
+ http://www.symbols.com/encyclopedia/15/151.html to learn about
+ its meaning as it relates to Buddhism.
+ */
+ static final char SWASTIKA = '\u5350';
+
+ /** EWTS has some glyphs not specified by Unicode in the
+ * private-use area (PUA). EWTS puts them in the range [PUA_MIN,
+ * PUA_MAX]. (Note that \uf042 is the highest in use as of July
+ * 2, 2005.) */
+ static final char PUA_MIN = '\uf021';
+
+ /** EWTS has some glyphs not specified by Unicode in the
+ * private-use area (PUA). EWTS puts them in the range [PUA_MIN,
+ * PUA_MAX]. (Note that \uf042 is the highest in use as of July
+ * 2, 2005.) */
+ static final char PUA_MAX = '\uf0ff';
}
diff --git a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
index d450364..5447e89 100644
--- a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
@@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
-Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+Library (THDL). Portions created by the THDL are Copyright 2003-2005 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
@@ -42,52 +42,80 @@ class EWTSTshegBarScanner extends TTshegBarScanner {
|| EWTSTraits.instance().isUnicodeWowel(ch)
|| (ch >= '\u0f20' && ch <= '\u0f33')
|| "khgncjytdpbmtstdzwzz'rlafvTDNSWYReuioIAUMHX?^\u0f39\u0f35\u0f37.+~'`-\u0f19\u0f18\u0f3f\u0f3e\u0f86\u0f87\u0f88".indexOf(ch) >= 0);
+ // NOTE: We treat \u0f00 as punctuation, not something valid
+ // inside a tsheg bar. This is questionable, but since it is
+ // a tsheg bar all by itself (almost always in practice,
+ // anyway) and since it would've required code changes I
+ // didn't want to make, that's how it is.
}
- /** See the comment in TTshegBarScanner. This does not find
- errors and warnings that you'd think of a parser finding (TODO(DLC)[EWTS->Tibetan]:
- DOES IT?). */
- public ArrayList scan(String s, StringBuffer errors, int maxErrors, // TODO(DLC)[EWTS->Tibetan]: ignored
- boolean shortMessages, String warningLevel) {
- // the size depends on whether it's mostly Tibetan or mostly
- // Latin and a number of other factors. This is meant to be
- // an underestimate, but not too much of an underestimate.
- ArrayList al = new ArrayList(s.length() / 10);
+ // TODO(dchandler): use jflex, javacc or something similar as much
+ // as you can. I don't think EWTS can be perfectly parsed by
+ // javacc, by the way, but having several components in a pipeline
+ // would likely make things more maintainable.
+ //
+ // NOTE: EWTS doesn't fully specify how Unicode escapes (e.g.,
+ // [\\u0f20]) should work. When do you evaluate them?
+ // Immediately like Java source files or later, say right before
+ // outputting? Our answer: immediately. [\\u0f88+ka] becomes
+ // hard to do otherwise. This means we treat actual Unicode in a
+ // way that a reader of the EWTS standard might not think about,
+ // but actual Unicode is rare in the input
+ // (TODO(DLC)[EWTS->Tibetan]: it's so rare that we ought to give a
+ // warning/error when we see it).
+ /** See the comment in TTshegBarScanner. This does not find
+ errors and warnings that you'd think of a parser finding (TODO(DLC)[EWTS->Tibetan]:
+ DOES IT?). */
+ public ArrayList scan(String s, StringBuffer errors, int maxErrors, // TODO(DLC)[EWTS->Tibetan]: ignored
+ boolean shortMessages, String warningLevel) {
+ // the size depends on whether it's mostly Tibetan or mostly
+ // Latin and a number of other factors. This is meant to be
+ // an underestimate, but not too much of an underestimate.
+ ArrayList al = new ArrayList(s.length() / 10);
- // TODO(DLC)[EWTS->Tibetan]: use jflex, javacc or something similar
-
- // TODO(DLC)[EWTS->Tibetan]: what about Unicode escapes like \u0f20? When do you do that? Immediately like Java source files? I think so and then we can say that oddballs like \u0f19 are valid within tsheg bars.
-
- StringBuffer sb = new StringBuffer(s);
- ExpandEscapeSequences(sb);
- int sl = sb.length();
- // TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working
- // TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode
- // TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working
- for (int i = 0; i < sl; i++) {
- if (isValidInsideTshegBar(sb.charAt(i))) {
- StringBuffer tbsb = new StringBuffer();
- for (; i < sl; i++) {
- if (isValidInsideTshegBar(sb.charAt(i)))
- tbsb.append(sb.charAt(i));
- else {
- --i;
- break;
- }
- }
- al.add(new TString("EWTS", tbsb.toString(),
- TString.TIBETAN_NON_PUNCTUATION));
- } else {
- if (" /;|!:=_@#$%<>()\r\n\t*".indexOf(sb.charAt(i)) >= 0)
- al.add(new TString("EWTS", sb.substring(i, i+1),
- TString.TIBETAN_PUNCTUATION));
- else
- al.add(new TString("EWTS", "ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb.substring(i, i+1),
- TString.ERROR));
- }
+ StringBuffer sb = new StringBuffer(s);
+ ExpandEscapeSequences(sb);
+ int sl = sb.length();
+ // TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working
+ // TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode
+ // TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working
+ for (int i = 0; i < sl; i++) {
+ if (isValidInsideTshegBar(sb.charAt(i))) {
+ StringBuffer tbsb = new StringBuffer();
+ for (; i < sl; i++) {
+ if (isValidInsideTshegBar(sb.charAt(i)))
+ tbsb.append(sb.charAt(i));
+ else {
+ --i;
+ break;
+ }
}
- return al;
+ al.add(new TString("EWTS", tbsb.toString(),
+ TString.TIBETAN_NON_PUNCTUATION));
+ } else {
+ // NOTE: It's questionable, but we treat
+ // \u0f00 like punctuation because it was
+ // easier coding that way.
+ if ((sb.charAt(i) >= EWTSTraits.PUA_MIN
+ && sb.charAt(i) <= EWTSTraits.PUA_MAX)
+ || (sb.charAt(i) >= '\u0f00' && sb.charAt(i) <= '\u0f17')
+ || (sb.charAt(i) >= '\u0f1a' && sb.charAt(i) <= '\u0f1f')
+ || (sb.charAt(i) >= '\u0fbe' && sb.charAt(i) <= '\u0fcc')
+ || (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1')
+ || (EWTSTraits.SAUVASTIKA == sb.charAt(i))
+ || (EWTSTraits.SWASTIKA == sb.charAt(i))
+ || (" /;|!:=_@#$%<>()*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b".indexOf(sb.charAt(i))
+ >= 0)) {
+ al.add(new TString("EWTS", sb.substring(i, i+1),
+ TString.TIBETAN_PUNCTUATION));
+ } else {
+ al.add(new TString("EWTS", "ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb.substring(i, i+1),
+ TString.ERROR));
+ }
+ }
}
+ return al;
+ }
/** Modifies the EWTS in sb such that Unicode escape sequences are
* expanded. */
diff --git a/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java b/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java
index 0f37895..db3a62e 100644
--- a/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java
+++ b/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java
@@ -792,7 +792,7 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("khkha");
assert_EWTS_error("khna");
assert_EWTS_error("khla");
- special_case("gga");
+ assert_EWTS_error("gga");
assert_EWTS_error("ggha");
special_case("gnya");
special_case("gda");
@@ -801,13 +801,13 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("gdhwa");
special_case("gna");
special_case("gnya");
- special_case("gpa");
+ assert_EWTS_error("gpa");
assert_EWTS_error("gbha");
assert_EWTS_error("gbhya");
- special_case("gma");
- special_case("gmya");
+ assert_EWTS_error("gma");
+ assert_EWTS_error("gmya");
assert_EWTS_error("grya");
- special_case("gha");
+ assert_EWTS_error("gha");
assert_EWTS_error("ghgha");
assert_EWTS_error("ghnya");
assert_EWTS_error("ghna");
@@ -815,8 +815,8 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("ghma");
assert_EWTS_error("ghla");
assert_EWTS_error("ghya");
- special_case("ghra");
- special_case("ghwa");
+ assert_EWTS_error("ghra");
+ assert_EWTS_error("ghwa");
assert_EWTS_error("ngka");
assert_EWTS_error("ngkta");
assert_EWTS_error("ngktya");
@@ -970,34 +970,34 @@ public class EWTStibwniniTest extends TestCase {
special_case("dgra");
assert_EWTS_error("dgha");
assert_EWTS_error("dghra");
- special_case("ddza");
- special_case("dda");
+ assert_EWTS_error("ddza");
+ assert_EWTS_error("dda");
assert_EWTS_error("ddya");
- special_case("ddra");
- special_case("ddwa");
+ assert_EWTS_error("ddra");
+ assert_EWTS_error("ddwa");
assert_EWTS_error("ddha");
assert_EWTS_error("ddhna");
assert_EWTS_error("ddhya");
assert_EWTS_error("ddhra");
assert_EWTS_error("ddhwa");
- special_case("dna");
+ assert_EWTS_error("dna");
special_case("dba");
special_case("dbra");
assert_EWTS_error("dbha");
assert_EWTS_error("dbhya");
assert_EWTS_error("dbhra");
special_case("dma");
- special_case("dya");
+ assert_EWTS_error("dya");
assert_EWTS_error("drya");
assert_EWTS_error("dwya");
- special_case("dha");
+ assert_EWTS_error("dha");
assert_EWTS_error("dhna");
assert_EWTS_error("dhnya");
assert_EWTS_error("dhma");
assert_EWTS_error("dhya");
- special_case("dhra");
+ assert_EWTS_error("dhra");
assert_EWTS_error("dhrya");
- special_case("dhwa");
+ assert_EWTS_error("dhwa");
assert_EWTS_error("nka");
assert_EWTS_error("nkta");
assert_EWTS_error("ngha");
@@ -1051,39 +1051,39 @@ public class EWTStibwniniTest extends TestCase {
assert_EWTS_error("pswa");
assert_EWTS_error("psya");
assert_EWTS_error("bgha");
- special_case("bdza");
+ assert_EWTS_error("bdza");
special_case("bda");
assert_EWTS_error("bddza");
assert_EWTS_error("bdha");
assert_EWTS_error("bdhwa");
special_case("bta");
- special_case("bna");
- special_case("bba");
+ assert_EWTS_error("bna");
+ assert_EWTS_error("bba");
assert_EWTS_error("bbha");
assert_EWTS_error("bbhya");
- special_case("bma");
- special_case("bha");
+ assert_EWTS_error("bma");
+ assert_EWTS_error("bha");
assert_EWTS_error("bhNa");
assert_EWTS_error("bhna");
assert_EWTS_error("bhma");
assert_EWTS_error("bhya");
- special_case("bhra");
- special_case("bhwa");
+ assert_EWTS_error("bhra");
+ assert_EWTS_error("bhwa");
special_case("mnya");
- special_case("mNa"); // TODO(DLC)[EWTS->Tibetan]: do prefix rules really allow mNa? I think not.
+ assert_EWTS_error("mNa");
special_case("mna");
special_case("mnya");
- special_case("mpa");
- special_case("mpra");
- special_case("mpha");
- special_case("mba");
+ assert_EWTS_error("mpa");
+ assert_EWTS_error("mpra");
+ assert_EWTS_error("mpha");
+ assert_EWTS_error("mba");
assert_EWTS_error("mbha");
assert_EWTS_error("mbhya");
- special_case("mma");
- special_case("mla");
- special_case("mwa");
- special_case("msa");
- special_case("mha");
+ assert_EWTS_error("mma");
+ assert_EWTS_error("mla");
+ assert_EWTS_error("mwa");
+ assert_EWTS_error("msa");
+ assert_EWTS_error("mha");
assert_EWTS_error("yYa");
assert_EWTS_error("yra");
assert_EWTS_error("ywa");
diff --git a/source/org/thdl/tib/text/ttt/ParseIterator.java b/source/org/thdl/tib/text/ttt/ParseIterator.java
index 06bcaf0..c1bc71a 100644
--- a/source/org/thdl/tib/text/ttt/ParseIterator.java
+++ b/source/org/thdl/tib/text/ttt/ParseIterator.java
@@ -22,7 +22,9 @@ import java.util.ArrayList;
import java.util.ListIterator;
import java.util.NoSuchElementException;
-/** An object that can iterate over an {@link TParseTree}.
+/** An object that can iterate over a {@link TParseTree}. NOTE: This
+ * constructs the list over which it iterates when it is constructed,
+ * so you pay upfront.
*
* @author David Chandler */
class ParseIterator {
diff --git a/source/org/thdl/tib/text/ttt/TConverter.java b/source/org/thdl/tib/text/ttt/TConverter.java
index cfc5025..c1aaf8d 100644
--- a/source/org/thdl/tib/text/ttt/TConverter.java
+++ b/source/org/thdl/tib/text/ttt/TConverter.java
@@ -622,7 +622,7 @@ public class TConverter {
boolean done = false;
// what about after numbers? marks? FIXME: test
TPairList lpl = null;
- if (s.getText().equals(" ")) {
+ if (ttraits.isACIP() && s.getText().equals(" ")) {
if (!lastGuyWasNonPunct
|| (null != lastGuy
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
@@ -652,7 +652,8 @@ public class TConverter {
continue; // FIXME: if null != writer, output was just dropped.
}
}
- } else if (s.getText().equals(",")
+ } else if (ttraits.isACIP()
+ && s.getText().equals(",")
&& lastGuyWasNonPunct
&& null != lastGuy
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
@@ -722,7 +723,8 @@ public class TConverter {
ThdlDebug.verify(1 == s.getText().length());
if (null != writer) {
char ch = s.getText().charAt(0);
- if (ch >= '\uF021' && ch <= '\uF0FF') {
+ if (ch >= EWTSTraits.PUA_MIN
+ && ch <= EWTSTraits.PUA_MAX) {
hasErrors = true;
String errorMessage =
"[#ERROR "
diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java
index b6c2e14..fa63fc2 100644
--- a/source/org/thdl/tib/text/ttt/TPair.java
+++ b/source/org/thdl/tib/text/ttt/TPair.java
@@ -163,14 +163,15 @@ class TPair {
}
/** Returns a TPair that is like this pair except that it has a
- * "+" on the right if this pair is empty on the right and is
- * empty on the right if this pair has a disambiguator on the
- * right. May return itself (but never mutates this
- * instance). */
+ * "+" on the right if this pair is empty on the right and, when
+ * appropriate, is empty on the right if this pair has a
+ * disambiguator on the right. May return itself (but never
+ * mutates this instance). */
TPair insideStack() {
if (null == getRight())
return new TPair(traits, getLeft(), "+");
- else if (traits.disambiguator().equals(getRight()))
+ else if (traits.disambiguator().equals(getRight())
+ && !traits.stackingMustBeExplicit())
return new TPair(traits, getLeft(), null);
else
return this;
@@ -248,11 +249,18 @@ class TPair {
}
}
- // TODO(DLC)[EWTS->Tibetan]
- /** Returns true if this pair is surely the last pair in an ACIP
- * stack. Stacking continues through (* . ) and (* . +), but
- * stops anywhere else. */
- boolean endsACIPStack() {
- return (getRight() != null && !"+".equals(getRight()));
+ /** For ACIP: Returns true if this pair is surely the last pair in
+ * an ACIP stack. Stacking continues through (* . ) and (* . +),
+ * but stops anywhere else.
+ *
+ * For EWTS: Returns true if this pair is probably the last
+ * pair in an EWTS stack. For native stacks like that found in
+ * [bra], this is not really true. */
+ boolean endsStack() {
+ final boolean explicitlyStacks = "+".equals(getRight());
+ if (!traits.stackingMustBeExplicit())
+ return (getRight() != null && !explicitlyStacks);
+ else
+ return (!explicitlyStacks);
}
}
diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java
index 13c5969..3185c6f 100644
--- a/source/org/thdl/tib/text/ttt/TPairList.java
+++ b/source/org/thdl/tib/text/ttt/TPairList.java
@@ -16,8 +16,6 @@ All Rights Reserved.
Contributor(s): ______________________________________.
*/
-// TODO(DLC)[EWTS->Tibetan]: a (DLC: does this become (a.) or (.a)?), ug pha, g.a, aM, etc. -- test!
-
package org.thdl.tib.text.ttt;
import java.util.ArrayList;
@@ -146,9 +144,10 @@ class TPairList {
return original.toString();
}
- /** Returns true if this list contains ( . ) or (A . ),
- * which are two simple errors you encounter if you interpret DAA
- * or TAA or DAI or DAE the wrong way. TODO(DLC)[EWTS->Tibetan]: ACIP vs. EWTS */
+ /** Returns true if this list contains an obvious error. For
+ * example, with ACIP this returns true if ( . ) or (A . )
+ * appears, which are two simple errors you encounter if you
+ * interpret (ACIP) DAA or TAA or DAI or DAE the wrong way. */
boolean hasSimpleError() {
int sz = size();
for (int i = 0; i < sz; i++) {
@@ -192,13 +191,6 @@ class TPairList {
&& (null == p.getRight()
|| "".equals(p.getRight()))) {
return ErrorsAndWarnings.getMessage(125, shortMessages, translit, traits);
- } else if (null != p.getRight()
- && !"+".equals(p.getRight())
- && !traits.disambiguator().equals(p.getRight())
- && !traits.isWowel(p.getRight())
- && false /* TODO(DLC)[EWTS->Tibetan]: think about this harder. */) {
- return "ErrorNumberDLC1: We don't yet support stacking vowels, convert {" + translit + "} manually.";
- // TODO(DLC)[EWTS->Tibetan]: test, i think we do support it
} else if ((null == p.getLeft()
&& (!traits.disambiguator().equals(p.getRight())
&& (!traits.vowelAloneImpliesAChen()
@@ -224,7 +216,8 @@ class TPairList {
return ErrorsAndWarnings.getMessage(126, shortMessages, translit, traits);
}
// FIXME: really this is a warning, not an error:
- if (traits.disambiguator().equals(get(sz - 1).getRight())) {
+ if (traits.disambiguator().equals(get(sz - 1).getRight())
+ && !traits.stackingMustBeExplicit()) {
return ErrorsAndWarnings.getMessage(127, shortMessages, translit, traits);
}
return null;
@@ -280,26 +273,28 @@ class TPairList {
if (sz < 1) return null;
- // When we see a stretch of ACIP without a disambiguator or a
- // vowel, that stretch is taken to be one stack unless it may
- // be prefix-root or suffix-postsuffix or suffix/postsuffix-'
- // -- the latter necessary because GAMS'I is GAM-S-'I, not
- // GAM-S+'I. 'UR, 'US, 'ANG, 'AM, 'I, 'O, 'U -- all begin
- // with '. So we can have zero, one, two, or three special
- // break locations. (The kind that aren't special are the
- // break after G in G-DAMS, or the break after G in GADAMS or
- // GEDAMS.)
+ // When we see a stretch of ACIP (TODO(DLC)[EWTS->Tibetan]:
+ // this works for EWTS, but differently) without a
+ // disambiguator or a vowel, that stretch is taken to be one
+ // stack unless it may be prefix-root or suffix-postsuffix or
+ // suffix/postsuffix-' -- the latter necessary because GAMS'I
+ // is GAM-S-'I, not GAM-S+'I. 'UR, 'US, 'ANG, 'AM, 'I, 'O, 'U
+ // -- all begin with '. So we can have zero, one, two, or
+ // three special break locations. (The kind that aren't
+ // special are the break after G in G-DAMS, or the break after
+ // G in GADAMS or GEDAMS.)
//
// If a nonnegative number appears in breakLocations[i], it
// means that pair i may or may not be stacked with pair i+1.
int nextBreakLoc = 0;
int breakLocations[] = { -1, -1, -1 };
- boolean mayHavePrefix;
+ boolean mayHavePrefix = get(0).isPrefix();
// Handle the first pair specially -- it could be a prefix.
if (ddebug) System.out.println("i is " + 0);
- if ((mayHavePrefix = get(0).isPrefix())
+ if (mayHavePrefix
+ && !traits.stackingMustBeExplicit()
&& sz > 1
&& null == get(0).getRight()) {
// special case: we must have a branch in the parse tree
@@ -311,9 +306,9 @@ class TPairList {
}
// stack numbers start at 1.
- int stackNumber = (get(0).endsACIPStack()) ? 2 : 1;
+ int stackNumber = (get(0).endsStack()) ? 2 : 1;
// this starts at 0.
- int stackStart = (get(0).endsACIPStack()) ? 1 : 0;
+ int stackStart = (get(0).endsStack()) ? 1 : 0;
int numeric = get(0).isNumeric() ? 1 : (get(0).isDisambiguator() ? 0 : -1);
@@ -340,7 +335,7 @@ class TPairList {
numeric = -1;
}
- if (i+1==sz || p.endsACIPStack()) {
+ if (i+1==sz || p.endsStack()) {
if (/* the stack ending here might really be
suffix-postsuffix or
suffix-appendage or
@@ -350,15 +345,17 @@ class TPairList {
if (i > stackStart) {
if (get(stackStart).isSuffix()
&& (get(stackStart+1).isPostSuffix() // suffix-postsuffix
- || "'".equals(get(stackStart+1).getLeft()))) // suffix-appendage
+ || "'".equals(get(stackStart+1).getLeft()))) { // suffix-appendage
breakLocations[nextBreakLoc++] = stackStart;
+ }
if (i > stackStart + 1) {
// three to play with, maybe it's
// suffix-postsuffix-appendage.
if (get(stackStart).isSuffix()
&& get(stackStart+1).isPostSuffix()
- && "'".equals(get(stackStart+2).getLeft()))
+ && "'".equals(get(stackStart+2).getLeft())) {
breakLocations[nextBreakLoc++] = stackStart+1;
+ }
}
}
// else no need to insert a breakLocation, we're
@@ -370,8 +367,9 @@ class TPairList {
|| (!mayHavePrefix && (stackNumber == 3))) {
if (i == stackStart+1) { // because GDAM--S'O is illegal, and because it's 'ANG, not 'NG, 'AM, not 'M -- ' always ends the stack
if (get(stackStart).isPostSuffix()
- && "'".equals(get(stackStart+1).getLeft()))
+ && "'".equals(get(stackStart+1).getLeft())) {
breakLocations[nextBreakLoc++] = stackStart;
+ }
}
}
++stackNumber;
@@ -397,7 +395,8 @@ class TPairList {
throw new Error("breakLocations is monotonically increasing, ain't it?");
TParseTree pt = new TParseTree();
for (int i = 0; i < sz; i++) {
- if (i+1 == sz || get(i).endsACIPStack()) {
+ if (ddebug) System.out.println("getParseTree: second loop i is " + i);
+ if (i+1 == sz || get(i).endsStack()) {
TStackListList sll = new TStackListList(4); // maximum is 4.
int numBreaks = 0;
@@ -419,6 +418,7 @@ class TPairList {
// one, at location breakLocations[breakStart+1] if
// and only if b1 is one, etc.
for (int counter = 0; counter < (1<= 0) { // TODO(DLC)[EWTS->Tibetan]
- // this ':' guy is his own TGCPair.
- add_U0F7F = true;
- StringBuffer rr = new StringBuffer(p.getRight());
- rr.deleteCharAt(where);
- p = new TPair(traits, p.getLeft(), rr.toString());
+ final String specialCases[] = new String[] {
+ traits.U0F7F(),
+ traits.U0F35(),
+ traits.U0F37()
+ };
+ final String specialCaseEwts[] = new String[] {
+ EWTSTraits.instance().U0F7F(),
+ EWTSTraits.instance().U0F35(),
+ EWTSTraits.instance().U0F37()
+ };
+ final boolean ignoreSpecialCase[] = new boolean[] {
+ false, // Don't ignore this -- it's Sanskrit.
+ // ['jamH] should be illegal EWTS.
+ // (TODO(dchandler): ask)
+ true,
+ true,
+ };
+ boolean hasSpecialCase[] = new boolean[] { false, false, false, };
+ for (int j = 0; j < specialCases.length; j++) {
+ if (null != specialCases[j]) {
+ int where;
+ if (p.getRight() != null
+ && (where = p.getRight().indexOf(specialCases[j])) >= 0) {
+ // this guy is his own TGCPair.
+ hasSpecialCase[j] = true;
+ StringBuffer rr = new StringBuffer(p.getRight());
+ rr.replace(where, where + specialCases[j].length(), "");
+ if (rr.length() > where && '+' == rr.charAt(where)) {
+ rr.deleteCharAt(where);
+ } else if (where > 0 && rr.length() > where - 1
+ && '+' == rr.charAt(where - 1)) {
+ rr.deleteCharAt(where - 1);
+ }
+ p = new TPair(traits, p.getLeft(), rr.toString());
+ }
+ }
}
boolean hasNonAVowel = (!traits.aVowel().equals(p.getRight())
&& null != p.getRight());
@@ -586,9 +616,12 @@ class TPairList {
? TGCPair.TYPE_TIBETAN
: TGCPair.TYPE_OTHER))));
pl.add(tp);
- if (add_U0F7F) {
- indexList.add(new Integer(index));
- pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER)); // TODO(DLC)[EWTS->Tibetan]
+ for (int j = 0; j < specialCases.length; j++) {
+ if (hasSpecialCase[j] && !ignoreSpecialCase[j]) {
+ indexList.add(new Integer(index));
+ pl.add(new TGCPair(specialCaseEwts[j],
+ null, TGCPair.TYPE_OTHER));
+ }
}
}
}
diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java
index e2c7e9e..b7418e2 100644
--- a/source/org/thdl/tib/text/ttt/TPairListFactory.java
+++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java
@@ -20,6 +20,8 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
+import org.thdl.tib.text.TibetanMachineWeb;
+
/** A factory for creating {@link TPairList TPairLists} from
* Strings of ACIP.
* @author David Chandler */
@@ -111,12 +113,15 @@ class TPairListFactory {
return tail;
}
+ private static final boolean debug = false;
+
/** See {@link TTraits#breakTshegBarIntoChunks}. */
static TPairList[] breakEWTSIntoChunks(String ewts)
throws IllegalArgumentException
{
EWTSTraits traits = EWTSTraits.instance();
TPairList pl = breakHelperEWTS(ewts, traits);
+ if (debug) System.out.println("breakEWTSIntoChunks: pl is " + pl);
TPairList npl = pl;
// TODO(DLC)[EWTS->Tibetan]: this crap ain't workin' for kaHM. But kaeM and kaMe shouldn't work, right? Figure out what EWTS really says...
@@ -148,14 +153,18 @@ class TPairListFactory {
}
}
}
+ pl = null;
+ if (debug) System.out.println("breakEWTSIntoChunks: npl is " + npl);
TPairList nnpl;
if (true) {
+ // TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it!
+
// Collapse ( . wowel1) ( . wowel2) into (
// . wowel1+wowel2). Then collapse (* . a) ( . x) into (*
// . x). Also, if an a-chen (\u0f68) is implied, then
// insert it.
- TPairList xnnpl = new TPairList(traits, pl.size());
+ TPairList xnnpl = new TPairList(traits, npl.size());
for (int i = 0; i < npl.size(); ) {
TPair p = npl.get(i);
int set_i_to = i + 1;
@@ -184,7 +193,7 @@ class TPairListFactory {
i = set_i_to;
}
- nnpl = new TPairList(traits, pl.size());
+ nnpl = new TPairList(traits, xnnpl.size());
// (* . a ) ( . x) ... ( . y) -> (* . a+x+...+y)
for (int i = 0; i < xnnpl.size(); ) {
TPair p = xnnpl.get(i);
@@ -221,7 +230,7 @@ class TPairListFactory {
}
} else {
// TODO(DLC)[EWTS->Tibetan]: this block is not executing. kill it after testing and thinking
- nnpl = new TPairList(traits, pl.size());
+ nnpl = new TPairList(traits, npl.size());
for (int i = npl.size() - 1; i >= 0; i--) {
TPair p = npl.get(i);
@@ -234,13 +243,91 @@ class TPairListFactory {
nnpl.prepend(p);
}
}
+ npl = null;
+ if (debug) System.out.println("breakEWTSIntoChunks: nnpl is " + nnpl);
+
+ TPairList nnnpl = transformNativeStacks(traits, nnpl);
+ if (debug) System.out.println("breakEWTSIntoChunks: nnnpl is " + nnnpl);
- // TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it!
return new TPairList[] {
- nnpl, null
+ nnnpl, null
};
}
+ /** EWTS helper function that transforms native stacks to include
+ * pluses: [(ph . ) (y . ) (w . *)] -> [(ph . +) (y . +) (w
+ * . *)], e.g.
+ * @param traits must mesh with orig */
+ private static TPairList transformNativeStacks(TTraits traits,
+ TPairList orig) {
+ // TODO(DLC)[EWTS->Tibetan]: instead of using
+ // TibetanMachineWeb's knowledge of the hash keys in tibwn.ini
+ // (ph-y-w is a hash key, e.g.), we assume that 3 is the
+ // maximum size of a native stack.
+ final int maxNativeStackSize = 3;
+ // [(s . *)] alone doesn't need transformation. [(s . )
+ // (k . *)] does:
+ final int minNativeStackSize = 2;
+
+ TPairList result = new TPairList(traits, orig.size());
+ for (int i = 0; i < orig.size();
+ ) { // we increment i inside the loop
+ // If, upon looking ahead, we see a native stack of
+ // size 3, we transform three pairs. Failing that, if
+ // we see a native stack of size 2, we transform it.
+
+ boolean found_something = false;
+ TPair p[] = new TPair[maxNativeStackSize];
+ for (int j = 0; j < maxNativeStackSize; j++) {
+ if (i + j < orig.size())
+ p[j] = orig.get(i + j);
+ else
+ p[j] = null;
+ }
+ // Now p[0] is current pair, p[1] is the one after that, etc.
+
+ for (int nss = maxNativeStackSize; nss >= minNativeStackSize;
+ nss--) {
+ String hash_key = "";
+ int good = 0;
+ for (int k = 0; k < nss - 1; k++) {
+ if (null != p[k]
+ && null != p[k].getLeft()
+ && null == p[k].getRight()) {
+ hash_key += p[k].getLeft() + "-";
+ ++good;
+ }
+ }
+ if (null != p[nss - 1]
+ && null != p[nss - 1].getLeft()
+ && !"+".equals(p[nss - 1].getRight())) {
+ hash_key += p[nss - 1].getLeft();
+ ++good;
+ }
+ if (nss == good
+ && TibetanMachineWeb.isKnownHashKey(hash_key)) {
+ found_something = true;
+ for (int n = 0; n < nss - 1; n++) {
+ ++i;
+ result.append(new TPair(traits,
+ p[n].getLeft(), "+"));
+ }
+ ++i;
+ result.append(p[nss - 1]);
+ break; // for ph-y-w etc.
+ }
+ }
+ if (!found_something) {
+ ++i;
+ result.append(p[0]);
+ }
+ }
+ if (result.size() != orig.size()) {
+ throw new Error("orig=" + orig + "\nresult=" + result); // TODO(dchandler): make this an assertion.
+ }
+ return result;
+ }
+
// TODO(DLC)[EWTS->Tibetan]: doc
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) {
diff --git a/source/org/thdl/tib/text/ttt/TParseTree.java b/source/org/thdl/tib/text/ttt/TParseTree.java
index 14eaa18..7c903d3 100644
--- a/source/org/thdl/tib/text/ttt/TParseTree.java
+++ b/source/org/thdl/tib/text/ttt/TParseTree.java
@@ -105,26 +105,33 @@ class TParseTree {
ParseIterator pi = getParseIterator();
while (pi.hasNext()) {
TStackList sl = pi.next();
- if (!sl.isClearlyIllegal()) {
+ BoolTriple bt = sl.isLegalTshegBar(false);
+ if (!sl.isClearlyIllegal(bt.candidateType)) {
sll.add(sl);
}
}
return sll;
}
+ private static final boolean debug = false;
+
/** Returns the best parse, if there is a unique parse that is
* clearly preferred to other parses. Basically, if there's a
* unique legal parse, you get it. If there's not, but there is
* a unique non-illegal parse, you get it. If there's not a
* unique answer, null is returned. */
public TStackList getBestParse() {
+ if (debug) System.out.println("getBestParse: parse tree is " + toString());
TStackListList up = getUniqueParse(false);
- if (up.size() == 1)
+ if (up.size() == 1) {
+ if (debug) System.out.println("getBestParse: unique parse");
return up.get(0);
+ }
up = getNonIllegalParses();
int sz = up.size();
if (sz == 1) {
+ if (debug) System.out.println("getBestParse: sole non-illegal parse");
return up.get(0);
} else if (sz > 1) {
// TODO(DLC)[EWTS->Tibetan]: does this still happen? If so, when?
@@ -132,12 +139,14 @@ class TParseTree {
// System.out.println("SHO NUFF, >1 non-illegal parses still happens");
// {PADMA}, for example. Our technique is to go from the
- // left and stack as much as we can. So {PA}{D}{MA} is
- // inferior to {PA}{D+MA}, and {PA}{D+MA}{D}{MA} is
- // inferior to {PA}{D+MA}{D+MA}. We do not look for the
- // minimum number of glyphs, though -- {PA}{N+D}{B+H+R}
- // and {PA}{N}{D+B+H+R} tie by that score, but the former
- // is the clear winner.
+ // left and stack as much as we can (when
+ // !traits.stackingMustBeExplicit() only!
+ // TODO(DLC)[EWTS->Tibetan]: fix these comments). So
+ // {PA}{D}{MA} is inferior to {PA}{D+MA}, and
+ // {PA}{D+MA}{D}{MA} is inferior to {PA}{D+MA}{D+MA}. We
+ // do not look for the minimum number of glyphs, though --
+ // {PA}{N+D}{B+H+R} and {PA}{N}{D+B+H+R} tie by that
+ // score, but the former is the clear winner.
// We give a warning about these, optionally, so that
// users can produce output that even a dumb ACIP reader
@@ -177,11 +186,27 @@ class TParseTree {
}
++stackNumber;
}
- if (candidates.size() == 1)
+ if (candidates.size() == 1) {
+ if (debug) System.out.println("getBestParse: one candidate");
return up.get(((Integer)candidates.get(0)).intValue());
- else
+ } else {
+ if (debug) {
+ System.out.println("getBestParse: no parse, num candidates="
+ + candidates.size());
+ for (int i = 0; i < candidates.size(); i++) {
+ System.out.println("candidate " + i + " is "
+ + up.get(((Integer)candidates.get(i)).intValue()));
+ if (i + 1 < candidates.size()) {
+ boolean eq = (up.get(((Integer)candidates.get(i)).intValue()).equals(up.get(((Integer)candidates.get(i + 1)).intValue())));
+ System.out.println("This candidate and the next are"
+ + (eq ? "" : " not") + " equal.");
+ }
+ }
+ }
return null;
+ }
}
+ if (debug) System.out.println("getBestParse: no non-illegal parses");
return null;
}
@@ -480,9 +505,10 @@ n+t+s
middle = pl.get(1).getLeft();
right = pl.get(2).getLeft();
if (pl.get(0).getRight() == null
- && !pl.get(1).endsACIPStack()
- && pl.get(2).endsACIPStack()
+ && !pl.get(1).endsStack()
+ && pl.get(2).endsStack()
&& null != left && null != right) {
+ // TODO(DLC)[EWTS->Tibetan]: This is ACIP-specific.
if (("D".equals(left) && "G".equals(middle) && "R".equals(right))
|| ("D".equals(left) && "G".equals(middle) && "Y".equals(right))) {
if (pl.size() == 3) {
@@ -503,7 +529,7 @@ n+t+s
String left, right;
left = pl.get(0).getLeft();
right = pl.get(1).getLeft();
- if (pl.get(0).getRight() == null && pl.get(1).endsACIPStack()
+ if (pl.get(0).getRight() == null && pl.get(1).endsStack()
&& null != left && null != right) {
if (("D".equals(left) && "B".equals(right))
|| ("B".equals(left) && "D".equals(right))
diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java
index aca0fd0..2bdd110 100644
--- a/source/org/thdl/tib/text/ttt/TStackList.java
+++ b/source/org/thdl/tib/text/ttt/TStackList.java
@@ -21,6 +21,7 @@ package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import java.util.ListIterator;
+import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.TGCList;
import org.thdl.tib.text.TibTextUtils;
@@ -136,17 +137,21 @@ class TStackList {
StringBuffer warnings = new StringBuffer();
String candidateType
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests);
+ if (ddebug) System.out.println("ddebug: tgclist is " + tgcList + "\n warnings is " + warnings + "\n candidateType is " + candidateType);
// preliminary answer:
boolean isLegal = (candidateType != "invalid");
if (isLegal) {
- if (isClearlyIllegal())
+ if (isClearlyIllegal(candidateType))
isLegal = false;
TPairList firstStack = this.get(0);
+ // NOTE: In EWTS, [([b'dgm] . ) (...] is illegal unless
+ // this is a legal tsheg bar featuring a prefix. (I'm not
+ // sure this is enforced here, though...)
if (1 == firstStack.size()
&& firstStack.get(0).isPrefix()
- && null == firstStack.get(0).getRight() // because GAM is legal
+ && null == firstStack.get(0).getRight() // ACIP {GAM}/EWTS {gam} is legal
&& !(candidateType.startsWith("prefix")
|| candidateType.startsWith("appendaged-prefix"))) {
isLegal = false;
@@ -163,7 +168,8 @@ class TStackList {
TPairList pl = get(pairListIndex);
TPair p = pl.get(pl.size() - 1);
isLegalAndHasAVowelOnRoot
- = (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g. TODO(DLC)[EWTS->Tibetan]: ???
+ = (p.getRight() != null
+ && p.getRight().startsWith(p.getTraits().aVowel())); // could be ACIP {A:}, e.g.
if (isLegalAndHasAVowelOnRoot)
break;
}
@@ -178,7 +184,34 @@ class TStackList {
/** Returns true if and only if this stack list contains a clearly
* illegal construct. An example of such is a TPair (V . something). */
- boolean isClearlyIllegal() {
+ boolean isClearlyIllegal(String candidateType) {
+ if (isVeryClearlyIllegal())
+ return true;
+ int choices[]
+ = TibTextUtils.getIndicesOfRootForCandidateType(candidateType);
+ int max = size() - 1; // TODO(DLC)[EWTS->Tibetan]:
+ // optionally, use just size(). This
+ // will make [g] and [bad+man] illegal,
+ // e.g.
+ for (int i = 0; i < max; i++) {
+ // We want EWTS [gga] to be illegal because ga does not
+ // take a gao prefix and we want EWTS [trna] to be
+ // illegal because a disambiguator or wowel is required to
+ // end a stack unless that stack is a prefix, suffix, or
+ // postsuffix.
+ if ((choices[0] < 0 && choices[1] < 0)
+ || (choices[0] == i && choices[1] < 0)) {
+ TPair last = get(i).get(get(i).size() - 1);
+ if (last.getTraits().stackingMustBeExplicit()
+ && last.getRight() == null) {
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+
+ private boolean isVeryClearlyIllegal() {
// check for {D}{VA} sorts of things:
for (int i = 0; i < size(); i++) {
if (get(i).getACIPError("THIS MAKES IT FASTER AND IS SAFE, DON'T WORRY",
@@ -286,7 +319,7 @@ class BoolTriple implements Comparable {
}
/** True if and only if {@link #isLegal} is true and there may be
- an ACIP "A" vowel on the root stack. */
+ a TTraits.aVowel() vowel on the root stack. */
boolean isLegalAndHasAVowelOnRoot;
BoolTriple(boolean isLegal,
boolean isLegalAndHasAVowelOnRoot,
@@ -322,4 +355,7 @@ class BoolTriple implements Comparable {
BoolTriple b = (BoolTriple)o;
return score() - b.score();
}
+
+ // NOTE: TibTextUtils.getIndicesOfRootForCandidateType(candidateType)
+ // is useful.
}
diff --git a/source/org/thdl/tib/text/ttt/TString.java b/source/org/thdl/tib/text/ttt/TString.java
index 90fb9d1..c84fd83 100644
--- a/source/org/thdl/tib/text/ttt/TString.java
+++ b/source/org/thdl/tib/text/ttt/TString.java
@@ -66,9 +66,8 @@ public class TString {
&& type != END_SLASH
&& (type != UNICODE_CHARACTER
|| !(UnicodeUtils.isInTibetanRange(ch = getText().charAt(0))
- // EWTS maps some TMW glyphs to this Unicode
- // private-use area (PUA):
- || (ch >= '\uF021' && ch <= '\uF0FF'))));
+ || (ch >= EWTSTraits.PUA_MIN
+ && ch <= EWTSTraits.PUA_MAX))));
}
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME: what are EWTS comments?) */
diff --git a/source/org/thdl/tib/text/ttt/TTGCList.java b/source/org/thdl/tib/text/ttt/TTGCList.java
index 6eca573..bef01bb 100644
--- a/source/org/thdl/tib/text/ttt/TTGCList.java
+++ b/source/org/thdl/tib/text/ttt/TTGCList.java
@@ -23,7 +23,10 @@ import java.util.ArrayList;
import org.thdl.tib.text.TGCList;
import org.thdl.tib.text.TGCPair;
-/** A list of grapheme clusters.
+/** A list of grapheme clusters. If you use this for anything other
+ * than testing the legality (the Tibetanness, if you will) of a
+ * tsheg-bar, then you'll probably fail because U+0F7F, U+0F35, and
+ * U+0F37 get special treatment.
*
* @author David Chandler */
class TTGCList implements TGCList {
@@ -35,7 +38,9 @@ class TTGCList implements TGCList {
/** Don't use this. */
private TTGCList() { }
- /** Creates a TGCList. */
+ /** Creates a TGCList. Note that U+0F7F, U+0F35, and U+0F37 get
+ * special treatment because the sole use of this class is for
+ * testing the legality of a tsheg bar. */
public TTGCList(TStackList sl) {
al = new ArrayList();
stackIndices = new ArrayList();
diff --git a/source/org/thdl/tib/text/ttt/TTraits.java b/source/org/thdl/tib/text/ttt/TTraits.java
index 645fe52..ac2aee8 100644
--- a/source/org/thdl/tib/text/ttt/TTraits.java
+++ b/source/org/thdl/tib/text/ttt/TTraits.java
@@ -211,4 +211,24 @@ public interface TTraits {
in a tsheg bar. (EWTS's list of standard stacks comes into
play; ACIP always returns true.) */
boolean couldBeValidStack(TPairList pl);
+
+ /** Returns true if stacking happens only via the '+' operator.
+ * Otherwise, stacking is greedy: for the most part we stack up
+ * until we hit something that stops us, like a vowel (though
+ * prefixes are special). NOTE: In EWTS, native stacks (EWTS
+ * [phywa], e.g.) are transformed by an early pass to use '+'. */
+ boolean stackingMustBeExplicit();
+
+ // TODO(dchandler): If there exists more than one transliteration
+ // for \u0f7f or the like, do we handle both equally well? Must
+ // we?
+
+ /** The transliteration of \u0f7f. */
+ String U0F7F();
+
+ /** The transliteration of \u0f35. */
+ String U0F35();
+
+ /** The transliteration of \u0f37. */
+ String U0F37();
}
diff --git a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
index fcbdab8..93538a8 100644
--- a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
@@ -59,13 +59,13 @@ public abstract class TTshegBarScanner {
errors, maxErrors, shortMessages, warningLevel);
}
- /** Scans a stream of transliteration into tsheg bars. If errors is
- * non-null, error messages will be appended to it. You can
+ /** Scans a stream of transliteration into tsheg bars. If errors
+ * is non-null, error messages will be appended to it. You can
* recover both errors and (optionally) warnings (modulo offset
* information) from the result, though. They will be short
* messages iff shortMessages is true. Returns a list of
- * TStrings that is the scan, or null if more than maxErrors
- * occur.
+ * TStrings that is the scan, or null if maxErrors is nonnegative
+ * and more than maxErrors errors occur.
*
* This is not so efficient; copies the whole stream into
* memory first.