diff --git a/source/org/thdl/tib/input/DuffPaneTest.java b/source/org/thdl/tib/input/DuffPaneTest.java
index 102e256..55705c9 100644
--- a/source/org/thdl/tib/input/DuffPaneTest.java
+++ b/source/org/thdl/tib/input/DuffPaneTest.java
@@ -102,19 +102,23 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("gya");
ensureKeysGiveCorrectWylie("g.ya");
ensureKeysGiveCorrectWylie("bya");
- ensureKeysGiveCorrectWylie("b.ya");
+ ensureKeysGiveCorrectWylie("b.ya", "baya");
ensureKeysGiveCorrectWylie("mya");
- ensureKeysGiveCorrectWylie("m.ya");
- ensureKeysGiveCorrectWylie("'ya");
- ensureKeysGiveCorrectWylie("'.ya", "'ya");
- ensureKeysGiveCorrectWylie("dya");
- ensureKeysGiveCorrectWylie("d.ya", "dya");
+ ensureKeysGiveCorrectWylie("m.ya", "maya");
+ ensureKeysGiveCorrectWylie("'ya", "'aya");
+ ensureKeysGiveCorrectWylie("'.ya", "'aya");
+ ensureKeysGiveCorrectWylie("dya",
+ "daya");
+ ensureKeysGiveCorrectWylie("d.ya",
+ "daya");
ensureKeysGiveCorrectWylie("grwa");
- ensureKeysGiveCorrectWylie("g.rwa");
+ ensureKeysGiveCorrectWylie("g.rwa",
+ "garwa");
ensureKeysGiveCorrectWylie("gra");
ensureKeysGiveCorrectWylie("dra");
ensureKeysGiveCorrectWylie("drwa");
- ensureKeysGiveCorrectWylie("d.rwa");
+ ensureKeysGiveCorrectWylie("d.rwa",
+ "darwa");
ensureKeysGiveCorrectWylie("g.r", "gar");
ensureKeysGiveCorrectWylie("d.r", "dar");
ensureKeysGiveCorrectWylie("'.r", "'ar");
@@ -134,7 +138,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("t.sa",
"tas");
- ensureKeysGiveCorrectWylie("d.za");
+ ensureKeysGiveCorrectWylie("d.za", "daza");
ensureKeysGiveCorrectWylie("dza");
ensureKeysGiveCorrectWylie("s.ha",
@@ -219,7 +223,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("b.lag");
ensureKeysGiveCorrectWylie("blg",
- "blga");
+ "balga");
ensureKeysGiveCorrectWylie("b.las",
"bals");
@@ -244,21 +248,24 @@ public class DuffPaneTest extends TestCase {
"bras");
ensureKeysGiveCorrectWylie("bras");
- ensureKeysGiveCorrectWylie("d.wa");
+ ensureKeysGiveCorrectWylie("d.wa",
+ "dawa");
ensureKeysGiveCorrectWylie("dawa",
- "d.wa");
+ "dawa");
ensureKeysGiveCorrectWylie("dwa");
- ensureKeysGiveCorrectWylie("g.wa");
+ ensureKeysGiveCorrectWylie("g.wa",
+ "gawa");
ensureKeysGiveCorrectWylie("gawa",
- "g.wa");
+ "gawa");
ensureKeysGiveCorrectWylie("gwa");
ensureKeysGiveCorrectWylie("'.wa",
- "'wa");
+ "'awa");
ensureKeysGiveCorrectWylie("'awa",
- "'wa");
- ensureKeysGiveCorrectWylie("'wa");
+ "'awa");
+ ensureKeysGiveCorrectWylie("'wa",
+ "'awa");
ensureKeysGiveCorrectWylie("gyg",
"g.yag");
@@ -282,7 +289,8 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("ma.a.asa",
"mas");
- ensureKeysGiveCorrectWylie("'ka");
+ ensureKeysGiveCorrectWylie("'ka",
+ "'aka");
ensureKeysGiveCorrectWylie("'gas");
@@ -319,8 +327,9 @@ public class DuffPaneTest extends TestCase {
"lamanga");
ensureKeysGiveCorrectWylie("b.m.ng",
- "bmang");
- ensureKeysGiveCorrectWylie("bmang");
+ "bamanga");
+ ensureKeysGiveCorrectWylie("bmang",
+ "bamanga");
ensureKeysGiveCorrectWylie("gdams");
ensureKeysGiveCorrectWylie("g.d.m.s.",
@@ -372,7 +381,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("fivikikhigingicichijinyitithidinipiphibimitsitshidziwizhizi'iyirilishisihiTiThiDiNiShi");
ensureKeysGiveCorrectWylie("don't touch my coffee/that makes me very angry/supersize my drink",
- "dona'ata tocha mya cofafe/thata mkes me veraya angaraya/superasize mya drinaka");
+ "dona'ata tocha mya cofafe/thata makesa me veraya angaraya/superasize mya drinaka");
}
}
diff --git a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected
index bdcd796..be3a254 100644
--- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected
+++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected
@@ -28,7 +28,7 @@ zur mig nyag phran tsam gyis dge ba'i gzugs can 'dus ma byas//\par
\par
yid 'ong bzhin ras zla gzhon 'khor lo gnyis skyes la//\par
'khrul ba ster yang 'phyang mo sel byed mgo skyes kyi//\par
-bai DUr mthing kha'i lan bu rab 'phyang dbyangs can ma//\par
+bai DUra mthing kha'i lan bu rab 'phyang dbyangs can ma//\par
smra ba'i dbang phyug ngag gi rgyal po nyer grub mdzod//\par
\par
gangs can lha lam yangs pa'i khyon 'dir rgyal ba'i bstan pa bcu gnyis bdag po'i gur khang mchog/\par
diff --git a/source/org/thdl/tib/text/TGCPair.java b/source/org/thdl/tib/text/TGCPair.java
index d681cbd..9049b98 100644
--- a/source/org/thdl/tib/text/TGCPair.java
+++ b/source/org/thdl/tib/text/TGCPair.java
@@ -25,7 +25,7 @@ package org.thdl.tib.text;
context-insensitive THDL Extended Wylie representation. NOTE
WELL: this is not a real grapheme cluster; I'm misusing the term
(FIXME). It's actually whole or part of one. It's part of one
- when this is a vowel or U+0F7F alone.
+ when this is U+0F7F alone.
@author David Chandler */
public class TGCPair {
@@ -37,14 +37,84 @@ public class TGCPair {
public static final int SANSKRIT_WITHOUT_VOWEL = 5;
public static final int SANSKRIT_WITH_VOWEL = 6;
- public String wylie;
- public int classification;
- public TGCPair(String wylie, int classification) {
- this.wylie = wylie;
- this.classification = classification;
+ public static final int TYPE_OTHER = 31;
+ public static final int TYPE_SANSKRIT = 32;
+ public static final int TYPE_TIBETAN = 33;
+
+ // Sanskrit or Tibetan consonant, or number, or oddball:
+ private String consonantWylie;
+ private String vowelWylie;
+ public String getConsonantWylie() {
+ return consonantWylie;
}
+ public String getVowelWylie() {
+ return vowelWylie;
+ }
+ /** Kludge: sets the consonant Wylie to x and clears the vowel Wylie. */
+ public void setWylie(String x) {
+ consonantWylie = x;
+ vowelWylie = null;
+ }
+ public String getWylie() {
+ StringBuffer b = new StringBuffer();
+ if (consonantWylie != null) {
+ // we may have {p-y}, but the user wants to see {py}.
+ for (int i = 0; i < consonantWylie.length(); i++) {
+ char ch = consonantWylie.charAt(i);
+ if ('-' != ch)
+ b.append(ch);
+ }
+ }
+ if (vowelWylie != null)
+ b.append(vowelWylie);
+ return b.toString();
+ }
+ public int classification;
+ /** Constructs a new TGCPair with (Tibetan or Sanskrit) consonant
+ * consonantWylie and vowel vowelWylie. Use
+ * classification==TYPE_OTHER for numbers, lone vowels, marks,
+ * etc. Use classification==TYPE_TIBETAN for Tibetan (not
+ * Tibetanized Sanskrit) and classification==TYPE_SANSKRIT for
+ * Tibetanized Sanskrit. */
+ public TGCPair(String consonantWylie, String vowelWylie, int classification) {
+ if ("".equals(vowelWylie))
+ vowelWylie = null;
+ // Technically, we don't need the following check, but it's
+ // nice for consistency's sake.
+ if ("".equals(consonantWylie))
+ consonantWylie = null;
+
+ // DLC FIXME: for speed, make these assertions:
+ if (classification != TYPE_OTHER
+ && classification != TYPE_TIBETAN
+ && classification != TYPE_SANSKRIT) {
+ throw new IllegalArgumentException("Bad classification " + classification + ".");
+ }
+ int realClassification = -37;
+ if (vowelWylie == null && classification == TYPE_TIBETAN)
+ realClassification = CONSONANTAL_WITHOUT_VOWEL;
+ if (vowelWylie != null && classification == TYPE_TIBETAN)
+ realClassification = CONSONANTAL_WITH_VOWEL;
+ if (vowelWylie == null && classification == TYPE_SANSKRIT)
+ realClassification = SANSKRIT_WITHOUT_VOWEL;
+ if (vowelWylie != null && classification == TYPE_SANSKRIT)
+ realClassification = SANSKRIT_WITH_VOWEL;
+ if (consonantWylie == null) {
+ if (classification != TYPE_OTHER)
+ throw new IllegalArgumentException("That's the very definition of a lone vowel.");
+ realClassification = LONE_VOWEL;
+ } else {
+ if (classification == TYPE_OTHER)
+ realClassification = OTHER;
+ }
+
+ this.consonantWylie = consonantWylie;
+ this.vowelWylie = vowelWylie;
+ this.classification = realClassification;
+ }
+
public String toString() {
- return "
We run along until we hit whitespace or punctuation. We take * everything before that and we see if it's a legal Tibetan tsheg bar, @@ -1480,4 +1486,90 @@ public class TibTextUtils implements THDLWylieConstants { } return rv; } + + /** Returns true if and only if the stack with Wylie root + * can take the prefix prefix. */ + private static boolean isLegalPrefixRootCombo(String prefix, String root) { + // This will be decomposed enough. If you can decompose it, + // then it doesn't take a prefix! + if (!TibetanMachineWeb.isKnownHashKey(root)) { + root = root.replace('+', '-'); + if (!TibetanMachineWeb.isKnownHashKey(root)) { + throw new Error("root is, now, " + root); // FIXME: make this an assertion + } + } + String ru = TibetanMachineWeb.getUnicodeForWylieForGlyph(root); + + // ru may be for (head, root, sub), (head, root), (root), or + // (root, sub). Try all possibilities that are possible with + // a String of length ru. If there's a wa-zur, then we say + // (FIXME: do we say correctly?) that a stack with wa-zur can + // take a prefix if and only if the stack without can take a + // prefix. + + if (ru == null) throw new Error("how? root is " + root); // FIXME: make this an assertion + int rl = ru.length(); + if (ru.charAt(rl - 1) == UnicodeConstants.EWSUB_wa_zur) + --rl; // forget about wa-zur: see above. + if (rl == 2) { + char ch0 = ru.charAt(0); + char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1)); + + // (head, root) and (root, sub) are possibilities. 
+ if (ACHUNG.equals(prefix)) { + return LegalTshegBar.takesAchungPrefix(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("b".equals(prefix)) { + return LegalTshegBar.takesBao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("m".equals(prefix)) { + return LegalTshegBar.takesMao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("g".equals(prefix)) { + return LegalTshegBar.takesGao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("d".equals(prefix)) { + return LegalTshegBar.takesDao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else { + throw new IllegalArgumentException("prefix is " + prefix); + } + } else if (rl == 1) { + char ch0 = ru.charAt(0); + // (root) is the only choice. 
+ if (ACHUNG.equals(prefix)) { + return LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("b".equals(prefix)) { + return LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("m".equals(prefix)) { + return LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("g".equals(prefix)) { + return LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("d".equals(prefix)) { + return LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else { + throw new IllegalArgumentException("prefix is " + prefix); + } + } else if (rl == 3) { + char ch0 = ru.charAt(0); + char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1)); + char ch2 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(2)); + // (head, root, sub) is the only choice. + if (ACHUNG.equals(prefix)) { + return LegalTshegBar.takesAchungPrefix(ch0, ch1, ch2); + } else if ("b".equals(prefix)) { + return LegalTshegBar.takesBao(ch0, ch1, ch2); + } else if ("m".equals(prefix)) { + return LegalTshegBar.takesMao(ch0, ch1, ch2); + } else if ("g".equals(prefix)) { + return LegalTshegBar.takesGao(ch0, ch1, ch2); + } else if ("d".equals(prefix)) { + return LegalTshegBar.takesDao(ch0, ch1, ch2); + } else { + throw new IllegalArgumentException("prefix is " + prefix); + } + } else { + return false; + } + } } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index 8ee9fb2..6200473 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -178,14 +178,19 @@ public class TibetanMachineWeb implements THDLWylieConstants { - // NOTE WELL: if you delete from consonants, numbers, vowels, or - // others, you'll change the way Jskad's Extended Wylie keyboard - // 
works, yes, but you'll also change TMW->Wylie. + // NOTE WELL: if you delete from tibetanConsonants, + // otherConsonants, numbers, vowels, or others, you'll change the + // way Jskad's Extended Wylie keyboard works, yes, but you'll also + // change TMW->Wylie. - /** comma-delimited list of supported consonants (Tibetan and - Tibetanized Sanskrit): */ - private static final String consonants - = "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz"; + /** comma-delimited list of supported Tibetan consonants: */ + private static final String tibetanConsonants + = "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a"; + + /** comma-delimited list of supported non-Tibetan consonants, such + * as Sanskrit consonants: */ + private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit. + = "T,Th,D,N,Sh,v,f,Dz"; /** comma-delimited list of supported numbers (superscribed, subscribed, normal, half-numerals): */ @@ -371,7 +376,7 @@ public class TibetanMachineWeb implements THDLWylieConstants { charSet = new HashSet(); tibSet = new HashSet(); - sTok = new StringTokenizer(consonants, ","); + sTok = new StringTokenizer(tibetanConsonants, ","); while (sTok.hasMoreTokens()) { String ntk; charSet.add(ntk = sTok.nextToken()); @@ -379,6 +384,15 @@ public class TibetanMachineWeb implements THDLWylieConstants { validInputSequences.put(ntk, anyOldObjectWillDo); } + sanskritStackSet = new HashSet(); + sTok = new StringTokenizer(otherConsonants, ","); + while (sTok.hasMoreTokens()) { + String ntk; + charSet.add(ntk = sTok.nextToken()); + sanskritStackSet.add(ntk); + validInputSequences.put(ntk, anyOldObjectWillDo); + } + numberSet = new HashSet(); sTok = new StringTokenizer(numbers, ","); while (sTok.hasMoreTokens()) { @@ -386,7 +400,7 @@ public class TibetanMachineWeb implements THDLWylieConstants { // do it in so that Jskad has the same // TMW->Wylie conversion regardless of whether or not it 
// chooses to support inputting numbers. Likewise for - // consonants, others, and vowels. + // tibetanConsonants, otherConsonants, others, and vowels. String ntk; charSet.add(ntk = sTok.nextToken()); numberSet.add(ntk); @@ -427,8 +441,6 @@ public class TibetanMachineWeb implements THDLWylieConstants { boolean ignore = false; - sanskritStackSet = new HashSet(); - while ((line = in.readLine()) != null) { if (line.startsWith("")) { //line is command if (line.equalsIgnoreCase("")) { @@ -1182,6 +1194,23 @@ public static boolean hasGlyph(String hashKey) { return true; } +/** Returns the Unicode correspondence for the Wylie wylie, which must + * be Wylie returned by getWylieForGlyph(int, int, boolean[]). + * Returns null if the Unicode correspondence is nonexistent or + * unknown. */ +public static String getUnicodeForWylieForGlyph(String wylie) { + DuffCode dc = getGlyph(wylie); + return mapTMWtoUnicode(dc.getFontNum() - 1, dc.getCharNum()); +} + +/** +* Returns true if and only if hashKey is a known hash key from tibwn.ini. +*/ +public static boolean isKnownHashKey(String hashKey) { + DuffCode[] dc = (DuffCode[])tibHash.get(hashKey); + return (null != dc); +} + /** * Gets a glyph for this hash key. Hash keys are not identical to Extended * Wylie. The hash key for a Tibetan stack separates the members of the stack @@ -1193,7 +1222,7 @@ public static boolean hasGlyph(String hashKey) { public static DuffCode getGlyph(String hashKey) { DuffCode[] dc = (DuffCode[])tibHash.get(hashKey); if (null == dc) - throw new Error("It is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears."); + throw new Error("Hash key " + hashKey + " not found; it is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' 
line appears."); return dc[TMW]; } diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index 175d57c..160e3b9 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -98,13 +98,6 @@ __TILDE__~93,5~~9,91~~~~~~~none - -// 0F5F,0F39 might work, but the OpenType font's author must've had -// Dza in mind if it does. Note that the bottommost horizontal stroke -// goes upward on U+0F5F and downward on U+0F5B. -Dz~146,5~~10,42~~~~~~~none -f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39 -v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39 k~33,1~1,92~1,33~1,109~1,111~1,123~1,125~10,118~10,120~0F40 kh~34,1~~1,34~1,109~1,118~1,123~1,125~10,114~10,123~0F41 g~35,1~1,93~1,35~1,109~1,111~1,123~1,125~10,118~10,120~0F42 @@ -135,11 +128,6 @@ sh~59,1~1,99~1,60~1,109~1,111~1,123~1,125~10,118~10,120~0F64 s~60,1~~1,61~1,109~1,118~1,123~1,125~10,114~10,123~0F66 h~61,1~1,100~1,62~1,109~1,112~1,123~1,125~10,115~10,122~0F67~1,102 a~62,1~~1,63~1,109~1,118~1,123~1,125~10,114~10,123~0F68 -T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A -Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B -D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C -N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E -Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65 r-k~63,1~~1,70~1,109~1,121~1,123~1,125~10,115~10,124~f62,f90 r-g~64,1~~1,71~1,109~1,121~1,123~1,125~10,115~10,124~f62,f92 r-ng~65,1~~1,72~1,109~1,119~1,123~1,125~10,115~10,124~f62,f94 @@ -241,6 +229,17 @@ au~237,1~~8,89~~~~~~~0F7D~~8,104 // DLC FIXME: need -I as well +// 0F5F,0F39 might work, but the OpenType font's author must've had +// Dza in mind if it does. Note that the bottommost horizontal stroke +// goes upward on U+0F5F and downward on U+0F5B. 
+Dz~146,5~~10,42~~~~~~~none +f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39 +v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39 +T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A +Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B +D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C +N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E +Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65 k+Sh~175,1~~1,69~1,109~1,122~1,123~1,125~10,116~10,125~0F69 k+k~33,2~~3,33~1,109~4,120~1,123~1,125~4,106~4,113~f40,f90 k+kh~34,2~~3,34~1,109~4,120~1,123~1,125~4,106~4,113~f40,f91 diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index 5dcb0fc..1e53ad2 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -1266,7 +1266,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesGao(char head, char root, char sub) { + public static boolean takesGao(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ca == root @@ -1298,7 +1298,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesDao(char head, char root, char sub) { + public static boolean takesDao(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ka == root @@ -1312,6 +1312,7 @@ public final class LegalTshegBar || (EWC_pa == root && EWC_ya == sub) || (EWC_ba == root && EWC_ya == sub) || (EWC_ma == root && EWC_ya == sub) + || (EWC_ka == root && EWC_ya == sub) // dkyil, for example || (EWC_ka == root && EWC_ra == sub) || (EWC_ga == root && EWC_ra == 
sub) @@ -1336,7 +1337,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesAchungPrefix(char head, char root, char sub) { + public static boolean takesAchungPrefix(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ga == root @@ -1379,7 +1380,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesMao(char head, char root, char sub) { + public static boolean takesMao(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_kha == root @@ -1418,11 +1419,12 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesBao(char head, char root, char sub) { + public static boolean takesBao(char head, char root, char sub) { // DLC ask Ten-lo la about Wazur. if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ka == root + || EWC_sa == root // bsams, for example || EWC_ca == root || EWC_ta == root || EWC_tsa == root diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index 544df36..5f18e32 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -232,6 +232,14 @@ public class UnicodeUtils implements UnicodeConstants { /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */ } + /** If ch is in one of the ranges U+0F90-U+0F97, U+0F99-U+0FB9, + * then this returns the same consonant in the range + * U+0F40-U+0F69. If ch is not in that range, this returns + * garbage. 
*/ + public static char getNominalRepresentationOfSubscribedConsonant(char ch) { + return (char)((int)ch-(((int)'\u0F90') - ((int)'\u0F40'))); + } + /** Returns true iff ch corresponds to the Tibetan letter ra. Several Unicode codepoints correspond to the Tibetan letter ra (in its subscribed form or otherwise). Oftentimes, diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java index 03c40fa..d8166ba 100644 --- a/source/org/thdl/tib/text/ttt/ACIPConverter.java +++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java @@ -58,28 +58,46 @@ public class ACIPConverter { ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1); if (null == al) { - System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this"); + System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this"); System.err.println("Tibetan or English input?"); System.err.println(""); - System.err.println("First " + maxErrors + " errors scanning ACIP input file: "); - System.err.println(errors); - System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again."); + if (false) { + // Nobody wants to see this. FIXME: maybe somebody; have an option. + System.err.println("First " + maxErrors + " lexical errors scanning ACIP input file: "); + System.err.println(errors); + } + System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again."); System.exit(1); } + final boolean abortUponScanningError = false; // DLC MAKE ME CONFIGURABLE + // DLC NOW: BAo isn't converting. 
if (errors.length() > 0) { System.err.println("Errors scanning ACIP input file: "); System.err.println(errors); - System.err.println("Exiting; please fix input file and try again."); - System.exit(1); + if (abortUponScanningError) { + System.err.println("Exiting; please fix input file and try again."); + System.exit(1); + } } - convertToUnicode(al, System.out, errors); + StringBuffer warnings = new StringBuffer(); + boolean putWarningsInOutput = true; // DLC make me configurable. + convertToUnicode(al, System.out, errors, warnings, + putWarningsInOutput); if (errors.length() > 0) { System.err.println("Errors converting ACIP input file: "); System.err.println(errors); + System.err.println("The output contains these errors."); System.err.println("Exiting; please fix input file and try again."); System.exit(2); } + if (warnings.length() > 0) { + System.err.println("Warnings converting ACIP input file: "); + System.err.println(warnings); + if (putWarningsInOutput) + System.err.println("The output contains these warnings."); + System.exit(2); + } if (verbose) System.err.println("Converted " + args[1] + " perfectly."); System.exit(0); } @@ -96,19 +114,30 @@ public class ACIPConverter { { throw new Error("DLC UNIMPLEMENTED"); } + // DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a + // space. Treat it as a tsheg only when it appears after a + // syllable or another tsheg. /** Returns UTF-8 encoded Unicode. A bit indirect, so use this * for testing only if performance is a concern. If errors occur * in scanning the ACIP or in converting a tsheg bar, then they - * are appended to errors if errors is non-null. Returns the + * are appended to errors if errors is non-null, as well as + * written to the result. If warnings occur in scanning the ACIP + * or in converting a tsheg bar, then they are appended to + * warnings if warnings is non-null, and they are written to the + * result if writeWarningsToResult is true. 
Returns the * conversion upon perfect success, null if errors occurred. */ public static String convertToUnicode(String acip, - StringBuffer errors) { + StringBuffer errors, + StringBuffer warnings, + boolean writeWarningsToResult) { ByteArrayOutputStream sw = new ByteArrayOutputStream(); ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1); try { - if (null != al && convertToUnicode(al, sw, errors)) { + if (null != al + && convertToUnicode(al, sw, errors, + warnings, writeWarningsToResult)) { return sw.toString("UTF-8"); } else { System.out.println("DLC al is " + al + " and convertToUnicode returned null."); @@ -119,15 +148,25 @@ public class ACIPConverter { } } - /** Writes Unicode to out. If errors occur in converting a - * tsheg bar, then they are appended to errors if errors is - * non-null. Returns true upon perfect success, false if errors - * occurred. + /** Writes Unicode to out. If errors occur in converting a tsheg + * bar, then they are appended to errors if errors is non-null. + * Furthermore, errors are written to out. If writeWarningsToOut + * is true, then warnings also will be written to out. Returns + * true upon perfect success, false if errors occurred. + * @param scan result of ACIPTshegBarScanner.scan(..) 
+ * @param out stream to which to write converted text + * @param errors if non-null, all error messages are appended + * @param warnings if non-null, all warning messages are appended + * to this + * @param writeWarningsToOut if true, then all warning messages + * are written to out in the appropriate places * @throws IOException if we cannot write to out */ public static boolean convertToUnicode(ArrayList scan, OutputStream out, - StringBuffer errors) + StringBuffer errors, + StringBuffer warnings, + boolean writeWarningsToOut) throws IOException { int sz = scan.size(); @@ -139,7 +178,7 @@ public class ACIPConverter { int stype = s.getType(); if (stype == ACIPString.ERROR) { hasErrors = true; - writer.write("[#ERROR CONVERTING ACIP DOCUMENT: "); + writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: "); writer.write(s.getText()); writer.write("]"); } else { @@ -179,6 +218,21 @@ public class ACIPConverter { if (null != errors) errors.append(errorMessage + "\n"); } else { + String warning + = pt.getWarning(false, // DLC: make me configurable + pl, + s.getText()); + if (null != warning) { + if (writeWarningsToOut) { + writer.write("[#WARNING CONVERTING ACIP DOCUMENT: "); + writer.write(warning); + writer.write("]"); + } + if (null != warnings) { + warnings.append(warning); + warnings.append('\n'); + } + } unicode = sl.getUnicode(); if (null == unicode) throw new Error("DLC: HOW?"); } diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java index 2879683..bea56a6 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java @@ -133,16 +133,18 @@ public class ACIPTshegBarScanner { Stack bracketTypeStack = new Stack(); int startSlashIndex = -1; int startParenIndex = -1; + int numNewlines = 0; for (int i = 0; i < sl; i++) { if (i < startOfString) throw new Error("bad reset"); char ch; ch = s.charAt(i); + if (ch == '\n') 
++numNewlines; if (ACIPString.COMMENT == currentType && ch != ']') { if ('[' == ch) { al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n", ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } @@ -157,17 +159,18 @@ public class ACIPTshegBarScanner { al.add(new ACIPString(s.substring(startOfString, i), currentType)); } - al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); + al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1), + ACIPString.ERROR)); if (!waitingForMatchingIllegalClose) { if (null != errors) { - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a truly unmatched close bracket, ] or }.\n"); } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } waitingForMatchingIllegalClose = false; if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a closing bracket without a matching open bracket. 
Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; @@ -249,6 +252,11 @@ public class ACIPTshegBarScanner { || s.substring(i, i + "[BP]".length()).equals("{BP}"))) { thingy = "[BP]"; currentType = ACIPString.BP; + } else if (i + "[BLANK PAGE]".length() <= sl + && (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]") + || s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) { + thingy = "[BLANK PAGE]"; + currentType = ACIPString.BP; } else if (i + "[ BP ]".length() <= sl && (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]") || s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) { @@ -414,11 +422,11 @@ public class ACIPTshegBarScanner { // This is an error. Sometimes [COMMENTS APPEAR // WITHOUT # MARKS]. Though "... [" could cause // this too. - al.add(new ACIPString(s.substring(i, i+1), + al.add(new ACIPString("Found an illegal open bracket: " + s.substring(i, i+1), ACIPString.ERROR)); if (waitingForMatchingIllegalClose) { if (null != errors) { - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n"); } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; @@ -435,7 +443,7 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal open bracket (in context, this is " + inContext + "). 
Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } @@ -477,7 +485,6 @@ public class ACIPTshegBarScanner { if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') { if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3)) && !isNumeric(s.charAt(i+numdigits+4)))) { - al.add(new ACIPString(s.substring(i, i+numdigits+3), ACIPString.ERROR)); String inContext = s.substring(i, i+Math.min(sl-i, 10)); if (inContext.indexOf("\r") >= 0) { inContext = inContext.substring(0, inContext.indexOf("\r")); @@ -488,8 +495,10 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } + al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+numdigits+3; @@ -498,7 +507,6 @@ public class ACIPTshegBarScanner { break; } if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' 
|| s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) { - al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); String inContext = s.substring(i, i+Math.min(sl-i, 10)); if (inContext.indexOf("\r") >= 0) { inContext = inContext.substring(0, inContext.indexOf("\r")); @@ -509,8 +517,10 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } + al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; // DLC FIXME: skip over more? @@ -572,7 +582,9 @@ public class ACIPTshegBarScanner { } // This case, @NNN, must come after the @NNN{AB} case. - if (i+numdigits+1 < sl && s.charAt(i+numdigits+1) == ' ') { + if (i+numdigits+1 < sl && (s.charAt(i+numdigits+1) == ' ' + || s.charAt(i+numdigits+1) == '\n' + || s.charAt(i+numdigits+1) == '\r')) { boolean allAreNumeric = true; for (int k = 1; k <= numdigits; k++) { if (!isNumeric(s.charAt(i+k))) { @@ -591,7 +603,6 @@ public class ACIPTshegBarScanner { } } if (startOfString == i) { - al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); String inContext = s.substring(i, i+Math.min(sl-i, 10)); if (inContext.indexOf("\r") >= 0) { inContext = inContext.substring(0, inContext.indexOf("\r")); @@ -602,8 +613,10 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } + al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). 
@012B is an example of a legal folio marker.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; @@ -626,9 +639,10 @@ public class ACIPTshegBarScanner { * it means /NYA/. We warn about // for this * reason. \\ causes a tsheg-bar error (DLC * FIXME: verify this is so). */ - al.add(new ACIPString("//", ACIPString.ERROR)); + al.add(new ACIPString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.", + ACIPString.ERROR)); if (errors != null) { - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n"); } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; @@ -661,9 +675,10 @@ public class ACIPTshegBarScanner { if (startParenIndex >= 0) { if (ch == '(') { - al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR)); + al.add(new ACIPString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal open parenthesis, (. 
Nesting of parentheses is not allowed.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } else { @@ -674,9 +689,10 @@ public class ACIPTshegBarScanner { currentType = ACIPString.ERROR; } else { if (ch == ')') { - al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR)); + al.add(new ACIPString("Unexpected closing parenthesis, ), found.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Unexpected closing parenthesis, ), found.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } else { @@ -724,10 +740,10 @@ public class ACIPTshegBarScanner { al.add(new ACIPString(s.substring(i, i+1), ACIPString.TIBETAN_PUNCTUATION)); } else { - al.add(new ACIPString(s.substring(i, i+1), + al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".", ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } @@ -772,19 +788,24 @@ public class ACIPTshegBarScanner { al.add(new ACIPString(s.substring(startOfString, i), currentType)); } - al.add(new ACIPString(s.substring(i, i+1), - ACIPString.ERROR)); - if (null != errors) { - if ((int)ch == 65533) { - errors.append("Offset " + i + ": " + if ((int)ch == 65533) { + al.add(new ACIPString("Found an illegal, unprintable character.", + ACIPString.ERROR)); + if (null != errors) + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal, unprintable character.\n"); - } else if ('\\' == ch) { - errors.append("Offset " + i + ": " + } else if ('\\' == ch) { + al.add(new ACIPString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these 
properly. Sorry! Please do complain to the maintainers.", + ACIPString.ERROR)); + if (null != errors) + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n"); - } else { - errors.append("Offset " + i + ": " + } else { + al.add(new ACIPString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".", + ACIPString.ERROR)); + if (null != errors) + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n"); - } } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index b447da1..b3c8c11 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -128,7 +128,7 @@ public class PackageTest extends TestCase { } { - TStackListList legalParses = pt.getUniqueParse(); + TStackListList legalParses = pt.getUniqueParse(false); boolean goodness2 = (expectedLegalParses == null || expectedLegalParses.length == legalParses.size()); for (int i = 0 ; i < legalParses.size(); i++) { @@ -139,18 +139,21 @@ public class PackageTest extends TestCase { || expectedLegalParses.length < i+1 || n.equals(expectedLegalParses[i])); if (!okay || !goodness2) - System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is " + expectedLegalParses[i]); + System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is " + + ((i < expectedLegalParses.length) + ? 
expectedLegalParses[i] + : "not present")); assertTrue(okay); } if (!goodness2) - System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses."); + System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses for ACIP " + acip + "."); assertTrue(goodness2); TStackListList allLegalParses = pt.getLegalParses(); TStackListList decentParses = pt.getNonIllegalParses(); if (pt.getBestParse() == null) { if (legalParses.size() == 0) { if (null != expectedBestParse && !"".equals(expectedBestParse)) { - System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for acip {" + acip + "}"); + System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for ACIP {" + acip + "}"); assertTrue(false); } System.out.print("ACIPNoBestParseError: There is no best parse for the ACIP {" + acip + "}; "); @@ -163,7 +166,7 @@ public class PackageTest extends TestCase { } } else { if (legalParses.size() > 1) { - System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for acip " + acip + ": " + legalParses); + System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for ACIP " + acip + ": " + legalParses); assertTrue(legalParses.size() == 2 && (legalParses.get(0).size() == 1 + legalParses.get(1).size())); @@ -176,7 +179,7 @@ public class PackageTest extends TestCase { if (null != expectedBestParse) { boolean good = pt.getBestParse().equals(expectedBestParse); if (!good) { - System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for acip {" + acip + "}"); + System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " 
for ACIP {" + acip + "}"); } assertTrue(good); } @@ -229,6 +232,116 @@ public class PackageTest extends TestCase { * {@link TPairList#getACIPError()}, and {@link * TPairList#recoverACIP()}. */ public void testBreakACIPIntoChunks() { +tstHelper("GASN"); // ambiguous with regard to prefix rules +tstHelper("BARMA"); // ambiguous with regard to prefix rules +tstHelper("MARDA"); // ambiguous with regard to prefix rules +tstHelper("BBA"); // ambiguous with regard to prefix rules +tstHelper("BBLUGS"); // ambiguous with regard to prefix rules +tstHelper("BDRA"); // ambiguous with regard to prefix rules +tstHelper("BDRAG"); // ambiguous with regard to prefix rules +tstHelper("BDRA'I"); // ambiguous with regard to prefix rules +tstHelper("BDRAL"); // ambiguous with regard to prefix rules +tstHelper("BDRAN"); // ambiguous with regard to prefix rules +tstHelper("BDRANGS"); // ambiguous with regard to prefix rules +tstHelper("BDREN"); // ambiguous with regard to prefix rules +tstHelper("BDRI"); // ambiguous with regard to prefix rules +tstHelper("BDRIS"); // ambiguous with regard to prefix rules +tstHelper("BDROL"); // ambiguous with regard to prefix rules +tstHelper("BDRUG"); // ambiguous with regard to prefix rules +tstHelper("BLCAG"); // ambiguous with regard to prefix rules +tstHelper("BLCI"); // ambiguous with regard to prefix rules +tstHelper("BLKONG"); // ambiguous with regard to prefix rules +tstHelper("BLNGA"); // ambiguous with regard to prefix rules +tstHelper("BLNGAG"); // ambiguous with regard to prefix rules +tstHelper("BMA"); // ambiguous with regard to prefix rules +tstHelper("BMYOD"); // ambiguous with regard to prefix rules +tstHelper("BSALDA"); // ambiguous with regard to prefix rules +tstHelper("BSAMS"); // ambiguous with regard to prefix rules +tstHelper("BSEMS"); // ambiguous with regard to prefix rules +tstHelper("BTSAMS"); // ambiguous with regard to prefix rules +tstHelper("BTSIMS"); // ambiguous with regard to prefix rules +tstHelper("DDANG"); // 
ambiguous with regard to prefix rules +tstHelper("DDAR"); // ambiguous with regard to prefix rules +tstHelper("DDRANGS"); // ambiguous with regard to prefix rules +tstHelper("DDRUG"); // ambiguous with regard to prefix rules +tstHelper("DNAG"); // ambiguous with regard to prefix rules +tstHelper("DNOGS"); // ambiguous with regard to prefix rules +tstHelper("DRBAN"); // ambiguous with regard to prefix rules +tstHelper("DRGYU"); // ambiguous with regard to prefix rules +tstHelper("DRTOG"); // ambiguous with regard to prefix rules +tstHelper("DYA"); // ambiguous with regard to prefix rules +tstHelper("DYAN"); // ambiguous with regard to prefix rules +tstHelper("GDRA"); // ambiguous with regard to prefix rules +tstHelper("GDRIM"); // ambiguous with regard to prefix rules +tstHelper("GGAN"); // ambiguous with regard to prefix rules +tstHelper("GGYUR"); // ambiguous with regard to prefix rules +tstHelper("GLTAR"); // ambiguous with regard to prefix rules +tstHelper("GLTUNG"); // ambiguous with regard to prefix rules +tstHelper("GMA"); // ambiguous with regard to prefix rules +tstHelper("GMAN"); // ambiguous with regard to prefix rules +tstHelper("GMON"); // ambiguous with regard to prefix rules +tstHelper("GRDEGS"); // ambiguous with regard to prefix rules +tstHelper("GRDZU"); // ambiguous with regard to prefix rules +tstHelper("GRGYA"); // ambiguous with regard to prefix rules +tstHelper("GRNAGS"); // ambiguous with regard to prefix rules +tstHelper("GRTAN"); // ambiguous with regard to prefix rules +tstHelper("GRTOGS"); // ambiguous with regard to prefix rules +tstHelper("GRTZO"); // ambiguous with regard to prefix rules +tstHelper("GRTZOD"); // ambiguous with regard to prefix rules +tstHelper("GRTZON"); // ambiguous with regard to prefix rules +tstHelper("GSLA"); // ambiguous with regard to prefix rules +tstHelper("GSNAD"); // ambiguous with regard to prefix rules +tstHelper("GZLA"); // ambiguous with regard to prefix rules +tstHelper("MBA"); // ambiguous with regard 
to prefix rules +tstHelper("MBA'"); // ambiguous with regard to prefix rules +tstHelper("MBI'I"); // ambiguous with regard to prefix rules +tstHelper("MHA'A"); // ambiguous with regard to prefix rules +tstHelper("MRDA"); // ambiguous with regard to prefix rules +tstHelper("MRDO"); // ambiguous with regard to prefix rules +tstHelper("MRDZOGS"); // ambiguous with regard to prefix rules +tstHelper("MRGA"); // ambiguous with regard to prefix rules +tstHelper("MRGAD"); // ambiguous with regard to prefix rules +tstHelper("MRGAN"); // ambiguous with regard to prefix rules +tstHelper("MRJES"); // ambiguous with regard to prefix rules +tstHelper("MRJOD"); // ambiguous with regard to prefix rules +tstHelper("MRTOGS"); // ambiguous with regard to prefix rules +tstHelper("MRTOL"); // ambiguous with regard to prefix rules +tstHelper("MRTZE'I"); // ambiguous with regard to prefix rules +tstHelper("MRTZIGS"); // ambiguous with regard to prefix rules +tstHelper("MSAM"); // ambiguous with regard to prefix rules +tstHelper("MSGRIB"); // ambiguous with regard to prefix rules +tstHelper("MSKYES"); // ambiguous with regard to prefix rules +tstHelper("MSON"); // ambiguous with regard to prefix rules +tstHelper("MSOS"); // ambiguous with regard to prefix rules +tstHelper("MSTAMS"); // ambiguous with regard to prefix rules +tstHelper("MSTAN"); // ambiguous with regard to prefix rules + + + + + + // If you're not careful, you'll think GGYES is a legal + // Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's + // Sanskrit, really, because GA doesn't take a GA prefix. + // This doesn't occur in ACIP input files that I've seen, but + // GGYI (S1000I.INC) and GGYUR (S5275MC4.ACT) do occur. 
+ tstHelper("GGYES", "{G}{G}{YE}{S}", + new String[] { "{G}{G}{YE}{S}", "{G}{G+YE}{S}", "{G+G}{YE}{S}" }, + new String[] { }, + "{G+G}{YE}{S}"); + + tstHelper("DRUG", "{D}{RU}{G}", + new String[] { "{D}{RU}{G}", "{D+RU}{G}" }, + new String[] { "{D+RU}{G}" }, + "{D+RU}{G}"); + + + tstHelper("d+H+d+HA", "{d+}{H+}{d+}{HA}", + new String[] { "{d+H+d+HA}" }, + new String[] { "{d+H+d+HA}" }); + + tstHelper("Gd+H+d+HA"); + tstHelper("AUTPA", "{AU}{T}{PA}", new String[] { "{AU}{T}{PA}", "{AU}{T+PA}" }, new String[] { }, @@ -249,7 +362,8 @@ public class PackageTest extends TestCase { new String[] { "{G+R+VA}{'I}" }); tstHelper("G-RVA'I", "{G-}{R}{VA}{'I}", new String[] { "{G}{R+VA}{'I}" }, - new String[] { "{G}{R+VA}{'I}" }); + new String[] { }, + "{G}{R+VA}{'I}"); tstHelper("RVA", "{R}{VA}", new String[] { "{R+VA}" }, new String[] { "{R+VA}" }); @@ -6967,8 +7081,8 @@ tstHelper("ZUR"); "", "[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME shelp("PAS... LA", - "Offset 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n", - "[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]"); + "Offset 5 or maybe 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n", + "[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]"); shelp("PAS... LA", "", true, @@ -6983,28 +7097,28 @@ tstHelper("ZUR"); shelp("", "", "[]"); shelp("[DD]", ""); shelp("[", - "Offset 0: Found an illegal open bracket (in context, this is [). 
Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); + "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); shelp("{", - "Offset 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); + "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); shelp("DD", ""); shelp("DD]", - "Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Offset 2 or maybe 2: Found a truly unmatched close bracket, ] or }.\nOffset 2 or maybe 2: Found a closing bracket without a matching open bracket. 
Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); - shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); + shelp("///NYA", "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); shelp("/NYA/", ""); shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", ""); shelp("[LS][# A [[[[[COMMENT][LS]", - "Offset 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); + "Offset 9 or maybe 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 10 or maybe 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 11 or maybe 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 12 or maybe 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 13 or maybe 13: Found an open bracket within a [#COMMENT]-style comment. 
Brackets may not appear in comments.\n"); shelp("[ILLEGAL COMMENT]", - "Offset 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16 or maybe 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); shelp("(BSKYABS GRO)", ""); // DLC WHAT ARE THESE FOR? - shelp("BSKYABS GRO)", "Offset 11: Unexpected closing parenthesis, ), found.\n"); + shelp("BSKYABS GRO)", "Offset 11 or maybe 11: Unexpected closing parenthesis, ), found.\n"); shelp("BSKYABS GRO(", "Offset END: Unmatched open parenthesis, (, found.\n"); - shelp("((NESTAGE))", "Offset 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10: Unexpected closing parenthesis, ), found.\n"); + shelp("((NESTAGE))", "Offset 1 or maybe 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10 or maybe 10: Unexpected closing parenthesis, ), found.\n"); shelp("(BA)(PA)NYA(CA)", ""); shelp("NYAx", ""); shelp("NYA x", ""); @@ -7033,9 +7147,9 @@ tstHelper("ZUR"); shelp("(NYA ", "Offset END: Unmatched open parenthesis, (, found.\n"); shelp("[*NYA ", "Offset END: Unmatched open bracket found. 
A correction does not terminate.\n"); shelp("?", "", "[QUESTION:{?}]"); - shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n"); + shelp("KHAN~ BAR ", "Offset 4 or maybe 4: Found an illegal character, ~, with ordinal 126.\n"); shelp("[* Correction with []]", - "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Offset 5 or maybe 5: Found an illegal character, r, with ordinal 114.\nOffset 6 or maybe 6: Found an illegal character, r, with ordinal 114.\nOffset 7 or maybe 7: Found an illegal character, e, with ordinal 101.\nOffset 8 or maybe 8: Found an illegal character, c, with ordinal 99.\nOffset 14 or maybe 14: Found an illegal character, w, with ordinal 119.\nOffset 19 or maybe 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21 or maybe 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); // DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter. 
@@ -7051,8 +7165,8 @@ tstHelper("ZUR"); uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b"); } shelp("K\\,", - "Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", - "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]"); + "Offset 1 or maybe 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", + "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]"); shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]"); @@ -7073,15 +7187,15 @@ tstHelper("ZUR"); shelp("@01A.3 ", "", "[FOLIO_MARKER:{@01A.3}, TIBETAN_PUNCTUATION:{ }]"); shelp("@001 ", "", "[FOLIO_MARKER:{@001}, TIBETAN_PUNCTUATION:{ }]"); shelp("@19-20A", - "Offset 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n", - "[ERROR:{@}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur. + "Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n", + "[ERROR:{Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur. shelp("@[7B]", ""); shelp("@012A.3KA", "", "[FOLIO_MARKER:{@012A.3}, TIBETAN_NON_PUNCTUATION:{KA}]"); shelp("@012A.34", - "Offset 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n", - "[ERROR:{@012A.}, TIBETAN_NON_PUNCTUATION:{34}]"); + "Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @012A.34). 
This folio marker has a period, '.', at the end of it, which is illegal.\n", + "[ERROR:{Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.}, TIBETAN_NON_PUNCTUATION:{34}]"); shelp("@[07B]", ""); shelp("@[00007B]", ""); shelp("@7B", ""); @@ -7097,8 +7211,8 @@ tstHelper("ZUR"); shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT shelp("//NYA\\\\", - "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", - "[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]"); + "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5 or maybe 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6 or maybe 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", + "[START_SLASH:{/}, ERROR:{Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! 
Please do complain to the maintainers.}]"); } private static void uhelp(String acip) { @@ -7106,7 +7220,7 @@ tstHelper("ZUR"); } private static void uhelp(String acip, String expectedUnicode) { StringBuffer errors = new StringBuffer(); - String unicode = ACIPConverter.convertToUnicode(acip, errors); + String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true); if (null == unicode) { if (null != expectedUnicode && "none" != expectedUnicode) { System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); @@ -8729,22 +8843,22 @@ tstHelper("shKA"); } /* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit: - BDA' þþþþ -B+DA þþþ -DBANG þþþ -D+BA þþþ -DGA' þþþþ -D+GA þþþ -DGRA þþþ -D+GRA þþþ -DGYESþþþþþ -D+GYA þþþ -DMAR þþþþ -D+MA þþþ -GDA' þþþþ -G+DA þþþ -GNAD þþþþ -G+NA þþþ -MNA' þþþþ -M+NA þþþ +BDA' +B+DA +DBANG +D+BA +DGA' +D+GA +DGRA +D+GRA +DGYES +D+GYA +DMAR +D+MA +GDA' +G+DA +GNAD +G+NA +MNA' +M+NA */ diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 1d97639..c1ebfd5 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -520,7 +520,8 @@ class TPairList { * corresponds to exactly one Tibetan grapheme cluster (i.e., * stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a * stack all on its own. */ - void populateWithTGCPairs(ArrayList pl, ArrayList indexList, int index) { + void populateWithTGCPairs(ArrayList pl, + ArrayList indexList, int index) { int sz = size(); if (sz == 0) { return; @@ -540,8 +541,8 @@ class TPairList { // The last pair: TPair p = get(i); ThdlDebug.verify(!"+".equals(p.getRight())); - int where; boolean add_U0F7F = false; + int where; if (p.getRight() != null && (where = p.getRight().indexOf(':')) >= 0) { // this ':' guy is his own TGCPair. 
@@ -579,27 +580,21 @@ class TPairList { } TGCPair tp; indexList.add(new Integer(index)); - tp = new TGCPair(lWylie.toString() - + (hasNonAVowel - ? ACIPRules.getWylieForACIPVowel(p.getRight()) - : ""), + tp = new TGCPair(lWylie.toString(), + (hasNonAVowel + ? ACIPRules.getWylieForACIPVowel(p.getRight()) + : ""), (isNumeric - ? TGCPair.OTHER - : (hasNonAVowel - ? (isSanskrit - ? TGCPair.SANSKRIT_WITH_VOWEL - : (isTibetan - ? TGCPair.CONSONANTAL_WITH_VOWEL - : TGCPair.OTHER)) - : (isSanskrit - ? TGCPair.SANSKRIT_WITHOUT_VOWEL - : (isTibetan - ? TGCPair.CONSONANTAL_WITHOUT_VOWEL - : TGCPair.OTHER))))); + ? TGCPair.TYPE_OTHER + : (isSanskrit + ? TGCPair.TYPE_SANSKRIT + : (isTibetan + ? TGCPair.TYPE_TIBETAN + : TGCPair.TYPE_OTHER)))); pl.add(tp); if (add_U0F7F) { indexList.add(new Integer(index)); - pl.add(new TGCPair("H", TGCPair.OTHER)); + pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER)); } } } diff --git a/source/org/thdl/tib/text/ttt/TParseTree.java b/source/org/thdl/tib/text/ttt/TParseTree.java index ea83648..2dffa42 100644 --- a/source/org/thdl/tib/text/ttt/TParseTree.java +++ b/source/org/thdl/tib/text/ttt/TParseTree.java @@ -91,7 +91,7 @@ class TParseTree { ParseIterator pi = getParseIterator(); while (pi.hasNext()) { TStackList sl = pi.next(); - if (sl.isLegalTshegBar().isLegal) { + if (sl.isLegalTshegBar(false).isLegal) { sll.add(sl); } } @@ -118,12 +118,12 @@ class TParseTree { * a unique non-illegal parse, you get it. If there's not a * unique answer, null is returned. */ // {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM! - // DLC by using this we can get rid of single-sanskrit-gc, eh? public TStackList getBestParse() { - TStackListList up = getUniqueParse(); + TStackListList up = getUniqueParse(false); if (up.size() == 1) return up.get(0); + up = getNonIllegalParses(); int sz = up.size(); if (sz == 1) { @@ -192,14 +192,17 @@ class TParseTree { * legal parses if there two or more equally good parses. 
By * "legal", we mean a sequence of stacks that is legal * by the rules of Tibetan tsheg bar syntax (sometimes called - * spelling). */ - public TStackListList getUniqueParse() { + * spelling). + * @param noPrefixTests true if you want to pretend that every + * stack can take every prefix, which is not the case in + * reality */ + public TStackListList getUniqueParse(boolean noPrefixTests) { TStackListList allLegalParses = new TStackListList(2); // save memory TStackListList legalParsesWithVowelOnRoot = new TStackListList(1); ParseIterator pi = getParseIterator(); while (pi.hasNext()) { TStackList sl = pi.next(); - BoolPair bpa = sl.isLegalTshegBar(); + BoolPair bpa = sl.isLegalTshegBar(noPrefixTests); if (bpa.isLegal) { if (bpa.isLegalAndHasAVowelOnRoot) legalParsesWithVowelOnRoot.add(sl); @@ -253,13 +256,23 @@ class TParseTree { public String getWarning(boolean paranoid, TPairList pl, String originalACIP) { - TStackListList up = getUniqueParse(); + + { + TStackList bestParse = getBestParse(); + TStackListList noPrefixTestsUniqueParse = getUniqueParse(true); + if (noPrefixTestsUniqueParse.size() == 1 + && !noPrefixTestsUniqueParse.get(0).equals(bestParse)) { + return "Warning: We're going with " + bestParse + ", but only because our knowledge of prefix rules says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")"; + } + } + + TStackListList up = getUniqueParse(false); if (null == up || up.size() != 1) { boolean isLastStack[] = new boolean[1]; TStackListList nip = getNonIllegalParses(); if (nip.size() != 1) { if (null == getBestParse()) { - return "There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; + return "Warning: There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? 
originalACIP : recoverACIP()) + "}"; } else { if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) { if (isLastStack[0]) { @@ -269,7 +282,7 @@ class TParseTree { } } if (paranoid) { - return "Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful."; + return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful."; } } } else { diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index 1b01308..5db6847 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -125,15 +125,17 @@ class TStackList { * Tibetan syntax (sometimes called rules of spelling). If this * is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will * be true if and only if there is an explicit {A} vowel on the - * root stack. */ - public BoolPair isLegalTshegBar() { - // DLC handle PADMA and other Tibetanized Sanskrit fellows. Right now we only handle single-stack guys. + * root stack. + * @param noPrefixTests true if you want to pretend that every + * stack can take every prefix, which is not the case in + * reality */ + public BoolPair isLegalTshegBar(boolean noPrefixTests) { + // DLC handle PADMA and other Tibetanized Sanskrit fellows consistently. Right now we only treat single-stack Sanskrit guys as legal. 
TTGCList tgcList = new TTGCList(this); StringBuffer warnings = new StringBuffer(); String candidateType - = TibTextUtils.getClassificationOfTshegBar(tgcList, warnings); - // System.out.println("DLC: " + toString() + " has candidateType " + candidateType + " and warnings " + warnings); + = TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests); // preliminary answer: boolean isLegal = (candidateType != "invalid");