TMW->Wylie conversion now takes advantage of prefix rules, the rules
that say "ya can take a ga prefix" etc.

The ACIP->Unicode converter now gives warnings (optionally, and by
default, inline). This converter now produces output even when
lexical errors occur, but the output has errors and warnings inline.
This commit is contained in:
dchandler 2003-08-23 22:03:37 +00:00
parent 21ef657921
commit d5ad760230
14 changed files with 678 additions and 270 deletions

View file

@ -102,19 +102,23 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("gya"); ensureKeysGiveCorrectWylie("gya");
ensureKeysGiveCorrectWylie("g.ya"); ensureKeysGiveCorrectWylie("g.ya");
ensureKeysGiveCorrectWylie("bya"); ensureKeysGiveCorrectWylie("bya");
ensureKeysGiveCorrectWylie("b.ya"); ensureKeysGiveCorrectWylie("b.ya", "baya");
ensureKeysGiveCorrectWylie("mya"); ensureKeysGiveCorrectWylie("mya");
ensureKeysGiveCorrectWylie("m.ya"); ensureKeysGiveCorrectWylie("m.ya", "maya");
ensureKeysGiveCorrectWylie("'ya"); ensureKeysGiveCorrectWylie("'ya", "'aya");
ensureKeysGiveCorrectWylie("'.ya", "'ya"); ensureKeysGiveCorrectWylie("'.ya", "'aya");
ensureKeysGiveCorrectWylie("dya"); ensureKeysGiveCorrectWylie("dya",
ensureKeysGiveCorrectWylie("d.ya", "dya"); "daya");
ensureKeysGiveCorrectWylie("d.ya",
"daya");
ensureKeysGiveCorrectWylie("grwa"); ensureKeysGiveCorrectWylie("grwa");
ensureKeysGiveCorrectWylie("g.rwa"); ensureKeysGiveCorrectWylie("g.rwa",
"garwa");
ensureKeysGiveCorrectWylie("gra"); ensureKeysGiveCorrectWylie("gra");
ensureKeysGiveCorrectWylie("dra"); ensureKeysGiveCorrectWylie("dra");
ensureKeysGiveCorrectWylie("drwa"); ensureKeysGiveCorrectWylie("drwa");
ensureKeysGiveCorrectWylie("d.rwa"); ensureKeysGiveCorrectWylie("d.rwa",
"darwa");
ensureKeysGiveCorrectWylie("g.r", "gar"); ensureKeysGiveCorrectWylie("g.r", "gar");
ensureKeysGiveCorrectWylie("d.r", "dar"); ensureKeysGiveCorrectWylie("d.r", "dar");
ensureKeysGiveCorrectWylie("'.r", "'ar"); ensureKeysGiveCorrectWylie("'.r", "'ar");
@ -134,7 +138,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("t.sa", ensureKeysGiveCorrectWylie("t.sa",
"tas"); "tas");
ensureKeysGiveCorrectWylie("d.za"); ensureKeysGiveCorrectWylie("d.za", "daza");
ensureKeysGiveCorrectWylie("dza"); ensureKeysGiveCorrectWylie("dza");
ensureKeysGiveCorrectWylie("s.ha", ensureKeysGiveCorrectWylie("s.ha",
@ -219,7 +223,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("b.lag"); ensureKeysGiveCorrectWylie("b.lag");
ensureKeysGiveCorrectWylie("blg", ensureKeysGiveCorrectWylie("blg",
"blga"); "balga");
ensureKeysGiveCorrectWylie("b.las", ensureKeysGiveCorrectWylie("b.las",
"bals"); "bals");
@ -244,21 +248,24 @@ public class DuffPaneTest extends TestCase {
"bras"); "bras");
ensureKeysGiveCorrectWylie("bras"); ensureKeysGiveCorrectWylie("bras");
ensureKeysGiveCorrectWylie("d.wa"); ensureKeysGiveCorrectWylie("d.wa",
"dawa");
ensureKeysGiveCorrectWylie("dawa", ensureKeysGiveCorrectWylie("dawa",
"d.wa"); "dawa");
ensureKeysGiveCorrectWylie("dwa"); ensureKeysGiveCorrectWylie("dwa");
ensureKeysGiveCorrectWylie("g.wa"); ensureKeysGiveCorrectWylie("g.wa",
"gawa");
ensureKeysGiveCorrectWylie("gawa", ensureKeysGiveCorrectWylie("gawa",
"g.wa"); "gawa");
ensureKeysGiveCorrectWylie("gwa"); ensureKeysGiveCorrectWylie("gwa");
ensureKeysGiveCorrectWylie("'.wa", ensureKeysGiveCorrectWylie("'.wa",
"'wa"); "'awa");
ensureKeysGiveCorrectWylie("'awa", ensureKeysGiveCorrectWylie("'awa",
"'wa"); "'awa");
ensureKeysGiveCorrectWylie("'wa"); ensureKeysGiveCorrectWylie("'wa",
"'awa");
ensureKeysGiveCorrectWylie("gyg", ensureKeysGiveCorrectWylie("gyg",
"g.yag"); "g.yag");
@ -282,7 +289,8 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("ma.a.asa", ensureKeysGiveCorrectWylie("ma.a.asa",
"mas"); "mas");
ensureKeysGiveCorrectWylie("'ka"); ensureKeysGiveCorrectWylie("'ka",
"'aka");
ensureKeysGiveCorrectWylie("'gas"); ensureKeysGiveCorrectWylie("'gas");
@ -319,8 +327,9 @@ public class DuffPaneTest extends TestCase {
"lamanga"); "lamanga");
ensureKeysGiveCorrectWylie("b.m.ng", ensureKeysGiveCorrectWylie("b.m.ng",
"bmang"); "bamanga");
ensureKeysGiveCorrectWylie("bmang"); ensureKeysGiveCorrectWylie("bmang",
"bamanga");
ensureKeysGiveCorrectWylie("gdams"); ensureKeysGiveCorrectWylie("gdams");
ensureKeysGiveCorrectWylie("g.d.m.s.", ensureKeysGiveCorrectWylie("g.d.m.s.",
@ -372,7 +381,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("fivikikhigingicichijinyitithidinipiphibimitsitshidziwizhizi'iyirilishisihiTiThiDiNiShi"); ensureKeysGiveCorrectWylie("fivikikhigingicichijinyitithidinipiphibimitsitshidziwizhizi'iyirilishisihiTiThiDiNiShi");
ensureKeysGiveCorrectWylie("don't touch my coffee/that makes me very angry/supersize my drink", ensureKeysGiveCorrectWylie("don't touch my coffee/that makes me very angry/supersize my drink",
"dona'ata tocha mya cofafe/thata mkes me veraya angaraya/superasize mya drinaka"); "dona'ata tocha mya cofafe/thata makesa me veraya angaraya/superasize mya drinaka");
} }
} }

View file

@ -28,7 +28,7 @@ zur mig nyag phran tsam gyis dge ba'i gzugs can 'dus ma byas//\par
\par \par
yid 'ong bzhin ras zla gzhon 'khor lo gnyis skyes la//\par yid 'ong bzhin ras zla gzhon 'khor lo gnyis skyes la//\par
'khrul ba ster yang 'phyang mo sel byed mgo skyes kyi//\par 'khrul ba ster yang 'phyang mo sel byed mgo skyes kyi//\par
bai DUr mthing kha'i lan bu rab 'phyang dbyangs can ma//\par bai DUra mthing kha'i lan bu rab 'phyang dbyangs can ma//\par
smra ba'i dbang phyug ngag gi rgyal po nyer grub mdzod//\par smra ba'i dbang phyug ngag gi rgyal po nyer grub mdzod//\par
\par \par
gangs can lha lam yangs pa'i khyon 'dir rgyal ba'i bstan pa bcu gnyis bdag po'i gur khang mchog/\par gangs can lha lam yangs pa'i khyon 'dir rgyal ba'i bstan pa bcu gnyis bdag po'i gur khang mchog/\par

View file

@ -25,7 +25,7 @@ package org.thdl.tib.text;
context-insensitive THDL Extended Wylie representation. NOTE context-insensitive THDL Extended Wylie representation. NOTE
WELL: this is not a real grapheme cluster; I'm misusing the term WELL: this is not a real grapheme cluster; I'm misusing the term
(FIXME). It's actually whole or part of one. It's part of one (FIXME). It's actually whole or part of one. It's part of one
when this is a vowel or U+0F7F alone. when this is U+0F7F alone.
@author David Chandler */ @author David Chandler */
public class TGCPair { public class TGCPair {
@ -37,14 +37,84 @@ public class TGCPair {
public static final int SANSKRIT_WITHOUT_VOWEL = 5; public static final int SANSKRIT_WITHOUT_VOWEL = 5;
public static final int SANSKRIT_WITH_VOWEL = 6; public static final int SANSKRIT_WITH_VOWEL = 6;
public String wylie; public static final int TYPE_OTHER = 31;
public int classification; public static final int TYPE_SANSKRIT = 32;
public TGCPair(String wylie, int classification) { public static final int TYPE_TIBETAN = 33;
this.wylie = wylie;
this.classification = classification; // Sanskrit or Tibetan consonant, or number, or oddball:
private String consonantWylie;
private String vowelWylie;
public String getConsonantWylie() {
return consonantWylie;
} }
public String getVowelWylie() {
return vowelWylie;
}
/** Cludge. */
public void setWylie(String x) {
consonantWylie = x;
vowelWylie = null;
}
public String getWylie() {
StringBuffer b = new StringBuffer();
if (consonantWylie != null) {
// we may have {p-y}, but the user wants to see {py}.
for (int i = 0; i < consonantWylie.length(); i++) {
char ch = consonantWylie.charAt(i);
if ('-' != ch)
b.append(ch);
}
}
if (vowelWylie != null)
b.append(vowelWylie);
return b.toString();
}
public int classification;
/** Constructs a new TGCPair with (Tibetan or Sanskrit) consonant
* consonantWylie and vowel vowelWylie. Use
* classification==TYPE_OTHER for numbers, lone vowels, marks,
* etc. Use classification==TYPE_TIBETAN for Tibetan (not
* Tibetanized Sanskrit) and classification=TYPE_SANSKRIT for
* Tibetanized Sanskrit. */
public TGCPair(String consonantWylie, String vowelWylie, int classification) {
if ("".equals(vowelWylie))
vowelWylie = null;
// Technically, we don't need the following check, but it's
// nice for consistency's sake.
if ("".equals(consonantWylie))
consonantWylie = null;
// DLC FIXME: for speed, make these assertions:
if (classification != TYPE_OTHER
&& classification != TYPE_TIBETAN
&& classification != TYPE_SANSKRIT) {
throw new IllegalArgumentException("Bad classification " + classification + ".");
}
int realClassification = -37;
if (vowelWylie == null && classification == TYPE_TIBETAN)
realClassification = CONSONANTAL_WITHOUT_VOWEL;
if (vowelWylie != null && classification == TYPE_TIBETAN)
realClassification = CONSONANTAL_WITH_VOWEL;
if (vowelWylie == null && classification == TYPE_SANSKRIT)
realClassification = SANSKRIT_WITHOUT_VOWEL;
if (vowelWylie != null && classification == TYPE_SANSKRIT)
realClassification = SANSKRIT_WITH_VOWEL;
if (consonantWylie == null) {
if (classification != TYPE_OTHER)
throw new IllegalArgumentException("That's the very definition of a lone vowel.");
realClassification = LONE_VOWEL;
} else {
if (classification == TYPE_OTHER)
realClassification = OTHER;
}
this.consonantWylie = consonantWylie;
this.vowelWylie = vowelWylie;
this.classification = realClassification;
}
public String toString() { public String toString() {
return "<TGCPair wylie=" + wylie + " classification=" return "<TGCPair wylie=" + getWylie() + " classification="
+ classification + "/>"; + classification + "/>";
} }
} }

View file

@ -25,6 +25,9 @@ import javax.swing.text.rtf.RTFEditorKit;
import java.io.*; import java.io.*;
import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.tshegbar.LegalTshegBar;
import org.thdl.tib.text.tshegbar.UnicodeConstants;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
/** /**
* Provides methods for converting back and forth between Extended * Provides methods for converting back and forth between Extended
@ -846,86 +849,64 @@ public class TibTextUtils implements THDLWylieConstants {
// sz is an overestimate (speeds us up, wastes some memory). // sz is an overestimate (speeds us up, wastes some memory).
TMWGCList gcs = new TMWGCList(sz); TMWGCList gcs = new TMWGCList(sz);
StringBuffer buildingUpGc = new StringBuffer(); StringBuffer buildingUpVowel = new StringBuffer(); // for {cui}, we append to this guy twice.
String nonVowelWylie = null; // for the "c" in {cui}
int pairType = TGCPair.TYPE_OTHER;
boolean consonantal_with_vowel = false;
boolean buildingUpSanskrit = false;
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
DuffCode dc = (DuffCode)glyphList.get(i); DuffCode dc = (DuffCode)glyphList.get(i);
String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie); String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie);
boolean containsWylieVowel = false;
boolean buildingUpSanskritNext = false; boolean buildingUpSanskritNext = false;
if ((buildingUpSanskritNext if ((buildingUpSanskritNext
= TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) = TibetanMachineWeb.isWylieSanskritConsonantStack(wylie))
|| TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) { || TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) {
if (buildingUpGc.length() > 0) { if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
gcs.add(new TGCPair(buildingUpGc.toString(), gcs.add(new TGCPair(nonVowelWylie,
consonantal_with_vowel buildingUpVowel.toString(),
? (buildingUpSanskrit pairType));
? TGCPair.SANSKRIT_WITH_VOWEL buildingUpVowel.delete(0, buildingUpVowel.length());
: TGCPair.CONSONANTAL_WITH_VOWEL)
: (buildingUpSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL
: TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
buildingUpGc.delete(0, buildingUpGc.length());
} }
buildingUpGc.append(wylie); // We want {p-y}, not {py}.
consonantal_with_vowel = false; nonVowelWylie
buildingUpSanskrit = buildingUpSanskritNext; = TibetanMachineWeb.getHashKeyForGlyph(dc.getFontNum(), dc.getCharNum());
} else if ((containsWylieVowel pairType = (buildingUpSanskritNext
= TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie)) ? TGCPair.TYPE_SANSKRIT
: TGCPair.TYPE_TIBETAN);
} else if (TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie)
|| TibetanMachineWeb.isWylieAdornment(wylie)) { || TibetanMachineWeb.isWylieAdornment(wylie)) {
buildingUpVowel.append(wylie);
if (buildingUpGc.length() > 0) {
buildingUpGc.append(wylie);
if (containsWylieVowel) {
if (debug)
System.out.println("DEBUG: with_vowel is true thanks to " + wylie);
consonantal_with_vowel = true;
}
// do not clear; we might have {cui} or {hUM}, e.g.
} else {
gcs.add(new TGCPair(wylie,
TGCPair.LONE_VOWEL));
consonantal_with_vowel = false;
}
} else { } else {
// number or weird thing: // number or weird thing:
if (buildingUpGc.length() > 0) { if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
gcs.add(new TGCPair(buildingUpGc.toString(), gcs.add(new TGCPair(nonVowelWylie,
consonantal_with_vowel buildingUpVowel.toString(),
? (buildingUpSanskrit pairType));
? TGCPair.SANSKRIT_WITH_VOWEL buildingUpVowel.delete(0, buildingUpVowel.length());
: TGCPair.CONSONANTAL_WITH_VOWEL) nonVowelWylie = null;
: (buildingUpSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL
: TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
buildingUpGc.delete(0, buildingUpGc.length());
} }
gcs.add(new TGCPair(wylie, TGCPair.OTHER)); gcs.add(new TGCPair(wylie, null, TGCPair.TYPE_OTHER));
consonantal_with_vowel = false; pairType = TGCPair.TYPE_OTHER;
buildingUpSanskrit = false;
} }
} }
if (buildingUpGc.length() > 0) { if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
gcs.add(new TGCPair(buildingUpGc.toString(), gcs.add(new TGCPair(nonVowelWylie,
consonantal_with_vowel buildingUpVowel.toString(),
? (buildingUpSanskrit pairType));
? TGCPair.SANSKRIT_WITH_VOWEL
: TGCPair.CONSONANTAL_WITH_VOWEL)
: (buildingUpSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL
: TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
} }
buildingUpGc = null;
return gcs; return gcs;
} }
/** Returns a string that classifies gcs as a legal Tibetan tsheg
* bar, a single Sanskrit grapheme cluster
* ("single-sanskrit-gc"), or invalid ("invalid"). If
* noPrefixTests is true, then ggyi will be seen as a
* "prefix-root", even though gya doesn't take a ga prefix. */
public static String getClassificationOfTshegBar(TGCList gcs, public static String getClassificationOfTshegBar(TGCList gcs,
// DLC the warnings are Wylie-specific // DLC the warnings are Wylie-specific
StringBuffer warnings) { StringBuffer warnings,
boolean noPrefixTests) {
String candidateType = null; String candidateType = null;
// Now that we have grapheme clusters, see if they match any // Now that we have grapheme clusters, see if they match any
// of the "legal tsheg bars": // of the "legal tsheg bars":
@ -937,10 +918,11 @@ public class TibTextUtils implements THDLWylieConstants {
|| TGCPair.SANSKRIT_WITH_VOWEL == cls) || TGCPair.SANSKRIT_WITH_VOWEL == cls)
return "single-sanskrit-gc"; return "single-sanskrit-gc";
} }
TGCPair lastPair = null;
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
TGCPair tp = gcs.get(i); TGCPair tp = gcs.get(i);
int cls = tp.classification; int cls = tp.classification;
String wylie = tp.wylie; String wylie = tp.getWylie();
if (TGCPair.OTHER == cls) { if (TGCPair.OTHER == cls) {
if (TibetanMachineWeb.isWylieNumber(wylie)) { if (TibetanMachineWeb.isWylieNumber(wylie)) {
if (null == candidateType) { if (null == candidateType) {
@ -977,25 +959,44 @@ public class TibTextUtils implements THDLWylieConstants {
// peek ahead to distinguish between ba's, // peek ahead to distinguish between ba's,
// ba'ala and ba'am: // ba'ala and ba'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root"; candidateType = "maybe-appendaged-prefix/root";
} else { } else {
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix/root-root/suffix"; candidateType = "prefix/root-root/suffix";
else
candidateType = "root-suffix";
} }
} else if (TibetanMachineWeb.isWylieRight(wylie)) { } else if (TibetanMachineWeb.isWylieRight(wylie)) {
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix/root-root/suffix"; candidateType = "prefix/root-root/suffix";
else
candidateType = "root-suffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-prefix/root"; candidateType = "appendaged-prefix/root";
} else { } else {
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix-root"; candidateType = "prefix-root";
else {
if (null != warnings)
warnings.append("Found what would be a prefix-root combo, but the root stack with wylie " + wylie + " does not take the prefix with wylie " + lastPair.getConsonantWylie());
candidateType = "invalid";
break;
}
} }
} else if ("root" == candidateType) { } else if ("root" == candidateType) {
if (ACHUNG.equals(wylie)) { if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between pa's, // peek ahead to distinguish between pa's,
// pa'ala and pa'am: // pa'ala and pa'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-root"; candidateType = "maybe-appendaged-root";
} else { } else {
@ -1016,7 +1017,7 @@ public class TibTextUtils implements THDLWylieConstants {
// peek ahead to distinguish between bpa's, // peek ahead to distinguish between bpa's,
// bpa'ala and bpa'am: // bpa'ala and bpa'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix-root"; candidateType = "maybe-appendaged-prefix-root";
} else { } else {
@ -1038,7 +1039,7 @@ public class TibTextUtils implements THDLWylieConstants {
// peek ahead to distinguish between // peek ahead to distinguish between
// gga'am and gaga'ala: // gga'am and gaga'ala:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root-root/suffix"; candidateType = "maybe-appendaged-prefix/root-root/suffix";
} else { } else {
@ -1120,7 +1121,11 @@ public class TibTextUtils implements THDLWylieConstants {
candidateType candidateType
= candidateType.substring("maybe-".length()).intern(); = candidateType.substring("maybe-".length()).intern();
// So that we get 'am, not 'm; 'ang, not 'ng: // So that we get 'am, not 'm; 'ang, not 'ng:
tp.wylie = WYLIE_aVOWEL + tp.wylie;
// FIXME: cludge: weird place to do this.
// pa'am, not pa'm is what we want, sure,
// but doing this here is ugly.
tp.setWylie(WYLIE_aVOWEL + tp.getWylie());
} else { } else {
if (null != warnings) if (null != warnings)
warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n"); warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n");
@ -1157,6 +1162,7 @@ public class TibTextUtils implements THDLWylieConstants {
} else { } else {
throw new Error("bad cls"); throw new Error("bad cls");
} }
lastPair = tp;
} }
if (candidateType.startsWith("maybe-appendaged-")) { if (candidateType.startsWith("maybe-appendaged-")) {
if (null != warnings) if (null != warnings)
@ -1221,7 +1227,7 @@ public class TibTextUtils implements THDLWylieConstants {
StringBuffer wylieBuffer) { StringBuffer wylieBuffer) {
TGCList gcs TGCList gcs
= breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie); = breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
String candidateType = getClassificationOfTshegBar(gcs, warnings); String candidateType = getClassificationOfTshegBar(gcs, warnings, false);
int sz = gcs.size(); int sz = gcs.size();
if (candidateType == "invalid" if (candidateType == "invalid"
|| candidateType == "single-sanskrit-gc") { || candidateType == "single-sanskrit-gc") {
@ -1237,7 +1243,7 @@ public class TibTextUtils implements THDLWylieConstants {
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i); TGCPair tp = (TGCPair)gcs.get(i);
int cls = tp.classification; int cls = tp.classification;
String wylie = tp.wylie; String wylie = tp.getWylie();
wylieBuffer.append(wylie); wylieBuffer.append(wylie);
if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie) if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)
|| TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) { || TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) {
@ -1290,9 +1296,9 @@ public class TibTextUtils implements THDLWylieConstants {
leftover = 3; leftover = 3;
/* FIXME: these constants are hard-wired here, rather /* FIXME: these constants are hard-wired here, rather
* than in TibetanMachineWeb, because I'm lazy. */ * than in TibetanMachineWeb, because I'm lazy. */
String wylie1 = ((TGCPair)gcs.get(0)).wylie; String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
String wylie2 = ((TGCPair)gcs.get(1)).wylie; String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
String wylie3 = ((TGCPair)gcs.get(2)).wylie; String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
if ((wylie1.equals("g") && (wylie2.equals("d") || wylie2.equals("n") || wylie2.equals("s"))) if ((wylie1.equals("g") && (wylie2.equals("d") || wylie2.equals("n") || wylie2.equals("s")))
|| (wylie1.equals("d") && (wylie2.equals("g") || wylie2.equals("m"))) || (wylie1.equals("d") && (wylie2.equals("g") || wylie2.equals("m")))
|| (wylie1.equals("b") && wylie2.equals("d")) || (wylie1.equals("b") && wylie2.equals("d"))
@ -1316,7 +1322,7 @@ public class TibTextUtils implements THDLWylieConstants {
|| "prefix/root" == candidateType || "prefix/root" == candidateType
|| "root-suffix-postsuffix" == candidateType || "root-suffix-postsuffix" == candidateType
|| "root-suffix" == candidateType) { || "root-suffix" == candidateType) {
String wylie1 = ((TGCPair)gcs.get(0)).wylie; String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
leftover = 1; leftover = 1;
wylieBuffer.append(wylie1); wylieBuffer.append(wylie1);
if (((TGCPair)gcs.get(0)).classification if (((TGCPair)gcs.get(0)).classification
@ -1330,16 +1336,16 @@ public class TibTextUtils implements THDLWylieConstants {
} }
if ("root-suffix-postsuffix" == candidateType) { if ("root-suffix-postsuffix" == candidateType) {
leftover = 3; leftover = 3;
String wylie2 = ((TGCPair)gcs.get(1)).wylie; String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
String wylie3 = ((TGCPair)gcs.get(2)).wylie; String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
wylieBuffer.append(unambiguousPostAVowelWylie(wylie2, wylieBuffer.append(unambiguousPostAVowelWylie(wylie2,
wylie3)); wylie3));
} }
} else if ("prefix-root-suffix" == candidateType } else if ("prefix-root-suffix" == candidateType
|| "prefix-root" == candidateType || "prefix-root" == candidateType
|| "prefix-root-suffix-postsuffix" == candidateType) { || "prefix-root-suffix-postsuffix" == candidateType) {
String wylie1 = ((TGCPair)gcs.get(0)).wylie; String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
String wylie2 = ((TGCPair)gcs.get(1)).wylie; String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
leftover = 2; leftover = 2;
if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2)) if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2))
wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2); wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2);
@ -1357,8 +1363,8 @@ public class TibTextUtils implements THDLWylieConstants {
} }
if ("prefix-root-suffix-postsuffix" == candidateType) { if ("prefix-root-suffix-postsuffix" == candidateType) {
leftover = 4; leftover = 4;
String wylie3 = ((TGCPair)gcs.get(2)).wylie; String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
String wylie4 = ((TGCPair)gcs.get(3)).wylie; String wylie4 = ((TGCPair)gcs.get(3)).getWylie();
wylieBuffer.append(unambiguousPostAVowelWylie(wylie3, wylieBuffer.append(unambiguousPostAVowelWylie(wylie3,
wylie4)); wylie4));
} }
@ -1371,15 +1377,15 @@ public class TibTextUtils implements THDLWylieConstants {
// append the wylie left over: // append the wylie left over:
for (int i = leftover; i < sz; i++) { for (int i = leftover; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i); TGCPair tp = (TGCPair)gcs.get(i);
String wylie = tp.wylie; String wylie = tp.getWylie();
wylieBuffer.append(wylie); wylieBuffer.append(wylie);
} }
} }
} }
/** /**
* Gets the Extended Wylie for a sequence of glyphs using Chandler's * Gets the Extended Wylie for a sequence of glyphs. This works as
* experimental method. This works as follows: * follows:
* *
* <p>We run along until we hit whitespace or punctuation. We take * <p>We run along until we hit whitespace or punctuation. We take
* everything before that and we see if it's a legal Tibetan tsheg bar, * everything before that and we see if it's a legal Tibetan tsheg bar,
@ -1480,4 +1486,90 @@ public class TibTextUtils implements THDLWylieConstants {
} }
return rv; return rv;
} }
/** Returns true if and only if the stack with Wylie <i>root</i>
* can take the prefix <i>prefix</i>. */
private static boolean isLegalPrefixRootCombo(String prefix, String root) {
// This will be decomposed enough. If you can decompose it,
// then it doesn't take a prefix!
if (!TibetanMachineWeb.isKnownHashKey(root)) {
root = root.replace('+', '-');
if (!TibetanMachineWeb.isKnownHashKey(root)) {
throw new Error("root is, now, " + root); // FIXME: make this an assertion
}
}
String ru = TibetanMachineWeb.getUnicodeForWylieForGlyph(root);
// ru may be for (head, root, sub), (head, root), (root), or
// (root, sub). Try all possibilities that are possible with
// a String of length ru. If there's a wa-zur, then we say
// (FIXME: do we say correctly?) that a stack with wa-zur can
// take a prefix if and only if the stack without can take a
// prefix.
if (ru == null) throw new Error("how? root is " + root); // FIXME: make this an assertion
int rl = ru.length();
if (ru.charAt(rl - 1) == UnicodeConstants.EWSUB_wa_zur)
--rl; // forget about wa-zur: see above.
if (rl == 2) {
char ch0 = ru.charAt(0);
char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1));
// (head, root) and (root, sub) are possibilities.
if (ACHUNG.equals(prefix)) {
return LegalTshegBar.takesAchungPrefix(ch0, ch1, UnicodeConstants.EW_ABSENT)
|| LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, ch1);
} else if ("b".equals(prefix)) {
return LegalTshegBar.takesBao(ch0, ch1, UnicodeConstants.EW_ABSENT)
|| LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, ch1);
} else if ("m".equals(prefix)) {
return LegalTshegBar.takesMao(ch0, ch1, UnicodeConstants.EW_ABSENT)
|| LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, ch1);
} else if ("g".equals(prefix)) {
return LegalTshegBar.takesGao(ch0, ch1, UnicodeConstants.EW_ABSENT)
|| LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, ch1);
} else if ("d".equals(prefix)) {
return LegalTshegBar.takesDao(ch0, ch1, UnicodeConstants.EW_ABSENT)
|| LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, ch1);
} else {
throw new IllegalArgumentException("prefix is " + prefix);
}
} else if (rl == 1) {
char ch0 = ru.charAt(0);
// (root) is the only choice.
if (ACHUNG.equals(prefix)) {
return LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT);
} else if ("b".equals(prefix)) {
return LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT);
} else if ("m".equals(prefix)) {
return LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT);
} else if ("g".equals(prefix)) {
return LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT);
} else if ("d".equals(prefix)) {
return LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT);
} else {
throw new IllegalArgumentException("prefix is " + prefix);
}
} else if (rl == 3) {
char ch0 = ru.charAt(0);
char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1));
char ch2 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(2));
// (head, root, sub) is the only choice.
if (ACHUNG.equals(prefix)) {
return LegalTshegBar.takesAchungPrefix(ch0, ch1, ch2);
} else if ("b".equals(prefix)) {
return LegalTshegBar.takesBao(ch0, ch1, ch2);
} else if ("m".equals(prefix)) {
return LegalTshegBar.takesMao(ch0, ch1, ch2);
} else if ("g".equals(prefix)) {
return LegalTshegBar.takesGao(ch0, ch1, ch2);
} else if ("d".equals(prefix)) {
return LegalTshegBar.takesDao(ch0, ch1, ch2);
} else {
throw new IllegalArgumentException("prefix is " + prefix);
}
} else {
return false;
}
}
} }

View file

@ -178,14 +178,19 @@ public class TibetanMachineWeb implements THDLWylieConstants {
// NOTE WELL: if you delete from consonants, numbers, vowels, or // NOTE WELL: if you delete from tibetanConsonants,
// others, you'll change the way Jskad's Extended Wylie keyboard // otherConsonants, numbers, vowels, or others, you'll change the
// works, yes, but you'll also change TMW->Wylie. // way Jskad's Extended Wylie keyboard works, yes, but you'll also
// change TMW->Wylie.
/** comma-delimited list of supported consonants (Tibetan and /** comma-delimited list of supported Tibetan consonants: */
Tibetanized Sanskrit): */ private static final String tibetanConsonants
private static final String consonants = "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a";
= "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz";
/** comma-delimited list of supported non-Tibetan consonants, such
* as Sanskrit consonants: */
private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit.
= "T,Th,D,N,Sh,v,f,Dz";
/** comma-delimited list of supported numbers (superscribed, /** comma-delimited list of supported numbers (superscribed,
subscribed, normal, half-numerals): */ subscribed, normal, half-numerals): */
@ -371,7 +376,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
charSet = new HashSet(); charSet = new HashSet();
tibSet = new HashSet(); tibSet = new HashSet();
sTok = new StringTokenizer(consonants, ","); sTok = new StringTokenizer(tibetanConsonants, ",");
while (sTok.hasMoreTokens()) { while (sTok.hasMoreTokens()) {
String ntk; String ntk;
charSet.add(ntk = sTok.nextToken()); charSet.add(ntk = sTok.nextToken());
@ -379,6 +384,15 @@ public class TibetanMachineWeb implements THDLWylieConstants {
validInputSequences.put(ntk, anyOldObjectWillDo); validInputSequences.put(ntk, anyOldObjectWillDo);
} }
sanskritStackSet = new HashSet();
sTok = new StringTokenizer(otherConsonants, ",");
while (sTok.hasMoreTokens()) {
String ntk;
charSet.add(ntk = sTok.nextToken());
sanskritStackSet.add(ntk);
validInputSequences.put(ntk, anyOldObjectWillDo);
}
numberSet = new HashSet(); numberSet = new HashSet();
sTok = new StringTokenizer(numbers, ","); sTok = new StringTokenizer(numbers, ",");
while (sTok.hasMoreTokens()) { while (sTok.hasMoreTokens()) {
@ -386,7 +400,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
// do it in <?Input:Numbers?> so that Jskad has the same // do it in <?Input:Numbers?> so that Jskad has the same
// TMW->Wylie conversion regardless of whether or not it // TMW->Wylie conversion regardless of whether or not it
// chooses to support inputting numbers. Likewise for // chooses to support inputting numbers. Likewise for
// consonants, others, and vowels. // tibetanConsonants, otherConsonants, others, and vowels.
String ntk; String ntk;
charSet.add(ntk = sTok.nextToken()); charSet.add(ntk = sTok.nextToken());
numberSet.add(ntk); numberSet.add(ntk);
@ -427,8 +441,6 @@ public class TibetanMachineWeb implements THDLWylieConstants {
boolean ignore = false; boolean ignore = false;
sanskritStackSet = new HashSet();
while ((line = in.readLine()) != null) { while ((line = in.readLine()) != null) {
if (line.startsWith("<?")) { //line is command if (line.startsWith("<?")) { //line is command
if (line.equalsIgnoreCase("<?Consonants?>")) { if (line.equalsIgnoreCase("<?Consonants?>")) {
@ -1182,6 +1194,23 @@ public static boolean hasGlyph(String hashKey) {
return true; return true;
} }
/** Returns the Unicode correspondence for the Wylie wylie, which must
* be Wylie returned by getWylieForGlyph(int, int, boolean[]).
* Returns null if the Unicode correspondence is nonexistent or
* unknown. */
public static String getUnicodeForWylieForGlyph(String wylie) {
DuffCode dc = getGlyph(wylie);
return mapTMWtoUnicode(dc.getFontNum() - 1, dc.getCharNum());
}
/**
* Returns true if and only if hashKey is a known hash key from tibwn.ini.
*/
public static boolean isKnownHashKey(String hashKey) {
DuffCode[] dc = (DuffCode[])tibHash.get(hashKey);
return (null != dc);
}
/** /**
* Gets a glyph for this hash key. Hash keys are not identical to Extended * Gets a glyph for this hash key. Hash keys are not identical to Extended
* Wylie. The hash key for a Tibetan stack separates the members of the stack * Wylie. The hash key for a Tibetan stack separates the members of the stack
@ -1193,7 +1222,7 @@ public static boolean hasGlyph(String hashKey) {
public static DuffCode getGlyph(String hashKey) { public static DuffCode getGlyph(String hashKey) {
DuffCode[] dc = (DuffCode[])tibHash.get(hashKey); DuffCode[] dc = (DuffCode[])tibHash.get(hashKey);
if (null == dc) if (null == dc)
throw new Error("It is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears."); throw new Error("Hash key " + hashKey + " not found; it is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears.");
return dc[TMW]; return dc[TMW];
} }

View file

@ -98,13 +98,6 @@ __TILDE__~93,5~~9,91~~~~~~~none
<?Input:Tibetan?> <?Input:Tibetan?>
// 0F5F,0F39 might work, but the OpenType font's author must've had
// Dza in mind if it does. Note that the bottommost horizontal stroke
// goes upward on U+0F5F and downward on U+0F5B.
Dz~146,5~~10,42~~~~~~~none
f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39
v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39
k~33,1~1,92~1,33~1,109~1,111~1,123~1,125~10,118~10,120~0F40 k~33,1~1,92~1,33~1,109~1,111~1,123~1,125~10,118~10,120~0F40
kh~34,1~~1,34~1,109~1,118~1,123~1,125~10,114~10,123~0F41 kh~34,1~~1,34~1,109~1,118~1,123~1,125~10,114~10,123~0F41
g~35,1~1,93~1,35~1,109~1,111~1,123~1,125~10,118~10,120~0F42 g~35,1~1,93~1,35~1,109~1,111~1,123~1,125~10,118~10,120~0F42
@ -135,11 +128,6 @@ sh~59,1~1,99~1,60~1,109~1,111~1,123~1,125~10,118~10,120~0F64
s~60,1~~1,61~1,109~1,118~1,123~1,125~10,114~10,123~0F66 s~60,1~~1,61~1,109~1,118~1,123~1,125~10,114~10,123~0F66
h~61,1~1,100~1,62~1,109~1,112~1,123~1,125~10,115~10,122~0F67~1,102 h~61,1~1,100~1,62~1,109~1,112~1,123~1,125~10,115~10,122~0F67~1,102
a~62,1~~1,63~1,109~1,118~1,123~1,125~10,114~10,123~0F68 a~62,1~~1,63~1,109~1,118~1,123~1,125~10,114~10,123~0F68
T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A
Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B
D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C
N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E
Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65
r-k~63,1~~1,70~1,109~1,121~1,123~1,125~10,115~10,124~f62,f90 r-k~63,1~~1,70~1,109~1,121~1,123~1,125~10,115~10,124~f62,f90
r-g~64,1~~1,71~1,109~1,121~1,123~1,125~10,115~10,124~f62,f92 r-g~64,1~~1,71~1,109~1,121~1,123~1,125~10,115~10,124~f62,f92
r-ng~65,1~~1,72~1,109~1,119~1,123~1,125~10,115~10,124~f62,f94 r-ng~65,1~~1,72~1,109~1,119~1,123~1,125~10,115~10,124~f62,f94
@ -241,6 +229,17 @@ au~237,1~~8,89~~~~~~~0F7D~~8,104
// DLC FIXME: need -I as well // DLC FIXME: need -I as well
<?Input:Sanskrit?> <?Input:Sanskrit?>
// 0F5F,0F39 might work, but the OpenType font's author must've had
// Dza in mind if it does. Note that the bottommost horizontal stroke
// goes upward on U+0F5F and downward on U+0F5B.
Dz~146,5~~10,42~~~~~~~none
f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39
v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39
T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A
Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B
D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C
N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E
Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65
k+Sh~175,1~~1,69~1,109~1,122~1,123~1,125~10,116~10,125~0F69 k+Sh~175,1~~1,69~1,109~1,122~1,123~1,125~10,116~10,125~0F69
k+k~33,2~~3,33~1,109~4,120~1,123~1,125~4,106~4,113~f40,f90 k+k~33,2~~3,33~1,109~4,120~1,123~1,125~4,106~4,113~f40,f90
k+kh~34,2~~3,34~1,109~4,120~1,123~1,125~4,106~4,113~f40,f91 k+kh~34,2~~3,34~1,109~4,120~1,123~1,125~4,106~4,113~f40,f91

View file

@ -1266,7 +1266,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char) * @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT * nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */ * if not present */
static boolean takesGao(char head, char root, char sub) { public static boolean takesGao(char head, char root, char sub) {
if (EW_ABSENT == head) { if (EW_ABSENT == head) {
if (EW_ABSENT == sub) { if (EW_ABSENT == sub) {
return (EWC_ca == root return (EWC_ca == root
@ -1298,7 +1298,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char) * @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT * nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */ * if not present */
static boolean takesDao(char head, char root, char sub) { public static boolean takesDao(char head, char root, char sub) {
if (EW_ABSENT == head) { if (EW_ABSENT == head) {
if (EW_ABSENT == sub) { if (EW_ABSENT == sub) {
return (EWC_ka == root return (EWC_ka == root
@ -1312,6 +1312,7 @@ public final class LegalTshegBar
|| (EWC_pa == root && EWC_ya == sub) || (EWC_pa == root && EWC_ya == sub)
|| (EWC_ba == root && EWC_ya == sub) || (EWC_ba == root && EWC_ya == sub)
|| (EWC_ma == root && EWC_ya == sub) || (EWC_ma == root && EWC_ya == sub)
|| (EWC_ka == root && EWC_ya == sub) // dkyil, for example
|| (EWC_ka == root && EWC_ra == sub) || (EWC_ka == root && EWC_ra == sub)
|| (EWC_ga == root && EWC_ra == sub) || (EWC_ga == root && EWC_ra == sub)
@ -1336,7 +1337,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char) * @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT * nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */ * if not present */
static boolean takesAchungPrefix(char head, char root, char sub) { public static boolean takesAchungPrefix(char head, char root, char sub) {
if (EW_ABSENT == head) { if (EW_ABSENT == head) {
if (EW_ABSENT == sub) { if (EW_ABSENT == sub) {
return (EWC_ga == root return (EWC_ga == root
@ -1379,7 +1380,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char) * @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT * nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */ * if not present */
static boolean takesMao(char head, char root, char sub) { public static boolean takesMao(char head, char root, char sub) {
if (EW_ABSENT == head) { if (EW_ABSENT == head) {
if (EW_ABSENT == sub) { if (EW_ABSENT == sub) {
return (EWC_kha == root return (EWC_kha == root
@ -1418,11 +1419,12 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char) * @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT * nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */ * if not present */
static boolean takesBao(char head, char root, char sub) { public static boolean takesBao(char head, char root, char sub) {
// DLC ask Ten-lo la about Wazur. // DLC ask Ten-lo la about Wazur.
if (EW_ABSENT == head) { if (EW_ABSENT == head) {
if (EW_ABSENT == sub) { if (EW_ABSENT == sub) {
return (EWC_ka == root return (EWC_ka == root
|| EWC_sa == root // bsams, for example
|| EWC_ca == root || EWC_ca == root
|| EWC_ta == root || EWC_ta == root
|| EWC_tsa == root || EWC_tsa == root

View file

@ -232,6 +232,14 @@ public class UnicodeUtils implements UnicodeConstants {
/* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */ /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */
} }
/** If ch is in one of the ranges U+0F90-U+0F97, U+0F99-U+0FB9,
* then this returns the same consonant in the range
* U+0F40-U+0F69. If ch is not in that range, this returns
* garbage. */
public static char getNominalRepresentationOfSubscribedConsonant(char ch) {
return (char)((int)ch-(((int)'\u0F90') - ((int)'\u0F40')));
}
/** Returns true iff ch corresponds to the Tibetan letter ra. /** Returns true iff ch corresponds to the Tibetan letter ra.
Several Unicode codepoints correspond to the Tibetan letter ra Several Unicode codepoints correspond to the Tibetan letter ra
(in its subscribed form or otherwise). Oftentimes, (in its subscribed form or otherwise). Oftentimes,

View file

@ -58,28 +58,46 @@ public class ACIPConverter {
ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1); ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);
if (null == al) { if (null == al) {
System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this"); System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
System.err.println("Tibetan or English input?"); System.err.println("Tibetan or English input?");
System.err.println(""); System.err.println("");
System.err.println("First " + maxErrors + " errors scanning ACIP input file: "); if (false) {
// Nobody wants to see this. FIXME: maybe somebody; have an option.
System.err.println("First " + maxErrors + " lexical errors scanning ACIP input file: ");
System.err.println(errors); System.err.println(errors);
System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again."); }
System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again.");
System.exit(1); System.exit(1);
} }
final boolean abortUponScanningError = false; // DLC MAKE ME CONFIGURABLE
// DLC NOW: BAo isn't converting.
if (errors.length() > 0) { if (errors.length() > 0) {
System.err.println("Errors scanning ACIP input file: "); System.err.println("Errors scanning ACIP input file: ");
System.err.println(errors); System.err.println(errors);
if (abortUponScanningError) {
System.err.println("Exiting; please fix input file and try again."); System.err.println("Exiting; please fix input file and try again.");
System.exit(1); System.exit(1);
} }
}
convertToUnicode(al, System.out, errors); StringBuffer warnings = new StringBuffer();
boolean putWarningsInOutput = true; // DLC make me configurable.
convertToUnicode(al, System.out, errors, warnings,
putWarningsInOutput);
if (errors.length() > 0) { if (errors.length() > 0) {
System.err.println("Errors converting ACIP input file: "); System.err.println("Errors converting ACIP input file: ");
System.err.println(errors); System.err.println(errors);
System.err.println("The output contains these errors.");
System.err.println("Exiting; please fix input file and try again."); System.err.println("Exiting; please fix input file and try again.");
System.exit(2); System.exit(2);
} }
if (warnings.length() > 0) {
System.err.println("Warnings converting ACIP input file: ");
System.err.println(warnings);
if (putWarningsInOutput)
System.err.println("The output contains these warnings.");
System.exit(2);
}
if (verbose) System.err.println("Converted " + args[1] + " perfectly."); if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
System.exit(0); System.exit(0);
} }
@ -96,19 +114,30 @@ public class ACIPConverter {
{ {
throw new Error("DLC UNIMPLEMENTED"); throw new Error("DLC UNIMPLEMENTED");
} }
// DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a
// space. Treat it as a tsheg only when it appears after a
// syllable or another tsheg.
/** Returns UTF-8 encoded Unicode. A bit indirect, so use this /** Returns UTF-8 encoded Unicode. A bit indirect, so use this
* for testing only if performance is a concern. If errors occur * for testing only if performance is a concern. If errors occur
* in scanning the ACIP or in converting a tsheg bar, then they * in scanning the ACIP or in converting a tsheg bar, then they
* are appended to errors if errors is non-null. Returns the * are appended to errors if errors is non-null, as well as
* written to the result. If warnings occur in scanning the ACIP
* or in converting a tsheg bar, then they are appended to
* warnings if warnings is non-null, and they are written to the
* result if writeWarningsToResult is true. Returns the
* conversion upon perfect success, null if errors occurred. * conversion upon perfect success, null if errors occurred.
*/ */
public static String convertToUnicode(String acip, public static String convertToUnicode(String acip,
StringBuffer errors) { StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToResult) {
ByteArrayOutputStream sw = new ByteArrayOutputStream(); ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1); ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
try { try {
if (null != al && convertToUnicode(al, sw, errors)) { if (null != al
&& convertToUnicode(al, sw, errors,
warnings, writeWarningsToResult)) {
return sw.toString("UTF-8"); return sw.toString("UTF-8");
} else { } else {
System.out.println("DLC al is " + al + " and convertToUnicode returned null."); System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
@ -119,15 +148,25 @@ public class ACIPConverter {
} }
} }
/** Writes Unicode to out. If errors occur in converting a /** Writes Unicode to out. If errors occur in converting a tsheg
* tsheg bar, then they are appended to errors if errors is * bar, then they are appended to errors if errors is non-null.
* non-null. Returns true upon perfect success, false if errors * Furthermore, errors are written to out. If writeWarningsToOut
* occurred. * is true, then warnings also will be written to out. Returns
* true upon perfect success, false if errors occurred.
* @param scan result of ACIPTshegBarScanner.scan(..)
* @param out stream to which to write converted text
* @param errors if non-null, all error messages are appended
* @param warnings if non-null, all warning messages are appended
* to this
* @param writeWarningsToOut if true, then all warning messages
* are written to out in the appropriate places
* @throws IOException if we cannot write to out * @throws IOException if we cannot write to out
*/ */
public static boolean convertToUnicode(ArrayList scan, public static boolean convertToUnicode(ArrayList scan,
OutputStream out, OutputStream out,
StringBuffer errors) StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToOut)
throws IOException throws IOException
{ {
int sz = scan.size(); int sz = scan.size();
@ -139,7 +178,7 @@ public class ACIPConverter {
int stype = s.getType(); int stype = s.getType();
if (stype == ACIPString.ERROR) { if (stype == ACIPString.ERROR) {
hasErrors = true; hasErrors = true;
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: "); writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: ");
writer.write(s.getText()); writer.write(s.getText());
writer.write("]"); writer.write("]");
} else { } else {
@ -179,6 +218,21 @@ public class ACIPConverter {
if (null != errors) if (null != errors)
errors.append(errorMessage + "\n"); errors.append(errorMessage + "\n");
} else { } else {
String warning
= pt.getWarning(false, // DLC: make me configurable
pl,
s.getText());
if (null != warning) {
if (writeWarningsToOut) {
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: ");
writer.write(warning);
writer.write("]");
}
if (null != warnings) {
warnings.append(warning);
warnings.append('\n');
}
}
unicode = sl.getUnicode(); unicode = sl.getUnicode();
if (null == unicode) throw new Error("DLC: HOW?"); if (null == unicode) throw new Error("DLC: HOW?");
} }

View file

@ -133,16 +133,18 @@ public class ACIPTshegBarScanner {
Stack bracketTypeStack = new Stack(); Stack bracketTypeStack = new Stack();
int startSlashIndex = -1; int startSlashIndex = -1;
int startParenIndex = -1; int startParenIndex = -1;
int numNewlines = 0;
for (int i = 0; i < sl; i++) { for (int i = 0; i < sl; i++) {
if (i < startOfString) throw new Error("bad reset"); if (i < startOfString) throw new Error("bad reset");
char ch; char ch;
ch = s.charAt(i); ch = s.charAt(i);
if (ch == '\n') ++numNewlines;
if (ACIPString.COMMENT == currentType && ch != ']') { if (ACIPString.COMMENT == currentType && ch != ']') {
if ('[' == ch) { if ('[' == ch) {
al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n", al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); + "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
@ -157,17 +159,18 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new ACIPString(s.substring(startOfString, i),
currentType)); currentType));
} }
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
ACIPString.ERROR));
if (!waitingForMatchingIllegalClose) { if (!waitingForMatchingIllegalClose) {
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched close bracket, ] or }.\n"); + "Found a truly unmatched close bracket, ] or }.\n");
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
waitingForMatchingIllegalClose = false; waitingForMatchingIllegalClose = false;
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; startOfString = i+1;
@ -249,6 +252,11 @@ public class ACIPTshegBarScanner {
|| s.substring(i, i + "[BP]".length()).equals("{BP}"))) { || s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
thingy = "[BP]"; thingy = "[BP]";
currentType = ACIPString.BP; currentType = ACIPString.BP;
} else if (i + "[BLANK PAGE]".length() <= sl
&& (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]")
|| s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) {
thingy = "[BLANK PAGE]";
currentType = ACIPString.BP;
} else if (i + "[ BP ]".length() <= sl } else if (i + "[ BP ]".length() <= sl
&& (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]") && (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
|| s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) { || s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
@ -414,11 +422,11 @@ public class ACIPTshegBarScanner {
// This is an error. Sometimes [COMMENTS APPEAR // This is an error. Sometimes [COMMENTS APPEAR
// WITHOUT # MARKS]. Though "... [" could cause // WITHOUT # MARKS]. Though "... [" could cause
// this too. // this too.
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString("Found an illegal open bracket: " + s.substring(i, i+1),
ACIPString.ERROR)); ACIPString.ERROR));
if (waitingForMatchingIllegalClose) { if (waitingForMatchingIllegalClose) {
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n"); + "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
@ -435,7 +443,7 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n"); + "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
@ -477,7 +485,6 @@ public class ACIPTshegBarScanner {
if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') { if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') {
if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3)) if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3))
&& !isNumeric(s.charAt(i+numdigits+4)))) { && !isNumeric(s.charAt(i+numdigits+4)))) {
al.add(new ACIPString(s.substring(i, i+numdigits+3), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10)); String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) { if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r")); inContext = inContext.substring(0, inContext.indexOf("\r"));
@ -488,8 +495,10 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n"); + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+numdigits+3; startOfString = i+numdigits+3;
@ -498,7 +507,6 @@ public class ACIPTshegBarScanner {
break; break;
} }
if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) { if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10)); String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) { if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r")); inContext = inContext.substring(0, inContext.indexOf("\r"));
@ -509,8 +517,10 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n"); + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; // DLC FIXME: skip over more? startOfString = i+1; // DLC FIXME: skip over more?
@ -572,7 +582,9 @@ public class ACIPTshegBarScanner {
} }
// This case, @NNN, must come after the @NNN{AB} case. // This case, @NNN, must come after the @NNN{AB} case.
if (i+numdigits+1 < sl && s.charAt(i+numdigits+1) == ' ') { if (i+numdigits+1 < sl && (s.charAt(i+numdigits+1) == ' '
|| s.charAt(i+numdigits+1) == '\n'
|| s.charAt(i+numdigits+1) == '\r')) {
boolean allAreNumeric = true; boolean allAreNumeric = true;
for (int k = 1; k <= numdigits; k++) { for (int k = 1; k <= numdigits; k++) {
if (!isNumeric(s.charAt(i+k))) { if (!isNumeric(s.charAt(i+k))) {
@ -591,7 +603,6 @@ public class ACIPTshegBarScanner {
} }
} }
if (startOfString == i) { if (startOfString == i) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10)); String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) { if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r")); inContext = inContext.substring(0, inContext.indexOf("\r"));
@ -602,8 +613,10 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n"); + "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; startOfString = i+1;
@ -626,9 +639,10 @@ public class ACIPTshegBarScanner {
* it means /NYA/. We warn about // for this * it means /NYA/. We warn about // for this
* reason. \\ causes a tsheg-bar error (DLC * reason. \\ causes a tsheg-bar error (DLC
* FIXME: verify this is so). */ * FIXME: verify this is so). */
al.add(new ACIPString("//", ACIPString.ERROR)); al.add(new ACIPString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
ACIPString.ERROR));
if (errors != null) { if (errors != null) {
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n"); + "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
@ -661,9 +675,10 @@ public class ACIPTshegBarScanner {
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
if (ch == '(') { if (ch == '(') {
al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR)); al.add(new ACIPString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n"); + "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else { } else {
@ -674,9 +689,10 @@ public class ACIPTshegBarScanner {
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} else { } else {
if (ch == ')') { if (ch == ')') {
al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR)); al.add(new ACIPString("Unexpected closing parenthesis, ), found.",
ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Unexpected closing parenthesis, ), found.\n"); + "Unexpected closing parenthesis, ), found.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else { } else {
@ -724,10 +740,10 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION)); ACIPString.TIBETAN_PUNCTUATION));
} else { } else {
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n"); + "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
@ -772,20 +788,25 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new ACIPString(s.substring(startOfString, i),
currentType)); currentType));
} }
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.ERROR));
if (null != errors) {
if ((int)ch == 65533) { if ((int)ch == 65533) {
errors.append("Offset " + i + ": " al.add(new ACIPString("Found an illegal, unprintable character.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal, unprintable character.\n"); + "Found an illegal, unprintable character.\n");
} else if ('\\' == ch) { } else if ('\\' == ch) {
errors.append("Offset " + i + ": " al.add(new ACIPString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n"); + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
} else { } else {
errors.append("Offset " + i + ": " al.add(new ACIPString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n"); + "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
} }
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;

View file

@ -128,7 +128,7 @@ public class PackageTest extends TestCase {
} }
{ {
TStackListList legalParses = pt.getUniqueParse(); TStackListList legalParses = pt.getUniqueParse(false);
boolean goodness2 = (expectedLegalParses == null boolean goodness2 = (expectedLegalParses == null
|| expectedLegalParses.length == legalParses.size()); || expectedLegalParses.length == legalParses.size());
for (int i = 0 ; i < legalParses.size(); i++) { for (int i = 0 ; i < legalParses.size(); i++) {
@ -139,18 +139,21 @@ public class PackageTest extends TestCase {
|| expectedLegalParses.length < i+1 || expectedLegalParses.length < i+1
|| n.equals(expectedLegalParses[i])); || n.equals(expectedLegalParses[i]));
if (!okay || !goodness2) if (!okay || !goodness2)
System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is " + expectedLegalParses[i]); System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is "
+ ((i < expectedLegalParses.length)
? expectedLegalParses[i]
: "not present"));
assertTrue(okay); assertTrue(okay);
} }
if (!goodness2) if (!goodness2)
System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses."); System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses for ACIP " + acip + ".");
assertTrue(goodness2); assertTrue(goodness2);
TStackListList allLegalParses = pt.getLegalParses(); TStackListList allLegalParses = pt.getLegalParses();
TStackListList decentParses = pt.getNonIllegalParses(); TStackListList decentParses = pt.getNonIllegalParses();
if (pt.getBestParse() == null) { if (pt.getBestParse() == null) {
if (legalParses.size() == 0) { if (legalParses.size() == 0) {
if (null != expectedBestParse && !"".equals(expectedBestParse)) { if (null != expectedBestParse && !"".equals(expectedBestParse)) {
System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for acip {" + acip + "}"); System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for ACIP {" + acip + "}");
assertTrue(false); assertTrue(false);
} }
System.out.print("ACIPNoBestParseError: There is no best parse for the ACIP {" + acip + "}; "); System.out.print("ACIPNoBestParseError: There is no best parse for the ACIP {" + acip + "}; ");
@ -163,7 +166,7 @@ public class PackageTest extends TestCase {
} }
} else { } else {
if (legalParses.size() > 1) { if (legalParses.size() > 1) {
System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for acip " + acip + ": " + legalParses); System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for ACIP " + acip + ": " + legalParses);
assertTrue(legalParses.size() == 2 assertTrue(legalParses.size() == 2
&& (legalParses.get(0).size() && (legalParses.get(0).size()
== 1 + legalParses.get(1).size())); == 1 + legalParses.get(1).size()));
@ -176,7 +179,7 @@ public class PackageTest extends TestCase {
if (null != expectedBestParse) { if (null != expectedBestParse) {
boolean good = pt.getBestParse().equals(expectedBestParse); boolean good = pt.getBestParse().equals(expectedBestParse);
if (!good) { if (!good) {
System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for acip {" + acip + "}"); System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for ACIP {" + acip + "}");
} }
assertTrue(good); assertTrue(good);
} }
@ -229,6 +232,116 @@ public class PackageTest extends TestCase {
* {@link TPairList#getACIPError()}, and {@link * {@link TPairList#getACIPError()}, and {@link
* TPairList#recoverACIP()}. */ * TPairList#recoverACIP()}. */
public void testBreakACIPIntoChunks() { public void testBreakACIPIntoChunks() {
tstHelper("GASN"); // ambiguous with regard to prefix rules
tstHelper("BARMA"); // ambiguous with regard to prefix rules
tstHelper("MARDA"); // ambiguous with regard to prefix rules
tstHelper("BBA"); // ambiguous with regard to prefix rules
tstHelper("BBLUGS"); // ambiguous with regard to prefix rules
tstHelper("BDRA"); // ambiguous with regard to prefix rules
tstHelper("BDRAG"); // ambiguous with regard to prefix rules
tstHelper("BDRA'I"); // ambiguous with regard to prefix rules
tstHelper("BDRAL"); // ambiguous with regard to prefix rules
tstHelper("BDRAN"); // ambiguous with regard to prefix rules
tstHelper("BDRANGS"); // ambiguous with regard to prefix rules
tstHelper("BDREN"); // ambiguous with regard to prefix rules
tstHelper("BDRI"); // ambiguous with regard to prefix rules
tstHelper("BDRIS"); // ambiguous with regard to prefix rules
tstHelper("BDROL"); // ambiguous with regard to prefix rules
tstHelper("BDRUG"); // ambiguous with regard to prefix rules
tstHelper("BLCAG"); // ambiguous with regard to prefix rules
tstHelper("BLCI"); // ambiguous with regard to prefix rules
tstHelper("BLKONG"); // ambiguous with regard to prefix rules
tstHelper("BLNGA"); // ambiguous with regard to prefix rules
tstHelper("BLNGAG"); // ambiguous with regard to prefix rules
tstHelper("BMA"); // ambiguous with regard to prefix rules
tstHelper("BMYOD"); // ambiguous with regard to prefix rules
tstHelper("BSALDA"); // ambiguous with regard to prefix rules
tstHelper("BSAMS"); // ambiguous with regard to prefix rules
tstHelper("BSEMS"); // ambiguous with regard to prefix rules
tstHelper("BTSAMS"); // ambiguous with regard to prefix rules
tstHelper("BTSIMS"); // ambiguous with regard to prefix rules
tstHelper("DDANG"); // ambiguous with regard to prefix rules
tstHelper("DDAR"); // ambiguous with regard to prefix rules
tstHelper("DDRANGS"); // ambiguous with regard to prefix rules
tstHelper("DDRUG"); // ambiguous with regard to prefix rules
tstHelper("DNAG"); // ambiguous with regard to prefix rules
tstHelper("DNOGS"); // ambiguous with regard to prefix rules
tstHelper("DRBAN"); // ambiguous with regard to prefix rules
tstHelper("DRGYU"); // ambiguous with regard to prefix rules
tstHelper("DRTOG"); // ambiguous with regard to prefix rules
tstHelper("DYA"); // ambiguous with regard to prefix rules
tstHelper("DYAN"); // ambiguous with regard to prefix rules
tstHelper("GDRA"); // ambiguous with regard to prefix rules
tstHelper("GDRIM"); // ambiguous with regard to prefix rules
tstHelper("GGAN"); // ambiguous with regard to prefix rules
tstHelper("GGYUR"); // ambiguous with regard to prefix rules
tstHelper("GLTAR"); // ambiguous with regard to prefix rules
tstHelper("GLTUNG"); // ambiguous with regard to prefix rules
tstHelper("GMA"); // ambiguous with regard to prefix rules
tstHelper("GMAN"); // ambiguous with regard to prefix rules
tstHelper("GMON"); // ambiguous with regard to prefix rules
tstHelper("GRDEGS"); // ambiguous with regard to prefix rules
tstHelper("GRDZU"); // ambiguous with regard to prefix rules
tstHelper("GRGYA"); // ambiguous with regard to prefix rules
tstHelper("GRNAGS"); // ambiguous with regard to prefix rules
tstHelper("GRTAN"); // ambiguous with regard to prefix rules
tstHelper("GRTOGS"); // ambiguous with regard to prefix rules
tstHelper("GRTZO"); // ambiguous with regard to prefix rules
tstHelper("GRTZOD"); // ambiguous with regard to prefix rules
tstHelper("GRTZON"); // ambiguous with regard to prefix rules
tstHelper("GSLA"); // ambiguous with regard to prefix rules
tstHelper("GSNAD"); // ambiguous with regard to prefix rules
tstHelper("GZLA"); // ambiguous with regard to prefix rules
tstHelper("MBA"); // ambiguous with regard to prefix rules
tstHelper("MBA'"); // ambiguous with regard to prefix rules
tstHelper("MBI'I"); // ambiguous with regard to prefix rules
tstHelper("MHA'A"); // ambiguous with regard to prefix rules
tstHelper("MRDA"); // ambiguous with regard to prefix rules
tstHelper("MRDO"); // ambiguous with regard to prefix rules
tstHelper("MRDZOGS"); // ambiguous with regard to prefix rules
tstHelper("MRGA"); // ambiguous with regard to prefix rules
tstHelper("MRGAD"); // ambiguous with regard to prefix rules
tstHelper("MRGAN"); // ambiguous with regard to prefix rules
tstHelper("MRJES"); // ambiguous with regard to prefix rules
tstHelper("MRJOD"); // ambiguous with regard to prefix rules
tstHelper("MRTOGS"); // ambiguous with regard to prefix rules
tstHelper("MRTOL"); // ambiguous with regard to prefix rules
tstHelper("MRTZE'I"); // ambiguous with regard to prefix rules
tstHelper("MRTZIGS"); // ambiguous with regard to prefix rules
tstHelper("MSAM"); // ambiguous with regard to prefix rules
tstHelper("MSGRIB"); // ambiguous with regard to prefix rules
tstHelper("MSKYES"); // ambiguous with regard to prefix rules
tstHelper("MSON"); // ambiguous with regard to prefix rules
tstHelper("MSOS"); // ambiguous with regard to prefix rules
tstHelper("MSTAMS"); // ambiguous with regard to prefix rules
tstHelper("MSTAN"); // ambiguous with regard to prefix rules
// If you're not careful, you'll think GGYES is a legal
// Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's
// Sanskrit, really, because GA doesn't take a GA prefix.
// This doesn't occur in ACIP input files that I've seen, but
// GGYI (S1000I.INC) and GGYUR (S5275MC4.ACT) do occur.
tstHelper("GGYES", "{G}{G}{YE}{S}",
new String[] { "{G}{G}{YE}{S}", "{G}{G+YE}{S}", "{G+G}{YE}{S}" },
new String[] { },
"{G+G}{YE}{S}");
tstHelper("DRUG", "{D}{RU}{G}",
new String[] { "{D}{RU}{G}", "{D+RU}{G}" },
new String[] { "{D+RU}{G}" },
"{D+RU}{G}");
tstHelper("d+H+d+HA", "{d+}{H+}{d+}{HA}",
new String[] { "{d+H+d+HA}" },
new String[] { "{d+H+d+HA}" });
tstHelper("Gd+H+d+HA");
tstHelper("AUTPA", "{AU}{T}{PA}", tstHelper("AUTPA", "{AU}{T}{PA}",
new String[] { "{AU}{T}{PA}", "{AU}{T+PA}" }, new String[] { "{AU}{T}{PA}", "{AU}{T+PA}" },
new String[] { }, new String[] { },
@ -249,7 +362,8 @@ public class PackageTest extends TestCase {
new String[] { "{G+R+VA}{'I}" }); new String[] { "{G+R+VA}{'I}" });
tstHelper("G-RVA'I", "{G-}{R}{VA}{'I}", tstHelper("G-RVA'I", "{G-}{R}{VA}{'I}",
new String[] { "{G}{R+VA}{'I}" }, new String[] { "{G}{R+VA}{'I}" },
new String[] { "{G}{R+VA}{'I}" }); new String[] { },
"{G}{R+VA}{'I}");
tstHelper("RVA", "{R}{VA}", tstHelper("RVA", "{R}{VA}",
new String[] { "{R+VA}" }, new String[] { "{R+VA}" },
new String[] { "{R+VA}" }); new String[] { "{R+VA}" });
@ -6967,8 +7081,8 @@ tstHelper("ZUR");
"", "",
"[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME "[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME
shelp("PAS... LA", shelp("PAS... LA",
"Offset 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n", "Offset 5 or maybe 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n",
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]"); "[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
shelp("PAS... LA", shelp("PAS... LA",
"", "",
true, true,
@ -6983,28 +7097,28 @@ tstHelper("ZUR");
shelp("", "", "[]"); shelp("", "", "[]");
shelp("[DD]", ""); shelp("[DD]", "");
shelp("[", shelp("[",
"Offset 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
shelp("{", shelp("{",
"Offset 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
shelp("DD", ""); shelp("DD", "");
shelp("DD]", shelp("DD]",
"Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); "Offset 2 or maybe 2: Found a truly unmatched close bracket, ] or }.\nOffset 2 or maybe 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); shelp("///NYA", "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
shelp("/NYA/", ""); shelp("/NYA/", "");
shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", ""); shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", "");
shelp("[LS][# A [[[[[COMMENT][LS]", shelp("[LS][# A [[[[[COMMENT][LS]",
"Offset 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" "Offset 9 or maybe 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + "Offset 10 or maybe 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + "Offset 11 or maybe 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + "Offset 12 or maybe 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); + "Offset 13 or maybe 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
shelp("[ILLEGAL COMMENT]", shelp("[ILLEGAL COMMENT]",
"Offset 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16 or maybe 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
shelp("(BSKYABS GRO)", ""); // DLC WHAT ARE THESE FOR? shelp("(BSKYABS GRO)", ""); // DLC WHAT ARE THESE FOR?
shelp("BSKYABS GRO)", "Offset 11: Unexpected closing parenthesis, ), found.\n"); shelp("BSKYABS GRO)", "Offset 11 or maybe 11: Unexpected closing parenthesis, ), found.\n");
shelp("BSKYABS GRO(", "Offset END: Unmatched open parenthesis, (, found.\n"); shelp("BSKYABS GRO(", "Offset END: Unmatched open parenthesis, (, found.\n");
shelp("((NESTAGE))", "Offset 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10: Unexpected closing parenthesis, ), found.\n"); shelp("((NESTAGE))", "Offset 1 or maybe 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10 or maybe 10: Unexpected closing parenthesis, ), found.\n");
shelp("(BA)(PA)NYA(CA)", ""); shelp("(BA)(PA)NYA(CA)", "");
shelp("NYAx", ""); shelp("NYAx", "");
shelp("NYA x", ""); shelp("NYA x", "");
@ -7033,9 +7147,9 @@ tstHelper("ZUR");
shelp("(NYA ", "Offset END: Unmatched open parenthesis, (, found.\n"); shelp("(NYA ", "Offset END: Unmatched open parenthesis, (, found.\n");
shelp("[*NYA ", "Offset END: Unmatched open bracket found. A correction does not terminate.\n"); shelp("[*NYA ", "Offset END: Unmatched open bracket found. A correction does not terminate.\n");
shelp("?", "", "[QUESTION:{?}]"); shelp("?", "", "[QUESTION:{?}]");
shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n"); shelp("KHAN~ BAR ", "Offset 4 or maybe 4: Found an illegal character, ~, with ordinal 126.\n");
shelp("[* Correction with []]", shelp("[* Correction with []]",
"Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); "Offset 5 or maybe 5: Found an illegal character, r, with ordinal 114.\nOffset 6 or maybe 6: Found an illegal character, r, with ordinal 114.\nOffset 7 or maybe 7: Found an illegal character, e, with ordinal 101.\nOffset 8 or maybe 8: Found an illegal character, c, with ordinal 99.\nOffset 14 or maybe 14: Found an illegal character, w, with ordinal 119.\nOffset 19 or maybe 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21 or maybe 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
// DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter. // DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.
@ -7051,8 +7165,8 @@ tstHelper("ZUR");
uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b"); uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b");
} }
shelp("K\\,", shelp("K\\,",
"Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", "Offset 1 or maybe 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]"); "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]");
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]"); shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
@ -7073,15 +7187,15 @@ tstHelper("ZUR");
shelp("@01A.3 ", "", "[FOLIO_MARKER:{@01A.3}, TIBETAN_PUNCTUATION:{ }]"); shelp("@01A.3 ", "", "[FOLIO_MARKER:{@01A.3}, TIBETAN_PUNCTUATION:{ }]");
shelp("@001 ", "", "[FOLIO_MARKER:{@001}, TIBETAN_PUNCTUATION:{ }]"); shelp("@001 ", "", "[FOLIO_MARKER:{@001}, TIBETAN_PUNCTUATION:{ }]");
shelp("@19-20A", shelp("@19-20A",
"Offset 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n", "Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n",
"[ERROR:{@}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur. "[ERROR:{Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur.
shelp("@[7B]", ""); shelp("@[7B]", "");
shelp("@012A.3KA", shelp("@012A.3KA",
"", "",
"[FOLIO_MARKER:{@012A.3}, TIBETAN_NON_PUNCTUATION:{KA}]"); "[FOLIO_MARKER:{@012A.3}, TIBETAN_NON_PUNCTUATION:{KA}]");
shelp("@012A.34", shelp("@012A.34",
"Offset 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n", "Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n",
"[ERROR:{@012A.}, TIBETAN_NON_PUNCTUATION:{34}]"); "[ERROR:{Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.}, TIBETAN_NON_PUNCTUATION:{34}]");
shelp("@[07B]", ""); shelp("@[07B]", "");
shelp("@[00007B]", ""); shelp("@[00007B]", "");
shelp("@7B", ""); shelp("@7B", "");
@ -7097,8 +7211,8 @@ tstHelper("ZUR");
shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT
shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT
shelp("//NYA\\\\", shelp("//NYA\\\\",
"Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5 or maybe 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6 or maybe 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]"); "[START_SLASH:{/}, ERROR:{Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}]");
} }
private static void uhelp(String acip) { private static void uhelp(String acip) {
@ -7106,7 +7220,7 @@ tstHelper("ZUR");
} }
private static void uhelp(String acip, String expectedUnicode) { private static void uhelp(String acip, String expectedUnicode) {
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
String unicode = ACIPConverter.convertToUnicode(acip, errors); String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true);
if (null == unicode) { if (null == unicode) {
if (null != expectedUnicode && "none" != expectedUnicode) { if (null != expectedUnicode && "none" != expectedUnicode) {
System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
@ -8729,22 +8843,22 @@ tstHelper("shKA");
} }
/* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit: /* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit:
BDA' þþþþ BDA'
B+DA þþþ B+DA
DBANG þþþ DBANG
D+BA þþþ D+BA
DGA' þþþþ DGA'
D+GA þþþ D+GA
DGRA þþþ DGRA
D+GRA þþþ D+GRA
DGYESþþþþþ DGYES
D+GYA þþþ D+GYA
DMAR þþþþ DMAR
D+MA þþþ D+MA
GDA' þþþþ GDA'
G+DA þþþ G+DA
GNAD þþþþ GNAD
G+NA þþþ G+NA
MNA' þþþþ MNA'
M+NA þþþ M+NA
*/ */

View file

@ -520,7 +520,8 @@ class TPairList {
* corresponds to exactly one Tibetan grapheme cluster (i.e., * corresponds to exactly one Tibetan grapheme cluster (i.e.,
* stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a * stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a
* stack all on its own. */ * stack all on its own. */
void populateWithTGCPairs(ArrayList pl, ArrayList indexList, int index) { void populateWithTGCPairs(ArrayList pl,
ArrayList indexList, int index) {
int sz = size(); int sz = size();
if (sz == 0) { if (sz == 0) {
return; return;
@ -540,8 +541,8 @@ class TPairList {
// The last pair: // The last pair:
TPair p = get(i); TPair p = get(i);
ThdlDebug.verify(!"+".equals(p.getRight())); ThdlDebug.verify(!"+".equals(p.getRight()));
int where;
boolean add_U0F7F = false; boolean add_U0F7F = false;
int where;
if (p.getRight() != null if (p.getRight() != null
&& (where = p.getRight().indexOf(':')) >= 0) { && (where = p.getRight().indexOf(':')) >= 0) {
// this ':' guy is his own TGCPair. // this ':' guy is his own TGCPair.
@ -579,27 +580,21 @@ class TPairList {
} }
TGCPair tp; TGCPair tp;
indexList.add(new Integer(index)); indexList.add(new Integer(index));
tp = new TGCPair(lWylie.toString() tp = new TGCPair(lWylie.toString(),
+ (hasNonAVowel (hasNonAVowel
? ACIPRules.getWylieForACIPVowel(p.getRight()) ? ACIPRules.getWylieForACIPVowel(p.getRight())
: ""), : ""),
(isNumeric (isNumeric
? TGCPair.OTHER ? TGCPair.TYPE_OTHER
: (hasNonAVowel
? (isSanskrit
? TGCPair.SANSKRIT_WITH_VOWEL
: (isTibetan
? TGCPair.CONSONANTAL_WITH_VOWEL
: TGCPair.OTHER))
: (isSanskrit : (isSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL ? TGCPair.TYPE_SANSKRIT
: (isTibetan : (isTibetan
? TGCPair.CONSONANTAL_WITHOUT_VOWEL ? TGCPair.TYPE_TIBETAN
: TGCPair.OTHER))))); : TGCPair.TYPE_OTHER))));
pl.add(tp); pl.add(tp);
if (add_U0F7F) { if (add_U0F7F) {
indexList.add(new Integer(index)); indexList.add(new Integer(index));
pl.add(new TGCPair("H", TGCPair.OTHER)); pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER));
} }
} }
} }

View file

@ -91,7 +91,7 @@ class TParseTree {
ParseIterator pi = getParseIterator(); ParseIterator pi = getParseIterator();
while (pi.hasNext()) { while (pi.hasNext()) {
TStackList sl = pi.next(); TStackList sl = pi.next();
if (sl.isLegalTshegBar().isLegal) { if (sl.isLegalTshegBar(false).isLegal) {
sll.add(sl); sll.add(sl);
} }
} }
@ -118,12 +118,12 @@ class TParseTree {
* a unique non-illegal parse, you get it. If there's not a * a unique non-illegal parse, you get it. If there's not a
* unique answer, null is returned. */ * unique answer, null is returned. */
// {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM! // {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM!
// DLC by using this we can get rid of single-sanskrit-gc, eh? // DLC by using this we can get rid of single-sanskrit-gc, eh?
public TStackList getBestParse() { public TStackList getBestParse() {
TStackListList up = getUniqueParse(); TStackListList up = getUniqueParse(false);
if (up.size() == 1) if (up.size() == 1)
return up.get(0); return up.get(0);
up = getNonIllegalParses(); up = getNonIllegalParses();
int sz = up.size(); int sz = up.size();
if (sz == 1) { if (sz == 1) {
@ -192,14 +192,17 @@ class TParseTree {
* legal parses if there two or more equally good parses. By * legal parses if there two or more equally good parses. By
* &quot;legal&quot;, we mean a sequence of stacks that is legal * &quot;legal&quot;, we mean a sequence of stacks that is legal
* by the rules of Tibetan tsheg bar syntax (sometimes called * by the rules of Tibetan tsheg bar syntax (sometimes called
* spelling). */ * spelling).
public TStackListList getUniqueParse() { * @param noPrefixTests true if you want to pretend that every
* stack can take every prefix, which is not the case in
* reality */
public TStackListList getUniqueParse(boolean noPrefixTests) {
TStackListList allLegalParses = new TStackListList(2); // save memory TStackListList allLegalParses = new TStackListList(2); // save memory
TStackListList legalParsesWithVowelOnRoot = new TStackListList(1); TStackListList legalParsesWithVowelOnRoot = new TStackListList(1);
ParseIterator pi = getParseIterator(); ParseIterator pi = getParseIterator();
while (pi.hasNext()) { while (pi.hasNext()) {
TStackList sl = pi.next(); TStackList sl = pi.next();
BoolPair bpa = sl.isLegalTshegBar(); BoolPair bpa = sl.isLegalTshegBar(noPrefixTests);
if (bpa.isLegal) { if (bpa.isLegal) {
if (bpa.isLegalAndHasAVowelOnRoot) if (bpa.isLegalAndHasAVowelOnRoot)
legalParsesWithVowelOnRoot.add(sl); legalParsesWithVowelOnRoot.add(sl);
@ -253,13 +256,23 @@ class TParseTree {
public String getWarning(boolean paranoid, public String getWarning(boolean paranoid,
TPairList pl, TPairList pl,
String originalACIP) { String originalACIP) {
TStackListList up = getUniqueParse();
{
TStackList bestParse = getBestParse();
TStackListList noPrefixTestsUniqueParse = getUniqueParse(true);
if (noPrefixTestsUniqueParse.size() == 1
&& !noPrefixTestsUniqueParse.get(0).equals(bestParse)) {
return "Warning: We're going with " + bestParse + ", but only because our knowledge of prefix rules says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")";
}
}
TStackListList up = getUniqueParse(false);
if (null == up || up.size() != 1) { if (null == up || up.size() != 1) {
boolean isLastStack[] = new boolean[1]; boolean isLastStack[] = new boolean[1];
TStackListList nip = getNonIllegalParses(); TStackListList nip = getNonIllegalParses();
if (nip.size() != 1) { if (nip.size() != 1) {
if (null == getBestParse()) { if (null == getBestParse()) {
return "There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; return "Warning: There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
} else { } else {
if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) { if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) {
if (isLastStack[0]) { if (isLastStack[0]) {
@ -269,7 +282,7 @@ class TParseTree {
} }
} }
if (paranoid) { if (paranoid) {
return "Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful."; return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful.";
} }
} }
} else { } else {

View file

@ -125,15 +125,17 @@ class TStackList {
* Tibetan syntax (sometimes called rules of spelling). If this * Tibetan syntax (sometimes called rules of spelling). If this
* is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will * is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will
* be true if and only if there is an explicit {A} vowel on the * be true if and only if there is an explicit {A} vowel on the
* root stack. */ * root stack.
public BoolPair isLegalTshegBar() { * @param noPrefixTests true if you want to pretend that every
// DLC handle PADMA and other Tibetanized Sanskrit fellows. Right now we only handle single-stack guys. * stack can take every prefix, which is not the case in
* reality */
public BoolPair isLegalTshegBar(boolean noPrefixTests) {
// DLC handle PADMA and other Tibetanized Sanskrit fellows consistently. Right now we only treat single-stack Sanskrit guys as legal.
TTGCList tgcList = new TTGCList(this); TTGCList tgcList = new TTGCList(this);
StringBuffer warnings = new StringBuffer(); StringBuffer warnings = new StringBuffer();
String candidateType String candidateType
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings); = TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests);
// System.out.println("DLC: " + toString() + " has candidateType " + candidateType + " and warnings " + warnings);
// preliminary answer: // preliminary answer:
boolean isLegal = (candidateType != "invalid"); boolean isLegal = (candidateType != "invalid");