TMW->Wylie conversion now takes advantage of prefix rules, i.e., the
rules that say "ya can take a ga prefix", etc.

The ACIP->Unicode converter now gives warnings (optionally, and by
default, inline).  This converter now produces output even when
lexical errors occur, but the output has errors and warnings inline.
This commit is contained in:
dchandler 2003-08-23 22:03:37 +00:00
parent 21ef657921
commit d5ad760230
14 changed files with 678 additions and 270 deletions

View file

@ -102,19 +102,23 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("gya");
ensureKeysGiveCorrectWylie("g.ya");
ensureKeysGiveCorrectWylie("bya");
ensureKeysGiveCorrectWylie("b.ya");
ensureKeysGiveCorrectWylie("b.ya", "baya");
ensureKeysGiveCorrectWylie("mya");
ensureKeysGiveCorrectWylie("m.ya");
ensureKeysGiveCorrectWylie("'ya");
ensureKeysGiveCorrectWylie("'.ya", "'ya");
ensureKeysGiveCorrectWylie("dya");
ensureKeysGiveCorrectWylie("d.ya", "dya");
ensureKeysGiveCorrectWylie("m.ya", "maya");
ensureKeysGiveCorrectWylie("'ya", "'aya");
ensureKeysGiveCorrectWylie("'.ya", "'aya");
ensureKeysGiveCorrectWylie("dya",
"daya");
ensureKeysGiveCorrectWylie("d.ya",
"daya");
ensureKeysGiveCorrectWylie("grwa");
ensureKeysGiveCorrectWylie("g.rwa");
ensureKeysGiveCorrectWylie("g.rwa",
"garwa");
ensureKeysGiveCorrectWylie("gra");
ensureKeysGiveCorrectWylie("dra");
ensureKeysGiveCorrectWylie("drwa");
ensureKeysGiveCorrectWylie("d.rwa");
ensureKeysGiveCorrectWylie("d.rwa",
"darwa");
ensureKeysGiveCorrectWylie("g.r", "gar");
ensureKeysGiveCorrectWylie("d.r", "dar");
ensureKeysGiveCorrectWylie("'.r", "'ar");
@ -134,7 +138,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("t.sa",
"tas");
ensureKeysGiveCorrectWylie("d.za");
ensureKeysGiveCorrectWylie("d.za", "daza");
ensureKeysGiveCorrectWylie("dza");
ensureKeysGiveCorrectWylie("s.ha",
@ -219,7 +223,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("b.lag");
ensureKeysGiveCorrectWylie("blg",
"blga");
"balga");
ensureKeysGiveCorrectWylie("b.las",
"bals");
@ -244,21 +248,24 @@ public class DuffPaneTest extends TestCase {
"bras");
ensureKeysGiveCorrectWylie("bras");
ensureKeysGiveCorrectWylie("d.wa");
ensureKeysGiveCorrectWylie("d.wa",
"dawa");
ensureKeysGiveCorrectWylie("dawa",
"d.wa");
"dawa");
ensureKeysGiveCorrectWylie("dwa");
ensureKeysGiveCorrectWylie("g.wa");
ensureKeysGiveCorrectWylie("g.wa",
"gawa");
ensureKeysGiveCorrectWylie("gawa",
"g.wa");
"gawa");
ensureKeysGiveCorrectWylie("gwa");
ensureKeysGiveCorrectWylie("'.wa",
"'wa");
"'awa");
ensureKeysGiveCorrectWylie("'awa",
"'wa");
ensureKeysGiveCorrectWylie("'wa");
"'awa");
ensureKeysGiveCorrectWylie("'wa",
"'awa");
ensureKeysGiveCorrectWylie("gyg",
"g.yag");
@ -282,7 +289,8 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("ma.a.asa",
"mas");
ensureKeysGiveCorrectWylie("'ka");
ensureKeysGiveCorrectWylie("'ka",
"'aka");
ensureKeysGiveCorrectWylie("'gas");
@ -319,8 +327,9 @@ public class DuffPaneTest extends TestCase {
"lamanga");
ensureKeysGiveCorrectWylie("b.m.ng",
"bmang");
ensureKeysGiveCorrectWylie("bmang");
"bamanga");
ensureKeysGiveCorrectWylie("bmang",
"bamanga");
ensureKeysGiveCorrectWylie("gdams");
ensureKeysGiveCorrectWylie("g.d.m.s.",
@ -372,7 +381,7 @@ public class DuffPaneTest extends TestCase {
ensureKeysGiveCorrectWylie("fivikikhigingicichijinyitithidinipiphibimitsitshidziwizhizi'iyirilishisihiTiThiDiNiShi");
ensureKeysGiveCorrectWylie("don't touch my coffee/that makes me very angry/supersize my drink",
"dona'ata tocha mya cofafe/thata mkes me veraya angaraya/superasize mya drinaka");
"dona'ata tocha mya cofafe/thata makesa me veraya angaraya/superasize mya drinaka");
}
}

View file

@ -28,7 +28,7 @@ zur mig nyag phran tsam gyis dge ba'i gzugs can 'dus ma byas//\par
\par
yid 'ong bzhin ras zla gzhon 'khor lo gnyis skyes la//\par
'khrul ba ster yang 'phyang mo sel byed mgo skyes kyi//\par
bai DUr mthing kha'i lan bu rab 'phyang dbyangs can ma//\par
bai DUra mthing kha'i lan bu rab 'phyang dbyangs can ma//\par
smra ba'i dbang phyug ngag gi rgyal po nyer grub mdzod//\par
\par
gangs can lha lam yangs pa'i khyon 'dir rgyal ba'i bstan pa bcu gnyis bdag po'i gur khang mchog/\par

View file

@ -25,7 +25,7 @@ package org.thdl.tib.text;
context-insensitive THDL Extended Wylie representation. NOTE
WELL: this is not a real grapheme cluster; I'm misusing the term
(FIXME). It's actually whole or part of one. It's part of one
when this is a vowel or U+0F7F alone.
when this is U+0F7F alone.
@author David Chandler */
public class TGCPair {
@ -37,14 +37,84 @@ public class TGCPair {
public static final int SANSKRIT_WITHOUT_VOWEL = 5;
public static final int SANSKRIT_WITH_VOWEL = 6;
public String wylie;
public int classification;
public TGCPair(String wylie, int classification) {
this.wylie = wylie;
this.classification = classification;
public static final int TYPE_OTHER = 31;
public static final int TYPE_SANSKRIT = 32;
public static final int TYPE_TIBETAN = 33;
// Sanskrit or Tibetan consonant, or number, or oddball:
private String consonantWylie;
private String vowelWylie;
/** Returns the consonant part of this pair exactly as stored (it
 *  may be null). */
public String getConsonantWylie() {
    return this.consonantWylie;
}
/** Returns the vowel part of this pair exactly as stored (it may
 *  be null). */
public String getVowelWylie() {
    return this.vowelWylie;
}
/** Kludge: overwrites this pair's Wylie wholesale.  The given
 *  string x is stored as the consonant part and the vowel part is
 *  cleared, so getWylie() afterwards is built from x alone. */
public void setWylie(String x) {
    this.vowelWylie = null;
    this.consonantWylie = x;
}
/** Returns the Wylie for this pair: the consonant part with every
 *  '-' character dropped, followed by the vowel part.  A part that
 *  is null contributes nothing, so the result may be the empty
 *  string but is never null. */
public String getWylie() {
    StringBuffer out = new StringBuffer();
    if (null != consonantWylie) {
        // The stored form may be {p-y}; the user wants to see {py}.
        final int len = consonantWylie.length();
        for (int pos = 0; pos < len; pos++) {
            final char c = consonantWylie.charAt(pos);
            if (c == '-')
                continue;
            out.append(c);
        }
    }
    if (null != vowelWylie) {
        out.append(vowelWylie);
    }
    return out.toString();
}
public int classification;
/** Constructs a new TGCPair with (Tibetan or Sanskrit) consonant
* consonantWylie and vowel vowelWylie. Use
* classification==TYPE_OTHER for numbers, lone vowels, marks,
* etc. Use classification==TYPE_TIBETAN for Tibetan (not
* Tibetanized Sanskrit) and classification=TYPE_SANSKRIT for
* Tibetanized Sanskrit. */
public TGCPair(String consonantWylie, String vowelWylie, int classification) {
if ("".equals(vowelWylie))
vowelWylie = null;
// Technically, we don't need the following check, but it's
// nice for consistency's sake.
if ("".equals(consonantWylie))
consonantWylie = null;
// DLC FIXME: for speed, make these assertions:
if (classification != TYPE_OTHER
&& classification != TYPE_TIBETAN
&& classification != TYPE_SANSKRIT) {
throw new IllegalArgumentException("Bad classification " + classification + ".");
}
int realClassification = -37;
if (vowelWylie == null && classification == TYPE_TIBETAN)
realClassification = CONSONANTAL_WITHOUT_VOWEL;
if (vowelWylie != null && classification == TYPE_TIBETAN)
realClassification = CONSONANTAL_WITH_VOWEL;
if (vowelWylie == null && classification == TYPE_SANSKRIT)
realClassification = SANSKRIT_WITHOUT_VOWEL;
if (vowelWylie != null && classification == TYPE_SANSKRIT)
realClassification = SANSKRIT_WITH_VOWEL;
if (consonantWylie == null) {
if (classification != TYPE_OTHER)
throw new IllegalArgumentException("That's the very definition of a lone vowel.");
realClassification = LONE_VOWEL;
} else {
if (classification == TYPE_OTHER)
realClassification = OTHER;
}
this.consonantWylie = consonantWylie;
this.vowelWylie = vowelWylie;
this.classification = realClassification;
}
public String toString() {
return "<TGCPair wylie=" + wylie + " classification="
return "<TGCPair wylie=" + getWylie() + " classification="
+ classification + "/>";
}
}

View file

@ -25,6 +25,9 @@ import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.tshegbar.LegalTshegBar;
import org.thdl.tib.text.tshegbar.UnicodeConstants;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
/**
* Provides methods for converting back and forth between Extended
@ -846,86 +849,64 @@ public class TibTextUtils implements THDLWylieConstants {
// sz is an overestimate (speeds us up, wastes some memory).
TMWGCList gcs = new TMWGCList(sz);
StringBuffer buildingUpGc = new StringBuffer();
StringBuffer buildingUpVowel = new StringBuffer(); // for {cui}, we append to this guy twice.
String nonVowelWylie = null; // for the "c" in {cui}
int pairType = TGCPair.TYPE_OTHER;
boolean consonantal_with_vowel = false;
boolean buildingUpSanskrit = false;
for (int i = 0; i < sz; i++) {
DuffCode dc = (DuffCode)glyphList.get(i);
String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie);
boolean containsWylieVowel = false;
boolean buildingUpSanskritNext = false;
if ((buildingUpSanskritNext
= TibetanMachineWeb.isWylieSanskritConsonantStack(wylie))
|| TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) {
if (buildingUpGc.length() > 0) {
gcs.add(new TGCPair(buildingUpGc.toString(),
consonantal_with_vowel
? (buildingUpSanskrit
? TGCPair.SANSKRIT_WITH_VOWEL
: TGCPair.CONSONANTAL_WITH_VOWEL)
: (buildingUpSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL
: TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
buildingUpGc.delete(0, buildingUpGc.length());
if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
gcs.add(new TGCPair(nonVowelWylie,
buildingUpVowel.toString(),
pairType));
buildingUpVowel.delete(0, buildingUpVowel.length());
}
buildingUpGc.append(wylie);
consonantal_with_vowel = false;
buildingUpSanskrit = buildingUpSanskritNext;
} else if ((containsWylieVowel
= TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie))
// We want {p-y}, not {py}.
nonVowelWylie
= TibetanMachineWeb.getHashKeyForGlyph(dc.getFontNum(), dc.getCharNum());
pairType = (buildingUpSanskritNext
? TGCPair.TYPE_SANSKRIT
: TGCPair.TYPE_TIBETAN);
} else if (TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie)
|| TibetanMachineWeb.isWylieAdornment(wylie)) {
if (buildingUpGc.length() > 0) {
buildingUpGc.append(wylie);
if (containsWylieVowel) {
if (debug)
System.out.println("DEBUG: with_vowel is true thanks to " + wylie);
consonantal_with_vowel = true;
}
// do not clear; we might have {cui} or {hUM}, e.g.
} else {
gcs.add(new TGCPair(wylie,
TGCPair.LONE_VOWEL));
consonantal_with_vowel = false;
}
buildingUpVowel.append(wylie);
} else {
// number or weird thing:
if (buildingUpGc.length() > 0) {
gcs.add(new TGCPair(buildingUpGc.toString(),
consonantal_with_vowel
? (buildingUpSanskrit
? TGCPair.SANSKRIT_WITH_VOWEL
: TGCPair.CONSONANTAL_WITH_VOWEL)
: (buildingUpSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL
: TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
buildingUpGc.delete(0, buildingUpGc.length());
if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
gcs.add(new TGCPair(nonVowelWylie,
buildingUpVowel.toString(),
pairType));
buildingUpVowel.delete(0, buildingUpVowel.length());
nonVowelWylie = null;
}
gcs.add(new TGCPair(wylie, TGCPair.OTHER));
consonantal_with_vowel = false;
buildingUpSanskrit = false;
gcs.add(new TGCPair(wylie, null, TGCPair.TYPE_OTHER));
pairType = TGCPair.TYPE_OTHER;
}
}
if (buildingUpGc.length() > 0) {
gcs.add(new TGCPair(buildingUpGc.toString(),
consonantal_with_vowel
? (buildingUpSanskrit
? TGCPair.SANSKRIT_WITH_VOWEL
: TGCPair.CONSONANTAL_WITH_VOWEL)
: (buildingUpSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL
: TGCPair.CONSONANTAL_WITHOUT_VOWEL)));
if (buildingUpVowel.length() > 0 || null != nonVowelWylie) {
gcs.add(new TGCPair(nonVowelWylie,
buildingUpVowel.toString(),
pairType));
}
buildingUpGc = null;
return gcs;
}
/** Returns a string that classifies gcs as a legal Tibetan tsheg
* bar, a single Sanskrit grapheme cluster
* ("single-sanskrit-gc"), or invalid ("invalid"). If
* noPrefixTests is true, then ggyi will be seen as a
* "prefix-root", even though gya doesn't take a ga prefix. */
public static String getClassificationOfTshegBar(TGCList gcs,
// DLC the warnings are Wylie-specific
StringBuffer warnings) {
StringBuffer warnings,
boolean noPrefixTests) {
String candidateType = null;
// Now that we have grapheme clusters, see if they match any
// of the "legal tsheg bars":
@ -937,10 +918,11 @@ public class TibTextUtils implements THDLWylieConstants {
|| TGCPair.SANSKRIT_WITH_VOWEL == cls)
return "single-sanskrit-gc";
}
TGCPair lastPair = null;
for (int i = 0; i < sz; i++) {
TGCPair tp = gcs.get(i);
int cls = tp.classification;
String wylie = tp.wylie;
String wylie = tp.getWylie();
if (TGCPair.OTHER == cls) {
if (TibetanMachineWeb.isWylieNumber(wylie)) {
if (null == candidateType) {
@ -977,25 +959,44 @@ public class TibTextUtils implements THDLWylieConstants {
// peek ahead to distinguish between ba's,
// ba'ala and ba'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root";
} else {
candidateType = "prefix/root-root/suffix";
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix/root-root/suffix";
else
candidateType = "root-suffix";
}
} else if (TibetanMachineWeb.isWylieRight(wylie)) {
candidateType = "prefix/root-root/suffix";
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix/root-root/suffix";
else
candidateType = "root-suffix";
} else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) {
candidateType = "appendaged-prefix/root";
} else {
candidateType = "prefix-root";
if (noPrefixTests
|| isLegalPrefixRootCombo(lastPair.getConsonantWylie(),
tp.getConsonantWylie()))
candidateType = "prefix-root";
else {
if (null != warnings)
warnings.append("Found what would be a prefix-root combo, but the root stack with wylie " + wylie + " does not take the prefix with wylie " + lastPair.getConsonantWylie());
candidateType = "invalid";
break;
}
}
} else if ("root" == candidateType) {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between pa's,
// pa'ala and pa'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-root";
} else {
@ -1016,7 +1017,7 @@ public class TibTextUtils implements THDLWylieConstants {
// peek ahead to distinguish between bpa's,
// bpa'ala and bpa'am:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix-root";
} else {
@ -1038,7 +1039,7 @@ public class TibTextUtils implements THDLWylieConstants {
// peek ahead to distinguish between
// gga'am and gaga'ala:
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
String nextwylie = (nexttp == null) ? "" : nexttp.getWylie();
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root-root/suffix";
} else {
@ -1120,7 +1121,11 @@ public class TibTextUtils implements THDLWylieConstants {
candidateType
= candidateType.substring("maybe-".length()).intern();
// So that we get 'am, not 'm; 'ang, not 'ng:
tp.wylie = WYLIE_aVOWEL + tp.wylie;
// FIXME: cludge: weird place to do this.
// pa'am, not pa'm is what we want, sure,
// but doing this here is ugly.
tp.setWylie(WYLIE_aVOWEL + tp.getWylie());
} else {
if (null != warnings)
warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n");
@ -1157,6 +1162,7 @@ public class TibTextUtils implements THDLWylieConstants {
} else {
throw new Error("bad cls");
}
lastPair = tp;
}
if (candidateType.startsWith("maybe-appendaged-")) {
if (null != warnings)
@ -1221,7 +1227,7 @@ public class TibTextUtils implements THDLWylieConstants {
StringBuffer wylieBuffer) {
TGCList gcs
= breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
String candidateType = getClassificationOfTshegBar(gcs, warnings);
String candidateType = getClassificationOfTshegBar(gcs, warnings, false);
int sz = gcs.size();
if (candidateType == "invalid"
|| candidateType == "single-sanskrit-gc") {
@ -1237,7 +1243,7 @@ public class TibTextUtils implements THDLWylieConstants {
for (int i = 0; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i);
int cls = tp.classification;
String wylie = tp.wylie;
String wylie = tp.getWylie();
wylieBuffer.append(wylie);
if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)
|| TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) {
@ -1290,9 +1296,9 @@ public class TibTextUtils implements THDLWylieConstants {
leftover = 3;
/* FIXME: these constants are hard-wired here, rather
* than in TibetanMachineWeb, because I'm lazy. */
String wylie1 = ((TGCPair)gcs.get(0)).wylie;
String wylie2 = ((TGCPair)gcs.get(1)).wylie;
String wylie3 = ((TGCPair)gcs.get(2)).wylie;
String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
if ((wylie1.equals("g") && (wylie2.equals("d") || wylie2.equals("n") || wylie2.equals("s")))
|| (wylie1.equals("d") && (wylie2.equals("g") || wylie2.equals("m")))
|| (wylie1.equals("b") && wylie2.equals("d"))
@ -1316,7 +1322,7 @@ public class TibTextUtils implements THDLWylieConstants {
|| "prefix/root" == candidateType
|| "root-suffix-postsuffix" == candidateType
|| "root-suffix" == candidateType) {
String wylie1 = ((TGCPair)gcs.get(0)).wylie;
String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
leftover = 1;
wylieBuffer.append(wylie1);
if (((TGCPair)gcs.get(0)).classification
@ -1330,16 +1336,16 @@ public class TibTextUtils implements THDLWylieConstants {
}
if ("root-suffix-postsuffix" == candidateType) {
leftover = 3;
String wylie2 = ((TGCPair)gcs.get(1)).wylie;
String wylie3 = ((TGCPair)gcs.get(2)).wylie;
String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
wylieBuffer.append(unambiguousPostAVowelWylie(wylie2,
wylie3));
}
} else if ("prefix-root-suffix" == candidateType
|| "prefix-root" == candidateType
|| "prefix-root-suffix-postsuffix" == candidateType) {
String wylie1 = ((TGCPair)gcs.get(0)).wylie;
String wylie2 = ((TGCPair)gcs.get(1)).wylie;
String wylie1 = ((TGCPair)gcs.get(0)).getWylie();
String wylie2 = ((TGCPair)gcs.get(1)).getWylie();
leftover = 2;
if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2))
wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2);
@ -1357,8 +1363,8 @@ public class TibTextUtils implements THDLWylieConstants {
}
if ("prefix-root-suffix-postsuffix" == candidateType) {
leftover = 4;
String wylie3 = ((TGCPair)gcs.get(2)).wylie;
String wylie4 = ((TGCPair)gcs.get(3)).wylie;
String wylie3 = ((TGCPair)gcs.get(2)).getWylie();
String wylie4 = ((TGCPair)gcs.get(3)).getWylie();
wylieBuffer.append(unambiguousPostAVowelWylie(wylie3,
wylie4));
}
@ -1371,15 +1377,15 @@ public class TibTextUtils implements THDLWylieConstants {
// append the wylie left over:
for (int i = leftover; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i);
String wylie = tp.wylie;
String wylie = tp.getWylie();
wylieBuffer.append(wylie);
}
}
}
/**
* Gets the Extended Wylie for a sequence of glyphs using Chandler's
* experimental method. This works as follows:
* Gets the Extended Wylie for a sequence of glyphs. This works as
* follows:
*
* <p>We run along until we hit whitespace or punctuation. We take
* everything before that and we see if it's a legal Tibetan tsheg bar,
@ -1480,4 +1486,90 @@ public class TibTextUtils implements THDLWylieConstants {
}
return rv;
}
/** Returns true if and only if the stack with Wylie <i>root</i>
 *  can take the prefix <i>prefix</i>.
 *
 *  <p>root is looked up as a hash key; if it is not known as given,
 *  it is retried with every '+' replaced by '-'.  The Unicode for
 *  that glyph is then examined.  A trailing wa-zur is ignored: we
 *  say (FIXME: do we say correctly?) that a stack with wa-zur can
 *  take a prefix if and only if the stack without it can.  What
 *  remains is interpreted by length as (root), as (head, root) or
 *  (root, sub), or as (head, root, sub); any other length yields
 *  false.
 *
 *  @throws Error if root is not a known hash key even after the
 *  '+' to '-' replacement, or if no Unicode is found for it
 *  @throws IllegalArgumentException if the remaining length is 1,
 *  2, or 3 and prefix is none of ACHUNG, "b", "m", "g", "d" */
private static boolean isLegalPrefixRootCombo(String prefix, String root) {
    // This will be decomposed enough.  If you can decompose it,
    // then it doesn't take a prefix!
    String key = root;
    if (!TibetanMachineWeb.isKnownHashKey(key)) {
        key = key.replace('+', '-');
        if (!TibetanMachineWeb.isKnownHashKey(key)) {
            throw new Error("root is, now, " + key); // FIXME: make this an assertion
        }
    }

    String unicode = TibetanMachineWeb.getUnicodeForWylieForGlyph(key);
    if (null == unicode)
        throw new Error("how? root is " + key); // FIXME: make this an assertion

    int len = unicode.length();
    if (UnicodeConstants.EWSUB_wa_zur == unicode.charAt(len - 1))
        --len; // forget about wa-zur: see the method comment.

    switch (len) {
    case 1:
        // (root) is the only choice.
        return stackTakesPrefix(prefix,
                                UnicodeConstants.EW_ABSENT,
                                unicode.charAt(0),
                                UnicodeConstants.EW_ABSENT);
    case 2: {
        // (head, root) and (root, sub) are possibilities.
        char first = unicode.charAt(0);
        char second = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(unicode.charAt(1));
        return stackTakesPrefix(prefix, first, second, UnicodeConstants.EW_ABSENT)
            || stackTakesPrefix(prefix, UnicodeConstants.EW_ABSENT, first, second);
    }
    case 3: {
        // (head, root, sub) is the only choice.
        char first = unicode.charAt(0);
        char second = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(unicode.charAt(1));
        char third = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(unicode.charAt(2));
        return stackTakesPrefix(prefix, first, second, third);
    }
    default:
        return false;
    }
}

/** Asks LegalTshegBar whether the stack (head, root, sub) takes
 *  the given prefix, choosing the takes* method by the prefix's
 *  Wylie.
 *
 *  @throws IllegalArgumentException if prefix is none of ACHUNG,
 *  "b", "m", "g", "d" */
private static boolean stackTakesPrefix(String prefix, char head, char root, char sub) {
    if (ACHUNG.equals(prefix))
        return LegalTshegBar.takesAchungPrefix(head, root, sub);
    if ("b".equals(prefix))
        return LegalTshegBar.takesBao(head, root, sub);
    if ("m".equals(prefix))
        return LegalTshegBar.takesMao(head, root, sub);
    if ("g".equals(prefix))
        return LegalTshegBar.takesGao(head, root, sub);
    if ("d".equals(prefix))
        return LegalTshegBar.takesDao(head, root, sub);
    throw new IllegalArgumentException("prefix is " + prefix);
}
}

View file

@ -178,14 +178,19 @@ public class TibetanMachineWeb implements THDLWylieConstants {
// NOTE WELL: if you delete from consonants, numbers, vowels, or
// others, you'll change the way Jskad's Extended Wylie keyboard
// works, yes, but you'll also change TMW->Wylie.
// NOTE WELL: if you delete from tibetanConsonants,
// otherConsonants, numbers, vowels, or others, you'll change the
// way Jskad's Extended Wylie keyboard works, yes, but you'll also
// change TMW->Wylie.
/** comma-delimited list of supported consonants (Tibetan and
Tibetanized Sanskrit): */
private static final String consonants
= "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz";
/** comma-delimited list of supported Tibetan consonants: */
private static final String tibetanConsonants
= "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a";
/** comma-delimited list of supported non-Tibetan consonants, such
* as Sanskrit consonants: */
private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit.
= "T,Th,D,N,Sh,v,f,Dz";
/** comma-delimited list of supported numbers (superscribed,
subscribed, normal, half-numerals): */
@ -371,7 +376,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
charSet = new HashSet();
tibSet = new HashSet();
sTok = new StringTokenizer(consonants, ",");
sTok = new StringTokenizer(tibetanConsonants, ",");
while (sTok.hasMoreTokens()) {
String ntk;
charSet.add(ntk = sTok.nextToken());
@ -379,6 +384,15 @@ public class TibetanMachineWeb implements THDLWylieConstants {
validInputSequences.put(ntk, anyOldObjectWillDo);
}
sanskritStackSet = new HashSet();
sTok = new StringTokenizer(otherConsonants, ",");
while (sTok.hasMoreTokens()) {
String ntk;
charSet.add(ntk = sTok.nextToken());
sanskritStackSet.add(ntk);
validInputSequences.put(ntk, anyOldObjectWillDo);
}
numberSet = new HashSet();
sTok = new StringTokenizer(numbers, ",");
while (sTok.hasMoreTokens()) {
@ -386,7 +400,7 @@ public class TibetanMachineWeb implements THDLWylieConstants {
// do it in <?Input:Numbers?> so that Jskad has the same
// TMW->Wylie conversion regardless of whether or not it
// chooses to support inputting numbers. Likewise for
// consonants, others, and vowels.
// tibetanConsonants, otherConsonants, others, and vowels.
String ntk;
charSet.add(ntk = sTok.nextToken());
numberSet.add(ntk);
@ -427,8 +441,6 @@ public class TibetanMachineWeb implements THDLWylieConstants {
boolean ignore = false;
sanskritStackSet = new HashSet();
while ((line = in.readLine()) != null) {
if (line.startsWith("<?")) { //line is command
if (line.equalsIgnoreCase("<?Consonants?>")) {
@ -1182,6 +1194,23 @@ public static boolean hasGlyph(String hashKey) {
return true;
}
/** Returns the Unicode correspondence for the Wylie wylie, which
 *  must be Wylie returned by getWylieForGlyph(int, int, boolean[]).
 *  The glyph is looked up with getGlyph(wylie) and the result of
 *  mapTMWtoUnicode for that glyph is returned as-is; per the
 *  original documentation that is null if the Unicode
 *  correspondence is nonexistent or unknown.  NOTE(review):
 *  getGlyph throws an Error for a key it does not know, so an
 *  unknown wylie does not yield null here -- confirm callers only
 *  pass known keys. */
public static String getUnicodeForWylieForGlyph(String wylie) {
    final DuffCode glyph = getGlyph(wylie);
    final int fontIndex = glyph.getFontNum() - 1;
    return mapTMWtoUnicode(fontIndex, glyph.getCharNum());
}
/**
* Tells whether hashKey is a known hash key (one from tibwn.ini),
* i.e. whether the hash-key table has an entry for it.
* @param hashKey the candidate hash key
* @return true if and only if an entry exists for hashKey
*/
public static boolean isKnownHashKey(String hashKey) {
    final DuffCode[] glyphs = (DuffCode[])tibHash.get(hashKey);
    if (glyphs == null) {
        return false;
    }
    return true;
}
/**
* Gets a glyph for this hash key. Hash keys are not identical to Extended
* Wylie. The hash key for a Tibetan stack separates the members of the stack
@ -1193,7 +1222,7 @@ public static boolean hasGlyph(String hashKey) {
public static DuffCode getGlyph(String hashKey) {
DuffCode[] dc = (DuffCode[])tibHash.get(hashKey);
if (null == dc)
throw new Error("It is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears.");
throw new Error("Hash key " + hashKey + " not found; it is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears.");
return dc[TMW];
}

View file

@ -98,13 +98,6 @@ __TILDE__~93,5~~9,91~~~~~~~none
<?Input:Tibetan?>
// 0F5F,0F39 might work, but the OpenType font's author must've had
// Dza in mind if it does. Note that the bottommost horizontal stroke
// goes upward on U+0F5F and downward on U+0F5B.
Dz~146,5~~10,42~~~~~~~none
f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39
v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39
k~33,1~1,92~1,33~1,109~1,111~1,123~1,125~10,118~10,120~0F40
kh~34,1~~1,34~1,109~1,118~1,123~1,125~10,114~10,123~0F41
g~35,1~1,93~1,35~1,109~1,111~1,123~1,125~10,118~10,120~0F42
@ -135,11 +128,6 @@ sh~59,1~1,99~1,60~1,109~1,111~1,123~1,125~10,118~10,120~0F64
s~60,1~~1,61~1,109~1,118~1,123~1,125~10,114~10,123~0F66
h~61,1~1,100~1,62~1,109~1,112~1,123~1,125~10,115~10,122~0F67~1,102
a~62,1~~1,63~1,109~1,118~1,123~1,125~10,114~10,123~0F68
T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A
Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B
D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C
N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E
Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65
r-k~63,1~~1,70~1,109~1,121~1,123~1,125~10,115~10,124~f62,f90
r-g~64,1~~1,71~1,109~1,121~1,123~1,125~10,115~10,124~f62,f92
r-ng~65,1~~1,72~1,109~1,119~1,123~1,125~10,115~10,124~f62,f94
@ -241,6 +229,17 @@ au~237,1~~8,89~~~~~~~0F7D~~8,104
// DLC FIXME: need -I as well
<?Input:Sanskrit?>
// 0F5F,0F39 might work, but the OpenType font's author must've had
// Dza in mind if it does. Note that the bottommost horizontal stroke
// goes upward on U+0F5F and downward on U+0F5B.
Dz~146,5~~10,42~~~~~~~none
f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39
v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39
T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A
Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B
D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C
N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E
Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65
k+Sh~175,1~~1,69~1,109~1,122~1,123~1,125~10,116~10,125~0F69
k+k~33,2~~3,33~1,109~4,120~1,123~1,125~4,106~4,113~f40,f90
k+kh~34,2~~3,34~1,109~4,120~1,123~1,125~4,106~4,113~f40,f91

View file

@ -1266,7 +1266,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */
static boolean takesGao(char head, char root, char sub) {
public static boolean takesGao(char head, char root, char sub) {
if (EW_ABSENT == head) {
if (EW_ABSENT == sub) {
return (EWC_ca == root
@ -1298,7 +1298,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */
static boolean takesDao(char head, char root, char sub) {
public static boolean takesDao(char head, char root, char sub) {
if (EW_ABSENT == head) {
if (EW_ABSENT == sub) {
return (EWC_ka == root
@ -1312,6 +1312,7 @@ public final class LegalTshegBar
|| (EWC_pa == root && EWC_ya == sub)
|| (EWC_ba == root && EWC_ya == sub)
|| (EWC_ma == root && EWC_ya == sub)
|| (EWC_ka == root && EWC_ya == sub) // dkyil, for example
|| (EWC_ka == root && EWC_ra == sub)
|| (EWC_ga == root && EWC_ra == sub)
@ -1336,7 +1337,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */
static boolean takesAchungPrefix(char head, char root, char sub) {
public static boolean takesAchungPrefix(char head, char root, char sub) {
if (EW_ABSENT == head) {
if (EW_ABSENT == sub) {
return (EWC_ga == root
@ -1379,7 +1380,7 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */
static boolean takesMao(char head, char root, char sub) {
public static boolean takesMao(char head, char root, char sub) {
if (EW_ABSENT == head) {
if (EW_ABSENT == sub) {
return (EWC_kha == root
@ -1418,11 +1419,12 @@ public final class LegalTshegBar
* @param sub the {@link #isNominalRepresentationOfConsonant(char)
* nominal representation} of the subjoined letter, or EW_ABSENT
* if not present */
static boolean takesBao(char head, char root, char sub) {
public static boolean takesBao(char head, char root, char sub) {
// DLC ask Ten-lo la about Wazur.
if (EW_ABSENT == head) {
if (EW_ABSENT == sub) {
return (EWC_ka == root
|| EWC_sa == root // bsams, for example
|| EWC_ca == root
|| EWC_ta == root
|| EWC_tsa == root

View file

@ -232,6 +232,14 @@ public class UnicodeUtils implements UnicodeConstants {
/* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */
}
/** Maps a subscribed (subjoined) consonant onto its nominal
 *  counterpart by shifting the code point down by the fixed
 *  distance between U+0F90 and U+0F40.  For ch in U+0F90-U+0F97 or
 *  U+0F99-U+0FB9 the result is the same consonant in the range
 *  U+0F40-U+0F69.  No range check is made; for any other ch the
 *  result is garbage.
 *  @param ch a subscribed consonant
 *  @return ch moved down by (U+0F90 - U+0F40), narrowed to a char */
public static char getNominalRepresentationOfSubscribedConsonant(char ch) {
    // Constant gap between the subscribed block and the nominal block.
    final int distance = '\u0F90' - '\u0F40';
    return (char) (ch - distance);
}
/** Returns true iff ch corresponds to the Tibetan letter ra.
Several Unicode codepoints correspond to the Tibetan letter ra
(in its subscribed form or otherwise). Oftentimes,

View file

@ -58,28 +58,46 @@ public class ACIPConverter {
ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);
if (null == al) {
System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
System.err.println("Tibetan or English input?");
System.err.println("");
System.err.println("First " + maxErrors + " errors scanning ACIP input file: ");
System.err.println(errors);
System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
if (false) {
// Nobody wants to see this. FIXME: maybe somebody; have an option.
System.err.println("First " + maxErrors + " lexical errors scanning ACIP input file: ");
System.err.println(errors);
}
System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again.");
System.exit(1);
}
final boolean abortUponScanningError = false; // DLC MAKE ME CONFIGURABLE
// DLC NOW: BAo isn't converting.
if (errors.length() > 0) {
System.err.println("Errors scanning ACIP input file: ");
System.err.println(errors);
System.err.println("Exiting; please fix input file and try again.");
System.exit(1);
if (abortUponScanningError) {
System.err.println("Exiting; please fix input file and try again.");
System.exit(1);
}
}
convertToUnicode(al, System.out, errors);
StringBuffer warnings = new StringBuffer();
boolean putWarningsInOutput = true; // DLC make me configurable.
convertToUnicode(al, System.out, errors, warnings,
putWarningsInOutput);
if (errors.length() > 0) {
System.err.println("Errors converting ACIP input file: ");
System.err.println(errors);
System.err.println("The output contains these errors.");
System.err.println("Exiting; please fix input file and try again.");
System.exit(2);
}
if (warnings.length() > 0) {
System.err.println("Warnings converting ACIP input file: ");
System.err.println(warnings);
if (putWarningsInOutput)
System.err.println("The output contains these warnings.");
System.exit(2);
}
if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
System.exit(0);
}
@ -96,19 +114,30 @@ public class ACIPConverter {
{
throw new Error("DLC UNIMPLEMENTED");
}
// DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a
// space. Treat it as a tsheg only when it appears after a
// syllable or another tsheg.
/** Returns UTF-8 encoded Unicode. This is a bit indirect, so if
* performance is a concern, use this for testing only. If errors occur
* in scanning the ACIP or in converting a tsheg bar, then they
* are appended to errors if errors is non-null. Returns the
* are appended to errors if errors is non-null, as well as
* written to the result. If warnings occur in scanning the ACIP
* or in converting a tsheg bar, then they are appended to
* warnings if warnings is non-null, and they are written to the
* result if writeWarningsToResult is true. Returns the
* conversion upon perfect success, null if errors occurred.
*/
public static String convertToUnicode(String acip,
StringBuffer errors) {
StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToResult) {
ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
try {
if (null != al && convertToUnicode(al, sw, errors)) {
if (null != al
&& convertToUnicode(al, sw, errors,
warnings, writeWarningsToResult)) {
return sw.toString("UTF-8");
} else {
System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
@ -119,15 +148,25 @@ public class ACIPConverter {
}
}
/** Writes Unicode to out. If errors occur in converting a
* tsheg bar, then they are appended to errors if errors is
* non-null. Returns true upon perfect success, false if errors
* occurred.
/** Writes Unicode to out. If errors occur in converting a tsheg
* bar, then they are appended to errors if errors is non-null.
* Furthermore, errors are written to out. If writeWarningsToOut
* is true, then warnings also will be written to out. Returns
* true upon perfect success, false if errors occurred.
* @param scan result of ACIPTshegBarScanner.scan(..)
* @param out stream to which to write converted text
* @param errors if non-null, all error messages are appended
* to this
* @param warnings if non-null, all warning messages are appended
* to this
* @param writeWarningsToOut if true, then all warning messages
* are written to out in the appropriate places
* @throws IOException if we cannot write to out
*/
public static boolean convertToUnicode(ArrayList scan,
OutputStream out,
StringBuffer errors)
StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToOut)
throws IOException
{
int sz = scan.size();
@ -139,7 +178,7 @@ public class ACIPConverter {
int stype = s.getType();
if (stype == ACIPString.ERROR) {
hasErrors = true;
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: ");
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: ");
writer.write(s.getText());
writer.write("]");
} else {
@ -179,6 +218,21 @@ public class ACIPConverter {
if (null != errors)
errors.append(errorMessage + "\n");
} else {
String warning
= pt.getWarning(false, // DLC: make me configurable
pl,
s.getText());
if (null != warning) {
if (writeWarningsToOut) {
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: ");
writer.write(warning);
writer.write("]");
}
if (null != warnings) {
warnings.append(warning);
warnings.append('\n');
}
}
unicode = sl.getUnicode();
if (null == unicode) throw new Error("DLC: HOW?");
}

View file

@ -133,16 +133,18 @@ public class ACIPTshegBarScanner {
Stack bracketTypeStack = new Stack();
int startSlashIndex = -1;
int startParenIndex = -1;
int numNewlines = 0;
for (int i = 0; i < sl; i++) {
if (i < startOfString) throw new Error("bad reset");
char ch;
ch = s.charAt(i);
if (ch == '\n') ++numNewlines;
if (ACIPString.COMMENT == currentType && ch != ']') {
if ('[' == ch) {
al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
@ -157,17 +159,18 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
}
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
ACIPString.ERROR));
if (!waitingForMatchingIllegalClose) {
if (null != errors) {
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched close bracket, ] or }.\n");
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
waitingForMatchingIllegalClose = false;
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1;
@ -249,6 +252,11 @@ public class ACIPTshegBarScanner {
|| s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
thingy = "[BP]";
currentType = ACIPString.BP;
} else if (i + "[BLANK PAGE]".length() <= sl
&& (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]")
|| s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) {
thingy = "[BLANK PAGE]";
currentType = ACIPString.BP;
} else if (i + "[ BP ]".length() <= sl
&& (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
|| s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
@ -414,11 +422,11 @@ public class ACIPTshegBarScanner {
// This is an error. Sometimes [COMMENTS APPEAR
// WITHOUT # MARKS]. Though "... [" could cause
// this too.
al.add(new ACIPString(s.substring(i, i+1),
al.add(new ACIPString("Found an illegal open bracket: " + s.substring(i, i+1),
ACIPString.ERROR));
if (waitingForMatchingIllegalClose) {
if (null != errors) {
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
@ -435,7 +443,7 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
@ -477,7 +485,6 @@ public class ACIPTshegBarScanner {
if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') {
if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3))
&& !isNumeric(s.charAt(i+numdigits+4)))) {
al.add(new ACIPString(s.substring(i, i+numdigits+3), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r"));
@ -488,8 +495,10 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+numdigits+3;
@ -498,7 +507,6 @@ public class ACIPTshegBarScanner {
break;
}
if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r"));
@ -509,8 +517,10 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; // DLC FIXME: skip over more?
@ -572,7 +582,9 @@ public class ACIPTshegBarScanner {
}
// This case, @NNN, must come after the @NNN{AB} case.
if (i+numdigits+1 < sl && s.charAt(i+numdigits+1) == ' ') {
if (i+numdigits+1 < sl && (s.charAt(i+numdigits+1) == ' '
|| s.charAt(i+numdigits+1) == '\n'
|| s.charAt(i+numdigits+1) == '\r')) {
boolean allAreNumeric = true;
for (int k = 1; k <= numdigits; k++) {
if (!isNumeric(s.charAt(i+k))) {
@ -591,7 +603,6 @@ public class ACIPTshegBarScanner {
}
}
if (startOfString == i) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r"));
@ -602,8 +613,10 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1;
@ -626,9 +639,10 @@ public class ACIPTshegBarScanner {
* it means /NYA/. We warn about // for this
* reason. \\ causes a tsheg-bar error (DLC
* FIXME: verify this is so). */
al.add(new ACIPString("//", ACIPString.ERROR));
al.add(new ACIPString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
ACIPString.ERROR));
if (errors != null) {
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
@ -661,9 +675,10 @@ public class ACIPTshegBarScanner {
if (startParenIndex >= 0) {
if (ch == '(') {
al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR));
al.add(new ACIPString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else {
@ -674,9 +689,10 @@ public class ACIPTshegBarScanner {
currentType = ACIPString.ERROR;
} else {
if (ch == ')') {
al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR));
al.add(new ACIPString("Unexpected closing parenthesis, ), found.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Unexpected closing parenthesis, ), found.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else {
@ -724,10 +740,10 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION));
} else {
al.add(new ACIPString(s.substring(i, i+1),
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
@ -772,19 +788,24 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
}
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.ERROR));
if (null != errors) {
if ((int)ch == 65533) {
errors.append("Offset " + i + ": "
if ((int)ch == 65533) {
al.add(new ACIPString("Found an illegal, unprintable character.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal, unprintable character.\n");
} else if ('\\' == ch) {
errors.append("Offset " + i + ": "
} else if ('\\' == ch) {
al.add(new ACIPString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
} else {
errors.append("Offset " + i + ": "
} else {
al.add(new ACIPString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
}
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1;

View file

@ -128,7 +128,7 @@ public class PackageTest extends TestCase {
}
{
TStackListList legalParses = pt.getUniqueParse();
TStackListList legalParses = pt.getUniqueParse(false);
boolean goodness2 = (expectedLegalParses == null
|| expectedLegalParses.length == legalParses.size());
for (int i = 0 ; i < legalParses.size(); i++) {
@ -139,18 +139,21 @@ public class PackageTest extends TestCase {
|| expectedLegalParses.length < i+1
|| n.equals(expectedLegalParses[i]));
if (!okay || !goodness2)
System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is " + expectedLegalParses[i]);
System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is "
+ ((i < expectedLegalParses.length)
? expectedLegalParses[i]
: "not present"));
assertTrue(okay);
}
if (!goodness2)
System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses.");
System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses for ACIP " + acip + ".");
assertTrue(goodness2);
TStackListList allLegalParses = pt.getLegalParses();
TStackListList decentParses = pt.getNonIllegalParses();
if (pt.getBestParse() == null) {
if (legalParses.size() == 0) {
if (null != expectedBestParse && !"".equals(expectedBestParse)) {
System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for acip {" + acip + "}");
System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for ACIP {" + acip + "}");
assertTrue(false);
}
System.out.print("ACIPNoBestParseError: There is no best parse for the ACIP {" + acip + "}; ");
@ -163,7 +166,7 @@ public class PackageTest extends TestCase {
}
} else {
if (legalParses.size() > 1) {
System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for acip " + acip + ": " + legalParses);
System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for ACIP " + acip + ": " + legalParses);
assertTrue(legalParses.size() == 2
&& (legalParses.get(0).size()
== 1 + legalParses.get(1).size()));
@ -176,7 +179,7 @@ public class PackageTest extends TestCase {
if (null != expectedBestParse) {
boolean good = pt.getBestParse().equals(expectedBestParse);
if (!good) {
System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for acip {" + acip + "}");
System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for ACIP {" + acip + "}");
}
assertTrue(good);
}
@ -229,6 +232,116 @@ public class PackageTest extends TestCase {
* {@link TPairList#getACIPError()}, and {@link
* TPairList#recoverACIP()}. */
public void testBreakACIPIntoChunks() {
tstHelper("GASN"); // ambiguous with regard to prefix rules
tstHelper("BARMA"); // ambiguous with regard to prefix rules
tstHelper("MARDA"); // ambiguous with regard to prefix rules
tstHelper("BBA"); // ambiguous with regard to prefix rules
tstHelper("BBLUGS"); // ambiguous with regard to prefix rules
tstHelper("BDRA"); // ambiguous with regard to prefix rules
tstHelper("BDRAG"); // ambiguous with regard to prefix rules
tstHelper("BDRA'I"); // ambiguous with regard to prefix rules
tstHelper("BDRAL"); // ambiguous with regard to prefix rules
tstHelper("BDRAN"); // ambiguous with regard to prefix rules
tstHelper("BDRANGS"); // ambiguous with regard to prefix rules
tstHelper("BDREN"); // ambiguous with regard to prefix rules
tstHelper("BDRI"); // ambiguous with regard to prefix rules
tstHelper("BDRIS"); // ambiguous with regard to prefix rules
tstHelper("BDROL"); // ambiguous with regard to prefix rules
tstHelper("BDRUG"); // ambiguous with regard to prefix rules
tstHelper("BLCAG"); // ambiguous with regard to prefix rules
tstHelper("BLCI"); // ambiguous with regard to prefix rules
tstHelper("BLKONG"); // ambiguous with regard to prefix rules
tstHelper("BLNGA"); // ambiguous with regard to prefix rules
tstHelper("BLNGAG"); // ambiguous with regard to prefix rules
tstHelper("BMA"); // ambiguous with regard to prefix rules
tstHelper("BMYOD"); // ambiguous with regard to prefix rules
tstHelper("BSALDA"); // ambiguous with regard to prefix rules
tstHelper("BSAMS"); // ambiguous with regard to prefix rules
tstHelper("BSEMS"); // ambiguous with regard to prefix rules
tstHelper("BTSAMS"); // ambiguous with regard to prefix rules
tstHelper("BTSIMS"); // ambiguous with regard to prefix rules
tstHelper("DDANG"); // ambiguous with regard to prefix rules
tstHelper("DDAR"); // ambiguous with regard to prefix rules
tstHelper("DDRANGS"); // ambiguous with regard to prefix rules
tstHelper("DDRUG"); // ambiguous with regard to prefix rules
tstHelper("DNAG"); // ambiguous with regard to prefix rules
tstHelper("DNOGS"); // ambiguous with regard to prefix rules
tstHelper("DRBAN"); // ambiguous with regard to prefix rules
tstHelper("DRGYU"); // ambiguous with regard to prefix rules
tstHelper("DRTOG"); // ambiguous with regard to prefix rules
tstHelper("DYA"); // ambiguous with regard to prefix rules
tstHelper("DYAN"); // ambiguous with regard to prefix rules
tstHelper("GDRA"); // ambiguous with regard to prefix rules
tstHelper("GDRIM"); // ambiguous with regard to prefix rules
tstHelper("GGAN"); // ambiguous with regard to prefix rules
tstHelper("GGYUR"); // ambiguous with regard to prefix rules
tstHelper("GLTAR"); // ambiguous with regard to prefix rules
tstHelper("GLTUNG"); // ambiguous with regard to prefix rules
tstHelper("GMA"); // ambiguous with regard to prefix rules
tstHelper("GMAN"); // ambiguous with regard to prefix rules
tstHelper("GMON"); // ambiguous with regard to prefix rules
tstHelper("GRDEGS"); // ambiguous with regard to prefix rules
tstHelper("GRDZU"); // ambiguous with regard to prefix rules
tstHelper("GRGYA"); // ambiguous with regard to prefix rules
tstHelper("GRNAGS"); // ambiguous with regard to prefix rules
tstHelper("GRTAN"); // ambiguous with regard to prefix rules
tstHelper("GRTOGS"); // ambiguous with regard to prefix rules
tstHelper("GRTZO"); // ambiguous with regard to prefix rules
tstHelper("GRTZOD"); // ambiguous with regard to prefix rules
tstHelper("GRTZON"); // ambiguous with regard to prefix rules
tstHelper("GSLA"); // ambiguous with regard to prefix rules
tstHelper("GSNAD"); // ambiguous with regard to prefix rules
tstHelper("GZLA"); // ambiguous with regard to prefix rules
tstHelper("MBA"); // ambiguous with regard to prefix rules
tstHelper("MBA'"); // ambiguous with regard to prefix rules
tstHelper("MBI'I"); // ambiguous with regard to prefix rules
tstHelper("MHA'A"); // ambiguous with regard to prefix rules
tstHelper("MRDA"); // ambiguous with regard to prefix rules
tstHelper("MRDO"); // ambiguous with regard to prefix rules
tstHelper("MRDZOGS"); // ambiguous with regard to prefix rules
tstHelper("MRGA"); // ambiguous with regard to prefix rules
tstHelper("MRGAD"); // ambiguous with regard to prefix rules
tstHelper("MRGAN"); // ambiguous with regard to prefix rules
tstHelper("MRJES"); // ambiguous with regard to prefix rules
tstHelper("MRJOD"); // ambiguous with regard to prefix rules
tstHelper("MRTOGS"); // ambiguous with regard to prefix rules
tstHelper("MRTOL"); // ambiguous with regard to prefix rules
tstHelper("MRTZE'I"); // ambiguous with regard to prefix rules
tstHelper("MRTZIGS"); // ambiguous with regard to prefix rules
tstHelper("MSAM"); // ambiguous with regard to prefix rules
tstHelper("MSGRIB"); // ambiguous with regard to prefix rules
tstHelper("MSKYES"); // ambiguous with regard to prefix rules
tstHelper("MSON"); // ambiguous with regard to prefix rules
tstHelper("MSOS"); // ambiguous with regard to prefix rules
tstHelper("MSTAMS"); // ambiguous with regard to prefix rules
tstHelper("MSTAN"); // ambiguous with regard to prefix rules
// If you're not careful, you'll think GGYES is a legal
// Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's
// Sanskrit, really, because GA doesn't take a GA prefix.
// This doesn't occur in ACIP input files that I've seen, but
// GGYI (S1000I.INC) and GGYUR (S5275MC4.ACT) do occur.
tstHelper("GGYES", "{G}{G}{YE}{S}",
new String[] { "{G}{G}{YE}{S}", "{G}{G+YE}{S}", "{G+G}{YE}{S}" },
new String[] { },
"{G+G}{YE}{S}");
tstHelper("DRUG", "{D}{RU}{G}",
new String[] { "{D}{RU}{G}", "{D+RU}{G}" },
new String[] { "{D+RU}{G}" },
"{D+RU}{G}");
tstHelper("d+H+d+HA", "{d+}{H+}{d+}{HA}",
new String[] { "{d+H+d+HA}" },
new String[] { "{d+H+d+HA}" });
tstHelper("Gd+H+d+HA");
tstHelper("AUTPA", "{AU}{T}{PA}",
new String[] { "{AU}{T}{PA}", "{AU}{T+PA}" },
new String[] { },
@ -249,7 +362,8 @@ public class PackageTest extends TestCase {
new String[] { "{G+R+VA}{'I}" });
tstHelper("G-RVA'I", "{G-}{R}{VA}{'I}",
new String[] { "{G}{R+VA}{'I}" },
new String[] { "{G}{R+VA}{'I}" });
new String[] { },
"{G}{R+VA}{'I}");
tstHelper("RVA", "{R}{VA}",
new String[] { "{R+VA}" },
new String[] { "{R+VA}" });
@ -6967,8 +7081,8 @@ tstHelper("ZUR");
"",
"[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME
shelp("PAS... LA",
"Offset 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n",
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
"Offset 5 or maybe 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n",
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
shelp("PAS... LA",
"",
true,
@ -6983,28 +7097,28 @@ tstHelper("ZUR");
shelp("", "", "[]");
shelp("[DD]", "");
shelp("[",
"Offset 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
"Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
shelp("{",
"Offset 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
"Offset 0 or maybe 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
shelp("DD", "");
shelp("DD]",
"Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
"Offset 2 or maybe 2: Found a truly unmatched close bracket, ] or }.\nOffset 2 or maybe 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
shelp("///NYA", "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
shelp("/NYA/", "");
shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", "");
shelp("[LS][# A [[[[[COMMENT][LS]",
"Offset 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
"Offset 9 or maybe 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 10 or maybe 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 11 or maybe 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 12 or maybe 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
+ "Offset 13 or maybe 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
shelp("[ILLEGAL COMMENT]",
"Offset 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
"Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16 or maybe 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
shelp("(BSKYABS GRO)", ""); // DLC WHAT ARE THESE FOR?
shelp("BSKYABS GRO)", "Offset 11: Unexpected closing parenthesis, ), found.\n");
shelp("BSKYABS GRO)", "Offset 11 or maybe 11: Unexpected closing parenthesis, ), found.\n");
shelp("BSKYABS GRO(", "Offset END: Unmatched open parenthesis, (, found.\n");
shelp("((NESTAGE))", "Offset 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10: Unexpected closing parenthesis, ), found.\n");
shelp("((NESTAGE))", "Offset 1 or maybe 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10 or maybe 10: Unexpected closing parenthesis, ), found.\n");
shelp("(BA)(PA)NYA(CA)", "");
shelp("NYAx", "");
shelp("NYA x", "");
@ -7033,9 +7147,9 @@ tstHelper("ZUR");
shelp("(NYA ", "Offset END: Unmatched open parenthesis, (, found.\n");
shelp("[*NYA ", "Offset END: Unmatched open bracket found. A correction does not terminate.\n");
shelp("?", "", "[QUESTION:{?}]");
shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n");
shelp("KHAN~ BAR ", "Offset 4 or maybe 4: Found an illegal character, ~, with ordinal 126.\n");
shelp("[* Correction with []]",
"Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
"Offset 5 or maybe 5: Found an illegal character, r, with ordinal 114.\nOffset 6 or maybe 6: Found an illegal character, r, with ordinal 114.\nOffset 7 or maybe 7: Found an illegal character, e, with ordinal 101.\nOffset 8 or maybe 8: Found an illegal character, c, with ordinal 99.\nOffset 14 or maybe 14: Found an illegal character, w, with ordinal 119.\nOffset 19 or maybe 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21 or maybe 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
// DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.
@ -7051,8 +7165,8 @@ tstHelper("ZUR");
uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b");
}
shelp("K\\,",
"Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]");
"Offset 1 or maybe 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]");
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
@ -7073,15 +7187,15 @@ tstHelper("ZUR");
shelp("@01A.3 ", "", "[FOLIO_MARKER:{@01A.3}, TIBETAN_PUNCTUATION:{ }]");
shelp("@001 ", "", "[FOLIO_MARKER:{@001}, TIBETAN_PUNCTUATION:{ }]");
shelp("@19-20A",
"Offset 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n",
"[ERROR:{@}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur.
"Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n",
"[ERROR:{Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur.
shelp("@[7B]", "");
shelp("@012A.3KA",
"",
"[FOLIO_MARKER:{@012A.3}, TIBETAN_NON_PUNCTUATION:{KA}]");
shelp("@012A.34",
"Offset 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n",
"[ERROR:{@012A.}, TIBETAN_NON_PUNCTUATION:{34}]");
"Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n",
"[ERROR:{Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.}, TIBETAN_NON_PUNCTUATION:{34}]");
shelp("@[07B]", "");
shelp("@[00007B]", "");
shelp("@7B", "");
@ -7097,8 +7211,8 @@ tstHelper("ZUR");
shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT
shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT
shelp("//NYA\\\\",
"Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]");
"Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5 or maybe 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6 or maybe 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
"[START_SLASH:{/}, ERROR:{Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}]");
}
private static void uhelp(String acip) {
@ -7106,7 +7220,7 @@ tstHelper("ZUR");
}
private static void uhelp(String acip, String expectedUnicode) {
StringBuffer errors = new StringBuffer();
String unicode = ACIPConverter.convertToUnicode(acip, errors);
String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true);
if (null == unicode) {
if (null != expectedUnicode && "none" != expectedUnicode) {
System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
@ -8729,22 +8843,22 @@ tstHelper("shKA");
}
/* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit:
BDA' þþþþ
B+DA þþþ
DBANG þþþ
D+BA þþþ
DGA' þþþþ
D+GA þþþ
DGRA þþþ
D+GRA þþþ
DGYESþþþþþ
D+GYA þþþ
DMAR þþþþ
D+MA þþþ
GDA' þþþþ
G+DA þþþ
GNAD þþþþ
G+NA þþþ
MNA' þþþþ
M+NA þþþ
BDA'
B+DA
DBANG
D+BA
DGA'
D+GA
DGRA
D+GRA
DGYES
D+GYA
DMAR
D+MA
GDA'
G+DA
GNAD
G+NA
MNA'
M+NA
*/

View file

@ -520,7 +520,8 @@ class TPairList {
* corresponds to exactly one Tibetan grapheme cluster (i.e.,
* stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a
* stack all on its own. */
void populateWithTGCPairs(ArrayList pl, ArrayList indexList, int index) {
void populateWithTGCPairs(ArrayList pl,
ArrayList indexList, int index) {
int sz = size();
if (sz == 0) {
return;
@ -540,8 +541,8 @@ class TPairList {
// The last pair:
TPair p = get(i);
ThdlDebug.verify(!"+".equals(p.getRight()));
int where;
boolean add_U0F7F = false;
int where;
if (p.getRight() != null
&& (where = p.getRight().indexOf(':')) >= 0) {
// this ':' guy is his own TGCPair.
@ -579,27 +580,21 @@ class TPairList {
}
TGCPair tp;
indexList.add(new Integer(index));
tp = new TGCPair(lWylie.toString()
+ (hasNonAVowel
? ACIPRules.getWylieForACIPVowel(p.getRight())
: ""),
tp = new TGCPair(lWylie.toString(),
(hasNonAVowel
? ACIPRules.getWylieForACIPVowel(p.getRight())
: ""),
(isNumeric
? TGCPair.OTHER
: (hasNonAVowel
? (isSanskrit
? TGCPair.SANSKRIT_WITH_VOWEL
: (isTibetan
? TGCPair.CONSONANTAL_WITH_VOWEL
: TGCPair.OTHER))
: (isSanskrit
? TGCPair.SANSKRIT_WITHOUT_VOWEL
: (isTibetan
? TGCPair.CONSONANTAL_WITHOUT_VOWEL
: TGCPair.OTHER)))));
? TGCPair.TYPE_OTHER
: (isSanskrit
? TGCPair.TYPE_SANSKRIT
: (isTibetan
? TGCPair.TYPE_TIBETAN
: TGCPair.TYPE_OTHER))));
pl.add(tp);
if (add_U0F7F) {
indexList.add(new Integer(index));
pl.add(new TGCPair("H", TGCPair.OTHER));
pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER));
}
}
}

View file

@ -91,7 +91,7 @@ class TParseTree {
ParseIterator pi = getParseIterator();
while (pi.hasNext()) {
TStackList sl = pi.next();
if (sl.isLegalTshegBar().isLegal) {
if (sl.isLegalTshegBar(false).isLegal) {
sll.add(sl);
}
}
@ -118,12 +118,12 @@ class TParseTree {
* a unique non-illegal parse, you get it. If there's not a
* unique answer, null is returned. */
// {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM!
// DLC by using this we can get rid of single-sanskrit-gc, eh?
public TStackList getBestParse() {
TStackListList up = getUniqueParse();
TStackListList up = getUniqueParse(false);
if (up.size() == 1)
return up.get(0);
up = getNonIllegalParses();
int sz = up.size();
if (sz == 1) {
@ -192,14 +192,17 @@ class TParseTree {
* legal parses if there are two or more equally good parses.  By
* &quot;legal&quot;, we mean a sequence of stacks that is legal
* by the rules of Tibetan tsheg bar syntax (sometimes called
* spelling). */
public TStackListList getUniqueParse() {
* spelling).
* @param noPrefixTests true if you want to pretend that every
* stack can take every prefix, which is not the case in
* reality */
public TStackListList getUniqueParse(boolean noPrefixTests) {
TStackListList allLegalParses = new TStackListList(2); // save memory
TStackListList legalParsesWithVowelOnRoot = new TStackListList(1);
ParseIterator pi = getParseIterator();
while (pi.hasNext()) {
TStackList sl = pi.next();
BoolPair bpa = sl.isLegalTshegBar();
BoolPair bpa = sl.isLegalTshegBar(noPrefixTests);
if (bpa.isLegal) {
if (bpa.isLegalAndHasAVowelOnRoot)
legalParsesWithVowelOnRoot.add(sl);
@ -253,13 +256,23 @@ class TParseTree {
public String getWarning(boolean paranoid,
TPairList pl,
String originalACIP) {
TStackListList up = getUniqueParse();
{
TStackList bestParse = getBestParse();
TStackListList noPrefixTestsUniqueParse = getUniqueParse(true);
if (noPrefixTestsUniqueParse.size() == 1
&& !noPrefixTestsUniqueParse.get(0).equals(bestParse)) {
return "Warning: We're going with " + bestParse + ", but only because our knowledge of prefix rules says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")";
}
}
TStackListList up = getUniqueParse(false);
if (null == up || up.size() != 1) {
boolean isLastStack[] = new boolean[1];
TStackListList nip = getNonIllegalParses();
if (nip.size() != 1) {
if (null == getBestParse()) {
return "There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
return "Warning: There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
} else {
if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) {
if (isLastStack[0]) {
@ -269,7 +282,7 @@ class TParseTree {
}
}
if (paranoid) {
return "Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful.";
return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful.";
}
}
} else {

View file

@ -125,15 +125,17 @@ class TStackList {
* Tibetan syntax (sometimes called rules of spelling). If this
* is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will
* be true if and only if there is an explicit {A} vowel on the
* root stack. */
public BoolPair isLegalTshegBar() {
// DLC handle PADMA and other Tibetanized Sanskrit fellows. Right now we only handle single-stack guys.
* root stack.
* @param noPrefixTests true if you want to pretend that every
* stack can take every prefix, which is not the case in
* reality */
public BoolPair isLegalTshegBar(boolean noPrefixTests) {
// DLC handle PADMA and other Tibetanized Sanskrit fellows consistently. Right now we only treat single-stack Sanskrit guys as legal.
TTGCList tgcList = new TTGCList(this);
StringBuffer warnings = new StringBuffer();
String candidateType
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings);
// System.out.println("DLC: " + toString() + " has candidateType " + candidateType + " and warnings " + warnings);
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests);
// preliminary answer:
boolean isLegal = (candidateType != "invalid");