From d5ad7602304b290d11e98258785f4e8a223cab84 Mon Sep 17 00:00:00 2001 From: dchandler Date: Sat, 23 Aug 2003 22:03:37 +0000 Subject: [PATCH] TMW->Wylie conversion now takes advantage of prefix rules, the rules that say "ya can take a ga prefix" etc. The ACIP->Unicode converter now gives warnings (optionally, and by default, inline). This converter now produces output even when lexical errors occur, but the output has errors and warnings inline. --- source/org/thdl/tib/input/DuffPaneTest.java | 51 ++-- ...O_THDL_WYLIETest1ResultConversion.expected | 2 +- source/org/thdl/tib/text/TGCPair.java | 84 +++++- source/org/thdl/tib/text/TibTextUtils.java | 254 ++++++++++++------ .../org/thdl/tib/text/TibetanMachineWeb.java | 53 +++- source/org/thdl/tib/text/tibwn.ini | 23 +- .../thdl/tib/text/tshegbar/LegalTshegBar.java | 12 +- .../thdl/tib/text/tshegbar/UnicodeUtils.java | 8 + .../org/thdl/tib/text/ttt/ACIPConverter.java | 86 ++++-- .../tib/text/ttt/ACIPTshegBarScanner.java | 85 +++--- source/org/thdl/tib/text/ttt/PackageTest.java | 214 +++++++++++---- source/org/thdl/tib/text/ttt/TPairList.java | 33 +-- source/org/thdl/tib/text/ttt/TParseTree.java | 31 ++- source/org/thdl/tib/text/ttt/TStackList.java | 12 +- 14 files changed, 678 insertions(+), 270 deletions(-) diff --git a/source/org/thdl/tib/input/DuffPaneTest.java b/source/org/thdl/tib/input/DuffPaneTest.java index 102e256..55705c9 100644 --- a/source/org/thdl/tib/input/DuffPaneTest.java +++ b/source/org/thdl/tib/input/DuffPaneTest.java @@ -102,19 +102,23 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("gya"); ensureKeysGiveCorrectWylie("g.ya"); ensureKeysGiveCorrectWylie("bya"); - ensureKeysGiveCorrectWylie("b.ya"); + ensureKeysGiveCorrectWylie("b.ya", "baya"); ensureKeysGiveCorrectWylie("mya"); - ensureKeysGiveCorrectWylie("m.ya"); - ensureKeysGiveCorrectWylie("'ya"); - ensureKeysGiveCorrectWylie("'.ya", "'ya"); - ensureKeysGiveCorrectWylie("dya"); - ensureKeysGiveCorrectWylie("d.ya", "dya"); + ensureKeysGiveCorrectWylie("m.ya", "maya"); + ensureKeysGiveCorrectWylie("'ya", "'aya"); + ensureKeysGiveCorrectWylie("'.ya", "'aya"); + ensureKeysGiveCorrectWylie("dya", + "daya"); + ensureKeysGiveCorrectWylie("d.ya", + "daya"); ensureKeysGiveCorrectWylie("grwa"); - ensureKeysGiveCorrectWylie("g.rwa"); + ensureKeysGiveCorrectWylie("g.rwa", + "garwa"); ensureKeysGiveCorrectWylie("gra"); ensureKeysGiveCorrectWylie("dra"); ensureKeysGiveCorrectWylie("drwa"); - ensureKeysGiveCorrectWylie("d.rwa"); + ensureKeysGiveCorrectWylie("d.rwa", + "darwa"); ensureKeysGiveCorrectWylie("g.r", "gar"); ensureKeysGiveCorrectWylie("d.r", "dar"); ensureKeysGiveCorrectWylie("'.r", "'ar"); @@ -134,7 +138,7 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("t.sa", "tas"); - ensureKeysGiveCorrectWylie("d.za"); + ensureKeysGiveCorrectWylie("d.za", "daza"); ensureKeysGiveCorrectWylie("dza"); ensureKeysGiveCorrectWylie("s.ha", @@ -219,7 +223,7 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("b.lag"); ensureKeysGiveCorrectWylie("blg", - "blga"); + "balga"); ensureKeysGiveCorrectWylie("b.las", "bals"); @@ -244,21 +248,24 @@ public class DuffPaneTest extends TestCase { "bras"); ensureKeysGiveCorrectWylie("bras"); - ensureKeysGiveCorrectWylie("d.wa"); + ensureKeysGiveCorrectWylie("d.wa", + "dawa"); ensureKeysGiveCorrectWylie("dawa", - "d.wa"); + "dawa"); ensureKeysGiveCorrectWylie("dwa"); - ensureKeysGiveCorrectWylie("g.wa"); + ensureKeysGiveCorrectWylie("g.wa", + "gawa"); ensureKeysGiveCorrectWylie("gawa", - "g.wa"); + "gawa"); ensureKeysGiveCorrectWylie("gwa"); ensureKeysGiveCorrectWylie("'.wa", - "'wa"); + "'awa"); ensureKeysGiveCorrectWylie("'awa", - "'wa"); - ensureKeysGiveCorrectWylie("'wa"); + "'awa"); + ensureKeysGiveCorrectWylie("'wa", + "'awa"); ensureKeysGiveCorrectWylie("gyg", "g.yag"); @@ -282,7 +289,8 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("ma.a.asa", "mas"); - ensureKeysGiveCorrectWylie("'ka"); + ensureKeysGiveCorrectWylie("'ka", + "'aka"); ensureKeysGiveCorrectWylie("'gas"); @@ -319,8 +327,9 @@ public class DuffPaneTest extends TestCase { "lamanga"); ensureKeysGiveCorrectWylie("b.m.ng", - "bmang"); - ensureKeysGiveCorrectWylie("bmang"); + "bamanga"); + ensureKeysGiveCorrectWylie("bmang", + "bamanga"); ensureKeysGiveCorrectWylie("gdams"); ensureKeysGiveCorrectWylie("g.d.m.s.", @@ -372,7 +381,7 @@ public class DuffPaneTest extends TestCase { ensureKeysGiveCorrectWylie("fivikikhigingicichijinyitithidinipiphibimitsitshidziwizhizi'iyirilishisihiTiThiDiNiShi"); ensureKeysGiveCorrectWylie("don't touch my coffee/that makes me very angry/supersize my drink", - "dona'ata tocha mya cofafe/thata mkes me veraya angaraya/superasize mya drinaka"); + "dona'ata tocha mya cofafe/thata makesa me veraya angaraya/superasize mya drinaka"); } } diff --git a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected index bdcd796..be3a254 100644 --- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected +++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIETest1ResultConversion.expected @@ -28,7 +28,7 @@ zur mig nyag phran tsam gyis dge ba'i gzugs can 'dus ma byas//\par \par yid 'ong bzhin ras zla gzhon 'khor lo gnyis skyes la//\par 'khrul ba ster yang 'phyang mo sel byed mgo skyes kyi//\par -bai DUr mthing kha'i lan bu rab 'phyang dbyangs can ma//\par +bai DUra mthing kha'i lan bu rab 'phyang dbyangs can ma//\par smra ba'i dbang phyug ngag gi rgyal po nyer grub mdzod//\par \par gangs can lha lam yangs pa'i khyon 'dir rgyal ba'i bstan pa bcu gnyis bdag po'i gur khang mchog/\par diff --git a/source/org/thdl/tib/text/TGCPair.java b/source/org/thdl/tib/text/TGCPair.java index d681cbd..9049b98 100644 --- a/source/org/thdl/tib/text/TGCPair.java +++ b/source/org/thdl/tib/text/TGCPair.java @@ -25,7 +25,7 @@ package org.thdl.tib.text; context-insensitive THDL Extended Wylie representation. NOTE WELL: this is not a real grapheme cluster; I'm misusing the term (FIXME). It's actually whole or part of one. It's part of one - when this is a vowel or U+0F7F alone. + when this is U+0F7F alone. @author David Chandler */ public class TGCPair { @@ -37,14 +37,84 @@ public class TGCPair { public static final int SANSKRIT_WITHOUT_VOWEL = 5; public static final int SANSKRIT_WITH_VOWEL = 6; - public String wylie; - public int classification; - public TGCPair(String wylie, int classification) { - this.wylie = wylie; - this.classification = classification; + public static final int TYPE_OTHER = 31; + public static final int TYPE_SANSKRIT = 32; + public static final int TYPE_TIBETAN = 33; + + // Sanskrit or Tibetan consonant, or number, or oddball: + private String consonantWylie; + private String vowelWylie; + public String getConsonantWylie() { + return consonantWylie; } + public String getVowelWylie() { + return vowelWylie; + } + /** Cludge. */ + public void setWylie(String x) { + consonantWylie = x; + vowelWylie = null; + } + public String getWylie() { + StringBuffer b = new StringBuffer(); + if (consonantWylie != null) { + // we may have {p-y}, but the user wants to see {py}. + for (int i = 0; i < consonantWylie.length(); i++) { + char ch = consonantWylie.charAt(i); + if ('-' != ch) + b.append(ch); + } + } + if (vowelWylie != null) + b.append(vowelWylie); + return b.toString(); + } + public int classification; + /** Constructs a new TGCPair with (Tibetan or Sanskrit) consonant + * consonantWylie and vowel vowelWylie. Use + * classification==TYPE_OTHER for numbers, lone vowels, marks, + * etc. Use classification==TYPE_TIBETAN for Tibetan (not + * Tibetanized Sanskrit) and classification=TYPE_SANSKRIT for + * Tibetanized Sanskrit. */ + public TGCPair(String consonantWylie, String vowelWylie, int classification) { + if ("".equals(vowelWylie)) + vowelWylie = null; + // Technically, we don't need the following check, but it's + // nice for consistency's sake. + if ("".equals(consonantWylie)) + consonantWylie = null; + + // DLC FIXME: for speed, make these assertions: + if (classification != TYPE_OTHER + && classification != TYPE_TIBETAN + && classification != TYPE_SANSKRIT) { + throw new IllegalArgumentException("Bad classification " + classification + "."); + } + int realClassification = -37; + if (vowelWylie == null && classification == TYPE_TIBETAN) + realClassification = CONSONANTAL_WITHOUT_VOWEL; + if (vowelWylie != null && classification == TYPE_TIBETAN) + realClassification = CONSONANTAL_WITH_VOWEL; + if (vowelWylie == null && classification == TYPE_SANSKRIT) + realClassification = SANSKRIT_WITHOUT_VOWEL; + if (vowelWylie != null && classification == TYPE_SANSKRIT) + realClassification = SANSKRIT_WITH_VOWEL; + if (consonantWylie == null) { + if (classification != TYPE_OTHER) + throw new IllegalArgumentException("That's the very definition of a lone vowel."); + realClassification = LONE_VOWEL; + } else { + if (classification == TYPE_OTHER) + realClassification = OTHER; + } + + this.consonantWylie = consonantWylie; + this.vowelWylie = vowelWylie; + this.classification = realClassification; + } + public String toString() { - return ""; } } diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java index f42695a..7b5e418 100644 --- a/source/org/thdl/tib/text/TibTextUtils.java +++ b/source/org/thdl/tib/text/TibTextUtils.java @@ -25,6 +25,9 @@ import javax.swing.text.rtf.RTFEditorKit; import java.io.*; import org.thdl.util.ThdlDebug; +import org.thdl.tib.text.tshegbar.LegalTshegBar; +import org.thdl.tib.text.tshegbar.UnicodeConstants; +import org.thdl.tib.text.tshegbar.UnicodeUtils; /** * Provides methods for converting back and forth between Extended @@ -846,86 +849,64 @@ public class TibTextUtils implements THDLWylieConstants { // sz is an overestimate (speeds us up, wastes some memory). TMWGCList gcs = new TMWGCList(sz); - StringBuffer buildingUpGc = new StringBuffer(); + StringBuffer buildingUpVowel = new StringBuffer(); // for {cui}, we append to this guy twice. + String nonVowelWylie = null; // for the "c" in {cui} + int pairType = TGCPair.TYPE_OTHER; - boolean consonantal_with_vowel = false; - boolean buildingUpSanskrit = false; for (int i = 0; i < sz; i++) { DuffCode dc = (DuffCode)glyphList.get(i); String wylie = TibetanMachineWeb.getWylieForGlyph(dc, noSuchWylie); - boolean containsWylieVowel = false; boolean buildingUpSanskritNext = false; if ((buildingUpSanskritNext = TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) || TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie)) { - if (buildingUpGc.length() > 0) { - gcs.add(new TGCPair(buildingUpGc.toString(), - consonantal_with_vowel - ? (buildingUpSanskrit - ? TGCPair.SANSKRIT_WITH_VOWEL - : TGCPair.CONSONANTAL_WITH_VOWEL) - : (buildingUpSanskrit - ? TGCPair.SANSKRIT_WITHOUT_VOWEL - : TGCPair.CONSONANTAL_WITHOUT_VOWEL))); - buildingUpGc.delete(0, buildingUpGc.length()); + if (buildingUpVowel.length() > 0 || null != nonVowelWylie) { + gcs.add(new TGCPair(nonVowelWylie, + buildingUpVowel.toString(), + pairType)); + buildingUpVowel.delete(0, buildingUpVowel.length()); } - buildingUpGc.append(wylie); - consonantal_with_vowel = false; - buildingUpSanskrit = buildingUpSanskritNext; - } else if ((containsWylieVowel - = TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie)) + // We want {p-y}, not {py}. + nonVowelWylie + = TibetanMachineWeb.getHashKeyForGlyph(dc.getFontNum(), dc.getCharNum()); + pairType = (buildingUpSanskritNext + ? TGCPair.TYPE_SANSKRIT + : TGCPair.TYPE_TIBETAN); + } else if (TibetanMachineWeb.isWylieAdornmentAndContainsVowel(wylie) || TibetanMachineWeb.isWylieAdornment(wylie)) { - - if (buildingUpGc.length() > 0) { - buildingUpGc.append(wylie); - if (containsWylieVowel) { - if (debug) - System.out.println("DEBUG: with_vowel is true thanks to " + wylie); - consonantal_with_vowel = true; - } - // do not clear; we might have {cui} or {hUM}, e.g. - } else { - gcs.add(new TGCPair(wylie, - TGCPair.LONE_VOWEL)); - consonantal_with_vowel = false; - } + buildingUpVowel.append(wylie); } else { // number or weird thing: - if (buildingUpGc.length() > 0) { - gcs.add(new TGCPair(buildingUpGc.toString(), - consonantal_with_vowel - ? (buildingUpSanskrit - ? TGCPair.SANSKRIT_WITH_VOWEL - : TGCPair.CONSONANTAL_WITH_VOWEL) - : (buildingUpSanskrit - ? TGCPair.SANSKRIT_WITHOUT_VOWEL - : TGCPair.CONSONANTAL_WITHOUT_VOWEL))); - buildingUpGc.delete(0, buildingUpGc.length()); + if (buildingUpVowel.length() > 0 || null != nonVowelWylie) { + gcs.add(new TGCPair(nonVowelWylie, + buildingUpVowel.toString(), + pairType)); + buildingUpVowel.delete(0, buildingUpVowel.length()); + nonVowelWylie = null; } - gcs.add(new TGCPair(wylie, TGCPair.OTHER)); - consonantal_with_vowel = false; - buildingUpSanskrit = false; + gcs.add(new TGCPair(wylie, null, TGCPair.TYPE_OTHER)); + pairType = TGCPair.TYPE_OTHER; } } - if (buildingUpGc.length() > 0) { - gcs.add(new TGCPair(buildingUpGc.toString(), - consonantal_with_vowel - ? (buildingUpSanskrit - ? TGCPair.SANSKRIT_WITH_VOWEL - : TGCPair.CONSONANTAL_WITH_VOWEL) - : (buildingUpSanskrit - ? TGCPair.SANSKRIT_WITHOUT_VOWEL - : TGCPair.CONSONANTAL_WITHOUT_VOWEL))); + if (buildingUpVowel.length() > 0 || null != nonVowelWylie) { + gcs.add(new TGCPair(nonVowelWylie, + buildingUpVowel.toString(), + pairType)); } - buildingUpGc = null; return gcs; } + /** Returns a string that classifies gcs as a legal Tibetan tsheg + * bar, a single Sanskrit grapheme cluster + * ("single-sanskrit-gc"), or invalid ("invalid"). If + * noPrefixTests is true, then ggyi will be seen as a + * "prefix-root", even though gya doesn't take a ga prefix. */ public static String getClassificationOfTshegBar(TGCList gcs, // DLC the warnings are Wylie-specific - StringBuffer warnings) { + StringBuffer warnings, + boolean noPrefixTests) { String candidateType = null; // Now that we have grapheme clusters, see if they match any // of the "legal tsheg bars": @@ -937,10 +918,11 @@ public class TibTextUtils implements THDLWylieConstants { || TGCPair.SANSKRIT_WITH_VOWEL == cls) return "single-sanskrit-gc"; } + TGCPair lastPair = null; for (int i = 0; i < sz; i++) { TGCPair tp = gcs.get(i); int cls = tp.classification; - String wylie = tp.wylie; + String wylie = tp.getWylie(); if (TGCPair.OTHER == cls) { if (TibetanMachineWeb.isWylieNumber(wylie)) { if (null == candidateType) { @@ -977,25 +959,44 @@ public class TibTextUtils implements THDLWylieConstants { // peek ahead to distinguish between ba's, // ba'ala and ba'am: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; - String nextwylie = (nexttp == null) ? "" : nexttp.wylie; + String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix/root"; } else { - candidateType = "prefix/root-root/suffix"; + if (noPrefixTests + || isLegalPrefixRootCombo(lastPair.getConsonantWylie(), + tp.getConsonantWylie())) + candidateType = "prefix/root-root/suffix"; + else + candidateType = "root-suffix"; } } else if (TibetanMachineWeb.isWylieRight(wylie)) { - candidateType = "prefix/root-root/suffix"; + if (noPrefixTests + || isLegalPrefixRootCombo(lastPair.getConsonantWylie(), + tp.getConsonantWylie())) + candidateType = "prefix/root-root/suffix"; + else + candidateType = "root-suffix"; } else if (TibetanMachineWeb.isWylieAchungAppendage(wylie)) { candidateType = "appendaged-prefix/root"; } else { - candidateType = "prefix-root"; + if (noPrefixTests + || isLegalPrefixRootCombo(lastPair.getConsonantWylie(), + tp.getConsonantWylie())) + candidateType = "prefix-root"; + else { + if (null != warnings) + warnings.append("Found what would be a prefix-root combo, but the root stack with wylie " + wylie + " does not take the prefix with wylie " + lastPair.getConsonantWylie()); + candidateType = "invalid"; + break; + } } } else if ("root" == candidateType) { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between pa's, // pa'ala and pa'am: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; - String nextwylie = (nexttp == null) ? "" : nexttp.wylie; + String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-root"; } else { @@ -1016,7 +1017,7 @@ public class TibTextUtils implements THDLWylieConstants { // peek ahead to distinguish between bpa's, // bpa'ala and bpa'am: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; - String nextwylie = (nexttp == null) ? "" : nexttp.wylie; + String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix-root"; } else { @@ -1038,7 +1039,7 @@ public class TibTextUtils implements THDLWylieConstants { // peek ahead to distinguish between // gga'am and gaga'ala: TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; - String nextwylie = (nexttp == null) ? "" : nexttp.wylie; + String nextwylie = (nexttp == null) ? "" : nexttp.getWylie(); if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix/root-root/suffix"; } else { @@ -1120,7 +1121,11 @@ public class TibTextUtils implements THDLWylieConstants { candidateType = candidateType.substring("maybe-".length()).intern(); // So that we get 'am, not 'm; 'ang, not 'ng: - tp.wylie = WYLIE_aVOWEL + tp.wylie; + + // FIXME: cludge: weird place to do this. + // pa'am, not pa'm is what we want, sure, + // but doing this here is ugly. + tp.setWylie(WYLIE_aVOWEL + tp.getWylie()); } else { if (null != warnings) warnings.append("Found a tsheg bar that has an achung (" + ACHUNG + ") tacked on, followed by some other thing whose wylie is " + wylie + "\n"); @@ -1157,6 +1162,7 @@ public class TibTextUtils implements THDLWylieConstants { } else { throw new Error("bad cls"); } + lastPair = tp; } if (candidateType.startsWith("maybe-appendaged-")) { if (null != warnings) @@ -1221,7 +1227,7 @@ public class TibTextUtils implements THDLWylieConstants { StringBuffer wylieBuffer) { TGCList gcs = breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie); - String candidateType = getClassificationOfTshegBar(gcs, warnings); + String candidateType = getClassificationOfTshegBar(gcs, warnings, false); int sz = gcs.size(); if (candidateType == "invalid" || candidateType == "single-sanskrit-gc") { @@ -1237,7 +1243,7 @@ public class TibTextUtils implements THDLWylieConstants { for (int i = 0; i < sz; i++) { TGCPair tp = (TGCPair)gcs.get(i); int cls = tp.classification; - String wylie = tp.wylie; + String wylie = tp.getWylie(); wylieBuffer.append(wylie); if (TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(wylie) || TibetanMachineWeb.isWylieSanskritConsonantStack(wylie)) { @@ -1290,9 +1296,9 @@ public class TibTextUtils implements THDLWylieConstants { leftover = 3; /* FIXME: these constants are hard-wired here, rather * than in TibetanMachineWeb, because I'm lazy. */ - String wylie1 = ((TGCPair)gcs.get(0)).wylie; - String wylie2 = ((TGCPair)gcs.get(1)).wylie; - String wylie3 = ((TGCPair)gcs.get(2)).wylie; + String wylie1 = ((TGCPair)gcs.get(0)).getWylie(); + String wylie2 = ((TGCPair)gcs.get(1)).getWylie(); + String wylie3 = ((TGCPair)gcs.get(2)).getWylie(); if ((wylie1.equals("g") && (wylie2.equals("d") || wylie2.equals("n") || wylie2.equals("s"))) || (wylie1.equals("d") && (wylie2.equals("g") || wylie2.equals("m"))) || (wylie1.equals("b") && wylie2.equals("d")) @@ -1316,7 +1322,7 @@ public class TibTextUtils implements THDLWylieConstants { || "prefix/root" == candidateType || "root-suffix-postsuffix" == candidateType || "root-suffix" == candidateType) { - String wylie1 = ((TGCPair)gcs.get(0)).wylie; + String wylie1 = ((TGCPair)gcs.get(0)).getWylie(); leftover = 1; wylieBuffer.append(wylie1); if (((TGCPair)gcs.get(0)).classification @@ -1330,16 +1336,16 @@ public class TibTextUtils implements THDLWylieConstants { } if ("root-suffix-postsuffix" == candidateType) { leftover = 3; - String wylie2 = ((TGCPair)gcs.get(1)).wylie; - String wylie3 = ((TGCPair)gcs.get(2)).wylie; + String wylie2 = ((TGCPair)gcs.get(1)).getWylie(); + String wylie3 = ((TGCPair)gcs.get(2)).getWylie(); wylieBuffer.append(unambiguousPostAVowelWylie(wylie2, wylie3)); } } else if ("prefix-root-suffix" == candidateType || "prefix-root" == candidateType || "prefix-root-suffix-postsuffix" == candidateType) { - String wylie1 = ((TGCPair)gcs.get(0)).wylie; - String wylie2 = ((TGCPair)gcs.get(1)).wylie; + String wylie1 = ((TGCPair)gcs.get(0)).getWylie(); + String wylie2 = ((TGCPair)gcs.get(1)).getWylie(); leftover = 2; if (TibetanMachineWeb.isAmbiguousWylie(wylie1, wylie2)) wylieBuffer.append(wylie1 + WYLIE_DISAMBIGUATING_KEY + wylie2); @@ -1357,8 +1363,8 @@ public class TibTextUtils implements THDLWylieConstants { } if ("prefix-root-suffix-postsuffix" == candidateType) { leftover = 4; - String wylie3 = ((TGCPair)gcs.get(2)).wylie; - String wylie4 = ((TGCPair)gcs.get(3)).wylie; + String wylie3 = ((TGCPair)gcs.get(2)).getWylie(); + String wylie4 = ((TGCPair)gcs.get(3)).getWylie(); wylieBuffer.append(unambiguousPostAVowelWylie(wylie3, wylie4)); } @@ -1371,15 +1377,15 @@ public class TibTextUtils implements THDLWylieConstants { // append the wylie left over: for (int i = leftover; i < sz; i++) { TGCPair tp = (TGCPair)gcs.get(i); - String wylie = tp.wylie; + String wylie = tp.getWylie(); wylieBuffer.append(wylie); } } } /** -* Gets the Extended Wylie for a sequence of glyphs using Chandler's -* experimental method. This works as follows: +* Gets the Extended Wylie for a sequence of glyphs. This works as +* follows: * *

We run along until we hit whitespace or punctuation. We take * everything before that and we see if it's a legal Tibetan tsheg bar, @@ -1480,4 +1486,90 @@ public class TibTextUtils implements THDLWylieConstants { } return rv; } + + /** Returns true if and only if the stack with Wylie root + * can take the prefix prefix. */ + private static boolean isLegalPrefixRootCombo(String prefix, String root) { + // This will be decomposed enough. If you can decompose it, + // then it doesn't take a prefix! + if (!TibetanMachineWeb.isKnownHashKey(root)) { + root = root.replace('+', '-'); + if (!TibetanMachineWeb.isKnownHashKey(root)) { + throw new Error("root is, now, " + root); // FIXME: make this an assertion + } + } + String ru = TibetanMachineWeb.getUnicodeForWylieForGlyph(root); + + // ru may be for (head, root, sub), (head, root), (root), or + // (root, sub). Try all possibilities that are possible with + // a String of length ru. If there's a wa-zur, then we say + // (FIXME: do we say correctly?) that a stack with wa-zur can + // take a prefix if and only if the stack without can take a + // prefix. + + if (ru == null) throw new Error("how? root is " + root); // FIXME: make this an assertion + int rl = ru.length(); + if (ru.charAt(rl - 1) == UnicodeConstants.EWSUB_wa_zur) + --rl; // forget about wa-zur: see above. + if (rl == 2) { + char ch0 = ru.charAt(0); + char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1)); + + // (head, root) and (root, sub) are possibilities. + if (ACHUNG.equals(prefix)) { + return LegalTshegBar.takesAchungPrefix(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("b".equals(prefix)) { + return LegalTshegBar.takesBao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("m".equals(prefix)) { + return LegalTshegBar.takesMao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("g".equals(prefix)) { + return LegalTshegBar.takesGao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else if ("d".equals(prefix)) { + return LegalTshegBar.takesDao(ch0, ch1, UnicodeConstants.EW_ABSENT) + || LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, ch1); + } else { + throw new IllegalArgumentException("prefix is " + prefix); + } + } else if (rl == 1) { + char ch0 = ru.charAt(0); + // (root) is the only choice. + if (ACHUNG.equals(prefix)) { + return LegalTshegBar.takesAchungPrefix(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("b".equals(prefix)) { + return LegalTshegBar.takesBao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("m".equals(prefix)) { + return LegalTshegBar.takesMao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("g".equals(prefix)) { + return LegalTshegBar.takesGao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else if ("d".equals(prefix)) { + return LegalTshegBar.takesDao(UnicodeConstants.EW_ABSENT, ch0, UnicodeConstants.EW_ABSENT); + } else { + throw new IllegalArgumentException("prefix is " + prefix); + } + } else if (rl == 3) { + char ch0 = ru.charAt(0); + char ch1 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(1)); + char ch2 = UnicodeUtils.getNominalRepresentationOfSubscribedConsonant(ru.charAt(2)); + // (head, root, sub) is the only choice. + if (ACHUNG.equals(prefix)) { + return LegalTshegBar.takesAchungPrefix(ch0, ch1, ch2); + } else if ("b".equals(prefix)) { + return LegalTshegBar.takesBao(ch0, ch1, ch2); + } else if ("m".equals(prefix)) { + return LegalTshegBar.takesMao(ch0, ch1, ch2); + } else if ("g".equals(prefix)) { + return LegalTshegBar.takesGao(ch0, ch1, ch2); + } else if ("d".equals(prefix)) { + return LegalTshegBar.takesDao(ch0, ch1, ch2); + } else { + throw new IllegalArgumentException("prefix is " + prefix); + } + } else { + return false; + } + } } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index 8ee9fb2..6200473 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -178,14 +178,19 @@ public class TibetanMachineWeb implements THDLWylieConstants { - // NOTE WELL: if you delete from consonants, numbers, vowels, or - // others, you'll change the way Jskad's Extended Wylie keyboard - // works, yes, but you'll also change TMW->Wylie. + // NOTE WELL: if you delete from tibetanConsonants, + // otherConsonants, numbers, vowels, or others, you'll change the + // way Jskad's Extended Wylie keyboard works, yes, but you'll also + // change TMW->Wylie. - /** comma-delimited list of supported consonants (Tibetan and - Tibetanized Sanskrit): */ - private static final String consonants - = "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a,T,Th,D,N,Sh,v,f,Dz"; + /** comma-delimited list of supported Tibetan consonants: */ + private static final String tibetanConsonants + = "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a"; + + /** comma-delimited list of supported non-Tibetan consonants, such + * as Sanskrit consonants: */ + private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit. + = "T,Th,D,N,Sh,v,f,Dz"; /** comma-delimited list of supported numbers (superscribed, subscribed, normal, half-numerals): */ @@ -371,7 +376,7 @@ public class TibetanMachineWeb implements THDLWylieConstants { charSet = new HashSet(); tibSet = new HashSet(); - sTok = new StringTokenizer(consonants, ","); + sTok = new StringTokenizer(tibetanConsonants, ","); while (sTok.hasMoreTokens()) { String ntk; charSet.add(ntk = sTok.nextToken()); @@ -379,6 +384,15 @@ public class TibetanMachineWeb implements THDLWylieConstants { validInputSequences.put(ntk, anyOldObjectWillDo); } + sanskritStackSet = new HashSet(); + sTok = new StringTokenizer(otherConsonants, ","); + while (sTok.hasMoreTokens()) { + String ntk; + charSet.add(ntk = sTok.nextToken()); + sanskritStackSet.add(ntk); + validInputSequences.put(ntk, anyOldObjectWillDo); + } + numberSet = new HashSet(); sTok = new StringTokenizer(numbers, ","); while (sTok.hasMoreTokens()) { @@ -386,7 +400,7 @@ public class TibetanMachineWeb implements THDLWylieConstants { // do it in so that Jskad has the same // TMW->Wylie conversion regardless of whether or not it // chooses to support inputting numbers. Likewise for - // consonants, others, and vowels. + // tibetanConsonants, otherConsonants, others, and vowels. String ntk; charSet.add(ntk = sTok.nextToken()); numberSet.add(ntk); @@ -427,8 +441,6 @@ public class TibetanMachineWeb implements THDLWylieConstants { boolean ignore = false; - sanskritStackSet = new HashSet(); - while ((line = in.readLine()) != null) { if (line.startsWith("")) { @@ -1182,6 +1194,23 @@ public static boolean hasGlyph(String hashKey) { return true; } +/** Returns the Unicode correspondence for the Wylie wylie, which must + * be Wylie returned by getWylieForGlyph(int, int, boolean[]). + * Returns null if the Unicode correspondence is nonexistent or + * unknown. */ +public static String getUnicodeForWylieForGlyph(String wylie) { + DuffCode dc = getGlyph(wylie); + return mapTMWtoUnicode(dc.getFontNum() - 1, dc.getCharNum()); +} + +/** +* Returns true if and only if hashKey is a known hash key from tibwn.ini. +*/ +public static boolean isKnownHashKey(String hashKey) { + DuffCode[] dc = (DuffCode[])tibHash.get(hashKey); + return (null != dc); +} + /** * Gets a glyph for this hash key. Hash keys are not identical to Extended * Wylie. The hash key for a Tibetan stack separates the members of the stack @@ -1193,7 +1222,7 @@ public static boolean hasGlyph(String hashKey) { public static DuffCode getGlyph(String hashKey) { DuffCode[] dc = (DuffCode[])tibHash.get(hashKey); if (null == dc) - throw new Error("It is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears."); + throw new Error("Hash key " + hashKey + " not found; it is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears."); return dc[TMW]; } diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index 175d57c..160e3b9 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -98,13 +98,6 @@ __TILDE__~93,5~~9,91~~~~~~~none - -// 0F5F,0F39 might work, but the OpenType font's author must've had -// Dza in mind if it does. Note that the bottommost horizontal stroke -// goes upward on U+0F5F and downward on U+0F5B. -Dz~146,5~~10,42~~~~~~~none -f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39 -v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39 k~33,1~1,92~1,33~1,109~1,111~1,123~1,125~10,118~10,120~0F40 kh~34,1~~1,34~1,109~1,118~1,123~1,125~10,114~10,123~0F41 g~35,1~1,93~1,35~1,109~1,111~1,123~1,125~10,118~10,120~0F42 @@ -135,11 +128,6 @@ sh~59,1~1,99~1,60~1,109~1,111~1,123~1,125~10,118~10,120~0F64 s~60,1~~1,61~1,109~1,118~1,123~1,125~10,114~10,123~0F66 h~61,1~1,100~1,62~1,109~1,112~1,123~1,125~10,115~10,122~0F67~1,102 a~62,1~~1,63~1,109~1,118~1,123~1,125~10,114~10,123~0F68 -T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A -Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B -D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C -N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E -Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65 r-k~63,1~~1,70~1,109~1,121~1,123~1,125~10,115~10,124~f62,f90 r-g~64,1~~1,71~1,109~1,121~1,123~1,125~10,115~10,124~f62,f92 r-ng~65,1~~1,72~1,109~1,119~1,123~1,125~10,115~10,124~f62,f94 @@ -241,6 +229,17 @@ au~237,1~~8,89~~~~~~~0F7D~~8,104 // DLC FIXME: need -I as well +// 0F5F,0F39 might work, but the OpenType font's author must've had +// Dza in mind if it does. Note that the bottommost horizontal stroke +// goes upward on U+0F5F and downward on U+0F5B. +Dz~146,5~~10,42~~~~~~~none +f~153,5~~10,58~1,110~1,118~1,124~1,126~10,114~10,123~0F55,0F39 +v~154,5~~10,59~1,110~1,118~1,124~1,126~10,114~10,123~0F56,f39 +T~170,1~~1,64~1,109~1,120~1,123~1,125~10,115~10,124~0F4A +Th~171,1~~1,65~1,109~1,118~1,123~1,125~10,114~10,123~0F4B +D~172,1~~1,66~1,109~1,120~1,123~1,125~10,115~10,124~0F4C +N~173,1~~1,67~1,109~1,118~1,123~1,125~10,115~10,124~0F4E +Sh~174,1~~1,68~1,109~1,118~1,123~1,125~10,115~10,124~0F65 k+Sh~175,1~~1,69~1,109~1,122~1,123~1,125~10,116~10,125~0F69 k+k~33,2~~3,33~1,109~4,120~1,123~1,125~4,106~4,113~f40,f90 k+kh~34,2~~3,34~1,109~4,120~1,123~1,125~4,106~4,113~f40,f91 diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index 5dcb0fc..1e53ad2 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -1266,7 +1266,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesGao(char head, char root, char sub) { + public static boolean takesGao(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ca == root @@ -1298,7 +1298,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesDao(char head, char root, char sub) { + public static boolean takesDao(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ka == root @@ -1312,6 +1312,7 @@ public final class LegalTshegBar || (EWC_pa == root && EWC_ya == sub) || (EWC_ba == root && EWC_ya == sub) || (EWC_ma == root && EWC_ya == sub) + || (EWC_ka == root && EWC_ya == sub) // dkyil, for example || (EWC_ka == root && EWC_ra == sub) || (EWC_ga == root && EWC_ra == sub) @@ -1336,7 +1337,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesAchungPrefix(char head, char root, char sub) { + public static boolean takesAchungPrefix(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ga == root @@ -1379,7 +1380,7 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesMao(char head, char root, char sub) { + public static boolean takesMao(char head, char root, char sub) { if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_kha == root @@ -1418,11 +1419,12 @@ public final class LegalTshegBar * @param sub the {@link #isNominalRepresentationOfConsonant(char) * nominal representation} of the subjoined letter, or EW_ABSENT * if not present */ - static boolean takesBao(char head, char root, char sub) { + public static boolean takesBao(char head, char root, char sub) { // DLC ask Ten-lo la about Wazur. if (EW_ABSENT == head) { if (EW_ABSENT == sub) { return (EWC_ka == root + || EWC_sa == root // bsams, for example || EWC_ca == root || EWC_ta == root || EWC_tsa == root diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index 544df36..5f18e32 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -232,6 +232,14 @@ public class UnicodeUtils implements UnicodeConstants { /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */ } + /** If ch is in one of the ranges U+0F90-U+0F97, U+0F99-U+0FB9, + * then this returns the same consonant in the range + * U+0F40-U+0F69. If ch is not in that range, this returns + * garbage. */ + public static char getNominalRepresentationOfSubscribedConsonant(char ch) { + return (char)((int)ch-(((int)'\u0F90') - ((int)'\u0F40'))); + } + /** Returns true iff ch corresponds to the Tibetan letter ra. Several Unicode codepoints correspond to the Tibetan letter ra (in its subscribed form or otherwise). Oftentimes, diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java index 03c40fa..d8166ba 100644 --- a/source/org/thdl/tib/text/ttt/ACIPConverter.java +++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java @@ -58,28 +58,46 @@ public class ACIPConverter { ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1); if (null == al) { - System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this"); + System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this"); System.err.println("Tibetan or English input?"); System.err.println(""); - System.err.println("First " + maxErrors + " errors scanning ACIP input file: "); - System.err.println(errors); - System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again."); + if (false) { + // Nobody wants to see this. FIXME: maybe somebody; have an option. + System.err.println("First " + maxErrors + " lexical errors scanning ACIP input file: "); + System.err.println(errors); + } + System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again."); System.exit(1); } + final boolean abortUponScanningError = false; // DLC MAKE ME CONFIGURABLE + // DLC NOW: BAo isn't converting. if (errors.length() > 0) { System.err.println("Errors scanning ACIP input file: "); System.err.println(errors); - System.err.println("Exiting; please fix input file and try again."); - System.exit(1); + if (abortUponScanningError) { + System.err.println("Exiting; please fix input file and try again."); + System.exit(1); + } } - convertToUnicode(al, System.out, errors); + StringBuffer warnings = new StringBuffer(); + boolean putWarningsInOutput = true; // DLC make me configurable. + convertToUnicode(al, System.out, errors, warnings, + putWarningsInOutput); if (errors.length() > 0) { System.err.println("Errors converting ACIP input file: "); System.err.println(errors); + System.err.println("The output contains these errors."); System.err.println("Exiting; please fix input file and try again."); System.exit(2); } + if (warnings.length() > 0) { + System.err.println("Warnings converting ACIP input file: "); + System.err.println(warnings); + if (putWarningsInOutput) + System.err.println("The output contains these warnings."); + System.exit(2); + } if (verbose) System.err.println("Converted " + args[1] + " perfectly."); System.exit(0); } @@ -96,19 +114,30 @@ public class ACIPConverter { { throw new Error("DLC UNIMPLEMENTED"); } + // DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a + // space. Treat it as a tsheg only when it appears after a + // syllable or another tsheg. /** Returns UTF-8 encoded Unicode. A bit indirect, so use this * for testing only if performance is a concern. If errors occur * in scanning the ACIP or in converting a tsheg bar, then they - * are appended to errors if errors is non-null. Returns the + * are appended to errors if errors is non-null, as well as + * written to the result. If warnings occur in scanning the ACIP + * or in converting a tsheg bar, then they are appended to + * warnings if warnings is non-null, and they are written to the + * result if writeWarningsToResult is true. Returns the * conversion upon perfect success, null if errors occurred. */ public static String convertToUnicode(String acip, - StringBuffer errors) { + StringBuffer errors, + StringBuffer warnings, + boolean writeWarningsToResult) { ByteArrayOutputStream sw = new ByteArrayOutputStream(); ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1); try { - if (null != al && convertToUnicode(al, sw, errors)) { + if (null != al + && convertToUnicode(al, sw, errors, + warnings, writeWarningsToResult)) { return sw.toString("UTF-8"); } else { System.out.println("DLC al is " + al + " and convertToUnicode returned null."); @@ -119,15 +148,25 @@ public class ACIPConverter { } } - /** Writes Unicode to out. If errors occur in converting a - * tsheg bar, then they are appended to errors if errors is - * non-null. Returns true upon perfect success, false if errors - * occurred. + /** Writes Unicode to out. If errors occur in converting a tsheg + * bar, then they are appended to errors if errors is non-null. + * Furthermore, errors are written to out. If writeWarningsToOut + * is true, then warnings also will be written to out. Returns + * true upon perfect success, false if errors occurred. + * @param scan result of ACIPTshegBarScanner.scan(..) + * @param out stream to which to write converted text + * @param errors if non-null, all error messages are appended + * @param warnings if non-null, all warning messages are appended + * to this + * @param writeWarningsToOut if true, then all warning messages + * are written to out in the appropriate places * @throws IOException if we cannot write to out */ public static boolean convertToUnicode(ArrayList scan, OutputStream out, - StringBuffer errors) + StringBuffer errors, + StringBuffer warnings, + boolean writeWarningsToOut) throws IOException { int sz = scan.size(); @@ -139,7 +178,7 @@ public class ACIPConverter { int stype = s.getType(); if (stype == ACIPString.ERROR) { hasErrors = true; - writer.write("[#ERROR CONVERTING ACIP DOCUMENT: "); + writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: "); writer.write(s.getText()); writer.write("]"); } else { @@ -179,6 +218,21 @@ public class ACIPConverter { if (null != errors) errors.append(errorMessage + "\n"); } else { + String warning + = pt.getWarning(false, // DLC: make me configurable + pl, + s.getText()); + if (null != warning) { + if (writeWarningsToOut) { + writer.write("[#WARNING CONVERTING ACIP DOCUMENT: "); + writer.write(warning); + writer.write("]"); + } + if (null != warnings) { + warnings.append(warning); + warnings.append('\n'); + } + } unicode = sl.getUnicode(); if (null == unicode) throw new Error("DLC: HOW?"); } diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java index 2879683..bea56a6 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java @@ -133,16 +133,18 @@ public class ACIPTshegBarScanner { Stack bracketTypeStack = new Stack(); int startSlashIndex = -1; int startParenIndex = -1; + int numNewlines = 0; for (int i = 0; i < sl; i++) { if (i < startOfString) throw new Error("bad reset"); char ch; ch = s.charAt(i); + if (ch == '\n') ++numNewlines; if (ACIPString.COMMENT == currentType && ch != ']') { if ('[' == ch) { al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n", ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } @@ -157,17 +159,18 @@ public class ACIPTshegBarScanner { al.add(new ACIPString(s.substring(startOfString, i), currentType)); } - al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); + al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1), + ACIPString.ERROR)); if (!waitingForMatchingIllegalClose) { if (null != errors) { - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a truly unmatched close bracket, ] or }.\n"); } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } waitingForMatchingIllegalClose = false; if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; @@ -249,6 +252,11 @@ public class ACIPTshegBarScanner { || s.substring(i, i + "[BP]".length()).equals("{BP}"))) { thingy = "[BP]"; currentType = ACIPString.BP; + } else if (i + "[BLANK PAGE]".length() <= sl + && (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]") + || s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) { + thingy = "[BLANK PAGE]"; + currentType = ACIPString.BP; } else if (i + "[ BP ]".length() <= sl && (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]") || s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) { @@ -414,11 +422,11 @@ public class ACIPTshegBarScanner { // This is an error. Sometimes [COMMENTS APPEAR // WITHOUT # MARKS]. Though "... [" could cause // this too. - al.add(new ACIPString(s.substring(i, i+1), + al.add(new ACIPString("Found an illegal open bracket: " + s.substring(i, i+1), ACIPString.ERROR)); if (waitingForMatchingIllegalClose) { if (null != errors) { - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n"); } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; @@ -435,7 +443,7 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } @@ -477,7 +485,6 @@ public class ACIPTshegBarScanner { if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') { if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3)) && !isNumeric(s.charAt(i+numdigits+4)))) { - al.add(new ACIPString(s.substring(i, i+numdigits+3), ACIPString.ERROR)); String inContext = s.substring(i, i+Math.min(sl-i, 10)); if (inContext.indexOf("\r") >= 0) { inContext = inContext.substring(0, inContext.indexOf("\r")); @@ -488,8 +495,10 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } + al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+numdigits+3; @@ -498,7 +507,6 @@ public class ACIPTshegBarScanner { break; } if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) { - al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); String inContext = s.substring(i, i+Math.min(sl-i, 10)); if (inContext.indexOf("\r") >= 0) { inContext = inContext.substring(0, inContext.indexOf("\r")); @@ -509,8 +517,10 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } + al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; // DLC FIXME: skip over more? @@ -572,7 +582,9 @@ public class ACIPTshegBarScanner { } // This case, @NNN, must come after the @NNN{AB} case. - if (i+numdigits+1 < sl && s.charAt(i+numdigits+1) == ' ') { + if (i+numdigits+1 < sl && (s.charAt(i+numdigits+1) == ' ' + || s.charAt(i+numdigits+1) == '\n' + || s.charAt(i+numdigits+1) == '\r')) { boolean allAreNumeric = true; for (int k = 1; k <= numdigits; k++) { if (!isNumeric(s.charAt(i+k))) { @@ -591,7 +603,6 @@ public class ACIPTshegBarScanner { } } if (startOfString == i) { - al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); String inContext = s.substring(i, i+Math.min(sl-i, 10)); if (inContext.indexOf("\r") >= 0) { inContext = inContext.substring(0, inContext.indexOf("\r")); @@ -602,8 +613,10 @@ public class ACIPTshegBarScanner { inContext = inContext + "..."; } } + al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; @@ -626,9 +639,10 @@ public class ACIPTshegBarScanner { * it means /NYA/. We warn about // for this * reason. \\ causes a tsheg-bar error (DLC * FIXME: verify this is so). */ - al.add(new ACIPString("//", ACIPString.ERROR)); + al.add(new ACIPString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.", + ACIPString.ERROR)); if (errors != null) { - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n"); } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; @@ -661,9 +675,10 @@ public class ACIPTshegBarScanner { if (startParenIndex >= 0) { if (ch == '(') { - al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR)); + al.add(new ACIPString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } else { @@ -674,9 +689,10 @@ public class ACIPTshegBarScanner { currentType = ACIPString.ERROR; } else { if (ch == ')') { - al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR)); + al.add(new ACIPString("Unexpected closing parenthesis, ), found.", + ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Unexpected closing parenthesis, ), found.\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } else { @@ -724,10 +740,10 @@ public class ACIPTshegBarScanner { al.add(new ACIPString(s.substring(i, i+1), ACIPString.TIBETAN_PUNCTUATION)); } else { - al.add(new ACIPString(s.substring(i, i+1), + al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".", ACIPString.ERROR)); if (null != errors) - errors.append("Offset " + i + ": " + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n"); if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; } @@ -772,19 +788,24 @@ public class ACIPTshegBarScanner { al.add(new ACIPString(s.substring(startOfString, i), currentType)); } - al.add(new ACIPString(s.substring(i, i+1), - ACIPString.ERROR)); - if (null != errors) { - if ((int)ch == 65533) { - errors.append("Offset " + i + ": " + if ((int)ch == 65533) { + al.add(new ACIPString("Found an illegal, unprintable character.", + ACIPString.ERROR)); + if (null != errors) + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal, unprintable character.\n"); - } else if ('\\' == ch) { - errors.append("Offset " + i + ": " + } else if ('\\' == ch) { + al.add(new ACIPString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.", + ACIPString.ERROR)); + if (null != errors) + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n"); - } else { - errors.append("Offset " + i + ": " + } else { + al.add(new ACIPString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".", + ACIPString.ERROR)); + if (null != errors) + errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " + "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n"); - } } if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; startOfString = i+1; diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index b447da1..b3c8c11 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -128,7 +128,7 @@ public class PackageTest extends TestCase { } { - TStackListList legalParses = pt.getUniqueParse(); + TStackListList legalParses = pt.getUniqueParse(false); boolean goodness2 = (expectedLegalParses == null || expectedLegalParses.length == legalParses.size()); for (int i = 0 ; i < legalParses.size(); i++) { @@ -139,18 +139,21 @@ public class PackageTest extends TestCase { || expectedLegalParses.length < i+1 || n.equals(expectedLegalParses[i])); if (!okay || !goodness2) - System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is " + expectedLegalParses[i]); + System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is " + + ((i < expectedLegalParses.length) + ? expectedLegalParses[i] + : "not present")); assertTrue(okay); } if (!goodness2) - System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses."); + System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses for ACIP " + acip + "."); assertTrue(goodness2); TStackListList allLegalParses = pt.getLegalParses(); TStackListList decentParses = pt.getNonIllegalParses(); if (pt.getBestParse() == null) { if (legalParses.size() == 0) { if (null != expectedBestParse && !"".equals(expectedBestParse)) { - System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for acip {" + acip + "}"); + System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for ACIP {" + acip + "}"); assertTrue(false); } System.out.print("ACIPNoBestParseError: There is no best parse for the ACIP {" + acip + "}; "); @@ -163,7 +166,7 @@ public class PackageTest extends TestCase { } } else { if (legalParses.size() > 1) { - System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for acip " + acip + ": " + legalParses); + System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for ACIP " + acip + ": " + legalParses); assertTrue(legalParses.size() == 2 && (legalParses.get(0).size() == 1 + legalParses.get(1).size())); @@ -176,7 +179,7 @@ public class PackageTest extends TestCase { if (null != expectedBestParse) { boolean good = pt.getBestParse().equals(expectedBestParse); if (!good) { - System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for acip {" + acip + "}"); + System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for ACIP {" + acip + "}"); } assertTrue(good); } @@ -229,6 +232,116 @@ public class PackageTest extends TestCase { * {@link TPairList#getACIPError()}, and {@link * TPairList#recoverACIP()}. */ public void testBreakACIPIntoChunks() { +tstHelper("GASN"); // ambiguous with regard to prefix rules +tstHelper("BARMA"); // ambiguous with regard to prefix rules +tstHelper("MARDA"); // ambiguous with regard to prefix rules +tstHelper("BBA"); // ambiguous with regard to prefix rules +tstHelper("BBLUGS"); // ambiguous with regard to prefix rules +tstHelper("BDRA"); // ambiguous with regard to prefix rules +tstHelper("BDRAG"); // ambiguous with regard to prefix rules +tstHelper("BDRA'I"); // ambiguous with regard to prefix rules +tstHelper("BDRAL"); // ambiguous with regard to prefix rules +tstHelper("BDRAN"); // ambiguous with regard to prefix rules +tstHelper("BDRANGS"); // ambiguous with regard to prefix rules +tstHelper("BDREN"); // ambiguous with regard to prefix rules +tstHelper("BDRI"); // ambiguous with regard to prefix rules +tstHelper("BDRIS"); // ambiguous with regard to prefix rules +tstHelper("BDROL"); // ambiguous with regard to prefix rules +tstHelper("BDRUG"); // ambiguous with regard to prefix rules +tstHelper("BLCAG"); // ambiguous with regard to prefix rules +tstHelper("BLCI"); // ambiguous with regard to prefix rules +tstHelper("BLKONG"); // ambiguous with regard to prefix rules +tstHelper("BLNGA"); // ambiguous with regard to prefix rules +tstHelper("BLNGAG"); // ambiguous with regard to prefix rules +tstHelper("BMA"); // ambiguous with regard to prefix rules +tstHelper("BMYOD"); // ambiguous with regard to prefix rules +tstHelper("BSALDA"); // ambiguous with regard to prefix rules +tstHelper("BSAMS"); // ambiguous with regard to prefix rules +tstHelper("BSEMS"); // ambiguous with regard to prefix rules +tstHelper("BTSAMS"); // ambiguous with regard to prefix rules +tstHelper("BTSIMS"); // ambiguous with regard to prefix rules +tstHelper("DDANG"); // ambiguous with regard to prefix rules +tstHelper("DDAR"); // ambiguous with regard to prefix rules +tstHelper("DDRANGS"); // ambiguous with regard to prefix rules +tstHelper("DDRUG"); // ambiguous with regard to prefix rules +tstHelper("DNAG"); // ambiguous with regard to prefix rules +tstHelper("DNOGS"); // ambiguous with regard to prefix rules +tstHelper("DRBAN"); // ambiguous with regard to prefix rules +tstHelper("DRGYU"); // ambiguous with regard to prefix rules +tstHelper("DRTOG"); // ambiguous with regard to prefix rules +tstHelper("DYA"); // ambiguous with regard to prefix rules +tstHelper("DYAN"); // ambiguous with regard to prefix rules +tstHelper("GDRA"); // ambiguous with regard to prefix rules +tstHelper("GDRIM"); // ambiguous with regard to prefix rules +tstHelper("GGAN"); // ambiguous with regard to prefix rules +tstHelper("GGYUR"); // ambiguous with regard to prefix rules +tstHelper("GLTAR"); // ambiguous with regard to prefix rules +tstHelper("GLTUNG"); // ambiguous with regard to prefix rules +tstHelper("GMA"); // ambiguous with regard to prefix rules +tstHelper("GMAN"); // ambiguous with regard to prefix rules +tstHelper("GMON"); // ambiguous with regard to prefix rules +tstHelper("GRDEGS"); // ambiguous with regard to prefix rules +tstHelper("GRDZU"); // ambiguous with regard to prefix rules +tstHelper("GRGYA"); // ambiguous with regard to prefix rules +tstHelper("GRNAGS"); // ambiguous with regard to prefix rules +tstHelper("GRTAN"); // ambiguous with regard to prefix rules +tstHelper("GRTOGS"); // ambiguous with regard to prefix rules +tstHelper("GRTZO"); // ambiguous with regard to prefix rules +tstHelper("GRTZOD"); // ambiguous with regard to prefix rules +tstHelper("GRTZON"); // ambiguous with regard to prefix rules +tstHelper("GSLA"); // ambiguous with regard to prefix rules +tstHelper("GSNAD"); // ambiguous with regard to prefix rules +tstHelper("GZLA"); // ambiguous with regard to prefix rules +tstHelper("MBA"); // ambiguous with regard to prefix rules +tstHelper("MBA'"); // ambiguous with regard to prefix rules +tstHelper("MBI'I"); // ambiguous with regard to prefix rules +tstHelper("MHA'A"); // ambiguous with regard to prefix rules +tstHelper("MRDA"); // ambiguous with regard to prefix rules +tstHelper("MRDO"); // ambiguous with regard to prefix rules +tstHelper("MRDZOGS"); // ambiguous with regard to prefix rules +tstHelper("MRGA"); // ambiguous with regard to prefix rules +tstHelper("MRGAD"); // ambiguous with regard to prefix rules +tstHelper("MRGAN"); // ambiguous with regard to prefix rules +tstHelper("MRJES"); // ambiguous with regard to prefix rules +tstHelper("MRJOD"); // ambiguous with regard to prefix rules +tstHelper("MRTOGS"); // ambiguous with regard to prefix rules +tstHelper("MRTOL"); // ambiguous with regard to prefix rules +tstHelper("MRTZE'I"); // ambiguous with regard to prefix rules +tstHelper("MRTZIGS"); // ambiguous with regard to prefix rules +tstHelper("MSAM"); // ambiguous with regard to prefix rules +tstHelper("MSGRIB"); // ambiguous with regard to prefix rules +tstHelper("MSKYES"); // ambiguous with regard to prefix rules +tstHelper("MSON"); // ambiguous with regard to prefix rules +tstHelper("MSOS"); // ambiguous with regard to prefix rules +tstHelper("MSTAMS"); // ambiguous with regard to prefix rules +tstHelper("MSTAN"); // ambiguous with regard to prefix rules + + + + + + // If you're not careful, you'll think GGYES is a legal + // Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's + // Sanskrit, really, because GA doesn't take a GA prefix. + // This doesn't occur in ACIP input files that I've seen, but + // GGYI (S1000I.INC) and GGYUR (S5275MC4.ACT) do occur. + tstHelper("GGYES", "{G}{G}{YE}{S}", + new String[] { "{G}{G}{YE}{S}", "{G}{G+YE}{S}", "{G+G}{YE}{S}" }, + new String[] { }, + "{G+G}{YE}{S}"); + + tstHelper("DRUG", "{D}{RU}{G}", + new String[] { "{D}{RU}{G}", "{D+RU}{G}" }, + new String[] { "{D+RU}{G}" }, + "{D+RU}{G}"); + + + tstHelper("d+H+d+HA", "{d+}{H+}{d+}{HA}", + new String[] { "{d+H+d+HA}" }, + new String[] { "{d+H+d+HA}" }); + + tstHelper("Gd+H+d+HA"); + tstHelper("AUTPA", "{AU}{T}{PA}", new String[] { "{AU}{T}{PA}", "{AU}{T+PA}" }, new String[] { }, @@ -249,7 +362,8 @@ public class PackageTest extends TestCase { new String[] { "{G+R+VA}{'I}" }); tstHelper("G-RVA'I", "{G-}{R}{VA}{'I}", new String[] { "{G}{R+VA}{'I}" }, - new String[] { "{G}{R+VA}{'I}" }); + new String[] { }, + "{G}{R+VA}{'I}"); tstHelper("RVA", "{R}{VA}", new String[] { "{R+VA}" }, new String[] { "{R+VA}" }); @@ -6967,8 +7081,8 @@ tstHelper("ZUR"); "", "[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME shelp("PAS... LA", - "Offset 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n", - "[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]"); + "Offset 5 or maybe 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n", + "[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]"); shelp("PAS... LA", "", true, @@ -6983,28 +7097,28 @@ tstHelper("ZUR"); shelp("", "", "[]"); shelp("[DD]", ""); shelp("[", - "Offset 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); + "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); shelp("{", - "Offset 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); + "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n"); shelp("DD", ""); shelp("DD]", - "Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Offset 2 or maybe 2: Found a truly unmatched close bracket, ] or }.\nOffset 2 or maybe 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); - shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); + shelp("///NYA", "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); shelp("/NYA/", ""); shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", ""); shelp("[LS][# A [[[[[COMMENT][LS]", - "Offset 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" - + "Offset 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); + "Offset 9 or maybe 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 10 or maybe 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 11 or maybe 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 12 or maybe 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n" + + "Offset 13 or maybe 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); shelp("[ILLEGAL COMMENT]", - "Offset 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16 or maybe 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); shelp("(BSKYABS GRO)", ""); // DLC WHAT ARE THESE FOR? - shelp("BSKYABS GRO)", "Offset 11: Unexpected closing parenthesis, ), found.\n"); + shelp("BSKYABS GRO)", "Offset 11 or maybe 11: Unexpected closing parenthesis, ), found.\n"); shelp("BSKYABS GRO(", "Offset END: Unmatched open parenthesis, (, found.\n"); - shelp("((NESTAGE))", "Offset 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10: Unexpected closing parenthesis, ), found.\n"); + shelp("((NESTAGE))", "Offset 1 or maybe 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10 or maybe 10: Unexpected closing parenthesis, ), found.\n"); shelp("(BA)(PA)NYA(CA)", ""); shelp("NYAx", ""); shelp("NYA x", ""); @@ -7033,9 +7147,9 @@ tstHelper("ZUR"); shelp("(NYA ", "Offset END: Unmatched open parenthesis, (, found.\n"); shelp("[*NYA ", "Offset END: Unmatched open bracket found. A correction does not terminate.\n"); shelp("?", "", "[QUESTION:{?}]"); - shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n"); + shelp("KHAN~ BAR ", "Offset 4 or maybe 4: Found an illegal character, ~, with ordinal 126.\n"); shelp("[* Correction with []]", - "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Offset 5 or maybe 5: Found an illegal character, r, with ordinal 114.\nOffset 6 or maybe 6: Found an illegal character, r, with ordinal 114.\nOffset 7 or maybe 7: Found an illegal character, e, with ordinal 101.\nOffset 8 or maybe 8: Found an illegal character, c, with ordinal 99.\nOffset 14 or maybe 14: Found an illegal character, w, with ordinal 119.\nOffset 19 or maybe 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21 or maybe 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); // DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter. @@ -7051,8 +7165,8 @@ tstHelper("ZUR"); uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b"); } shelp("K\\,", - "Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", - "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]"); + "Offset 1 or maybe 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", + "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]"); shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]"); @@ -7073,15 +7187,15 @@ tstHelper("ZUR"); shelp("@01A.3 ", "", "[FOLIO_MARKER:{@01A.3}, TIBETAN_PUNCTUATION:{ }]"); shelp("@001 ", "", "[FOLIO_MARKER:{@001}, TIBETAN_PUNCTUATION:{ }]"); shelp("@19-20A", - "Offset 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n", - "[ERROR:{@}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur. + "Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n", + "[ERROR:{Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur. shelp("@[7B]", ""); shelp("@012A.3KA", "", "[FOLIO_MARKER:{@012A.3}, TIBETAN_NON_PUNCTUATION:{KA}]"); shelp("@012A.34", - "Offset 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n", - "[ERROR:{@012A.}, TIBETAN_NON_PUNCTUATION:{34}]"); + "Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n", + "[ERROR:{Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.}, TIBETAN_NON_PUNCTUATION:{34}]"); shelp("@[07B]", ""); shelp("@[00007B]", ""); shelp("@7B", ""); @@ -7097,8 +7211,8 @@ tstHelper("ZUR"); shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT shelp("//NYA\\\\", - "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", - "[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]"); + "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5 or maybe 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6 or maybe 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", + "[START_SLASH:{/}, ERROR:{Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}]"); } private static void uhelp(String acip) { @@ -7106,7 +7220,7 @@ tstHelper("ZUR"); } private static void uhelp(String acip, String expectedUnicode) { StringBuffer errors = new StringBuffer(); - String unicode = ACIPConverter.convertToUnicode(acip, errors); + String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true); if (null == unicode) { if (null != expectedUnicode && "none" != expectedUnicode) { System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); @@ -8729,22 +8843,22 @@ tstHelper("shKA"); } /* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit: - BDA' þþþþ -B+DA þþþ -DBANG þþþ -D+BA þþþ -DGA' þþþþ -D+GA þþþ -DGRA þþþ -D+GRA þþþ -DGYESþþþþþ -D+GYA þþþ -DMAR þþþþ -D+MA þþþ -GDA' þþþþ -G+DA þþþ -GNAD þþþþ -G+NA þþþ -MNA' þþþþ -M+NA þþþ +BDA' +B+DA +DBANG +D+BA +DGA' +D+GA +DGRA +D+GRA +DGYES +D+GYA +DMAR +D+MA +GDA' +G+DA +GNAD +G+NA +MNA' +M+NA */ diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 1d97639..c1ebfd5 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -520,7 +520,8 @@ class TPairList { * corresponds to exactly one Tibetan grapheme cluster (i.e., * stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a * stack all on its own. */ - void populateWithTGCPairs(ArrayList pl, ArrayList indexList, int index) { + void populateWithTGCPairs(ArrayList pl, + ArrayList indexList, int index) { int sz = size(); if (sz == 0) { return; @@ -540,8 +541,8 @@ class TPairList { // The last pair: TPair p = get(i); ThdlDebug.verify(!"+".equals(p.getRight())); - int where; boolean add_U0F7F = false; + int where; if (p.getRight() != null && (where = p.getRight().indexOf(':')) >= 0) { // this ':' guy is his own TGCPair. @@ -579,27 +580,21 @@ class TPairList { } TGCPair tp; indexList.add(new Integer(index)); - tp = new TGCPair(lWylie.toString() - + (hasNonAVowel - ? ACIPRules.getWylieForACIPVowel(p.getRight()) - : ""), + tp = new TGCPair(lWylie.toString(), + (hasNonAVowel + ? ACIPRules.getWylieForACIPVowel(p.getRight()) + : ""), (isNumeric - ? TGCPair.OTHER - : (hasNonAVowel - ? (isSanskrit - ? TGCPair.SANSKRIT_WITH_VOWEL - : (isTibetan - ? TGCPair.CONSONANTAL_WITH_VOWEL - : TGCPair.OTHER)) - : (isSanskrit - ? TGCPair.SANSKRIT_WITHOUT_VOWEL - : (isTibetan - ? TGCPair.CONSONANTAL_WITHOUT_VOWEL - : TGCPair.OTHER))))); + ? TGCPair.TYPE_OTHER + : (isSanskrit + ? TGCPair.TYPE_SANSKRIT + : (isTibetan + ? TGCPair.TYPE_TIBETAN + : TGCPair.TYPE_OTHER)))); pl.add(tp); if (add_U0F7F) { indexList.add(new Integer(index)); - pl.add(new TGCPair("H", TGCPair.OTHER)); + pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER)); } } } diff --git a/source/org/thdl/tib/text/ttt/TParseTree.java b/source/org/thdl/tib/text/ttt/TParseTree.java index ea83648..2dffa42 100644 --- a/source/org/thdl/tib/text/ttt/TParseTree.java +++ b/source/org/thdl/tib/text/ttt/TParseTree.java @@ -91,7 +91,7 @@ class TParseTree { ParseIterator pi = getParseIterator(); while (pi.hasNext()) { TStackList sl = pi.next(); - if (sl.isLegalTshegBar().isLegal) { + if (sl.isLegalTshegBar(false).isLegal) { sll.add(sl); } } @@ -118,12 +118,12 @@ class TParseTree { * a unique non-illegal parse, you get it. If there's not a * unique answer, null is returned. */ // {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM! - // DLC by using this we can get rid of single-sanskrit-gc, eh? public TStackList getBestParse() { - TStackListList up = getUniqueParse(); + TStackListList up = getUniqueParse(false); if (up.size() == 1) return up.get(0); + up = getNonIllegalParses(); int sz = up.size(); if (sz == 1) { @@ -192,14 +192,17 @@ class TParseTree { * legal parses if there two or more equally good parses. By * "legal", we mean a sequence of stacks that is legal * by the rules of Tibetan tsheg bar syntax (sometimes called - * spelling). */ - public TStackListList getUniqueParse() { + * spelling). + * @param noPrefixTests true if you want to pretend that every + * stack can take every prefix, which is not the case in + * reality */ + public TStackListList getUniqueParse(boolean noPrefixTests) { TStackListList allLegalParses = new TStackListList(2); // save memory TStackListList legalParsesWithVowelOnRoot = new TStackListList(1); ParseIterator pi = getParseIterator(); while (pi.hasNext()) { TStackList sl = pi.next(); - BoolPair bpa = sl.isLegalTshegBar(); + BoolPair bpa = sl.isLegalTshegBar(noPrefixTests); if (bpa.isLegal) { if (bpa.isLegalAndHasAVowelOnRoot) legalParsesWithVowelOnRoot.add(sl); @@ -253,13 +256,23 @@ class TParseTree { public String getWarning(boolean paranoid, TPairList pl, String originalACIP) { - TStackListList up = getUniqueParse(); + + { + TStackList bestParse = getBestParse(); + TStackListList noPrefixTestsUniqueParse = getUniqueParse(true); + if (noPrefixTestsUniqueParse.size() == 1 + && !noPrefixTestsUniqueParse.get(0).equals(bestParse)) { + return "Warning: We're going with " + bestParse + ", but only because our knowledge of prefix rules says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")"; + } + } + + TStackListList up = getUniqueParse(false); if (null == up || up.size() != 1) { boolean isLastStack[] = new boolean[1]; TStackListList nip = getNonIllegalParses(); if (nip.size() != 1) { if (null == getBestParse()) { - return "There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; + return "Warning: There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}"; } else { if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) { if (isLastStack[0]) { @@ -269,7 +282,7 @@ class TParseTree { } } if (paranoid) { - return "Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful."; + return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful."; } } } else { diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index 1b01308..5db6847 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -125,15 +125,17 @@ class TStackList { * Tibetan syntax (sometimes called rules of spelling). If this * is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will * be true if and only if there is an explicit {A} vowel on the - * root stack. */ - public BoolPair isLegalTshegBar() { - // DLC handle PADMA and other Tibetanized Sanskrit fellows. Right now we only handle single-stack guys. + * root stack. + * @param noPrefixTests true if you want to pretend that every + * stack can take every prefix, which is not the case in + * reality */ + public BoolPair isLegalTshegBar(boolean noPrefixTests) { + // DLC handle PADMA and other Tibetanized Sanskrit fellows consistently. Right now we only treat single-stack Sanskrit guys as legal. TTGCList tgcList = new TTGCList(this); StringBuffer warnings = new StringBuffer(); String candidateType - = TibTextUtils.getClassificationOfTshegBar(tgcList, warnings); - // System.out.println("DLC: " + toString() + " has candidateType " + candidateType + " and warnings " + warnings); + = TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests); // preliminary answer: boolean isLegal = (candidateType != "invalid");