From 5e18feb47d2ce212024513c2f2243bf277ebda17 Mon Sep 17 00:00:00 2001 From: dchandler Date: Thu, 16 Oct 2003 04:15:10 +0000 Subject: [PATCH] ACIP now stacks greedily. TTTTTA is T+T+T+T+TA, even though that stack doesn't exist in TM or TMW. Robert Chilton, in personal correspondence, agreed that this is the way to do things. ACIP handles the appendages 'AM, 'ANG, 'US, 'UR, 'I, 'O, and 'U correctly. --- source/org/thdl/tib/text/TibetanDocument.java | 24 +- .../org/thdl/tib/text/ttt/ACIPConverter.java | 60 ++- source/org/thdl/tib/text/ttt/ACIPRules.java | 5 + source/org/thdl/tib/text/ttt/PackageTest.java | 266 +++++++----- source/org/thdl/tib/text/ttt/TPair.java | 49 ++- source/org/thdl/tib/text/ttt/TPairList.java | 399 +++++++++--------- .../thdl/tib/text/ttt/TPairListFactory.java | 55 ++- source/org/thdl/tib/text/ttt/TParseTree.java | 23 +- source/org/thdl/tib/text/ttt/TStackList.java | 37 +- source/org/thdl/tib/text/ttt/package.html | 6 + 10 files changed, 576 insertions(+), 348 deletions(-) diff --git a/source/org/thdl/tib/text/TibetanDocument.java b/source/org/thdl/tib/text/TibetanDocument.java index b29ffdc..7eee74e 100644 --- a/source/org/thdl/tib/text/TibetanDocument.java +++ b/source/org/thdl/tib/text/TibetanDocument.java @@ -232,16 +232,28 @@ public class TibetanDocument extends DefaultStyledDocument { * @param color the color in which to insert, which is used if and only * if {@link #colorsEnabled() colors are enabled} */ - public void appendDuffCodes(DuffCode[] glyphs, Color color) { + public void appendDuffCodes(DuffCode[] glyphs, Color color) { // PERFORMANCE FIXME: this isn't so speedy, but it reuses // existing code. for (int i = 0; i < glyphs.length; i++) { - insertDuff(getLength(), - new DuffData[] { new DuffData(new String(new char[] { glyphs[i].getCharacter() }), - glyphs[i].getFontNum()) }, - color); + appendDuffCode(glyphs[i], color); } - } + } + +/** +* Appends glyph to the end of this document. 
+* @param glyph the Tibetan glyph you want to insert +* @param color the color in which to insert, which is used if and only +* if {@link #colorsEnabled() colors are enabled} +*/ + public void appendDuffCode(DuffCode glyph, Color color) { + // PERFORMANCE FIXME: this isn't so speedy, but it reuses + // existing code. + insertDuff(getLength(), + new DuffData[] { new DuffData(new String(new char[] { glyph.getCharacter() }), + glyph.getFontNum()) }, + color); + } /** Replacing can be more efficient than inserting and then diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java index 9262bb3..9cd602b 100644 --- a/source/org/thdl/tib/text/ttt/ACIPConverter.java +++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java @@ -1,3 +1,6 @@ +// DLC NOW: 'US etc. -- do we handle them all? +// DLC NOW WARN ON NNYA and DBA +// DLC NOW: implement Robert Chilton-supplied prefix rules /* The contents of this file are subject to the THDL Open Community License Version 1.0 (the "License"); you may not use this file except in compliance @@ -348,13 +351,14 @@ public class ACIPConverter { if (null != tdoc) tdoc.appendRoman(text, Color.BLACK); } else { String unicode = null; - DuffCode[] duff = null; + Object[] duff = null; if (stype == TString.TIBETAN_NON_PUNCTUATION) { lastGuyWasNonPunct = true; - TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText()); + TPairList pls[] = TPairListFactory.breakACIPIntoChunks(s.getText()); String acipError; - if ((acipError = pl.getACIPError()) != null) { + if ((acipError = pls[0].getACIPError()) != null + && (null == pls[1] || pls[1].getACIPError() != null)) { hasErrors = true; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]"; if (null != writer) writer.write(errorMessage); @@ -362,8 +366,10 @@ public class ACIPConverter { if (null != errors) errors.append(errorMessage + "\n"); } else { - TParseTree 
pt = pl.getParseTree(); - if (null == pt) { + TParseTree pt0 = pls[0].getParseTree(); + TParseTree pt1 = ((null == pls[1]) + ? null : pls[1].getParseTree()); + if (null == pt0 && null == pt1) { hasErrors = true; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]"; if (null != writer) writer.write(errorMessage); @@ -371,8 +377,10 @@ public class ACIPConverter { if (null != errors) errors.append(errorMessage + "\n"); } else { - TStackList sl = pt.getBestParse(); - if (null == sl) { + TStackList sl0 = pt0.getBestParse(); + TStackList sl1 = ((null == pt1) + ? null : pt1.getBestParse()); + if (null == sl0 && null == sl1) { hasErrors = true; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]"; if (null != writer) writer.write(errorMessage); @@ -380,6 +388,25 @@ public class ACIPConverter { if (null != errors) errors.append(errorMessage + "\n"); } else { + TStackList sl = sl0; + TPairList pl = pls[0]; + TParseTree pt = pt0; + // set sl equal to the best choice of sl0 and sl1. + if (null != sl1) { + BoolTriple sl0bt = sl0.isLegalTshegBar(false); + BoolTriple sl1bt = sl1.isLegalTshegBar(false); + int ct; + if ((ct = sl0bt.compareTo(sl1bt)) < 0) { + sl = sl1; + pl = pls[1]; + pt = pt1; + } else if (0 == ct) { + // sl remains sl0 -- '* is + // a vowel unless it's + // clearly part of an + // appendage like 'AM. 
+ } + } lastGuy = sl; String warning = null; if ("None" != warningLevel) { @@ -428,10 +455,10 @@ public class ACIPConverter { color = Color.BLACK; if (stype == TString.START_SLASH) { if (null != writer) unicode = "\u0F3C"; - if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") }; + if (null != tdoc) duff = new Object[] { TibetanMachineWeb.getGlyph("(") }; } else if (stype == TString.END_SLASH) { if (null != writer) unicode = "\u0F3D"; - if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") }; + if (null != tdoc) duff = new Object[] { TibetanMachineWeb.getGlyph(")") }; } else if (stype == TString.TIBETAN_PUNCTUATION) { // For ACIP, tshegs are used as both // tshegs and whitespace. We treat a @@ -499,7 +526,7 @@ public class ACIPConverter { } else { String wy = ACIPRules.getWylieForACIPOther(s.getText()); if (null == wy) throw new Error("No wylie for ACIP " + s.getText()); - duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) }; + duff = new Object[] { TibetanMachineWeb.getGlyph(wy) }; } } } @@ -526,7 +553,18 @@ public class ACIPConverter { if (null != writer && null != unicode) writer.write(unicode); if (null != tdoc) { if (null != duff && 0 != duff.length) { - tdoc.appendDuffCodes(duff, color); + for (int j = 0; j < duff.length; j++) { + if (duff[j] instanceof DuffCode) + tdoc.appendDuffCode((DuffCode)duff[j], + color); + else { + hasErrors = true; + if (null != errors) + errors.append((String)duff[j] + "\n"); + tdoc.appendRoman((String)duff[j], + Color.RED); + } + } } else { // this happens when you have an // [#ERROR]-producing tsheg bar. diff --git a/source/org/thdl/tib/text/ttt/ACIPRules.java b/source/org/thdl/tib/text/ttt/ACIPRules.java index 5508262..ad535f7 100644 --- a/source/org/thdl/tib/text/ttt/ACIPRules.java +++ b/source/org/thdl/tib/text/ttt/ACIPRules.java @@ -179,6 +179,11 @@ public class ACIPRules { wylieToACIP.put(EWTS, ACIP); } + /** Returns true if and only if s is an ACIP consonant. 
*/ + static final boolean isACIPConsonant(String s) { + return (null != ACIPRules.getWylieForACIPConsonant(s)); + } + private static HashMap acipConsonant2wylie = null; /** Returns the EWTS corresponding to the given ACIP consonant * (without the "A" vowel). Returns null if there is no such diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index ba71f11..c99c164 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -52,28 +52,31 @@ public class PackageTest extends TestCase { public PackageTest() { } private static void tstHelper(String acip) { - tstHelper2(acip, null, false, null, null, null); + tstHelper2(acip, null, false, null, null, null, 0); } private static void tstHelper(String acip, String expectedPairs) { - tstHelper2(acip, expectedPairs, false, null, null, null); + tstHelper2(acip, expectedPairs, false, null, null, null, 0); } private static void tstHelper(String acip, String[] expectedParses) { - tstHelper2(acip, null, false, expectedParses, null, null); + tstHelper2(acip, null, false, expectedParses, null, null, 0); } private static void tstHelper(String acip, String expectedPairs, String[] expectedParses) { - tstHelper2(acip, expectedPairs, false, expectedParses, null, null); + tstHelper2(acip, expectedPairs, false, expectedParses, null, null, 0); } private static void tstHelper(String acip, String expectedPairs, String[] expectedParses, String[] legalParses) { - tstHelper2(acip, expectedPairs, false, expectedParses, legalParses, null); + tstHelper2(acip, expectedPairs, false, expectedParses, legalParses, null, 0); } private static void tstHelper(String acip, String expectedPairs, String[] expectedParses, String[] legalParses, String expectedBestParse) { - tstHelper2(acip, expectedPairs, false, expectedParses, legalParses, expectedBestParse); + tstHelper2(acip, expectedPairs, false, expectedParses, legalParses, expectedBestParse, 0); + } 
+ private static void tstHelper(String acip, String expectedPairs, String[] expectedParses, String[] legalParses, String expectedBestParse, int which) { + tstHelper2(acip, expectedPairs, false, expectedParses, legalParses, expectedBestParse, which); } private static void tstHelper2(String acip) { tstHelper2(acip, null); } private static void tstHelper2(String acip, String expectedPairs) { - tstHelper2(acip, expectedPairs, true, null, null, null); + tstHelper2(acip, expectedPairs, true, null, null, null, 0); } private static final boolean sdebug = false; @@ -82,8 +85,10 @@ public class PackageTest extends TestCase { boolean debug, String[] expectedParses, String[] expectedLegalParses, - String expectedBestParse) { - TPairList l = TPairListFactory.breakACIPIntoChunks(acip); + String expectedBestParse, + int pairListToUse) { + TPairList[] la = TPairListFactory.breakACIPIntoChunks(acip); + TPairList l = la[pairListToUse]; if (sdebug || debug) System.out.println("ACIP=" + acip + " and l'=" + l); if (expectedPairs != null) { @@ -93,6 +98,10 @@ public class PackageTest extends TestCase { } } + if (null == l) { + assertTrue("!null!".equals(expectedBestParse)); + return; + } TParseTree pt = l.getParseTree(); if (pt == null) { if (sdebug || debug) @@ -110,7 +119,7 @@ public class PackageTest extends TestCase { int np = pt.numberOfParses(); boolean goodness = expectedParses == null || expectedParses.length == np; if (sdebug || debug || !goodness) - System.out.println("ACIP=" + acip + " and parse tree=" + pt + " /size " + pt.size() + "; /pairs " + pt.numberOfPairs() + "; /numParses " + np); + System.out.println("ACIP=" + acip + " and expectedParses is " + expectedParses + " with length " + ((null == expectedParses)?0:expectedParses.length) + " and parse tree=" + pt + " /size " + pt.size() + "; /pairs " + pt.numberOfPairs() + "; /numParses " + np); assertTrue(goodness); { @@ -204,7 +213,9 @@ public class PackageTest extends TestCase { && (acip.indexOf('6') < 0) && 
(acip.indexOf('7') < 0) && (acip.indexOf('8') < 0) - && (acip.indexOf('9') < 0)) { + && (acip.indexOf('9') < 0) + && pairListToUse == 1 + && (acip.indexOf('\'') < 0)) { System.out.println("acip=" + acip + "; recovery is " + l.recoverACIP()); assertTrue(false); @@ -214,18 +225,25 @@ public class PackageTest extends TestCase { // DLC FIXME: warn if we have to use the "what stacks take a GA prefix?" rules to get a unique legal parse. public void testCutoff() { - // this would be exponential running time, so we cut it off: + // this would once be exponential running time, so we'd cut it off: tstHelper("BRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTNBRTN"); } public void testSlowestTshegBar() { - // this would be exponential running time, so we cut it off: + // this would once be exponential running time, so we'd cut it off: tstHelper("BRTNBRTNBRTNB"); } public void testPerformance() { tstHelper("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"); - tstHelperboolean x = false; + try { + tstHelper(""); + } catch (IllegalArgumentException e) { + x = true; + } + assertTrue(x); + 
tstHelper("9012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678"); } /** Tests {@link TPairListFactory#breakACIPIntoChunks(String)}, @@ -332,20 +350,71 @@ tstHelper("KA'", "[(K . A), (' . )]", // If you're not careful, you'll think GGYES is a legal // Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's - // Sanskrit, really, because GA doesn't take a GA prefix. + // non-native, really, because GA doesn't take a GA prefix. // This doesn't occur in ACIP input files that I've seen, but // GGYI (S1000I.INC) and GGYUR (S5275MC4.ACT) do occur. tstHelper("GGYES", "{G}{G}{YE}{S}", - new String[] { "{G}{G}{YE}{S}", "{G}{G+YE}{S}", "{G+G}{YE}{S}" }, + new String[] { "{G+G+YE}{S}", "{G}{G+YE}{S}" }, new String[] { }, - "{G+G}{YE}{S}"); + "{G+G+YE}{S}"); // DLC FIXME: warn about BDE vs. B+DE. color such differently. Maybe an inputter saw B+DE and typed in BDE, not thinking. 
tstHelper("BDE", "{B}{DE}", - new String[] { "{B}{DE}", "{B+DE}" }, + new String[] { "{B+DE}", "{B}{DE}" }, new String[] { "{B}{DE}" }, "{B}{DE}"); + tstHelper("GDAMS'O", "{G}{DA}{M}{S'O}", + new String[] { + "{G+DA}{M+S'O}", + "{G}{DA}{M+S'O}", + }, + new String[] { }, + "{G+DA}{M+S'O}", 0); + + tstHelper("GDAMS'O", "{G}{DA}{M}{S-}{'O}", + new String[] { + "{G+DA}{M+S}{'O}", + "{G+DA}{M}{S}{'O}", + "{G}{DA}{M+S}{'O}", + "{G}{DA}{M}{S}{'O}", + }, + new String[] { "{G}{DA}{M}{S}{'O}" }, + "{G}{DA}{M}{S}{'O}", 1); + + tstHelper("SNYAMS'AM'ANG", "{S}{NYA}{M}{S-}{'A}{M-}{'A}{NG}", null, null, "{S+NYA}{M}{S}{'A}{M}{'A}{NG}", 1); + tstHelper("SNYAMS'AM'ANG", "{S}{NYA}{M}{S'A}{M'A}{NG}", null, null, "{S+NYA}{M+S'A}{M'A}{NG}", 0); + tstHelper("SNYAM'AM", null, null, null, "{S+NYA}{M}{'A}{M}", 1); + tstHelper("SNYAMS'AM", null, null, null, "{S+NYA}{M}{S}{'A}{M}", 1); + tstHelper("SNYAM-'A-M", null, null, null, "!null!", 1); + tstHelper("SNYAM-'A-M", null, null, null, "{S+NYA}{M}{'A}{M}", 0); + tstHelper("SNY-M-'-M", null, null, null, "{S+NY}{M}{'}{M}", 0); + tstHelper("SNY-M-'-M", null, null, null, "!null!", 1); + tstHelper("SNYAMS'AM'ANG'U'I'O", null, null, null, "{S+NYA}{M}{S}{'A}{M}{'A}{NG}{'U}{'I}{'O}", 1); + tstHelper("SNYAMS'I'AM'ANG'U'I'O", null, null, null, "{S+NYA}{M}{S}{'I}{'A}{M}{'A}{NG}{'U}{'I}{'O}", 1); + tstHelper("SNYAM+S+'O", null, null, null, "{S+NYA}{M+S+'O}", 0); + tstHelper("SNYAMS+'O", null, null, null, "{S+NYA}{M+S+'O}", 0); + tstHelper("SNYAMS+'O", null, null, null, "{S+NYA}{M+S+'O}", 0); + tstHelper("SAM'UR'US", null, null, null, "{SA}{M}{'U}{R}{'U}{S}", 1); + tstHelper("SAM'US", null, null, null, "{SA}{M}{'U}{S}", 1); + tstHelper("SAM'AM", null, null, null, "{SA}{M}{'A}{M}", 1); + tstHelper("SAMS'ANG", null, null, null, "{SA}{M}{S}{'A}{NG}", 1); + tstHelper("SNYANGD'O", null, null, null, "{S+NYA}{NG}{D}{'O}", 1); + tstHelper("T-SNYANGD'O", null, null, null, "{T}{S+NYA}{NG+D}{'O}", 1); // T is no prefix, so NG+D, not NG-D + tstHelper("T-SNYANGD'O", 
null, null, null, "{T}{S+NYA}{NG+D'O}", 0); + + tstHelper("SNYAM+S+'O", null, null, null, "{S+NYA}{M+S+'O}", 0); + tstHelper("SNYAMS+'O", null, null, null, "{S+NYA}{M+S+'O}", 0); + + tstHelper("GDAMS", null, null, null, "{G}{DA}{M}{S}", 0); + tstHelper("GDAM-S'O", null, null, null, "{G}{DA}{M}{S}{'O}", 1); + tstHelper("GDAM-C'O", null, null, null, "{G+DA}{M}{C'O}", 0); + tstHelper("GDAM-C'O", null, null, null, "{G+DA}{M}{C}{'O}", 1); + tstHelper("GDAMS", null, null, null, "{G}{DA}{M}{S}", 0); + // DLC NOW: FIXME: tstHelper("DKHY", null, null, null, "{D}{KH+YA}", 0); + // DLC DKHY'O should give parse tree {{D-KH+Y'O}, {D+KH+Y'O}} + // DLC DKHYA'O should give parse tree {{D-KH+YA'O}, {D+KH+YA'O}} + tstHelper("SHR'I", "{SH}{R'I}", null, null, @@ -357,7 +426,7 @@ tstHelper("KA'", "[(K . A), (' . )]", // DLC FIXME: do TMW->ACIP->TMW->ACIP round-trip. tstHelper("DRUG", "{D}{RU}{G}", - new String[] { "{D}{RU}{G}", "{D+RU}{G}" }, + new String[] { "{D+RU}{G}", "{D}{RU}{G}" }, new String[] { "{D+RU}{G}" }, "{D+RU}{G}"); @@ -369,22 +438,22 @@ tstHelper("KA'", "[(K . A), (' . )]", tstHelper("Gd+H+d+HA"); tstHelper("AUTPA", "{AU}{T}{PA}", - new String[] { "{AU}{T}{PA}", "{AU}{T+PA}" }, + new String[] { "{AU}{T+PA}" }, new String[] { }, "{AU}{T+PA}"); tstHelper("PADMA", "{PA}{D}{MA}", null, null); tstHelper("PADMA", "{PA}{D}{MA}", - new String[] { "{PA}{D}{MA}", "{PA}{D+MA}" }, + new String[] { "{PA}{D+MA}" }, new String[] { }, "{PA}{D+MA}"); tstHelper("PADMDM", "{PA}{D}{M}{D}{M}", null, new String[] { }, - "{PA}{D+M}{D+M}"); + "{PA}{D+M+D+M}"); tstHelper("GRVA'I", "{G}{R}{VA}{'I}", - new String[] { "{G}{R+VA}{'I}", "{G+R+VA}{'I}" }, + new String[] { "{G+R+VA}{'I}", "{G}{R+VA}{'I}" }, new String[] { "{G+R+VA}{'I}" }); tstHelper("G-RVA'I", "{G-}{R}{VA}{'I}", new String[] { "{G}{R+VA}{'I}" }, @@ -393,22 +462,24 @@ tstHelper("KA'", "[(K . A), (' . 
)]", tstHelper("RVA", "{R}{VA}", new String[] { "{R+VA}" }, new String[] { "{R+VA}" }); + tstHelper("VA", "{VA}", + new String[] { "{VA}" }, + new String[] { }, + ""); + tstHelper("K+O", "{K+}{O}", new String[] { }, new String[] { }); tstHelper("K+0", "{K+}{0}", new String[] { }, new String[] { }); tstHelper("0+K", "{0-}{+-}{K}", new String[] { }, new String[] { }); tstHelper("0+0", "{0-}{+-}{0}", new String[] { }, new String[] { }); - // DLC add tests for BRTAN, BLTA, BLAG, BRAG, B-LAG, B-RAG - - // MARK for searching tstHelper("0", "{0}", new String[] { "{0}" }, new String[] { "{0}" }); tstHelper("0123", "{0-}{1-}{2-}{3}", new String[] { "{0}{1}{2}{3}" }, new String[] { "{0}{1}{2}{3}" }); tstHelper("0-123", "{0-}{-}{1-}{2-}{3}", new String[] { "{0}{1}{2}{3}" }, new String[] { "{0}{1}{2}{3}" }); - tstHelper("0123KA", "{0-}{1-}{2-}{3-}{KA}", new String[] { "{0}{1}{2}{3}{KA}" }, + tstHelper("0123KA", "{0-}{1-}{2-}{3-}{KA}", new String[] { }, new String[] { }); - tstHelper("G0123KA", "{G}{0-}{1-}{2-}{3-}{KA}", new String[] { "{G}{0}{1}{2}{3}{KA}" }, + tstHelper("G0123KA", "{G}{0-}{1-}{2-}{3-}{KA}", new String[] { }, new String[] { }); tstHelper("BHA"); tstHelper("BHE"); @@ -418,10 +489,10 @@ tstHelper("KA'", "[(K . A), (' . )]", tstHelper("D-VA"); tstHelper("DVA"); tstHelper("SRAS", "{S}{RA}{S}", - new String[] { "{S}{RA}{S}", "{S+RA}{S}" }, + new String[] { "{S+RA}{S}" }, new String[] { "{S+RA}{S}" }); tstHelper("SARS", "{SA}{R}{S}", - new String[] { "{SA}{R}{S}", "{SA}{R+S}" }, + new String[] { "{SA}{R+S}", "{SA}{R}{S}" }, new String[] { "{SA}{R}{S}" }); tstHelper("SARAS", "{SA}{RA}{S}", new String[] { "{SA}{RA}{S}" }, @@ -429,9 +500,9 @@ tstHelper("KA'", "[(K . A), (' . 
)]", tstHelper("SHLO", "{SH}{LO}", - new String[] { "{SH}{LO}", "{SH+LO}" }, + new String[] { "{SH+LO}" }, new String[] { "{SH+LO}" }); - tstHelper("ZLUM", "{Z}{LU}{M}", new String[] { "{Z}{LU}{M}", "{Z+LU}{M}" }, new String[] { "{Z+LU}{M}" }); + tstHelper("ZLUM", "{Z}{LU}{M}", new String[] { "{Z+LU}{M}" }, new String[] { "{Z+LU}{M}" }); tstHelper("K'EE", "{K'EE}"); tstHelper("K'O", "{K'O}"); tstHelper("K'OO", "{K'OO}"); @@ -439,8 +510,7 @@ tstHelper("KA'", "[(K . A), (' . )]", tstHelper("K'i", "{K'i}"); tstHelper("K'A", "{K'A}"); tstHelper("B+DDZ", "{B+}{D}{DZ}", - new String[] { "{B+D}{DZ}", - "{B+D+DZ}" }); // we're conservative. + new String[] { "{B+D+DZ}" }); // we're conservative. // A heuristic is to // say that B+DDZ must // be {B+D}{DZ} @@ -452,62 +522,44 @@ tstHelper("KA'", "[(K . A), (' . )]", // that we know the // keyboardist was // aware of the plus - // operator. + // operator. DLC FIXME: warn in this case! tstHelper("BRTN--GA", "{B}{R}{T}{N-}{-}{GA}", new String[] { - "{B}{R}{T}{N}{GA}", - "{B}{R}{T+N}{GA}", - "{B}{R+T}{N}{GA}", - "{B+R}{T}{N}{GA}", - "{B+R}{T+N}{GA}", - }); - tstHelper("BR-TN"); // DLC: no legal parses, and 2 decent ones, eh? 
+ "{B+R+T+N}{GA}", + "{B}{R+T+N}{GA}" + }, + new String[] {}, + "{B+R+T+N}{GA}"); + tstHelper("BR-TN"); tstHelper("BRTN", "{B}{R}{T}{N}", new String[] { - "{B}{R}{T}{N}", - "{B}{R}{T+N}", - "{B}{R+T}{N}", - "{B+R}{T}{N}", - "{B+R}{T+N}", + "{B+R+T+N}", + "{B}{R+T+N}" }, - new String[] { - "{B}{R+T}{N}" // prefix-root-suffix - }); + new String[] { }, + "{B+R+T+N}"); + tstHelper("BRT-N", + "{B}{R}{T-}{N}", + null, + null, + "{B}{R+T}{N}"); + tstHelper("BRTAN", + "{B}{R}{TA}{N}", + null, + null, + "{B}{R+TA}{N}"); tstHelper("BRTN-BRTN", "{B}{R}{T}{N-}{B}{R}{T}{N}", new String[] { - "{B}{R}{T}{N}{B}{R}{T}{N}", - "{B}{R}{T}{N}{B}{R}{T+N}", - "{B}{R}{T}{N}{B}{R+T}{N}", - "{B}{R}{T}{N}{B+R}{T}{N}", - "{B}{R}{T}{N}{B+R}{T+N}", - "{B}{R}{T+N}{B}{R}{T}{N}", - "{B}{R}{T+N}{B}{R}{T+N}", - "{B}{R}{T+N}{B}{R+T}{N}", - "{B}{R}{T+N}{B+R}{T}{N}", - "{B}{R}{T+N}{B+R}{T+N}", - "{B}{R+T}{N}{B}{R}{T}{N}", - "{B}{R+T}{N}{B}{R}{T+N}", - "{B}{R+T}{N}{B}{R+T}{N}", - "{B}{R+T}{N}{B+R}{T}{N}", - "{B}{R+T}{N}{B+R}{T+N}", - "{B+R}{T}{N}{B}{R}{T}{N}", - "{B+R}{T}{N}{B}{R}{T+N}", - "{B+R}{T}{N}{B}{R+T}{N}", - "{B+R}{T}{N}{B+R}{T}{N}", - "{B+R}{T}{N}{B+R}{T+N}", - "{B+R}{T+N}{B}{R}{T}{N}", - "{B+R}{T+N}{B}{R}{T+N}", - "{B+R}{T+N}{B}{R+T}{N}", - "{B+R}{T+N}{B+R}{T}{N}", - "{B+R}{T+N}{B+R}{T+N}", + "{B+R+T+N}{B+R+T+N}", + "{B}{R+T+N}{B+R+T+N}" }); // has 25 parses tstHelper("B+R-T-N-B-R-T+N", new String[] { "{B+R}{T}{N}{B}{R}{T+N}" }); // has 1 parse - tstHelper("B+G+K", "{B+}{G+}{K}", new String[] { }); // no parses. + tstHelper("B+G+K", "{B+}{G+}{K}", new String[] { "{B+G+K}" }, null, "{B+G+K}"); // no parses. tstHelper("G-YA", new String[] { "{G}{YA}" }); // has 1 parse tstHelper("G+YA", @@ -516,51 +568,31 @@ tstHelper("KA'", "[(K . A), (' . 
)]", new String[] { "{G+YAm:}" }); // has 1 parse tstHelper("BRTN-BLTA", "{B}{R}{T}{N-}{B}{L}{TA}", new String[] { - "{B}{R}{T}{N}{B}{L}{TA}", - "{B}{R}{T}{N}{B}{L+TA}", - "{B}{R}{T}{N}{B+L}{TA}", - "{B}{R}{T+N}{B}{L}{TA}", - "{B}{R}{T+N}{B}{L+TA}", - "{B}{R}{T+N}{B+L}{TA}", - "{B}{R+T}{N}{B}{L}{TA}", - "{B}{R+T}{N}{B}{L+TA}", - "{B}{R+T}{N}{B+L}{TA}", - "{B+R}{T}{N}{B}{L}{TA}", - "{B+R}{T}{N}{B}{L+TA}", - "{B+R}{T}{N}{B+L}{TA}", - "{B+R}{T+N}{B}{L}{TA}", - "{B+R}{T+N}{B}{L+TA}", - "{B+R}{T+N}{B+L}{TA}", + "{B+R+T+N}{B+L+TA}", + "{B}{R+T+N}{B+L+TA}" }); // has 15 parses - tstHelper("BSABS", new String[] { "{B}{SA}{B}{S}" }); + tstHelper("BSABS", + new String[] { "{B+SA}{B+S}", "{B+SA}{B}{S}", "{B}{SA}{B+S}", "{B}{SA}{B}{S}" }); tstHelper("ZUNGS"); tstHelper("BRTIB", "{B}{R}{TI}{B}", new String[] { - "{B}{R}{TI}{B}", - "{B}{R+TI}{B}", - "{B+R}{TI}{B}", + "{B+R+TI}{B}", + "{B}{R+TI}{B}" }); tstHelper("PRiTZTSVA", "{P}{Ri}{TZ}{TS}{VA}", new String[] { - "{P}{Ri}{TZ}{TS+VA}", - "{P}{Ri}{TZ+TS+VA}", - "{P+Ri}{TZ}{TS+VA}", "{P+Ri}{TZ+TS+VA}" }); tstHelper("SPYOMS", "{S}{P}{YO}{M}{S}", new String[] { - "{S}{P}{YO}{M}{S}", - "{S}{P}{YO}{M+S}", - "{S}{P+YO}{M}{S}", - "{S}{P+YO}{M+S}", - "{S+P}{YO}{M}{S}", - "{S+P}{YO}{M+S}", - "{S+P+YO}{M}{S}", "{S+P+YO}{M+S}", + "{S+P+YO}{M}{S}", }); tstHelper(":'AO", "[(: . -), (' . ), (A . O)]"); + tstHelper("m'AO", "[(m . -), (' . ), (A . O)]"); + tstHelper("m:'AO", "[(m . -), (: . -), (' . ), (A . O)]"); tstHelper("AA:", "[(A . A:)]", new String[] { "{AA:}" }); tstHelper("KE:", "[(K . E:)]"); tstHelper("K:", "[(K . ), (: . )]", @@ -2174,6 +2206,7 @@ tstHelper("DBANG"); tstHelper("DBAR"); tstHelper("DBAS"); tstHelper("DBE"); +// DLC NOW: TMW->ACIP doesn't do {KHA (KA)}. tstHelper("DBEN"); tstHelper("DBER"); tstHelper("DBES"); @@ -7135,7 +7168,7 @@ tstHelper("ZUR"); + "Offset 13 or maybe 13: Found an open bracket within a [#COMMENT]-style comment. 
Brackets may not appear in comments.\n"); shelp("[ILLEGAL COMMENT]", "Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16 or maybe 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); - shelp("(BSKYABS GRO)", ""); // DLC WHAT ARE THESE FOR? + shelp("(BSKYABS GRO)", ""); shelp("BSKYABS GRO)", "Offset 11 or maybe 11: Unexpected closing parenthesis, ), found.\n"); shelp("BSKYABS GRO(", "Offset END: Unmatched open parenthesis, (, found.\n"); shelp("((NESTAGE))", "Offset 1 or maybe 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10 or maybe 10: Unexpected closing parenthesis, ), found.\n"); @@ -7171,7 +7204,7 @@ tstHelper("ZUR"); shelp("[* Correction with []]", "Offset 5 or maybe 5: Found an illegal character, r, with ordinal 114.\nOffset 6 or maybe 6: Found an illegal character, r, with ordinal 114.\nOffset 7 or maybe 7: Found an illegal character, e, with ordinal 101.\nOffset 8 or maybe 8: Found an illegal character, c, with ordinal 99.\nOffset 14 or maybe 14: Found an illegal character, w, with ordinal 119.\nOffset 19 or maybe 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21 or maybe 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); - // DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. 
Autocorrect to the latter. + // DLC DOC: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. We autocorrect to the latter. // DLC FIXME: @0B1 isn't handled correctly! @@ -7268,7 +7301,7 @@ tstHelper("ZUR"); System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); assertTrue(false); } - System.out.println("DLC: Unicode for " + acip + " can't be had; errors are " + errors); + System.out.println("Unicode for " + acip + " can't be had; errors are " + errors); } else { if (null != expectedUnicode && !expectedUnicode.equals(unicode)) { System.out.println("The unicode for " + acip + " is " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(unicode) + ", but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); @@ -7278,6 +7311,18 @@ tstHelper("ZUR"); } public void testACIPConversion() { + uhelp("DZHDZHA", "\u0f5c\u0fac"); // tricky because DZHDZA is not in TMW but DZHDZHA is + uhelp("DZHDZA", "\u0f5c\u0fab"); + uhelp("P+S+N+YA", "\u0f54\u0fb6\u0fa3\u0fb1"); + uhelp("PSNYA", "\u0f54\u0fb6\u0f99"); // Is this P+S+N+YA? No, it's P+S+NYA. But, DLC, warn! + uhelp("NNYA", "\u0f53\u0f99"); // DLC warn + uhelp("GHNYA", "\u0f43\u0f99"); + + // TS+NYA and T+S+N+YA are both legal, so what is TSNYA? + // Private correspondence with Robert Chilton says that it is + // TS+NYA, but he warns that such are suspect. 
+ + uhelp("THAG PA", "\u0f50\u0f42\u0f0b\u0f54"); uhelp("KA \nKHA\n\nGA", "\u0f40\u0f0b\u0f41\u0f0b\n\n\u0f42"); uhelp("KA%\nKHA", "\u0f40\u0f35\u0f0b\u0f41"); uhelp("KA%", "\u0f40\u0f35"); @@ -7317,7 +7362,7 @@ tstHelper("ZUR"); uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); uhelp("*#HUm: K+DHA GRO`;.,", - "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") K+DHA IS ESSENTIALLY NOTHING.]\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); + "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f40\u0fa2\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); // DLC FIXME: the file ACIP_SHRI should be made into an ACIP->TMW automated test case } @@ -7507,7 +7552,6 @@ tstHelper("ZUR"); tstHelper("AUT"); tstHelper("AUTPALA'I"); tstHelper("B'I"); - tstHelper("B. (DLC!)"); tstHelper("BA"); tstHelper("BA'"); tstHelper("BA'A"); @@ -7919,7 +7963,6 @@ tstHelper("ZUR"); tstHelper("GDONGS"); tstHelper("GDUD"); tstHelper("GDUG"); - tstHelper("GDUG.PA (DLC!)"); tstHelper("GDUGS"); tstHelper("GDUN"); tstHelper("GE"); @@ -8474,7 +8517,6 @@ tstHelper("ZUR"); tstHelper("RNGUS"); tstHelper("RNOGS"); tstHelper("RO"); - tstHelper("RO.STOD (DLC!)"); tstHelper("ROL"); tstHelper("RSBOD"); tstHelper("RTAG"); diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java index f442007..a3f8e7f 100644 --- a/source/org/thdl/tib/text/ttt/TPair.java +++ b/source/org/thdl/tib/text/ttt/TPair.java @@ -118,7 +118,7 @@ class TPair { return (null != l && ((null == r || "".equals(r)) || "-".equals(r) - || "A".equals(r)) // DLC though check for BASKYABS and warn because BSKYABS is more common + || "A".equals(r)) // DLC FIXME: though check for BASKYABS and warn because BSKYABS is more common && ("'".equals(l) || "M".equals(l) || "B".equals(l) @@ -126,12 +126,52 @@ class TPair { || 
"G".equals(l))); } + /** Returns true if and only if this pair could be a Tibetan + * secondary suffix. */ + boolean isPostSuffix() { + return (null != l + && ((null == r || "".equals(r)) + || "-".equals(r) + || "A".equals(r)) // DLC FIXME: though warn about GAMASA vs. GAMS + && ("S".equals(l) + || "D".equals(l))); + } + + /** Returns true if and only if this pair could be a Tibetan + * suffix. DLC FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */ + boolean isSuffix() { + return (null != l + && ((null == r || "".equals(r)) + || "-".equals(r) + || "A".equals(r)) + && ("S".equals(l) + || "G".equals(l) + || "D".equals(l) + || "M".equals(l) + || "'".equals(l) + || "B".equals(l) + || "NG".equals(l) + || "N".equals(l) + || "L".equals(l) + || "R".equals(l))); + } + /** Returns true if and only if this pair is merely a * disambiguator. */ boolean isDisambiguator() { return ("-".equals(r) && getLeft() == null); } + /** Returns true if and only if x is a TPair whose left side and right side both equal this pair's (null matches only null). */ + public boolean equals(Object x) { + if (x instanceof TPair) { + TPair p = (TPair)x; + return ((getLeft() == p.getLeft() || (getLeft() != null && getLeft().equals(p.getLeft()))) + && (getRight() == p.getRight() || (getRight() != null && getRight().equals(p.getRight())))); + } + return false; + } + /** Returns an TPair that is like this pair except that it has * a "+" on the right if this pair is empty on the right and is * empty on the right if this pair has a disambiguator (i.e., a @@ -195,4 +235,11 @@ class TPair { if (null != x) sb.append(x); } } + + /** Returns true if this pair is surely the last pair in an ACIP + * stack. Stacking continues through (* . ) and (* . +), but + * stops anywhere else.
*/ + boolean endsACIPStack() { + return (getRight() != null && !"+".equals(getRight())); + } } diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 6858734..6df7031 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -284,216 +284,216 @@ class TPairList { * syntax) to do so. If this list of pairs has something clearly * illegal in it, or is empty, or is merely a list of * disambiguators etc., then this returns null. Never returns an - * empty parse tree. */ + * empty parse tree. + */ public TParseTree getParseTree() { - TParseTree pt = new TParseTree(); + // We treat [(B . ), (G . +), (K . ), (T . A)] as if it could + // be {B+G+K+T} or {B}{G+K+T}; we handle prefixes specially + // this way. [(T . ), (G . +), (K . ), (T . A)] is clearly + // {T+G+K+TA} (and, DLC FIXME, we should warn that there are + // some pluses but not all) + // + // We don't care if T+G+K+T is in TMW or not -- there is no + // master list of stacks. + int sz = size(); - int firstPair = 0; for (int i = 0; i < sz; i++) { - - // We treat [(B . ), (G . +), (K . ), (T . A)] as if it - // could be {B+G+K+T} or {B}{G+K}{T} or {B+G+K}{T} or - // {B}{G+K+T} (modulo stack legality); we're conservative. - // (Though some stacks won't be legal.) + TPair p = get(i); + if (p.getLeft() == null && !"-".equals(p.getRight())) + return null; // clearly illegal. + if ("+".equals(p.getLeft())) + return null; // clearly illegal. + if (":".equals(p.getLeft())) + return null; // clearly illegal. + if ("m".equals(p.getLeft())) + return null; // clearly illegal. + if ("m:".equals(p.getLeft())) + return null; // clearly illegal. 
+ } + TParseTree pt = new TParseTree(); + if (sz < 1) return null; + + // When we see a stretch of ACIP without a disambiguator or a + // vowel, that stretch is taken to be one stack unless it may + // be prefix-root or suffix-postsuffix or suffix/postsuffix-' + // -- the latter necessary because GAMS'I is GAM-S-'I, not + // GAM-S+'I. 'UR, 'US, 'ANG, 'AM, 'I, 'O, 'U -- all begin + // with '. So we can have zero, one, two, or three special + // break locations. (The kind that aren't special are the + // break after G in G-DAMS, or the break after G in GADAMS or + // GEDAMS.) + // + // If a nonnegative number appears in breakLocations[i], it + // means that pair i may or may not be stacked with pair i+1. + int nextBreakLoc = 0; + int breakLocations[] = { -1, -1, -1 }; + + boolean mayHavePrefix; + + // Handle the first pair specially -- it could be a prefix. + if (ddebug) System.out.println("i is " + 0); + if ((mayHavePrefix = get(0).isPrefix()) && null == get(0).getRight()) { + // special case: we must have a branch in the parse tree + // for the initial part of this pair list. For example, + // is DKHYA D+KH+YA or D-KH+YA? It depends on prefix + // rules (can KH+YA take a DA prefix?), so the parse tree + // includes both. + breakLocations[nextBreakLoc++] = 0; + } + + // stack numbers start at 1. + int stackNumber = (get(0).endsACIPStack()) ? 2 : 1; + // this starts at 0. + int stackStart = (get(0).endsACIPStack()) ? 1 : 0; + + int numeric = 0; // 1 means surely, 0 means we don't know yet, -1 means surely not + + for (int i = 1; i < sz; i++) { if (ddebug) System.out.println("i is " + i); TPair p = get(i); - if (p.getRight() == null && firstPair + 1 < sz) { - // Here's the ambiguity. Let's fill up sl. (B . ) (G - // . +) (K . A) could be {B+G+KA} or {BA}{G+KA}, so we - // go until we hit a vowel and then break into - // TPairLists. - int start = firstPair; - int blanks[] = new int[sz - start]; // we may not use all of this. 
- int j; - for (j = start; j < sz; j++) { - TPair pj = get(j); - boolean isBlank; - if (ddebug) System.out.println("right guy is " + pj.getRight()); - if (pj.isDisambiguator()) - blanks[j-start] = ALWAYS_STOP_STACKING; - else { - if (!(isBlank = (pj.getRight() == null)) && !"+".equals(pj.getRight())) { - if (ddebug) System.out.println("breaker breaker at j=" + j); - break; - } - blanks[j-start] = isBlank ? STOP_STACK : ALWAYS_KEEP_STACKING; - } - } - if (j >= sz) j = sz - 1; - blanks[j-start] = ALWAYS_STOP_STACKING; - - // get(j) [corresponding to blanks[j-i]] is - // the last pair in the ambiguous stretch; get(i) - // [corresponding to blanks[0]] is the first. - - // We'll end up doing 2**(j-i+1) (i.e., (1 << - // (j-i+1))) iterations. If that's going to be too - // many, let's just say there's no legal parse. FIXME: - // give a nice error message in this case. - if (ddebug) System.out.println("ddebug: we're going to do 2^" + (j-i+1) + " [or " + (1 << (j-i+1)) + "] wacky iterations!"); - if ((j-i+1) > 13) // if you don't use 13, then change PackageTest.testSlowestTshegBar(). - return null; - - boolean keepGoing = true; - TStackListList sll = new TStackListList(); - do { - // Add the stack list currently specified by - // blanks if all the stacks in it are legal. 
-// DLC DELETE { -// ArrayList x = new ArrayList((j-start+1)); -// for (int ii = 0; ii < (j-start+1); ii++) -// x.add(new Integer(blanks[ii])); -// } - TStackList sl = new TStackList(sz - start); - boolean illegal = false; - TPairList currentStack = new TPairList(); - for (int k = 0; k < j-start+1; k++) { - TPair pk = get(start + k); - if (!pk.isDisambiguator()) { - currentStack.add(pk.insideStack()); - if (blanks[k] == STOP_STACK) { - if (currentStack.isLegalTibetanOrSanskritStack()) - sl.add(currentStack.asStack()); - else { - illegal = true; - break; - } - currentStack = new TPairList(); - } - } - } - if (!illegal && !currentStack.isEmpty()) { - if (currentStack.isLegalTibetanOrSanskritStack()) { - TPairList stack = currentStack.asStack(); - if (ddebug) System.out.println("adding currentStack " + stack + " to sl " + sl); - sl.add(stack); - } else { - illegal = true; - } - } - if (!illegal) { - if (ddebug) System.out.println("adding sl " + sl + " to sll " + sll); - sll.add(sl); - } - - // Update blanks. Think of this as doing base 2 - // arithmetic where STOP_STACK is zero, - // KEEP_STACKING is one, and ALWAYS_KEEP_STACKING - // and ALWAYS_STOP_STACKING are digits we cannot - // modify. We'll end up doing 2^M iterations, - // where M is the number of fields in blanks that - // are not equal to ALWAYS_KEEP_STACKING or - // ALWAYS_STOP_STACKING. - keepGoing = false; - for (int k = j-start; k >= 0; k--) { - if (blanks[k] == STOP_STACK) { - keepGoing = true; - blanks[k] = KEEP_STACKING; - // reset all digits to the right of k to - // "zero": - for (int m = k + 1; m < j-start+1; m++) { - if (blanks[m] == KEEP_STACKING) - blanks[m] = STOP_STACK; - } - break; - } - } - } while (keepGoing); - if (sll.isEmpty()) - return null; // STXAL or shT+ZNAGN, e.g. 
- else { - if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt); - pt.add(sll); - } - - if (ddebug) System.out.println("i is " + i + " and j is " + j + " and we are resetting so that i==j+1 next time."); - i = j; - firstPair = j + 1; - } else if ("+".equals(p.getRight())) { - // Keep firstPair where it is. + boolean nn; + if ((nn = p.isNumeric()) && ("+".equals(get(i-1).getRight()) + || "+".equals(p.getRight()))) + return null; // clearly illegal. You can't stack numbers. + if (nn) { + if (-1 == numeric) + return null; // you can't mix numbers and letters. + else if (0 == numeric) + numeric = 1; } else { - // Add all pairs in the range [firstPair, i]. Some - // pairs are stacks all by themselves, some pairs have - // '+' on the right and are thus just part of a stack. - // We'll add a whole number of stacks, though. - - // this is initialized to hold the max we might use: - TStackListList sll - = new TStackListList(i - firstPair + 1); + if (numeric == 1) + return null; // you can't mix numbers and letters. 
+ else if (0 == numeric && !p.isDisambiguator()) + numeric = -1; + } - TPairList currentStack = new TPairList(); - for (int j = firstPair; j <= i; j++) { - TPair pj = get(j); - if (!pj.isDisambiguator()) { - currentStack.add(pj.insideStack()); - if (!"+".equals(pj.getRight())) { - if (currentStack.isLegalTibetanOrSanskritStack()) - sll.add(new TStackList(currentStack.asStack())); - else { - return null; - } - currentStack = new TPairList(); + if (i+1==sz || p.endsACIPStack()) { + if (/* the stack ending here might really be + suffix-postsuffix or + suffix-appendage or + suffix-postsuffix-appendage */ + (mayHavePrefix && (stackNumber == 2 || stackNumber == 3)) + || (!mayHavePrefix && (stackNumber == 2))) { + if (i > stackStart) { + if (get(stackStart).isSuffix() + && (get(stackStart+1).isPostSuffix() // suffix-postsuffix + || "'".equals(get(stackStart+1).getLeft()))) // suffix-appendage + breakLocations[nextBreakLoc++] = stackStart; + if (i > stackStart + 1) { + // three to play with, maybe it's + // suffix-postsuffix-appendage. + if (get(stackStart).isSuffix() + && get(stackStart+1).isPostSuffix() + && "'".equals(get(stackStart+2).getLeft())) + breakLocations[nextBreakLoc++] = stackStart+1; } } + // else no need to insert a breakLocation, we're + // breaking hard. } - if (!currentStack.isEmpty()) - throw new Error("how can this happen? currentStack is " + currentStack); - - if (!sll.isEmpty()) { - if (ddebug) System.out.println("adding sll " + sll + " to parse tree " + pt); - pt.add(sll); - firstPair = i + 1; - } // else you probably have {G--YA} or something as - // your tsheg bar. 
+ if (/* the stack ending here might really be + postsuffix-appendage (e.g., GDAM-S'O) */ + (mayHavePrefix && (stackNumber == 3 || stackNumber == 4)) + || (!mayHavePrefix && (stackNumber == 3))) { + if (i == stackStart+1) { // because GDAM--S'O is illegal, and because it's 'ANG, not 'NG, 'AM, not 'M -- ' always ends the stack + if (get(stackStart).isPostSuffix() + && "'".equals(get(stackStart+1).getLeft())) + breakLocations[nextBreakLoc++] = stackStart; + } + } + ++stackNumber; + stackStart = i+1; } } + // DLC FIXME: we no longer need all these breakLocations -- we can handle SAM'AM'ANG + + // Now go from hard break (i.e., (* . VOWEL or -)) to hard + // break (and there's a hard break after the last pair, of + // course, even if it is (G . ) or (G . +) [the latter being + // hideously illegal]). Between the hard breaks, there will + // be 1, 2, or 4 (can you see why 8 isn't possible, though + // numBreaks can be 3?) possible parses. There are two of DGA + // in DGAMS'O -- D-GA and D+GA. There are 4 of MS'O in + // DGAMS'O -- M-S-'O, M-S+'O, M+S-'O, and M+S+'O. Add one + // TStackListList per hard break to pt, the parse tree. + int startLoc = 0; // which pair starts this hard break? + + // DLC FIXME: assert this + if ((breakLocations[1] >= 0 && breakLocations[1] <= breakLocations[0]) + || (breakLocations[2] >= 0 && breakLocations[2] <= breakLocations[1])) + throw new Error("breakLocations is monotonically increasing, ain't it?"); + + for (int i = 0; i < sz; i++) { + if (i+1 == sz || get(i).endsACIPStack()) { + TStackListList sll = new TStackListList(4); // maximum is 4. 
+ + int numBreaks = 0; + int breakStart = -1; + for (int jj = 0; jj < breakLocations.length; jj++) { + if (breakLocations[jj] >= startLoc + && breakLocations[jj] <= i) { + if (breakStart < 0) + breakStart = jj; + ++numBreaks; + } + } + + // Count from [0, 1< 0) { + for (int j = 0; breakStart+j < 3; j++) { + if (k == breakLocations[breakStart+j] + && 1 == ((counter >> j) & 1)) { + if (!currentStack.isEmpty()) + sl.add(currentStack.asStack()); + currentStack = new TPairList(); + break; // shouldn't matter, but you never know + } + } + } + } + } + if (!sl.isEmpty()) { + sll.add(sl); + } + } + + if (!sll.isEmpty()) + pt.add(sll); + startLoc = i+1; + } + } + + if (pt.isEmpty()) return null; return pt; } - /** Returns true if and only if this list of TPairs can be - * interpreted as a legal Tibetan stack or a legal Tibetanized - * Sanskrit stack. This is private because a precondition is - * that no vowels or disambiguators appear except possibly in the - * final pair. */ - private boolean isLegalTibetanOrSanskritStack() { - StringBuffer tibetan = new StringBuffer(); - StringBuffer sanskrit = new StringBuffer(); - int sz = size(); - - // Special case because otherwise wa-zur alone would be seen - // as legal. - if (sz == 1 && "V".equals(get(0).getLeft())) - return false; - - for (int i = 0; i < sz; i++) { - TPair p = get(i); - String ewts_form - = ACIPRules.getWylieForACIPConsonant(p.getLeft()); - if (null == ewts_form) { - if (p.isNumeric()) - ewts_form = p.getLeft(); - } - if (null == ewts_form) { - if (ddebug) System.out.println("testing " + toString2() + " for legality said false. numeric?" 
+ p.isNumeric() + "[1]"); - return false; - } - tibetan.append(ewts_form); - sanskrit.append(ewts_form); - if (i + 1 < sz) { - tibetan.append('-'); - sanskrit.append('+'); - } - } - boolean ans = - (TibetanMachineWeb.hasGlyph(tibetan.toString()) - || TibetanMachineWeb.hasGlyph(sanskrit.toString())); - if (ddebug) System.out.println("testing " + toString2() + " for legality said " + ans + " [2]; san is " + sanskrit + " tib is " + tibetan + "."); - return ans; - } private static final boolean ddebug = false; /** Mutates this TPairList object such that the last pair is @@ -611,9 +611,11 @@ class TPairList { } /** Appends the DuffCodes that correspond to this grapheme cluster - * to duff. Assumes this is one grapheme cluster. */ - void getDuff(ArrayList duff) { - int previousSize = duff.size(); + * to duffsAndErrors, or appends a String that is an error + * message saying that TMW cannot represent this grapheme + * cluster. */ + void getDuff(ArrayList duffsAndErrors) { + int previousSize = duffsAndErrors.size(); StringBuffer wylieForConsonant = new StringBuffer(); for (int x = 0; x + 1 < size(); x++) { wylieForConsonant.append(get(x).getWylie(false)); @@ -624,17 +626,18 @@ class TPairList { if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { hashKey = hashKey.replace('+', '-'); if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { - throw new Error("How did this happen?"); + duffsAndErrors.add("[#ERROR The ACIP {" + recoverACIP() + "} cannot be represented with the TibetanMachine or TibetanMachineWeb fonts because no such glyph exists in these fonts.]"); + return; } } if (lastPair.getRight() == null || lastPair.equals("-")) { - duff.add(TibetanMachineWeb.getGlyph(hashKey)); + duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey)); } else { - ACIPRules.getDuffForACIPVowel(duff, + ACIPRules.getDuffForACIPVowel(duffsAndErrors, TibetanMachineWeb.getGlyph(hashKey), lastPair.getRight()); } - if (previousSize == duff.size()) + if (previousSize == duffsAndErrors.size()) throw new 
Error("TPairList with no duffs? " + toString()); // DLC FIXME: change to assertion. } } diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java index 9264b6d..8e19629 100644 --- a/source/org/thdl/tib/text/ttt/TPairListFactory.java +++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java @@ -38,9 +38,42 @@ class TPairListFactory { * rest would be suboptimal, so we backtrack to [(T . )] and then * finally become [(T . ), (A . A)]. We look for (A . ) and ( * . ) in the rest in order to say "the rest would be - * suboptimal", i.e. we use TPairList.hasSimpleError() - * @param acip a string of ACIP with no punctuation in it */ - static TPairList breakACIPIntoChunks(String acip) { + * suboptimal", i.e. we use TPairList.hasSimpleError().

+ * + *

There is one case where we break things up into two pair + * lists -- I found out about this case too late to do anything + * clean about it. SNYAM'AM, e.g., breaks up into [(S . ), (NY + * . A), (M . 'A), (M . )], which is incorrect -- [(S . ), (NY + * . A), (M . ), (' . A), (M . )] is correct. But we don't know + * which is correct without parsing, so both are returned. The + * clean treatment (low-priority FIXME) would be to lex into a + * form that didn't insist 'A was either a vowel or a consonant. + * Then the parser would figure it out.

+ * + * @param acip a string of ACIP with no punctuation in it + * @return an array of one or two pair lists, if the former, then + * the second element will be null, if the latter, the second + * element will have (* . ), (' . *) instead of (* . '*) which + * the former has @throws IllegalArgumentException if acip is too + * large for us to break into chunks (we're recursive, not + * iterative, so the boundary can be increased a lot if you care, + * but you don't) */ + static TPairList[] breakACIPIntoChunks(String acip) throws IllegalArgumentException { + try { + TPairList a = breakHelper(acip, true); + TPairList b = breakHelper(acip, false); + if (a.equals(b)) + return new TPairList[] { a, null }; + else + return new TPairList[] { a, b }; + } catch (StackOverflowError e) { + throw new IllegalArgumentException("Input too large[1]: " + acip); + } catch (OutOfMemoryError e) { + throw new IllegalArgumentException("Input too large[2]: " + acip); + } + } + /** Helps {@link #breakACIPIntoChunks(String)}. */ + private static TPairList breakHelper(String acip, boolean tickIsVowel) { // base case for our recursion: if ("".equals(acip)) @@ -50,9 +83,21 @@ class TPairListFactory { int howMuchBuf[] = new int[1]; TPair head = getFirstConsonantAndVowel(acipBuf, howMuchBuf); int howMuch = howMuchBuf[0]; + if (!tickIsVowel + && null != head.getLeft() + && null != head.getRight() + && head.getRight().startsWith("'")) { + head = new TPair(head.getLeft(), + // Without this disambiguator, we are + // less efficient (8 parses, not 4) and + // we can't handle PA'AM'ANG etc.
+ "-"); + howMuch = head.getLeft().length(); + } + TPairList tail; if ((tail - = breakACIPIntoChunks(acipBuf.substring(howMuch))).hasSimpleError()) { + = breakHelper(acipBuf.substring(howMuch), tickIsVowel)).hasSimpleError()) { for (int i = 1; i < howMuch; i++) { // try giving i characters back if that leaves us with // a legal head and makes the rest free of simple @@ -61,7 +106,7 @@ class TPairListFactory { TPair newHead; if ((newHead = head.minusNRightmostACIPCharacters(i)).isLegal() && !(newTail - = breakACIPIntoChunks(acipBuf.substring(howMuch - i))).hasSimpleError()) { + = breakHelper(acipBuf.substring(howMuch - i), tickIsVowel)).hasSimpleError()) { newTail.prepend(newHead); return newTail; } diff --git a/source/org/thdl/tib/text/ttt/TParseTree.java b/source/org/thdl/tib/text/ttt/TParseTree.java index 64c6ed1..841177b 100644 --- a/source/org/thdl/tib/text/ttt/TParseTree.java +++ b/source/org/thdl/tib/text/ttt/TParseTree.java @@ -184,10 +184,7 @@ class TParseTree { } /** Returns a list containing the unique legal parse of this parse - * tree if there is a unique legal parse. Note that {SRAS} has a - * unique legal parse, though {SRS} has two equally good parses; - * i.e., note that the {A} vowel is treated specially here - * (unlike in {@link #getLegalParses()}). Returns an empty list + * tree if there is a unique legal parse. Returns an empty list * if there are no legal parses. Returns a list containing all * legal parses if there two or more equally good parses. By * "legal", we mean a sequence of stacks that is legal @@ -223,13 +220,21 @@ class TParseTree { if (allStrictlyLegalParses.size() > 2) throw new Error("can this happen?"); if (legalParsesWithVowelOnRoot.size() == 2) { - if (legalParsesWithVowelOnRoot.get(0).size() != 1 + legalParsesWithVowelOnRoot.get(1).size()) - throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! 
" + legalParsesWithVowelOnRoot.get(0) + " ;; " + legalParsesWithVowelOnRoot.get(1)); - return new TStackListList(legalParsesWithVowelOnRoot.get(1)); + if (legalParsesWithVowelOnRoot.get(0).size() + != 1 + legalParsesWithVowelOnRoot.get(1).size()) { + // MARDA is MAR+DA or MA-R-DA -- both are legal if + // noPrefixTests. + return new TStackListList(); + } else { + // G-YA vs. GYA. + return new TStackListList(legalParsesWithVowelOnRoot.get(1)); + } } if (allNonillegalParses.size() == 2) { - if (allNonillegalParses.get(0).size() != 1 + allNonillegalParses.get(1).size()) - throw new Error("Something other than the G-YA vs. GYA case appeared. Sorry for your trouble! " + allNonillegalParses.get(0) + " ;; " + allNonillegalParses.get(1)); + if (allNonillegalParses.get(0).size() != 1 + allNonillegalParses.get(1).size()) { + // BDREN, e.g., if noPrefixTests: + return new TStackListList(); + } return new TStackListList(allNonillegalParses.get(1)); } return allNonillegalParses; diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index 647ce54..e624bb6 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -131,7 +131,7 @@ class TStackList { * stack can take every prefix, which is not the case in * reality */ public BoolTriple isLegalTshegBar(boolean noPrefixTests) { - // DLC handle PADMA and other Tibetanized Sanskrit fellows consistently. Right now we only treat single-stack Sanskrit guys as legal. + // DLC Should we handle PADMA and other Tibetanized Sanskrit fellows consistently? Right now we only treat single-stack Sanskrit guys as legal. 
TTGCList tgcList = new TTGCList(this); StringBuffer warnings = new StringBuffer(); @@ -191,8 +191,10 @@ class TStackList { * @param isLastStack if non-null, then isLastStack[0] will be * set to true if and only if the very last stack is the only * stack not to have a vowel or disambiguator on it */ + // DLC FIXME: DELETE THIS WARNING and this code unless EWTS will need it... boolean hasStackWithoutVowel(TPairList opl, boolean[] isLastStack) { int runningSize = 0; + // DLC FIXME: MARDA is MARD==MAR-D to us, but is probably MAR+DA, warn for (int i = 0; i < size(); i++) { TPairList pl = get(i); String l; @@ -207,7 +209,7 @@ class TStackList { } } if (runningSize != opl.sizeMinusDisambiguators()) - throw new IllegalArgumentException("opl (" + opl + ") is bad for this stack list (" + toString() + ")"); + throw new IllegalArgumentException("runningSize = " + runningSize + "; opl.sizeMinusDisambiguators = " + opl.sizeMinusDisambiguators() + "; opl (" + opl + ") is bad for this stack list (" + toString() + ")"); return false; } @@ -219,8 +221,11 @@ class TStackList { } return u.toString(); } - /** Returns the DuffCodes corresponding to this stack list. */ - DuffCode[] getDuff() { + /** Returns the DuffCodes and errors corresponding to this stack + list. Each element of the array is a DuffCode or a String, the + latter if and only if the TMW font cannot represent the + corresponding stack in this list. */ + Object[] getDuff() { ArrayList al = new ArrayList(size()*2); // rough estimate int count = 0; for (int i = 0; i < size(); i++) { @@ -229,20 +234,40 @@ class TStackList { if (size() > 0 && al.size() == 0) { throw new Error("But this stack list, " + this + ", contains " + size() + " stacks! How can it not have DuffCodes associated with it?"); } - return (DuffCode[])al.toArray(new DuffCode[] { }); + return al.toArray(); } } /** Too simple to comment. 
*/ -class BoolTriple { +class BoolTriple implements Comparable { boolean isLegal; boolean isLegalButSanskrit; // some subset are legal but legal Sanskrit -- the single sanskrit stacks are this way, such as B+DE. boolean isLegalAndHasAVowelOnRoot; BoolTriple(boolean isLegal, boolean isLegalButSanskrit, boolean isLegalAndHasAVowelOnRoot) { + if (!isLegal && (isLegalButSanskrit || isLegalAndHasAVowelOnRoot)) + throw new IllegalArgumentException(); this.isLegal = isLegal; this.isLegalButSanskrit = isLegalButSanskrit; this.isLegalAndHasAVowelOnRoot = isLegalAndHasAVowelOnRoot; } + private int score() { + int score = 0; + if (isLegalAndHasAVowelOnRoot) { + score += 5; + } + if (isLegal) { + score += 5; + } + if (isLegalButSanskrit) { + score -= 3; + } + return score; + } + /** The most legal BoolTriple compares higher. */ + public int compareTo(Object o) { + BoolTriple b = (BoolTriple)o; + return score() - b.score(); + } } diff --git a/source/org/thdl/tib/text/ttt/package.html b/source/org/thdl/tib/text/ttt/package.html index db7b7fe..75c2fe6 100644 --- a/source/org/thdl/tib/text/ttt/package.html +++ b/source/org/thdl/tib/text/ttt/package.html @@ -25,6 +25,12 @@ Machine Web and methods for converting EWTS transliteration into Tibetan Machine Web.  It has extensive tests, though probably not mentioned in these Javadoc documents.

+

+When you see the term "Sanskrit" used here, it often means +non-native (not native Tibetan, in other words) rather than truly +Tibetanized Sanskrit.  It is overloaded to refer to Tibetanized +Chinese, Tibetanized Sanskrit, etc. +

Related Documentation

@see org.thdl.tib.text