From 7198f2336110f4236754355f1b830a12ac2e8cff Mon Sep 17 00:00:00 2001 From: dchandler Date: Mon, 20 Jun 2005 06:18:00 +0000 Subject: [PATCH] I really hesitate to commit this because I'm not sure what it brings to the table exactly and I fear that it makes the ACIP->Tibetan converter code a lot uglier. The TODO(DLC)[EWTS->Tibetan] comments littered throughout are part of the ugliness; they point to the ugliness. If each were addressed, cleanliness could perhaps be achieved. I've largely forgotten exactly what this change does, but it attempts to improve EWTS->Tibetan conversion. The lexer is probably really, really primitive. I concentrate here on converting a single tsheg bar rather than a whole document. Eclipse was used during part of my journey here and some imports were reorganized merely because I could. :) (Eclipse was needed when the usual ant build failed to run a new test EWTSTest. And I wanted its debugger.) Next steps: end-to-end EWTS tests should bring many problems to light. Fix those. Triage all the TODO comments. I don't know that I'll ever really trust the implementation. The tests are valuable, though. A clean implementation of EWTS->Tibetan in Jython might hold enough interest for me; I'd like to learn Python. 
--- .classpath | 19 +- build.xml | 20 +- junitbuild.xml | 2 +- source/org/thdl/tib/input/Jskad.java | 64 +- source/org/thdl/tib/text/TGCPair.java | 2 +- .../thdl/tib/text/tshegbar/UnicodeUtils.java | 20 + .../tshegbar/ValidatingUnicodeReader.java | 2 +- .../tshegbar/ValidatingUnicodeReaderTest.java | 6 +- source/org/thdl/tib/text/ttt/ACIPTraits.java | 55 +- .../tib/text/ttt/ACIPTshegBarScanner.java | 9 +- source/org/thdl/tib/text/ttt/EWTSTest.java | 578 ++++++++++++------ source/org/thdl/tib/text/ttt/EWTSTraits.java | 271 +++++++- .../tib/text/ttt/EWTSTshegBarScanner.java | 121 +++- .../thdl/tib/text/ttt/EWTStibwniniTest.java | 214 ++++--- .../thdl/tib/text/ttt/ErrorsAndWarnings.java | 69 ++- .../tib/text/ttt/LotsOfTshegBarsTest.java | 4 +- .../thdl/tib/text/ttt/MidLexSubstitution.java | 4 +- source/org/thdl/tib/text/ttt/PackageTest.java | 17 +- .../org/thdl/tib/text/ttt/ParseIterator.java | 2 +- source/org/thdl/tib/text/ttt/TConverter.java | 52 +- source/org/thdl/tib/text/ttt/TPair.java | 21 +- source/org/thdl/tib/text/ttt/TPairList.java | 142 +++-- .../thdl/tib/text/ttt/TPairListFactory.java | 430 +++++++++---- source/org/thdl/tib/text/ttt/TParseTree.java | 52 +- source/org/thdl/tib/text/ttt/TStackList.java | 10 +- source/org/thdl/tib/text/ttt/TString.java | 9 +- source/org/thdl/tib/text/ttt/TTGCList.java | 4 +- source/org/thdl/tib/text/ttt/TTraits.java | 84 +++ .../thdl/tib/text/ttt/TTshegBarScanner.java | 12 +- source/org/thdl/util/HTMLPane.java | 3 +- source/org/thdl/util/Link.java | 1 - source/org/thdl/util/RTFFixerInputStream.java | 8 +- .../thdl/util/RTFFixerInputStreamTest.java | 3 +- source/org/thdl/util/RTFPane.java | 3 +- source/org/thdl/util/SimpleFrame.java | 5 +- .../org/thdl/util/SimplifiedLinkedList.java | 2 +- .../org/thdl/util/SimplifiedListIterator.java | 2 +- source/org/thdl/util/StatusBar.java | 8 +- source/org/thdl/util/ThdlAbstractAction.java | 5 +- source/org/thdl/util/ThdlActionListener.java | 4 +- source/org/thdl/util/ThdlDebug.java | 7 
+- source/org/thdl/util/ThdlI18n.java | 1 + .../org/thdl/util/ThdlLazyExceptionTest.java | 4 +- source/org/thdl/util/ThdlOptions.java | 9 +- source/org/thdl/util/Trie.java | 1 - 45 files changed, 1666 insertions(+), 695 deletions(-) diff --git a/.classpath b/.classpath index 26aa4dc..a9649b2 100644 --- a/.classpath +++ b/.classpath @@ -1,15 +1,10 @@ - - - - - - - - - - - - + + + + + + + diff --git a/build.xml b/build.xml index 6d6f3c8..b25979f 100644 --- a/build.xml +++ b/build.xml @@ -472,6 +472,16 @@ the jvm starting tomcat: description="compiles all JUnit test cases that can be compiled in the present CLASSPATH (NB that this distinction is just wishful thinking for now because we have such weak test coverage at this point)" > + + + + + - - - - - - - - - + diff --git a/source/org/thdl/tib/input/Jskad.java b/source/org/thdl/tib/input/Jskad.java index e1d5bdd..75da83f 100644 --- a/source/org/thdl/tib/input/Jskad.java +++ b/source/org/thdl/tib/input/Jskad.java @@ -18,31 +18,59 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.input; -import java.io.*; +import java.awt.BorderLayout; +import java.awt.Cursor; +import java.awt.Dimension; +import java.awt.Frame; +import java.awt.LayoutManager; +import java.awt.Point; +import java.awt.event.ActionEvent; +import java.awt.event.KeyEvent; +import java.awt.event.WindowAdapter; +import java.awt.event.WindowEvent; +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.IOException; +import java.io.InputStream; import java.net.URL; -import java.awt.*; -import java.awt.event.*; - -import java.awt.print.*; -import javax.swing.plaf.basic.*; - -import javax.swing.*; -import javax.swing.event.*; -import javax.swing.text.*; -import javax.swing.text.rtf.*; - import java.util.Vector; -import org.thdl.tib.text.*; -import org.thdl.util.ThdlDebug; +import javax.swing.Box; +import javax.swing.JApplet; +import javax.swing.JComboBox; +import javax.swing.JFileChooser; +import javax.swing.JFrame; +import javax.swing.JInternalFrame; +import javax.swing.JLabel; +import javax.swing.JMenu; +import javax.swing.JMenuBar; +import javax.swing.JMenuItem; +import javax.swing.JOptionPane; +import javax.swing.JPanel; +import javax.swing.JScrollPane; +import javax.swing.JToolBar; +import javax.swing.KeyStroke; +import javax.swing.SwingUtilities; +import javax.swing.UIManager; +import javax.swing.WindowConstants; +import javax.swing.event.DocumentEvent; +import javax.swing.event.DocumentListener; +import javax.swing.text.BadLocationException; + +import org.thdl.tib.text.TibetanDocument; import org.thdl.util.RTFFixerInputStream; -import org.thdl.util.ThdlOptions; -import org.thdl.util.ThdlVersion; +import org.thdl.util.SimpleFrame; import org.thdl.util.StatusBar; import org.thdl.util.ThdlActionListener; -import org.thdl.util.HTMLPane; -import org.thdl.util.SimpleFrame; +import 
org.thdl.util.ThdlDebug; import org.thdl.util.ThdlLazyException; +import org.thdl.util.ThdlOptions; +import org.thdl.util.ThdlVersion; import calpa.html.CalHTMLPane; diff --git a/source/org/thdl/tib/text/TGCPair.java b/source/org/thdl/tib/text/TGCPair.java index 9276dd7..02958e3 100644 --- a/source/org/thdl/tib/text/TGCPair.java +++ b/source/org/thdl/tib/text/TGCPair.java @@ -258,7 +258,7 @@ public class TGCPair implements THDLWylieConstants { } if (mark < v.length()) { vowelish_sb.append(v.substring(mark)); - ThdlDebug.noteIffyCode(); + // TODO(DLC)[EWTS->Tibetan]: ThdlDebug.noteIffyCode(); // FIXME(dchandler): what should I do here? I doubt v is // valid. } diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index b8c32d6..e081334 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -506,5 +506,25 @@ public class UnicodeUtils implements UnicodeConstants { } while (mutated_this_time_through); return mutated; } + + /** Returns true iff ch is a valid Tibetan codepoint in Unicode + * 4.0: */ + public boolean isTibetanUnicodeCodepoint(char ch) { + // NOTE: could use an array of 256 booleans for speed but I'm lazy + return ((ch >= '\u0f00' && ch <= '\u0fcf') + && !(ch == '\u0f48' + || (ch > '\u0f6a' && ch < '\u0f71') + || (ch > '\u0f8b' && ch < '\u0f90') + || ch == '\u0f98' + || ch == '\u0fbd' + || ch == '\u0fcd' + || ch == '\u0fce')); + } + + /** Returns true iff ch is in 0F00-0FFF but isn't a valid Tibetan + * codepoint in Unicode 4.0: */ + public boolean isInvalidTibetanUnicode(char ch) { + return (isInTibetanRange(ch) && !isTibetanUnicodeCodepoint(ch)); + } } diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java index e3ad84e..75fccb0 100644 --- a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java +++ 
b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReader.java @@ -258,7 +258,7 @@ class ValidatingUnicodeReader implements UnicodeReadingStateMachineConstants { throws TibetanSyntaxException { Vector syllables = new Vector(); - int grcls_len = grcls.length(); + int grcls_len = grcls.size(); int beginning_of_cluster = 0; for (int i = 0; i < grcls_len; i++) { UnicodeGraphemeCluster current_grcl diff --git a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java index bf6ae3d..5d530b0 100644 --- a/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java +++ b/source/org/thdl/tib/text/tshegbar/ValidatingUnicodeReaderTest.java @@ -178,9 +178,9 @@ class ValidatingUnicodeReaderTest { } } - DLC; - assertTrue(ValidatingUnicodeReader.isFullyValidUnicode( - "\u0F\u0F\u0F\u0F\u0F")); +// DLC; +// assertTrue(ValidatingUnicodeReader.isFullyValidUnicode( +// "\u0F00\u0F00\u0F00\u0F00\u0F00")); } void testSyntacticallyLegalUnicodeToThdlWylie() { diff --git a/source/org/thdl/tib/text/ttt/ACIPTraits.java b/source/org/thdl/tib/text/ttt/ACIPTraits.java index dd4abec..c075ae2 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTraits.java +++ b/source/org/thdl/tib/text/ttt/ACIPTraits.java @@ -18,17 +18,15 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import java.util.HashSet; import java.util.ArrayList; import java.util.HashMap; import java.util.StringTokenizer; -import java.util.List; -import org.thdl.util.ThdlOptions; import org.thdl.tib.text.DuffCode; import org.thdl.tib.text.THDLWylieConstants; -import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.tib.text.TibTextUtils; +import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.util.ThdlOptions; /** A singleton class that should contain (but due to laziness and @@ -62,7 +60,9 @@ public final class ACIPTraits implements TTraits { public int maxWowelLength() { return MAX_WOWEL_LENGTH; } public boolean hasSimpleError(TPair p) { - return ("A".equals(p.getLeft()) && null == p.getRight()); + return (("A".equals(p.getLeft()) && null == p.getRight()) + || (null == p.getLeft() + && !this.disambiguator().equals(p.getRight()))); } public String aVowel() { return "A"; } @@ -95,6 +95,11 @@ public final class ACIPTraits implements TTraits { private HashMap superACIP2unicode = null; private HashMap subACIP2unicode = null; + + public String getUnicodeForWowel(String wowel) { + return getUnicodeFor(wowel, /* doesn't matter: */ true); + } + public /* synchronized */ String getUnicodeFor(String acip, boolean subscribed) { if (superACIP2unicode == null) { final boolean compactUnicode @@ -588,5 +593,45 @@ public final class ACIPTraits implements TTraits { if (wowel.indexOf(':') >= 0) duff.add(TibetanMachineWeb.getGlyph(getEwtsForOther(":"))); } + + public String shortTranslitName() { return "ACIP"; } + + public boolean isClearlyIllegal(TPair p) { + if (p.getLeft() == null + && !disambiguator().equals(p.getRight())) + return true; + if ("+".equals(p.getLeft())) + return true; + if (isWowel(p.getLeft()) + && !aVowel().equals(p.getLeft())) // achen + return true; + if (":".equals(p.getLeft())) + return true; + if ("m".equals(p.getLeft())) + return true; + if ("m:".equals(p.getLeft())) + return true; + return false; + } + + public 
TPairList[] breakTshegBarIntoChunks(String tt, boolean sh) { + try { + return TPairListFactory.breakACIPIntoChunks(tt, sh); + } catch (StackOverflowError e) { + throw new IllegalArgumentException("Input too large[1]: " + tt); + } catch (OutOfMemoryError e) { + throw new IllegalArgumentException("Input too large[2]: " + tt); + } + } + + public boolean isACIP() { return true; } + + public boolean vowelAloneImpliesAChen() { return false; } + + public boolean vowelsMayStack() { return false; } + + public boolean isUnicodeWowel(char ch) { return false; } + + public boolean couldBeValidStack(TPairList pl) { return true; } } diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java index 9a750c5..31d3eaf 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java @@ -115,7 +115,8 @@ class ACIPTshegBarScanner extends TTshegBarScanner { al.add(new TString("ACIP", errMsg = ErrorsAndWarnings.getMessage(code, shortMessages, - translit), + translit, + ACIPTraits.instance()), TString.ERROR)); if (null != errors) errors.append("Offset " + ((i < 0) ? "END" : ("" + i)) @@ -792,7 +793,8 @@ class ACIPTshegBarScanner extends TTshegBarScanner { al.add(new TString("ACIP", ErrorsAndWarnings.getMessage(510, shortMessages, - "" + ch), + "" + ch, + ACIPTraits.instance()), TString.WARNING)); } startOfString = i+1; @@ -902,7 +904,8 @@ class ACIPTshegBarScanner extends TTshegBarScanner { al.add(new TString("ACIP", ErrorsAndWarnings.getMessage(504, shortMessages, - "" + ch), + "" + ch, + ACIPTraits.instance()), TString.WARNING)); } } diff --git a/source/org/thdl/tib/text/ttt/EWTSTest.java b/source/org/thdl/tib/text/ttt/EWTSTest.java index b2676e2..0f87de6 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTest.java +++ b/source/org/thdl/tib/text/ttt/EWTSTest.java @@ -18,12 +18,12 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlOptions; - -import java.util.ArrayList; +import java.io.PrintStream; import junit.framework.TestCase; +import org.thdl.util.ThdlOptions; +import org.thdl.tib.text.tshegbar.UnicodeUtils; /** Tests this package's ability to understand EWTS and turn it into * the appropriate TMW or Unicode. @@ -53,16 +53,106 @@ public class EWTSTest extends TestCase { public EWTSTest() { } + /** Prints a human-readable explanation of how actual and expected + * differ to out. Precondition: expected is non-null, out is + * non-null */ + static void explainInequality(String actual, String expected, PrintStream out) { + if (null == actual) + out.println("Expected \"" + + UnicodeUtils.unicodeStringToPrettyString(expected) + + "\" but found the null string"); + if (actual.length() != expected.length()) { + out.println("Expected a string with " + expected.length() + + " characters but found a string with " + + actual.length() + " characters"); + return; + } + for (int i = 0; i < actual.length(); i++) { + if (actual.charAt(i) != expected.charAt(i)) { + out.println("Expected string \"" + UnicodeUtils.unicodeStringToPrettyString(expected) + "\" but found the string \"" + + UnicodeUtils.unicodeStringToPrettyString(actual) + + "\" which differs at character " + i + " (counting from zero, not one)"); + } + } + } + /** Causes a JUnit test case failure unless the EWTS document ewts * converts to the unicode expectedUnicode. */ static void ewts2uni_test(String ewts, String expectedUnicode) { - // TODO(DLC)[EWTS->Tibetan]: NOW! Implement me. + StringBuffer errors = new StringBuffer(); + String unicode = TConverter.convertToUnicodeText(EWTSTraits.instance(), + ewts, errors, + null, true, + "None", // TODO(DLC)[EWTS->Tibetan]: ??? 
+ false /* short warnings */); + if (null == unicode) { + if (null != expectedUnicode && "none" != expectedUnicode) { + System.out.println("No unicode exists for " + ewts + + " but you expected " + + UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); + assertTrue(false); + } + System.out.println("Unicode for " + ewts + " can't be had; errors are " + errors); + } else { + if (null != expectedUnicode && !expectedUnicode.equals(unicode)) { + explainInequality(unicode, expectedUnicode, System.out); + if (UnicodeUtils.unicodeStringToPrettyString(unicode).equals(UnicodeUtils.unicodeStringToPrettyString(expectedUnicode))) { + System.out.println("UGLY strings: The unicode for\n \"" + ewts + + "\"\nis\n \"" + + unicode + + "\",\nbut you expected\n \"" + + expectedUnicode + + "\""); + } else { + System.out.println("The unicode for\n \"" + ewts + + "\"\nis\n \"" + + UnicodeUtils.unicodeStringToPrettyString(unicode) + + "\",\nbut you expected\n \"" + + UnicodeUtils.unicodeStringToPrettyString(expectedUnicode) + + "\""); + } + { + StringBuffer sb = new StringBuffer(ewts); + EWTSTshegBarScanner.ExpandEscapeSequences(sb); + TPairList[] la + = EWTSTraits.instance().breakTshegBarIntoChunks(sb.toString(), false); + assertTrue(la[1] == null); + System.out.println("EWTS=" + ewts + " and l'=" + la[0].toString2()); + } + assertTrue(false); + } + } + } + + /** Returns true iff ewts is not a valid EWTS string. */ + static boolean hasEwtsError(String ewts) { + StringBuffer errors = new StringBuffer(); + String unicode = TConverter.convertToUnicodeText(EWTSTraits.instance(), + ewts, errors, + null, true, + "None", // TODO(DLC)[EWTS->Tibetan]: ??? + true); + // TODO(DLC)[EWTS->Tibetan]: Is this sufficient? + return (null == unicode || errors.length() > 0); } /** Causes a JUnit test case failure iff the EWTS document ewts is * legal EWTS transliteration. */ static void assert_EWTS_error(String ewts) { - // TODO(DLC)[EWTS->Tibetan]: NOW! Implement me. 
+ boolean ewts_error = hasEwtsError(ewts); + assertTrue(ewts_error); + } + + /** Tests that the EWTS->unicode converter isn't completely + braindead. */ + public void testEwtsBasics() { + ewts2uni_test("ma", "\u0f58"); + ewts2uni_test("mi", "\u0f58\u0f72"); + ewts2uni_test("mi ", "\u0f58\u0f72\u0f0b"); + ewts2uni_test("mi/", "\u0f58\u0f72\u0f0d"); + ewts2uni_test("bra ", "\u0f56\u0fb2\u0f0b"); + ewts2uni_test("b+ra ", "\u0f56\u0fb2\u0f0b"); + ewts2uni_test("b+Ra ", "\u0f56\u0fbc\u0f0b"); } /** Miscellaneous tests of EWTS->Unicode conversion. */ @@ -83,17 +173,18 @@ public class EWTSTest extends TestCase { ewts2uni_test("k+Ya", "\u0f40\u0FBB"); ewts2uni_test("k+Ra", "\u0f40\u0FBC"); ewts2uni_test("k+wa", "\u0f40\u0Fad"); - ewts2uni_test("k+ya", "\u0f40\u0Fb3"); + ewts2uni_test("k+la", "\u0f40\u0Fb3"); + ewts2uni_test("k+ya", "\u0f40\u0Fb1"); ewts2uni_test("k+ra", "\u0f40\u0Fb2"); ewts2uni_test("r-I", "\u0f62\u0f81"); ewts2uni_test("l-I", "\u0f63\u0f81"); ewts2uni_test("r-i", "\u0f62\u0f80"); ewts2uni_test("l-i", "\u0f63\u0f80"); - ewts2uni_test("gr-i", "\u0f42\u0f76"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb2\u0f80" - ewts2uni_test("gr-I", "\u0f42\u0f77"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb2\u0f81" - ewts2uni_test("gl-i", "\u0f42\u0f78"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb3\u0f80" - ewts2uni_test("gl-I", "\u0f42\u0f79"); // TODO(DLC)[EWTS->Tibetan]: "\u0f42\u0fb3\u0f81" + ewts2uni_test("gr-i", "\u0f42\u0fb2\u0f80"); + ewts2uni_test("gr-I", "\u0f42\u0fb2\u0f81"); + ewts2uni_test("gl-i", "\u0f42\u0fb3\u0f80"); + ewts2uni_test("gl-I", "\u0f42\u0fb3\u0f81"); } @@ -102,26 +193,39 @@ public class EWTSTest extends TestCase { * mostly by testing that the Unicode generated for a single * wowel or set of wowels atop achen (U+0F68) is correct. 
*/ public void test__EWTS__wowels_on_achen() { + + assert_EWTS_error("+yo"); + ewts2uni_test("a+yo", "\u0f68\u0fb1\u0f7c"); + ewts2uni_test("a+yo+o", "\u0f68\u0fb1\u0f7c\u0f7c"); + ewts2uni_test("a+ya.una", "\u0f68\u0fb1\u0f68\u0f74\u0f53"); + ewts2uni_test("a+yauna", "\u0f68\u0fb1\u0f7d\u0f53"); // TODO(DLC)[EWTS->Tibetan]: warn that '.' might have been needed + ewts2uni_test("a+yoona", "\u0f68\u0fb1\u0f7c\u0f68\u0f7c\u0f53"); // TODO(DLC)[EWTS->Tibetan]: warn! + ewts2uni_test("a+yoon", "\u0f68\u0fb1\u0f7c\u0f68\u0f7c\u0f53"); // TODO(DLC)[EWTS->Tibetan]: warn! +// ewts2uni_test("a+yo+ona", "TODO(DLC)[EWTS->Tibetan]"); + ewts2uni_test("A", "\u0f68\u0f71"); ewts2uni_test("i", "\u0f68\u0f72"); - ewts2uni_test("I", "\u0f68\u0f73"); + ewts2uni_test("I", "\u0f68\u0f71\u0f72"); ewts2uni_test("u", "\u0f68\u0f74"); - ewts2uni_test("U", "\u0f68\u0f75"); - ewts2uni_test("a+r-i", "\u0f68\u0f76"); - ewts2uni_test("a+r-I", "\u0f68\u0f77"); - ewts2uni_test("a+l-i", "\u0f68\u0f78"); - ewts2uni_test("a+l-I", "\u0f68\u0f79"); + ewts2uni_test("U", "\u0f68\u0f71\u0f74"); + ewts2uni_test("a+r-i", "\u0f68\u0fb2\u0f80"); + ewts2uni_test("a+r-I", "\u0f68\u0fb2\u0f81"); + ewts2uni_test("a+l-i", "\u0f68\u0fb3\u0f80"); + ewts2uni_test("a+l-I", "\u0f68\u0fb3\u0f81"); ewts2uni_test("e", "\u0f68\u0f7a"); ewts2uni_test("ai", "\u0f68\u0f7b"); + // ewts2uni_test("ao", "\u0f68\u0f68\u0f7c"); // TODO(DLC)[EWTS->Tibetan]: + // assert_EWTS_error("ao"); // TODO(DLC)[EWTS->Tibetan]: ewts2uni_test("o", "\u0f68\u0f7c"); ewts2uni_test("au", "\u0f68\u0f7d"); - ewts2uni_test("aM", "\u0f68\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("aH", "\u0f68\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + // ewts2uni_test("aM", "\u0f68\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + // ewts2uni_test("aH", "\u0f68\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("-i", "\u0f68\u0f80"); ewts2uni_test("-I", "\u0f68\u0f81"); - ewts2uni_test("a~M`", 
"\u0f68\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("a~M", "\u0f68\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("a?", "\u0f68\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + // ewts2uni_test("a~M`", "\u0f68\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + // ewts2uni_test("a~M", "\u0f68\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say +// ewts2uni_test("a?", "\u0f68\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("\\u0f68", "\u0f68"); ewts2uni_test("a\\u0f86", "\u0f68\u0f86"); ewts2uni_test("a\\U0f86", "\u0f68\u0f86"); ewts2uni_test("a\\U0F86", "\u0f68\u0f86"); @@ -132,24 +236,32 @@ public class EWTSTest extends TestCase { ewts2uni_test("a\\u00000F86", "\u0f68\u0f86"); ewts2uni_test("a\\u0f87", "\u0f68\u0f87"); - ewts2uni_test("aMH", "\u0f68\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("aHM", "\u0f68\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - +// ewts2uni_test("aMH", "\u0f68\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say +// ewts2uni_test("aHM", "\u0f68\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("a", "\u0f68"); + } + + public void test__EWTS__stacked_wowels_on_achen() { + if (false) { // TODO(DLC)[EWTS->Tibetan]: make this true ASAP + ewts2uni_test("o+o", "\u0f68\u0f7c\u0f7c"); + assert_EWTS_error("a+o"); // TODO(DLC)[EWTS->Tibetan]:? + assert_EWTS_error("o+a"); // TODO(DLC)[EWTS->Tibetan]:? + assert_EWTS_error("ka+o"); // TODO(DLC)[EWTS->Tibetan]:? // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is // the same as I and o+o is the same as au. - ewts2uni_test("A+i", "\u0f68\u0f73"); - ewts2uni_test("o+o", "\u0f68\u0f7d"); - ewts2uni_test("e+e", "\u0f68\u0f7b"); - ewts2uni_test("e+e+e", "\u0f68\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("e+e+e+e", "\u0f68\u0f7b\u0f7b"); // TODO(DLC)[EWTS->Tibetan]:? 
- ewts2uni_test("e+e+e+e+e", "\u0f68\u0f7b\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("A+i", "\u0f68\u0f71\u0f72"); + ewts2uni_test("e+e", "\u0f68\u0f7a\u0f7a"); + ewts2uni_test("e+e+e", "\u0f68\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("e+e+e+e", "\u0f68\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("e+e+e+e+e", "\u0f68\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("o+e", "\u0f68\u0f7c\u0f7a"); - ewts2uni_test("u+A+i+o+e", "\u0f68\u0f74\u0f72\u0f7c\u0f7a"); - ewts2uni_test("u+A+i+o+eHM", "\u0f68\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("u+A", "\u0f68\u0f75"); + ewts2uni_test("u+A+i+o+e", "\u0f68\u0f74\u0f71\u0f72\u0f7c\u0f7a"); + ewts2uni_test("u+A+i+o+eHM", "\u0f68\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("u+A", "\u0f68\u0f74\u0f71"); - ewts2uni_test("a", "\u0f68"); + ewts2uni_test("o+-I", "DLC"); + } } /** Tests that our implementation of EWTS's wowels are correct, @@ -158,14 +270,16 @@ public class EWTSTest extends TestCase { public void test__EWTS__wowels_on_ka() { ewts2uni_test("kA", "\u0f40\u0f71"); ewts2uni_test("ki", "\u0f40\u0f72"); - ewts2uni_test("kI", "\u0f40\u0f73"); + ewts2uni_test("kI", "\u0f40\u0f71\u0f72"); ewts2uni_test("ku", "\u0f40\u0f74"); - ewts2uni_test("kU", "\u0f40\u0f75"); - ewts2uni_test("ka+r-i", "\u0f40\u0f76"); - ewts2uni_test("ka+r-I", "\u0f40\u0f77"); - ewts2uni_test("ka+l-i", "\u0f40\u0f78"); - ewts2uni_test("ka+l-I", "\u0f40\u0f79"); + ewts2uni_test("kU", "\u0f40\u0f71\u0f74"); + ewts2uni_test("k+r-i", "\u0f40\u0fb2\u0f80"); + ewts2uni_test("k+r-I", "\u0f40\u0fb2\u0f81"); + ewts2uni_test("k+l-i", "\u0f40\u0fb3\u0f80"); + ewts2uni_test("k+l-I", "\u0f40\u0fb3\u0f81"); ewts2uni_test("ke", "\u0f40\u0f7a"); + ewts2uni_test("e", "\u0f68\u0f7a"); + ewts2uni_test("a", "\u0f68"); ewts2uni_test("kai", "\u0f40\u0f7b"); ewts2uni_test("ko", "\u0f40\u0f7c"); ewts2uni_test("kau", "\u0f40\u0f7d"); @@ -192,34 
+306,39 @@ public class EWTSTest extends TestCase { // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is // the same as I and o+o is the same as au. - ewts2uni_test("kA+i", "\u0f40\u0f73"); - ewts2uni_test("ko+o", "\u0f40\u0f7d"); - ewts2uni_test("ke+e", "\u0f40\u0f7b"); - ewts2uni_test("ke+e+e", "\u0f40\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("ke+e+e+e", "\u0f40\u0f7b\u0f7b"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("ke+e+e+e+e", "\u0f40\u0f7b\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("kA+i", "\u0f40\u0f71\u0f72"); + ewts2uni_test("ko+o", "\u0f40\u0f7c\u0f7c"); + ewts2uni_test("ke+e", "\u0f40\u0f7a\u0f7a"); + ewts2uni_test("ke+e+e", "\u0f40\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("ke+e+e+e", "\u0f40\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("ke+e+e+e+e", "\u0f40\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("ko+e", "\u0f40\u0f7c\u0f7a"); - ewts2uni_test("ku+A+i+o+e", "\u0f40\u0f74\u0f72\u0f7c\u0f7a"); - ewts2uni_test("ku+A+i+o+eHM", "\u0f40\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("ku+A", "\u0f40\u0f75"); + ewts2uni_test("ku+A+i+o+e", "\u0f40\u0f74\u0f71\u0f72\u0f7c\u0f7a"); + ewts2uni_test("ku+A+i+o+eHM", "\u0f40\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("ku+A", "\u0f40\u0f74\u0f71"); ewts2uni_test("k", "\u0f40"); ewts2uni_test("ka", "\u0f40"); + + assert_EWTS_error("ka+r-i"); // TODO(DLC)[EWTS->Tibetan]: right? + assert_EWTS_error("ka+r-I"); + assert_EWTS_error("ka+l-i"); + assert_EWTS_error("ka+l-I"); + + assert_EWTS_error("ko+a"); + assert_EWTS_error("ka+o"); } /** Tests that our implementation of EWTS's wowels are correct, * mostly by testing that the Unicode generated for a single * wowel or set of wowels atop achung (U+0F60) is correct. 
*/ public void test__EWTS__wowels_on_achung() { + ewts2uni_test("'a", "\u0f60"); ewts2uni_test("'A", "\u0f60\u0f71"); ewts2uni_test("'i", "\u0f60\u0f72"); - ewts2uni_test("'I", "\u0f60\u0f73"); + ewts2uni_test("'I", "\u0f60\u0f71\u0f72"); ewts2uni_test("'u", "\u0f60\u0f74"); - ewts2uni_test("'U", "\u0f60\u0f75"); - ewts2uni_test("'a+r-i", "\u0f60\u0f76"); - ewts2uni_test("'a+r-I", "\u0f60\u0f77"); - ewts2uni_test("'a+l-i", "\u0f60\u0f78"); - ewts2uni_test("'a+l-I", "\u0f60\u0f79"); + ewts2uni_test("'U", "\u0f60\u0f71\u0f74"); ewts2uni_test("'e", "\u0f60\u0f7a"); ewts2uni_test("'ai", "\u0f60\u0f7b"); ewts2uni_test("'o", "\u0f60\u0f7c"); @@ -247,75 +366,81 @@ public class EWTSTest extends TestCase { // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is // the same as I and o+o is the same as au. - ewts2uni_test("'A+i", "\u0f60\u0f73"); - ewts2uni_test("'o+o", "\u0f60\u0f7d"); - ewts2uni_test("'e+e", "\u0f60\u0f7b"); - ewts2uni_test("'e+e+e", "\u0f60\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("'e+e+e+e", "\u0f60\u0f7b\u0f7b"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("'e+e+e+e+e", "\u0f60\u0f7b\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("'A+i", "\u0f60\u0f71\u0f72"); + ewts2uni_test("'o+o", "\u0f60\u0f7c\u0f7c"); + ewts2uni_test("'e+e", "\u0f60\u0f7a\u0f7a"); + ewts2uni_test("'e+e+e", "\u0f60\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("'e+e+e+e", "\u0f60\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("'e+e+e+e+e", "\u0f60\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? 
ewts2uni_test("'o+e", "\u0f60\u0f7c\u0f7a"); - ewts2uni_test("'u+A+i+o+e", "\u0f60\u0f74\u0f72\u0f7c\u0f7a"); - ewts2uni_test("'u+A+i+o+eHM", "\u0f60\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("'u+A+i+o+e", "\u0f60\u0f74\u0f71\u0f72\u0f7c\u0f7a"); + ewts2uni_test("'u+A+i+o+eHM", "\u0f60\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("'u+A", "\u0f60\u0f75"); + ewts2uni_test("'u+A", "\u0f60\u0f74\u0f71"); ewts2uni_test("'", "\u0f60"); ewts2uni_test("'a", "\u0f60"); + + ewts2uni_test("'+r-i", "\u0f60\u0fb2\u0f80"); + ewts2uni_test("'+r-I", "\u0f60\u0fb2\u0f81"); + ewts2uni_test("'+l-i", "\u0f60\u0fb3\u0f80"); + ewts2uni_test("'+l-I", "\u0f60\u0fb3\u0f81"); } /** Tests that our implementation of EWTS's wowels are correct, * mostly by testing that the Unicode generated for a single * wowel or set of wowels atop k+Sh (U+0F69) is correct. */ public void test__EWTS__wowels_on_kSh() { - ewts2uni_test("k+ShA", "\u0f69\u0f71"); - ewts2uni_test("k+Shi", "\u0f69\u0f72"); - ewts2uni_test("k+ShI", "\u0f69\u0f73"); - ewts2uni_test("k+Shu", "\u0f69\u0f74"); - ewts2uni_test("k+ShU", "\u0f69\u0f75"); - ewts2uni_test("k+Sha+r-i", "\u0f69\u0f76"); - ewts2uni_test("k+Sha+r-I", "\u0f69\u0f77"); - ewts2uni_test("k+Sha+l-i", "\u0f69\u0f78"); - ewts2uni_test("k+Sha+l-I", "\u0f69\u0f79"); - ewts2uni_test("k+She", "\u0f69\u0f7a"); - ewts2uni_test("k+Shai", "\u0f69\u0f7b"); - ewts2uni_test("k+Sho", "\u0f69\u0f7c"); - ewts2uni_test("k+Shau", "\u0f69\u0f7d"); - ewts2uni_test("k+ShaM", "\u0f69\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+ShaH", "\u0f69\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+Sh-i", "\u0f69\u0f80"); - ewts2uni_test("k+Sh-I", "\u0f69\u0f81"); - ewts2uni_test("k+Sha~M`", "\u0f69\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+Sha~M", "\u0f69\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+Sha?", "\u0f69\u0f84"); // TODO(DLC)[EWTS->Tibetan]: 
than needs to say - ewts2uni_test("k+Sha\\u0f86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\U0f86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\U0F86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\u0F86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\u00000f86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\u00000f86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\u00000F86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\u00000F86", "\u0f69\u0f86"); - ewts2uni_test("k+Sha\\u0f87", "\u0f69\u0f87"); + ewts2uni_test("k+ShA", "\u0f40\u0fb5\u0f71"); + ewts2uni_test("k+Shi", "\u0f40\u0fb5\u0f72"); + ewts2uni_test("k+ShI", "\u0f40\u0fb5\u0f71\u0f72"); + ewts2uni_test("k+Shu", "\u0f40\u0fb5\u0f74"); + ewts2uni_test("k+ShU", "\u0f40\u0fb5\u0f71\u0f74"); + ewts2uni_test("k+She", "\u0f40\u0fb5\u0f7a"); + ewts2uni_test("k+Shai", "\u0f40\u0fb5\u0f7b"); + ewts2uni_test("k+Sho", "\u0f40\u0fb5\u0f7c"); + ewts2uni_test("k+Shau", "\u0f40\u0fb5\u0f7d"); + ewts2uni_test("k+ShaM", "\u0f40\u0fb5\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+ShaH", "\u0f40\u0fb5\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+Sh-i", "\u0f40\u0fb5\u0f80"); + ewts2uni_test("k+Sh-I", "\u0f40\u0fb5\u0f81"); + ewts2uni_test("k+Sha~M`", "\u0f40\u0fb5\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+Sha~M", "\u0f40\u0fb5\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+Sha?", "\u0f40\u0fb5\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+Sha\\u0f86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\U0f86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\U0F86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\u0F86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\u00000f86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\u00000f86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\u00000F86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\u00000F86", "\u0f40\u0fb5\u0f86"); + ewts2uni_test("k+Sha\\u0f87", 
"\u0f40\u0fb5\u0f87"); - ewts2uni_test("k+ShaMH", "\u0f69\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("k+ShaHM", "\u0f69\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+ShaMH", "\u0f40\u0fb5\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("k+ShaHM", "\u0f40\u0fb5\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is // the same as I and o+o is the same as au. - ewts2uni_test("k+ShA+i", "\u0f69\u0f73"); - ewts2uni_test("k+Sho+o", "\u0f69\u0f7d"); - ewts2uni_test("k+She+e", "\u0f69\u0f7b"); - ewts2uni_test("k+She+e+e", "\u0f69\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+She+e+e+e", "\u0f69\u0f7b\u0f7b"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+She+e+e+e+e", "\u0f69\u0f7b\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+Sho+e", "\u0f69\u0f7c\u0f7a"); - ewts2uni_test("k+Shu+A+i+o+e", "\u0f69\u0f74\u0f72\u0f7c\u0f7a"); - ewts2uni_test("k+Shu+A+i+o+eHM", "\u0f69\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("k+Shu+A", "\u0f69\u0f75"); + ewts2uni_test("k+ShA+i", "\u0f40\u0fb5\u0f71\u0f72"); + ewts2uni_test("k+Sho+o", "\u0f40\u0fb5\u0f7c\u0f7c"); + ewts2uni_test("k+She+e", "\u0f40\u0fb5\u0f7a\u0f7a"); + ewts2uni_test("k+She+e+e", "\u0f40\u0fb5\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("k+She+e+e+e", "\u0f40\u0fb5\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("k+She+e+e+e+e", "\u0f40\u0fb5\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? 
+ ewts2uni_test("k+Sho+e", "\u0f40\u0fb5\u0f7c\u0f7a"); + ewts2uni_test("k+Shu+A+i+o+e", "\u0f40\u0fb5\u0f74\u0f71\u0f72\u0f7c\u0f7a"); + ewts2uni_test("k+Shu+A+i+o+eHM", "\u0f40\u0fb5\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("k+Shu+A", "\u0f40\u0fb5\u0f74\u0f71"); - ewts2uni_test("k+Sh", "\u0f69"); - ewts2uni_test("k+Sha", "\u0f69"); + ewts2uni_test("k+Sh", "\u0f40\u0fb5"); + ewts2uni_test("k+Sha", "\u0f40\u0fb5"); + + ewts2uni_test("k+Sh+r-i", "\u0f40\u0fb5\u0fb2\u0f80"); + ewts2uni_test("k+Sh+r-I", "\u0f40\u0fb5\u0fb2\u0f81"); + ewts2uni_test("k+Sh+l-i", "\u0f40\u0fb5\u0fb3\u0f80"); + ewts2uni_test("k+Sh+l-I", "\u0f40\u0fb5\u0fb3\u0f81"); } /** Tests that our implementation of EWTS's wowels are correct, @@ -325,25 +450,22 @@ public class EWTSTest extends TestCase { public void test__EWTS__wowels_on_phyw() { ewts2uni_test("phywA", "\u0f55\u0fb1\u0fad\u0f71"); ewts2uni_test("phywi", "\u0f55\u0fb1\u0fad\u0f72"); - ewts2uni_test("phywI", "\u0f55\u0fb1\u0fad\u0f73"); + ewts2uni_test("phywI", "\u0f55\u0fb1\u0fad\u0f71\u0f72"); ewts2uni_test("phywu", "\u0f55\u0fb1\u0fad\u0f74"); - ewts2uni_test("phywU", "\u0f55\u0fb1\u0fad\u0f75"); - ewts2uni_test("phywa+r-i", "\u0f55\u0fb1\u0fad\u0f76"); - ewts2uni_test("phywa+r-I", "\u0f55\u0fb1\u0fad\u0f77"); - ewts2uni_test("phywa+l-i", "\u0f55\u0fb1\u0fad\u0f78"); - ewts2uni_test("phywa+l-I", "\u0f55\u0fb1\u0fad\u0f79"); + ewts2uni_test("phywU", "\u0f55\u0fb1\u0fad\u0f71\u0f74"); ewts2uni_test("phywe", "\u0f55\u0fb1\u0fad\u0f7a"); ewts2uni_test("phywai", "\u0f55\u0fb1\u0fad\u0f7b"); ewts2uni_test("phywo", "\u0f55\u0fb1\u0fad\u0f7c"); ewts2uni_test("phywau", "\u0f55\u0fb1\u0fad\u0f7d"); - ewts2uni_test("phywaM", "\u0f55\u0fb1\u0fad\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("phywaH", "\u0f55\u0fb1\u0fad\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say ewts2uni_test("phyw-i", "\u0f55\u0fb1\u0fad\u0f80"); ewts2uni_test("phyw-I", "\u0f55\u0fb1\u0fad\u0f81"); - 
ewts2uni_test("phywa~M`", "\u0f55\u0fb1\u0fad\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("phywa~M", "\u0f55\u0fb1\u0fad\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("phywa?", "\u0f55\u0fb1\u0fad\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phyw\\u0f86", "\u0f55\u0fb1\u0fad\u0f86"); + assertEquals(EWTSTraits.instance().getUnicodeForWowel("\u0f86+\u0f84"), "\u0f86\u0f84"); + + ewts2uni_test("phyw\\u0f84\\u0f86", "\u0f55\u0fb1\u0fad\u0f84\u0f86"); + ewts2uni_test("phyw\\u0f84\u0f86", "\u0f55\u0fb1\u0fad\u0f84\u0f86"); ewts2uni_test("phywa\\u0f86", "\u0f55\u0fb1\u0fad\u0f86"); + ewts2uni_test("phywa\\u0f86\u0f84", "\u0f55\u0fb1\u0fad\u0f86\u0f84"); ewts2uni_test("phywa\\U0f86", "\u0f55\u0fb1\u0fad\u0f86"); ewts2uni_test("phywa\\U0F86", "\u0f55\u0fb1\u0fad\u0f86"); ewts2uni_test("phywa\\u0F86", "\u0f55\u0fb1\u0fad\u0f86"); @@ -353,25 +475,34 @@ public class EWTSTest extends TestCase { ewts2uni_test("phywa\\u00000F86", "\u0f55\u0fb1\u0fad\u0f86"); ewts2uni_test("phywa\\u0f87", "\u0f55\u0fb1\u0fad\u0f87"); - ewts2uni_test("phywaMH", "\u0f55\u0fb1\u0fad\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - ewts2uni_test("phywaHM", "\u0f55\u0fb1\u0fad\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say - // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is // the same as I and o+o is the same as au. - ewts2uni_test("phywA+i", "\u0f55\u0fb1\u0fad\u0f73"); - ewts2uni_test("phywo+o", "\u0f55\u0fb1\u0fad\u0f7d"); - ewts2uni_test("phywe+e", "\u0f55\u0fb1\u0fad\u0f7b"); - ewts2uni_test("phywe+e+e", "\u0f55\u0fb1\u0fad\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("phywe+e+e+e", "\u0f55\u0fb1\u0fad\u0f7b\u0f7b"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("phywe+e+e+e+e", "\u0f55\u0fb1\u0fad\u0f7b\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? 
+ ewts2uni_test("phywA+i", "\u0f55\u0fb1\u0fad\u0f71\u0f72"); + ewts2uni_test("phywo+o", "\u0f55\u0fb1\u0fad\u0f7c\u0f7c"); + ewts2uni_test("phywe+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7a"); + ewts2uni_test("phywe+e+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("phywe+e+e+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("phywe+e+e+e+e", "\u0f55\u0fb1\u0fad\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? ewts2uni_test("phywo+e", "\u0f55\u0fb1\u0fad\u0f7c\u0f7a"); - ewts2uni_test("phywu+A+i+o+e", "\u0f55\u0fb1\u0fad\u0f74\u0f72\u0f7c\u0f7a"); - ewts2uni_test("phywu+A+i+o+eHM", "\u0f55\u0fb1\u0fad\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("phywu+A", "\u0f55\u0fb1\u0fad\u0f75"); + ewts2uni_test("phywu+A+i+o+e", "\u0f55\u0fb1\u0fad\u0f74\u0f71\u0f72\u0f7c\u0f7a"); + ewts2uni_test("phywu+A+i+o+eHM", "\u0f55\u0fb1\u0fad\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("phywu+A", "\u0f55\u0fb1\u0fad\u0f74\u0f71"); ewts2uni_test("phyw", "\u0f55\u0fb1\u0fad"); ewts2uni_test("phywa", "\u0f55\u0fb1\u0fad"); + + ewts2uni_test("phywaM", "\u0f55\u0fb1\u0fad\u0f7e"); /* TODO(DLC)[EWTS->Tibetan]: NOW: aM is not a wowel! 
*/ // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phywaH", "\u0f55\u0fb1\u0fad\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phywa~M`", "\u0f55\u0fb1\u0fad\u0f82"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phywa~M", "\u0f55\u0fb1\u0fad\u0f83"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phywa?", "\u0f55\u0fb1\u0fad\u0f84"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phywaMH", "\u0f55\u0fb1\u0fad\u0f7e\u0f7f"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + ewts2uni_test("phywaHM", "\u0f55\u0fb1\u0fad\u0f7f\u0f7e"); // TODO(DLC)[EWTS->Tibetan]: than needs to say + + assert_EWTS_error("phywr-i"); + assert_EWTS_error("phyw+r-i"); + assert_EWTS_error("phyw+l-i"); } /** Tests that our implementation of EWTS's wowels are correct, @@ -382,13 +513,9 @@ public class EWTSTest extends TestCase { public void test__EWTS__wowels_on_kjjkkj() { ewts2uni_test("k+j+j+k+k+jA", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71"); ewts2uni_test("k+j+j+k+k+ji", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f72"); - ewts2uni_test("k+j+j+k+k+jI", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f73"); + ewts2uni_test("k+j+j+k+k+jI", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f72"); ewts2uni_test("k+j+j+k+k+ju", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74"); - ewts2uni_test("k+j+j+k+k+jU", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f75"); - ewts2uni_test("k+j+j+k+k+ja+r-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f76"); - ewts2uni_test("k+j+j+k+k+ja+r-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f77"); - ewts2uni_test("k+j+j+k+k+ja+l-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f78"); - ewts2uni_test("k+j+j+k+k+ja+l-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f79"); + ewts2uni_test("k+j+j+k+k+jU", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f74"); ewts2uni_test("k+j+j+k+k+je", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a"); ewts2uni_test("k+j+j+k+k+jai", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7b"); 
ewts2uni_test("k+j+j+k+k+jo", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c"); @@ -416,85 +543,52 @@ public class EWTSTest extends TestCase { // Than's e-mails of Aug 10 and Aug 11, 2004 say that A+i is // the same as I and o+o is the same as au. - ewts2uni_test("k+j+j+k+k+jA+i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f73"); - ewts2uni_test("k+j+j+k+k+jo+o", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7d"); - ewts2uni_test("k+j+j+k+k+je+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7b"); - ewts2uni_test("k+j+j+k+k+je+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+j+j+k+k+je+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7b\u0f7b"); // TODO(DLC)[EWTS->Tibetan]:? - ewts2uni_test("k+j+j+k+k+je+e+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7b\u0f7b\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("k+j+j+k+k+jA+i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f71\u0f72"); + ewts2uni_test("k+j+j+k+k+jo+o", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c\u0f7c"); + ewts2uni_test("k+j+j+k+k+je+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a"); + ewts2uni_test("k+j+j+k+k+je+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("k+j+j+k+k+je+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? + ewts2uni_test("k+j+j+k+k+je+e+e+e+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7a\u0f7a\u0f7a\u0f7a\u0f7a"); // TODO(DLC)[EWTS->Tibetan]:? 
ewts2uni_test("k+j+j+k+k+jo+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f7c\u0f7a"); - ewts2uni_test("k+j+j+k+k+ju+A+i+o+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f72\u0f7c\u0f7a"); - ewts2uni_test("k+j+j+k+k+ju+A+i+o+eHM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); - ewts2uni_test("k+j+j+k+k+ju+A", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f75"); + ewts2uni_test("k+j+j+k+k+ju+A+i+o+e", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f71\u0f72\u0f7c\u0f7a"); + ewts2uni_test("k+j+j+k+k+ju+A+i+o+eHM", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f71\u0f72\u0f7c\u0f7a\u0f7f\u0f7e"); + ewts2uni_test("k+j+j+k+k+ju+A", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0f74\u0f71"); ewts2uni_test("k+j+j+k+k+j", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97"); ewts2uni_test("k+j+j+k+k+ja", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97"); + ewts2uni_test("k+j+j+k+k+j+r-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb2\u0f80"); + ewts2uni_test("k+j+j+k+k+j+r-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb2\u0f81"); + ewts2uni_test("k+j+j+k+k+j+l-i", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb3\u0f80"); + ewts2uni_test("k+j+j+k+k+j+l-I", "\u0f40\u0f97\u0f97\u0f90\u0f90\u0f97\u0fb3\u0f81"); } /** Tests that the EWTS that the spec says corresponds to each * codepoint really does. */ public void test__EWTS__tags_each_unicode_value() { + ewts2uni_test("\\u0ef0", "\u0ef0"); + for (char i = '\u0ef0'; i < '\u1010'; i++) { + // invalid codepoint like U+0F48? No problem! 
TODO(DLC)[EWTS->Tibetan]: NOTE: use a unicode "spell checker" to find such problems + String s = new String(new char[] { i }); + ewts2uni_test(UnicodeUtils.unicodeStringToPrettyString(s), s); + ewts2uni_test("\\" + UnicodeUtils.unicodeStringToPrettyString(s), s); + } ewts2uni_test("\\u0000", "\u0000"); ewts2uni_test("\\u0eff", "\u0eff"); - ewts2uni_test("\\u0eff", "\u0eff"); ewts2uni_test("\\u0f00", "\u0f00"); ewts2uni_test("\\u0f40", "\u0f40"); - ewts2uni_test("\\u0f70", "\u0f70"); - ewts2uni_test("\\u0fff", "\u0fff"); + assert_EWTS_error("\\u0f70"); // reserved codepoint + assert_EWTS_error("\\u0fff"); // reserved codepoint ewts2uni_test("\\uf000", "\uf000"); ewts2uni_test("\\uf01f", "\uf01f"); ewts2uni_test("\\uefff", "\uefff"); - ewts2uni_test("\\ucafe0000", "\ucafe0000"); - ewts2uni_test("\\ucafe0eff", "\ucafe0eff"); - ewts2uni_test("\\ucafe0eff", "\ucafe0eff"); - ewts2uni_test("\\ucafe0f00", "\ucafe0f00"); - ewts2uni_test("\\ucafe0f40", "\ucafe0f40"); - ewts2uni_test("\\ucafe0f70", "\ucafe0f70"); - ewts2uni_test("\\ucafe0fff", "\ucafe0fff"); - ewts2uni_test("\\ucafef000", "\ucafef000"); - ewts2uni_test("\\ucafef01f", "\ucafef01f"); - ewts2uni_test("\\ucafeefff", "\ucafeefff"); - - - ewts2uni_test("\\u00000000", "\u00000000"); - ewts2uni_test("\\u00000eff", "\u00000eff"); - ewts2uni_test("\\u00000eff", "\u00000eff"); - ewts2uni_test("\\u00000f00", "\u00000f00"); - ewts2uni_test("\\u00000f40", "\u00000f40"); - ewts2uni_test("\\u00000f70", "\u00000f70"); - ewts2uni_test("\\u00000fff", "\u00000fff"); - ewts2uni_test("\\u0000f000", "\u0000f000"); - ewts2uni_test("\\u0000f01f", "\u0000f01f"); - ewts2uni_test("\\u0000efff", "\u0000efff"); - - ewts2uni_test("\\u00000000", "\u0000"); - ewts2uni_test("\\u00000eff", "\u0eff"); - ewts2uni_test("\\u00000eff", "\u0eff"); - ewts2uni_test("\\u00000f00", "\u0f00"); - ewts2uni_test("\\u00000f40", "\u0f40"); - ewts2uni_test("\\u00000f70", "\u0f70"); - ewts2uni_test("\\u00000fff", "\u0fff"); - ewts2uni_test("\\u0000f000", 
"\uf000"); - ewts2uni_test("\\u0000f01f", "\uf01f"); - ewts2uni_test("\\u0000efff", "\uefff"); - - ewts2uni_test("\\UcaFe0000", "\ucaFe0000"); - ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff"); - ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff"); - ewts2uni_test("\\UcaFe0f00", "\ucaFe0f00"); - ewts2uni_test("\\UcaFe0f40", "\ucaFe0f40"); - ewts2uni_test("\\UcaFe0f70", "\ucaFe0f70"); - ewts2uni_test("\\UcaFe0fff", "\ucaFe0fff"); - ewts2uni_test("\\UcaFef000", "\ucaFef000"); - ewts2uni_test("\\UcaFef01f", "\ucaFef01f"); - ewts2uni_test("\\UcaFeefff", "\ucaFeefff"); // Below was semiautomatically generated from the EWTS spec's // 'ewts.xml' representation (early August 2004 edition): ewts2uni_test("v", "\u0F56\u0F39"); ewts2uni_test("f", "\u0F55\u0F39"); - + ewts2uni_test("\u0f88+ka", "\u0f88\u0f90"); + ewts2uni_test("\u0f88+kha", "\u0f88\u0f91"); ewts2uni_test("oM", "\u0F00"); ewts2uni_test("\\u0F01", "\u0F01"); ewts2uni_test("\\u0F02", "\u0F02"); @@ -599,13 +693,13 @@ public class EWTSTest extends TestCase { ewts2uni_test("s", "\u0F66"); ewts2uni_test("h", "\u0F67"); ewts2uni_test("a", "\u0F68"); - ewts2uni_test("k+Sh", "\u0F69"); + ewts2uni_test("k+Sh", "\u0f40\u0fb5"); // there is no way in EWTS to specify \u0f69 in particular without using \\u0f69 ewts2uni_test("R+", "\u0F6A"); // TODO(DLC)[EWTS->Tibetan]: move to illegal test - ewts2uni_test("A", "\u0F71"); + ewts2uni_test("A", "\u0F71"); // TODO(DLC)[EWTS->Tibetan]: no?! 
see above ewts2uni_test("i", "\u0F72"); - ewts2uni_test("I", "\u0F73"); + ewts2uni_test("I", "\u0F71\u0F72"); ewts2uni_test("u", "\u0F74"); - ewts2uni_test("U", "\u0F75"); + ewts2uni_test("U", "\u0F71\u0F74"); ewts2uni_test("r-i", "\u0F76"); ewts2uni_test("r-I", "\u0F77"); ewts2uni_test("l-i", "\u0F78"); @@ -731,15 +825,75 @@ public class EWTSTest extends TestCase { ewts2uni_test("\\uF041", "\uF041"); ewts2uni_test("\\uF042", "\uF042"); } + + public void test__EWTS__long_wowels() { + ewts2uni_test("k-I~M`~X", "\u0f40\u0f81\u0f82\u0f35"); // TODO(DLC)[EWTS->Tibetan]: actually the 0f68 stuff could be true... ask + } + + public void test__EWTS__32bit_unicode_escapes() { + assert_EWTS_error("\\u00010000"); // TODO(dchandler): make it work + assert_EWTS_error("\\uF0010000"); // TODO(dchandler): make it work + ewts2uni_test("\\ucafe0000", + "[#ERROR Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.]"); + // TODO(dchandler): make it "\ucafe0000"); + if (false) { + ewts2uni_test("\\ucafe0eff", "\ucafe0eff"); + ewts2uni_test("\\ucafe0eff", "\ucafe0eff"); + ewts2uni_test("\\ucafe0f00", "\ucafe0f00"); + ewts2uni_test("\\ucafe0f40", "\ucafe0f40"); + ewts2uni_test("\\ucafe0f70", "\ucafe0f70"); + ewts2uni_test("\\ucafe0fff", "\ucafe0fff"); + ewts2uni_test("\\ucafef000", "\ucafef000"); + ewts2uni_test("\\ucafef01f", "\ucafef01f"); + ewts2uni_test("\\ucafeefff", "\ucafeefff"); + + ewts2uni_test("\\uffffffff", "\uffffffff"); + ewts2uni_test("\\ueeeeeee2", "\ueeeeeee2"); + } + + ewts2uni_test("\\u00000000", "\u00000000"); + ewts2uni_test("\\u00000eff", "\u00000eff"); + ewts2uni_test("\\u00000eff", "\u00000eff"); + ewts2uni_test("\\u00000f00", "\u00000f00"); + ewts2uni_test("\\u00000f40", "\u00000f40"); + ewts2uni_test("\\u00000f70", "\u00000f70"); + ewts2uni_test("\\u00000fff", "\u00000fff"); + ewts2uni_test("\\u0000f000", "\u0000f000"); + ewts2uni_test("\\u0000f01f", "\u0000f01f"); + ewts2uni_test("\\u0000efff", "\u0000efff"); + + 
ewts2uni_test("\\u00000000", "\u0000"); + ewts2uni_test("\\u00000eff", "\u0eff"); + ewts2uni_test("\\u00000eff", "\u0eff"); + ewts2uni_test("\\u00000f00", "\u0f00"); + ewts2uni_test("\\u00000f40", "\u0f40"); + ewts2uni_test("\\u00000f70", "\u0f70"); + ewts2uni_test("\\u00000fff", "\u0fff"); + ewts2uni_test("\\u0000f000", "\uf000"); + ewts2uni_test("\\u0000f01f", "\uf01f"); + ewts2uni_test("\\u0000efff", "\uefff"); + + assert_EWTS_error("\\UcaFe0000"); + if (false) { // TODO(dchandler): make these work + ewts2uni_test("\\UcaFe0000", "\ucaFe0000"); + ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff"); + ewts2uni_test("\\UcaFe0eff", "\ucaFe0eff"); + ewts2uni_test("\\UcaFe0f00", "\ucaFe0f00"); + ewts2uni_test("\\UcaFe0f40", "\ucaFe0f40"); + ewts2uni_test("\\UcaFe0f70", "\ucaFe0f70"); + ewts2uni_test("\\UcaFe0fff", "\ucaFe0fff"); + ewts2uni_test("\\UcaFef000", "\ucaFef000"); + ewts2uni_test("\\UcaFef01f", "\ucaFef01f"); + ewts2uni_test("\\UcaFeefff", "\ucaFeefff"); + } + + } // TODO(DLC)[EWTS->Tibetan]: test that "\[JAVA_SOURCE_WILL_NOT_COMPILE_WITHOUT_ME]uxxxx " works out well /** Tests that certain strings are not legal EWTS. */ public void test__EWTS__illegal_things() { - assert_EWTS_error("k\\u0f19"); // only numbers combine with f19,f18,f3e,f3f - assert_EWTS_error("k\\u0f18"); // only numbers combine with f19,f18,f3e,f3f - assert_EWTS_error("k\\u0f3e"); // only numbers combine with f19,f18,f3e,f3f - assert_EWTS_error("k\\u0f3f"); // only numbers combine with f19,f18,f3e,f3f + assert_EWTS_error("m+"); assert_EWTS_error("kSha"); // use "k+Sha" instead @@ -763,7 +917,27 @@ public class EWTSTest extends TestCase { assert_EWTS_error("al-I"); assert_EWTS_error("g..ya"); // use "g.ya" instead + assert_EWTS_error("m.."); assert_EWTS_error("g"); // use "ga" instead TODO(DLC)[EWTS->Tibetan]:? 
+ + assert_EWTS_error("k\\u0f19"); // only numbers combine with f19,f18,f3e,f3f + assert_EWTS_error("k\\u0f18"); // only numbers combine with f19,f18,f3e,f3f + assert_EWTS_error("k\\u0f3e"); // only numbers combine with f19,f18,f3e,f3f + assert_EWTS_error("k\\u0f3f"); // only numbers combine with f19,f18,f3e,f3f + } + + public void testDLCFailingNow() { // TODO(DLC)[EWTS->Tibetan] + assert_EWTS_error("\\u0f19"); + assert_EWTS_error("\\u0f18"); + assert_EWTS_error("\\u0f19\u0f20"); // wrong order... + + { + ewts2uni_test("'a+r-i", "\u0f60\u0fb2\u0f80"); // TODO(DLC)[EWTS->Tibetan]: NOW: prefix rules should make this invalid! + ewts2uni_test("'a+r-I", "\u0f60\u0fb2\u0f81"); + ewts2uni_test("'a+l-i", "\u0f60\u0fb3\u0f80");// TODO(DLC)[EWTS->Tibetan]: NOW error handling is CRAP + ewts2uni_test("'a+l-I", "\u0f60\u0fb3\u0f81"); + } + } } @@ -779,8 +953,6 @@ public class EWTSTest extends TestCase { // \u0f40\u0f7a\u0f74 is illegal (thus \u0f40\u0f74\u0f7a is // what you probably intended), have it find \u0f7a\u0f74. // - // TODO(DLC)[EWTS->Tibetan]:: and have it find \u0f7a\u0f7a and suggest \u0f7b, etc. - // // TODO(DLC)[EWTS->Tibetan]: and \u0f7f\u0f7e is probably illegal and should be switched? // TODO(DLC)[EWTS->Tibetan]: flesh out \[JAVA_SOURCE_WILL_NOT_COMPILE_WITHOUT_ME]u rules in lexing, is it like Java (where in Java source code, escapes are done in a pre-lexing pass)? no, right, \u0060 causes \u0060 in the output... and \u0f40a is not like ka. escapes separate tsheg bars as far as lexing is concerned, yes? But we use them (and only them, i.e. there is no other transliteration available) for some Tibetan Unicode characters, and then ka\[JAVA_SOURCE_WILL_NOT_COMPILE_WITHOUT_ME]u0fXX may need to seem Java-ish, maybe? 
diff --git a/source/org/thdl/tib/text/ttt/EWTSTraits.java b/source/org/thdl/tib/text/ttt/EWTSTraits.java index bfef618..b31067e 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTraits.java +++ b/source/org/thdl/tib/text/ttt/EWTSTraits.java @@ -16,10 +16,15 @@ All Rights Reserved. Contributor(s): ______________________________________. */ + // TODO(DLC)[EWTS->Tibetan]: TibetanMachineWeb has duplication of much of this! + package org.thdl.tib.text.ttt; import java.util.ArrayList; + import org.thdl.tib.text.DuffCode; +import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.util.ThdlDebug; /** A singleton class that should contain (but due to laziness and * ignorance probably does not contain) all the traits that make EWTS @@ -46,41 +51,68 @@ public final class EWTSTraits implements TTraits { /** Returns '.'. */ public char disambiguatorChar() { return '.'; } + // TODO(DLC)[EWTS->Tibetan]: isClearlyIllegal and hasSimpleError are different why? public boolean hasSimpleError(TPair p) { - return ("a".equals(p.getLeft()) && null == p.getRight()); // TODO(DLC)[EWTS->Tibetan]: (a.e) is bad, one of (.a) or (a.) is bad + if (pairHasBadWowel(p)) return true; + return (("a".equals(p.getLeft()) && null == p.getRight()) + || ("a".equals(p.getLeft()) + && null != p.getRight() + && TibetanMachineWeb.isWylieVowel(p.getRight()))); // TODO(DLC)[EWTS->Tibetan]: or Unicode wowels? test "a\u0f74" and "a\u0f7e" + // TODO(DLC)[EWTS->Tibetan]: (a.e) is bad, one of (.a) or (a.) is bad } /** {tsh}, the longest consonant, has 3 characters, so this is * three. */ public int maxConsonantLength() { return 3; } - /** {-i~M`}, in a tie for the longest wowel, has 6 characters, so - * this is six. (No, 'l-i' and 'r-i' are not wowels (but '-i' - * is). */ - public int maxWowelLength() { return 5; } + /** {-i~M`}, in a tie for the longest wowel, has 5 characters, so + * this is five. (No, 'l-i' and 'r-i' are not wowels (but '-i' + * is). (TODO(DLC)[EWTS->Tibetan]: this is crap! 
you can put arbitrary wowels + * together using plus signs or Unicode escapes) */ + public int maxWowelLength() { return 3; /* a~M` (TODO(DLC)[EWTS->Tibetan]:! why the 'a'?) */} + + public boolean isUnicodeConsonant(char ch) { + return ((ch != '\u0f48' && ch >= '\u0f40' && ch <= '\u0f6a') + || (ch != '\u0f98' && ch >= '\u0f90' && ch <= '\u0fbc')); + } + + public boolean isUnicodeWowel(char ch) { + // TODO(DLC)[EWTS->Tibetan]: what about combiners that combine only with digits? TEST + return ((ch >= '\u0f71' && ch <= '\u0f84') + || isUnicodeWowelThatRequiresAChen(ch)); + } // TODO(DLC)[EWTS->Tibetan]: u,e,i,o? If not, document the special treatment in this function's comment public boolean isConsonant(String s) { + if (s.length() == 1 && isUnicodeConsonant(s.charAt(0))) return true; + if (aVowel().equals(s)) return false; // In EWTS, "a" is both a consonant and a vowel, but we treat it as just a vowel and insert the implied a-chen if you have a TPair ( . a) (TODO(DLC)[EWTS->Tibetan]: right?) + + // TODO(DLC)[EWTS->Tibetan]: numbers are consonants? + // TODO(DLC)[EWTS->Tibetan]: just g for now - return "g".equals(s); + return TibetanMachineWeb.isWylieChar(s); } public boolean isWowel(String s) { + return (getUnicodeForWowel(s) != null); + /* TODO(DLC)[EWTS->Tibetan]: test ko+m+e etc. // TODO(DLC)[EWTS->Tibetan]: all non-consonant combiners? 0f71 0f87 etc.? + if (s.length() == 1 && isUnicodeWowel(s.charAt(0))) return true; return ("a".equals(s) || "e".equals(s) || "i".equals(s) || "o".equals(s) || "u".equals(s) - || "?".equals(s) // TODO(DLC)[EWTS->Tibetan]: 0f84 virama??? - // TODO(DLC)[EWTS->Tibetan]: & ~M` ~M ??? || "U".equals(s) || "I".equals(s) || "A".equals(s) || "-i".equals(s) || "-I".equals(s) - || "H".equals(s) - || "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:??? + || "au".equals(s) + || "ai".equals(s) + || isWowelThatRequiresAChen(s)); + // TODO(DLC)[EWTS->Tibetan]:??? 
+ */ } public String aVowel() { return "a"; } @@ -125,5 +157,222 @@ public final class EWTSTraits implements TTraits { throw new Error("TODO(DLC)[EWTS->Tibetan]"); } - public String getUnicodeFor(String l, boolean subscribed) { throw new Error("TODO(DLC)[EWTS->Tibetan]"); } + public String getUnicodeForWowel(String wowel) { + if ("a".equals(wowel)) + return ""; + return helpGetUnicodeForWowel(wowel); + } + + private String helpGetUnicodeForWowel(String wowel) { + if ("a".equals(wowel)) + return null; // ko+a+e is invalid, e.g. + if (wowel.length() == 1 && isUnicodeWowel(wowel.charAt(0))) + return wowel; + // handle o+u, etc. + int i; + if ((i = wowel.indexOf("+")) >= 0) { + // recurse. + + // Chris Fynn says \u0f7c\u0f7c is different from \u0f7d. + // So o+o is not the same as au. e+e is not the same as + // ai. + String left = helpGetUnicodeForWowel(wowel.substring(0, i)); + String right = helpGetUnicodeForWowel(wowel.substring(i + 1)); + if (null != left && null != right) + return left + right; + else + return null; + } else { + // Handle vowels. (TODO(dchandler): tibwn.ini has this + // info, use that instead of duplicating it in this code.) + if ("i".equals(wowel)) return "\u0f72"; + if ("u".equals(wowel)) return "\u0f74"; + if ("A".equals(wowel)) return "\u0f71"; + if ("U".equals(wowel)) return "\u0f71\u0f74"; // \u0f75 is discouraged + if ("e".equals(wowel)) return "\u0f7a"; + if ("o".equals(wowel)) return "\u0f7c"; + if ("-i".equals(wowel)) return "\u0f80"; + if ("ai".equals(wowel)) return "\u0f7b"; + if ("au".equals(wowel)) return "\u0f7d"; + if ("-I".equals(wowel)) return "\u0f81"; + if ("I".equals(wowel)) return "\u0f71\u0f72"; // \u0f73 is discouraged + + // TODO(DLC)[EWTS->Tibetan]: fix me! 
+ // DLC say ah if ("aM".equals(wowel)) return "\u0f7e"; + if ("M".equals(wowel)) return "\u0f7e"; + // DLC say ah if ("aH".equals(wowel)) return "\u0f7f"; + if ("H".equals(wowel)) return "\u0f7f"; + // DLC say ah if ("a?".equals(wowel)) return "\u0f84"; + if ("?".equals(wowel)) return "\u0f84"; + // DLC say ah if ("a~M".equals(wowel)) return "\u0f83"; + if ("~M".equals(wowel)) return "\u0f83"; + // DLC say ah if ("a~M`".equals(wowel)) return "\u0f82"; + if ("~M`".equals(wowel)) return "\u0f82"; + // DLC say ah if ("aX".equals(wowel)) return "\u0f37"; + if ("X".equals(wowel)) return "\u0f37"; + // DLC say ah if ("a~X".equals(wowel)) return "\u0f35"; + if ("~X".equals(wowel)) return "\u0f35"; + + return null; + } + } + + public String getUnicodeFor(String l, boolean subscribed) { + + // First, handle "\u0f71\u0f84\u0f86", "", "\u0f74", etc.: input in which every char lies in U+0F00..U+0FFF is already Unicode and is returned as-is (each char is checked, not just the first). + { + boolean already_done = true; + for (int i = 0; i < l.length(); i++) { + if (!(l.charAt(i) >= '\u0f00' && l.charAt(i) <= '\u0fff')) { + already_done = false; + break; + } + } + if (already_done) + return l; // TODO(dchandler): \u0fff etc. are not valid code points, though. Do we handle that well? + } + + // TODO(DLC)[EWTS->Tibetan]:: vowels !subscribed could mean (a . i)???? I doubt it but test "i"->"\u0f68\u0f72" etc. + + if (subscribed) { + if ("R".equals(l)) return "\u0fbc"; + if ("Y".equals(l)) return "\u0fbb"; + if ("W".equals(l)) return "\u0fba"; + + // g+h etc. should not be inputs to this function, but for + // completeness they're here. 
+ if ("k".equals(l)) return "\u0F90"; + if ("kh".equals(l)) return "\u0F91"; + if ("g".equals(l)) return "\u0F92"; + if ("g+h".equals(l)) return "\u0F93"; + if ("ng".equals(l)) return "\u0F94"; + if ("c".equals(l)) return "\u0F95"; + if ("ch".equals(l)) return "\u0F96"; + if ("j".equals(l)) return "\u0F97"; + if ("ny".equals(l)) return "\u0F99"; + if ("T".equals(l)) return "\u0F9A"; + if ("Th".equals(l)) return "\u0F9B"; + if ("D".equals(l)) return "\u0F9C"; + if ("D+h".equals(l)) return "\u0F9D"; + if ("N".equals(l)) return "\u0F9E"; + if ("t".equals(l)) return "\u0F9F"; + if ("th".equals(l)) return "\u0FA0"; + if ("d".equals(l)) return "\u0FA1"; + if ("d+h".equals(l)) return "\u0FA2"; + if ("n".equals(l)) return "\u0FA3"; + if ("p".equals(l)) return "\u0FA4"; + if ("ph".equals(l)) return "\u0FA5"; + if ("b".equals(l)) return "\u0FA6"; + if ("b+h".equals(l)) return "\u0FA7"; + if ("m".equals(l)) return "\u0FA8"; + if ("ts".equals(l)) return "\u0FA9"; + if ("tsh".equals(l)) return "\u0FAA"; + if ("dz".equals(l)) return "\u0FAB"; + if ("dz+h".equals(l)) return "\u0FAC"; + if ("w".equals(l)) return "\u0FAD"; // TODO(DLC)[EWTS->Tibetan]:: ??? 
+ if ("zh".equals(l)) return "\u0FAE"; + if ("z".equals(l)) return "\u0FAF"; + if ("'".equals(l)) return "\u0FB0"; + if ("y".equals(l)) return "\u0FB1"; + if ("r".equals(l)) return "\u0FB2"; + if ("l".equals(l)) return "\u0FB3"; + if ("sh".equals(l)) return "\u0FB4"; + if ("Sh".equals(l)) return "\u0FB5"; + if ("s".equals(l)) return "\u0FB6"; + if ("h".equals(l)) return "\u0FB7"; + if ("a".equals(l)) return "\u0FB8"; + if ("k+Sh".equals(l)) return "\u0FB9"; + if (false) throw new Error("TODO(DLC)[EWTS->Tibetan]:: subscribed for " + l); + return null; + } else { + if ("R".equals(l)) return "\u0f6a"; + if ("Y".equals(l)) return "\u0f61"; + if ("W".equals(l)) return "\u0f5d"; + + if (!TibetanMachineWeb.isKnownHashKey(l)) { + ThdlDebug.noteIffyCode(); + return null; + } + String s = TibetanMachineWeb.getUnicodeForWylieForGlyph(l); + if (null == s) + ThdlDebug.noteIffyCode(); + return s; + } + } + + public String shortTranslitName() { return "EWTS"; } + + private boolean pairHasBadWowel(TPair p) { + return (null != p.getRight() + && !disambiguator().equals(p.getRight()) + && !"+".equals(p.getRight()) + && null == getUnicodeForWowel(p.getRight())); + } + public boolean isClearlyIllegal(TPair p) { + if (pairHasBadWowel(p)) return true; + if (p.getLeft() == null + && (p.getRight() == null || + (!disambiguator().equals(p.getRight()) + && !isWowel(p.getRight())))) + return true; + if ("+".equals(p.getLeft())) + return true; + if (p.getLeft() != null && isWowel(p.getLeft()) + && !aVowel().equals(p.getLeft())) // achen + return true; + return false; + } + + public TPairList[] breakTshegBarIntoChunks(String tt, boolean sh) { + if (sh) throw new IllegalArgumentException("Don't do that, silly!"); + try { + return TPairListFactory.breakEWTSIntoChunks(tt); + } catch (StackOverflowError e) { + throw new IllegalArgumentException("Input too large[1]: " + tt); + } catch (OutOfMemoryError e) { + throw new IllegalArgumentException("Input too large[2]: " + tt); + } + } + + public boolean 
isACIP() { return false; } + + public boolean vowelAloneImpliesAChen() { return true; } + + public boolean vowelsMayStack() { return true; } + + public boolean isWowelThatRequiresAChen(String s) { + // TODO(DLC)[EWTS->Tibetan]: fix me! + return ((s.length() == 1 && (isUnicodeWowelThatRequiresAChen(s.charAt(0)) + || "?MHX".indexOf(s.charAt(0)) >= 0)) + // DLC say ah || "aM".equals(s) // DLC funny... (DLC NOW too funny! affects longest wowel length!) + // DLC say ah || "a?".equals(s) // DLC funny... + // DLC say ah || "aH".equals(s) // DLC funny... + // DLC say ah || "aX".equals(s) // DLC funny... + || "~X".equals(s) + // DLC say ah || "a~X".equals(s) // DLC funny... + || "~M".equals(s) + // DLC say ah || "a~M".equals(s) // DLC funny... + || "~M`".equals(s) + // DLC say ah || "a~M`".equals(s) // DLC funny... + ); + } + + public boolean isUnicodeWowelThatRequiresAChen(char ch) { + // TODO(DLC)[EWTS->Tibetan]: ask if 18 19 3e 3f combine only with digits + return "\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0; + } + + public boolean couldBeValidStack(TPairList pl) { + StringBuffer hashKey = new StringBuffer(); + boolean allHavePlus = true; + for (int i = 0; i < pl.size(); i++) { + if (i + 1 < pl.size() && !"+".equals(pl.get(i).getRight())) + allHavePlus = false; + if (0 != hashKey.length()) + hashKey.append('-'); + hashKey.append(pl.get(i).getLeft()); + } + return (allHavePlus + || TibetanMachineWeb.hasGlyph(hashKey.toString())); // TODO(DLC)[EWTS->Tibetan]: test with smra and tsma and bdgya + } } diff --git a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java index 7315675..6688b3a 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java @@ -18,6 +18,7 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; +import java.math.BigInteger; import java.util.ArrayList; /** @@ -31,16 +32,130 @@ import java.util.ArrayList; * * @author David Chandler */ class EWTSTshegBarScanner extends TTshegBarScanner { + + /** Returns true iff ch can appear within an EWTS tsheg bar. */ + protected static boolean isValidInsideTshegBar(char ch) { + // '\\' is absent, but should it be? TODO(DLC)[EWTS->Tibetan] + return ((ch >= '0' && ch <= '9') + || (ch >= '\u0f71' && ch <= '\u0f84') + || EWTSTraits.instance().isUnicodeConsonant(ch) + || EWTSTraits.instance().isUnicodeWowel(ch) + || (ch >= '\u0f20' && ch <= '\u0f33') + || "khgncjytdpbmtstdzwzz'rlafvTDNSWYReuioIAUMHX?^\u0f39\u0f35\u0f37.+~'`-\u0f19\u0f18\u0f3f\u0f3e\u0f86\u0f87\u0f88".indexOf(ch) >= 0); + } + /** See the comment in TTshegBarScanner. This does not find - errors and warnings that you'd think of a parser finding (DLC + errors and warnings that you'd think of a parser finding (TODO(DLC)[EWTS->Tibetan]: DOES IT?). */ - public ArrayList scan(String s, StringBuffer errors, int maxErrors, + public ArrayList scan(String s, StringBuffer errors, int maxErrors, // TODO(DLC)[EWTS->Tibetan]: ignored boolean shortMessages, String warningLevel) { // the size depends on whether it's mostly Tibetan or mostly // Latin and a number of other factors. This is meant to be // an underestimate, but not too much of an underestimate. ArrayList al = new ArrayList(s.length() / 10); - throw new Error("DLC unimplemented"); + + // TODO(DLC)[EWTS->Tibetan]: use jflex, javacc or something similar + + // TODO(DLC)[EWTS->Tibetan]: what about Unicode escapes like \u0f20? When do you do that? Immediately like Java source files? I think so and then we can say that oddballs like \u0f19 are valid within tsheg bars. 
+ + StringBuffer sb = new StringBuffer(s); + ExpandEscapeSequences(sb); + int sl = sb.length(); + for (int i = 0; i < sl; i++) { + if (isValidInsideTshegBar(sb.charAt(i))) { + StringBuffer tbsb = new StringBuffer(); + for (; i < sl; i++) { + if (isValidInsideTshegBar(sb.charAt(i))) + tbsb.append(sb.charAt(i)); + else { + --i; + break; + } + } + al.add(new TString("EWTS", tbsb.toString(), + TString.TIBETAN_NON_PUNCTUATION)); + } else { + if (" /;|!:=_@#$%<>()\r\n\t".indexOf(sb.charAt(i)) >= 0) + al.add(new TString("EWTS", sb.substring(i, i+1), + TString.TIBETAN_PUNCTUATION)); + else + al.add(new TString("EWTS", "ERROR TODO(DLC)[EWTS->Tibetan]: this character is illegal in EWTS: " + sb.substring(i, i+1), + TString.ERROR)); + } + } + return al; + } + + /** Modifies the EWTS in sb such that Unicode escape sequences are + * expanded. */ + public static void ExpandEscapeSequences(StringBuffer sb) { + int sl; + for (int i = 0; i < (sl = sb.length()); i++) { + if (i + "\\u00000000".length() <= sl) { + if (sb.charAt(i) == '\\' && sb.charAt(i + 1) == 'u' || sb.charAt(i + 1) == 'U') { + boolean isEscape = true; + for (int j = 0; j < "00000000".length(); j++) { + char ch = sb.charAt(i + "\\u".length() + j); + if (!((ch <= '9' && ch >= '0') + || (ch <= 'F' && ch >= 'A') + || (ch <= 'f' && ch >= 'a'))) { + isEscape = false; + break; + } + } + if (isEscape) { + long x = -1; + try { + BigInteger bigx = new java.math.BigInteger(sb.substring(i+2, i+10), 16); + x = bigx.longValue(); + if (!(bigx.compareTo(new BigInteger("0", 16)) >= 0 + && bigx.compareTo(new BigInteger("FFFFFFFF", 16)) <= 0)) + x = -1; + } catch (NumberFormatException e) { + // leave x == -1 + } + if (x >= 0 && x <= 0xFFFF) { + sb.replace(i, i + "\\uXXXXyyyy".length(), new String(new char[] { (char)x })); + continue; + } else if (x >= 0x00000000L + && x <= 0xFFFFFFFFL) { +// TODO(DLC)[EWTS->Tibetan]: do nothing? 
test errors al.add(new TString("EWTS", "Sorry, we don't yet support Unicode escape sequences above 0x0000FFFF! File a bug.", + //TString.ERROR)); + i += "uXXXXYYYY".length(); + continue; + } + } + } + } + if (i + "\\u0000".length() <= sl) { + if (sb.charAt(i) == '\\' && sb.charAt(i + 1) == 'u' || sb.charAt(i + 1) == 'U') { + boolean isEscape = true; + for (int j = 0; j < "0000".length(); j++) { + char ch = sb.charAt(i + "\\u".length() + j); + if (!((ch <= '9' && ch >= '0') + || (ch <= 'F' && ch >= 'A') + || (ch <= 'f' && ch >= 'a'))) { + isEscape = false; + break; + } + } + if (isEscape) { + int x = -1; + try { + if (!((x = Integer.parseInt(sb.substring(i+2, i+6), 16)) >= 0x0000 + && x <= 0xFFFF)) + x = -1; + } catch (NumberFormatException e) { + // leave x == -1 + } + if (x >= 0) { + sb.replace(i, i + "\\uXXXX".length(), new String(new char[] { (char)x })); + continue; + } + } + } + } + } } /** non-public because this is a singleton */ diff --git a/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java b/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java index fa06510..0f37895 100644 --- a/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java +++ b/source/org/thdl/tib/text/ttt/EWTStibwniniTest.java @@ -18,12 +18,10 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlOptions; - -import java.util.ArrayList; - import junit.framework.TestCase; +import org.thdl.util.ThdlOptions; + /** Tests this package's ability to understand EWTS and turn it into * the appropriate TMW or Unicode by throwing a lot of @@ -67,6 +65,15 @@ public class EWTStibwniniTest extends TestCase { EWTSTest.assert_EWTS_error(ewts); } + /** Asserts that ewts is valid EWTS. Call this for those strings + that someone might intend a stack in TMW for, but that really + mean two or more stacks in EWTS thanks to prefix rules. g+ga, + for example, might be mistakenly input as gga. If so, it's + legal EWTS because ga takes a ga prefix. 
*/ + private static void special_case(String ewts) { + assertTrue(!EWTSTest.hasEwtsError(ewts)); + } + /** Tests that all of the standard stacks are treated like * standard stacks and that none of the non-standard stacks in * the TMW font are treated like standard stacks. I generated @@ -393,7 +400,7 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("N", "\u0F4E"); ewts2uni_test("Sh", "\u0F65"); - ewts2uni_test("k+Sh", "\u0F69"); + ewts2uni_test("k+Sh", "\u0f40\u0fb5"); // TODO(DLC)[EWTS->Tibetan]: \u0F69 instead? Shouldn't matter by the unicode standard's terms, and a tiny, separate translator on unicode-to-unicode ought to be better. But maybe change tibwn.ini? ewts2uni_test("k+k", "\u0f40\u0f90"); ewts2uni_test("k+kh", "\u0f40\u0f91"); ewts2uni_test("k+ng", "\u0f40\u0f94"); @@ -437,16 +444,16 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("g+m", "\u0f42\u0fa8"); ewts2uni_test("g+m+y", "\u0f42\u0fa8\u0fb1"); ewts2uni_test("g+r+y", "\u0f42\u0fb2\u0fb1"); - ewts2uni_test("g+h", "\u0F43"); - ewts2uni_test("g+h+g+h", "\u0f43\u0f92\u0fb7"); - ewts2uni_test("g+h+ny", "\u0f43\u0f99"); - ewts2uni_test("g+h+n", "\u0f43\u0fa3"); - ewts2uni_test("g+h+n+y", "\u0f43\u0fa3\u0fb1"); - ewts2uni_test("g+h+m", "\u0f43\u0fa8"); - ewts2uni_test("g+h+l", "\u0f43\u0fb3"); - ewts2uni_test("g+h+y", "\u0f43\u0fb1"); - ewts2uni_test("g+h+r", "\u0f43\u0fb2"); - ewts2uni_test("g+h+w", "\u0f43\u0fad"); + ewts2uni_test("g+h", "\u0f42\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: \u0F43 instead? Shouldn't matter by the unicode standard's terms, and a tiny, separate translator on unicode-to-unicode ought to be better. But maybe change tibwn.ini? (Same goes for every occurrence of \u0f42\u0fb7 in this file.) 
+ ewts2uni_test("g+h+g+h", "\u0f42\u0fb7\u0f92\u0fb7"); + ewts2uni_test("g+h+ny", "\u0f42\u0fb7\u0f99"); + ewts2uni_test("g+h+n", "\u0f42\u0fb7\u0fa3"); + ewts2uni_test("g+h+n+y", "\u0f42\u0fb7\u0fa3\u0fb1"); + ewts2uni_test("g+h+m", "\u0f42\u0fb7\u0fa8"); + ewts2uni_test("g+h+l", "\u0f42\u0fb7\u0fb3"); + ewts2uni_test("g+h+y", "\u0f42\u0fb7\u0fb1"); + ewts2uni_test("g+h+r", "\u0f42\u0fb7\u0fb2"); + ewts2uni_test("g+h+w", "\u0f42\u0fb7\u0fad"); ewts2uni_test("ng+k", "\u0f44\u0f90"); ewts2uni_test("ng+k+t", "\u0f44\u0f90\u0f9f"); ewts2uni_test("ng+k+t+y", "\u0f44\u0f90\u0f9f\u0fb1"); @@ -499,11 +506,11 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("dz+y", "\u0f5b\u0fb1"); ewts2uni_test("dz+r", "\u0f5b\u0fb2"); ewts2uni_test("dz+w", "\u0f5b\u0fad"); - ewts2uni_test("dz+h", "\u0F5C"); - ewts2uni_test("dz+h+y", "\u0f5c\u0fb1"); - ewts2uni_test("dz+h+r", "\u0f5c\u0fb2"); - ewts2uni_test("dz+h+l", "\u0f5c\u0fb3"); - ewts2uni_test("dz+h+w", "\u0f5c\u0fad"); + ewts2uni_test("dz+h", "\u0F5B\u0FB7"); // TODO(DLC)[EWTS->Tibetan]: 0f5c is what tibwn.ini has + ewts2uni_test("dz+h+y", "\u0f5b\u0fb7\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0f5c is what tibwn.ini has + ewts2uni_test("dz+h+r", "\u0f5b\u0fb7\u0fb2"); // TODO(DLC)[EWTS->Tibetan]: 0f5c is what tibwn.ini has + ewts2uni_test("dz+h+l", "\u0f5b\u0fb7\u0fb3"); // TODO(DLC)[EWTS->Tibetan]: 0f5c is what tibwn.ini has + ewts2uni_test("dz+h+w", "\u0f5b\u0fb7\u0fad"); // TODO(DLC)[EWTS->Tibetan]: 0f5c is what tibwn.ini has ewts2uni_test("ny+ts", "\u0f49\u0fa9"); ewts2uni_test("ny+ts+m", "\u0f49\u0fa9\u0fa8"); ewts2uni_test("ny+ts+y", "\u0f49\u0fa9\u0fb1"); @@ -541,12 +548,16 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("D+y", "\u0f4c\u0fb1"); ewts2uni_test("D+r", "\u0f4c\u0fb2"); ewts2uni_test("D+w", "\u0f4c\u0fad"); - ewts2uni_test("D+h", "\u0F4D"); - ewts2uni_test("D+h+D+h", "\u0f4d\u0f9d"); - ewts2uni_test("D+h+m", "\u0f4d\u0fa8"); - ewts2uni_test("D+h+y", "\u0f4d\u0fb1"); - 
ewts2uni_test("D+h+r", "\u0f4d\u0fb2"); - ewts2uni_test("D+h+w", "\u0f4d\u0fad"); + ewts2uni_test("D+h", "\u0F4C\u0FB7"); // TODO(DLC)[EWTS->Tibetan]: 0f4d is what tibwn.ini has + { + // TODO(DLC)[EWTS->Tibetan]: 0f4d is what tibwn.ini has + ewts2uni_test("D+h+D+h", "\u0f4c\u0fb7\u0f9c\u0fb7"); + // TODO(DLC)[EWTS->Tibetan]: 0f9d is what tibwn.ini has + } + ewts2uni_test("D+h+m", "\u0f4c\u0fb7\u0fa8"); // TODO(DLC)[EWTS->Tibetan]: 0f4d is what tibwn.ini has + ewts2uni_test("D+h+y", "\u0f4c\u0fb7\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0f4d is what tibwn.ini has + ewts2uni_test("D+h+r", "\u0f4c\u0fb7\u0fb2"); // TODO(DLC)[EWTS->Tibetan]: 0f4d is what tibwn.ini has + ewts2uni_test("D+h+w", "\u0f4c\u0fb7\u0fad"); // TODO(DLC)[EWTS->Tibetan]: 0f4d is what tibwn.ini has ewts2uni_test("N+T", "\u0f4e\u0f9a"); ewts2uni_test("N+Th", "\u0f4e\u0f9b"); ewts2uni_test("N+D", "\u0f4e\u0f9c"); @@ -592,7 +603,8 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("t+s+w", "\u0f4f\u0fb6\u0fad"); ewts2uni_test("t+r+y", "\u0f4f\u0fb2\u0fb1"); ewts2uni_test("t+w+y", "\u0f4f\u0fad\u0fb1"); - ewts2uni_test("t+k+Sh", "\u0f4f\u0fb9"); + ewts2uni_test("t+k+Sh", "\u0f4f\u0f90\u0fb5"); // TODO(DLC)[EWTS->Tibetan]: 0fb9 is what tibwn.ini has + ewts2uni_test("th+y", "\u0f50\u0fb1"); ewts2uni_test("th+w", "\u0f50\u0fad"); ewts2uni_test("d+g", "\u0f51\u0f92"); @@ -620,14 +632,14 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("d+y", "\u0f51\u0fb1"); ewts2uni_test("d+r+y", "\u0f51\u0fb2\u0fb1"); ewts2uni_test("d+w+y", "\u0f51\u0fad\u0fb1"); - ewts2uni_test("d+h", "\u0F52"); - ewts2uni_test("d+h+n", "\u0f52\u0fa3"); - ewts2uni_test("d+h+n+y", "\u0f52\u0fa3\u0fb1"); - ewts2uni_test("d+h+m", "\u0f52\u0fa8"); - ewts2uni_test("d+h+y", "\u0f52\u0fb1"); - ewts2uni_test("d+h+r", "\u0f52\u0fb2"); - ewts2uni_test("d+h+r+y", "\u0f52\u0fb2\u0fb1"); - ewts2uni_test("d+h+w", "\u0f52\u0fad"); + ewts2uni_test("d+h", "\u0F51\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini 
has + ewts2uni_test("d+h+n", "\u0f51\u0fb7\u0fa3"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini has + ewts2uni_test("d+h+n+y", "\u0f51\u0fb7\u0fa3\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini has + ewts2uni_test("d+h+m", "\u0f51\u0fb7\u0fa8"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini has + ewts2uni_test("d+h+y", "\u0f51\u0fb7\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini has + ewts2uni_test("d+h+r", "\u0f51\u0fb7\u0fb2"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini has + ewts2uni_test("d+h+r+y", "\u0f51\u0fb7\u0fb2\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini has + ewts2uni_test("d+h+w", "\u0f51\u0fb7\u0fad"); // TODO(DLC)[EWTS->Tibetan]: 0f52 is what tibwn.ini has ewts2uni_test("n+k", "\u0f53\u0f90"); ewts2uni_test("n+k+t", "\u0f53\u0f90\u0f9f"); ewts2uni_test("n+g+h", "\u0f53\u0f92\u0fb7"); @@ -651,7 +663,7 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("n+d+h+r", "\u0f53\u0fa1\u0fb7\u0fb2"); ewts2uni_test("n+d+h+y", "\u0f53\u0fa1\u0fb7\u0fb1"); ewts2uni_test("n+n", "\u0f53\u0fa3"); - ewts2uni_test("n+n+y", "\u0f53\u0fa3\u0f61"); + ewts2uni_test("n+n+y", "\u0f53\u0fa3\u0fb1"); ewts2uni_test("n+p", "\u0f53\u0fa4"); ewts2uni_test("n+p+r", "\u0f53\u0fa4\u0fb2"); ewts2uni_test("n+ph", "\u0f53\u0fa5"); @@ -692,13 +704,13 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("b+b+h", "\u0f56\u0fa6\u0fb7"); ewts2uni_test("b+b+h+y", "\u0f56\u0fa6\u0fb7\u0fb1"); ewts2uni_test("b+m", "\u0f56\u0fa8"); - ewts2uni_test("b+h", "\u0F57"); - ewts2uni_test("b+h+N", "\u0f57\u0f9e"); - ewts2uni_test("b+h+n", "\u0f57\u0fa3"); - ewts2uni_test("b+h+m", "\u0f57\u0fa8"); - ewts2uni_test("b+h+y", "\u0f57\u0fb1"); - ewts2uni_test("b+h+r", "\u0f57\u0fb2"); - ewts2uni_test("b+h+w", "\u0f57\u0fad"); + ewts2uni_test("b+h", "\u0F56\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: 0f57 is what tibwn.ini has + ewts2uni_test("b+h+N", "\u0f56\u0fb7\u0f9e"); // TODO(DLC)[EWTS->Tibetan]: 0f57 is what 
tibwn.ini has + ewts2uni_test("b+h+n", "\u0f56\u0fb7\u0fa3"); // TODO(DLC)[EWTS->Tibetan]: 0f57 is what tibwn.ini has + ewts2uni_test("b+h+m", "\u0f56\u0fb7\u0fa8"); // TODO(DLC)[EWTS->Tibetan]: 0f57 is what tibwn.ini has + ewts2uni_test("b+h+y", "\u0f56\u0fb7\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0f57 is what tibwn.ini has + ewts2uni_test("b+h+r", "\u0f56\u0fb7\u0fb2"); // TODO(DLC)[EWTS->Tibetan]: 0f57 is what tibwn.ini has + ewts2uni_test("b+h+w", "\u0f56\u0fb7\u0fad"); // TODO(DLC)[EWTS->Tibetan]: 0f57 is what tibwn.ini has ewts2uni_test("m+ny", "\u0f58\u0f99"); ewts2uni_test("m+N", "\u0f58\u0f9e"); ewts2uni_test("m+n", "\u0f58\u0fa3"); @@ -736,13 +748,13 @@ public class EWTStibwniniTest extends TestCase { ewts2uni_test("r+t+s+n+y", "\u0f62\u0f9f\u0fb6\u0fa3\u0fb1"); ewts2uni_test("r+th", "\u0f62\u0fa0"); ewts2uni_test("r+th+y", "\u0f62\u0fa0\u0fb1"); - ewts2uni_test("r+d+d+h", "\u0f62\u0fa1\u0fa2"); - ewts2uni_test("r+d+d+h+y", "\u0f62\u0fa1\u0fa2\u0fb1"); + ewts2uni_test("r+d+d+h", "\u0f62\u0fa1\u0fa1\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: 0fa2 is what tibwn.ini has + ewts2uni_test("r+d+d+h+y", "\u0f62\u0fa1\u0fa1\u0fb7\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0fa2 is what tibwn.ini has ewts2uni_test("r+d+y", "\u0f62\u0fa1\u0fb1"); - ewts2uni_test("r+d+h", "\u0f62\u0fa1\u0fb7"); - ewts2uni_test("r+d+h+m", "\u0f62\u0fa1\u0fb7\u0fa8"); - ewts2uni_test("r+d+h+y", "\u0f62\u0fa2\u0fb1"); - ewts2uni_test("r+d+h+r", "\u0f62\u0fa2\u0fb2"); + ewts2uni_test("r+d+h", "\u0f62\u0fa1\u0fb7"); // TODO(DLC)[EWTS->Tibetan]: 0fa2 is what tibwn.ini has + ewts2uni_test("r+d+h+m", "\u0f62\u0fa1\u0fb7\u0fa8"); // TODO(DLC)[EWTS->Tibetan]: 0fa2 is what tibwn.ini has + ewts2uni_test("r+d+h+y", "\u0f62\u0fa1\u0fb7\u0fb1"); // TODO(DLC)[EWTS->Tibetan]: 0fa2 is what tibwn.ini has + ewts2uni_test("r+d+h+r", "\u0f62\u0fa1\u0fb7\u0fb2"); // TODO(DLC)[EWTS->Tibetan]: 0fa2 is what tibwn.ini has ewts2uni_test("r+p", "\u0f62\u0fa4"); ewts2uni_test("r+b+p", "\u0f62\u0fa6\u0fa4"); 
ewts2uni_test("r+b+b", "\u0f62\u0fa6\u0fa6"); @@ -780,22 +792,22 @@ public class EWTStibwniniTest extends TestCase { assert_EWTS_error("khkha"); assert_EWTS_error("khna"); assert_EWTS_error("khla"); - assert_EWTS_error("gga"); + special_case("gga"); assert_EWTS_error("ggha"); - assert_EWTS_error("gnya"); - assert_EWTS_error("gda"); + special_case("gnya"); + special_case("gda"); assert_EWTS_error("gdha"); assert_EWTS_error("gdhya"); assert_EWTS_error("gdhwa"); - assert_EWTS_error("gna"); - assert_EWTS_error("gnya"); - assert_EWTS_error("gpa"); + special_case("gna"); + special_case("gnya"); + special_case("gpa"); assert_EWTS_error("gbha"); assert_EWTS_error("gbhya"); - assert_EWTS_error("gma"); - assert_EWTS_error("gmya"); + special_case("gma"); + special_case("gmya"); assert_EWTS_error("grya"); - assert_EWTS_error("gha"); + special_case("gha"); assert_EWTS_error("ghgha"); assert_EWTS_error("ghnya"); assert_EWTS_error("ghna"); @@ -803,8 +815,8 @@ public class EWTStibwniniTest extends TestCase { assert_EWTS_error("ghma"); assert_EWTS_error("ghla"); assert_EWTS_error("ghya"); - assert_EWTS_error("ghra"); - assert_EWTS_error("ghwa"); + special_case("ghra"); + special_case("ghwa"); assert_EWTS_error("ngka"); assert_EWTS_error("ngkta"); assert_EWTS_error("ngktya"); @@ -939,7 +951,7 @@ public class EWTStibwniniTest extends TestCase { assert_EWTS_error("tmya"); assert_EWTS_error("tya"); assert_EWTS_error("trna"); - assert_EWTS_error("tsa"); + special_case("tsa"); assert_EWTS_error("tstha"); assert_EWTS_error("tsna"); assert_EWTS_error("tsnya"); @@ -947,45 +959,45 @@ public class EWTStibwniniTest extends TestCase { assert_EWTS_error("tsmya"); assert_EWTS_error("tsya"); assert_EWTS_error("tsra"); - assert_EWTS_error("tswa"); + special_case("tswa"); assert_EWTS_error("trya"); assert_EWTS_error("twya"); assert_EWTS_error("tkSha"); assert_EWTS_error("thya"); assert_EWTS_error("thwa"); - assert_EWTS_error("dga"); - assert_EWTS_error("dgya"); - assert_EWTS_error("dgra"); + 
special_case("dga"); + special_case("dgya"); + special_case("dgra"); assert_EWTS_error("dgha"); assert_EWTS_error("dghra"); - assert_EWTS_error("ddza"); - assert_EWTS_error("dda"); + special_case("ddza"); + special_case("dda"); assert_EWTS_error("ddya"); - assert_EWTS_error("ddra"); - assert_EWTS_error("ddwa"); + special_case("ddra"); + special_case("ddwa"); assert_EWTS_error("ddha"); assert_EWTS_error("ddhna"); assert_EWTS_error("ddhya"); assert_EWTS_error("ddhra"); assert_EWTS_error("ddhwa"); - assert_EWTS_error("dna"); - assert_EWTS_error("dba"); - assert_EWTS_error("dbra"); + special_case("dna"); + special_case("dba"); + special_case("dbra"); assert_EWTS_error("dbha"); assert_EWTS_error("dbhya"); assert_EWTS_error("dbhra"); - assert_EWTS_error("dma"); - assert_EWTS_error("dya"); + special_case("dma"); + special_case("dya"); assert_EWTS_error("drya"); assert_EWTS_error("dwya"); - assert_EWTS_error("dha"); + special_case("dha"); assert_EWTS_error("dhna"); assert_EWTS_error("dhnya"); assert_EWTS_error("dhma"); assert_EWTS_error("dhya"); - assert_EWTS_error("dhra"); + special_case("dhra"); assert_EWTS_error("dhrya"); - assert_EWTS_error("dhwa"); + special_case("dhwa"); assert_EWTS_error("nka"); assert_EWTS_error("nkta"); assert_EWTS_error("ngha"); @@ -1016,7 +1028,7 @@ public class EWTStibwniniTest extends TestCase { assert_EWTS_error("nma"); assert_EWTS_error("nbhya"); assert_EWTS_error("ntsa"); - assert_EWTS_error("nya"); + special_case("nya"); assert_EWTS_error("nra"); assert_EWTS_error("nwa"); assert_EWTS_error("nwya"); @@ -1039,39 +1051,39 @@ public class EWTStibwniniTest extends TestCase { assert_EWTS_error("pswa"); assert_EWTS_error("psya"); assert_EWTS_error("bgha"); - assert_EWTS_error("bdza"); - assert_EWTS_error("bda"); + special_case("bdza"); + special_case("bda"); assert_EWTS_error("bddza"); assert_EWTS_error("bdha"); assert_EWTS_error("bdhwa"); - assert_EWTS_error("bta"); - assert_EWTS_error("bna"); - assert_EWTS_error("bba"); + special_case("bta"); + 
special_case("bna"); + special_case("bba"); assert_EWTS_error("bbha"); assert_EWTS_error("bbhya"); - assert_EWTS_error("bma"); - assert_EWTS_error("bha"); + special_case("bma"); + special_case("bha"); assert_EWTS_error("bhNa"); assert_EWTS_error("bhna"); assert_EWTS_error("bhma"); assert_EWTS_error("bhya"); - assert_EWTS_error("bhra"); - assert_EWTS_error("bhwa"); - assert_EWTS_error("mnya"); - assert_EWTS_error("mNa"); - assert_EWTS_error("mna"); - assert_EWTS_error("mnya"); - assert_EWTS_error("mpa"); - assert_EWTS_error("mpra"); - assert_EWTS_error("mpha"); - assert_EWTS_error("mba"); + special_case("bhra"); + special_case("bhwa"); + special_case("mnya"); + special_case("mNa"); // TODO(DLC)[EWTS->Tibetan]: do prefix rules really allow mNa? I think not. + special_case("mna"); + special_case("mnya"); + special_case("mpa"); + special_case("mpra"); + special_case("mpha"); + special_case("mba"); assert_EWTS_error("mbha"); assert_EWTS_error("mbhya"); - assert_EWTS_error("mma"); - assert_EWTS_error("mla"); - assert_EWTS_error("mwa"); - assert_EWTS_error("msa"); - assert_EWTS_error("mha"); + special_case("mma"); + special_case("mla"); + special_case("mwa"); + special_case("msa"); + special_case("mha"); assert_EWTS_error("yYa"); assert_EWTS_error("yra"); assert_EWTS_error("ywa"); @@ -1089,7 +1101,7 @@ public class EWTStibwniniTest extends TestCase { assert_EWTS_error("rNa"); assert_EWTS_error("rtwa"); assert_EWTS_error("rtta"); - assert_EWTS_error("rtsa"); + special_case("rtsa"); assert_EWTS_error("rtsna"); assert_EWTS_error("rtsnya"); assert_EWTS_error("rtha"); diff --git a/source/org/thdl/tib/text/ttt/ErrorsAndWarnings.java b/source/org/thdl/tib/text/ttt/ErrorsAndWarnings.java index 97c3c02..7b5ac1c 100644 --- a/source/org/thdl/tib/text/ttt/ErrorsAndWarnings.java +++ b/source/org/thdl/tib/text/ttt/ErrorsAndWarnings.java @@ -18,11 +18,11 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; +import java.util.HashMap; + import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; -import java.util.HashMap; - /** A noninstantiable class that knows about every user-visible error * or warning message. Each has a unique integer key starting at 101 * for those messages that are errors and starting at 501 for those @@ -96,7 +96,8 @@ public class ErrorsAndWarnings { messages that take more than one "parameter", if you will, like message 501. */ static String getMessage(int code, boolean shortMessages, - String translit) { + String translit, + TTraits traits) { // Let's make sure that no unknown code is used during // development: ThdlDebug.verify("unknown code " + code, @@ -123,27 +124,35 @@ public class ErrorsAndWarnings { return "" + code + ": There's not even a unique, non-illegal parse for {" + translit + "}"; case 102: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found an open bracket, '" + translit + "', within a [#COMMENT]-style comment. Brackets may not appear in comments."; case 103: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found a truly unmatched close bracket, '" + translit + "'."; case 104: // See also 140 + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found a closing bracket, '" + translit + "', without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this."; case 105: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found a truly unmatched open bracket, '[' or '{', prior to this current illegal open bracket, '" + translit + "'."; case 106: // see also 139 + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found an illegal open bracket (in context, this is '" + translit + "'). 
Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?"; case 107: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found an illegal at sign, @ (in context, this is " + translit + "). This folio marker has a period, '.', at the end of it, which is illegal."; case 108: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found an illegal at sign, @ (in context, this is " + translit + "). This folio marker is not followed by whitespace, as is expected."; case 109: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found an illegal at sign, @ (in context, this is " + translit + "). @012B is an example of a legal folio marker."; case 110: @@ -152,21 +161,26 @@ public class ErrorsAndWarnings { /////NYA/. We warn about // for this reason. \\ causes a tsheg-bar //error. */ + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\."; case 111: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found an illegal open parenthesis, '('. Nesting of parentheses is not allowed."; case 112: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Unexpected closing parenthesis, ')', found."; case 113: - return "" + code + ": The ACIP {?}, found alone, may intend U+0F08, but it may intend a question mark, i.e. '?', in the output. It may even mean that the original text could not be deciphered with certainty, like the ACIP {[?]} does."; + ThdlDebug.verify(traits.isACIP()); + return "" + code + ": The " + traits.shortTranslitName() + " {?}, found alone, may intend U+0F08, but it may intend a question mark, i.e. '?', in the output. 
It may even mean that the original text could not be deciphered with certainty, like the " + traits.shortTranslitName() + " {[?]} does."; case 114: return "" + code + ": Found an illegal, unprintable character."; case 115: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found a backslash, \\, which the ACIP Tibetan Input Code standard says represents a Sanskrit virama. In practice, though, this is so often misused (to represent U+0F3D) that {\\} always generates this error. If you want a Sanskrit virama, change the input document to use {\\u0F84} instead of {\\}. If you want U+0F3D, use {/NYA/} or {/NYA\\u0F3D}."; case 116: @@ -174,37 +188,44 @@ public class ErrorsAndWarnings { return "" + code + ": Found an illegal character, '" + translit + "', with ordinal (in decimal) " + (int)translit.charAt(0) + "."; case 117: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Unexpected end of input; truly unmatched open bracket found."; case 118: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Unmatched open bracket found. A comment does not terminate."; case 119: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Unmatched open bracket found. 
A correction does not terminate."; case 120: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Slashes are supposed to occur in pairs, but the input had an unmatched '/' character."; case 121: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis, '('."; case 122: - return "" + code + ": Warning, empty tsheg bar found while converting from ACIP!"; + return "" + code + ": Warning, empty tsheg bar found while converting from " + traits.shortTranslitName() + "!"; case 123: - return "" + code + ": Cannot convert ACIP {" + translit + "} because it contains a number but also a non-number."; + return "" + code + ": Cannot convert " + traits.shortTranslitName() + " {" + translit + "} because it contains a number but also a non-number."; case 124: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Cannot convert ACIP {" + translit + "} because {V}, wa-zur, appears without being subscribed to a consonant."; case 125: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Cannot convert ACIP {" + translit + "} because we would be required to assume that {A} is a consonant, when it is not clear if it is a consonant or a vowel."; case 126: - return "" + code + ": Cannot convert ACIP {" + translit + "} because it ends with a '+'."; + return "" + code + ": Cannot convert " + traits.shortTranslitName() + " {" + translit + "} because it ends with a '+'."; case 127: - return "" + code + ": Cannot convert ACIP {" + translit + "} because it ends with a '-'."; + return "" + code + ": Cannot convert " + traits.shortTranslitName() + " {" + translit + "} because it ends with a disambiguator (i.e., '" + traits.disambiguator() + "')."; case 128: // fall through case 129: @@ -214,13 +235,14 @@ public class ErrorsAndWarnings { return "" + code + ": The tsheg bar (\"syllable\") {" + translit + "} is essentially nothing."; case 131: + ThdlDebug.verify(traits.isACIP()); return "" + 
code + ": The ACIP caret, {^}, must precede a tsheg bar."; case 132: - return "" + code + ": The ACIP {" + translit + "} must be glued to the end of a tsheg bar, but this one was not."; + return "" + code + ": The " + traits.shortTranslitName() + " {" + translit + "} must be glued to the end of a tsheg bar, but this one was not."; case 133: - return "" + code + ": Cannot convert the ACIP {" + translit + "} to Tibetan because it is unclear what the result should be. The correct output would likely require special mark-up."; + return "" + code + ": Cannot convert the " + traits.shortTranslitName() + " {" + translit + "} to Tibetan because it is unclear what the result should be. The correct output would likely require special mark-up."; case 134: return "" + code + ": The tsheg bar (\"syllable\") {" + translit + "} has no legal parses."; @@ -241,21 +263,26 @@ public class ErrorsAndWarnings { // See also 106. case 139: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Found an illegal open bracket (in context, this is '" + translit + "'). There is no matching closing bracket."; case 140: // see also 104 + ThdlDebug.verify(traits.isACIP()); ThdlDebug.verify(translit.length() == 1); return "" + code + ": Unmatched closing bracket, '" + translit + "', found. Pairs are expected, as in [#THIS] or [THAT]. Nesting is not allowed."; case 141: + ThdlDebug.verify(traits.isACIP()); ThdlDebug.verify(translit.length() == 1); return "" + code + ": While waiting for a closing bracket, an opening bracket, '" + translit + "', was found instead. Nesting of bracketed expressions is not permitted."; case 142: // this number is referenced in error 143's message + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Because you requested conversion to a Unicode text file, there is no way to indicate that the font size is supposed to decrease starting here and continuing until error 143. 
That is, this is the beginning of a region in YIG CHUNG."; case 143: // this number is referenced in error 142's message + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Because you requested conversion to a Unicode text file, there is no way to indicate that the font size is supposed to increase (go back to the size it was before the last error 142, that is) starting here. That is, this is the end of a region in YIG CHUNG."; @@ -270,27 +297,32 @@ public class ErrorsAndWarnings { return "" + code + ": The last stack does not have a vowel in {" + translit + "}; this may indicate a typo, because Sanskrit, which this probably is (because it's not legal Tibetan), should have a vowel after each stack."; case 503: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": Though {" + translit + "} is unambiguous, it would be more computer-friendly if '+' signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful."; case 504: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": The ACIP {" + translit + "} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice. To avoid seeing this warning again, change the input to use {\\u0F35} instead of {" + translit + "}."; case 505: return "" + code + ": There is a useless disambiguator in {" + translit + "}."; case 506: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": There is a stack of three or more consonants in {" + translit + "} that uses at least one '+' but does not use a '+' between each consonant."; case 507: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": There is a chance that the ACIP {" + translit + "} was intended to represent more consonants than we parsed it as representing -- GHNYA, e.g., means GH+NYA, but you can imagine seeing GH+N+YA and typing GHNYA for it too."; // TMW has glyphs for both GH+N+YA (G+H+N+YA) and GH+NYA (G+H+NYA). 
case 508: // see 509 also - return "" + code + ": The ACIP {" + translit + "} has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack (because there is such a stack used in Sanskrit transliteration for this particular sequence) and forget to input it with '+' characters."; + return "" + code + ": The " + traits.shortTranslitName() + " {" + translit + "} has been interpreted as two stacks, not one, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack (because there is such a stack used in Sanskrit transliteration for this particular sequence) and forget to input it with '+' characters."; case 509: // see 508 also - return "" + code + ": The ACIP {" + translit + "} has an initial sequence that has been interpreted as two stacks, a prefix and a root stack, not one nonnative stack, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack (because there is such a stack used in Sanskrit transliteration for this particular sequence) and forget to input it with '+' characters."; + return "" + code + ": The " + traits.shortTranslitName() + " {" + translit + "} has an initial sequence that has been interpreted as two stacks, a prefix and a root stack, not one nonnative stack, but you may wish to confirm that the original text had two stacks as it would be an easy mistake to make to see one stack (because there is such a stack used in Sanskrit transliteration for this particular sequence) and forget to input it with '+' characters."; case 510: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": A non-breaking tsheg, '" + translit + "', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\"."; @@ -298,9 +330,10 @@ public class ErrorsAndWarnings { // ERROR 137 and WARNING 511 are the same: case 137: /* fall through */ 
case 511: - return "" + code + ": The ACIP {" + translit + "} cannot be represented with the TibetanMachine or TibetanMachineWeb fonts because no such glyph exists in these fonts. The TibetanMachineWeb font has only a limited number of ready-made, precomposed glyphs, and {" + translit + "} is not one of them."; + return "" + code + ": The " + traits.shortTranslitName() + " {" + translit + "} cannot be represented with the TibetanMachine or TibetanMachineWeb fonts because no such glyph exists in these fonts. The TibetanMachineWeb font has only a limited number of ready-made, precomposed glyphs, and {" + translit + "} is not one of them."; case 512: + ThdlDebug.verify(traits.isACIP()); return "" + code + ": There is a chance that the ACIP {" + translit + "} was intended to represent more consonants than we parsed it as representing -- GHNYA, e.g., means GH+NYA, but you can imagine seeing GH+N+YA and typing GHNYA for it too. In fact, there are glyphs in the Tibetan Machine font for N+N+Y, N+G+H, G+N+Y, G+H+N+Y, T+N+Y, T+S+TH, T+S+N, T+S+N+Y, TS+NY, TS+N+Y, H+N+Y, M+N+Y, T+S+M, T+S+M+Y, T+S+Y, T+S+R, T+S+V, N+T+S, T+S, S+H, R+T+S, R+T+S+N, R+T+S+N+Y, and N+Y, indicating the importance of these easily mistyped stacks, so the possibility is very real."; @@ -391,11 +424,11 @@ public class ErrorsAndWarnings { severityMap.put(new Integer(num), (null != opt) ? opt : defaultSeverities[num - 501]); } - // DLC FIXME: make 506 an error? or a new, super-high priority class of warning? + // TODO(DLC)[EWTS->Tibetan] FIXME: make 506 an error? or a new, super-high priority class of warning? } /** Prints out the long forms of the error messages, which will - help a user to decipher the short forms. */ + help a user to decipher the short forms. 
TODO(DLC)[EWTS->Tibetan]: ACIP only */ public static void printErrorAndWarningDescriptions(java.io.PrintStream out) { final String translit = "X"; out.println("ACIP->Tibetan ERRORS are as follows, and appear in their short forms, embedded"); @@ -407,7 +440,8 @@ public class ErrorsAndWarnings { } else if (129 == num) { out.println("129: Cannot convert ACIP {" + translit + "} because " + "+" + " is not an ACIP consonant."); } else { - out.println(getMessage(num, false, translit)); + out.println(getMessage(num, false, translit, + ACIPTraits.instance())); } out.println(""); } @@ -419,7 +453,8 @@ public class ErrorsAndWarnings { if (501 == num) { out.println("501: Using " + translit + ", but only because the tool's knowledge of prefix rules (see the documentation) says that " + "XX" + " is not a legal Tibetan tsheg bar (\"syllable\")"); } else { - out.println(getMessage(num, false, translit)); + out.println(getMessage(num, false, translit, + ACIPTraits.instance())); } out.println(""); } diff --git a/source/org/thdl/tib/text/ttt/LotsOfTshegBarsTest.java b/source/org/thdl/tib/text/ttt/LotsOfTshegBarsTest.java index a3c4f99..89cd1b3 100644 --- a/source/org/thdl/tib/text/ttt/LotsOfTshegBarsTest.java +++ b/source/org/thdl/tib/text/ttt/LotsOfTshegBarsTest.java @@ -21,10 +21,10 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlOptions; - import junit.framework.TestCase; +import org.thdl.util.ThdlOptions; + /** Tests ACIP-to-Tibetan conversions using tsheg bars from real ACIP * files. Lots of tsheg bars. diff --git a/source/org/thdl/tib/text/ttt/MidLexSubstitution.java b/source/org/thdl/tib/text/ttt/MidLexSubstitution.java index 5123a38..41e0177 100644 --- a/source/org/thdl/tib/text/ttt/MidLexSubstitution.java +++ b/source/org/thdl/tib/text/ttt/MidLexSubstitution.java @@ -18,12 +18,12 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlOptions; - import java.util.ArrayList; import java.util.HashMap; import java.util.StringTokenizer; +import org.thdl.util.ThdlOptions; + /** MidLexSubstitution is a hack that lets the end user clumsily fix * the EWTS-to-Tibetan and ACIP-to-Tibetan converters without having * to modify the source code. diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index eff8d50..fb6ae0a 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -21,12 +21,12 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlOptions; - import java.util.ArrayList; import junit.framework.TestCase; +import org.thdl.util.ThdlOptions; + /** Tests this package, especially {@link #TPairListFactory} and * {@link TPairList}. Tests use ACIP more than EWTS. @@ -275,7 +275,8 @@ public class PackageTest extends TestCase { String[] expectedLegalParses, String expectedBestParse, int pairListToUse) { - TPairList[] la = TPairListFactory.breakACIPIntoChunks(acip, true); + TPairList[] la + = ACIPTraits.instance().breakTshegBarIntoChunks(acip, true); TPairList l = la[(pairListToUse == -1) ? 0 : ((pairListToUse >= 1) ? 
1 : pairListToUse)]; if (sdebug || debug) System.out.println("ACIP=" + acip + " and l'=" + l); @@ -302,9 +303,9 @@ public class PackageTest extends TestCase { return; } else { String s; - if ((s = pt.getWarning("Most", l, acip, false)) != null) { + if ((s = pt.getWarning("Most", l, acip, false, ACIPTraits.instance())) != null) { System.out.println(s); - } else if ((s = pt.getWarning("All", l, acip, false)) != null) + } else if ((s = pt.getWarning("All", l, acip, false, ACIPTraits.instance())) != null) if (sdebug || debug) System.out.println("Paranoiac warning is this: " + s); } int np = pt.numberOfParses(); @@ -447,9 +448,9 @@ public class PackageTest extends TestCase { tstHelper("9012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678"); } - /** Tests {@link TPairListFactory#breakACIPIntoChunks(String, - * boolean)}, {@link TPairList#getACIPError(String, boolean)}, and {@link - * TPairList#recoverACIP()}. 
*/ + /** Tests {@link ACIPTraits#breakTshegBarIntoChunks(String, + * boolean)}, {@link TPairList#getACIPError(String, boolean)}, + * and {@link TPairList#recoverACIP()}. */ public void testBreakACIPIntoChunks() { tstHelper("GASN"); // ambiguous with regard to prefix rules tstHelper("BARMA"); // ambiguous with regard to prefix rules diff --git a/source/org/thdl/tib/text/ttt/ParseIterator.java b/source/org/thdl/tib/text/ttt/ParseIterator.java index 24c8af6..06bcaf0 100644 --- a/source/org/thdl/tib/text/ttt/ParseIterator.java +++ b/source/org/thdl/tib/text/ttt/ParseIterator.java @@ -18,9 +18,9 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; +import java.util.ArrayList; import java.util.ListIterator; import java.util.NoSuchElementException; -import java.util.ArrayList; /** An object that can iterate over an {@link TParseTree}. * diff --git a/source/org/thdl/tib/text/ttt/TConverter.java b/source/org/thdl/tib/text/ttt/TConverter.java index bd889dc..cfc5025 100644 --- a/source/org/thdl/tib/text/ttt/TConverter.java +++ b/source/org/thdl/tib/text/ttt/TConverter.java @@ -18,16 +18,19 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import java.io.*; -import java.util.ArrayList; -import java.util.Stack; import java.awt.Color; +import java.io.BufferedWriter; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.util.ArrayList; -import org.thdl.util.ThdlDebug; -import org.thdl.util.ThdlOptions; +import org.thdl.tib.text.DuffCode; import org.thdl.tib.text.TibetanDocument; import org.thdl.tib.text.TibetanMachineWeb; -import org.thdl.tib.text.DuffCode; +import org.thdl.util.ThdlDebug; +import org.thdl.util.ThdlOptions; // TODO(DLC)[EWTS->Tibetan]: THis class is broken for ewts. But kill this class unless it needs to exist. 
/** @@ -338,9 +341,9 @@ public class TConverter { if (smallFontSize >= regularFontSize) smallFontSize = regularFontSize - 1; if (colors) - tdoc.enableColors(); + TibetanDocument.enableColors(); else - tdoc.disableColors(); + TibetanDocument.disableColors(); } int sz = scan.size(); @@ -371,7 +374,8 @@ public class TConverter { } } else if (stype == TString.TSHEG_BAR_ADORNMENT) { if (lastGuyWasNonPunct) { - String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText()) + "]"; + String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText(), + ttraits) + "]"; if (null != writer) { String uni = ttraits.getUnicodeFor(s.getText(), false); if (null == uni) { @@ -434,7 +438,9 @@ public class TConverter { Object[] duff = null; if (stype == TString.TIBETAN_NON_PUNCTUATION) { lastGuyWasNonPunct = true; - TPairList pls[] = TPairListFactory.breakACIPIntoChunks(s.getText(), false); + TPairList pls[] + = ttraits.breakTshegBarIntoChunks(s.getText(), + false); String acipError; if ((acipError = pls[0].getACIPError(s.getText(), shortMessages)) != null @@ -457,7 +463,8 @@ public class TConverter { hasErrors = true; String errorMessage = ("[#ERROR " - + ErrorsAndWarnings.getMessage(130, shortMessages, s.getText()) + + ErrorsAndWarnings.getMessage(130, shortMessages, s.getText(), + ttraits) + "]"); if (null != writer) writer.write(errorMessage); if (null != tdoc) { @@ -478,7 +485,8 @@ public class TConverter { "[#ERROR " + ErrorsAndWarnings.getMessage(134, shortMessages, - s.getText()) + s.getText(), + ttraits) + "]"; if (null != writer) writer.write(errorMessage); @@ -516,7 +524,8 @@ public class TConverter { warning = pt.getWarning(warningLevel, pl, s.getText(), - shortMessages); + shortMessages, + ttraits); } if (null != warning) { if (writeWarningsToOut) { @@ -632,7 +641,7 @@ public class TConverter { // one) and then a comma: peekaheadFindsSpacesAndComma(scan, i+1))) { if (null != writer) { - unicode = " "; // DLC NOW FIXME: allow 
for U+00A0 between two shads (0F0D or 0F0E), and optionally insert a U+200B after the shad following the whitespace so that stupid software will break lines more nicely + unicode = " "; // TODO(DLC)[EWTS->Tibetan]: FIXME: allow for U+00A0 between two shads (0F0D or 0F0E), and optionally insert a U+200B after the shad following the whitespace so that stupid software will break lines more nicely done = true; } if (null != tdoc) { @@ -692,7 +701,8 @@ public class TConverter { writer.write("[ERROR " + ErrorsAndWarnings.getMessage(142, shortMessages, - "(" /* hard-coded ACIP value */) + "]"); + "(" /* hard-coded ACIP value */, + ttraits) + "]"); if (null != tdoc) { tdoc.setTibetanFontSize(smallFontSize); } @@ -702,7 +712,8 @@ public class TConverter { writer.write("[ERROR " + ErrorsAndWarnings.getMessage(143, shortMessages, - ")" /* hard-coded ACIP value */) + "]"); + ")" /* hard-coded ACIP value. TODO(DLC)[EWTS->Tibetan]: and above*/, + ttraits) + "]"); if (null != tdoc) { tdoc.setTibetanFontSize(regularFontSize); } @@ -717,7 +728,8 @@ public class TConverter { "[#ERROR " + ErrorsAndWarnings.getMessage(135, shortMessages, - "" + ch) + "" + ch, + ttraits) + "]"; writer.write(errorMessage); if (null != errors) @@ -729,7 +741,8 @@ public class TConverter { "[#ERROR " + ErrorsAndWarnings.getMessage(138, shortMessages, - "" + ch) + "" + ch, + ttraits) + "]"; writer.write(errorMessage); if (null != errors) @@ -746,7 +759,8 @@ public class TConverter { "[#ERROR " + ErrorsAndWarnings.getMessage(136, shortMessages, - s.getText()) + s.getText(), + ttraits) + "]"; tdoc.appendRoman(tdocLocation[0], errorMessage, diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java index 8814493..b6c2e14 100644 --- a/source/org/thdl/tib/text/ttt/TPair.java +++ b/source/org/thdl/tib/text/ttt/TPair.java @@ -19,10 +19,6 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; import org.thdl.util.ThdlDebug; -import org.thdl.tib.text.TibetanMachineWeb; -import org.thdl.tib.text.DuffCode; - -import java.util.ArrayList; /** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The * left side is the consonant or empty; the right side is either the @@ -182,8 +178,14 @@ class TPair { /** Returns true if this pair contains a Tibetan number. */ boolean isNumeric() { - char ch; - return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9'); + if (l != null && l.length() == 1) { + char ch = l.charAt(0); + return ((ch >= '0' && ch <= '9') + || (ch >= '\u0f18' && ch <= '\u0f33') + || ch == '\u0f3e' || ch == '\u0f3f'); + } + return false; + // TODO(DLC)[EWTS->Tibetan]: what about half-numbers? } String getWylie() { @@ -209,7 +211,7 @@ class TPair { if (null == leftWylie) leftWylie = ""; if (justLeft) return leftWylie; String rightWylie = null; - if ("-".equals(getRight())) + if (traits.disambiguator().equals(getRight())) rightWylie = "."; else if ("+".equals(getRight())) rightWylie = "+"; @@ -238,8 +240,9 @@ class TPair { consonantSB.append(x); } if (null != getRight() - && !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) { - String x = traits.getUnicodeFor(getRight(), subscribed); + && !(traits.disambiguator().equals(getRight()) + || "+".equals(getRight()) || traits.aVowel().equals(getRight()))) { + String x = traits.getUnicodeForWowel(getRight()); if (null == x) throw new Error("TPair: " + getRight() + " has no Uni"); vowelSB.append(x); } diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 2452a51..13c5969 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -20,13 +20,12 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import org.thdl.tib.text.TibetanMachineWeb; -import org.thdl.tib.text.DuffCode; -import org.thdl.tib.text.TGCPair; -import org.thdl.util.ThdlDebug; - -import java.util.HashMap; import java.util.ArrayList; +import java.util.HashMap; + +import org.thdl.tib.text.TGCPair; +import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.util.ThdlDebug; /** A list of {@link TPair TPairs}, typically corresponding to * one tsheg bar. l' in the design doc is a TPairList. @@ -101,6 +100,11 @@ class TPairList { al.add(0, p); } + /** Appends p to the current list of TPairs. */ + public void append(TPair p) { + al.add(p); + } + /** Returns the number of TPairs in this list. */ public int size() { return al.size(); } @@ -145,12 +149,11 @@ class TPairList { /** Returns true if this list contains ( . ) or (A . ), * which are two simple errors you encounter if you interpret DAA * or TAA or DAI or DAE the wrong way. TODO(DLC)[EWTS->Tibetan]: ACIP vs. EWTS */ - boolean hasSimpleError(TTraits ttraits) { + boolean hasSimpleError() { int sz = size(); for (int i = 0; i < sz; i++) { TPair p = get(i); - if ((null == p.getLeft() && !ttraits.disambiguator().equals(p.getRight())) - || ttraits.hasSimpleError(p)) + if (traits.hasSimpleError(p)) return true; } return false; @@ -161,7 +164,7 @@ class TPairList { * Returns an error message, or null if there is no error that * you can find without the help of tsheg bar syntax rules. */ // FIXME: This is needlessly ACIP specific -- rename and change text of messages - String getACIPError(String originalACIP, boolean shortMessages) { + String getACIPError(String originalACIP, boolean shortMessages) { // TODO(DLC)[EWTS->Tibetan] misnomer. // FIXME: this returns just the first error. List all errors // at once. int sz = size(); @@ -169,46 +172,60 @@ class TPairList { return ErrorsAndWarnings.getMessage(122, shortMessages, ((null != originalACIP) ? 
originalACIP - : "")); + : ""), + traits); String translit = (null != originalACIP) ? originalACIP : recoverACIP(); boolean mustBeEntirelyNumeric = get(0).isNumeric(); for (int i = 0; i < sz; i++) { TPair p = get(i); if (mustBeEntirelyNumeric != p.isNumeric()) - return ErrorsAndWarnings.getMessage(123, shortMessages, translit); + return ErrorsAndWarnings.getMessage(123, shortMessages, translit, traits); - if ((i == 0 && "V".equals(p.getLeft())) - || (i > 0 && "V".equals(p.getLeft()) - && (null != get(i - 1).getRight() - && !"+".equals(get(i - 1).getRight())))) { - return ErrorsAndWarnings.getMessage(124, shortMessages, translit); - } else if ("A".equals(p.getLeft()) && (null == p.getRight() || "".equals(p.getRight()))) { - return ErrorsAndWarnings.getMessage(125, shortMessages, translit); - } else if ((null == p.getLeft() && !"-".equals(p.getRight())) + if (traits.isACIP() + && ((i == 0 && "V".equals(p.getLeft())) + || (i > 0 && "V".equals(p.getLeft()) + && (null != get(i - 1).getRight() + && !"+".equals(get(i - 1).getRight()))))) { + return ErrorsAndWarnings.getMessage(124, shortMessages, translit, traits); + } else if (traits.aVowel().equals(p.getLeft()) + && (null == p.getRight() + || "".equals(p.getRight()))) { + return ErrorsAndWarnings.getMessage(125, shortMessages, translit, traits); + } else if (null != p.getRight() + && !"+".equals(p.getRight()) + && !traits.disambiguator().equals(p.getRight()) + && !traits.isWowel(p.getRight()) + && false /* TODO(DLC)[EWTS->Tibetan]: think about this harder. 
*/) { + return "ErrorNumberDLC1: We don't yet support stacking vowels, convert {" + translit + "} manually."; + // TODO(DLC)[EWTS->Tibetan]: test, i think we do support it + } else if ((null == p.getLeft() + && (!traits.disambiguator().equals(p.getRight()) + && (!traits.vowelAloneImpliesAChen() + || !traits.aVowel().equals(p.getRight())))) || (null != p.getLeft() - && !traits.isConsonant(p.getLeft()) + && (!traits.isConsonant(p.getLeft()) && (!traits.vowelAloneImpliesAChen() || !traits.aVowel().equals(p.getLeft()))) && !p.isNumeric())) { // FIXME: stop handling this outside of ErrorsAndWarnings: if (null == p.getLeft()) { if (shortMessages) return "128: {" + translit + "}"; else - return "128: Cannot convert ACIP {" + translit + "} because " + p.getRight() + " is a \"vowel\" without an associated consonant."; + return "128: Cannot convert " + traits.shortTranslitName() + " {" + translit + "} because " + p.getRight() + " is a \"vowel\" without an associated consonant."; } else { if (shortMessages) return "129: {" + translit + "}"; else - return "129: Cannot convert ACIP {" + translit + "} because " + p.getLeft() + " is not an ACIP consonant."; + return "129: Cannot convert " + traits.shortTranslitName() + " {" + translit + "} because " + p.getLeft() + " is not an " + traits.shortTranslitName() + " consonant."; } } } if ("+".equals(get(sz - 1).getRight())) { - return ErrorsAndWarnings.getMessage(126, shortMessages, translit); + return ErrorsAndWarnings.getMessage(126, shortMessages, translit, traits); } // FIXME: really this is a warning, not an error: - if ("-".equals(get(sz - 1).getRight())) { - return ErrorsAndWarnings.getMessage(127, shortMessages, translit); + if (traits.disambiguator().equals(get(sz - 1).getRight())) { + return ErrorsAndWarnings.getMessage(127, shortMessages, translit, traits); } return null; } @@ -245,6 +262,9 @@ class TPairList { * empty parse tree. 
*/ public TParseTree getParseTree() { + // TODO(DLC)[EWTS->Tibetan]: EWTS NOTE: this is still useful for EWTS: In EWTS, bkra + // is b.k+ra, smra is s+m+ra, and tshmra is invalid. + // We treat [(B . ), (G . +), (K . ), (T . A)] as if it could // be {B+G+K+T} or {B}{G+K+T}; we handle prefixes specially // this way. [(T . ), (G . +), (K . ), (T . A)] is clearly @@ -254,22 +274,10 @@ class TPairList { // master list of stacks. int sz = size(); - for (int i = 0; i < sz; i++) { - TPair p = get(i); - if (p.getLeft() == null && !"-".equals(p.getRight())) - return null; // clearly illegal. - if ("+".equals(p.getLeft())) - return null; // clearly illegal. - if (":".equals(p.getLeft())) - return null; // clearly illegal. - if ("m".equals(p.getLeft())) - return null; // clearly illegal. - if ("m:".equals(p.getLeft())) - return null; // clearly illegal. - } + for (int i = 0; i < sz; i++) + if (traits.isClearlyIllegal(get(i))) + return null; - - TParseTree pt = new TParseTree(); if (sz < 1) return null; // When we see a stretch of ACIP without a disambiguator or a @@ -387,7 +395,7 @@ class TPairList { if ((breakLocations[1] >= 0 && breakLocations[1] <= breakLocations[0]) || (breakLocations[2] >= 0 && breakLocations[2] <= breakLocations[1])) throw new Error("breakLocations is monotonically increasing, ain't it?"); - + TParseTree pt = new TParseTree(); for (int i = 0; i < sz; i++) { if (i+1 == sz || get(i).endsACIPStack()) { TStackListList sll = new TStackListList(4); // maximum is 4. @@ -412,35 +420,54 @@ class TPairList { // and only if b1 is one, etc. 
for (int counter = 0; counter < (1< 0) { for (int j = 0; breakStart+j < 3; j++) { if (k == breakLocations[breakStart+j] && 1 == ((counter >> j) & 1)) { - if (!currentStack.isEmpty()) - sl.add(currentStack.asStack()); + if (!currentStack.isEmpty()) { + if (traits.couldBeValidStack(currentStackUnmodified)) { + sl.add(currentStack.asStack()); + } else { + slIsInvalid = true; + break; + } + } currentStack = new TPairList(traits); + currentStackUnmodified = new TPairList(traits); break; // shouldn't matter, but you never know } } } } } - if (!sl.isEmpty()) { + if (!slIsInvalid && !sl.isEmpty()) { sll.add(sl); } } @@ -467,7 +494,7 @@ class TPairList { TPair lastPair = get(size() - 1); if ("+".equals(lastPair.getRight())) al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null)); - else if ("-".equals(lastPair.getRight())) + else if (traits.disambiguator().equals(lastPair.getRight())) al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null)); } return this; @@ -507,14 +534,15 @@ class TPairList { boolean add_U0F7F = false; int where; if (p.getRight() != null - && (where = p.getRight().indexOf(':')) >= 0) { + && (where = p.getRight().indexOf(':')) >= 0) { // TODO(DLC)[EWTS->Tibetan] // this ':' guy is his own TGCPair. 
add_U0F7F = true; StringBuffer rr = new StringBuffer(p.getRight()); rr.deleteCharAt(where); p = new TPair(traits, p.getLeft(), rr.toString()); } - boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight()); + boolean hasNonAVowel = (!traits.aVowel().equals(p.getRight()) + && null != p.getRight()); String thislWylie = traits.getEwtsForConsonant(p.getLeft()); if (thislWylie == null) { char ch; @@ -560,7 +588,7 @@ class TPairList { pl.add(tp); if (add_U0F7F) { indexList.add(new Integer(index)); - pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER)); + pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER)); // TODO(DLC)[EWTS->Tibetan] } } } @@ -618,7 +646,7 @@ class TPairList { unicodeExceptionsMap.put("\u0f62\u0fb6", "\u0f6a\u0fb6"); // RS } String mapEntry = (String)unicodeExceptionsMap.get(nonVowelSB.toString()); - if (null != mapEntry) + if (traits.isACIP() && null != mapEntry) sb.append(mapEntry); else sb.append(nonVowelSB); @@ -696,11 +724,13 @@ class TPairList { ? 137 : 511, shortMessages, - recoverACIP())); + recoverACIP(), + traits)); return; } } - if (lastPair.getRight() == null || lastPair.equals("-")) { + if (lastPair.getRight() == null + || lastPair.equals(traits.disambiguator())) { duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey)); } else { traits.getDuffForWowel(duffsAndErrors, diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java index 6d79136..e2c7e9e 100644 --- a/source/org/thdl/tib/text/ttt/TPairListFactory.java +++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java @@ -23,88 +23,23 @@ package org.thdl.tib.text.ttt; /** A factory for creating {@link TPairList TPairLists} from * Strings of ACIP. * @author David Chandler */ +// TODO(DLC)[EWTS->Tibetan]: kill this class; put it all in TTraits. class TPairListFactory { /** This class is not instantiable. */ private TPairListFactory() { } - /** Returns one or two new TPairList instances. 
Breaks an ACIP - * tsheg bar (roughly a "syllable") into chunks; this - * computes l' (for you design doc enthusiasts). - * - *

Here's a rough sketch of the algorithm: run along getting - * the current TPair as big as you can. If you get it very - * big, but there's something illegal afterward that wouldn't - * otherwise be illegal, undo as little as possible to correct. - * For example, G'A'I becomes [(G . 'A), (' . I)], and TAA - * becomes [(T . A)] in a first pass but then we see that the - * rest would be suboptimal, so we backtrack to [(T . )] and then - * finally become [(T . ), (A . A)]. We look for (A . ) and ( - * . ) in the rest in order to say "the rest would be - * suboptimal", i.e. we use TPairList.hasSimpleError(TTraits).

- * - *

There is one case where we break things up into two pair - * lists if and only if specialHandlingForAppendages is true -- I - * thought the converter had a bug because I saw SNYAM'AM in - * KD0003I2.ACT. I asked Robert Chilton, though, and he said - * "SNYAM'AM " was likely a typo for "SNYAM 'AM", so leave - * specialHandlingForAppendages false.

- * - *

I found out about (OK, as it turns out, imagined) this case - * too late to do anything clean about it. SNYAM'AM, e.g., - * breaks up into [(S . ), (NY . A), (M . 'A), (M . )], which is - * incorrect -- [(S . ), (NY . A), (M . ), (' . A), (M . )] is - * correct. But we don't know which is correct without parsing, - * so both are returned. The clean treatment would be to lex - * into a form that didn't insist 'A was either a vowel or a - * consonant. Then the parser would figure it out. But don't - * bother, because specialHandlingForAppendages should be false - * always.

- * - * @param acip a string of ACIP with no punctuation in it - * @param specialHandlingForAppendages true if and only if you - * want SNYAM'AM to ultimately parse as {S+NYA}{M}{'A}{M} instead - * of {S+NYA}{M'A}{M} - * @return an array of one or two pair lists, if the former, then - * the second element will be null, if the latter, the second - * element will have (* . ), (' . *) instead of (* . '*) which - * the former has - * @throws IllegalArgumentException if acip is too large for us - * to break into chunks (we're recursive, not iterative, so the - * boundary can be increased a lot if you care, but you don't) */ - static TPairList[] breakACIPIntoChunks(String acip, - boolean specialHandlingForAppendages) - throws IllegalArgumentException - { - try { - TTraits ttraits = ACIPTraits.instance(); - TPairList a = breakHelperACIP(acip, true, false, ttraits); - TPairList b = null; - if (specialHandlingForAppendages) - b = breakHelperACIP(acip, false, false, ttraits); - if (null != b && a.equals(b)) - return new TPairList[] { a, null }; - else - return new TPairList[] { a, b }; - } catch (StackOverflowError e) { - throw new IllegalArgumentException("Input too large[1]: " + acip); - } catch (OutOfMemoryError e) { - throw new IllegalArgumentException("Input too large[2]: " + acip); - } - } - - /** TODO(DLC)[EWTS->Tibetan]: doc */ - static TPairList[] breakEWTSIntoChunks(String ewts) - throws IllegalArgumentException - { - try { - return new TPairList[] { - breakHelperEWTS(ewts, EWTSTraits.instance()), null - }; - } catch (StackOverflowError e) { - throw new IllegalArgumentException("Input too large[1]: " + ewts); - } catch (OutOfMemoryError e) { - throw new IllegalArgumentException("Input too large[2]: " + ewts); - } + /** See {@link TTraits#breakTshegBarIntoChunks}. 
*/ + static TPairList[] breakACIPIntoChunks(String tt, + boolean specialHandlingForAppendages) { + TTraits ttraits = ACIPTraits.instance(); + TPairList a = breakHelperACIP(tt, true, false, ttraits); + TPairList b = null; + if (specialHandlingForAppendages) + b = breakHelperACIP(tt, false, false, ttraits); + if (null != b && a.equals(b)) + return new TPairList[] { a, null }; + else + return new TPairList[] { a, b }; } /** Helps {@link #breakACIPIntoChunks(String,boolean)}. @@ -149,7 +84,7 @@ class TPairListFactory { || (head.getRight() != null && !"+".equals(head.getRight()) && !"-".equals(head.getRight())), - ttraits)).hasSimpleError(ttraits)) { + ttraits)).hasSimpleError()) { for (int i = 1; i < howMuch; i++) { // try giving i characters back if that leaves us with // a legal head and makes the rest free of simple @@ -164,7 +99,7 @@ class TPairListFactory { || (newHead.getRight() != null && !"+".equals(newHead.getRight()) && !"-".equals(newHead.getRight())), - ttraits)).hasSimpleError(ttraits)) { + ttraits)).hasSimpleError()) { newTail.prepend(newHead); return newTail; } @@ -176,6 +111,136 @@ class TPairListFactory { return tail; } + /** See {@link TTraits#breakTshegBarIntoChunks}. */ + static TPairList[] breakEWTSIntoChunks(String ewts) + throws IllegalArgumentException + { + EWTSTraits traits = EWTSTraits.instance(); + TPairList pl = breakHelperEWTS(ewts, traits); + TPairList npl = pl; + + // TODO(DLC)[EWTS->Tibetan]: this crap ain't workin' for kaHM. But kaeM and kaMe shouldn't work, right? Figure out what EWTS really says... 
+ + // TODO(DLC)[EWTS->Tibetan]: for "a\\0f86" e.g.: + if (pl.size() > 1) { + npl = new TPairList(traits, pl.size()); + + for (int i = pl.size() - 1; i >= 1; i--) { + TPair left = pl.get(i - 1); + TPair right = pl.get(i); + if (traits.aVowel().equals(left.getRight()) + && left.getLeft() == null + && right.getLeft() == null + && traits.isWowelThatRequiresAChen(right.getRight())) { + npl.prepend(new TPair(traits, traits.aVowel(), right.getRight())); + --i; + } else if (traits.aVowel().equals(left.getRight()) + && left.getLeft() != null + && right.getLeft() == null + && traits.isWowelThatRequiresAChen(right.getRight()) + && false /* TODO(DLC)[EWTS->Tibetan]: ewts kaM is bothersome now */) { + npl.prepend(new TPair(traits, left.getLeft(), right.getRight())); + --i; + } else { + npl.prepend(right); + if (i == 1) + npl.prepend(left); + } + } + } + + TPairList nnpl; + if (true) { + // Collapse ( . wowel1) ( . wowel2) into ( + // . wowel1+wowel2). Then collapse (* . a) ( . x) into (* + // . x). Also, if an a-chen (\u0f68) is implied, then + // insert it. + TPairList xnnpl = new TPairList(traits, pl.size()); + for (int i = 0; i < npl.size(); ) { + TPair p = npl.get(i); + int set_i_to = i + 1; + if (p.getLeft() == null + && p.getRight() != null + && !traits.disambiguator().equals(p.getRight()) + && !"+".equals(p.getRight())) { + StringBuffer sb = new StringBuffer(p.getRight()); + for (int j = i + 1; j < npl.size(); j++) { + TPair p2 = npl.get(j); + if (p2.getLeft() == null + && p2.getRight() != null + && !traits.disambiguator().equals(p2.getRight()) + && !"+".equals(p2.getRight())) + { + sb.append("+" + p2.getRight()); + set_i_to = j + 1; + } else { + break; + } + } + p = new TPair(traits, traits.aVowel(), sb.toString()); + } + // TODO(DLC)[EWTS->Tibetan]: Do we still have "ai" converting to the wrong thing. "ae"? + xnnpl.append(p); + i = set_i_to; + } + + nnpl = new TPairList(traits, pl.size()); + // (* . a ) ( . x) ... ( . y) -> (* . 
a+x+...+y) + for (int i = 0; i < xnnpl.size(); ) { + TPair p = xnnpl.get(i); + int set_i_to = i + 1; + if (traits.aVowel().equals(p.getRight())) { + StringBuffer sb = new StringBuffer(p.getRight()); + for (int j = i + 1; j < xnnpl.size(); j++) { + TPair p2 = xnnpl.get(j); + if (p2.getLeft() == null + && p2.getRight() != null + && !traits.disambiguator().equals(p2.getRight()) + && !"+".equals(p2.getRight())) + { + // TODO(DLC)[EWTS->Tibetan] a+o+e is what we'll get.. maybe we want just o+e? + sb.append("+" + p2.getRight()); + set_i_to = j + 1; + } else { + break; + } + } + p = new TPair(traits, p.getLeft(), sb.toString()); + } + + if (false) { // TODO(DLC)[EWTS->Tibetan]: bra is screwed up, do in it stacklist? + // EWTS does not think that kra is k+ra. Replace + // (consonant . ) with (consonant . DISAMBIGUATOR): + if (p.getRight() == null && p.getLeft() != null + && i + 1 < xnnpl.size()) + p = new TPair(traits, p.getLeft(), traits.disambiguator()); + } + + nnpl.append(p); + i = set_i_to; + } + } else { + // TODO(DLC)[EWTS->Tibetan]: this block is not executing. kill it after testing and thinking + nnpl = new TPairList(traits, pl.size()); + + for (int i = npl.size() - 1; i >= 0; i--) { + TPair p = npl.get(i); + if (p.getLeft() == null + && p.getRight() != null + && !traits.disambiguator().equals(p.getRight()) + && !"+".equals(p.getRight())) /* TODO(DLC)[EWTS->Tibetan] this should be equivalent to isWowel(p.getRight()) but o+o shows that's not true yet */ + p = new TPair(traits, traits.aVowel(), p.getRight()); + // TODO(DLC)[EWTS->Tibetan]: do you still have "ai" converting to the wrong thing? ("ae" also?) + nnpl.prepend(p); + } + } + + // TODO(DLC)[EWTS->Tibetan]: this nnpl crap was before getFirstConsonantAndVowel got fixed. Try killing it! 
+ return new TPairList[] { + nnpl, null + }; + } + // TODO(DLC)[EWTS->Tibetan]: doc private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) { @@ -190,7 +255,7 @@ class TPairListFactory { TPairList tail; if ((tail = breakHelperEWTS(ewtsBuf.substring(howMuch), - ttraits)).hasSimpleError(ttraits)) { + ttraits)).hasSimpleError()) { for (int i = 1; i < howMuch; i++) { // try giving i characters back if that leaves us with // a legal head and makes the rest free of simple @@ -199,7 +264,7 @@ class TPairListFactory { TPair newHead; if ((newHead = head.minusNRightmostTransliterationCharacters(i)).isLegal() && !(newTail - = breakHelperEWTS(ewtsBuf.substring(howMuch - i), ttraits)).hasSimpleError(ttraits)) { + = breakHelperEWTS(ewtsBuf.substring(howMuch - i), ttraits)).hasSimpleError()) { newTail.prepend(newHead); return newTail; } @@ -211,101 +276,193 @@ class TPairListFactory { return tail; } - /** Returns the largest TPair we can make from the acip starting - * from the left. This will return a size zero pair if and only - * if acip is the empty string; otherwise, it may return a pair - * with either the left or right component empty. This mutates - * acip when we run into {NA+YA}; it mutates acip into {N+YA}. - * For {NE+YA}, it does not mutate acip or behave intelligently. - * A later phase will need to turn that into {N+YE} or an error - * or whatever you like. howMuch[0] will be set to the number of - * characters of acip that this call has consumed. */ - private static TPair getFirstConsonantAndVowel(StringBuffer acip, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it? + private static String GetInitialVowel(TTraits ttraits, String tx, + String startOfVowel) { + if (null == startOfVowel) startOfVowel = ""; + boolean startsWithPlus = false; + if (!"".equals(startOfVowel) + && (!ttraits.vowelsMayStack() + || (tx.length() < 1 || !(startsWithPlus = tx.substring(0, 1).equals("+"))))) + return ("".equals(startOfVowel) ? 
null : startOfVowel); + if (startsWithPlus) + tx = tx.substring(1); + for (int i = Math.min(ttraits.maxWowelLength(), tx.length()); i >= 1; i--) { + String t = tx.substring(0, i); + if (ttraits.isWowel(t) + || (ttraits.isACIP() + // Or these, which we massage into "Am", "Am:", and + // "A:" because I didn't think {Pm} should be treated + // like {PAm} originally: + // TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE + && ("m".equals(t) || "m:".equals(t) || ":".equals(t)))) { + // If this is followed by +wowel[+wowel[+wowel... in EWTS then that's part of the vowel also: + return GetInitialVowel(ttraits, + tx.substring(i), + startOfVowel + (startsWithPlus ? "+" : "") + t); + } + } + return null; + } + + + /** Returns the largest TPair we can make from the transliteration + * starting from the left. This will return a size zero pair if + * and only if tx is the empty string; otherwise, it may return a + * pair with either the left or right component empty. [FOR + * ACIP:] This mutates tx when we run into {NA+YA}; it mutates tx + * into {N+YA}. For {NE+YA}, it does not mutate tx or behave + * intelligently. A later phase will need to turn that into + * {N+YE} or an error or whatever you like. howMuch[0] will be + * set to the number of characters of tx that this call has + * consumed. */ + private static TPair getFirstConsonantAndVowel(StringBuffer tx, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it? int howMuch[], TTraits ttraits) { - // Note that it is *not* the case that if acip.substring(0, N) + // To handle EWTS "phywa\\u0f84\u0f86" [yes that's two slashes + // and then one slash], for example, we need to make the wowel + // (the getRight() field of the returned TPair) contain + // everything that it should. + // + // It can't hurt in ACIP, though I don't recall if ACIP's lexer + // allows Unicode characters. 
+ TPair og = helpGetFirstConsonantAndVowel(tx, howMuch, ttraits); + int len = tx.length(); + StringBuffer x = null; + while (howMuch[0] < len) { + if (isUnicodeWowelChar(tx.charAt(howMuch[0]))) { + if (null == x) x = new StringBuffer(); // rarely happens + if (x.length() > 0) x.append('+'); + x.append(tx.charAt(howMuch[0]++)); + } else { + break; + } + } + // In EWTS, deal with M, ~M`, etc. They're much like + // UnicodeWowelCharacters. + if (ttraits instanceof EWTSTraits) { + EWTSTraits tt = (EWTSTraits)ttraits; + while (howMuch[0] < len) { + int howMuchExtra[] = new int[] { 0 }; + TPair p + = helpGetFirstConsonantAndVowel(new StringBuffer(tx.substring(howMuch[0])), + howMuchExtra, + ttraits); + if (p.getLeft() == null + && p.getRight() != null + && tt.isWowelThatRequiresAChen(p.getRight())) { + if (null == x) x = new StringBuffer(); // rarely happens + String extra; + if (x.length() > 0) x.append('+'); + x.append(extra = tx.substring(howMuch[0], howMuch[0] + howMuchExtra[0])); + // System.out.println("extra is " + extra); TODO(DLC)[EWTS->Tibetan] + howMuch[0] += howMuchExtra[0]; + } else { + break; + } + } + } + if (null != x) + return new TPair(ttraits, og.getLeft(), + (null == og.getRight() || ttraits.aVowel().equals(og.getRight())) + ? x.toString() + : (og.getRight() + "+" + x.toString())); + else + return og; + } + private static TPair helpGetFirstConsonantAndVowel(StringBuffer tx, // TODO(DLC)[EWTS->Tibetan]: function name needs ACIP in it? + int howMuch[], + TTraits ttraits) { + // Note that it is *not* the case that if tx.substring(0, N) // is legal (according to TPair.isLegal()), then - // acip.substring(0, N-1) is legal for all N. For example, + // tx.substring(0, N-1) is legal for all N. For example, // think of ACIP's {shA} and {KshA}. However, 's' is the only - // tricky fellow, so it is true that acip.substring(0, N-1) is - // either legal or ends with 's' if acip.substring(0, N) is - // legal. 
+ // tricky fellow in ACIP, so in ACIP it is true that + // tx.substring(0, N-1) is either legal or ends with 's' if + // tx.substring(0, N) is legal. // // We don't, however, use this approach. We just try to find // a consonant of length 3, and then, failing that, of length // 2, etc. Likewise with vowels. This avoids the issue. - int i, xl = acip.length(); + int i, xl = tx.length(); + // TODO(DLC)[EWTS->Tibetan]: nasty special case! + if (false && !ttraits.isACIP() /* TODO(DLC)[EWTS->Tibetan]: isEWTS! */ + && xl >= 2 && tx.charAt(0) == 'a' && (tx.charAt(1) == 'i' || tx.charAt(1) == 'u')) { + howMuch[0] = 2; + return new TPair(ttraits, null, tx.substring(0, 2)); + // TODO(DLC)[EWTS->Tibetan]: test that "au" alone is \u0f68\u0f7d, "ai" alone is \u0f68\u0f7b in EWTS. + } if (0 == xl) { howMuch[0] = 0; return new TPair(ttraits, null, null); } - if (acip.charAt(0) == ttraits.disambiguatorChar()) { + if (tx.charAt(0) == ttraits.disambiguatorChar()) { howMuch[0] = 1; return new TPair(ttraits, null, ttraits.disambiguator()); } - char ch = acip.charAt(0); + char ch = tx.charAt(0); // Numbers never appear in stacks, so if you see 1234, that's - // like seeing 1-2-3-4. + // like seeing 1-2-3-4. Though in EWTS you can have '0\u0f19' if (ch >= '0' && ch <= '9') { + // TODO(DLC)[EWTS->Tibetan]: test case: 0e should have a-chen and 0\u0f74 should go through without errors. + if (xl > 1 && ttraits.isUnicodeWowel(tx.charAt(1))) { + howMuch[0] = 2; + return new TPair(ttraits, tx.substring(0, 1), tx.substring(1, 2)); + } + howMuch[0] = 1; // not 2... - return new TPair(ttraits, acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); + return new TPair(ttraits, tx.substring(0, 1), (xl == 1) ? 
null : ttraits.disambiguator()); } String l = null, r = null; for (i = Math.min(ttraits.maxConsonantLength(), xl); i >= 1; i--) { String t = null; - if (ttraits.isConsonant(t = acip.substring(0, i))) { + if (ttraits.isConsonant(t = tx.substring(0, i)) + || (ttraits.vowelAloneImpliesAChen() // handle EWTS {a+yo} + && ttraits.aVowel().equals(tx.substring(0, i)) + && i < xl && tx.substring(i, i + i).equals("+"))) { l = t; break; } } int ll = (null == l) ? 0 : l.length(); - if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) { + if (null != l && xl > ll && tx.charAt(ll) == ttraits.disambiguatorChar()) { howMuch[0] = l.length() + 1; return new TPair(ttraits, l, ttraits.disambiguator()); } - if (null != l && xl > ll && acip.charAt(ll) == '+') { + if (null != l && xl > ll && tx.charAt(ll) == '+') { howMuch[0] = l.length() + 1; return new TPair(ttraits, l, "+"); } - for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) { - String t = null; - if (ttraits.isWowel(t = acip.substring(ll, ll + i)) - // Or these, which we massage into "Am", "Am:", and - // "A:" because I didn't think {Pm} should be treated - // like {PAm} originally: - // TODO(DLC)[EWTS->Tibetan]: NOW NIGHTMARE - || "m".equals(t) || "m:".equals(t) || ":".equals(t)) { - r = t; - break; - } - } - - // Treat {BATA+SA'I} like {BAT+SA'I}: - int z; - if (null != l && /* TODO(DLC)[EWTS->Tibetan]: */"A".equals(r) && ((z = ll + /* TODO(DLC)[EWTS->Tibetan]: */"A".length()) < xl) - && acip.charAt(z) == '+') { - acip.deleteCharAt(z-1); - howMuch[0] = l.length() + 1; - return new TPair(ttraits, l, "+"); - } - - // Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */ int mod = 0; - if ("m".equals(r)) { r = "Am"; mod = -1; } - if (":".equals(r)) { r = "A:"; mod = -1; } - if ("m:".equals(r)) { r = "Am:"; mod = -1; } - if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though... 
+ r = GetInitialVowel(ttraits, tx.substring(ll), null); + if (ttraits.isACIP()) { + // Treat {BATA+SA'I} like {BAT+SA'I}: // TODO(DLC)[EWTS->Tibetan]: in EWTS??? + int z; + if (null != l + && ttraits.aVowel().equals(r) + && ((z = ll + ttraits.aVowel().length()) < xl) + && tx.charAt(z) == '+') { + tx.deleteCharAt(z-1); + howMuch[0] = l.length() + 1; + return new TPair(ttraits, l, "+"); + } + + // Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: in EWTS? */ + if ("m".equals(r)) { r = "Am"; mod = -1; } + if (":".equals(r)) { r = "A:"; mod = -1; } + if ("m:".equals(r)) { r = "Am:"; mod = -1; } + if (":m".equals(r)) { r = "A:m"; mod = -1; } // not seen, though... + } // what if we see a character that's not part of any wowel or // consonant? We return it. if (null == l && null == r) { howMuch[0] = 1; // not 2... // add a disambiguator to avoid exponential running time: - return new TPair(ttraits, acip.substring(0, 1), + return new TPair(ttraits, tx.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); } @@ -314,6 +471,13 @@ class TPairListFactory { + mod); return new TPair(ttraits, l, r); } // TODO(DLC)[EWTS->Tibetan]: + + private static boolean isUnicodeWowelChar(char ch) { + return ((ch >= '\u0f71' && ch <= '\u0f84') + || "\u0f35\u0f37\u0f18\u0f19\u0f3e\u0f3f\u0f86\u0f87\u0fc6".indexOf(ch) >= 0); + // TODO(dchandler): should we really allow "phywa\\u0f18", or + // does \u0f18 only combine with digits? + } } diff --git a/source/org/thdl/tib/text/ttt/TParseTree.java b/source/org/thdl/tib/text/ttt/TParseTree.java index 2dba84a..f81b433 100644 --- a/source/org/thdl/tib/text/ttt/TParseTree.java +++ b/source/org/thdl/tib/text/ttt/TParseTree.java @@ -18,8 +18,6 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlDebug; - import java.util.ArrayList; /** A list of non-empty list of {@link TStackListList @@ -129,6 +127,10 @@ class TParseTree { if (sz == 1) { return up.get(0); } else if (sz > 1) { + // TODO(DLC)[EWTS->Tibetan]: does this still happen? If so, when? + // + // System.out.println("SHO NUFF, >1 non-illegal parses still happens"); + // {PADMA}, for example. Our technique is to go from the // left and stack as much as we can. So {PA}{D}{MA} is // inferior to {PA}{D+MA}, and {PA}{D+MA}{D}{MA} is @@ -279,7 +281,8 @@ class TParseTree { public String getWarning(String warningLevel, TPairList pl, String originalACIP, - boolean shortMessages) { + boolean shortMessages, + TTraits traits) { // ROOM_FOR_IMPROVEMENT: Allow one tsheg bar to have multiple // warnings/errors associated with it. Make this a private // subroutine, and have the public getWarning(..) call on this @@ -301,7 +304,7 @@ class TParseTree { if (shortMessages) return "501: Using " + bestParse + ", not " + noPrefixTestsUniqueParse.get(0); else - return "501: Using " + bestParse + ((null != originalACIP) ? (" for the ACIP {" + originalACIP + "}") : "") + ", but only because the tool's knowledge of prefix rules (see the documentation) says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")"; + return "501: Using " + bestParse + ((null != originalACIP) ? (" for the " + traits.shortTranslitName() + " {" + originalACIP + "}") : "") + ", but only because the tool's knowledge of prefix rules (see the documentation) says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")"; } } @@ -321,27 +324,31 @@ class TParseTree { // FIXME: The caller will prepend "WARNING " to this error! 
if (ErrorsAndWarnings.isEnabled(101, warningLevel)) return ErrorsAndWarnings.getMessage(101, shortMessages, - translit); + translit, + traits); } else { if (bestParse.hasStackWithoutVowel(pl, isLastStack)) { if (isLastStack[0]) { if (ErrorsAndWarnings.isEnabled(502, warningLevel)) return ErrorsAndWarnings.getMessage(502, shortMessages, - translit); + translit, + traits); } else { throw new Error("Can't happen now that we stack greedily"); } } if (ErrorsAndWarnings.isEnabled(503, warningLevel)) return ErrorsAndWarnings.getMessage(503, shortMessages, - translit); + translit, + traits); } } else { if (nip.get(0).hasStackWithoutVowel(pl, isLastStack)) { if (isLastStack[0]) { if (ErrorsAndWarnings.isEnabled(502, warningLevel)) return ErrorsAndWarnings.getMessage(502, shortMessages, - translit); + translit, + traits); } else { throw new Error("Can't happen now that we stack greedily [2]"); } @@ -362,7 +369,8 @@ class TParseTree { ++plnum; if (ErrorsAndWarnings.isEnabled(505, warningLevel)) return ErrorsAndWarnings.getMessage(505, shortMessages, - translit); + translit, + traits); } plnum = 0; for (int stackNum = 0; stackNum < bestParse.size(); stackNum++) { @@ -380,14 +388,16 @@ class TParseTree { else if (type == 1) if (ErrorsAndWarnings.isEnabled(506, warningLevel)) return ErrorsAndWarnings.getMessage(506, shortMessages, - translit); + translit, + traits); } else { if (type == 0) type = 1; else if (type == -1) if (ErrorsAndWarnings.isEnabled(506, warningLevel)) return ErrorsAndWarnings.getMessage(506, shortMessages, - translit); + translit, + traits); } } if (stackSize > 1 && tp.getLeft() != null && tp.getLeft().length() > 1) { @@ -445,14 +455,16 @@ n+t+s if (ErrorsAndWarnings.isEnabled(warningNum, warningLevel)) return ErrorsAndWarnings.getMessage(warningNum, shortMessages, - translit); + translit, + traits); } while (plnum < pl.size() && pl.get(plnum).isDisambiguator()) { ++plnum; if (ErrorsAndWarnings.isEnabled(505, warningLevel)) return 
ErrorsAndWarnings.getMessage(505, shortMessages, - translit); + translit, + traits); } } } @@ -472,11 +484,13 @@ n+t+s if (pl.size() == 3) { if (ErrorsAndWarnings.isEnabled(508, warningLevel)) return ErrorsAndWarnings.getMessage(508, shortMessages, - translit); + translit, + traits); } else { if (ErrorsAndWarnings.isEnabled(509, warningLevel)) return ErrorsAndWarnings.getMessage(509, shortMessages, - translit); + translit, + traits); } } } @@ -497,11 +511,13 @@ n+t+s if (pl.size() == 2) { if (ErrorsAndWarnings.isEnabled(508, warningLevel)) return ErrorsAndWarnings.getMessage(508, shortMessages, - translit); + translit, + traits); } else { if (ErrorsAndWarnings.isEnabled(509, warningLevel)) return ErrorsAndWarnings.getMessage(509, shortMessages, - translit); + translit, + traits); } } } @@ -513,7 +529,7 @@ n+t+s /** Returns something akin to the ACIP input (okay, maybe 1-2-3-4 * instead of 1234, and maybe AUTPA instead of AUT-PA) * corresponding to this parse tree. */ - public String recoverACIP() { + public String recoverACIP() { // TODO(DLC)[EWTS->Tibetan]: acip-specific ParseIterator pi = getParseIterator(); if (pi.hasNext()) { return pi.next().recoverACIP(); diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index 6007acf..e02a152 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -18,14 +18,12 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import org.thdl.tib.text.TibTextUtils; -import org.thdl.tib.text.TGCList; -import org.thdl.tib.text.DuffCode; - import java.util.ArrayList; -import java.util.HashMap; import java.util.ListIterator; +import org.thdl.tib.text.TGCList; +import org.thdl.tib.text.TibTextUtils; + /** A list of {@link TPairList TPairLists}, each of which is for * a stack (a grapheme cluster), typically corresponding to one tsheg * bar. 
@@ -165,7 +163,7 @@ class TStackList { TPairList pl = get(pairListIndex); TPair p = pl.get(pl.size() - 1); isLegalAndHasAVowelOnRoot - = (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g. + = (p.getRight() != null && p.getRight().startsWith("A")); // could be {A:}, e.g. TODO(DLC)[EWTS->Tibetan]: ??? if (isLegalAndHasAVowelOnRoot) break; } diff --git a/source/org/thdl/tib/text/ttt/TString.java b/source/org/thdl/tib/text/ttt/TString.java index 17c1656..90fb9d1 100644 --- a/source/org/thdl/tib/text/ttt/TString.java +++ b/source/org/thdl/tib/text/ttt/TString.java @@ -18,12 +18,11 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import org.thdl.util.ThdlOptions; -import org.thdl.util.ThdlDebug; -import org.thdl.tib.text.tshegbar.UnicodeUtils; - import java.util.HashSet; -import java.io.*; + +import org.thdl.tib.text.tshegbar.UnicodeUtils; +import org.thdl.util.ThdlDebug; +import org.thdl.util.ThdlOptions; /** * An TString is some Latin text and a type, the type stating whether diff --git a/source/org/thdl/tib/text/ttt/TTGCList.java b/source/org/thdl/tib/text/ttt/TTGCList.java index 0a97971..6eca573 100644 --- a/source/org/thdl/tib/text/ttt/TTGCList.java +++ b/source/org/thdl/tib/text/ttt/TTGCList.java @@ -18,11 +18,11 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; +import java.util.ArrayList; + import org.thdl.tib.text.TGCList; import org.thdl.tib.text.TGCPair; -import java.util.ArrayList; - /** A list of grapheme clusters. * * @author David Chandler */ diff --git a/source/org/thdl/tib/text/ttt/TTraits.java b/source/org/thdl/tib/text/ttt/TTraits.java index d6eac0a..790d847 100644 --- a/source/org/thdl/tib/text/ttt/TTraits.java +++ b/source/org/thdl/tib/text/ttt/TTraits.java @@ -19,6 +19,7 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; import java.util.ArrayList; + import org.thdl.tib.text.DuffCode; /** A TTraits object encapsulates all the things that make a @@ -65,6 +66,11 @@ interface TTraits { * any wowel) */ boolean isConsonant(String s); + /** Returns true if and only if this transliteration scheme supports + * Tibetan Unicode characters and if ch is such a character and is a + * wowel. */ + boolean isUnicodeWowel(char ch); + /** Returns true if and only if s is a stretch of * transliteration corresponding to a Tibetan wowel (without any * [achen or other] consonant) */ @@ -120,6 +126,10 @@ interface TTraits { * null if l is unknown. */ String getUnicodeFor(String l, boolean subscribed); + /** Returns the unicode for a wowel. Returns null if l is + * unknown. */ + String getUnicodeForWowel(String wowel); + /** Returns a scanner that can break up a string of transliteration. */ TTshegBarScanner scanner(); @@ -127,4 +137,78 @@ interface TTraits { /** Gets the duffcodes for wowel, such that they look good with * the preceding glyph, and appends them to duff. */ void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel); + + /** Human-readable name of this transliteration for short error + strings. */ + String shortTranslitName(); + + /** Returns true if and only pair is clearly not valid + transliteration. */ + boolean isClearlyIllegal(TPair pair); + + /** Returns one or two new TPairList instances. Breaks a + * transliterated tsheg bar (roughly a "syllable") into + * chunks; this computes l' (for you design doc enthusiasts). + * + *

Here's a rough sketch of the algorithm: run along getting + * the current TPair as big as you can. If you get it very big, + * but there's something illegal afterward that wouldn't + * otherwise be illegal, undo as little as possible to correct. + * For example, ACIP {G'A'I} becomes [(G . 'A), (' . I)], and + * ACIP {TAA} becomes [(T . A)] in a first pass but then we see + * that the rest would be suboptimal, so we backtrack to [(T . )] + * and then finally become [(T . ), (A . A)]. We look for (A . ) + * and ( . ) in the rest in order to say "the rest would + * be suboptimal", i.e. we use {@link + * TPairList#hasSimpleError()}.

+ * + *

There is one case where we break things up into two pair + * lists if and only if specialHandlingForAppendages is true -- I + * thought the converter had a bug because I saw ACIP {SNYAM'AM} + * in KD0003I2.ACT. I asked Robert Chilton, though, and he said + * "SNYAM'AM " was likely a typo for "SNYAM 'AM", so leave + * specialHandlingForAppendages false.

+ * + *

I found out about (OK, as it turns out, imagined) this case + * too late to do anything clean about it. ACIP {SNYAM'AM}, + * e.g., breaks up into [(S . ), (NY . A), (M . 'A), (M . )], + * which is incorrect -- [(S . ), (NY . A), (M . ), (' . A), (M + * . )] is correct. But we don't know which is correct without + * parsing, so both are returned. The clean treatment would be + * to lex into a form that didn't insist ACIP {'A} was either a + * vowel or a consonant. Then the parser would figure it out. + * But don't bother, because specialHandlingForAppendages should + * be false always.

+ * + * @param tt a string of transliteration corresponding to a tsheg + * bar (i.e., it has no punctuation in it) + * @param specialHandlingForAppendages true if and only if you + * want ACIP {SNYAM'AM} to ultimately parse as {S+NYA}{M}{'A}{M} + * instead of {S+NYA}{M'A}{M} + * @return an array of length two consisting of one or two pair + * lists. If the former, then the second element will be null, + * if the latter, the second element will have (* . ), (' . *) + * instead of (* . '*) which the former has. */ + TPairList[] breakTshegBarIntoChunks(String tt, + boolean specialHandlingForAppendages); + + /** Returns true if and only if these are ACIP transliteration's + traits. TODO(dchandler): get rid of this function. Any + caller is employing a hack. */ + boolean isACIP(); + + /** Returns true if and only if a vowel all by its lonesome has an + * implied a-chen (U+0F68) with it. (ACIP requires "AI" to + * represent a-chen with gigu, but EWTS requires "i".)*/ + boolean vowelAloneImpliesAChen(); + + /** Returns true if and only if multiple vowels (TODO(dchandler): + * wowels?) may appear on a single consonant stack via the + * stacking operator, '+'. */ + boolean vowelsMayStack(); + + /** Returns true if and only if pl could represent one TPairList + in a tsheg bar. (EWTS's list of standard stacks comes into + play; ACIP always returns true.) */ + boolean couldBeValidStack(TPairList pl); } diff --git a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java index 0835a3b..fcbdab8 100644 --- a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java @@ -18,16 +18,12 @@ Contributor(s): ______________________________________. 
package org.thdl.tib.text.ttt; -import java.io.IOException; -import java.io.FileInputStream; -import java.io.InputStreamReader; -import java.io.InputStream; import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; import java.util.ArrayList; -import java.util.Stack; - -import org.thdl.util.ThdlDebug; -import org.thdl.util.ThdlOptions; /** * A TTshegBarScanner is able to break up Strings of transliterated diff --git a/source/org/thdl/util/HTMLPane.java b/source/org/thdl/util/HTMLPane.java index bfd9051..884110b 100644 --- a/source/org/thdl/util/HTMLPane.java +++ b/source/org/thdl/util/HTMLPane.java @@ -21,8 +21,9 @@ package org.thdl.util; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import javax.swing.JScrollPane; + import javax.swing.JEditorPane; +import javax.swing.JScrollPane; /** An HTMLPane is a JScrollPane displaying the contents of an HTML * file. DLC FIXME: at present, neither internal nor external diff --git a/source/org/thdl/util/Link.java b/source/org/thdl/util/Link.java index 04f5d53..65396b7 100644 --- a/source/org/thdl/util/Link.java +++ b/source/org/thdl/util/Link.java @@ -17,7 +17,6 @@ Contributor(s): ______________________________________. */ package org.thdl.util; -import java.io.*; /** Used by {@link SimplifiedLinkedList} to provide the implementation of a simple dynamic link list. diff --git a/source/org/thdl/util/RTFFixerInputStream.java b/source/org/thdl/util/RTFFixerInputStream.java index a2744b0..99923e0 100644 --- a/source/org/thdl/util/RTFFixerInputStream.java +++ b/source/org/thdl/util/RTFFixerInputStream.java @@ -18,13 +18,11 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import org.thdl.util.ThdlDebug; - -import java.util.ArrayList; -import java.io.IOException; -import java.io.FilterInputStream; import java.io.BufferedInputStream; +import java.io.FilterInputStream; +import java.io.IOException; import java.io.InputStream; +import java.util.ArrayList; /** Provides an input stream that fixes another RTF input stream so diff --git a/source/org/thdl/util/RTFFixerInputStreamTest.java b/source/org/thdl/util/RTFFixerInputStreamTest.java index 55057e3..c979fb7 100644 --- a/source/org/thdl/util/RTFFixerInputStreamTest.java +++ b/source/org/thdl/util/RTFFixerInputStreamTest.java @@ -18,11 +18,12 @@ Contributor(s): ______________________________________. package org.thdl.util; -import junit.framework.TestCase; import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.UnsupportedEncodingException; +import junit.framework.TestCase; + /** * @author David Chandler * diff --git a/source/org/thdl/util/RTFPane.java b/source/org/thdl/util/RTFPane.java index 7d2e1a6..283bdf2 100644 --- a/source/org/thdl/util/RTFPane.java +++ b/source/org/thdl/util/RTFPane.java @@ -21,11 +21,12 @@ package org.thdl.util; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; + import javax.swing.JScrollPane; import javax.swing.JTextPane; +import javax.swing.text.BadLocationException; import javax.swing.text.DefaultStyledDocument; import javax.swing.text.rtf.RTFEditorKit; -import javax.swing.text.BadLocationException; /** An RTFPane is a JScrollPane displaying the contents of a rich text file (an RTF file). */ diff --git a/source/org/thdl/util/SimpleFrame.java b/source/org/thdl/util/SimpleFrame.java index aee7f97..84a56cf 100644 --- a/source/org/thdl/util/SimpleFrame.java +++ b/source/org/thdl/util/SimpleFrame.java @@ -18,13 +18,12 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import javax.swing.JFrame; -import java.awt.Container; import java.awt.Component; +import java.awt.Container; import java.awt.event.ComponentAdapter; import java.awt.event.ComponentEvent; -import org.thdl.util.RTFPane; +import javax.swing.JFrame; /** An SimpleFrame is a top-level window displaying a JScrollPane. */ public class SimpleFrame extends JFrame { diff --git a/source/org/thdl/util/SimplifiedLinkedList.java b/source/org/thdl/util/SimplifiedLinkedList.java index b9527ac..72261ff 100644 --- a/source/org/thdl/util/SimplifiedLinkedList.java +++ b/source/org/thdl/util/SimplifiedLinkedList.java @@ -18,7 +18,7 @@ Contributor(s): ______________________________________. package org.thdl.util; -import java.io.*; +import java.io.PrintWriter; /** Implementation of a simple dynamic link list. Be careful with word order! Why not just use java.util.LinkedList? It is not supported for the diff --git a/source/org/thdl/util/SimplifiedListIterator.java b/source/org/thdl/util/SimplifiedListIterator.java index 2d7a559..8a0a90d 100644 --- a/source/org/thdl/util/SimplifiedListIterator.java +++ b/source/org/thdl/util/SimplifiedListIterator.java @@ -17,7 +17,7 @@ Contributor(s): ______________________________________. */ package org.thdl.util; -import java.util.*; +import java.util.LinkedList; /** Used by {@link LinkedList} to provide the implementation of a simple dynamic link list. diff --git a/source/org/thdl/util/StatusBar.java b/source/org/thdl/util/StatusBar.java index 262240c..d100f8e 100644 --- a/source/org/thdl/util/StatusBar.java +++ b/source/org/thdl/util/StatusBar.java @@ -18,11 +18,13 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import java.awt.*; -import java.awt.event.*; -import javax.swing.*; import java.util.Stack; +import javax.swing.BoxLayout; +import javax.swing.JLabel; +import javax.swing.JPanel; +import javax.swing.SwingConstants; + /** A StatusBar can be added to a component, typically to the bottom of it, in order to show the user the status of the program. There are methods to change the status, and there are actually a LIFO diff --git a/source/org/thdl/util/ThdlAbstractAction.java b/source/org/thdl/util/ThdlAbstractAction.java index 82b421e..8d2b411 100644 --- a/source/org/thdl/util/ThdlAbstractAction.java +++ b/source/org/thdl/util/ThdlAbstractAction.java @@ -18,11 +18,10 @@ Contributor(s): ______________________________________. package org.thdl.util; -import javax.swing.AbstractAction; -import javax.swing.Icon; import java.awt.event.ActionEvent; -import org.thdl.util.ThdlDebug; +import javax.swing.AbstractAction; +import javax.swing.Icon; /** * This ActionListener is like any other except in the way that it diff --git a/source/org/thdl/util/ThdlActionListener.java b/source/org/thdl/util/ThdlActionListener.java index 349a098..4847f7b 100644 --- a/source/org/thdl/util/ThdlActionListener.java +++ b/source/org/thdl/util/ThdlActionListener.java @@ -18,10 +18,8 @@ Contributor(s): ______________________________________. package org.thdl.util; -import java.awt.event.ActionListener; import java.awt.event.ActionEvent; - -import org.thdl.util.ThdlDebug; +import java.awt.event.ActionListener; /** * This ActionListener is like any other except in the way that it diff --git a/source/org/thdl/util/ThdlDebug.java b/source/org/thdl/util/ThdlDebug.java index 8cee4e4..c954cf0 100644 --- a/source/org/thdl/util/ThdlDebug.java +++ b/source/org/thdl/util/ThdlDebug.java @@ -18,12 +18,9 @@ Contributor(s): ______________________________________. 
package org.thdl.util; -import java.io.PrintStream; -import java.io.FileOutputStream; import java.io.File; - -import org.thdl.util.TeeStream; -import org.thdl.util.ThdlOptions; +import java.io.FileOutputStream; +import java.io.PrintStream; /** * This uninstantiable class provides assertions and the like in a diff --git a/source/org/thdl/util/ThdlI18n.java b/source/org/thdl/util/ThdlI18n.java index b78f3d6..56d98a5 100644 --- a/source/org/thdl/util/ThdlI18n.java +++ b/source/org/thdl/util/ThdlI18n.java @@ -2,6 +2,7 @@ package org.thdl.util; import java.util.Locale; import java.util.ResourceBundle; + import javax.swing.JComponent; public class ThdlI18n { diff --git a/source/org/thdl/util/ThdlLazyExceptionTest.java b/source/org/thdl/util/ThdlLazyExceptionTest.java index 386acc0..ba8bdf7 100644 --- a/source/org/thdl/util/ThdlLazyExceptionTest.java +++ b/source/org/thdl/util/ThdlLazyExceptionTest.java @@ -18,9 +18,9 @@ Contributor(s): ______________________________________. package org.thdl.util; -import junit.framework.TestCase; +import java.io.IOException; -import java.io.IOException; /* a checked exception */ +import junit.framework.TestCase; /** * @author David Chandler diff --git a/source/org/thdl/util/ThdlOptions.java b/source/org/thdl/util/ThdlOptions.java index 8261290..015a0c7 100644 --- a/source/org/thdl/util/ThdlOptions.java +++ b/source/org/thdl/util/ThdlOptions.java @@ -18,17 +18,14 @@ Contributor(s): ______________________________________. package org.thdl.util; -import java.io.InputStream; +import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.io.IOException; -import java.io.File; -import java.io.FileNotFoundException; +import java.io.InputStream; import java.util.Properties; -import org.thdl.util.ThdlLazyException; -import org.thdl.util.OperatingSystemUtils; - /** * Provides a clean interface to the multi-tiered system of user * preferences (also known as options). 
diff --git a/source/org/thdl/util/Trie.java b/source/org/thdl/util/Trie.java index 760382e..64f02bc 100644 --- a/source/org/thdl/util/Trie.java +++ b/source/org/thdl/util/Trie.java @@ -81,7 +81,6 @@ Contributor(s): ______________________________________. package org.thdl.util; -import org.thdl.util.ThdlDebug; /** * A digital search trie for 7-bit ASCII text. The API is a subset of