From 045c4069c9bb21bf44455c7a38a8a7d71313e5fb Mon Sep 17 00:00:00 2001 From: dchandler Date: Sun, 31 Aug 2003 16:06:35 +0000 Subject: [PATCH] Preliminary ACIP->TMW support is in place. {DU} gives you something less beautiful than what Jskad would give, so more work is needed. --- source/org/thdl/tib/input/ConvertDialog.java | 5 +- source/org/thdl/tib/input/DuffPane.java | 6 +- .../tib/input/FontConverterConstants.java | 5 +- .../org/thdl/tib/input/TibetanConverter.java | 18 +- source/org/thdl/tib/text/TibetanDocument.java | 48 +++++ source/org/thdl/tib/text/tibwn.ini | 4 +- .../org/thdl/tib/text/ttt/ACIPConverter.java | 188 +++++++++++++----- source/org/thdl/tib/text/ttt/ACIPRules.java | 79 +++++++- source/org/thdl/tib/text/ttt/PackageTest.java | 10 + source/org/thdl/tib/text/ttt/TPair.java | 23 ++- source/org/thdl/tib/text/ttt/TPairList.java | 19 ++ source/org/thdl/tib/text/ttt/TStackList.java | 14 ++ 12 files changed, 355 insertions(+), 64 deletions(-) diff --git a/source/org/thdl/tib/input/ConvertDialog.java b/source/org/thdl/tib/input/ConvertDialog.java index 1eb3238..afbdafa 100644 --- a/source/org/thdl/tib/input/ConvertDialog.java +++ b/source/org/thdl/tib/input/ConvertDialog.java @@ -69,7 +69,8 @@ class ConvertDialog extends JDialog ConvertDialog.this.theRealActionPerformed(e); }}; private void updateWarningLevels() { - if (choices.getSelectedItem() == ACIP_TO_UNI) + if (choices.getSelectedItem() == ACIP_TO_UNI + || choices.getSelectedItem() == ACIP_TO_TMW) this.warningLevels.enable(); else this.warningLevels.disable(); @@ -418,7 +419,7 @@ class ConvertDialog extends JDialog newFileNamePrefix = suggested_WYLIE_prefix; } else if (TMW_TO_UNI == ct || ACIP_TO_UNI == ct) { newFileNamePrefix = suggested_TO_UNI_prefix; - } else if (TM_TO_TMW == ct) { + } else if (TM_TO_TMW == ct || ACIP_TO_TMW == ct) { newFileNamePrefix = suggested_TO_TMW_prefix; } else { ThdlDebug.verify(TMW_TO_TM == ct); diff --git a/source/org/thdl/tib/input/DuffPane.java 
b/source/org/thdl/tib/input/DuffPane.java index fce1391..27e78e8 100644 --- a/source/org/thdl/tib/input/DuffPane.java +++ b/source/org/thdl/tib/input/DuffPane.java @@ -615,10 +615,8 @@ public class DuffPane extends TibetanPane implements FocusListener { */ public void setRomanAttributeSet(String font, int size) { if (getTibDoc() != null) { - SimpleAttributeSet ras = new SimpleAttributeSet(); - StyleConstants.setFontFamily(ras, romanFontFamily = font); - StyleConstants.setFontSize(ras, romanFontSize = size); - getTibDoc().setRomanAttributeSet(ras); + getTibDoc().setRomanAttributeSet(romanFontFamily = font, + romanFontSize = size); } } diff --git a/source/org/thdl/tib/input/FontConverterConstants.java b/source/org/thdl/tib/input/FontConverterConstants.java index 95fecbd..fa25303 100644 --- a/source/org/thdl/tib/input/FontConverterConstants.java +++ b/source/org/thdl/tib/input/FontConverterConstants.java @@ -27,6 +27,7 @@ import java.awt.*; interface FontConverterConstants { final String ACIP_TO_UNI = "ACIP to Unicode"; + final String ACIP_TO_TMW = "ACIP to TMW"; final String TM_TO_TMW = "TM to TMW"; final String TMW_TO_UNI = "TMW to Unicode"; final String TMW_TO_WYLIE = "TMW to Wylie"; @@ -36,7 +37,9 @@ interface FontConverterConstants final String FIND_ALL_NON_TMW = "Find all non-TMW"; final String FIND_ALL_NON_TM = "Find all non-TM"; - final String[] CHOICES = new String[]{ + final String[] CHOICES = new String[] { + ACIP_TO_UNI, + ACIP_TO_TMW, TM_TO_TMW, TMW_TO_UNI, TMW_TO_WYLIE, diff --git a/source/org/thdl/tib/input/TibetanConverter.java b/source/org/thdl/tib/input/TibetanConverter.java index 947c3ea..d62d292 100644 --- a/source/org/thdl/tib/input/TibetanConverter.java +++ b/source/org/thdl/tib/input/TibetanConverter.java @@ -71,6 +71,7 @@ public class TibetanConverter implements FontConverterConstants { boolean convertToUnicodeMode = false; boolean convertToTMMode = false; boolean convertACIPToUniMode = false; + boolean convertACIPToTMWMode = false; boolean 
convertToTMWMode = false; boolean convertToWylieMode = false; boolean findSomeNonTMWMode = false; @@ -91,6 +92,8 @@ public class TibetanConverter implements FontConverterConstants { = args[0].equals("--to-tibetan-machine-web")) || (convertACIPToUniMode = args[0].equals("--acip-to-unicode")) + || (convertACIPToTMWMode + = args[0].equals("--acip-to-tmw")) || (convertToUnicodeMode = args[0].equals("--to-unicode")) || (convertToWylieMode @@ -180,6 +183,8 @@ public class TibetanConverter implements FontConverterConstants { conversionTag = TM_TO_TMW; } else if (convertACIPToUniMode) { conversionTag = ACIP_TO_UNI; + } else if (convertACIPToTMWMode) { + conversionTag = ACIP_TO_TMW; } else { ThdlDebug.verify(convertToTMMode); conversionTag = TMW_TO_TM; @@ -205,7 +210,7 @@ public class TibetanConverter implements FontConverterConstants { honored. */ static int reallyConvert(InputStream in, PrintStream out, String ct, String warningLevel) { - if (ACIP_TO_UNI == ct) { + if (ACIP_TO_UNI == ct || ACIP_TO_TMW == ct) { try { ArrayList al = ACIPTshegBarScanner.scanStream(in, null, 250 - 1 // DLC FIXME: make me configurable @@ -214,10 +219,17 @@ public class TibetanConverter implements FontConverterConstants { return 47; StringBuffer warnings = new StringBuffer(); boolean embeddedWarnings = (warningLevel != "None"); - if (!ACIPConverter.convertToUnicode(al, out, null, warnings, + if (ACIP_TO_UNI == ct) { + if (!ACIPConverter.convertToUnicode(al, out, null, warnings, + embeddedWarnings, + warningLevel)) + return 46; + } else { + if (!ACIPConverter.convertToTMW(al, out, null, warnings, embeddedWarnings, warningLevel)) - return 46; + return 46; + } if (embeddedWarnings && warnings.length() > 0) return 45; else diff --git a/source/org/thdl/tib/text/TibetanDocument.java b/source/org/thdl/tib/text/TibetanDocument.java index 36e1d6b..48bca39 100644 --- a/source/org/thdl/tib/text/TibetanDocument.java +++ b/source/org/thdl/tib/text/TibetanDocument.java @@ -141,6 +141,32 @@ public class 
TibetanDocument extends DefaultStyledDocument { appendDuff(tibetanFontSize, offset, s, attr); } +/** +* Inserts Latin text into the document. The font size is applied +* automatically, according to the current Roman font size. +* @param offset the position at which you want to insert text +* @param s the string you want to insert +* @see #setRomanAttributeSet(AttributeSet) +*/ + public void appendRoman(int offset, String s) throws BadLocationException { + ThdlDebug.verify(getRomanAttributeSet() != null); + insertString(offset, s, getRomanAttributeSet()); + } + +/** +* Inserts Latin text at the end of the document. The font size is +* applied automatically, according to the current Roman font size. +* @param s the string you want to insert +* @see #setRomanAttributeSet(AttributeSet) +*/ + public void appendRoman(String s) { + try { + appendRoman(getLength(), s); + } catch (BadLocationException e) { + throw new Error("can't happen"); + } + } + private void appendDuff(int fontSize, int offset, String s, MutableAttributeSet attr) { try { StyleConstants.setFontSize(attr, fontSize); @@ -160,6 +186,19 @@ public class TibetanDocument extends DefaultStyledDocument { return insertDuff(tibetanFontSize, pos, glyphs, true); } +/** +* Appends all DuffCodes in glyphs to the end of this document. +*/ + public void appendDuffCodes(DuffCode[] glyphs) { + // PERFORMANCE FIXME: this isn't so speedy, but it reuses + // existing code. + for (int i = 0; i < glyphs.length; i++) { + insertDuff(getLength(), + new DuffData[] { new DuffData(new String(new char[] { glyphs[i].getCharacter() }), + glyphs[i].getFontNum()) }); + } + } + /** Replacing can be more efficient than inserting and then removing. This replaces the glyph at position pos with glyph, @@ -1039,6 +1078,15 @@ public class TibetanDocument extends DefaultStyledDocument { romanAttributeSet = ras; } + /** Sets the attribute set applied to Roman text in this + document. 
*/ + public void setRomanAttributeSet(String font, int size) { + SimpleAttributeSet ras = new SimpleAttributeSet(); + StyleConstants.setFontFamily(ras, font); + StyleConstants.setFontSize(ras, size); + setRomanAttributeSet(ras); + } + /** * Converts the specified portion of this document to THDL Extended * Wylie. diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index de62969..bb8666e 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -29,9 +29,9 @@ $~38,5~~9,41~~~~~~~0F06 #~200,1~~9,39~~~~~~~0F05 // Yig.mgo.tsheg.shad: %~39,5~~9,42~~~~~~~0F07 -// dbu.khang.g-yon: +// dbu.khang.g-yon: (If this changes, edit ACIPConverter) (~208,1~~9,93~~~~~~~0F3C -// dbu.khang.g-yas: +// dbu.khang.g-yas: (If this changes, edit ACIPConverter) )~209,1~~9,94~~~~~~~0F3D H~239,1~~8,92~~~~~~~0F7F diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java index bce35eb..9067613 100644 --- a/source/org/thdl/tib/text/ttt/ACIPConverter.java +++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java @@ -24,10 +24,14 @@ import java.util.Stack; import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; +import org.thdl.tib.text.TibetanDocument; +import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.tib.text.DuffCode; /** -* This class is able to convert an ACIP file into Tibetan Machine Web. -* From there, TMW->Unicode takes you to Unicode. +* This class is able to convert an ACIP file into Unicode +* and an ACIP file into TMW. ACIP->Unicode should yield the same +* results as ACIP->TMW followed by TMW->Unicode (FIXME: test it!)
* @author David Chandler */ public class ACIPConverter { @@ -86,38 +90,70 @@ public class ACIPConverter { warnings = new StringBuffer(); putWarningsInOutput = true; } - convertToUnicode(al, System.out, errors, warnings, - putWarningsInOutput, warningLevel); + convertToTMW(al, System.out, errors, warnings, + putWarningsInOutput, warningLevel); + int retCode = 0; if (errors.length() > 0) { System.err.println("Errors converting ACIP input file: "); System.err.println(errors); System.err.println("The output contains these errors."); System.err.println("Exiting; please fix input file and try again."); - System.exit(2); + retCode = 2; } if (null != warnings && warnings.length() > 0) { System.err.println("Warnings converting ACIP input file: "); System.err.println(warnings); if (putWarningsInOutput) System.err.println("The output contains these warnings."); - System.exit(2); + retCode = 2; } - if (verbose) System.err.println("Converted " + args[0] + " perfectly."); - System.exit(0); + if (0 == retCode) { + if (verbose) System.err.println("Converted " + args[0] + " perfectly."); + } + System.exit(retCode); + // DLC NOW: tRAStA is not converted correctly to Unicode, and + // no warning is given when converting to TMW. } /** Writes TMW/Latin to out. If errors occur in converting a - * tsheg bar, then they are appended to errors if errors is - * non-null. Returns true upon perfect success, false if errors + * tsheg bar, then they are written into the output, and also + * appended to errors if errors is non-null. If warnings occur + * in converting a tsheg bar, then they are written into the + * output if writeWarningsToResult is true, and also appended to + * warnings if warnings is non-null. Returns true upon perfect + * success or if there were merely warnings, false if errors * occurred.
* @throws IOException if we cannot write to out */ - public static boolean convertToTMW(ArrayList scan, String latinFont, - OutputStream out, StringBuffer errors) + public static boolean convertToTMW(ArrayList scan, + OutputStream out, + StringBuffer errors, + StringBuffer warnings, + boolean writeWarningsToResult, + String warningLevel) throws IOException { - throw new Error("DLC UNIMPLEMENTED"); + TibetanDocument tdoc = new TibetanDocument(); + tdoc.setRomanAttributeSet("Courier", 14); // DLC make me configurable. + boolean rv + = convertToTMW(scan, tdoc, errors, warnings, + writeWarningsToResult, warningLevel); + tdoc.writeRTFOutputStream(out); + return rv; } + + private static boolean convertToTMW(ArrayList scan, + TibetanDocument tdoc, + StringBuffer errors, + StringBuffer warnings, + boolean writeWarningsToResult, + String warningLevel) + throws IOException + { + return convertTo(false, scan, null, tdoc, errors, warnings, + writeWarningsToResult, warningLevel); + } + // DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a // space. Treat it as a tsheg only when it appears after a // syllable or another tsheg. @@ -130,7 +166,8 @@ public class ACIPConverter { * or in converting a tsheg bar, then they are appended to * warnings if warnings is non-null, and they are written to the * result if writeWarningsToResult is true. Returns the - * conversion upon perfect success, null if errors occurred. + * conversion upon perfect success or if there were merely + * warnings, null if errors occurred. 
*/ public static String convertToUnicode(String acip, StringBuffer errors, @@ -174,25 +211,43 @@ public class ACIPConverter { boolean writeWarningsToOut, String warningLevel) throws IOException + { + return convertTo(true, scan, out, null, errors, warnings, + writeWarningsToOut, warningLevel); + } + + private static boolean convertTo(boolean toUnicode, // else to TMW + ArrayList scan, + OutputStream out, // for toUnicode mode + TibetanDocument tdoc, // for !toUnicode mode + StringBuffer errors, + StringBuffer warnings, + boolean writeWarningsToOut, + String warningLevel) + throws IOException { int sz = scan.size(); boolean hasErrors = false; - BufferedWriter writer - = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); + BufferedWriter writer = null; + if (toUnicode) + writer + = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); for (int i = 0; i < sz; i++) { ACIPString s = (ACIPString)scan.get(i); int stype = s.getType(); if (stype == ACIPString.ERROR) { hasErrors = true; - writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: "); - writer.write(s.getText()); - writer.write("]"); + String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]"; + if (null != writer) writer.write(text); + if (null != tdoc) tdoc.appendRoman(text); } else if (stype == ACIPString.WARNING) { if (writeWarningsToOut) { - writer.write("[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: "); - writer.write(s.getText()); - writer.write("]"); + String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]"; + if (null != writer) writer.write(text); + if (null != tdoc) tdoc.appendRoman(text); } + // DLC NOW: Warning: We're going with {'}{R}{DA}, but only because our knowledge of prefix rules says that {'}{R+DA} is not a legal Tibetan tsheg bar ("syllable") + if (null != warnings) { warnings.append("Warning: Lexical warning: "); warnings.append(s.getText()); @@ -200,13 +255,15 @@ public class ACIPConverter { } } else { 
if (s.isLatin(stype)) { - if (stype == ACIPString.FOLIO_MARKER) - writer.write("{"); - writer.write(s.getText()); - if (stype == ACIPString.FOLIO_MARKER) - writer.write("}"); + String text + = (((stype == ACIPString.FOLIO_MARKER) ? "{" : "") + + s.getText() + + ((stype == ACIPString.FOLIO_MARKER) ? "}" : "")); + if (null != writer) writer.write(text); + if (null != tdoc) tdoc.appendRoman(text); } else { String unicode = null; + DuffCode[] duff = null; if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) { TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText()); String acipError; @@ -214,7 +271,8 @@ public class ACIPConverter { if ((acipError = pl.getACIPError()) != null) { hasErrors = true; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]"; - writer.write(errorMessage); + if (null != writer) writer.write(errorMessage); + if (null != tdoc) tdoc.appendRoman(errorMessage); if (null != errors) errors.append(errorMessage + "\n"); } else { @@ -222,7 +280,8 @@ public class ACIPConverter { if (null == pt) { hasErrors = true; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]"; - writer.write(errorMessage); + if (null != writer) writer.write(errorMessage); + if (null != tdoc) tdoc.appendRoman(errorMessage); if (null != errors) errors.append(errorMessage + "\n"); } else { @@ -230,7 +289,8 @@ public class ACIPConverter { if (null == sl) { hasErrors = true; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]"; - writer.write(errorMessage); + if (null != writer) writer.write(errorMessage); + if (null != tdoc) tdoc.appendRoman(errorMessage); if (null != errors) errors.append(errorMessage + "\n"); } else { @@ -240,36 +300,74 @@ public class ACIPConverter { s.getText()); if (null != warning) { if (writeWarningsToOut) { - 
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: "); - writer.write(warning); - writer.write("]"); + String text + = ("[#WARNING CONVERTING ACIP DOCUMENT: " + + warning + "]"); + if (null != writer) writer.write(text); + if (null != tdoc) tdoc.appendRoman(text); } if (null != warnings) { warnings.append(warning); warnings.append('\n'); } } - unicode = sl.getUnicode(); - if (null == unicode) throw new Error("FIXME: make this an assertion"); + if (null != writer) { + unicode = sl.getUnicode(); + if (null == unicode) throw new Error("FIXME: make this an assertion 4"); + } + if (null != tdoc) { + duff = sl.getDuff(); + if (0 == duff.length) { + throw new Error("No DuffCodes for stack list " + sl); // FIXME: make this an assertion + } + } } } } } else { - if (stype == ACIPString.START_SLASH) - unicode = "\u0F3C"; - else if (stype == ACIPString.END_SLASH) - unicode = "\u0F3D"; - else - unicode = ACIPRules.getUnicodeFor(s.getText(), false); - if (null == unicode) throw new Error("FIXME: make this an assertion"); + if (stype == ACIPString.START_SLASH) { + if (null != writer) unicode = "\u0F3C"; + if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") }; + } else if (stype == ACIPString.END_SLASH) { + if (null != writer) unicode = "\u0F3D"; + if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") }; + } else { + if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false); + if (null != tdoc) { + if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) { + tdoc.appendRoman(s.getText()); + continue; + } + else { + String wy = ACIPRules.getWylieForACIPOther(s.getText()); + if (null == wy) throw new Error("No wylie for ACIP " + s.getText()); + duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) }; + } + } + } + if (null != writer && null == unicode) + throw new Error("FIXME: make this an assertion 1"); + if (null != tdoc && (null == duff || 0 == duff.length)) + throw new Error("FIXME: make this 
an assertion 2"); } - if (null != unicode) { - writer.write(unicode); + if (null != writer && null != unicode) writer.write(unicode); + if (null != tdoc) { + if (null != duff && 0 != duff.length) { + tdoc.appendDuffCodes(duff); + // DLC NOW FIXME: use TibTextUtils.getVowel logic to make the output beautiful. + } else { + // this happens when you have an + // [#ERROR]-producing tsheg bar. + + // System.err.println("Bad tsheg bar with ACIP {" + s.getText() + "}"); + } } } } } - writer.close(); + if (null != writer) { + writer.close(); + } return !hasErrors; } } diff --git a/source/org/thdl/tib/text/ttt/ACIPRules.java b/source/org/thdl/tib/text/ttt/ACIPRules.java index a9b885b..d01945e 100644 --- a/source/org/thdl/tib/text/ttt/ACIPRules.java +++ b/source/org/thdl/tib/text/ttt/ACIPRules.java @@ -19,8 +19,12 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; import java.util.HashSet; +import java.util.ArrayList; import java.util.HashMap; +import org.thdl.tib.text.DuffCode; +import org.thdl.tib.text.TibetanMachineWeb; + /** Canonizes some facts regarding the ACIP transcription system. * @author David Chandler */ class ACIPRules { @@ -36,7 +40,9 @@ class ACIPRules { private static HashSet acipVowels = null; private static String[][] baseVowels = new String[][] { - // { ACIP, EWTS, EWTS for '\'' + baseVowels[][0] }: + // { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel + // numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.) + // for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]} { "A", "a", "A" }, { "I", "i", "I" }, { "U", "u", "U" }, @@ -70,7 +76,7 @@ class ACIPRules { // DLC keep this code in sync with getUnicodeFor. // DLC keep this code in sync with getWylieForACIPVowel - // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not? + // DLC '\' for virama? how shall we do \ the virama? like a vowel or not? 
} } return (acipVowels.contains(s)); @@ -211,6 +217,39 @@ class ACIPRules { return (String)acipVowel2wylie.get(acip); } + private static HashMap acipOther2wylie = null; + /** Returns the EWTS corresponding to the given ACIP punctuation or + * mark. Returns null if there is no such EWTS. */ + static final String getWylieForACIPOther(String acip) { + if (acipOther2wylie == null) { + acipOther2wylie = new HashMap(37); + + // DLC FIXME: check all these again. + acipOther2wylie.put(",", "/"); + acipOther2wylie.put(" ", " "); + acipOther2wylie.put(".", "*"); + acipOther2wylie.put("|", "|"); + acipOther2wylie.put("`", "!"); + acipOther2wylie.put(";", ";"); + acipOther2wylie.put("*", "@"); + acipOther2wylie.put("#", "@#"); + acipOther2wylie.put("%", "%"); + acipOther2wylie.put("&", "&"); + + acipOther2wylie.put("0", "0"); + acipOther2wylie.put("1", "1"); + acipOther2wylie.put("2", "2"); + acipOther2wylie.put("3", "3"); + acipOther2wylie.put("4", "4"); + acipOther2wylie.put("5", "5"); + acipOther2wylie.put("6", "6"); + acipOther2wylie.put("7", "7"); + acipOther2wylie.put("8", "8"); + acipOther2wylie.put("9", "9"); + } + return (String)acipOther2wylie.get(acip); + } + private static HashMap superACIP2unicode = null; private static HashMap subACIP2unicode = null; /** If acip is an ACIP consonant or vowel or punctuation mark, @@ -416,6 +455,42 @@ class ACIPRules { if (null != u) return u; } return (String)superACIP2unicode.get(acip); + } + + + /** DLC DOC: Gets the duffcodes for vowel, such that they look good with hashKey, and appends them to r. */ + static void getDuffForACIPVowel(ArrayList r, String hashKey, String vowel) { + if (null == vowel) return; + if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert. + throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly."); + if (!TibetanMachineWeb.isKnownHashKey(hashKey)) // FIXME: expensive assertion! Use assert.
+ throw new IllegalArgumentException("bad hashKey"); + + // Order matters here. + if (vowel.indexOf("'U") >= 0) + r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_U)); + else { + if (vowel.indexOf('\'') >= 0) + r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_A)); + if (vowel.indexOf("EE") >= 0) + r.add(TibetanMachineWeb.getGlyph("ai")); + else if (vowel.indexOf('E') >= 0) + r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_e)); + if (vowel.indexOf("OO") >= 0) + r.add(TibetanMachineWeb.getGlyph("au")); + else if (vowel.indexOf('O') >= 0) + r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_o)); + if (vowel.indexOf('I') >= 0) + r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_i)); + if (vowel.indexOf('U') >= 0) + r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_u)); + if (vowel.indexOf('i') >= 0) + r.add(TibetanMachineWeb.getGlyph("-i")); + } + if (vowel.indexOf('m') >= 0) + r.add(TibetanMachineWeb.getGlyph("M")); + if (vowel.indexOf(':') >= 0) + r.add(TibetanMachineWeb.getGlyph("H")); } } diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index 4e2875b..a69be4c 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -319,6 +319,16 @@ tstHelper("MSTAN"); // ambiguous with regard to prefix rules +tstHelper("KA'", "[(K . A), (' . )]", + new String[] { "{KA}{'}" }, + new String[] { "{KA}{'}" }, + "{KA}{'}"); // DLC NOW + + tstHelper("A'AAMA", "{A}{'}{AA}{MA}"); // FIXME: how should we parse this? + + tstHelper("K+K+KA", "{K+}{K+}{KA}"); + + // If you're not careful, you'll think GGYES is a legal // Tibetan tsheg bar and parse it as {G}{G+YE}{S}. 
But it's diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java index e598e20..2bf21b8 100644 --- a/source/org/thdl/tib/text/ttt/TPair.java +++ b/source/org/thdl/tib/text/ttt/TPair.java @@ -19,6 +19,10 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; import org.thdl.util.ThdlDebug; +import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.tib.text.DuffCode; + +import java.util.ArrayList; /** An ordered pair used in ACIP-to-TMW conversion. The left side is * the consonant or empty; the right side is the vowel, '+', or '-'. @@ -70,7 +74,9 @@ class TPair { /** Returns an TPair that is like this one except that it is * missing N characters. The characters are taken from r, the - * right side, first and from l, the left side, second. + * right side, first and from l, the left side, second. The pair + * returned may be illegal, such as the (A . ') you can get from + * ACIP {A'AAMA}. * @throw IllegalArgumentException if N is out of range */ TPair minusNRightmostACIPCharacters(int N) throws IllegalArgumentException @@ -80,7 +86,7 @@ class TPair { if (N > size()) throw new IllegalArgumentException("Don't have that many to remove."); if (N < 1) - throw new IllegalArgumentException("You should't call this if you don't want to remove any."); + throw new IllegalArgumentException("You shouldn't call this if you don't want to remove any."); if (null != r && (sz = r.length()) > 0) { int min = Math.min(sz, N); newR = r.substring(0, sz - min); @@ -101,7 +107,7 @@ class TPair { return false; if (null != l && !ACIPRules.isConsonant(l)) return false; - if (null != r && !ACIPRules.isVowel(l)) + if (null != r && !ACIPRules.isVowel(r)) return false; return true; } @@ -146,8 +152,14 @@ class TPair { return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9'); } - /** Returns the EWTS Wylie that corresponds to this pair. Untested. 
*/ String getWylie() { + return getWylie(false); + } + + /** Returns the EWTS Wylie that corresponds to this pair if + * justLeft is false, or the EWTS Wylie that corresponds to just + * {@link #getLeft()} if justLeft is true. */ + String getWylie(boolean justLeft) { String leftWylie = null; if (getLeft() != null) { leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft()); @@ -156,6 +168,8 @@ class TPair { leftWylie = getLeft(); } } + if (null == leftWylie) leftWylie = ""; + if (justLeft) return leftWylie; String rightWylie = null; if ("-".equals(getRight())) rightWylie = "."; @@ -163,7 +177,6 @@ class TPair { rightWylie = "+"; else if (getRight() != null) rightWylie = ACIPRules.getWylieForACIPVowel(getRight()); - if (null == leftWylie) leftWylie = ""; if (null == rightWylie) rightWylie = ""; return leftWylie + rightWylie; } diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index c1ebfd5..6549a01 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -609,5 +609,24 @@ class TPairList { } } + /** Appends the DuffCodes that correspond to this grapheme cluster + * to duff. Assumes this is one grapheme cluster. */ + void getDuff(ArrayList duff) { + StringBuffer wylieForConsonant = new StringBuffer(); + for (int x = 0; x + 1 < size(); x++) { + wylieForConsonant.append(get(x).getWylie(false)); + } + TPair lastPair = get(size() - 1); + wylieForConsonant.append(lastPair.getWylie(true)); + String hashKey = wylieForConsonant.toString(); + if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { + hashKey = hashKey.replace('+', '-'); + if (!TibetanMachineWeb.isKnownHashKey(hashKey)) { + throw new Error("How did this happen?"); + } + } + duff.add(TibetanMachineWeb.getGlyph(hashKey)); + ACIPRules.getDuffForACIPVowel(duff, hashKey, lastPair.getRight()); + } } // DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx. 
diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index 5db6847..05efc2d 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -20,6 +20,7 @@ package org.thdl.tib.text.ttt; import org.thdl.tib.text.TibTextUtils; import org.thdl.tib.text.TGCList; +import org.thdl.tib.text.DuffCode; import java.util.ArrayList; import java.util.ListIterator; @@ -216,8 +217,21 @@ class TStackList { } return u.toString(); } + /** DLC DOC */ + DuffCode[] getDuff() { + ArrayList al = new ArrayList(size()*2); // rough estimate + int count = 0; + for (int i = 0; i < size(); i++) { + get(i).getDuff(al); + } + if (size() > 0 && al.size() == 0) { + throw new Error("But this stack list, " + this + ", contains " + size() + " stacks! How can it not have DuffCodes associated with it?"); + } + return (DuffCode[])al.toArray(new DuffCode[] { }); + } } +/** Too simple to comment. */ class BoolPair { boolean isLegal; boolean isLegalAndHasAVowelOnRoot;