diff --git a/source/org/thdl/tib/input/ConvertDialog.java b/source/org/thdl/tib/input/ConvertDialog.java index c8ea973..ff48577 100644 --- a/source/org/thdl/tib/input/ConvertDialog.java +++ b/source/org/thdl/tib/input/ConvertDialog.java @@ -223,10 +223,13 @@ class ConvertDialog extends JDialog JButton src = (JButton)ae.getSource(); if (src == browseOld) { jfc.setFileFilter((ACIP_TO_UNI_TEXT.equals((String)choices.getSelectedItem()) - || ACIP_TO_TMW.equals((String)choices.getSelectedItem())) + || WYLIE_TO_UNI_TEXT.equals((String)choices.getSelectedItem()) + || ACIP_TO_TMW.equals((String)choices.getSelectedItem()) + || WYLIE_TO_TMW.equals((String)choices.getSelectedItem())) ? acipff : rtfff); } else { jfc.setFileFilter((ACIP_TO_UNI_TEXT.equals((String)choices.getSelectedItem()) + || WYLIE_TO_UNI_TEXT.equals((String)choices.getSelectedItem()) || TMW_TO_ACIP_TEXT.equals((String)choices.getSelectedItem()) || TMW_TO_WYLIE_TEXT.equals((String)choices.getSelectedItem())) ? acipff : rtfff); @@ -457,7 +460,7 @@ class ConvertDialog extends JDialog } else if (FIND_ALL_NON_TM == ct) { newFileNamePrefix = "AllNonTM__"; newFileNameExtension = ".TXT"; - } else if (TMW_TO_SAME_TWM == ct) { + } else if (TMW_TO_SAME_TMW == ct) { newFileNamePrefix = "TMW_to_same_TMW__"; newFileNameExtension = ".RTF"; } else { // conversion mode @@ -471,13 +474,15 @@ class ConvertDialog extends JDialog } else if (TMW_TO_ACIP_TEXT == ct) { newFileNamePrefix = suggested_ACIP_prefix; newFileNameExtension = ".TXT"; - } else if (TMW_TO_UNI == ct || ACIP_TO_UNI_TEXT == ct) { + } else if (TMW_TO_UNI == ct || ACIP_TO_UNI_TEXT == ct + || WYLIE_TO_UNI_TEXT == ct) { newFileNamePrefix = suggested_TO_UNI_prefix; - if (ACIP_TO_UNI_TEXT == ct) + if (ACIP_TO_UNI_TEXT == ct || WYLIE_TO_UNI_TEXT == ct) newFileNameExtension = ".TXT"; - } else if (TM_TO_TMW == ct || ACIP_TO_TMW == ct) { + } else if (TM_TO_TMW == ct || ACIP_TO_TMW == ct + || WYLIE_TO_TMW == ct) { newFileNamePrefix = suggested_TO_TMW_prefix; - if (ACIP_TO_TMW == ct) + if (ACIP_TO_TMW == ct || WYLIE_TO_TMW == ct) newFileNameExtension = ".RTF"; } else { ThdlDebug.verify(TMW_TO_TM == ct); @@ -509,6 +514,7 @@ class ConvertDialog extends JDialog } } + // TODO(DLC)[EWTS->Tibetan]: we use for wylie (ewts) too... public class ACIPFileFilter extends javax.swing.filechooser.FileFilter { public boolean accept(File f) diff --git a/source/org/thdl/tib/input/FontConverterConstants.java b/source/org/thdl/tib/input/FontConverterConstants.java index ab0896d..4620a37 100644 --- a/source/org/thdl/tib/input/FontConverterConstants.java +++ b/source/org/thdl/tib/input/FontConverterConstants.java @@ -26,7 +26,9 @@ import java.awt.*; @author Nathaniel Garson, Tibetan and Himalayan Digital Library */ interface FontConverterConstants { - final String TMW_TO_SAME_TWM = "TMW to the same TMW (for testing only) (RTF->RTF)"; + final String WYLIE_TO_UNI_TEXT = "Wylie to Unicode (Text->Text)"; + final String WYLIE_TO_TMW = "Wylie to TMW (Text->RTF)"; + final String TMW_TO_SAME_TMW = "TMW to the same TMW (for testing only) (RTF->RTF)"; final String ACIP_TO_UNI_TEXT = "ACIP to Unicode (Text->Text)"; final String ACIP_TO_TMW = "ACIP to TMW (Text->RTF)"; final String TMW_TO_ACIP = "TMW to ACIP (RTF->RTF)"; @@ -42,6 +44,10 @@ interface FontConverterConstants final String FIND_ALL_NON_TM = "Find all non-TM (in RTF)"; final String[] CHOICES = new String[] { + /* TODO(DLC)[EWTS->Tibetan]: once we're done debugging: + WYLIE_TO_UNI_TEXT, + WYLIE_TO_TMW, + */ ACIP_TO_UNI_TEXT, ACIP_TO_TMW, TMW_TO_ACIP, @@ -58,7 +64,9 @@ interface FontConverterConstants }; final String[] DEBUG_CHOICES = new String[] { - TMW_TO_SAME_TWM, + TMW_TO_SAME_TMW, + WYLIE_TO_UNI_TEXT, + WYLIE_TO_TMW, ACIP_TO_UNI_TEXT, ACIP_TO_TMW, TMW_TO_ACIP, diff --git a/source/org/thdl/tib/input/TibetanConverter.java b/source/org/thdl/tib/input/TibetanConverter.java index ab2184c..ee932e0 100644 --- a/source/org/thdl/tib/input/TibetanConverter.java +++ b/source/org/thdl/tib/input/TibetanConverter.java @@ -28,6 +28,8 @@ import org.thdl.tib.text.*; import org.thdl.tib.text.ttt.TConverter; import org.thdl.tib.text.ttt.ACIPTraits; +import org.thdl.tib.text.ttt.EWTSTraits; +import org.thdl.tib.text.ttt.TTraits; import java.util.ArrayList; /** TibetanConverter is a command-line utility for converting to and @@ -71,6 +73,8 @@ public class TibetanConverter implements FontConverterConstants { boolean convertToTMMode = false; boolean convertACIPToUniMode = false; boolean convertACIPToTMWMode = false; + boolean convertWylieToUniMode = false; + boolean convertWylieToTMWMode = false; boolean convertToTMWMode = false; boolean convertToWylieRTFMode = false; boolean convertToWylieTextMode = false; @@ -116,6 +120,10 @@ public class TibetanConverter implements FontConverterConstants { = args[numArgs - 2].equals("--acip-to-unicode")) || (convertACIPToTMWMode = args[numArgs - 2].equals("--acip-to-tmw")) + || (convertWylieToUniMode + = args[numArgs - 2].equals("--wylie-to-unicode")) + || (convertWylieToTMWMode + = args[numArgs - 2].equals("--wylie-to-tmw")) || (convertToUnicodeMode = args[numArgs - 2].equals("--to-unicode")) || (convertToWylieRTFMode @@ -147,6 +155,7 @@ public class TibetanConverter implements FontConverterConstants { out.println(" | --to-tibetan-machine | --to-tibetan-machine-web"); out.println(" | --to-unicode | --to-wylie | --to-acip"); out.println(" | --to-wylie-text | --to-acip-text"); + out.println(" | --wylie-to-unicode | --wylie-to-tmw"); out.println(" | --acip-to-unicode | --acip-to-tmw RTF_file|TXT_file"); out.println(" | TibetanConverter [--version | -v | --help | -h]"); out.println(""); @@ -251,13 +260,17 @@ public class TibetanConverter implements FontConverterConstants { } else if (convertToUnicodeMode) { conversionTag = TMW_TO_UNI; } else if (convertTmwToTmwMode) { - conversionTag = TMW_TO_SAME_TWM; + conversionTag = TMW_TO_SAME_TMW; } else if (convertToTMWMode) { conversionTag = TM_TO_TMW; } else if (convertACIPToUniMode) { conversionTag = ACIP_TO_UNI_TEXT; } else if (convertACIPToTMWMode) { conversionTag = ACIP_TO_TMW; + } else if (convertWylieToUniMode) { + conversionTag = WYLIE_TO_UNI_TEXT; + } else if (convertWylieToTMWMode) { + conversionTag = WYLIE_TO_TMW; } else { ThdlDebug.verify(convertToTMMode); conversionTag = TMW_TO_TM; @@ -294,20 +307,28 @@ public class TibetanConverter implements FontConverterConstants { static int reallyConvert(InputStream in, PrintStream out, String ct, String warningLevel, boolean shortMessages, boolean colors) { - if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) { + if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct + || WYLIE_TO_UNI_TEXT == ct || WYLIE_TO_TMW == ct) { try { ArrayList al - = ACIPTraits.instance().scanner().scanStream(in, null, - ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have", - 1000 - 1), - shortMessages, - warningLevel); + = ((ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) + ? (TTraits)ACIPTraits.instance() + : (TTraits)EWTSTraits.instance()).scanner().scanStream(in, null, + ThdlOptions.getIntegerOption((ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) + ? "thdl.most.errors.a.tibetan.acip.document.can.have" + : "thdl.most.errors.a.tibetan.ewts.document.can.have", + 1000 - 1), + shortMessages, + warningLevel); if (null == al) return 47; boolean embeddedWarnings = (warningLevel != "None"); boolean hasWarnings[] = new boolean[] { false }; - if (ACIP_TO_UNI_TEXT == ct) { - if (!TConverter.convertToUnicodeText(ACIPTraits.instance(), + if (ACIP_TO_UNI_TEXT == ct + || WYLIE_TO_UNI_TEXT == ct) { + if (!TConverter.convertToUnicodeText((WYLIE_TO_UNI_TEXT == ct) + ? (TTraits)EWTSTraits.instance() + : (TTraits)ACIPTraits.instance(), al, out, null, null, hasWarnings, embeddedWarnings, @@ -315,8 +336,9 @@ public class TibetanConverter implements FontConverterConstants { shortMessages)) return 46; } else { - if (ct != ACIP_TO_TMW) throw new Error("badness"); - if (!TConverter.convertToTMW(ACIPTraits.instance(), + if (!TConverter.convertToTMW((WYLIE_TO_TMW == ct) + ? (TTraits)EWTSTraits.instance() + : (TTraits)ACIPTraits.instance(), al, out, null, null, hasWarnings, embeddedWarnings, @@ -402,7 +424,7 @@ public class TibetanConverter implements FontConverterConstants { int exitCode = 0; ThdlDebug.verify(((TMW_TO_TM == ct) ? 1 : 0) - + ((TMW_TO_SAME_TWM == ct) ? 1 : 0) + + ((TMW_TO_SAME_TMW == ct) ? 1 : 0) + ((TMW_TO_UNI == ct) ? 1 : 0) + ((TM_TO_TMW == ct) ? 1 : 0) + ((TMW_TO_ACIP == ct) ? 1 : 0) @@ -411,7 +433,7 @@ public class TibetanConverter implements FontConverterConstants { + ((TMW_TO_WYLIE_TEXT == ct) ? 1 : 0) == 1); long numAttemptedReplacements[] = new long[] { 0 }; - if (TMW_TO_SAME_TWM == ct) { + if (TMW_TO_SAME_TMW == ct) { // Identity conversion for testing if (tdoc.identityTmwToTmwConversion(0, tdoc.getLength(), diff --git a/source/org/thdl/tib/text/ttt/ACIPTraits.java b/source/org/thdl/tib/text/ttt/ACIPTraits.java index c075ae2..10d204c 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTraits.java +++ b/source/org/thdl/tib/text/ttt/ACIPTraits.java @@ -560,6 +560,7 @@ public final class ACIPTraits implements TTraits { } else if (wowel.indexOf("'I") >= 0) { TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added); } else { + // TODO(dchandler): I don't understand why we go from else ifs to this form... if (wowel.indexOf('\'') >= 0) { TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added); } diff --git a/source/org/thdl/tib/text/ttt/EWTSTraits.java b/source/org/thdl/tib/text/ttt/EWTSTraits.java index b31067e..9c42b7d 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTraits.java +++ b/source/org/thdl/tib/text/ttt/EWTSTraits.java @@ -23,6 +23,8 @@ package org.thdl.tib.text.ttt; import java.util.ArrayList; import org.thdl.tib.text.DuffCode; +import org.thdl.tib.text.THDLWylieConstants; +import org.thdl.tib.text.TibTextUtils; import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.util.ThdlDebug; @@ -154,7 +156,70 @@ public final class EWTSTraits implements TTraits { public TTshegBarScanner scanner() { return EWTSTshegBarScanner.instance(); } public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) { - throw new Error("TODO(DLC)[EWTS->Tibetan]"); + + // TODO(DLC)[EWTS->Tibetan]: I have no confidence in this! test, test, test. + + // Order matters here. + boolean context_added[] = new boolean[] { false }; + if (wowel.equals(THDLWylieConstants.WYLIE_aVOWEL)) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added); + } else { + // TODO(DLC)[EWTS->Tibetan]: test vowel stacking + if (wowel.indexOf(THDLWylieConstants.U_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.reverse_I_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_I_VOWEL, context_added); + } else if (wowel.indexOf(THDLWylieConstants.I_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.A_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.ai_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.au_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.reverse_i_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added); + } else if (wowel.indexOf(THDLWylieConstants.i_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.e_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.o_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added); + } + if (wowel.indexOf(THDLWylieConstants.u_VOWEL) >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added); + } + if (wowel.indexOf("~X") >= 0) { // TODO(DLC)[EWTS->Tibetan]: introduce THDLWylieConstants.blah + duff.add(TibetanMachineWeb.getGlyph("~X")); + } else if (wowel.indexOf("X") >= 0) { // TODO(DLC)[EWTS->Tibetan]: introduce THDLWylieConstants.blah + duff.add(TibetanMachineWeb.getGlyph("X")); + } + } + // FIXME: Use TMW9.61, the "o'i" special combination, when appropriate. + + if (wowel.indexOf('M') >= 0) { + DuffCode last = null; + if (duff.size() > 0) { + last = (DuffCode)duff.get(duff.size() - 1); + duff.remove(duff.size() - 1); // getBindu will add it back... + // TODO(DLC)[EWTS->Tibetan]: is this okay???? when is a bindu okay to be alone??? + } + TibTextUtils.getBindu(duff, last); + } + if (wowel.indexOf('H') >= 0) + duff.add(TibetanMachineWeb.getGlyph("H")); + + + // TODO(DLC)[EWTS->Tibetan]: verify that no part of wowel is discarded! acip does that. 'jam~X I think we screw up, e.g. + + // TODO(DLC)[EWTS->Tibetan]:: are bindus are screwed up in the unicode output? i see (with tmuni font) lone bindus without glyphs to stack on } public String getUnicodeForWowel(String wowel) { @@ -218,12 +283,17 @@ public final class EWTSTraits implements TTraits { } public String getUnicodeFor(String l, boolean subscribed) { - + // First, handle "\u0f71\u0f84\u0f86", "", "\u0f74", etc. { boolean already_done = true; for (int i = 0; i < l.length(); i++) { - if (!(l.charAt(0) >= '\u0f00' && l.charAt(0) <= '\u0fff')) { + char ch = l.charAt(i); + if ((ch < '\u0f00' || ch > '\u0fff') + && '\n' != ch + && '\r' != ch) { + // TODO(DLC)[EWTS->Tibetan]: Is this the place + // where we want to interpret how newlines work??? already_done = false; break; } diff --git a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java index 6688b3a..d450364 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java @@ -61,6 +61,9 @@ class EWTSTshegBarScanner extends TTshegBarScanner { StringBuffer sb = new StringBuffer(s); ExpandEscapeSequences(sb); int sl = sb.length(); + // TODO(DLC)[EWTS->Tibetan]:: '@#', in ewts->tmw, is not working + // TODO(DLC)[EWTS->Tibetan]:: 'jamX 'jam~X one is not working in ->tmw mode + // TODO(DLC)[EWTS->Tibetan]:: dzaHsogs is not working for (int i = 0; i < sl; i++) { if (isValidInsideTshegBar(sb.charAt(i))) { StringBuffer tbsb = new StringBuffer(); @@ -75,7 +78,7 @@ class EWTSTshegBarScanner extends TTshegBarScanner { al.add(new TString("EWTS", tbsb.toString(), TString.TIBETAN_NON_PUNCTUATION)); } else { - if (" /;|!:=_@#$%<>()\r\n\t".indexOf(sb.charAt(i)) >= 0) + if (" /;|!:=_@#$%<>()\r\n\t*".indexOf(sb.charAt(i)) >= 0) al.add(new TString("EWTS", sb.substring(i, i+1), TString.TIBETAN_PUNCTUATION)); else diff --git a/source/org/thdl/tib/text/ttt/TParseTree.java b/source/org/thdl/tib/text/ttt/TParseTree.java index f81b433..14eaa18 100644 --- a/source/org/thdl/tib/text/ttt/TParseTree.java +++ b/source/org/thdl/tib/text/ttt/TParseTree.java @@ -327,14 +327,16 @@ class TParseTree { translit, traits); } else { - if (bestParse.hasStackWithoutVowel(pl, isLastStack)) { + if (bestParse.hasStackWithoutVowel(traits.isACIP(), + pl, isLastStack)) { if (isLastStack[0]) { if (ErrorsAndWarnings.isEnabled(502, warningLevel)) return ErrorsAndWarnings.getMessage(502, shortMessages, translit, traits); } else { - throw new Error("Can't happen now that we stack greedily"); + if (traits.isACIP()) + throw new Error("Can't happen now that we stack greedily"); } } if (ErrorsAndWarnings.isEnabled(503, warningLevel)) @@ -343,14 +345,16 @@ class TParseTree { traits); } } else { - if (nip.get(0).hasStackWithoutVowel(pl, isLastStack)) { + if (nip.get(0).hasStackWithoutVowel(traits.isACIP(), + pl, isLastStack)) { if (isLastStack[0]) { if (ErrorsAndWarnings.isEnabled(502, warningLevel)) return ErrorsAndWarnings.getMessage(502, shortMessages, translit, traits); } else { - throw new Error("Can't happen now that we stack greedily [2]"); + if (traits.isACIP()) + throw new Error("Can't happen now that we stack greedily [2]"); } } } diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index e02a152..aca0fd0 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -193,14 +193,15 @@ class TStackList { /** Returns true if and only if this stack list contains a stack * that does not end in a vowel or disambiguator. Note that this - * is not erroneous for legal Tibetan like {BRTAN}, where {B} has + * is not erroneous for legal Tibetan like ACIP {BRTAN}, where {B} has * no vowel, but it is a warning sign for Sanskrit stacks. + * @param isACIP true iff opl is ACIP (not EWTS) * @param opl the pair list from which this stack list * originated * @param isLastStack if non-null, then isLastStack[0] will be * set to true if and only if the very last stack is the only * stack not to have a vowel or disambiguator on it */ - boolean hasStackWithoutVowel(TPairList opl, boolean[] isLastStack) { + boolean hasStackWithoutVowel(boolean isACIP, TPairList opl, boolean[] isLastStack) { int runningSize = 0; // FIXME: MARDA is MARD==MAR-D to us, but is probably MAR+DA, warn -- see 838470 for (int i = 0; i < size(); i++) { @@ -213,15 +214,16 @@ class TStackList { && l.charAt(0) >= '0' && l.charAt(0) <= '9')) { if (null != isLastStack) { isLastStack[0] = (i + 1 == size()); - if (!isLastStack[0]) { + if (!isLastStack[0] && isACIP) { throw new Error("But we now stack greedily!"); } } return true; } } - if (runningSize != opl.sizeMinusDisambiguators()) + if (runningSize != opl.sizeMinusDisambiguators()) { throw new IllegalArgumentException("runningSize = " + runningSize + "; opl.sizeMinusDisambiguators = " + opl.sizeMinusDisambiguators() + "; opl (" + opl + ") is bad for this stack list (" + toString() + ")"); + } return false; } diff --git a/source/org/thdl/tib/text/ttt/TTraits.java b/source/org/thdl/tib/text/ttt/TTraits.java index 790d847..645fe52 100644 --- a/source/org/thdl/tib/text/ttt/TTraits.java +++ b/source/org/thdl/tib/text/ttt/TTraits.java @@ -34,7 +34,7 @@ import org.thdl.tib.text.DuffCode; * *
It is very likely that classes that implement this interface * will choose to use the design pattern 'singleton'. */ -interface TTraits { +public interface TTraits { /** Returns the disambiguator for this transliteration scheme, * which had better be a string containing just one character * lest {@link #disambiguatorChar()} become nonsensical for