EWTS->TMW fixes. Wowel handling still isn't perfect but I'm lazy.

Jskad now uses the new EWTS->TMW routine, not the old, and thus the
"(Buggy)" label is [unfairly, perhaps] dropped.
This commit is contained in:
dchandler 2005-07-07 01:30:03 +00:00
parent 0f99c402df
commit 982350371d
7 changed files with 129 additions and 86 deletions

View file

@ -19,17 +19,17 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text;
/**
* An exception thrown whenever ACIP->TMW conversion in the Jskad GUI
* runs into invalid ACIP.
* An exception thrown whenever an EWTS->TMW or ACIP->TMW conversion in
* the Jskad GUI runs into an invalid transliteration string.
* @author David Chandler */
public class InvalidACIPException extends Exception {
public class InvalidTransliterationException extends Exception {
private String error;
/**
* Creates an InvalidACIPException.
* Creates an InvalidTransliterationException.
* @param s an error message
*/
public InvalidACIPException(String s) {
public InvalidTransliterationException(String s) {
error = s;
}

View file

@ -22,7 +22,15 @@ package org.thdl.tib.text;
* @see TibetanMachineWeb */
public interface THDLWylieConstants {
/**
* the Wylie for bindu/anusvara
* the Wylie for U+0F82
*/
public static final String U0F82 = "~M`";
/**
* the Wylie for U+0F83
*/
public static final String U0F83 = "~M";
/**
* the Wylie for bindu/anusvara (U+0F7E)
*/
public static final char BINDU = 'M';
/**
@ -52,6 +60,10 @@ public interface THDLWylieConstants {
*/
public static final String WYLIE_aVOWEL = "a";
/**
* the Wylie for U+0F39
*/
public static final String WYLIE_TSA_PHRU = "^";
/**
* the Wylie for achung
*/
public static final char ACHUNG_character = '\'';

View file

@ -25,7 +25,9 @@ import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.ttt.TTraits;
import org.thdl.tib.text.ttt.ACIPTraits;
import org.thdl.tib.text.ttt.EWTSTraits;
import org.thdl.tib.text.ttt.TConverter;
import org.thdl.tib.text.tshegbar.LegalTshegBar;
import org.thdl.tib.text.tshegbar.UnicodeConstants;
@ -312,34 +314,44 @@ public class TibTextUtils implements THDLWylieConstants {
= new boolean[] { false };
/**
* Converts a string of ACIP into TibetanMachineWeb and inserts that
* into tdoc at offset loc.
* @param acip the ACIP you want to convert
* Converts a string of transliteration into TibetanMachineWeb and
* inserts that into tdoc at offset loc.
* @param EWTSNotACIP true if you want THDL Extended Wylie, false if
* you want ACIP
* @param translit the transliteration you want to convert
* @param tdoc the document in which to insert the TMW
* @param loc the offset inside the document at which to insert the TMW
* @param withWarnings true if and only if you want warnings to appear
* in the output, such as "this could be a mistranscription of blah..."
* @throws InvalidACIPException if the ACIP is deemed invalid, i.e. if
* it does not conform to the ACIP transcription rules (those in the
* official document and the subtler rules pieced together by David
* Chandler through study and private correspondence with Robert
* Chilton)
* @throws InvalidTransliterationException if the transliteration is
* deemed invalid, i.e. if it does not conform to the transcription
* rules (those in the official document and the subtler rules pieced
* together by David Chandler through study and private correspondence
* with Robert Chilton (for ACIP), Than Garson, David Germano, Chris
* Fynn, and others)
* @return the number of characters inserted into tdoc */
public static int insertTibetanMachineWebForACIP(String acip,
TibetanDocument tdoc,
int loc,
boolean withWarnings)
throws InvalidACIPException
public static int insertTibetanMachineWebForTranslit(boolean EWTSNotACIP,
String translit,
TibetanDocument tdoc,
int loc,
boolean withWarnings)
throws InvalidTransliterationException
{
StringBuffer errors = new StringBuffer();
String warningLevel = withWarnings ? "All" : "None";
ArrayList al = ACIPTraits.instance().scanner().scan(acip, errors, 500,
false, warningLevel);
TTraits traits = (EWTSNotACIP
? (TTraits)EWTSTraits.instance()
: (TTraits)ACIPTraits.instance());
ArrayList al = traits.scanner().scan(translit, errors, 500,
false, warningLevel);
if (null == al || errors.length() > 0) {
if (errors.length() > 0)
throw new InvalidACIPException(errors.toString());
throw new InvalidTransliterationException(errors.toString());
else
throw new InvalidACIPException("Fatal error converting ACIP to TMW.");
throw new InvalidTransliterationException("Fatal error converting "
+ traits.shortTranslitName()
+ " to TMW.");
}
boolean colors = withWarnings;
boolean putWarningsInOutput = false;
@ -348,7 +360,7 @@ public class TibTextUtils implements THDLWylieConstants {
}
try {
int tloc[] = new int[] { loc };
TConverter.convertToTMW(ACIPTraits.instance(), al, tdoc, null, null,
TConverter.convertToTMW(traits, al, tdoc, null, null,
null, putWarningsInOutput, warningLevel,
false, colors, tloc);
return tloc[0] - loc;
@ -364,8 +376,13 @@ public class TibTextUtils implements THDLWylieConstants {
* corresponding to the Wylie text
* @throws InvalidWylieException if the Wylie is deemed invalid,
* i.e. if it does not conform to the Extended Wylie standard
* @deprecated Use {@link #insertTibetanMachineWebForTranslit} instead.
*/
public static DuffData[] getTibetanMachineWebForEWTS(String wylie) throws InvalidWylieException {
ThdlDebug.noteIffyCode(); // deprecated method!
// TODO(dchandler): remove it and
// hopefully a ton of code that
// only it uses.
List chars = new ArrayList();
DuffCode dc;
int start = 0;

View file

@ -79,6 +79,11 @@ public class EWTSTest extends TestCase {
/** Causes a JUnit test case failure unless the EWTS document ewts
* converts to the unicode expectedUnicode. */
static void ewts2uni_test(String ewts, String expectedUnicode) {
// TODO(DLC)[EWTS->Tibetan]: In addition to what this
// currently does, have this function convert to TMW and
// convert that TMW to Unicode and verify that the result is
// the same. Almost every call should allow for that.
StringBuffer errors = new StringBuffer();
String unicode = TConverter.convertToUnicodeText(EWTSTraits.instance(),
ewts, errors,

View file

@ -164,6 +164,10 @@ public final class EWTSTraits implements TTraits {
// TODO(DLC)[EWTS->Tibetan]: I have no confidence in this! test, test, test.
// TODO(DLC)[EWTS->Tibetan]: ko+o doesn't work. kai+-i doesn't work.
// TODO(DLC)[EWTS->Tibetan]: kai doesn't work.
// Order matters here.
boolean context_added[] = new boolean[] { false };
if (wowel.equals(THDLWylieConstants.WYLIE_aVOWEL)) {
@ -183,11 +187,7 @@ public final class EWTSTraits implements TTraits {
}
if (wowel.indexOf(THDLWylieConstants.ai_VOWEL) >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
}
if (wowel.indexOf(THDLWylieConstants.au_VOWEL) >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
}
if (wowel.indexOf(THDLWylieConstants.reverse_i_VOWEL) >= 0) {
} else if (wowel.indexOf(THDLWylieConstants.reverse_i_VOWEL) >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
} else if (wowel.indexOf(THDLWylieConstants.i_VOWEL) >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
@ -198,7 +198,9 @@ public final class EWTSTraits implements TTraits {
if (wowel.indexOf(THDLWylieConstants.o_VOWEL) >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
}
if (wowel.indexOf(THDLWylieConstants.u_VOWEL) >= 0) {
if (wowel.indexOf(THDLWylieConstants.au_VOWEL) >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
} else if (wowel.indexOf(THDLWylieConstants.u_VOWEL) >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
}
if (wowel.indexOf("~X") >= 0) { // TODO(DLC)[EWTS->Tibetan]: introduce THDLWylieConstants.blah
@ -209,7 +211,12 @@ public final class EWTSTraits implements TTraits {
}
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
if (wowel.indexOf('M') >= 0) {
if (wowel.indexOf(THDLWylieConstants.BINDU) >= 0
// TODO(DLC)[EWTS->Tibetan]: This is really ugly... we
// rely on the fact that we know every Wylie wowel that
// contains 'M'. Let's, instead, parse the wowel.
&& wowel.indexOf(THDLWylieConstants.U0F82) < 0
&& wowel.indexOf(THDLWylieConstants.U0F83) < 0) {
DuffCode last = null;
if (!context_added[0]) {
last = preceding;
@ -219,10 +226,35 @@ public final class EWTSTraits implements TTraits {
// TODO(DLC)[EWTS->Tibetan]: is this okay???? when is a bindu okay to be alone???
}
TibTextUtils.getBindu(duff, last);
context_added[0] = true;
}
if (!context_added[0]) {
duff.add(preceding);
}
if (wowel.indexOf('H') >= 0)
duff.add(TibetanMachineWeb.getGlyph("H"));
int ix;
if ((ix = wowel.indexOf(THDLWylieConstants.WYLIE_TSA_PHRU)) >= 0) {
// This likely won't look good! TMW has glyphs for [va]
// and [fa], so use that transliteration if you care, not
// [ph^] or [b^].
duff.add(TibetanMachineWeb.getGlyph(THDLWylieConstants.WYLIE_TSA_PHRU));
StringBuffer sb = new StringBuffer(wowel);
sb.replace(ix, ix + THDLWylieConstants.WYLIE_TSA_PHRU.length(), "");
wowel = sb.toString();
}
if ((ix = wowel.indexOf(THDLWylieConstants.U0F82)) >= 0) {
duff.add(TibetanMachineWeb.getGlyph(THDLWylieConstants.U0F82));
StringBuffer sb = new StringBuffer(wowel);
sb.replace(ix, ix + THDLWylieConstants.U0F82.length(), "");
wowel = sb.toString();
}
if ((ix = wowel.indexOf(THDLWylieConstants.U0F83)) >= 0) {
duff.add(TibetanMachineWeb.getGlyph(THDLWylieConstants.U0F83));
StringBuffer sb = new StringBuffer(wowel);
sb.replace(ix, ix + THDLWylieConstants.U0F83.length(), "");
wowel = sb.toString();
}
// TODO(DLC)[EWTS->Tibetan]: verify that no part of wowel is discarded! acip does that. 'jam~X I think we screw up, e.g.