Preliminary ACIP->TMW support is in place. {DU} gives you something
less beautiful than what Jskad would give, so more work is needed.
This commit is contained in:
dchandler 2003-08-31 16:06:35 +00:00
parent 1f4d53be2e
commit 045c4069c9
12 changed files with 355 additions and 64 deletions

View file

@ -69,7 +69,8 @@ class ConvertDialog extends JDialog
ConvertDialog.this.theRealActionPerformed(e); ConvertDialog.this.theRealActionPerformed(e);
}}; }};
private void updateWarningLevels() { private void updateWarningLevels() {
if (choices.getSelectedItem() == ACIP_TO_UNI) if (choices.getSelectedItem() == ACIP_TO_UNI
|| choices.getSelectedItem() == ACIP_TO_TMW)
this.warningLevels.enable(); this.warningLevels.enable();
else else
this.warningLevels.disable(); this.warningLevels.disable();
@ -418,7 +419,7 @@ class ConvertDialog extends JDialog
newFileNamePrefix = suggested_WYLIE_prefix; newFileNamePrefix = suggested_WYLIE_prefix;
} else if (TMW_TO_UNI == ct || ACIP_TO_UNI == ct) { } else if (TMW_TO_UNI == ct || ACIP_TO_UNI == ct) {
newFileNamePrefix = suggested_TO_UNI_prefix; newFileNamePrefix = suggested_TO_UNI_prefix;
} else if (TM_TO_TMW == ct) { } else if (TM_TO_TMW == ct || ACIP_TO_TMW == ct) {
newFileNamePrefix = suggested_TO_TMW_prefix; newFileNamePrefix = suggested_TO_TMW_prefix;
} else { } else {
ThdlDebug.verify(TMW_TO_TM == ct); ThdlDebug.verify(TMW_TO_TM == ct);

View file

@ -615,10 +615,8 @@ public class DuffPane extends TibetanPane implements FocusListener {
*/ */
public void setRomanAttributeSet(String font, int size) { public void setRomanAttributeSet(String font, int size) {
if (getTibDoc() != null) { if (getTibDoc() != null) {
SimpleAttributeSet ras = new SimpleAttributeSet(); getTibDoc().setRomanAttributeSet(romanFontFamily = font,
StyleConstants.setFontFamily(ras, romanFontFamily = font); romanFontSize = size);
StyleConstants.setFontSize(ras, romanFontSize = size);
getTibDoc().setRomanAttributeSet(ras);
} }
} }

View file

@ -27,6 +27,7 @@ import java.awt.*;
interface FontConverterConstants interface FontConverterConstants
{ {
final String ACIP_TO_UNI = "ACIP to Unicode"; final String ACIP_TO_UNI = "ACIP to Unicode";
final String ACIP_TO_TMW = "ACIP to TMW";
final String TM_TO_TMW = "TM to TMW"; final String TM_TO_TMW = "TM to TMW";
final String TMW_TO_UNI = "TMW to Unicode"; final String TMW_TO_UNI = "TMW to Unicode";
final String TMW_TO_WYLIE = "TMW to Wylie"; final String TMW_TO_WYLIE = "TMW to Wylie";
@ -36,7 +37,9 @@ interface FontConverterConstants
final String FIND_ALL_NON_TMW = "Find all non-TMW"; final String FIND_ALL_NON_TMW = "Find all non-TMW";
final String FIND_ALL_NON_TM = "Find all non-TM"; final String FIND_ALL_NON_TM = "Find all non-TM";
final String[] CHOICES = new String[]{ final String[] CHOICES = new String[] {
ACIP_TO_UNI,
ACIP_TO_TMW,
TM_TO_TMW, TM_TO_TMW,
TMW_TO_UNI, TMW_TO_UNI,
TMW_TO_WYLIE, TMW_TO_WYLIE,

View file

@ -71,6 +71,7 @@ public class TibetanConverter implements FontConverterConstants {
boolean convertToUnicodeMode = false; boolean convertToUnicodeMode = false;
boolean convertToTMMode = false; boolean convertToTMMode = false;
boolean convertACIPToUniMode = false; boolean convertACIPToUniMode = false;
boolean convertACIPToTMWMode = false;
boolean convertToTMWMode = false; boolean convertToTMWMode = false;
boolean convertToWylieMode = false; boolean convertToWylieMode = false;
boolean findSomeNonTMWMode = false; boolean findSomeNonTMWMode = false;
@ -91,6 +92,8 @@ public class TibetanConverter implements FontConverterConstants {
= args[0].equals("--to-tibetan-machine-web")) = args[0].equals("--to-tibetan-machine-web"))
|| (convertACIPToUniMode || (convertACIPToUniMode
= args[0].equals("--acip-to-unicode")) = args[0].equals("--acip-to-unicode"))
|| (convertACIPToTMWMode
= args[0].equals("--acip-to-tmw"))
|| (convertToUnicodeMode || (convertToUnicodeMode
= args[0].equals("--to-unicode")) = args[0].equals("--to-unicode"))
|| (convertToWylieMode || (convertToWylieMode
@ -180,6 +183,8 @@ public class TibetanConverter implements FontConverterConstants {
conversionTag = TM_TO_TMW; conversionTag = TM_TO_TMW;
} else if (convertACIPToUniMode) { } else if (convertACIPToUniMode) {
conversionTag = ACIP_TO_UNI; conversionTag = ACIP_TO_UNI;
} else if (convertACIPToTMWMode) {
conversionTag = ACIP_TO_TMW;
} else { } else {
ThdlDebug.verify(convertToTMMode); ThdlDebug.verify(convertToTMMode);
conversionTag = TMW_TO_TM; conversionTag = TMW_TO_TM;
@ -205,7 +210,7 @@ public class TibetanConverter implements FontConverterConstants {
honored. */ honored. */
static int reallyConvert(InputStream in, PrintStream out, String ct, static int reallyConvert(InputStream in, PrintStream out, String ct,
String warningLevel) { String warningLevel) {
if (ACIP_TO_UNI == ct) { if (ACIP_TO_UNI == ct || ACIP_TO_TMW == ct) {
try { try {
ArrayList al = ACIPTshegBarScanner.scanStream(in, null, ArrayList al = ACIPTshegBarScanner.scanStream(in, null,
250 - 1 // DLC FIXME: make me configurable 250 - 1 // DLC FIXME: make me configurable
@ -214,10 +219,17 @@ public class TibetanConverter implements FontConverterConstants {
return 47; return 47;
StringBuffer warnings = new StringBuffer(); StringBuffer warnings = new StringBuffer();
boolean embeddedWarnings = (warningLevel != "None"); boolean embeddedWarnings = (warningLevel != "None");
if (!ACIPConverter.convertToUnicode(al, out, null, warnings, if (ACIP_TO_UNI == ct) {
if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
embeddedWarnings,
warningLevel))
return 46;
} else {
if (!ACIPConverter.convertToTMW(al, out, null, warnings,
embeddedWarnings, embeddedWarnings,
warningLevel)) warningLevel))
return 46; return 46;
}
if (embeddedWarnings && warnings.length() > 0) if (embeddedWarnings && warnings.length() > 0)
return 45; return 45;
else else

View file

@ -141,6 +141,32 @@ public class TibetanDocument extends DefaultStyledDocument {
appendDuff(tibetanFontSize, offset, s, attr); appendDuff(tibetanFontSize, offset, s, attr);
} }
/**
 * Inserts Latin (Roman) text into this document at the given
 * position, styled with the document's current Roman attribute set.
 * The Roman attribute set must already have been set; this is
 * checked via ThdlDebug.verify.
 * @param offset the position at which the text is inserted
 * @param s the text to insert
 * @throws BadLocationException if insertString rejects offset
 * @see #setRomanAttributeSet(AttributeSet)
 */
public void appendRoman(int offset, String s) throws BadLocationException {
    // Roman text cannot be styled until a Roman attribute set exists.
    ThdlDebug.verify(null != getRomanAttributeSet());
    insertString(offset, s, getRomanAttributeSet());
}
/**
 * Appends Latin (Roman) text to the very end of this document,
 * styled with the document's current Roman attribute set.
 * @param s the text to append
 * @see #setRomanAttributeSet(AttributeSet)
 */
public void appendRoman(String s) {
    final int endOfDocument = getLength();
    try {
        appendRoman(endOfDocument, s);
    } catch (BadLocationException impossible) {
        // The offset is this document's own length, so a bad
        // location is not expected here.
        throw new Error("can't happen");
    }
}
private void appendDuff(int fontSize, int offset, String s, MutableAttributeSet attr) { private void appendDuff(int fontSize, int offset, String s, MutableAttributeSet attr) {
try { try {
StyleConstants.setFontSize(attr, fontSize); StyleConstants.setFontSize(attr, fontSize);
@ -160,6 +186,19 @@ public class TibetanDocument extends DefaultStyledDocument {
return insertDuff(tibetanFontSize, pos, glyphs, true); return insertDuff(tibetanFontSize, pos, glyphs, true);
} }
/**
 * Appends every DuffCode in glyphs, in array order, to the end of
 * this document.
 * @param glyphs the glyphs to append
 */
public void appendDuffCodes(DuffCode[] glyphs) {
    // PERFORMANCE FIXME: one insertDuff call per glyph isn't so
    // speedy, but it reuses existing code.
    for (int g = 0; g < glyphs.length; g++) {
        DuffCode code = glyphs[g];
        String glyphText = String.valueOf(code.getCharacter());
        DuffData single = new DuffData(glyphText, code.getFontNum());
        insertDuff(getLength(), new DuffData[] { single });
    }
}
/** Replacing can be more efficient than inserting and then /** Replacing can be more efficient than inserting and then
removing. This replaces the glyph at position pos with glyph, removing. This replaces the glyph at position pos with glyph,
@ -1039,6 +1078,15 @@ public class TibetanDocument extends DefaultStyledDocument {
romanAttributeSet = ras; romanAttributeSet = ras;
} }
/**
 * Builds an attribute set carrying the given font family and font
 * size and installs it as the attribute set applied to Roman text
 * in this document.
 * @param font the font family name for Roman text
 * @param size the font size for Roman text
 */
public void setRomanAttributeSet(String font, int size) {
    SimpleAttributeSet romanAttrs = new SimpleAttributeSet();
    // Family first, then size, so the attributes are added in the
    // same order as before.
    StyleConstants.setFontFamily(romanAttrs, font);
    StyleConstants.setFontSize(romanAttrs, size);
    setRomanAttributeSet(romanAttrs);
}
/** /**
* Converts the specified portion of this document to THDL Extended * Converts the specified portion of this document to THDL Extended
* Wylie. * Wylie.

View file

@ -29,9 +29,9 @@ $~38,5~~9,41~~~~~~~0F06
#~200,1~~9,39~~~~~~~0F05 #~200,1~~9,39~~~~~~~0F05
// Yig.mgo.tsheg.shad: // Yig.mgo.tsheg.shad:
%~39,5~~9,42~~~~~~~0F07 %~39,5~~9,42~~~~~~~0F07
// dbu.khang.g-yon: // dbu.khang.g-yon: (If this changes, edit ACIPConverter)
(~208,1~~9,93~~~~~~~0F3C (~208,1~~9,93~~~~~~~0F3C
// dbu.khang.g-yas: // dbu.khang.g-yas: (If this changes, edit ACIPConverter)
)~209,1~~9,94~~~~~~~0F3D )~209,1~~9,94~~~~~~~0F3D
H~239,1~~8,92~~~~~~~0F7F H~239,1~~8,92~~~~~~~0F7F

View file

@ -24,10 +24,14 @@ import java.util.Stack;
import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions; import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.TibetanDocument;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.DuffCode;
/** /**
* This class is able to convert an ACIP file into Tibetan Machine Web. * This class is able to convert an ACIP file into Tibetan Machine Web
* From there, TMW->Unicode takes you to Unicode. * and an ACIP file into TMW. ACIP->Unicode should yield the same
* results as ACIP->TMW followed by TMW->Unicode (FIXME: test it!)
* @author David Chandler * @author David Chandler
*/ */
public class ACIPConverter { public class ACIPConverter {
@ -86,38 +90,70 @@ public class ACIPConverter {
warnings = new StringBuffer(); warnings = new StringBuffer();
putWarningsInOutput = true; putWarningsInOutput = true;
} }
convertToUnicode(al, System.out, errors, warnings, convertToTMW(al, System.out, errors, warnings,
putWarningsInOutput, warningLevel); putWarningsInOutput, warningLevel);
int retCode = 0;
if (errors.length() > 0) { if (errors.length() > 0) {
System.err.println("Errors converting ACIP input file: "); System.err.println("Errors converting ACIP input file: ");
System.err.println(errors); System.err.println(errors);
System.err.println("The output contains these errors."); System.err.println("The output contains these errors.");
System.err.println("Exiting; please fix input file and try again."); System.err.println("Exiting; please fix input file and try again.");
System.exit(2); retCode = 2;
} }
if (null != warnings && warnings.length() > 0) { if (null != warnings && warnings.length() > 0) {
System.err.println("Warnings converting ACIP input file: "); System.err.println("Warnings converting ACIP input file: ");
System.err.println(warnings); System.err.println(warnings);
if (putWarningsInOutput) if (putWarningsInOutput)
System.err.println("The output contains these warnings."); System.err.println("The output contains these warnings.");
System.exit(2); retCode = 2;
} }
if (verbose) System.err.println("Converted " + args[0] + " perfectly."); if (0 == retCode) {
System.exit(0); if (verbose) System.err.println("Converted " + args[0] + " perfectly.");
}
System.exit(retCode);
// DLC NOW: tRAStA is not converted correctly to Unicode, and
// no warning is given when converting to TMW.
} }
/** Writes TMW/Latin to out. If errors occur in converting a /** Writes TMW/Latin to out. If errors occur in converting a
* tsheg bar, then they are appended to errors if errors is * tsheg bar, then they are written into the output, and also
* non-null. Returns true upon perfect success, false if errors * appended to errors if errors is non-null. If warnings occur
* in converting a tsheg bar, then they are written into the
* output if writeWarningsToResult is true, and also appended to
* warnings if warnings is non-null. Returns true upon perfect
* success or if there were merely warnings, false if errors
* occurred. * occurred.
* @throws IOException if we cannot write to out * @throws IOException if we cannot write to out
*/ */
public static boolean convertToTMW(ArrayList scan, String latinFont, public static boolean convertToTMW(ArrayList scan,
OutputStream out, StringBuffer errors) OutputStream out,
StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToResult,
String warningLevel)
throws IOException throws IOException
{ {
throw new Error("DLC UNIMPLEMENTED"); TibetanDocument tdoc = new TibetanDocument();
tdoc.setRomanAttributeSet("Courier", 14); // DLC make me configurable.
boolean rv
= convertToTMW(scan, tdoc, errors, warnings,
writeWarningsToResult, warningLevel);
tdoc.writeRTFOutputStream(out);
return rv;
} }
/** Converts scan to TMW/Latin inside tdoc by delegating to
 *  convertTo in TMW mode (toUnicode is false and no output stream
 *  is supplied).  Returns whatever convertTo returns.
 *  @throws IOException if convertTo throws it */
private static boolean convertToTMW(ArrayList scan,
                                    TibetanDocument tdoc,
                                    StringBuffer errors,
                                    StringBuffer warnings,
                                    boolean writeWarningsToResult,
                                    String warningLevel)
    throws IOException
{
    final boolean toUnicode = false; // i.e., convert to TMW
    final OutputStream unusedStream = null; // only used in Unicode mode
    return convertTo(toUnicode, scan, unusedStream, tdoc,
                     errors, warnings,
                     writeWarningsToResult, warningLevel);
}
// DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a // DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a
// space. Treat it as a tsheg only when it appears after a // space. Treat it as a tsheg only when it appears after a
// syllable or another tsheg. // syllable or another tsheg.
@ -130,7 +166,8 @@ public class ACIPConverter {
* or in converting a tsheg bar, then they are appended to * or in converting a tsheg bar, then they are appended to
* warnings if warnings is non-null, and they are written to the * warnings if warnings is non-null, and they are written to the
* result if writeWarningsToResult is true. Returns the * result if writeWarningsToResult is true. Returns the
* conversion upon perfect success, null if errors occurred. * conversion upon perfect success or if there were merely
* warnings, null if errors occurred.
*/ */
public static String convertToUnicode(String acip, public static String convertToUnicode(String acip,
StringBuffer errors, StringBuffer errors,
@ -174,25 +211,43 @@ public class ACIPConverter {
boolean writeWarningsToOut, boolean writeWarningsToOut,
String warningLevel) String warningLevel)
throws IOException throws IOException
{
return convertTo(true, scan, out, null, errors, warnings,
writeWarningsToOut, warningLevel);
}
private static boolean convertTo(boolean toUnicode, // else to TMW
ArrayList scan,
OutputStream out, // for toUnicode mode
TibetanDocument tdoc, // for !toUnicode mode
StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToOut,
String warningLevel)
throws IOException
{ {
int sz = scan.size(); int sz = scan.size();
boolean hasErrors = false; boolean hasErrors = false;
BufferedWriter writer BufferedWriter writer = null;
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); if (toUnicode)
writer
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
ACIPString s = (ACIPString)scan.get(i); ACIPString s = (ACIPString)scan.get(i);
int stype = s.getType(); int stype = s.getType();
if (stype == ACIPString.ERROR) { if (stype == ACIPString.ERROR) {
hasErrors = true; hasErrors = true;
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: "); String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
writer.write(s.getText()); if (null != writer) writer.write(text);
writer.write("]"); if (null != tdoc) tdoc.appendRoman(text);
} else if (stype == ACIPString.WARNING) { } else if (stype == ACIPString.WARNING) {
if (writeWarningsToOut) { if (writeWarningsToOut) {
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: "); String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
writer.write(s.getText()); if (null != writer) writer.write(text);
writer.write("]"); if (null != tdoc) tdoc.appendRoman(text);
} }
// DLC NOW: Warning: We're going with {'}{R}{DA}, but only because our knowledge of prefix rules says that {'}{R+DA} is not a legal Tibetan tsheg bar ("syllable")
if (null != warnings) { if (null != warnings) {
warnings.append("Warning: Lexical warning: "); warnings.append("Warning: Lexical warning: ");
warnings.append(s.getText()); warnings.append(s.getText());
@ -200,13 +255,15 @@ public class ACIPConverter {
} }
} else { } else {
if (s.isLatin(stype)) { if (s.isLatin(stype)) {
if (stype == ACIPString.FOLIO_MARKER) String text
writer.write("{"); = (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
writer.write(s.getText()); + s.getText()
if (stype == ACIPString.FOLIO_MARKER) + ((stype == ACIPString.FOLIO_MARKER) ? "}" : ""));
writer.write("}"); if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text);
} else { } else {
String unicode = null; String unicode = null;
DuffCode[] duff = null;
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) { if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText()); TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
String acipError; String acipError;
@ -214,7 +271,8 @@ public class ACIPConverter {
if ((acipError = pl.getACIPError()) != null) { if ((acipError = pl.getACIPError()) != null) {
hasErrors = true; hasErrors = true;
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]"; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]";
writer.write(errorMessage); if (null != writer) writer.write(errorMessage);
if (null != tdoc) tdoc.appendRoman(errorMessage);
if (null != errors) if (null != errors)
errors.append(errorMessage + "\n"); errors.append(errorMessage + "\n");
} else { } else {
@ -222,7 +280,8 @@ public class ACIPConverter {
if (null == pt) { if (null == pt) {
hasErrors = true; hasErrors = true;
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]"; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]";
writer.write(errorMessage); if (null != writer) writer.write(errorMessage);
if (null != tdoc) tdoc.appendRoman(errorMessage);
if (null != errors) if (null != errors)
errors.append(errorMessage + "\n"); errors.append(errorMessage + "\n");
} else { } else {
@ -230,7 +289,8 @@ public class ACIPConverter {
if (null == sl) { if (null == sl) {
hasErrors = true; hasErrors = true;
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]"; String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]";
writer.write(errorMessage); if (null != writer) writer.write(errorMessage);
if (null != tdoc) tdoc.appendRoman(errorMessage);
if (null != errors) if (null != errors)
errors.append(errorMessage + "\n"); errors.append(errorMessage + "\n");
} else { } else {
@ -240,36 +300,74 @@ public class ACIPConverter {
s.getText()); s.getText());
if (null != warning) { if (null != warning) {
if (writeWarningsToOut) { if (writeWarningsToOut) {
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: "); String text
writer.write(warning); = ("[#WARNING CONVERTING ACIP DOCUMENT: "
writer.write("]"); + warning + "]");
if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text);
} }
if (null != warnings) { if (null != warnings) {
warnings.append(warning); warnings.append(warning);
warnings.append('\n'); warnings.append('\n');
} }
} }
unicode = sl.getUnicode(); if (null != writer) {
if (null == unicode) throw new Error("FIXME: make this an assertion"); unicode = sl.getUnicode();
if (null == unicode) throw new Error("FIXME: make this an assertion 4");
}
if (null != tdoc) {
duff = sl.getDuff();
if (0 == duff.length) {
throw new Error("No DuffCodes for stack list " + sl); // FIXME: make this an assertion
}
}
} }
} }
} }
} else { } else {
if (stype == ACIPString.START_SLASH) if (stype == ACIPString.START_SLASH) {
unicode = "\u0F3C"; if (null != writer) unicode = "\u0F3C";
else if (stype == ACIPString.END_SLASH) if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") };
unicode = "\u0F3D"; } else if (stype == ACIPString.END_SLASH) {
else if (null != writer) unicode = "\u0F3D";
unicode = ACIPRules.getUnicodeFor(s.getText(), false); if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
if (null == unicode) throw new Error("FIXME: make this an assertion"); } else {
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
if (null != tdoc) {
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
tdoc.appendRoman(s.getText());
continue;
}
else {
String wy = ACIPRules.getWylieForACIPOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
}
}
}
if (null != writer && null == unicode)
throw new Error("FIXME: make this an assertion 1");
if (null != tdoc && (null == duff || 0 == duff.length))
throw new Error("FIXME: make this an assertion 2");
} }
if (null != unicode) { if (null != writer && null != unicode) writer.write(unicode);
writer.write(unicode); if (null != tdoc) {
if (null != duff && 0 != duff.length) {
tdoc.appendDuffCodes(duff);
// DLC NOW FIXME: use TibTextUtils.getVowel logic to make the output beautiful.
} else {
// this happens when you have an
// [#ERROR]-producing tsheg bar.
// System.err.println("Bad tsheg bar with ACIP {" + s.getText() + "}");
}
} }
} }
} }
} }
writer.close(); if (null != writer) {
writer.close();
}
return !hasErrors; return !hasErrors;
} }
} }

View file

@ -19,8 +19,12 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import java.util.HashSet; import java.util.HashSet;
import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.TibetanMachineWeb;
/** Canonizes some facts regarding the ACIP transcription system. /** Canonizes some facts regarding the ACIP transcription system.
* @author David Chandler */ * @author David Chandler */
class ACIPRules { class ACIPRules {
@ -36,7 +40,9 @@ class ACIPRules {
private static HashSet acipVowels = null; private static HashSet acipVowels = null;
private static String[][] baseVowels = new String[][] { private static String[][] baseVowels = new String[][] {
// { ACIP, EWTS, EWTS for '\'' + baseVowels[][0] }: // { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel
// numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.)
// for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]}
{ "A", "a", "A" }, { "A", "a", "A" },
{ "I", "i", "I" }, { "I", "i", "I" },
{ "U", "u", "U" }, { "U", "u", "U" },
@ -70,7 +76,7 @@ class ACIPRules {
// DLC keep this code in sync with getUnicodeFor. // DLC keep this code in sync with getUnicodeFor.
// DLC keep this code in sync with getWylieForACIPVowel // DLC keep this code in sync with getWylieForACIPVowel
// DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not? // DLC '\' for virama? how shall we do \ the virama? like a vowel or not?
} }
} }
return (acipVowels.contains(s)); return (acipVowels.contains(s));
@ -211,6 +217,39 @@ class ACIPRules {
return (String)acipVowel2wylie.get(acip); return (String)acipVowel2wylie.get(acip);
} }
/** Lazily built table mapping ACIP punctuation, marks, and digits
 *  to EWTS; null until getWylieForACIPOther is first called. */
private static HashMap acipOther2wylie = null;
/** Returns the EWTS corresponding to the given ACIP punctuation,
 *  mark, or digit.  Returns null if there is no such EWTS. */
static final String getWylieForACIPOther(String acip) {
    if (null == acipOther2wylie) {
        // DLC FIXME: check all these again.
        final String[][] acipToEwts = {
            { ",", "/" },
            { " ", " " },
            { ".", "*" },
            { "|", "|" },
            { "`", "!" },
            { ";", ";" },
            { "*", "@" },
            { "#", "@#" },
            { "%", "%" },
            { "&", "&" },
        };
        acipOther2wylie = new HashMap(37);
        for (int p = 0; p < acipToEwts.length; p++) {
            acipOther2wylie.put(acipToEwts[p][0], acipToEwts[p][1]);
        }
        // Each digit 0-9 maps to itself.
        for (char digit = '0'; digit <= '9'; digit++) {
            String d = String.valueOf(digit);
            acipOther2wylie.put(d, d);
        }
    }
    return (String)acipOther2wylie.get(acip);
}
private static HashMap superACIP2unicode = null; private static HashMap superACIP2unicode = null;
private static HashMap subACIP2unicode = null; private static HashMap subACIP2unicode = null;
/** If acip is an ACIP consonant or vowel or punctuation mark, /** If acip is an ACIP consonant or vowel or punctuation mark,
@ -416,6 +455,42 @@ class ACIPRules {
if (null != u) return u; if (null != u) return u;
} }
return (String)superACIP2unicode.get(acip); return (String)superACIP2unicode.get(acip);
}
/** DLC DOC: Gets the duffcodes for vowel, such that they look good with hashKey, and appends them to r. */
static void getDuffForACIPVowel(ArrayList r, String hashKey, String vowel) {
if (null == vowel) return;
if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert.
throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) // FIXME: expensive assertion! Use assert.
throw new IllegalArgumentException("bad hashKey");
// Order matters here.
if (vowel.indexOf("'U") >= 0)
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_U));
else {
if (vowel.indexOf('\'') >= 0)
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_A));
if (vowel.indexOf("EE") >= 0)
r.add(TibetanMachineWeb.getGlyph("ai"));
else if (vowel.indexOf('E') >= 0)
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_e));
if (vowel.indexOf("OO") >= 0)
r.add(TibetanMachineWeb.getGlyph("au"));
else if (vowel.indexOf('O') >= 0)
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_o));
if (vowel.indexOf('I') >= 0)
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_i));
if (vowel.indexOf('U') >= 0)
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_u));
if (vowel.indexOf('i') >= 0)
r.add(TibetanMachineWeb.getGlyph("-i"));
}
if (vowel.indexOf('m') >= 0)
r.add(TibetanMachineWeb.getGlyph("M"));
if (vowel.indexOf(':') >= 0)
r.add(TibetanMachineWeb.getGlyph("H"));
} }
} }

View file

@ -319,6 +319,16 @@ tstHelper("MSTAN"); // ambiguous with regard to prefix rules
tstHelper("KA'", "[(K . A), (' . )]",
new String[] { "{KA}{'}" },
new String[] { "{KA}{'}" },
"{KA}{'}"); // DLC NOW
tstHelper("A'AAMA", "{A}{'}{AA}{MA}"); // FIXME: how should we parse this?
tstHelper("K+K+KA", "{K+}{K+}{KA}");
// If you're not careful, you'll think GGYES is a legal // If you're not careful, you'll think GGYES is a legal
// Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's // Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's

View file

@ -19,6 +19,10 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.DuffCode;
import java.util.ArrayList;
/** An ordered pair used in ACIP-to-TMW conversion. The left side is /** An ordered pair used in ACIP-to-TMW conversion. The left side is
* the consonant or empty; the right side is the vowel, '+', or '-'. * the consonant or empty; the right side is the vowel, '+', or '-'.
@ -70,7 +74,9 @@ class TPair {
/** Returns an TPair that is like this one except that it is /** Returns an TPair that is like this one except that it is
* missing N characters. The characters are taken from r, the * missing N characters. The characters are taken from r, the
* right side, first and from l, the left side, second. * right side, first and from l, the left side, second. The pair
* returned may be illegal, such as the (A . ') you can get from
* ACIP {A'AAMA}.
* @throw IllegalArgumentException if N is out of range */ * @throw IllegalArgumentException if N is out of range */
TPair minusNRightmostACIPCharacters(int N) TPair minusNRightmostACIPCharacters(int N)
throws IllegalArgumentException throws IllegalArgumentException
@ -80,7 +86,7 @@ class TPair {
if (N > size()) if (N > size())
throw new IllegalArgumentException("Don't have that many to remove."); throw new IllegalArgumentException("Don't have that many to remove.");
if (N < 1) if (N < 1)
throw new IllegalArgumentException("You should't call this if you don't want to remove any."); throw new IllegalArgumentException("You shouldn't call this if you don't want to remove any.");
if (null != r && (sz = r.length()) > 0) { if (null != r && (sz = r.length()) > 0) {
int min = Math.min(sz, N); int min = Math.min(sz, N);
newR = r.substring(0, sz - min); newR = r.substring(0, sz - min);
@ -101,7 +107,7 @@ class TPair {
return false; return false;
if (null != l && !ACIPRules.isConsonant(l)) if (null != l && !ACIPRules.isConsonant(l))
return false; return false;
if (null != r && !ACIPRules.isVowel(l)) if (null != r && !ACIPRules.isVowel(r))
return false; return false;
return true; return true;
} }
@ -146,8 +152,14 @@ class TPair {
return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9'); return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9');
} }
/** Returns the EWTS Wylie that corresponds to this pair. Untested. */
String getWylie() { String getWylie() {
return getWylie(false);
}
/** Returns the EWTS Wylie that corresponds to this pair if
* justLeft is false, or the EWTS Wylie that corresponds to just
* {@link #getLeft()} if justLeft is true. */
String getWylie(boolean justLeft) {
String leftWylie = null; String leftWylie = null;
if (getLeft() != null) { if (getLeft() != null) {
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft()); leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
@ -156,6 +168,8 @@ class TPair {
leftWylie = getLeft(); leftWylie = getLeft();
} }
} }
if (null == leftWylie) leftWylie = "";
if (justLeft) return leftWylie;
String rightWylie = null; String rightWylie = null;
if ("-".equals(getRight())) if ("-".equals(getRight()))
rightWylie = "."; rightWylie = ".";
@ -163,7 +177,6 @@ class TPair {
rightWylie = "+"; rightWylie = "+";
else if (getRight() != null) else if (getRight() != null)
rightWylie = ACIPRules.getWylieForACIPVowel(getRight()); rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
if (null == leftWylie) leftWylie = "";
if (null == rightWylie) rightWylie = ""; if (null == rightWylie) rightWylie = "";
return leftWylie + rightWylie; return leftWylie + rightWylie;
} }

View file

@ -609,5 +609,24 @@ class TPairList {
} }
} }
/** Appends the DuffCodes that correspond to this grapheme cluster
 *  to duff.  Assumes this is one grapheme cluster.  The glyph is
 *  looked up by a hash key made of the full Wylie of every pair
 *  but the last, followed by the Wylie of the last pair's left
 *  side only; if that key is unknown, the key with each '+'
 *  replaced by '-' is tried instead.  The DuffCodes for the last
 *  pair's right side are then appended via
 *  ACIPRules.getDuffForACIPVowel.
 *  @param duff the list to which the DuffCodes are appended
 *  @throws Error if neither form of the hash key is known */
void getDuff(ArrayList duff) {
    final int lastIndex = size() - 1;
    StringBuffer key = new StringBuffer();
    for (int i = 0; i < lastIndex; i++) {
        key.append(get(i).getWylie(false));
    }
    // The last pair contributes only its left side to the key; its
    // right side is handled separately below.
    TPair finalPair = get(lastIndex);
    key.append(finalPair.getWylie(true));

    String hashKey = key.toString();
    if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
        hashKey = hashKey.replace('+', '-');
        if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
            throw new Error("How did this happen?");
        }
    }
    duff.add(TibetanMachineWeb.getGlyph(hashKey));
    ACIPRules.getDuffForACIPVowel(duff, hashKey, finalPair.getRight());
}
} }
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx. // DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.

View file

@ -20,6 +20,7 @@ package org.thdl.tib.text.ttt;
import org.thdl.tib.text.TibTextUtils; import org.thdl.tib.text.TibTextUtils;
import org.thdl.tib.text.TGCList; import org.thdl.tib.text.TGCList;
import org.thdl.tib.text.DuffCode;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.ListIterator; import java.util.ListIterator;
@ -216,8 +217,21 @@ class TStackList {
} }
return u.toString(); return u.toString();
} }
/** Returns the DuffCodes for this stack list, gathered by asking
 *  each element, in order, to append its DuffCodes to a shared
 *  list.  Returns an empty array if this list has no elements.
 *  @throws Error if this list is non-empty and yet no DuffCodes
 *  were produced */
DuffCode[] getDuff() {
    ArrayList al = new ArrayList(size()*2); // rough estimate
    for (int i = 0; i < size(); i++) {
        get(i).getDuff(al);
    }
    // A non-empty stack list that yields no glyphs is treated as an
    // internal error rather than silently returning nothing.
    if (size() > 0 && al.size() == 0) {
        throw new Error("But this stack list, " + this + ", contains " + size() + " stacks!  How can it not have DuffCodes associated with it?");
    }
    return (DuffCode[])al.toArray(new DuffCode[] { });
}
} }
/** Too simple to comment. */
class BoolPair { class BoolPair {
boolean isLegal; boolean isLegal;
boolean isLegalAndHasAVowelOnRoot; boolean isLegalAndHasAVowelOnRoot;