Preliminary ACIP->TMW support is in place. {DU} gives you something
less beautiful than what Jskad would give, so more work is needed.
This commit is contained in:
parent
1f4d53be2e
commit
045c4069c9
12 changed files with 355 additions and 64 deletions
|
@ -69,7 +69,8 @@ class ConvertDialog extends JDialog
|
|||
ConvertDialog.this.theRealActionPerformed(e);
|
||||
}};
|
||||
private void updateWarningLevels() {
|
||||
if (choices.getSelectedItem() == ACIP_TO_UNI)
|
||||
if (choices.getSelectedItem() == ACIP_TO_UNI
|
||||
|| choices.getSelectedItem() == ACIP_TO_TMW)
|
||||
this.warningLevels.enable();
|
||||
else
|
||||
this.warningLevels.disable();
|
||||
|
@ -418,7 +419,7 @@ class ConvertDialog extends JDialog
|
|||
newFileNamePrefix = suggested_WYLIE_prefix;
|
||||
} else if (TMW_TO_UNI == ct || ACIP_TO_UNI == ct) {
|
||||
newFileNamePrefix = suggested_TO_UNI_prefix;
|
||||
} else if (TM_TO_TMW == ct) {
|
||||
} else if (TM_TO_TMW == ct || ACIP_TO_TMW == ct) {
|
||||
newFileNamePrefix = suggested_TO_TMW_prefix;
|
||||
} else {
|
||||
ThdlDebug.verify(TMW_TO_TM == ct);
|
||||
|
|
|
@ -615,10 +615,8 @@ public class DuffPane extends TibetanPane implements FocusListener {
|
|||
*/
|
||||
public void setRomanAttributeSet(String font, int size) {
|
||||
if (getTibDoc() != null) {
|
||||
SimpleAttributeSet ras = new SimpleAttributeSet();
|
||||
StyleConstants.setFontFamily(ras, romanFontFamily = font);
|
||||
StyleConstants.setFontSize(ras, romanFontSize = size);
|
||||
getTibDoc().setRomanAttributeSet(ras);
|
||||
getTibDoc().setRomanAttributeSet(romanFontFamily = font,
|
||||
romanFontSize = size);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.awt.*;
|
|||
interface FontConverterConstants
|
||||
{
|
||||
final String ACIP_TO_UNI = "ACIP to Unicode";
|
||||
final String ACIP_TO_TMW = "ACIP to TMW";
|
||||
final String TM_TO_TMW = "TM to TMW";
|
||||
final String TMW_TO_UNI = "TMW to Unicode";
|
||||
final String TMW_TO_WYLIE = "TMW to Wylie";
|
||||
|
@ -36,7 +37,9 @@ interface FontConverterConstants
|
|||
final String FIND_ALL_NON_TMW = "Find all non-TMW";
|
||||
final String FIND_ALL_NON_TM = "Find all non-TM";
|
||||
|
||||
final String[] CHOICES = new String[]{
|
||||
final String[] CHOICES = new String[] {
|
||||
ACIP_TO_UNI,
|
||||
ACIP_TO_TMW,
|
||||
TM_TO_TMW,
|
||||
TMW_TO_UNI,
|
||||
TMW_TO_WYLIE,
|
||||
|
|
|
@ -71,6 +71,7 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
boolean convertToUnicodeMode = false;
|
||||
boolean convertToTMMode = false;
|
||||
boolean convertACIPToUniMode = false;
|
||||
boolean convertACIPToTMWMode = false;
|
||||
boolean convertToTMWMode = false;
|
||||
boolean convertToWylieMode = false;
|
||||
boolean findSomeNonTMWMode = false;
|
||||
|
@ -91,6 +92,8 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
= args[0].equals("--to-tibetan-machine-web"))
|
||||
|| (convertACIPToUniMode
|
||||
= args[0].equals("--acip-to-unicode"))
|
||||
|| (convertACIPToTMWMode
|
||||
= args[0].equals("--acip-to-tmw"))
|
||||
|| (convertToUnicodeMode
|
||||
= args[0].equals("--to-unicode"))
|
||||
|| (convertToWylieMode
|
||||
|
@ -180,6 +183,8 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
conversionTag = TM_TO_TMW;
|
||||
} else if (convertACIPToUniMode) {
|
||||
conversionTag = ACIP_TO_UNI;
|
||||
} else if (convertACIPToTMWMode) {
|
||||
conversionTag = ACIP_TO_TMW;
|
||||
} else {
|
||||
ThdlDebug.verify(convertToTMMode);
|
||||
conversionTag = TMW_TO_TM;
|
||||
|
@ -205,7 +210,7 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
honored. */
|
||||
static int reallyConvert(InputStream in, PrintStream out, String ct,
|
||||
String warningLevel) {
|
||||
if (ACIP_TO_UNI == ct) {
|
||||
if (ACIP_TO_UNI == ct || ACIP_TO_TMW == ct) {
|
||||
try {
|
||||
ArrayList al = ACIPTshegBarScanner.scanStream(in, null,
|
||||
250 - 1 // DLC FIXME: make me configurable
|
||||
|
@ -214,10 +219,17 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
return 47;
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
boolean embeddedWarnings = (warningLevel != "None");
|
||||
if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
|
||||
if (ACIP_TO_UNI == ct) {
|
||||
if (!ACIPConverter.convertToUnicode(al, out, null, warnings,
|
||||
embeddedWarnings,
|
||||
warningLevel))
|
||||
return 46;
|
||||
} else {
|
||||
if (!ACIPConverter.convertToTMW(al, out, null, warnings,
|
||||
embeddedWarnings,
|
||||
warningLevel))
|
||||
return 46;
|
||||
return 46;
|
||||
}
|
||||
if (embeddedWarnings && warnings.length() > 0)
|
||||
return 45;
|
||||
else
|
||||
|
|
|
@ -141,6 +141,32 @@ public class TibetanDocument extends DefaultStyledDocument {
|
|||
appendDuff(tibetanFontSize, offset, s, attr);
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts Latin text into the document. The font size is applied
|
||||
* automatically, according to the current Roman font size.
|
||||
* @param offset the position at which you want to insert text
|
||||
* @param s the string you want to insert
|
||||
* @see #setRomanAttributeSet(AttributeSet)
|
||||
*/
|
||||
public void appendRoman(int offset, String s) throws BadLocationException {
|
||||
ThdlDebug.verify(getRomanAttributeSet() != null);
|
||||
insertString(offset, s, getRomanAttributeSet());
|
||||
}
|
||||
|
||||
/**
|
||||
* Inserts Latin text at the end of the document. The font size is
|
||||
* applied automatically, according to the current Roman font size.
|
||||
* @param s the string you want to insert
|
||||
* @see #setRomanAttributeSet(AttributeSet)
|
||||
*/
|
||||
public void appendRoman(String s) {
|
||||
try {
|
||||
appendRoman(getLength(), s);
|
||||
} catch (BadLocationException e) {
|
||||
throw new Error("can't happen");
|
||||
}
|
||||
}
|
||||
|
||||
private void appendDuff(int fontSize, int offset, String s, MutableAttributeSet attr) {
|
||||
try {
|
||||
StyleConstants.setFontSize(attr, fontSize);
|
||||
|
@ -160,6 +186,19 @@ public class TibetanDocument extends DefaultStyledDocument {
|
|||
return insertDuff(tibetanFontSize, pos, glyphs, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Appends all DuffCodes in glyphs to the end of this document.
|
||||
*/
|
||||
public void appendDuffCodes(DuffCode[] glyphs) {
|
||||
// PERFORMANCE FIXME: this isn't so speedy, but it reuses
|
||||
// existing code.
|
||||
for (int i = 0; i < glyphs.length; i++) {
|
||||
insertDuff(getLength(),
|
||||
new DuffData[] { new DuffData(new String(new char[] { glyphs[i].getCharacter() }),
|
||||
glyphs[i].getFontNum()) });
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/** Replacing can be more efficient than inserting and then
|
||||
removing. This replaces the glyph at position pos with glyph,
|
||||
|
@ -1039,6 +1078,15 @@ public class TibetanDocument extends DefaultStyledDocument {
|
|||
romanAttributeSet = ras;
|
||||
}
|
||||
|
||||
/** Sets the attribute set applied to Roman text in this
|
||||
document. */
|
||||
public void setRomanAttributeSet(String font, int size) {
|
||||
SimpleAttributeSet ras = new SimpleAttributeSet();
|
||||
StyleConstants.setFontFamily(ras, font);
|
||||
StyleConstants.setFontSize(ras, size);
|
||||
setRomanAttributeSet(ras);
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts the specified portion of this document to THDL Extended
|
||||
* Wylie.
|
||||
|
|
|
@ -29,9 +29,9 @@ $~38,5~~9,41~~~~~~~0F06
|
|||
#~200,1~~9,39~~~~~~~0F05
|
||||
// Yig.mgo.tsheg.shad:
|
||||
%~39,5~~9,42~~~~~~~0F07
|
||||
// dbu.khang.g-yon:
|
||||
// dbu.khang.g-yon: (If this changes, edit ACIPConverter)
|
||||
(~208,1~~9,93~~~~~~~0F3C
|
||||
// dbu.khang.g-yas:
|
||||
// dbu.khang.g-yas: (If this changes, edit ACIPConverter)
|
||||
)~209,1~~9,94~~~~~~~0F3D
|
||||
H~239,1~~8,92~~~~~~~0F7F
|
||||
|
||||
|
|
|
@ -24,10 +24,14 @@ import java.util.Stack;
|
|||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
import org.thdl.util.ThdlOptions;
|
||||
import org.thdl.tib.text.TibetanDocument;
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
|
||||
/**
|
||||
* This class is able to convert an ACIP file into Tibetan Machine Web.
|
||||
* From there, TMW->Unicode takes you to Unicode.
|
||||
* This class is able to convert an ACIP file into Unicode
|
||||
* and an ACIP file into TMW. ACIP->Unicode should yield the same
|
||||
* results as ACIP->TMW followed by TMW->Unicode (FIXME: test it!)
|
||||
* @author David Chandler
|
||||
*/
|
||||
public class ACIPConverter {
|
||||
|
@ -86,38 +90,70 @@ public class ACIPConverter {
|
|||
warnings = new StringBuffer();
|
||||
putWarningsInOutput = true;
|
||||
}
|
||||
convertToUnicode(al, System.out, errors, warnings,
|
||||
putWarningsInOutput, warningLevel);
|
||||
convertToTMW(al, System.out, errors, warnings,
|
||||
putWarningsInOutput, warningLevel);
|
||||
int retCode = 0;
|
||||
if (errors.length() > 0) {
|
||||
System.err.println("Errors converting ACIP input file: ");
|
||||
System.err.println(errors);
|
||||
System.err.println("The output contains these errors.");
|
||||
System.err.println("Exiting; please fix input file and try again.");
|
||||
System.exit(2);
|
||||
retCode = 2;
|
||||
}
|
||||
if (null != warnings && warnings.length() > 0) {
|
||||
System.err.println("Warnings converting ACIP input file: ");
|
||||
System.err.println(warnings);
|
||||
if (putWarningsInOutput)
|
||||
System.err.println("The output contains these warnings.");
|
||||
System.exit(2);
|
||||
retCode = 2;
|
||||
}
|
||||
if (verbose) System.err.println("Converted " + args[0] + " perfectly.");
|
||||
System.exit(0);
|
||||
if (0 == retCode) {
|
||||
if (verbose) System.err.println("Converted " + args[0] + " perfectly.");
|
||||
}
|
||||
System.exit(retCode);
|
||||
// DLC NOW: tRAStA is not converted correctly to Unicode, and
|
||||
// no warning is given when converting to TMW.
|
||||
}
|
||||
|
||||
/** Writes TMW/Latin to out. If errors occur in converting a
|
||||
* tsheg bar, then they are appended to errors if errors is
|
||||
* non-null. Returns true upon perfect success, false if errors
|
||||
* tsheg bar, then they are written into the output, and also
|
||||
* appended to errors if errors is non-null. If warnings occur
|
||||
* in converting a tsheg bar, then they are written into the
|
||||
* output if writeWarningsToResult is true, and also appended to
|
||||
* warnings if warnings is non-null. Returns true upon perfect
|
||||
* success or if there were merely warnings, false if errors
|
||||
* occurred.
|
||||
* @throws IOException if we cannot write to out
|
||||
*/
|
||||
public static boolean convertToTMW(ArrayList scan, String latinFont,
|
||||
OutputStream out, StringBuffer errors)
|
||||
public static boolean convertToTMW(ArrayList scan,
|
||||
OutputStream out,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToResult,
|
||||
String warningLevel)
|
||||
throws IOException
|
||||
{
|
||||
throw new Error("DLC UNIMPLEMENTED");
|
||||
TibetanDocument tdoc = new TibetanDocument();
|
||||
tdoc.setRomanAttributeSet("Courier", 14); // DLC make me configurable.
|
||||
boolean rv
|
||||
= convertToTMW(scan, tdoc, errors, warnings,
|
||||
writeWarningsToResult, warningLevel);
|
||||
tdoc.writeRTFOutputStream(out);
|
||||
return rv;
|
||||
}
|
||||
|
||||
private static boolean convertToTMW(ArrayList scan,
|
||||
TibetanDocument tdoc,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToResult,
|
||||
String warningLevel)
|
||||
throws IOException
|
||||
{
|
||||
return convertTo(false, scan, null, tdoc, errors, warnings,
|
||||
writeWarningsToResult, warningLevel);
|
||||
}
|
||||
|
||||
// DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a
|
||||
// space. Treat it as a tsheg only when it appears after a
|
||||
// syllable or another tsheg.
|
||||
|
@ -130,7 +166,8 @@ public class ACIPConverter {
|
|||
* or in converting a tsheg bar, then they are appended to
|
||||
* warnings if warnings is non-null, and they are written to the
|
||||
* result if writeWarningsToResult is true. Returns the
|
||||
* conversion upon perfect success, null if errors occurred.
|
||||
* conversion upon perfect success or if there were merely
|
||||
* warnings, null if errors occurred.
|
||||
*/
|
||||
public static String convertToUnicode(String acip,
|
||||
StringBuffer errors,
|
||||
|
@ -174,25 +211,43 @@ public class ACIPConverter {
|
|||
boolean writeWarningsToOut,
|
||||
String warningLevel)
|
||||
throws IOException
|
||||
{
|
||||
return convertTo(true, scan, out, null, errors, warnings,
|
||||
writeWarningsToOut, warningLevel);
|
||||
}
|
||||
|
||||
private static boolean convertTo(boolean toUnicode, // else to TMW
|
||||
ArrayList scan,
|
||||
OutputStream out, // for toUnicode mode
|
||||
TibetanDocument tdoc, // for !toUnicode mode
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToOut,
|
||||
String warningLevel)
|
||||
throws IOException
|
||||
{
|
||||
int sz = scan.size();
|
||||
boolean hasErrors = false;
|
||||
BufferedWriter writer
|
||||
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
|
||||
BufferedWriter writer = null;
|
||||
if (toUnicode)
|
||||
writer
|
||||
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
|
||||
for (int i = 0; i < sz; i++) {
|
||||
ACIPString s = (ACIPString)scan.get(i);
|
||||
int stype = s.getType();
|
||||
if (stype == ACIPString.ERROR) {
|
||||
hasErrors = true;
|
||||
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: ");
|
||||
writer.write(s.getText());
|
||||
writer.write("]");
|
||||
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
|
||||
if (null != writer) writer.write(text);
|
||||
if (null != tdoc) tdoc.appendRoman(text);
|
||||
} else if (stype == ACIPString.WARNING) {
|
||||
if (writeWarningsToOut) {
|
||||
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: ");
|
||||
writer.write(s.getText());
|
||||
writer.write("]");
|
||||
String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
|
||||
if (null != writer) writer.write(text);
|
||||
if (null != tdoc) tdoc.appendRoman(text);
|
||||
}
|
||||
// DLC NOW: Warning: We're going with {'}{R}{DA}, but only because our knowledge of prefix rules says that {'}{R+DA} is not a legal Tibetan tsheg bar ("syllable")
|
||||
|
||||
if (null != warnings) {
|
||||
warnings.append("Warning: Lexical warning: ");
|
||||
warnings.append(s.getText());
|
||||
|
@ -200,13 +255,15 @@ public class ACIPConverter {
|
|||
}
|
||||
} else {
|
||||
if (s.isLatin(stype)) {
|
||||
if (stype == ACIPString.FOLIO_MARKER)
|
||||
writer.write("{");
|
||||
writer.write(s.getText());
|
||||
if (stype == ACIPString.FOLIO_MARKER)
|
||||
writer.write("}");
|
||||
String text
|
||||
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
|
||||
+ s.getText()
|
||||
+ ((stype == ACIPString.FOLIO_MARKER) ? "}" : ""));
|
||||
if (null != writer) writer.write(text);
|
||||
if (null != tdoc) tdoc.appendRoman(text);
|
||||
} else {
|
||||
String unicode = null;
|
||||
DuffCode[] duff = null;
|
||||
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
||||
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
|
||||
String acipError;
|
||||
|
@ -214,7 +271,8 @@ public class ACIPConverter {
|
|||
if ((acipError = pl.getACIPError()) != null) {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]";
|
||||
writer.write(errorMessage);
|
||||
if (null != writer) writer.write(errorMessage);
|
||||
if (null != tdoc) tdoc.appendRoman(errorMessage);
|
||||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
|
@ -222,7 +280,8 @@ public class ACIPConverter {
|
|||
if (null == pt) {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]";
|
||||
writer.write(errorMessage);
|
||||
if (null != writer) writer.write(errorMessage);
|
||||
if (null != tdoc) tdoc.appendRoman(errorMessage);
|
||||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
|
@ -230,7 +289,8 @@ public class ACIPConverter {
|
|||
if (null == sl) {
|
||||
hasErrors = true;
|
||||
String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]";
|
||||
writer.write(errorMessage);
|
||||
if (null != writer) writer.write(errorMessage);
|
||||
if (null != tdoc) tdoc.appendRoman(errorMessage);
|
||||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
|
@ -240,36 +300,74 @@ public class ACIPConverter {
|
|||
s.getText());
|
||||
if (null != warning) {
|
||||
if (writeWarningsToOut) {
|
||||
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: ");
|
||||
writer.write(warning);
|
||||
writer.write("]");
|
||||
String text
|
||||
= ("[#WARNING CONVERTING ACIP DOCUMENT: "
|
||||
+ warning + "]");
|
||||
if (null != writer) writer.write(text);
|
||||
if (null != tdoc) tdoc.appendRoman(text);
|
||||
}
|
||||
if (null != warnings) {
|
||||
warnings.append(warning);
|
||||
warnings.append('\n');
|
||||
}
|
||||
}
|
||||
unicode = sl.getUnicode();
|
||||
if (null == unicode) throw new Error("FIXME: make this an assertion");
|
||||
if (null != writer) {
|
||||
unicode = sl.getUnicode();
|
||||
if (null == unicode) throw new Error("FIXME: make this an assertion 4");
|
||||
}
|
||||
if (null != tdoc) {
|
||||
duff = sl.getDuff();
|
||||
if (0 == duff.length) {
|
||||
throw new Error("No DuffCodes for stack list " + sl); // FIXME: make this an assertion
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (stype == ACIPString.START_SLASH)
|
||||
unicode = "\u0F3C";
|
||||
else if (stype == ACIPString.END_SLASH)
|
||||
unicode = "\u0F3D";
|
||||
else
|
||||
unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null == unicode) throw new Error("FIXME: make this an assertion");
|
||||
if (stype == ACIPString.START_SLASH) {
|
||||
if (null != writer) unicode = "\u0F3C";
|
||||
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") };
|
||||
} else if (stype == ACIPString.END_SLASH) {
|
||||
if (null != writer) unicode = "\u0F3D";
|
||||
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
|
||||
} else {
|
||||
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null != tdoc) {
|
||||
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
|
||||
tdoc.appendRoman(s.getText());
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
||||
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
||||
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
|
||||
}
|
||||
}
|
||||
}
|
||||
if (null != writer && null == unicode)
|
||||
throw new Error("FIXME: make this an assertion 1");
|
||||
if (null != tdoc && (null == duff || 0 == duff.length))
|
||||
throw new Error("FIXME: make this an assertion 2");
|
||||
}
|
||||
if (null != unicode) {
|
||||
writer.write(unicode);
|
||||
if (null != writer && null != unicode) writer.write(unicode);
|
||||
if (null != tdoc) {
|
||||
if (null != duff && 0 != duff.length) {
|
||||
tdoc.appendDuffCodes(duff);
|
||||
// DLC NOW FIXME: use TibTextUtils.getVowel logic to make the output beautiful.
|
||||
} else {
|
||||
// this happens when you have an
|
||||
// [#ERROR]-producing tsheg bar.
|
||||
|
||||
// System.err.println("Bad tsheg bar with ACIP {" + s.getText() + "}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
writer.close();
|
||||
if (null != writer) {
|
||||
writer.close();
|
||||
}
|
||||
return !hasErrors;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,8 +19,12 @@ Contributor(s): ______________________________________.
|
|||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
|
||||
/** Canonizes some facts regarding the ACIP transcription system.
|
||||
* @author David Chandler */
|
||||
class ACIPRules {
|
||||
|
@ -36,7 +40,9 @@ class ACIPRules {
|
|||
private static HashSet acipVowels = null;
|
||||
|
||||
private static String[][] baseVowels = new String[][] {
|
||||
// { ACIP, EWTS, EWTS for '\'' + baseVowels[][0] }:
|
||||
// { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel
|
||||
// numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.)
|
||||
// for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]}
|
||||
{ "A", "a", "A" },
|
||||
{ "I", "i", "I" },
|
||||
{ "U", "u", "U" },
|
||||
|
@ -70,7 +76,7 @@ class ACIPRules {
|
|||
// DLC keep this code in sync with getUnicodeFor.
|
||||
// DLC keep this code in sync with getWylieForACIPVowel
|
||||
|
||||
// DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
|
||||
// DLC '\' for virama? how shall we do \ the virama? like a vowel or not?
|
||||
}
|
||||
}
|
||||
return (acipVowels.contains(s));
|
||||
|
@ -211,6 +217,39 @@ class ACIPRules {
|
|||
return (String)acipVowel2wylie.get(acip);
|
||||
}
|
||||
|
||||
private static HashMap acipOther2wylie = null;
|
||||
/** Returns the EWTS corresponding to the given ACIP puncuation or
|
||||
* mark. Returns null if there is no such EWTS. */
|
||||
static final String getWylieForACIPOther(String acip) {
|
||||
if (acipOther2wylie == null) {
|
||||
acipOther2wylie = new HashMap(37);
|
||||
|
||||
// DLC FIXME: check all these again.
|
||||
acipOther2wylie.put(",", "/");
|
||||
acipOther2wylie.put(" ", " ");
|
||||
acipOther2wylie.put(".", "*");
|
||||
acipOther2wylie.put("|", "|");
|
||||
acipOther2wylie.put("`", "!");
|
||||
acipOther2wylie.put(";", ";");
|
||||
acipOther2wylie.put("*", "@");
|
||||
acipOther2wylie.put("#", "@#");
|
||||
acipOther2wylie.put("%", "%");
|
||||
acipOther2wylie.put("&", "&");
|
||||
|
||||
acipOther2wylie.put("0", "0");
|
||||
acipOther2wylie.put("1", "1");
|
||||
acipOther2wylie.put("2", "2");
|
||||
acipOther2wylie.put("3", "3");
|
||||
acipOther2wylie.put("4", "4");
|
||||
acipOther2wylie.put("5", "5");
|
||||
acipOther2wylie.put("6", "6");
|
||||
acipOther2wylie.put("7", "7");
|
||||
acipOther2wylie.put("8", "8");
|
||||
acipOther2wylie.put("9", "9");
|
||||
}
|
||||
return (String)acipOther2wylie.get(acip);
|
||||
}
|
||||
|
||||
private static HashMap superACIP2unicode = null;
|
||||
private static HashMap subACIP2unicode = null;
|
||||
/** If acip is an ACIP consonant or vowel or punctuation mark,
|
||||
|
@ -416,6 +455,42 @@ class ACIPRules {
|
|||
if (null != u) return u;
|
||||
}
|
||||
return (String)superACIP2unicode.get(acip);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** DLC DOC: Gets the duffcodes for vowel, such that they look good with hashKey, and appends them to r. */
|
||||
static void getDuffForACIPVowel(ArrayList r, String hashKey, String vowel) {
|
||||
if (null == vowel) return;
|
||||
if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert.
|
||||
throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
|
||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) // FIXME: expensive assertion! Use assert.
|
||||
throw new IllegalArgumentException("bad hashKey");
|
||||
|
||||
// Order matters here.
|
||||
if (vowel.indexOf("'U") >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_U));
|
||||
else {
|
||||
if (vowel.indexOf('\'') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_A));
|
||||
if (vowel.indexOf("EE") >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("ai"));
|
||||
else if (vowel.indexOf('E') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_e));
|
||||
if (vowel.indexOf("OO") >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("au"));
|
||||
else if (vowel.indexOf('O') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_o));
|
||||
if (vowel.indexOf('I') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_i));
|
||||
if (vowel.indexOf('U') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_u));
|
||||
if (vowel.indexOf('i') >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("-i"));
|
||||
}
|
||||
if (vowel.indexOf('m') >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("M"));
|
||||
if (vowel.indexOf(':') >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("H"));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -319,6 +319,16 @@ tstHelper("MSTAN"); // ambiguous with regard to prefix rules
|
|||
|
||||
|
||||
|
||||
tstHelper("KA'", "[(K . A), (' . )]",
|
||||
new String[] { "{KA}{'}" },
|
||||
new String[] { "{KA}{'}" },
|
||||
"{KA}{'}"); // DLC NOW
|
||||
|
||||
tstHelper("A'AAMA", "{A}{'}{AA}{MA}"); // FIXME: how should we parse this?
|
||||
|
||||
tstHelper("K+K+KA", "{K+}{K+}{KA}");
|
||||
|
||||
|
||||
|
||||
// If you're not careful, you'll think GGYES is a legal
|
||||
// Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's
|
||||
|
|
|
@ -19,6 +19,10 @@ Contributor(s): ______________________________________.
|
|||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** An ordered pair used in ACIP-to-TMW conversion. The left side is
|
||||
* the consonant or empty; the right side is the vowel, '+', or '-'.
|
||||
|
@ -70,7 +74,9 @@ class TPair {
|
|||
|
||||
/** Returns an TPair that is like this one except that it is
|
||||
* missing N characters. The characters are taken from r, the
|
||||
* right side, first and from l, the left side, second.
|
||||
* right side, first and from l, the left side, second. The pair
|
||||
* returned may be illegal, such as the (A . ') you can get from
|
||||
* ACIP {A'AAMA}.
|
||||
* @throws IllegalArgumentException if N is out of range */
|
||||
TPair minusNRightmostACIPCharacters(int N)
|
||||
throws IllegalArgumentException
|
||||
|
@ -80,7 +86,7 @@ class TPair {
|
|||
if (N > size())
|
||||
throw new IllegalArgumentException("Don't have that many to remove.");
|
||||
if (N < 1)
|
||||
throw new IllegalArgumentException("You should't call this if you don't want to remove any.");
|
||||
throw new IllegalArgumentException("You shouldn't call this if you don't want to remove any.");
|
||||
if (null != r && (sz = r.length()) > 0) {
|
||||
int min = Math.min(sz, N);
|
||||
newR = r.substring(0, sz - min);
|
||||
|
@ -101,7 +107,7 @@ class TPair {
|
|||
return false;
|
||||
if (null != l && !ACIPRules.isConsonant(l))
|
||||
return false;
|
||||
if (null != r && !ACIPRules.isVowel(l))
|
||||
if (null != r && !ACIPRules.isVowel(r))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
@ -146,8 +152,14 @@ class TPair {
|
|||
return (l != null && l.length() == 1 && (ch = l.charAt(0)) >= '0' && ch <= '9');
|
||||
}
|
||||
|
||||
/** Returns the EWTS Wylie that corresponds to this pair. Untested. */
|
||||
String getWylie() {
|
||||
return getWylie(false);
|
||||
}
|
||||
|
||||
/** Returns the EWTS Wylie that corresponds to this pair if
|
||||
* justLeft is false, or the EWTS Wylie that corresponds to just
|
||||
* {@link #getLeft()} if justLeft is true. */
|
||||
String getWylie(boolean justLeft) {
|
||||
String leftWylie = null;
|
||||
if (getLeft() != null) {
|
||||
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
|
||||
|
@ -156,6 +168,8 @@ class TPair {
|
|||
leftWylie = getLeft();
|
||||
}
|
||||
}
|
||||
if (null == leftWylie) leftWylie = "";
|
||||
if (justLeft) return leftWylie;
|
||||
String rightWylie = null;
|
||||
if ("-".equals(getRight()))
|
||||
rightWylie = ".";
|
||||
|
@ -163,7 +177,6 @@ class TPair {
|
|||
rightWylie = "+";
|
||||
else if (getRight() != null)
|
||||
rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
|
||||
if (null == leftWylie) leftWylie = "";
|
||||
if (null == rightWylie) rightWylie = "";
|
||||
return leftWylie + rightWylie;
|
||||
}
|
||||
|
|
|
@ -609,5 +609,24 @@ class TPairList {
|
|||
}
|
||||
}
|
||||
|
||||
/** Appends the DuffCodes that correspond to this grapheme cluster
|
||||
* to duff. Assumes this is one grapheme cluster. */
|
||||
void getDuff(ArrayList duff) {
|
||||
StringBuffer wylieForConsonant = new StringBuffer();
|
||||
for (int x = 0; x + 1 < size(); x++) {
|
||||
wylieForConsonant.append(get(x).getWylie(false));
|
||||
}
|
||||
TPair lastPair = get(size() - 1);
|
||||
wylieForConsonant.append(lastPair.getWylie(true));
|
||||
String hashKey = wylieForConsonant.toString();
|
||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
||||
hashKey = hashKey.replace('+', '-');
|
||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) {
|
||||
throw new Error("How did this happen?");
|
||||
}
|
||||
}
|
||||
duff.add(TibetanMachineWeb.getGlyph(hashKey));
|
||||
ACIPRules.getDuffForACIPVowel(duff, hashKey, lastPair.getRight());
|
||||
}
|
||||
}
|
||||
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.thdl.tib.text.ttt;
|
|||
|
||||
import org.thdl.tib.text.TibTextUtils;
|
||||
import org.thdl.tib.text.TGCList;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.ListIterator;
|
||||
|
@ -216,8 +217,21 @@ class TStackList {
|
|||
}
|
||||
return u.toString();
|
||||
}
|
||||
/** DLC DOC */
|
||||
DuffCode[] getDuff() {
|
||||
ArrayList al = new ArrayList(size()*2); // rough estimate
|
||||
int count = 0;
|
||||
for (int i = 0; i < size(); i++) {
|
||||
get(i).getDuff(al);
|
||||
}
|
||||
if (size() > 0 && al.size() == 0) {
|
||||
throw new Error("But this stack list, " + this + ", contains " + size() + " stacks! How can it not have DuffCodes associated with it?");
|
||||
}
|
||||
return (DuffCode[])al.toArray(new DuffCode[] { });
|
||||
}
|
||||
}
|
||||
|
||||
/** Too simple to comment. */
|
||||
class BoolPair {
|
||||
boolean isLegal;
|
||||
boolean isLegalAndHasAVowelOnRoot;
|
||||
|
|
Loading…
Reference in a new issue