ACIP->TMW and ACIP->Unicode are now smart about when a newline is really a
newline and when a space is really a tsheg. The space in {KA ,MDO} is a
tsheg, but the space in {GA ,MDO} is not.
This commit is contained in:
dchandler 2003-09-04 04:04:21 +00:00
parent 72e531e515
commit d2749cecd0
2 changed files with 100 additions and 18 deletions

View file

@ -35,13 +35,6 @@ import org.thdl.tib.text.DuffCode;
* @author David Chandler * @author David Chandler
*/ */
public class ACIPConverter { public class ACIPConverter {
// Static initializer: sets three THDL user preferences when this class is
// loaded, before any of its static methods run.
static {
// We don't want to load the TM or TMW font files ourselves:
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
// Also switches on the "thdl.debug" preference.
// NOTE(review): presumably this enables debug-mode behavior elsewhere in
// THDL; its exact effect is not visible here -- confirm.
ThdlOptions.setUserPreference("thdl.debug", true);
}
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR. // DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
/** Command-line converter. Gives error messages on standard /** Command-line converter. Gives error messages on standard
@ -52,6 +45,11 @@ public class ACIPConverter {
public static void main(String[] args) public static void main(String[] args)
throws IOException throws IOException
{ {
// We don't want to load the TM or TMW font files ourselves:
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
ThdlOptions.setUserPreference("thdl.debug", true);
boolean verbose = true; boolean verbose = true;
if (args.length != 1) { if (args.length != 1) {
System.out.println("Bad args! Need just the name of the ACIP text file."); System.out.println("Bad args! Need just the name of the ACIP text file.");
@ -216,6 +214,24 @@ public class ACIPConverter {
writeWarningsToOut, warningLevel); writeWarningsToOut, warningLevel);
} }
/** Looks ahead in <code>scan</code>, beginning at index
 *  <code>pos</code>, and reports whether the upcoming tokens are zero
 *  or more TIBETAN_PUNCTUATION spaces (" ") immediately followed by a
 *  TIBETAN_PUNCTUATION comma (",").
 *  @param scan the list of ACIPString tokens being converted
 *  @param pos the index of the first token to inspect
 *  @return true if a comma is found after skipping only spaces; false
 *  if any other token is met first or if the list ends before a comma
 *  is seen */
private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of ACIPString */ scan,
                                                    int pos) {
    for (int idx = pos, end = scan.size(); idx < end; idx++) {
        ACIPString token = (ACIPString)scan.get(idx);
        // Anything that is not Tibetan punctuation ends the search.
        if (token.getType() != ACIPString.TIBETAN_PUNCTUATION)
            return false;
        String text = token.getText();
        // Spaces are skipped; the first non-space punctuation decides.
        if (text.equals(" "))
            continue;
        return text.equals(",");
    }
    // Ran off the end of the list without meeting a comma.
    return false;
}
private static boolean convertTo(boolean toUnicode, // else to TMW private static boolean convertTo(boolean toUnicode, // else to TMW
ArrayList scan, ArrayList scan,
OutputStream out, // for toUnicode mode OutputStream out, // for toUnicode mode
@ -232,15 +248,21 @@ public class ACIPConverter {
if (toUnicode) if (toUnicode)
writer writer
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
boolean lastGuyWasNonPunct = false;
TStackList lastGuy = null;
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
ACIPString s = (ACIPString)scan.get(i); ACIPString s = (ACIPString)scan.get(i);
int stype = s.getType(); int stype = s.getType();
if (stype == ACIPString.ERROR) { if (stype == ACIPString.ERROR) {
lastGuyWasNonPunct = false;
lastGuy = null;
hasErrors = true; hasErrors = true;
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]"; String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
if (null != writer) writer.write(text); if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text); if (null != tdoc) tdoc.appendRoman(text);
} else if (stype == ACIPString.WARNING) { } else if (stype == ACIPString.WARNING) {
lastGuyWasNonPunct = false;
lastGuy = null;
if (writeWarningsToOut) { if (writeWarningsToOut) {
String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]"; String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
if (null != writer) writer.write(text); if (null != writer) writer.write(text);
@ -255,6 +277,8 @@ public class ACIPConverter {
} }
} else { } else {
if (s.isLatin(stype)) { if (s.isLatin(stype)) {
lastGuyWasNonPunct = false;
lastGuy = null;
String text String text
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "") = (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
+ s.getText() + s.getText()
@ -265,6 +289,7 @@ public class ACIPConverter {
String unicode = null; String unicode = null;
DuffCode[] duff = null; DuffCode[] duff = null;
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) { if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
lastGuyWasNonPunct = true;
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText()); TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
String acipError; String acipError;
@ -294,6 +319,7 @@ public class ACIPConverter {
if (null != errors) if (null != errors)
errors.append(errorMessage + "\n"); errors.append(errorMessage + "\n");
} else { } else {
lastGuy = sl;
String warning String warning
= pt.getWarning(warningLevel, = pt.getWarning(warningLevel,
pl, pl,
@ -332,16 +358,50 @@ public class ACIPConverter {
if (null != writer) unicode = "\u0F3D"; if (null != writer) unicode = "\u0F3D";
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") }; if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
} else { } else {
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false); // For ACIP, tshegs are used as both
if (null != tdoc) { // tshegs and whitespace. We treat a
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) { // space as a tsheg if and only if it
tdoc.appendRoman(s.getText()); // occurs after TIBETAN_NON_PUNCTUATION.
continue; // But "SHIG ,MDO" is an example of a
// special case, needed because a tsheg is
// not used after a GA in Tibetan
// typesetting.
boolean done = false;
// DLC what about after numbers? marks?
if (s.getText().equals(" ")) {
TPairList lpl = null;
if (!lastGuyWasNonPunct
|| (null != lastGuy
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
&& lpl.get(0).getLeft().equals("G")
&& // it's (G . anything)
// followed by some number
// of spaces (at least one,
// this one) and then a
// comma:
peekaheadFindsSpacesAndComma(scan, i+1))) {
if (null != writer) {
unicode = " ";
done = true;
}
if (null != tdoc) {
tdoc.appendRoman(" ");
continue;
}
} }
else { }
String wy = ACIPRules.getWylieForACIPOther(s.getText()); if (!done) {
if (null == wy) throw new Error("No wylie for ACIP " + s.getText()); if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) }; if (null != tdoc) {
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
tdoc.appendRoman(s.getText());
continue;
}
else {
String wy = ACIPRules.getWylieForACIPOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
}
} }
} }
} }
@ -349,6 +409,8 @@ public class ACIPConverter {
throw new Error("FIXME: make this an assertion 1"); throw new Error("FIXME: make this an assertion 1");
if (null != tdoc && (null == duff || 0 == duff.length)) if (null != tdoc && (null == duff || 0 == duff.length))
throw new Error("FIXME: make this an assertion 2"); throw new Error("FIXME: make this an assertion 2");
lastGuyWasNonPunct = false;
lastGuy = null;
} }
if (null != writer && null != unicode) writer.write(unicode); if (null != writer && null != unicode) writer.write(unicode);
if (null != tdoc) { if (null != tdoc) {

View file

@ -767,13 +767,33 @@ public class ACIPTshegBarScanner {
case ';': case ';':
case '`': case '`':
case '#': case '#':
// The tsheg bar ends here; new token. // The tsheg bar ends here; new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new ACIPString(s.substring(startOfString, i),
currentType)); currentType));
} }
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION)); // Insert a tsheg if necessary. ACIP files aren't
// careful, so "KA\r\n" and "GA\n" appear where "KA
// \r\n" and "GA \n" should appear.
if (('\r' == ch
|| '\n' == ch)
&& !al.isEmpty()
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
al.add(new ACIPString(" ",
ACIPString.TIBETAN_PUNCTUATION));
}
// Don't add in a "\r\n" or "\n" unless there's a
// blank line.
boolean rn = false;
if (('\n' != ch && '\r' != ch)
|| ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n'))) {
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION));
}
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
break; // end TIBETAN_PUNCTUATION case break; // end TIBETAN_PUNCTUATION case