ACIP->TMW and ACIP->Unicode are now smart about when a newline is
really a newline and when a space is really a tsheg. The space in
{KA ,MDO} is a tsheg, but the space in {GA ,MDO} is not.
This commit is contained in:
dchandler 2003-09-04 04:04:21 +00:00
parent 72e531e515
commit d2749cecd0
2 changed files with 100 additions and 18 deletions

View file

@ -35,13 +35,6 @@ import org.thdl.tib.text.DuffCode;
* @author David Chandler
*/
public class ACIPConverter {
// Static initializer: runs once, when this class is initialized, and
// sets three THDL user preferences to true.
static {
// We don't want to load the TM or TMW font files ourselves:
// NOTE(review): judging by the preference names, these two settings
// make THDL rely on system-installed TMW/TM fonts -- confirm against
// ThdlOptions.
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
// NOTE(review): presumably turns on THDL debugging behavior; the
// exact effect is not visible here -- confirm.
ThdlOptions.setUserPreference("thdl.debug", true);
}
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
/** Command-line converter. Gives error messages on standard
@ -52,6 +45,11 @@ public class ACIPConverter {
public static void main(String[] args)
throws IOException
{
// We don't want to load the TM or TMW font files ourselves:
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
ThdlOptions.setUserPreference("thdl.debug", true);
boolean verbose = true;
if (args.length != 1) {
System.out.println("Bad args! Need just the name of the ACIP text file.");
@ -216,6 +214,24 @@ public class ACIPConverter {
writeWarningsToOut, warningLevel);
}
/** Looks ahead in <code>scan</code>, beginning at index
 *  <code>pos</code>, for a run of zero or more space tokens that is
 *  immediately followed by a comma token.  Both the spaces and the
 *  comma must be of type ACIPString.TIBETAN_PUNCTUATION.
 *  @param scan the list of ACIPString tokens to inspect
 *  @param pos the index of the first token to examine
 *  @return true if the first token at or after pos that is not a
 *  punctuation space is a punctuation comma; false if it is anything
 *  else, or if the list ends before such a token is found */
private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of ACIPString */ scan,
                                                    int pos) {
    final int end = scan.size();
    for (int k = pos; k < end; k++) {
        ACIPString token = (ACIPString)scan.get(k);
        // Anything that is not Tibetan punctuation ends the search.
        if (token.getType() != ACIPString.TIBETAN_PUNCTUATION)
            return false;
        String text = token.getText();
        // Spaces are skipped; keep scanning.
        if (text.equals(" "))
            continue;
        // The first non-space punctuation decides the answer.
        return text.equals(",");
    }
    // Ran off the end of the list without seeing a comma.
    return false;
}
private static boolean convertTo(boolean toUnicode, // else to TMW
ArrayList scan,
OutputStream out, // for toUnicode mode
@ -232,15 +248,21 @@ public class ACIPConverter {
if (toUnicode)
writer
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
boolean lastGuyWasNonPunct = false;
TStackList lastGuy = null;
for (int i = 0; i < sz; i++) {
ACIPString s = (ACIPString)scan.get(i);
int stype = s.getType();
if (stype == ACIPString.ERROR) {
lastGuyWasNonPunct = false;
lastGuy = null;
hasErrors = true;
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text);
} else if (stype == ACIPString.WARNING) {
lastGuyWasNonPunct = false;
lastGuy = null;
if (writeWarningsToOut) {
String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
if (null != writer) writer.write(text);
@ -255,6 +277,8 @@ public class ACIPConverter {
}
} else {
if (s.isLatin(stype)) {
lastGuyWasNonPunct = false;
lastGuy = null;
String text
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
+ s.getText()
@ -265,6 +289,7 @@ public class ACIPConverter {
String unicode = null;
DuffCode[] duff = null;
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
lastGuyWasNonPunct = true;
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
String acipError;
@ -294,6 +319,7 @@ public class ACIPConverter {
if (null != errors)
errors.append(errorMessage + "\n");
} else {
lastGuy = sl;
String warning
= pt.getWarning(warningLevel,
pl,
@ -332,16 +358,50 @@ public class ACIPConverter {
if (null != writer) unicode = "\u0F3D";
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
} else {
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
if (null != tdoc) {
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
tdoc.appendRoman(s.getText());
continue;
// For ACIP, tshegs are used as both
// tshegs and whitespace. We treat a
// space as a tsheg if and only if it
// occurs after TIBETAN_NON_PUNCTUATION.
// But "SHIG ,MDO" is an example of a
// special case, needed because a tsheg is
// not used after a GA in Tibetan
// typesetting.
boolean done = false;
// DLC what about after numbers? marks?
if (s.getText().equals(" ")) {
TPairList lpl = null;
if (!lastGuyWasNonPunct
|| (null != lastGuy
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
&& lpl.get(0).getLeft().equals("G")
&& // it's (G . anything)
// followed by some number
// of spaces (at least one,
// this one) and then a
// comma:
peekaheadFindsSpacesAndComma(scan, i+1))) {
if (null != writer) {
unicode = " ";
done = true;
}
if (null != tdoc) {
tdoc.appendRoman(" ");
continue;
}
}
else {
String wy = ACIPRules.getWylieForACIPOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
}
if (!done) {
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
if (null != tdoc) {
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
tdoc.appendRoman(s.getText());
continue;
}
else {
String wy = ACIPRules.getWylieForACIPOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
}
}
}
}
@ -349,6 +409,8 @@ public class ACIPConverter {
throw new Error("FIXME: make this an assertion 1");
if (null != tdoc && (null == duff || 0 == duff.length))
throw new Error("FIXME: make this an assertion 2");
lastGuyWasNonPunct = false;
lastGuy = null;
}
if (null != writer && null != unicode) writer.write(unicode);
if (null != tdoc) {

View file

@ -767,13 +767,33 @@ public class ACIPTshegBarScanner {
case ';':
case '`':
case '#':
// The tsheg bar ends here; new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
}
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION));
// Insert a tsheg if necessary. ACIP files aren't
// careful, so "KA\r\n" and "GA\n" appear where "KA
// \r\n" and "GA \n" should appear.
if (('\r' == ch
|| '\n' == ch)
&& !al.isEmpty()
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
al.add(new ACIPString(" ",
ACIPString.TIBETAN_PUNCTUATION));
}
// Don't add in a "\r\n" or "\n" unless there's a
// blank line.
boolean rn = false;
if (('\n' != ch && '\r' != ch)
|| ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n'))) {
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION));
}
startOfString = i+1;
currentType = ACIPString.ERROR;
break; // end TIBETAN_PUNCTUATION case