ACIP->TMW and ACIP->Unicode are now smart about when a newline is
really a newline and when a space is really a tsheg. The space in {KA ,MDO} is a tsheg, but the space in {GA ,MDO} is not.
This commit is contained in:
parent
72e531e515
commit
d2749cecd0
2 changed files with 100 additions and 18 deletions
|
@ -35,13 +35,6 @@ import org.thdl.tib.text.DuffCode;
|
|||
* @author David Chandler
|
||||
*/
|
||||
public class ACIPConverter {
|
||||
static {
|
||||
// We don't want to load the TM or TMW font files ourselves:
|
||||
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
|
||||
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
|
||||
ThdlOptions.setUserPreference("thdl.debug", true);
|
||||
}
|
||||
|
||||
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
|
||||
|
||||
/** Command-line converter. Gives error messages on standard
|
||||
|
@ -52,6 +45,11 @@ public class ACIPConverter {
|
|||
public static void main(String[] args)
|
||||
throws IOException
|
||||
{
|
||||
// We don't want to load the TM or TMW font files ourselves:
|
||||
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
|
||||
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
|
||||
ThdlOptions.setUserPreference("thdl.debug", true);
|
||||
|
||||
boolean verbose = true;
|
||||
if (args.length != 1) {
|
||||
System.out.println("Bad args! Need just the name of the ACIP text file.");
|
||||
|
@ -216,6 +214,24 @@ public class ACIPConverter {
|
|||
writeWarningsToOut, warningLevel);
|
||||
}
|
||||
|
||||
private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of ACIPString */ scan,
|
||||
int pos) {
|
||||
int sz = scan.size();
|
||||
while (pos < sz) {
|
||||
ACIPString s = (ACIPString)scan.get(pos++);
|
||||
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(" ")) {
|
||||
// keep going
|
||||
} else {
|
||||
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(",")) {
|
||||
return true;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
private static boolean convertTo(boolean toUnicode, // else to TMW
|
||||
ArrayList scan,
|
||||
OutputStream out, // for toUnicode mode
|
||||
|
@ -232,15 +248,21 @@ public class ACIPConverter {
|
|||
if (toUnicode)
|
||||
writer
|
||||
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
|
||||
boolean lastGuyWasNonPunct = false;
|
||||
TStackList lastGuy = null;
|
||||
for (int i = 0; i < sz; i++) {
|
||||
ACIPString s = (ACIPString)scan.get(i);
|
||||
int stype = s.getType();
|
||||
if (stype == ACIPString.ERROR) {
|
||||
lastGuyWasNonPunct = false;
|
||||
lastGuy = null;
|
||||
hasErrors = true;
|
||||
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
|
||||
if (null != writer) writer.write(text);
|
||||
if (null != tdoc) tdoc.appendRoman(text);
|
||||
} else if (stype == ACIPString.WARNING) {
|
||||
lastGuyWasNonPunct = false;
|
||||
lastGuy = null;
|
||||
if (writeWarningsToOut) {
|
||||
String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
|
||||
if (null != writer) writer.write(text);
|
||||
|
@ -255,6 +277,8 @@ public class ACIPConverter {
|
|||
}
|
||||
} else {
|
||||
if (s.isLatin(stype)) {
|
||||
lastGuyWasNonPunct = false;
|
||||
lastGuy = null;
|
||||
String text
|
||||
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
|
||||
+ s.getText()
|
||||
|
@ -265,6 +289,7 @@ public class ACIPConverter {
|
|||
String unicode = null;
|
||||
DuffCode[] duff = null;
|
||||
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
||||
lastGuyWasNonPunct = true;
|
||||
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
|
||||
String acipError;
|
||||
|
||||
|
@ -294,6 +319,7 @@ public class ACIPConverter {
|
|||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
lastGuy = sl;
|
||||
String warning
|
||||
= pt.getWarning(warningLevel,
|
||||
pl,
|
||||
|
@ -332,16 +358,50 @@ public class ACIPConverter {
|
|||
if (null != writer) unicode = "\u0F3D";
|
||||
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
|
||||
} else {
|
||||
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null != tdoc) {
|
||||
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
|
||||
tdoc.appendRoman(s.getText());
|
||||
continue;
|
||||
// For ACIP, spaces are used as both
|
||||
// tshegs and whitespace. We treat a
|
||||
// space as a tsheg if and only if it
|
||||
// occurs after TIBETAN_NON_PUNCTUATION.
|
||||
// But "SHIG ,MDO" is an example of a
|
||||
// special case, needed because a tsheg is
|
||||
// not used after a GA in Tibetan
|
||||
// typesetting.
|
||||
boolean done = false;
|
||||
// DLC what about after numbers? marks?
|
||||
if (s.getText().equals(" ")) {
|
||||
TPairList lpl = null;
|
||||
if (!lastGuyWasNonPunct
|
||||
|| (null != lastGuy
|
||||
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
|
||||
&& lpl.get(0).getLeft().equals("G")
|
||||
&& // it's (G . anything)
|
||||
// followed by some number
|
||||
// of spaces (at least one,
|
||||
// this one) and then a
|
||||
// comma:
|
||||
peekaheadFindsSpacesAndComma(scan, i+1))) {
|
||||
if (null != writer) {
|
||||
unicode = " ";
|
||||
done = true;
|
||||
}
|
||||
if (null != tdoc) {
|
||||
tdoc.appendRoman(" ");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
else {
|
||||
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
||||
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
||||
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
|
||||
}
|
||||
if (!done) {
|
||||
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null != tdoc) {
|
||||
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
|
||||
tdoc.appendRoman(s.getText());
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
||||
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
||||
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -349,6 +409,8 @@ public class ACIPConverter {
|
|||
throw new Error("FIXME: make this an assertion 1");
|
||||
if (null != tdoc && (null == duff || 0 == duff.length))
|
||||
throw new Error("FIXME: make this an assertion 2");
|
||||
lastGuyWasNonPunct = false;
|
||||
lastGuy = null;
|
||||
}
|
||||
if (null != writer && null != unicode) writer.write(unicode);
|
||||
if (null != tdoc) {
|
||||
|
|
|
@ -767,13 +767,33 @@ public class ACIPTshegBarScanner {
|
|||
case ';':
|
||||
case '`':
|
||||
case '#':
|
||||
|
||||
// The tsheg bar ends here; new token.
|
||||
if (startOfString < i) {
|
||||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
|
||||
// Insert a tsheg if necessary. ACIP files aren't
|
||||
// careful, so "KA\r\n" and "GA\n" appear where "KA
|
||||
// \r\n" and "GA \n" should appear.
|
||||
if (('\r' == ch
|
||||
|| '\n' == ch)
|
||||
&& !al.isEmpty()
|
||||
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
||||
al.add(new ACIPString(" ",
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
}
|
||||
|
||||
// Don't add in a "\r\n" or "\n" unless there's a
|
||||
// blank line.
|
||||
boolean rn = false;
|
||||
if (('\n' != ch && '\r' != ch)
|
||||
|| ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
|
||||
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n'))) {
|
||||
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
}
|
||||
startOfString = i+1;
|
||||
currentType = ACIPString.ERROR;
|
||||
break; // end TIBETAN_PUNCTUATION case
|
||||
|
|
Loading…
Reference in a new issue