ACIP->TMW and ACIP->Unicode are now smart about when a newline is really a newline and when a space is really a tsheg.
The space in {KA ,MDO} is a tsheg, but the space in {GA ,MDO} is not, because a tsheg is not used after a GA in Tibetan typesetting.
This commit is contained in:
parent
72e531e515
commit
d2749cecd0
2 changed files with 100 additions and 18 deletions
|
@ -35,13 +35,6 @@ import org.thdl.tib.text.DuffCode;
|
||||||
* @author David Chandler
|
* @author David Chandler
|
||||||
*/
|
*/
|
||||||
public class ACIPConverter {
|
public class ACIPConverter {
|
||||||
static {
|
|
||||||
// We don't want to load the TM or TMW font files ourselves:
|
|
||||||
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
|
|
||||||
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
|
|
||||||
ThdlOptions.setUserPreference("thdl.debug", true);
|
|
||||||
}
|
|
||||||
|
|
||||||
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
|
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
|
||||||
|
|
||||||
/** Command-line converter. Gives error messages on standard
|
/** Command-line converter. Gives error messages on standard
|
||||||
|
@ -52,6 +45,11 @@ public class ACIPConverter {
|
||||||
public static void main(String[] args)
|
public static void main(String[] args)
|
||||||
throws IOException
|
throws IOException
|
||||||
{
|
{
|
||||||
|
// We don't want to load the TM or TMW font files ourselves:
|
||||||
|
ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
|
||||||
|
ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
|
||||||
|
ThdlOptions.setUserPreference("thdl.debug", true);
|
||||||
|
|
||||||
boolean verbose = true;
|
boolean verbose = true;
|
||||||
if (args.length != 1) {
|
if (args.length != 1) {
|
||||||
System.out.println("Bad args! Need just the name of the ACIP text file.");
|
System.out.println("Bad args! Need just the name of the ACIP text file.");
|
||||||
|
@ -216,6 +214,24 @@ public class ACIPConverter {
|
||||||
writeWarningsToOut, warningLevel);
|
writeWarningsToOut, warningLevel);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Looks ahead in scan, starting at index pos, for a comma that is
 *  preceded by nothing but spaces.  Entries whose type is
 *  ACIPString.TIBETAN_PUNCTUATION and whose text is " " are skipped
 *  (there may be zero or more of them from pos onward).
 *  @param scan the list to examine; each element is cast to
 *  ACIPString
 *  @param pos the index of the first element to examine
 *  @return true if the first non-space element at or after pos is a
 *  TIBETAN_PUNCTUATION whose text is ","; false if any other element
 *  comes first or if the end of scan is reached */
private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of ACIPString */ scan,
|
||||||
|
int pos) {
|
||||||
|
int sz = scan.size();
|
||||||
|
while (pos < sz) {
|
||||||
|
ACIPString s = (ACIPString)scan.get(pos++); // pos now indexes the following element
|
||||||
|
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(" ")) {
|
||||||
|
// keep going: a space neither confirms nor rules out a comma
|
||||||
|
} else {
|
||||||
|
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(",")) {
|
||||||
|
return true; // only spaces (possibly none) came before this comma
|
||||||
|
} else {
|
||||||
|
return false; // something other than a space or a comma came first
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false; // reached the end of scan without finding a comma
|
||||||
|
}
|
||||||
|
|
||||||
private static boolean convertTo(boolean toUnicode, // else to TMW
|
private static boolean convertTo(boolean toUnicode, // else to TMW
|
||||||
ArrayList scan,
|
ArrayList scan,
|
||||||
OutputStream out, // for toUnicode mode
|
OutputStream out, // for toUnicode mode
|
||||||
|
@ -232,15 +248,21 @@ public class ACIPConverter {
|
||||||
if (toUnicode)
|
if (toUnicode)
|
||||||
writer
|
writer
|
||||||
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
|
= new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
|
||||||
|
boolean lastGuyWasNonPunct = false;
|
||||||
|
TStackList lastGuy = null;
|
||||||
for (int i = 0; i < sz; i++) {
|
for (int i = 0; i < sz; i++) {
|
||||||
ACIPString s = (ACIPString)scan.get(i);
|
ACIPString s = (ACIPString)scan.get(i);
|
||||||
int stype = s.getType();
|
int stype = s.getType();
|
||||||
if (stype == ACIPString.ERROR) {
|
if (stype == ACIPString.ERROR) {
|
||||||
|
lastGuyWasNonPunct = false;
|
||||||
|
lastGuy = null;
|
||||||
hasErrors = true;
|
hasErrors = true;
|
||||||
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
|
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
|
||||||
if (null != writer) writer.write(text);
|
if (null != writer) writer.write(text);
|
||||||
if (null != tdoc) tdoc.appendRoman(text);
|
if (null != tdoc) tdoc.appendRoman(text);
|
||||||
} else if (stype == ACIPString.WARNING) {
|
} else if (stype == ACIPString.WARNING) {
|
||||||
|
lastGuyWasNonPunct = false;
|
||||||
|
lastGuy = null;
|
||||||
if (writeWarningsToOut) {
|
if (writeWarningsToOut) {
|
||||||
String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
|
String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
|
||||||
if (null != writer) writer.write(text);
|
if (null != writer) writer.write(text);
|
||||||
|
@ -255,6 +277,8 @@ public class ACIPConverter {
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (s.isLatin(stype)) {
|
if (s.isLatin(stype)) {
|
||||||
|
lastGuyWasNonPunct = false;
|
||||||
|
lastGuy = null;
|
||||||
String text
|
String text
|
||||||
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
|
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
|
||||||
+ s.getText()
|
+ s.getText()
|
||||||
|
@ -265,6 +289,7 @@ public class ACIPConverter {
|
||||||
String unicode = null;
|
String unicode = null;
|
||||||
DuffCode[] duff = null;
|
DuffCode[] duff = null;
|
||||||
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
||||||
|
lastGuyWasNonPunct = true;
|
||||||
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
|
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
|
||||||
String acipError;
|
String acipError;
|
||||||
|
|
||||||
|
@ -294,6 +319,7 @@ public class ACIPConverter {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append(errorMessage + "\n");
|
errors.append(errorMessage + "\n");
|
||||||
} else {
|
} else {
|
||||||
|
lastGuy = sl;
|
||||||
String warning
|
String warning
|
||||||
= pt.getWarning(warningLevel,
|
= pt.getWarning(warningLevel,
|
||||||
pl,
|
pl,
|
||||||
|
@ -332,16 +358,50 @@ public class ACIPConverter {
|
||||||
if (null != writer) unicode = "\u0F3D";
|
if (null != writer) unicode = "\u0F3D";
|
||||||
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
|
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
|
||||||
} else {
|
} else {
|
||||||
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
// For ACIP, tshegs are used as both
|
||||||
if (null != tdoc) {
|
// tshegs and whitespace. We treat a
|
||||||
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
|
// space as a tsheg if and only if it
|
||||||
tdoc.appendRoman(s.getText());
|
// occurs after TIBETAN_NON_PUNCTUATION.
|
||||||
continue;
|
// But "SHIG ,MDO" is an example of a
|
||||||
|
// special case, needed because a tsheg is
|
||||||
|
// not used after a GA in Tibetan
|
||||||
|
// typesetting.
|
||||||
|
boolean done = false;
|
||||||
|
// DLC what about after numbers? marks?
|
||||||
|
if (s.getText().equals(" ")) {
|
||||||
|
TPairList lpl = null;
|
||||||
|
if (!lastGuyWasNonPunct
|
||||||
|
|| (null != lastGuy
|
||||||
|
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
|
||||||
|
&& lpl.get(0).getLeft().equals("G")
|
||||||
|
&& // it's (G . anything)
|
||||||
|
// followed by some number
|
||||||
|
// of spaces (at least one,
|
||||||
|
// this one) and then a
|
||||||
|
// comma:
|
||||||
|
peekaheadFindsSpacesAndComma(scan, i+1))) {
|
||||||
|
if (null != writer) {
|
||||||
|
unicode = " ";
|
||||||
|
done = true;
|
||||||
|
}
|
||||||
|
if (null != tdoc) {
|
||||||
|
tdoc.appendRoman(" ");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else {
|
}
|
||||||
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
if (!done) {
|
||||||
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||||
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
|
if (null != tdoc) {
|
||||||
|
if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
|
||||||
|
tdoc.appendRoman(s.getText());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
||||||
|
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
||||||
|
duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -349,6 +409,8 @@ public class ACIPConverter {
|
||||||
throw new Error("FIXME: make this an assertion 1");
|
throw new Error("FIXME: make this an assertion 1");
|
||||||
if (null != tdoc && (null == duff || 0 == duff.length))
|
if (null != tdoc && (null == duff || 0 == duff.length))
|
||||||
throw new Error("FIXME: make this an assertion 2");
|
throw new Error("FIXME: make this an assertion 2");
|
||||||
|
lastGuyWasNonPunct = false;
|
||||||
|
lastGuy = null;
|
||||||
}
|
}
|
||||||
if (null != writer && null != unicode) writer.write(unicode);
|
if (null != writer && null != unicode) writer.write(unicode);
|
||||||
if (null != tdoc) {
|
if (null != tdoc) {
|
||||||
|
|
|
@ -767,13 +767,33 @@ public class ACIPTshegBarScanner {
|
||||||
case ';':
|
case ';':
|
||||||
case '`':
|
case '`':
|
||||||
case '#':
|
case '#':
|
||||||
|
|
||||||
// The tsheg bar ends here; new token.
|
// The tsheg bar ends here; new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new ACIPString(s.substring(startOfString, i),
|
al.add(new ACIPString(s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
al.add(new ACIPString(s.substring(i, i+1),
|
|
||||||
ACIPString.TIBETAN_PUNCTUATION));
|
// Insert a tsheg if necessary. ACIP files aren't
|
||||||
|
// careful, so "KA\r\n" and "GA\n" appear where "KA
|
||||||
|
// \r\n" and "GA \n" should appear.
|
||||||
|
if (('\r' == ch
|
||||||
|
|| '\n' == ch)
|
||||||
|
&& !al.isEmpty()
|
||||||
|
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
||||||
|
al.add(new ACIPString(" ",
|
||||||
|
ACIPString.TIBETAN_PUNCTUATION));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Don't add in a "\r\n" or "\n" unless there's a
|
||||||
|
// blank line.
|
||||||
|
boolean rn = false;
|
||||||
|
if (('\n' != ch && '\r' != ch)
|
||||||
|
|| ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
|
||||||
|
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n'))) {
|
||||||
|
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
|
||||||
|
ACIPString.TIBETAN_PUNCTUATION));
|
||||||
|
}
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
break; // end TIBETAN_PUNCTUATION case
|
break; // end TIBETAN_PUNCTUATION case
|
||||||
|
|
Loading…
Reference in a new issue