I've improved the ACIP tsheg bar scanner to handle a lot of illegal

constructions that occur in practice.
This commit is contained in:
dchandler 2003-08-16 16:13:53 +00:00
parent 2a57439516
commit 0b91ed0beb

View file

@ -33,6 +33,41 @@ import org.thdl.util.ThdlDebug;
* @author David Chandler * @author David Chandler
*/ */
public class ACIPTshegBarScanner { public class ACIPTshegBarScanner {
/**
 * Command-line entry point.  Expects exactly one argument, the path of
 * an ACIP file, and runs {@link #scanFile(String, StringBuffer)} on it.
 *
 * <p>Everything is reported on standard output.  The process exits with
 * status 1 when the argument count is wrong or when the scan appended
 * any messages to the error buffer, and with status 0 otherwise.
 *
 * @param args the command-line arguments; must hold just the file path
 * @throws IOException if scanFile fails with an I/O error
 */
public static void main(String[] args) throws IOException {
    if (1 != args.length) {
        System.out.println("Bad args! Need just the ACIP file's path.");
        System.exit(1);
    }

    StringBuffer scanProblems = new StringBuffer();
    // Only the collected messages matter here; the returned list of
    // scanned strings is not needed by this driver.
    scanFile(args[0], scanProblems);

    boolean cleanScan = (0 == scanProblems.length());
    if (cleanScan) {
        System.out.println("Good scan!");
    } else {
        System.out.println("Errors scanning ACIP input file: ");
        System.out.println(scanProblems);
        System.out.println("Exiting; please fix input file and try again.");
    }
    System.exit(cleanScan ? 0 : 1);
}
/**
 * Reads the entire file named by fname into memory and passes its
 * contents to {@link #scan(String, StringBuffer)}.
 *
 * <p>The file is decoded with the platform's default character
 * encoding.  The reader is closed before this method returns, whether
 * reading succeeds or fails.
 *
 * @param fname the path of the ACIP input file
 * @param errors handed to scan unchanged; see scan for how it is used
 * @return whatever scan returns for the file's contents
 * @throws IOException if the file cannot be opened, read, or closed
 */
// DLC FIXME: not so efficient; copies the whole file into memory first
public static ArrayList scanFile(String fname, StringBuffer errors) throws IOException {
    StringBuffer s = new StringBuffer();
    char ch[] = new char[8192];
    BufferedReader in
        = new BufferedReader(new InputStreamReader(new FileInputStream(fname))); // DLC FIXME: specify encoding.
    try {
        int amt;
        while (-1 != (amt = in.read(ch))) {
            s.append(ch, 0, amt);
        }
    } finally {
        // Closing the outermost reader also closes the wrapped streams,
        // so the file handle is released even when read() throws.
        in.close();
    }
    return scan(s.toString(), errors);
}
/** Returns a list of {@link ACIPString ACIPStrings} corresponding /** Returns a list of {@link ACIPString ACIPStrings} corresponding
* to s, possibly the empty list (when the empty string is the * to s, possibly the empty list (when the empty string is the
* input). Each String is either a Latin comment, some Latin * input). Each String is either a Latin comment, some Latin
@ -41,9 +76,13 @@ public class ACIPTshegBarScanner {
* *
* <p>This not only scans; it finds all the errors a parser would * <p>This not only scans; it finds all the errors a parser would
* too, like "NYA x" and "(" and ")" and "/NYA" etc. It puts * too, like "NYA x" and "(" and ")" and "/NYA" etc. It puts
* those in as ACIPStrings with type {@link ACIPString#ERROR}. * those in as ACIPStrings with type {@link ACIPString#ERROR},
* and also, if errors is non-null, appends helpful messages to
* errors, each followed by a '\n'. There is at least one case
* where no ERROR ACIPString will appear but errors will be
* modified.
*/ */
public static ArrayList scan(String s) { public static ArrayList scan(String s, StringBuffer errors) {
// the size depends on whether it's mostly Tibetan or mostly // the size depends on whether it's mostly Tibetan or mostly
// Latin and a number of other factors. This is meant to be // Latin and a number of other factors. This is meant to be
@ -60,9 +99,18 @@ public class ACIPTshegBarScanner {
if (i < startOfString) throw new Error("bad reset"); if (i < startOfString) throw new Error("bad reset");
char ch; char ch;
ch = s.charAt(i); ch = s.charAt(i);
if (ACIPString.COMMENT == currentType && ch != ']') if (ACIPString.COMMENT == currentType && ch != ']') {
if ('[' == ch) {
al.add(new ACIPString("Found an open square bracket, [, within a [#COMMENT]-style comment. Square brackets may not appear in comments.\n",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
+ "Found an open square bracket, [, within a [#COMMENT]-style comment. Square brackets may not appear in comments.\n");
}
continue; continue;
}
switch (ch) { switch (ch) {
case '}':
case ']': case ']':
if (bracketTypeStack.empty()) { if (bracketTypeStack.empty()) {
// Error. // Error.
@ -71,6 +119,9 @@ public class ACIPTshegBarScanner {
currentType)); currentType));
} }
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
+ "Found a closing square bracket, ], without a matching open square bracket, [. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} else { } else {
@ -89,8 +140,11 @@ public class ACIPTshegBarScanner {
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
break; break; // end ']','}' case
case '{': // NOTE WELL: KX0016I.ACT, KD0095M.ACT, and a
// host of other ACIP files use {} brackets like
// [] brackets. I treat both the same.
case '[': case '[':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
@ -102,37 +156,107 @@ public class ACIPTshegBarScanner {
String thingy = null; String thingy = null;
if (i + "[DD]".length() <= sl if (i + "[DD]".length() <= sl
&& s.substring(i, i + "[DD]".length()).equals("[DD]")) { && (s.substring(i, i + "[DD]".length()).equals("[DD]")
|| s.substring(i, i + "[DD]".length()).equals("{DD}"))) {
thingy = "[DD]"; thingy = "[DD]";
currentType = ACIPString.DD; currentType = ACIPString.DD;
} else if (i + "[DD1]".length() <= sl } else if (i + "[DD1]".length() <= sl
&& s.substring(i, i + "[DD1]".length()).equals("[DD1]")) { && (s.substring(i, i + "[DD1]".length()).equals("[DD1]")
|| s.substring(i, i + "[DD1]".length()).equals("{DD1}"))) {
thingy = "[DD1]"; thingy = "[DD1]";
currentType = ACIPString.DD; currentType = ACIPString.DD;
} else if (i + "[DD2]".length() <= sl } else if (i + "[DD2]".length() <= sl
&& s.substring(i, i + "[DD2]".length()).equals("[DD2]")) { && (s.substring(i, i + "[DD2]".length()).equals("[DD2]")
|| s.substring(i, i + "[DD2]".length()).equals("{DD2}"))) {
thingy = "[DD2]"; thingy = "[DD2]";
currentType = ACIPString.DD; currentType = ACIPString.DD;
} else if (i + "[DDD]".length() <= sl } else if (i + "[DDD]".length() <= sl
&& s.substring(i, i + "[DDD]".length()).equals("[DDD]")) { && (s.substring(i, i + "[DDD]".length()).equals("[DDD]")
|| s.substring(i, i + "[DDD]".length()).equals("{DDD}"))) {
thingy = "[DDD]"; thingy = "[DDD]";
currentType = ACIPString.DD; currentType = ACIPString.DD;
} else if (i + "[DR]".length() <= sl } else if (i + "[DR]".length() <= sl
&& s.substring(i, i + "[DR]".length()).equals("[DR]")) { && (s.substring(i, i + "[DR]".length()).equals("[DR]")
|| s.substring(i, i + "[DR]".length()).equals("{DR}"))) {
thingy = "[DR]"; thingy = "[DR]";
currentType = ACIPString.DR; currentType = ACIPString.DR;
} else if (i + "[LS]".length() <= sl } else if (i + "[LS]".length() <= sl
&& s.substring(i, i + "[LS]".length()).equals("[LS]")) { && (s.substring(i, i + "[LS]".length()).equals("[LS]")
|| s.substring(i, i + "[LS]".length()).equals("{LS}"))) {
thingy = "[LS]"; thingy = "[LS]";
currentType = ACIPString.LS; currentType = ACIPString.LS;
} else if (i + "[BP]".length() <= sl } else if (i + "[BP]".length() <= sl
&& s.substring(i, i + "[BP]".length()).equals("[BP]")) { && (s.substring(i, i + "[BP]".length()).equals("[BP]")
|| s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
thingy = "[BP]"; thingy = "[BP]";
currentType = ACIPString.BP; currentType = ACIPString.BP;
} else if (i + "[ BP ]".length() <= sl
&& (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
|| s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
thingy = "{ BP }"; // found in TD3790E2.ACT
currentType = ACIPString.BP;
} else if (i + "[ DD ]".length() <= sl
&& (s.substring(i, i + "[ DD ]".length()).equals("[ DD ]")
|| s.substring(i, i + "[ DD ]".length()).equals("{ DD }"))) {
thingy = "{ DD }"; // found in TD3790E2.ACT
currentType = ACIPString.DD;
} else if (i + "[?]".length() <= sl } else if (i + "[?]".length() <= sl
&& s.substring(i, i + "[?]".length()).equals("[?]")) { && (s.substring(i, i + "[?]".length()).equals("[?]")
|| s.substring(i, i + "[?]".length()).equals("{?}"))) {
thingy = "[?]"; thingy = "[?]";
currentType = ACIPString.QUESTION; currentType = ACIPString.QUESTION;
} else {
// We see comments appear not as [#COMMENT], but
// as [COMMENT] sometimes. We make special cases
// for some English comments. DLC FIXME: put
// these in a config file.
String[] englishComments = new String[] {
"FIRST", "SECOND", // S5274I.ACT
"Additional verses added by Khen Rinpoche here are", // S0216M.ACT
"ADDENDUM: The text of", // S0216M.ACT
"END OF ADDENDUM", // S0216M.ACT
"Some of the verses added here by Khen Rinpoche include:", // S0216M.ACT
"Note that, in the second verse, the {YUL LJONG} was orignally {GANG LJONG},\nand is now recited this way since the ceremony is not only taking place in Tibet.", // S0216M.ACT
"Note that, in the second verse, the {YUL LJONG} was orignally {GANG LJONG},\r\nand is now recited this way since the ceremony is not only taking place in Tibet.", // S0216M.ACT
"text missing", // S6954E1.ACT
"INCOMPLETE", // TD3817I.INC
"MISSING PAGE", // S0935m.act
"MISSING FOLIO", // S0975I.INC
"UNCLEAR LINE", // S0839D1I.INC
"THE FOLLOWING TEXT HAS INCOMPLETE SECTIONS, WHICH ARE ON ORDER", // SE6260A.INC
"@DATA INCOMPLETE HERE", // SE6260A.INC
"@DATA MISSING HERE", // SE6260A.INC
"DATA INCOMPLETE HERE", // TD4226I2.INC
"DATA MISSING HERE", // just being consistent
"FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
"FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\r\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
"THESE PAGE NUMBERS RESERVED IN THIS EDITION FOR PAGES\nMISSING FROM ORIGINAL ON WHICH IT WAS BASED", // S0018N.ACT
"THESE PAGE NUMBERS RESERVED IN THIS EDITION FOR PAGES\r\nMISSING FROM ORIGINAL ON WHICH IT WAS BASED", // S0018N.ACT
"PAGE NUMBERS RESERVED FROM THIS EDITION FOR MISSING\nSECTION SUPPLIED BY PRECEDING", // S0018N.ACT
"PAGE NUMBERS RESERVED FROM THIS EDITION FOR MISSING\r\nSECTION SUPPLIED BY PRECEDING", // S0018N.ACT
"SW: OK", // S0057M.ACT
"m:ok", // S0057M.ACT
"A FIRST ONE\nMISSING HERE?", // S0057M.ACT
"A FIRST ONE\r\nMISSING HERE?", // S0057M.ACT
"THE INITIAL PART OF THIS TEXT WAS INPUT BY THE SERA MEY LIBRARY IN\nTIBETAN FONT AND NEEDS TO BE REDONE BY DOUBLE INPUT", // S0195A1.INC
"THE INITIAL PART OF THIS TEXT WAS INPUT BY THE SERA MEY LIBRARY IN\r\nTIBETAN FONT AND NEEDS TO BE REDONE BY DOUBLE INPUT", // S0195A1.INC
};
boolean foundOne = false;
for (int ec = 0; ec < englishComments.length; ec++) {
if (i + 2 + englishComments[ec].length() <= sl
&& (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
|| s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) {
al.add(new ACIPString("[#" + englishComments[ec] + "]",
ACIPString.COMMENT));
startOfString = i + 2 + englishComments[ec].length();
i = startOfString - 1;
foundOne = true;
break;
}
}
if (foundOne)
break;
} }
if (null != thingy) { if (null != thingy) {
al.add(new ACIPString(thingy, al.add(new ACIPString(thingy,
@ -157,10 +281,18 @@ public class ACIPTshegBarScanner {
// "... [" could cause this too. // "... [" could cause this too.
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString(s.substring(i, i+1),
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) {
String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (sl-i > 10) {
inContext = inContext + "...";
}
errors.append("Offset " + i + ": "
+ "Found an illegal open square bracket, [ (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open square bracket?\n");
}
startOfString = i + 1; startOfString = i + 1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
break; // end '[' case break; // end '[','{' case
case '@': case '@':
// This definitely indicates a new token. // This definitely indicates a new token.
@ -185,8 +317,31 @@ public class ACIPTshegBarScanner {
} }
} }
if (allAreNumeric) { if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+2), ACIPString.FOLIO_MARKER)); al.add(new ACIPString(s.substring(i, i+numdigits+2),
ACIPString.FOLIO_MARKER));
startOfString = i+numdigits+2; startOfString = i+numdigits+2;
i = startOfString - 1;
currentType = ACIPString.ERROR;
break;
}
}
// System.out.println("DLC NOW HERE xxx y:" + (i+numdigits+3 < sl) + " z:" + s.charAt(i+1) + s.charAt(i+numdigits+2) + s.charAt(i+numdigits+3));
if (i+numdigits+3 < sl
&& s.charAt(i+1) == '[' && s.charAt(i+numdigits+3) == ']'
&& (s.charAt(i+numdigits+2) == 'A' || s.charAt(i+numdigits+2) == 'B')) {
boolean allAreNumeric = true;
for (int k = 1; k <= numdigits; k++) {
if (!isNumeric(s.charAt(i+1+k))) {
allAreNumeric = false;
break;
}
}
if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+4),
ACIPString.FOLIO_MARKER));
startOfString = i+numdigits+4;
i = startOfString - 1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
break; break;
} }
@ -194,6 +349,9 @@ public class ACIPTshegBarScanner {
} }
if (startOfString == i) { if (startOfString == i) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
+ "Found an illegal at sign, @. @012B is an example of a legal folio marker.\n");
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
@ -209,13 +367,15 @@ public class ACIPTshegBarScanner {
} }
if (startSlashIndex >= 0) { if (startSlashIndex >= 0) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_SLASH)); al.add(new ACIPString(s.substring(i, i+1),
ACIPString.END_SLASH));
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
startSlashIndex = -1; startSlashIndex = -1;
} else { } else {
startSlashIndex = i; startSlashIndex = i;
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_SLASH)); al.add(new ACIPString(s.substring(i, i+1),
ACIPString.START_SLASH));
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
@ -234,26 +394,68 @@ public class ACIPTshegBarScanner {
// DLC support nesting like (NYA (BA))? // DLC support nesting like (NYA (BA))?
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
if (ch == '(') if (ch == '(') {
al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR)); al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR));
else { if (null != errors)
errors.append("Offset " + i + ": "
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
} else {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN)); al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN));
startParenIndex = -1; startParenIndex = -1;
} }
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} else { } else {
if (ch == ')') if (ch == ')') {
al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR)); al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR));
else { if (null != errors)
errors.append("Offset " + i + ": "
+ "Unexpected closing parenthesis, ), found.\n");
} else {
startParenIndex = i; startParenIndex = i;
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN)); al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN));
} }
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
break; // end '/' case break; // end '(',')' case
case '?':
if (bracketTypeStack.empty()) {
// The tsheg bar ends here; new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
}
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.QUESTION));
startOfString = i+1;
currentType = ACIPString.ERROR;
} // else this is [*TR'A ?] or the like.
break; // end '?' case
case '.':
// This definitely indicates a new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
startOfString = i;
currentType = ACIPString.ERROR;
}
// . is used for a non-breaking tsheg, such as in {NGO.,} and {....,DAM}. We give an error unless , or . follows '.'.
if (i + 1 < sl && (s.charAt(i+1) == '.' || s.charAt(i+1) == ',')) {
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION));
} else {
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".",
ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".\n");
}
break; // end '.' case
// Classic tsheg bar enders: // Classic tsheg bar enders:
case ' ': case ' ':
@ -277,6 +479,13 @@ public class ACIPTshegBarScanner {
break; // end TIBETAN_PUNCTUATION case break; // end TIBETAN_PUNCTUATION case
default: default:
if (!bracketTypeStack.empty()) {
int stackTop = ((Integer)bracketTypeStack.peek()).intValue();
if (ACIPString.CORRECTION_START == stackTop && '?' == ch) {
// allow it through...
break;
}
}
if (!(isNumeric(ch) || isAlpha(ch))) { if (!(isNumeric(ch) || isAlpha(ch))) {
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new ACIPString(s.substring(startOfString, i),
@ -284,6 +493,9 @@ public class ACIPTshegBarScanner {
} }
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString(s.substring(i, i+1),
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors)
errors.append("Offset " + i + ": "
+ "Found an illegal character, " + ch + "\n");
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} else { } else {
@ -297,18 +509,33 @@ public class ACIPTshegBarScanner {
if (startOfString < sl) { if (startOfString < sl) {
al.add(new ACIPString(s.substring(startOfString, sl), al.add(new ACIPString(s.substring(startOfString, sl),
currentType)); currentType));
}
if (!bracketTypeStack.empty()) { if (!bracketTypeStack.empty()) {
al.add(new ACIPString("UNEXPECTED END OF INPUT", al.add(new ACIPString("UNEXPECTED END OF INPUT",
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) {
if (ACIPString.COMMENT == currentType) {
errors.append("Offset END: "
+ "Unmatched open square bracket, [, found. A comment does not terminate.\n");
} else {
errors.append("Offset END: "
+ "Unmatched open square bracket, [, found. A correction does not terminate.\n");
}
}
} }
if (startSlashIndex >= 0) { if (startSlashIndex >= 0) {
al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.", al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors)
errors.append("Offset END: "
+ "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
} }
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.", al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
ACIPString.ERROR)); ACIPString.ERROR));
} if (null != errors)
errors.append("Offset END: "
+ "Unmatched open parenthesis, (, found.\n");
} }
return al; return al;
} }
@ -320,12 +547,16 @@ public class ACIPTshegBarScanner {
/** See implementation. */ /** See implementation. */
private static boolean isAlpha(char ch) { private static boolean isAlpha(char ch) {
return ch == '\'' return ch == '\'' // 23rd consonant
// combining punctuation: // combining punctuation, vowels:
|| ch == '%' || ch == '%'
|| ch == 'o' || ch == 'o'
|| ch == 'x' || ch == 'x'
|| ch == ':'
|| ch == '-'
|| ch == '+'
|| (ch >= 'A' && ch <= 'Z') || (ch >= 'A' && ch <= 'Z')
|| (ch >= 'a' && ch <= 'z'); || (ch >= 'a' && ch <= 'z');