Fixed a couple of small bugs.
Only 250 errors are reported now; this is important if you try to convert an English document.
This commit is contained in:
parent
4581a2d8ab
commit
39451d8879
2 changed files with 50 additions and 8 deletions
|
@ -46,8 +46,19 @@ public class ACIPTshegBarScanner {
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
}
|
}
|
||||||
StringBuffer errors = new StringBuffer();
|
StringBuffer errors = new StringBuffer();
|
||||||
ArrayList al = scanFile(args[1], errors, strict);
|
int maxErrors = 250;
|
||||||
|
ArrayList al = scanFile(args[1], errors, strict, maxErrors - 1);
|
||||||
|
|
||||||
|
if (null == al) {
|
||||||
|
System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
|
||||||
|
System.out.println("Tibetan or English input?");
|
||||||
|
System.out.println("");
|
||||||
|
System.out.println("First " + maxErrors + " errors scanning ACIP input file: ");
|
||||||
|
System.out.println(errors);
|
||||||
|
System.out.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
|
||||||
|
System.exit(1);
|
||||||
|
} else {
|
||||||
|
}
|
||||||
if (errors.length() > 0) {
|
if (errors.length() > 0) {
|
||||||
System.out.println("Errors scanning ACIP input file: ");
|
System.out.println("Errors scanning ACIP input file: ");
|
||||||
System.out.println(errors);
|
System.out.println(errors);
|
||||||
|
@ -66,7 +77,9 @@ public class ACIPTshegBarScanner {
|
||||||
* scan. <p>FIXME: not so efficient; copies the whole file into
|
* scan. <p>FIXME: not so efficient; copies the whole file into
|
||||||
* memory first.
|
* memory first.
|
||||||
* @throws IOException if we cannot read in the ACIP input file */
|
* @throws IOException if we cannot read in the ACIP input file */
|
||||||
public static ArrayList scanFile(String fname, StringBuffer errors, boolean strict) throws IOException {
|
public static ArrayList scanFile(String fname, StringBuffer errors, boolean strict, int maxErrors)
|
||||||
|
throws IOException
|
||||||
|
{
|
||||||
StringBuffer s = new StringBuffer();
|
StringBuffer s = new StringBuffer();
|
||||||
char ch[] = new char[8192];
|
char ch[] = new char[8192];
|
||||||
BufferedReader in
|
BufferedReader in
|
||||||
|
@ -77,7 +90,7 @@ public class ACIPTshegBarScanner {
|
||||||
while (-1 != (amt = in.read(ch))) {
|
while (-1 != (amt = in.read(ch))) {
|
||||||
s.append(ch, 0, amt);
|
s.append(ch, 0, amt);
|
||||||
}
|
}
|
||||||
return scan(s.toString(), errors, !strict);
|
return scan(s.toString(), errors, !strict, maxErrors);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a list of {@link ACIPString ACIPStrings} corresponding
|
/** Returns a list of {@link ACIPString ACIPStrings} corresponding
|
||||||
|
@ -93,15 +106,24 @@ public class ACIPTshegBarScanner {
|
||||||
* errors, each followed by a '\n'. There is at least one case
|
* errors, each followed by a '\n'. There is at least one case
|
||||||
* where no ERROR ACIPString will appear but errors will be
|
* where no ERROR ACIPString will appear but errors will be
|
||||||
* modified.
|
* modified.
|
||||||
|
* @param s the ACIP text
|
||||||
|
* @param errors if non-null, the buffer to which to append error
|
||||||
|
* messages
|
||||||
* @param lenientPeriods if and only if this is true, periods
|
* @param lenientPeriods if and only if this is true, periods
|
||||||
* will never cause errors, even if iffy text like "PAS... LA "
|
* will never cause errors, even if iffy text like "PAS... LA "
|
||||||
* appears.
|
* appears.
|
||||||
|
* @param maxErrors if nonnegative, then scanning will stop when
|
||||||
|
* more than maxErrors errors occur. In this event, null is
|
||||||
|
* returned.
|
||||||
|
* @return null if more than maxErrors errors occur, or the scan
|
||||||
|
* otherwise
|
||||||
*/
|
*/
|
||||||
public static ArrayList scan(String s, StringBuffer errors, boolean lenientPeriods) {
|
public static ArrayList scan(String s, StringBuffer errors, boolean lenientPeriods, int maxErrors) {
|
||||||
|
|
||||||
// the size depends on whether it's mostly Tibetan or mostly
|
// the size depends on whether it's mostly Tibetan or mostly
|
||||||
// Latin and a number of other factors. This is meant to be
|
// Latin and a number of other factors. This is meant to be
|
||||||
// an underestimate, but not too much of an underestimate.
|
// an underestimate, but not too much of an underestimate.
|
||||||
|
int numErrors = 0;
|
||||||
ArrayList al = new ArrayList(s.length() / 10);
|
ArrayList al = new ArrayList(s.length() / 10);
|
||||||
|
|
||||||
boolean waitingForMatchingIllegalClose = false;
|
boolean waitingForMatchingIllegalClose = false;
|
||||||
|
@ -122,6 +144,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
|
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -138,13 +161,15 @@ public class ACIPTshegBarScanner {
|
||||||
if (!waitingForMatchingIllegalClose) {
|
if (!waitingForMatchingIllegalClose) {
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found a truly unmatched close bracket, [ or {.\n");
|
+ "Found a truly unmatched close bracket, ] or }.\n");
|
||||||
}
|
}
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
waitingForMatchingIllegalClose = false;
|
waitingForMatchingIllegalClose = false;
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
} else {
|
} else {
|
||||||
|
@ -396,6 +421,7 @@ public class ACIPTshegBarScanner {
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
|
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
|
||||||
}
|
}
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
waitingForMatchingIllegalClose = true;
|
waitingForMatchingIllegalClose = true;
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
|
@ -411,6 +437,7 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
|
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
startOfString = i + 1;
|
startOfString = i + 1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
|
@ -464,6 +491,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
|
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
startOfString = i+numdigits+3;
|
startOfString = i+numdigits+3;
|
||||||
i = startOfString - 1;
|
i = startOfString - 1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
|
@ -484,6 +512,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
|
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
startOfString = i+1; // DLC FIXME: skip over more?
|
startOfString = i+1; // DLC FIXME: skip over more?
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
break;
|
break;
|
||||||
|
@ -576,6 +605,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
|
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
}
|
}
|
||||||
|
@ -623,6 +653,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
|
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
} else {
|
} else {
|
||||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN));
|
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN));
|
||||||
startParenIndex = -1;
|
startParenIndex = -1;
|
||||||
|
@ -635,6 +666,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Unexpected closing parenthesis, ), found.\n");
|
+ "Unexpected closing parenthesis, ), found.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
} else {
|
} else {
|
||||||
startParenIndex = i;
|
startParenIndex = i;
|
||||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN));
|
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN));
|
||||||
|
@ -685,6 +717,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
|
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
break; // end '.' case
|
break; // end '.' case
|
||||||
|
@ -718,6 +751,10 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (i+1 == sl && 26 == (int)ch)
|
||||||
|
// Silently allow the last character to be ^Z,
|
||||||
|
// which just marks end of file.
|
||||||
|
break;
|
||||||
if (!(isNumeric(ch) || isAlpha(ch))) {
|
if (!(isNumeric(ch) || isAlpha(ch))) {
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new ACIPString(s.substring(startOfString, i),
|
al.add(new ACIPString(s.substring(startOfString, i),
|
||||||
|
@ -734,6 +771,7 @@ public class ACIPTshegBarScanner {
|
||||||
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
|
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
} else {
|
} else {
|
||||||
|
@ -755,6 +793,7 @@ public class ACIPTshegBarScanner {
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
+ "Truly unmatched open bracket found.\n");
|
+ "Truly unmatched open bracket found.\n");
|
||||||
}
|
}
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
if (!bracketTypeStack.empty()) {
|
if (!bracketTypeStack.empty()) {
|
||||||
al.add(new ACIPString("UNEXPECTED END OF INPUT",
|
al.add(new ACIPString("UNEXPECTED END OF INPUT",
|
||||||
|
@ -768,6 +807,7 @@ public class ACIPTshegBarScanner {
|
||||||
+ "Unmatched open bracket found. A correction does not terminate.\n");
|
+ "Unmatched open bracket found. A correction does not terminate.\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
if (startSlashIndex >= 0) {
|
if (startSlashIndex >= 0) {
|
||||||
al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
|
al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
|
||||||
|
@ -775,6 +815,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
+ "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
|
+ "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
if (startParenIndex >= 0) {
|
if (startParenIndex >= 0) {
|
||||||
al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
|
al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
|
||||||
|
@ -782,6 +823,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
+ "Unmatched open parenthesis, (, found.\n");
|
+ "Unmatched open parenthesis, (, found.\n");
|
||||||
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
return al;
|
return al;
|
||||||
}
|
}
|
||||||
|
|
|
@ -6934,7 +6934,7 @@ tstHelper("ZUR");
|
||||||
|
|
||||||
private static void shelp(String s, String expectedErrors, boolean lenientPeriods, String expectedScan) {
|
private static void shelp(String s, String expectedErrors, boolean lenientPeriods, String expectedScan) {
|
||||||
StringBuffer errors = new StringBuffer();
|
StringBuffer errors = new StringBuffer();
|
||||||
ArrayList al = ACIPTshegBarScanner.scan(s, errors, lenientPeriods);
|
ArrayList al = ACIPTshegBarScanner.scan(s, errors, lenientPeriods, -1);
|
||||||
if (null != expectedScan) {
|
if (null != expectedScan) {
|
||||||
if (!al.toString().equals(expectedScan)) {
|
if (!al.toString().equals(expectedScan)) {
|
||||||
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
|
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
|
||||||
|
@ -6955,7 +6955,7 @@ tstHelper("ZUR");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, boolean)}. */
|
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, boolean, int)}. */
|
||||||
public void testScanner() {
|
public void testScanner() {
|
||||||
shelp("LA...SGRUB",
|
shelp("LA...SGRUB",
|
||||||
"",
|
"",
|
||||||
|
@ -6982,7 +6982,7 @@ tstHelper("ZUR");
|
||||||
"Offset 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
|
"Offset 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
|
||||||
shelp("DD", "");
|
shelp("DD", "");
|
||||||
shelp("DD]",
|
shelp("DD]",
|
||||||
"Offset 2: Found a truly unmatched close bracket, [ or {.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
"Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||||
|
|
||||||
shelp("///NYA", "Offset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
|
shelp("///NYA", "Offset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
|
||||||
shelp("/NYA/", "");
|
shelp("/NYA/", "");
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue