Improved the ACIP scanner (the part of the converter that says, "This

is a correction, that's a comment, this is Tibetan, that's Latin
(English), that's Tibetan inter-tsheg-bar punctuation, etc.").  It now
accepts more real-world ACIP files, i.e. it handles illegal
constructs.  The error checking is more user-friendly.  There are now
tests.

Added to the tests some tsheg bars that Peter E. Hauer of Linguasoft
sent me.  Many thanks, Peter.  I still need to implement rules that say,
"This is not Tibetan, it must be Sanskrit, because that letter doesn't
take a MA prefix."
This commit is contained in:
dchandler 2003-08-17 01:45:55 +00:00
parent 0b91ed0beb
commit 4581a2d8ab
3 changed files with 2049 additions and 46 deletions

View file

@ -34,11 +34,14 @@ public class ACIPString {
public static final int COMMENT = 0; public static final int COMMENT = 0;
/** For Folio markers like @012B */ /** For Folio markers like @012B */
public static final int FOLIO_MARKER = 1; public static final int FOLIO_MARKER = 1;
/** For Latin letters and numbers etc. [*LINE BREAK?] uses this,
* for example. */
public static final int LATIN = 2;
/** For Tibetan letters and numbers etc. */ /** For Tibetan letters and numbers etc. */
public static final int TIBETAN_NON_PUNCTUATION = 2; public static final int TIBETAN_NON_PUNCTUATION = 3;
/** For tshegs, whitespace and the like, but not combining /** For tshegs, whitespace and the like, but not combining
* punctuation like %, o, :, m, and x */ * punctuation like %, o, :, m, and x */
public static final int TIBETAN_PUNCTUATION = 3; public static final int TIBETAN_PUNCTUATION = 4;
/** For the start of a [*probable correction] or [*possible correction?] */ /** For the start of a [*probable correction] or [*possible correction?] */
public static final int CORRECTION_START = 5; public static final int CORRECTION_START = 5;
/** Denotes the end of a [*probable correction] */ /** Denotes the end of a [*probable correction] */
@ -65,7 +68,7 @@ public class ACIPString {
public static final int END_PAREN = 16; public static final int END_PAREN = 16;
/** For things that are not legal syntax, such as a file that /** For things that are not legal syntax, such as a file that
* contains just "[# HALF A COMMEN" */ * contains just "[# HALF A COMMEN" */
public static final int ERROR = 17; /* DLC let the user know. */ public static final int ERROR = 17;
/** Returns true if and only if this string is Latin (usually /** Returns true if and only if this string is Latin (usually
* English). Returns false if this string is transliteration of * English). Returns false if this string is transliteration of
@ -105,6 +108,7 @@ public class ACIPString {
String typeString = "HUH?????"; String typeString = "HUH?????";
if (type == COMMENT) typeString = "COMMENT"; if (type == COMMENT) typeString = "COMMENT";
if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER"; if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
if (type == LATIN) typeString = "LATIN";
if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION"; if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION"; if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
if (type == CORRECTION_START) typeString = "CORRECTION_START"; if (type == CORRECTION_START) typeString = "CORRECTION_START";
@ -120,6 +124,6 @@ public class ACIPString {
if (type == START_PAREN) typeString = "START_PAREN"; if (type == START_PAREN) typeString = "START_PAREN";
if (type == END_PAREN) typeString = "END_PAREN"; if (type == END_PAREN) typeString = "END_PAREN";
if (type == ERROR) typeString = "ERROR"; if (type == ERROR) typeString = "ERROR";
return typeString + ":\"" + getText() + "\""; return typeString + ":{" + getText() + "}";
} }
} }

View file

@ -33,14 +33,20 @@ import org.thdl.util.ThdlDebug;
* @author David Chandler * @author David Chandler
*/ */
public class ACIPTshegBarScanner { public class ACIPTshegBarScanner {
// DLC DOC /** Useful for testing. Gives error messages on standard output
* about why we can't scan the document perfectly and exits with
* non-zero return code, or says "Good scan!" otherwise and exits
* with code zero. <p>FIXME: not so efficient; copies the whole
* file into memory first. */
public static void main(String[] args) throws IOException { public static void main(String[] args) throws IOException {
if (args.length != 1) { boolean strict = true;
System.out.println("Bad args! Need just the ACIP file's path."); if (args.length != 2
|| (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
System.out.println("Bad args! Need '--strict filename' or '--lenient filename'.");
System.exit(1); System.exit(1);
} }
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
ArrayList al = scanFile(args[0], errors); ArrayList al = scanFile(args[1], errors, strict);
if (errors.length() > 0) { if (errors.length() > 0) {
System.out.println("Errors scanning ACIP input file: "); System.out.println("Errors scanning ACIP input file: ");
@ -52,20 +58,26 @@ public class ACIPTshegBarScanner {
System.out.println("Good scan!"); System.out.println("Good scan!");
System.exit(0); System.exit(0);
} }
// DLC DOC /** Scans an ACIP file with path fname into tsheg bars. If errors
// DLC FIXME: not so efficient; copies the whole file into memory first * is non-null, error messages will be appended to it. If strict
public static ArrayList scanFile(String fname, StringBuffer errors) throws IOException { * is true, then you're more likely to see error
* messages. Returns a list of ACIPStrings that is the
* scan. <p>FIXME: not so efficient; copies the whole file into
* memory first.
* @throws IOException if we cannot read in the ACIP input file */
public static ArrayList scanFile(String fname, StringBuffer errors, boolean strict) throws IOException {
StringBuffer s = new StringBuffer(); StringBuffer s = new StringBuffer();
char ch[] = new char[8192]; char ch[] = new char[8192];
BufferedReader in BufferedReader in
= new BufferedReader(new InputStreamReader(new FileInputStream(fname))); // DLC FIXME: specify encoding. = new BufferedReader(new InputStreamReader(new FileInputStream(fname),
"US-ASCII"));
int amt; int amt;
while (-1 != (amt = in.read(ch))) { while (-1 != (amt = in.read(ch))) {
s.append(ch, 0, amt); s.append(ch, 0, amt);
} }
return scan(s.toString(), errors); return scan(s.toString(), errors, !strict);
} }
/** Returns a list of {@link ACIPString ACIPStrings} corresponding /** Returns a list of {@link ACIPString ACIPStrings} corresponding
@ -81,14 +93,18 @@ public class ACIPTshegBarScanner {
* errors, each followed by a '\n'. There is at least one case * errors, each followed by a '\n'. There is at least one case
* where no ERROR ACIPString will appear but errors will be * where no ERROR ACIPString will appear but errors will be
* modified. * modified.
* @param lenientPeriods if and only if this is true, periods
* will never cause errors, even if iffy text like "PAS... LA "
* appears.
*/ */
public static ArrayList scan(String s, StringBuffer errors) { public static ArrayList scan(String s, StringBuffer errors, boolean lenientPeriods) {
// the size depends on whether it's mostly Tibetan or mostly // the size depends on whether it's mostly Tibetan or mostly
// Latin and a number of other factors. This is meant to be // Latin and a number of other factors. This is meant to be
// an underestimate, but not too much of an underestimate. // an underestimate, but not too much of an underestimate.
ArrayList al = new ArrayList(s.length() / 10); ArrayList al = new ArrayList(s.length() / 10);
boolean waitingForMatchingIllegalClose = false;
int sl = s.length(); int sl = s.length();
int currentType = ACIPString.ERROR; int currentType = ACIPString.ERROR;
int startOfString = 0; int startOfString = 0;
@ -101,11 +117,11 @@ public class ACIPTshegBarScanner {
ch = s.charAt(i); ch = s.charAt(i);
if (ACIPString.COMMENT == currentType && ch != ']') { if (ACIPString.COMMENT == currentType && ch != ']') {
if ('[' == ch) { if ('[' == ch) {
al.add(new ACIPString("Found an open square bracket, [, within a [#COMMENT]-style comment. Square brackets may not appear in comments.\n", al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + ": "
+ "Found an open square bracket, [, within a [#COMMENT]-style comment. Square brackets may not appear in comments.\n"); + "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
} }
continue; continue;
} }
@ -119,24 +135,42 @@ public class ACIPTshegBarScanner {
currentType)); currentType));
} }
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
if (!waitingForMatchingIllegalClose) {
if (null != errors) {
errors.append("Offset " + i + ": "
+ "Found a truly unmatched close bracket, [ or {.\n");
}
}
waitingForMatchingIllegalClose = false;
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + ": "
+ "Found a closing square bracket, ], without a matching open square bracket, [. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} else { } else {
int stackTop = ((Integer)bracketTypeStack.pop()).intValue(); int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
String text = s.substring(startOfString, i+1); int end = startOfString;
if (ACIPString.CORRECTION_START == stackTop) { if (ACIPString.CORRECTION_START == stackTop) {
// This definitely indicates a new token.
char prevCh = s.charAt(i-1); char prevCh = s.charAt(i-1);
if (prevCh == '?')
end = i - 1;
else
end = i;
if (startOfString < end) {
al.add(new ACIPString(s.substring(startOfString, end),
currentType));
}
if ('?' != prevCh) { if ('?' != prevCh) {
currentType = ACIPString.PROBABLE_CORRECTION; currentType = ACIPString.PROBABLE_CORRECTION;
} else { } else {
currentType = ACIPString.POSSIBLE_CORRECTION; currentType = ACIPString.POSSIBLE_CORRECTION;
} }
} }
al.add(new ACIPString(text, currentType)); al.add(new ACIPString(s.substring(end, i+1), currentType));
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
@ -208,8 +242,10 @@ public class ACIPTshegBarScanner {
} else { } else {
// We see comments appear not as [#COMMENT], but // We see comments appear not as [#COMMENT], but
// as [COMMENT] sometimes. We make special cases // as [COMMENT] sometimes. We make special cases
// for some English comments. DLC FIXME: put // for some English comments. There's no need to
// these in a config file. // make this mechanism extensible, because you
// can easily edit the ACIP text so that it uses
// [#COMMENT] notation instead of [COMMENT].
String[] englishComments = new String[] { String[] englishComments = new String[] {
"FIRST", "SECOND", // S5274I.ACT "FIRST", "SECOND", // S5274I.ACT
@ -227,6 +263,7 @@ public class ACIPTshegBarScanner {
"THE FOLLOWING TEXT HAS INCOMPLETE SECTIONS, WHICH ARE ON ORDER", // SE6260A.INC "THE FOLLOWING TEXT HAS INCOMPLETE SECTIONS, WHICH ARE ON ORDER", // SE6260A.INC
"@DATA INCOMPLETE HERE", // SE6260A.INC "@DATA INCOMPLETE HERE", // SE6260A.INC
"@DATA MISSING HERE", // SE6260A.INC "@DATA MISSING HERE", // SE6260A.INC
"LINE APPARENTLY MISSING THIS PAGE", // TD4035I.INC
"DATA INCOMPLETE HERE", // TD4226I2.INC "DATA INCOMPLETE HERE", // TD4226I2.INC
"DATA MISSING HERE", // just being consistent "DATA MISSING HERE", // just being consistent
"FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT "FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
@ -255,6 +292,74 @@ public class ACIPTshegBarScanner {
break; break;
} }
} }
if (!foundOne && i+1 < sl && s.charAt(i+1) == '*') {
// Identify [*LINE BREAK?] as an English
// correction. Every correction not on this
// list is considered to be Tibetan. DLC
// FIXME: make this extensible via a config
// file or at least a System property (which
// could be a comma-separated list of these
// creatures.
// If "LINE" is in the list below, then [*
// LINE], [* LINE?], [*LINE], [*LINE?], [*
// LINE OUT ?], etc. will be considered
// English corrections. I.e., whitespace
// before and anything after doesn't prevent a
// match.
String[] englishCorrections = new String[] {
"LINE", // KD0001I1.ACT
"DATA", // KL0009I2.INC
"BLANK", // KL0009I2.INC
"NOTE", // R0001F.ACM
"alternate", // R0018F.ACE
"02101-02150 missing", // R1003A3.INC
"51501-51550 missing", // R1003A52.ACT
"BRTAGS ETC", // S0002N.ACT
"TSAN, ETC", // S0015N.ACT
"SNYOMS, THROUGHOUT", // S0016N.ACT
"KYIS ETC", // S0019N.ACT
"MISSING", // S0455M.ACT
"this", // S6850I1B.ALT
"THIS", // S0057M.ACT
};
int begin;
for (begin = i+2; begin < sl; begin++) {
if (!isWhitespace(s.charAt(begin)))
break;
}
int end;
for (end = i+2; end < sl; end++) {
if (s.charAt(end) == ']')
break;
}
int realEnd = end;
if (end < sl && s.charAt(end-1) == '?')
--realEnd;
if (end < sl && begin < realEnd) {
String interestingSubstring
= s.substring(begin, realEnd);
for (int ec = 0; ec < englishCorrections.length; ec++) {
if (interestingSubstring.startsWith(englishCorrections[ec])) {
al.add(new ACIPString(s.substring(i, i+2),
ACIPString.CORRECTION_START));
al.add(new ACIPString(s.substring(i+2, realEnd),
ACIPString.LATIN));
if (s.charAt(end - 1) == '?') {
al.add(new ACIPString(s.substring(end-1, end+1),
ACIPString.POSSIBLE_CORRECTION));
} else {
al.add(new ACIPString(s.substring(end, end+1),
ACIPString.PROBABLE_CORRECTION));
}
foundOne = true;
startOfString = end+1;
i = startOfString - 1;
break;
}
}
}
}
if (foundOne) if (foundOne)
break; break;
} }
@ -269,6 +374,11 @@ public class ACIPTshegBarScanner {
if ('*' == nextCh) { if ('*' == nextCh) {
currentType = ACIPString.CORRECTION_START; currentType = ACIPString.CORRECTION_START;
bracketTypeStack.push(new Integer(currentType)); bracketTypeStack.push(new Integer(currentType));
al.add(new ACIPString(s.substring(i, i+2),
ACIPString.CORRECTION_START));
currentType = ACIPString.ERROR;
startOfString = i+2;
i = startOfString - 1;
break; break;
} else if ('#' == nextCh) { } else if ('#' == nextCh) {
currentType = ACIPString.COMMENT; currentType = ACIPString.COMMENT;
@ -276,18 +386,31 @@ public class ACIPTshegBarScanner {
break; break;
} }
} }
// This is an error. DLC FIXME: in practice // This is an error. Sometimes [COMMENTS APPEAR
// [COMMENTS APPEAR WITHOUT # MARKS]. Though // WITHOUT # MARKS]. Though "... [" could cause
// "... [" could cause this too. // this too.
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString(s.substring(i, i+1),
ACIPString.ERROR)); ACIPString.ERROR));
if (waitingForMatchingIllegalClose) {
if (null != errors) {
errors.append("Offset " + i + ": "
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
}
}
waitingForMatchingIllegalClose = true;
if (null != errors) { if (null != errors) {
String inContext = s.substring(i, i+Math.min(sl-i, 10)); String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (sl-i > 10) { if (inContext.indexOf("\r") >= 0) {
inContext = inContext + "..."; inContext = inContext.substring(0, inContext.indexOf("\r"));
} else if (inContext.indexOf("\n") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\n"));
} else {
if (sl-i > 10) {
inContext = inContext + "...";
}
} }
errors.append("Offset " + i + ": " errors.append("Offset " + i + ": "
+ "Found an illegal open square bracket, [ (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open square bracket?\n"); + "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
} }
startOfString = i + 1; startOfString = i + 1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
@ -303,10 +426,15 @@ public class ACIPTshegBarScanner {
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
// We look for @N[AB], @NN[AB], @NNN[AB], @NNNN[AB], // We look for {@N{AB}, @NN{AB}, ..., @NNNNNN{AB}},
// @NNNNN[AB], and @NNNNNN[AB] only, that is from one // {@[N{AB}], @[NN{AB}], ..., @[NNNNNN{AB}]},
// to six digits. // {@N{AB}.N, @NN{AB}.N, ..., @NNNNNN{AB}.N}, {@N,
for (int numdigits = 1; numdigits <= 5; numdigits++) { // @NN, ..., @NNNNNN}, and {@{AB}N, @{AB}NN,
// ... @{AB}NNNNNN} only, that is from one to six
// digits. Each of these folio marker format occurs
// in practice.
for (int numdigits = 6; numdigits >= 1; numdigits--) {
// @NNN{AB} and @NNN{AB}.N cases:
if (i+numdigits+1 < sl if (i+numdigits+1 < sl
&& (s.charAt(i+numdigits+1) == 'A' || s.charAt(i+numdigits+1) == 'B')) { && (s.charAt(i+numdigits+1) == 'A' || s.charAt(i+numdigits+1) == 'B')) {
boolean allAreNumeric = true; boolean allAreNumeric = true;
@ -316,6 +444,73 @@ public class ACIPTshegBarScanner {
break; break;
} }
} }
if (allAreNumeric) {
// Is this "@012B " or "@012B.3 "?
int extra;
if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') {
if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3))
&& !isNumeric(s.charAt(i+numdigits+4)))) {
al.add(new ACIPString(s.substring(i, i+numdigits+3), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r"));
} else if (inContext.indexOf("\n") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\n"));
} else {
if (sl-i > 10) {
inContext = inContext + "...";
}
}
if (null != errors)
errors.append("Offset " + i + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
startOfString = i+numdigits+3;
i = startOfString - 1;
currentType = ACIPString.ERROR;
break;
}
if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r"));
} else if (inContext.indexOf("\n") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\n"));
} else {
if (sl-i > 10) {
inContext = inContext + "...";
}
}
if (null != errors)
errors.append("Offset " + i + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
startOfString = i+1; // DLC FIXME: skip over more?
currentType = ACIPString.ERROR;
break;
}
extra = 4;
} else {
extra = 2;
}
al.add(new ACIPString(s.substring(i, i+numdigits+extra),
ACIPString.FOLIO_MARKER));
startOfString = i+numdigits+extra;
i = startOfString - 1;
currentType = ACIPString.ERROR;
break;
}
}
// @{AB}NNN case:
if (i+numdigits+1 < sl
&& (s.charAt(i+1) == 'A' || s.charAt(i+1) == 'B')) {
boolean allAreNumeric = true;
for (int k = 1; k <= numdigits; k++) {
if (!isNumeric(s.charAt(i+1+k))) {
allAreNumeric = false;
break;
}
}
if (allAreNumeric) { if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+2), al.add(new ACIPString(s.substring(i, i+numdigits+2),
ACIPString.FOLIO_MARKER)); ACIPString.FOLIO_MARKER));
@ -325,8 +520,8 @@ public class ACIPTshegBarScanner {
break; break;
} }
} }
// System.out.println("DLC NOW HERE xxx y:" + (i+numdigits+3 < sl) + " z:" + s.charAt(i+1) + s.charAt(i+numdigits+2) + s.charAt(i+numdigits+3));
// @[NNN{AB}] case:
if (i+numdigits+3 < sl if (i+numdigits+3 < sl
&& s.charAt(i+1) == '[' && s.charAt(i+numdigits+3) == ']' && s.charAt(i+1) == '[' && s.charAt(i+numdigits+3) == ']'
&& (s.charAt(i+numdigits+2) == 'A' || s.charAt(i+numdigits+2) == 'B')) { && (s.charAt(i+numdigits+2) == 'A' || s.charAt(i+numdigits+2) == 'B')) {
@ -346,12 +541,41 @@ public class ACIPTshegBarScanner {
break; break;
} }
} }
// This case, @NNN, must come after the @NNN{AB} case.
if (i+numdigits+1 < sl && s.charAt(i+numdigits+1) == ' ') {
boolean allAreNumeric = true;
for (int k = 1; k <= numdigits; k++) {
if (!isNumeric(s.charAt(i+k))) {
allAreNumeric = false;
break;
}
}
if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+1),
ACIPString.FOLIO_MARKER));
startOfString = i+numdigits+1;
i = startOfString - 1;
currentType = ACIPString.ERROR;
break;
}
}
} }
if (startOfString == i) { if (startOfString == i) {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR)); al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
String inContext = s.substring(i, i+Math.min(sl-i, 10));
if (inContext.indexOf("\r") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\r"));
} else if (inContext.indexOf("\n") >= 0) {
inContext = inContext.substring(0, inContext.indexOf("\n"));
} else {
if (sl-i > 10) {
inContext = inContext + "...";
}
}
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + ": "
+ "Found an illegal at sign, @. @012B is an example of a legal folio marker.\n"); + "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
@ -391,7 +615,7 @@ public class ACIPTshegBarScanner {
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
// DLC support nesting like (NYA (BA))? // We do not support nesting like (NYA (BA)).
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
if (ch == '(') { if (ch == '(') {
@ -421,7 +645,8 @@ public class ACIPTshegBarScanner {
break; // end '(',')' case break; // end '(',')' case
case '?': case '?':
if (bracketTypeStack.empty()) { if (bracketTypeStack.empty() || i+1>=sl
|| (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
// The tsheg bar ends here; new token. // The tsheg bar ends here; new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new ACIPString(s.substring(startOfString, i),
@ -443,18 +668,25 @@ public class ACIPTshegBarScanner {
startOfString = i; startOfString = i;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} }
// . is used for a non-breaking tsheg, such as in {NGO.,} and {....,DAM}. We give an error unless , or . follows '.'. // . is used for a non-breaking tsheg, such as in
if (i + 1 < sl && (s.charAt(i+1) == '.' || s.charAt(i+1) == ',')) { // {NGO.,} and {....,DAM}. We give an error unless ,
// or ., or [A-Za-z] follows '.'.
if (lenientPeriods
|| (i + 1 < sl
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION)); ACIPString.TIBETAN_PUNCTUATION));
} else { } else {
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".", al.add(new ACIPString(s.substring(i, i+1),
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ": " errors.append("Offset " + i + ": "
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".\n"); + "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
} }
startOfString = i+1;
break; // end '.' case break; // end '.' case
// Classic tsheg bar enders: // Classic tsheg bar enders:
@ -493,9 +725,15 @@ public class ACIPTshegBarScanner {
} }
al.add(new ACIPString(s.substring(i, i+1), al.add(new ACIPString(s.substring(i, i+1),
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) if (null != errors) {
errors.append("Offset " + i + ": " if ((int)ch == 65533) {
+ "Found an illegal character, " + ch + "\n"); errors.append("Offset " + i + ": "
+ "Found an illegal, unprintable character.\n");
} else {
errors.append("Offset " + i + ": "
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
}
}
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = ACIPString.ERROR;
} else { } else {
@ -510,16 +748,24 @@ public class ACIPTshegBarScanner {
al.add(new ACIPString(s.substring(startOfString, sl), al.add(new ACIPString(s.substring(startOfString, sl),
currentType)); currentType));
} }
if (waitingForMatchingIllegalClose) {
al.add(new ACIPString("UNEXPECTED END OF INPUT",
ACIPString.ERROR));
if (null != errors) {
errors.append("Offset END: "
+ "Truly unmatched open bracket found.\n");
}
}
if (!bracketTypeStack.empty()) { if (!bracketTypeStack.empty()) {
al.add(new ACIPString("UNEXPECTED END OF INPUT", al.add(new ACIPString("UNEXPECTED END OF INPUT",
ACIPString.ERROR)); ACIPString.ERROR));
if (null != errors) { if (null != errors) {
if (ACIPString.COMMENT == currentType) { if (ACIPString.COMMENT == currentType) {
errors.append("Offset END: " errors.append("Offset END: "
+ "Unmatched open square bracket, [, found. A comment does not terminate.\n"); + "Unmatched open bracket found. A comment does not terminate.\n");
} else { } else {
errors.append("Offset END: " errors.append("Offset END: "
+ "Unmatched open square bracket, [, found. A correction does not terminate.\n"); + "Unmatched open bracket found. A correction does not terminate.\n");
} }
} }
} }
@ -545,6 +791,11 @@ public class ACIPTshegBarScanner {
return ch >= '0' && ch <= '9'; return ch >= '0' && ch <= '9';
} }
/** Returns true if and only if ch is a space (' '), a tab ('\t'), a
 *  carriage return ('\r'), or a newline ('\n').  No other character
 *  counts as whitespace here.
 *  @param ch the character to classify
 *  @return true if ch is one of the four characters listed above */
private static boolean isWhitespace(char ch) {
return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
}
/** See implementation. */ /** See implementation. */
private static boolean isAlpha(char ch) { private static boolean isAlpha(char ch) {
return ch == '\'' // 23rd consonant return ch == '\'' // 23rd consonant
@ -554,6 +805,8 @@ public class ACIPTshegBarScanner {
|| ch == 'o' || ch == 'o'
|| ch == 'x' || ch == 'x'
|| ch == ':' || ch == ':'
|| ch == '^'
|| ch == '\\'
|| ch == '-' || ch == '-'
|| ch == '+' || ch == '+'

File diff suppressed because it is too large Load diff