Improved the ACIP scanner (the part of the converter that says, "This
is a correction, that's a comment, this is Tibetan, that's Latin (English), that's Tibetan inter-tsheg-bar punctuation," etc.). It now accepts more real-world ACIP files, i.e., it handles illegal constructs. The error checking is more user-friendly. There are now tests. Added to the tests some tsheg bars that Peter E. Hauer of Linguasoft sent me. Many thanks, Peter. I still need to implement rules that say, "This is not Tibetan, it must be Sanskrit, because that letter doesn't take a MA prefix."
This commit is contained in:
parent
0b91ed0beb
commit
4581a2d8ab
3 changed files with 2049 additions and 46 deletions
|
@ -34,11 +34,14 @@ public class ACIPString {
|
||||||
public static final int COMMENT = 0;
|
public static final int COMMENT = 0;
|
||||||
/** For Folio markers like @012B */
|
/** For Folio markers like @012B */
|
||||||
public static final int FOLIO_MARKER = 1;
|
public static final int FOLIO_MARKER = 1;
|
||||||
|
/** For Latin letters and numbers etc. [*LINE BREAK?] uses this,
|
||||||
|
* for example. */
|
||||||
|
public static final int LATIN = 2;
|
||||||
/** For Tibetan letters and numbers etc. */
|
/** For Tibetan letters and numbers etc. */
|
||||||
public static final int TIBETAN_NON_PUNCTUATION = 2;
|
public static final int TIBETAN_NON_PUNCTUATION = 3;
|
||||||
/** For tshegs, whitespace and the like, but not combining
|
/** For tshegs, whitespace and the like, but not combining
|
||||||
* punctutation like %, o, :, m, and x */
|
* punctutation like %, o, :, m, and x */
|
||||||
public static final int TIBETAN_PUNCTUATION = 3;
|
public static final int TIBETAN_PUNCTUATION = 4;
|
||||||
/** For the start of a [*probable correction] or [*possible correction?] */
|
/** For the start of a [*probable correction] or [*possible correction?] */
|
||||||
public static final int CORRECTION_START = 5;
|
public static final int CORRECTION_START = 5;
|
||||||
/** Denotes the end of a [*probable correction] */
|
/** Denotes the end of a [*probable correction] */
|
||||||
|
@ -65,7 +68,7 @@ public class ACIPString {
|
||||||
public static final int END_PAREN = 16;
|
public static final int END_PAREN = 16;
|
||||||
/** For things that are not legal syntax, such as a file that
|
/** For things that are not legal syntax, such as a file that
|
||||||
* contains just "[# HALF A COMMEN" */
|
* contains just "[# HALF A COMMEN" */
|
||||||
public static final int ERROR = 17; /* DLC let the user know. */
|
public static final int ERROR = 17;
|
||||||
|
|
||||||
/** Returns true if and only if this string is Latin (usually
|
/** Returns true if and only if this string is Latin (usually
|
||||||
* English). Returns false if this string is transliteration of
|
* English). Returns false if this string is transliteration of
|
||||||
|
@ -105,6 +108,7 @@ public class ACIPString {
|
||||||
String typeString = "HUH?????";
|
String typeString = "HUH?????";
|
||||||
if (type == COMMENT) typeString = "COMMENT";
|
if (type == COMMENT) typeString = "COMMENT";
|
||||||
if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
|
if (type == FOLIO_MARKER) typeString = "FOLIO_MARKER";
|
||||||
|
if (type == LATIN) typeString = "LATIN";
|
||||||
if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
|
if (type == TIBETAN_NON_PUNCTUATION) typeString = "TIBETAN_NON_PUNCTUATION";
|
||||||
if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
|
if (type == TIBETAN_PUNCTUATION) typeString = "TIBETAN_PUNCTUATION";
|
||||||
if (type == CORRECTION_START) typeString = "CORRECTION_START";
|
if (type == CORRECTION_START) typeString = "CORRECTION_START";
|
||||||
|
@ -120,6 +124,6 @@ public class ACIPString {
|
||||||
if (type == START_PAREN) typeString = "START_PAREN";
|
if (type == START_PAREN) typeString = "START_PAREN";
|
||||||
if (type == END_PAREN) typeString = "END_PAREN";
|
if (type == END_PAREN) typeString = "END_PAREN";
|
||||||
if (type == ERROR) typeString = "ERROR";
|
if (type == ERROR) typeString = "ERROR";
|
||||||
return typeString + ":\"" + getText() + "\"";
|
return typeString + ":{" + getText() + "}";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -33,14 +33,20 @@ import org.thdl.util.ThdlDebug;
|
||||||
* @author David Chandler
|
* @author David Chandler
|
||||||
*/
|
*/
|
||||||
public class ACIPTshegBarScanner {
|
public class ACIPTshegBarScanner {
|
||||||
// DLC DOC
|
/** Useful for testing. Gives error messages on standard output
|
||||||
|
* about why we can't scan the document perfectly and exits with
|
||||||
|
* non-zero return code, or says "Good scan!" otherwise and exits
|
||||||
|
* with code zero. <p>FIXME: not so efficient; copies the whole
|
||||||
|
* file into memory first. */
|
||||||
public static void main(String[] args) throws IOException {
|
public static void main(String[] args) throws IOException {
|
||||||
if (args.length != 1) {
|
boolean strict = true;
|
||||||
System.out.println("Bad args! Need just the ACIP file's path.");
|
if (args.length != 2
|
||||||
|
|| (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
|
||||||
|
System.out.println("Bad args! Need '--strict filename' or '--lenient filename'.");
|
||||||
System.exit(1);
|
System.exit(1);
|
||||||
}
|
}
|
||||||
StringBuffer errors = new StringBuffer();
|
StringBuffer errors = new StringBuffer();
|
||||||
ArrayList al = scanFile(args[0], errors);
|
ArrayList al = scanFile(args[1], errors, strict);
|
||||||
|
|
||||||
if (errors.length() > 0) {
|
if (errors.length() > 0) {
|
||||||
System.out.println("Errors scanning ACIP input file: ");
|
System.out.println("Errors scanning ACIP input file: ");
|
||||||
|
@ -52,20 +58,26 @@ public class ACIPTshegBarScanner {
|
||||||
System.out.println("Good scan!");
|
System.out.println("Good scan!");
|
||||||
System.exit(0);
|
System.exit(0);
|
||||||
}
|
}
|
||||||
|
|
||||||
// DLC DOC
|
/** Scans an ACIP file with path fname into tsheg bars. If errors
|
||||||
// DLC FIXME: not so efficient; copies the whole file into memory first
|
* is non-null, error messages will be appended to it. If strict
|
||||||
public static ArrayList scanFile(String fname, StringBuffer errors) throws IOException {
|
* is true, then you're more likely to see error
|
||||||
|
* messages. Returns a list of ACIPStrings that is the
|
||||||
|
* scan. <p>FIXME: not so efficient; copies the whole file into
|
||||||
|
* memory first.
|
||||||
|
* @throws IOException if we cannot read in the ACIP input file */
|
||||||
|
public static ArrayList scanFile(String fname, StringBuffer errors, boolean strict) throws IOException {
|
||||||
StringBuffer s = new StringBuffer();
|
StringBuffer s = new StringBuffer();
|
||||||
char ch[] = new char[8192];
|
char ch[] = new char[8192];
|
||||||
BufferedReader in
|
BufferedReader in
|
||||||
= new BufferedReader(new InputStreamReader(new FileInputStream(fname))); // DLC FIXME: specify encoding.
|
= new BufferedReader(new InputStreamReader(new FileInputStream(fname),
|
||||||
|
"US-ASCII"));
|
||||||
|
|
||||||
int amt;
|
int amt;
|
||||||
while (-1 != (amt = in.read(ch))) {
|
while (-1 != (amt = in.read(ch))) {
|
||||||
s.append(ch, 0, amt);
|
s.append(ch, 0, amt);
|
||||||
}
|
}
|
||||||
return scan(s.toString(), errors);
|
return scan(s.toString(), errors, !strict);
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a list of {@link ACIPString ACIPStrings} corresponding
|
/** Returns a list of {@link ACIPString ACIPStrings} corresponding
|
||||||
|
@ -81,14 +93,18 @@ public class ACIPTshegBarScanner {
|
||||||
* errors, each followed by a '\n'. There is at least one case
|
* errors, each followed by a '\n'. There is at least one case
|
||||||
* where no ERROR ACIPString will appear but errors will be
|
* where no ERROR ACIPString will appear but errors will be
|
||||||
* modified.
|
* modified.
|
||||||
|
* @param lenientPeriods if and only if this is true, periods
|
||||||
|
* will never cause errors, even if iffy text like "PAS... LA "
|
||||||
|
* appears.
|
||||||
*/
|
*/
|
||||||
public static ArrayList scan(String s, StringBuffer errors) {
|
public static ArrayList scan(String s, StringBuffer errors, boolean lenientPeriods) {
|
||||||
|
|
||||||
// the size depends on whether it's mostly Tibetan or mostly
|
// the size depends on whether it's mostly Tibetan or mostly
|
||||||
// Latin and a number of other factors. This is meant to be
|
// Latin and a number of other factors. This is meant to be
|
||||||
// an underestimate, but not too much of an underestimate.
|
// an underestimate, but not too much of an underestimate.
|
||||||
ArrayList al = new ArrayList(s.length() / 10);
|
ArrayList al = new ArrayList(s.length() / 10);
|
||||||
|
|
||||||
|
boolean waitingForMatchingIllegalClose = false;
|
||||||
int sl = s.length();
|
int sl = s.length();
|
||||||
int currentType = ACIPString.ERROR;
|
int currentType = ACIPString.ERROR;
|
||||||
int startOfString = 0;
|
int startOfString = 0;
|
||||||
|
@ -101,11 +117,11 @@ public class ACIPTshegBarScanner {
|
||||||
ch = s.charAt(i);
|
ch = s.charAt(i);
|
||||||
if (ACIPString.COMMENT == currentType && ch != ']') {
|
if (ACIPString.COMMENT == currentType && ch != ']') {
|
||||||
if ('[' == ch) {
|
if ('[' == ch) {
|
||||||
al.add(new ACIPString("Found an open square bracket, [, within a [#COMMENT]-style comment. Square brackets may not appear in comments.\n",
|
al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
|
||||||
ACIPString.ERROR));
|
ACIPString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an open square bracket, [, within a [#COMMENT]-style comment. Square brackets may not appear in comments.\n");
|
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -119,24 +135,42 @@ public class ACIPTshegBarScanner {
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||||
|
if (!waitingForMatchingIllegalClose) {
|
||||||
|
if (null != errors) {
|
||||||
|
errors.append("Offset " + i + ": "
|
||||||
|
+ "Found a truly unmatched close bracket, [ or {.\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
waitingForMatchingIllegalClose = false;
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found a closing square bracket, ], without a matching open square bracket, [. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
} else {
|
} else {
|
||||||
int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
|
int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
|
||||||
|
|
||||||
String text = s.substring(startOfString, i+1);
|
int end = startOfString;
|
||||||
if (ACIPString.CORRECTION_START == stackTop) {
|
if (ACIPString.CORRECTION_START == stackTop) {
|
||||||
|
|
||||||
|
// This definitely indicates a new token.
|
||||||
char prevCh = s.charAt(i-1);
|
char prevCh = s.charAt(i-1);
|
||||||
|
if (prevCh == '?')
|
||||||
|
end = i - 1;
|
||||||
|
else
|
||||||
|
end = i;
|
||||||
|
if (startOfString < end) {
|
||||||
|
al.add(new ACIPString(s.substring(startOfString, end),
|
||||||
|
currentType));
|
||||||
|
}
|
||||||
|
|
||||||
if ('?' != prevCh) {
|
if ('?' != prevCh) {
|
||||||
currentType = ACIPString.PROBABLE_CORRECTION;
|
currentType = ACIPString.PROBABLE_CORRECTION;
|
||||||
} else {
|
} else {
|
||||||
currentType = ACIPString.POSSIBLE_CORRECTION;
|
currentType = ACIPString.POSSIBLE_CORRECTION;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
al.add(new ACIPString(text, currentType));
|
al.add(new ACIPString(s.substring(end, i+1), currentType));
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
}
|
}
|
||||||
|
@ -208,8 +242,10 @@ public class ACIPTshegBarScanner {
|
||||||
} else {
|
} else {
|
||||||
// We see comments appear not as [#COMMENT], but
|
// We see comments appear not as [#COMMENT], but
|
||||||
// as [COMMENT] sometimes. We make special cases
|
// as [COMMENT] sometimes. We make special cases
|
||||||
// for some English comments. DLC FIXME: put
|
// for some English comments. There's no need to
|
||||||
// these in a config file.
|
// make this mechanism extensible, because you
|
||||||
|
// can easily edit the ACIP text so that it uses
|
||||||
|
// [#COMMENT] notation instead of [COMMENT].
|
||||||
|
|
||||||
String[] englishComments = new String[] {
|
String[] englishComments = new String[] {
|
||||||
"FIRST", "SECOND", // S5274I.ACT
|
"FIRST", "SECOND", // S5274I.ACT
|
||||||
|
@ -227,6 +263,7 @@ public class ACIPTshegBarScanner {
|
||||||
"THE FOLLOWING TEXT HAS INCOMPLETE SECTIONS, WHICH ARE ON ORDER", // SE6260A.INC
|
"THE FOLLOWING TEXT HAS INCOMPLETE SECTIONS, WHICH ARE ON ORDER", // SE6260A.INC
|
||||||
"@DATA INCOMPLETE HERE", // SE6260A.INC
|
"@DATA INCOMPLETE HERE", // SE6260A.INC
|
||||||
"@DATA MISSING HERE", // SE6260A.INC
|
"@DATA MISSING HERE", // SE6260A.INC
|
||||||
|
"LINE APPARENTLY MISSING THIS PAGE", // TD4035I.INC
|
||||||
"DATA INCOMPLETE HERE", // TD4226I2.INC
|
"DATA INCOMPLETE HERE", // TD4226I2.INC
|
||||||
"DATA MISSING HERE", // just being consistent
|
"DATA MISSING HERE", // just being consistent
|
||||||
"FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
|
"FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
|
||||||
|
@ -255,6 +292,74 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (!foundOne && i+1 < sl && s.charAt(i+1) == '*') {
|
||||||
|
// Identify [*LINE BREAK?] as an English
|
||||||
|
// correction. Every correction not on this
|
||||||
|
// list is considered to be Tibetan. DLC
|
||||||
|
// FIXME: make this extensible via a config
|
||||||
|
// file or at least a System property (which
|
||||||
|
// could be a comma-separated list of these
|
||||||
|
// creatures.
|
||||||
|
|
||||||
|
// If "LINE" is in the list below, then [*
|
||||||
|
// LINE], [* LINE?], [*LINE], [*LINE?], [*
|
||||||
|
// LINE OUT ?], etc. will be considered
|
||||||
|
// English corrections. I.e., whitespace
|
||||||
|
// before and anything after doesn't prevent a
|
||||||
|
// match.
|
||||||
|
String[] englishCorrections = new String[] {
|
||||||
|
"LINE", // KD0001I1.ACT
|
||||||
|
"DATA", // KL0009I2.INC
|
||||||
|
"BLANK", // KL0009I2.INC
|
||||||
|
"NOTE", // R0001F.ACM
|
||||||
|
"alternate", // R0018F.ACE
|
||||||
|
"02101-02150 missing", // R1003A3.INC
|
||||||
|
"51501-51550 missing", // R1003A52.ACT
|
||||||
|
"BRTAGS ETC", // S0002N.ACT
|
||||||
|
"TSAN, ETC", // S0015N.ACT
|
||||||
|
"SNYOMS, THROUGHOUT", // S0016N.ACT
|
||||||
|
"KYIS ETC", // S0019N.ACT
|
||||||
|
"MISSING", // S0455M.ACT
|
||||||
|
"this", // S6850I1B.ALT
|
||||||
|
"THIS", // S0057M.ACT
|
||||||
|
};
|
||||||
|
int begin;
|
||||||
|
for (begin = i+2; begin < sl; begin++) {
|
||||||
|
if (!isWhitespace(s.charAt(begin)))
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
int end;
|
||||||
|
for (end = i+2; end < sl; end++) {
|
||||||
|
if (s.charAt(end) == ']')
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
int realEnd = end;
|
||||||
|
if (end < sl && s.charAt(end-1) == '?')
|
||||||
|
--realEnd;
|
||||||
|
if (end < sl && begin < realEnd) {
|
||||||
|
String interestingSubstring
|
||||||
|
= s.substring(begin, realEnd);
|
||||||
|
for (int ec = 0; ec < englishCorrections.length; ec++) {
|
||||||
|
if (interestingSubstring.startsWith(englishCorrections[ec])) {
|
||||||
|
al.add(new ACIPString(s.substring(i, i+2),
|
||||||
|
ACIPString.CORRECTION_START));
|
||||||
|
al.add(new ACIPString(s.substring(i+2, realEnd),
|
||||||
|
ACIPString.LATIN));
|
||||||
|
if (s.charAt(end - 1) == '?') {
|
||||||
|
al.add(new ACIPString(s.substring(end-1, end+1),
|
||||||
|
ACIPString.POSSIBLE_CORRECTION));
|
||||||
|
} else {
|
||||||
|
al.add(new ACIPString(s.substring(end, end+1),
|
||||||
|
ACIPString.PROBABLE_CORRECTION));
|
||||||
|
}
|
||||||
|
foundOne = true;
|
||||||
|
startOfString = end+1;
|
||||||
|
i = startOfString - 1;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if (foundOne)
|
if (foundOne)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -269,6 +374,11 @@ public class ACIPTshegBarScanner {
|
||||||
if ('*' == nextCh) {
|
if ('*' == nextCh) {
|
||||||
currentType = ACIPString.CORRECTION_START;
|
currentType = ACIPString.CORRECTION_START;
|
||||||
bracketTypeStack.push(new Integer(currentType));
|
bracketTypeStack.push(new Integer(currentType));
|
||||||
|
al.add(new ACIPString(s.substring(i, i+2),
|
||||||
|
ACIPString.CORRECTION_START));
|
||||||
|
currentType = ACIPString.ERROR;
|
||||||
|
startOfString = i+2;
|
||||||
|
i = startOfString - 1;
|
||||||
break;
|
break;
|
||||||
} else if ('#' == nextCh) {
|
} else if ('#' == nextCh) {
|
||||||
currentType = ACIPString.COMMENT;
|
currentType = ACIPString.COMMENT;
|
||||||
|
@ -276,18 +386,31 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// This is an error. DLC FIXME: in practice
|
// This is an error. Sometimes [COMMENTS APPEAR
|
||||||
// [COMMENTS APPEAR WITHOUT # MARKS]. Though
|
// WITHOUT # MARKS]. Though "... [" could cause
|
||||||
// "... [" could cause this too.
|
// this too.
|
||||||
al.add(new ACIPString(s.substring(i, i+1),
|
al.add(new ACIPString(s.substring(i, i+1),
|
||||||
ACIPString.ERROR));
|
ACIPString.ERROR));
|
||||||
|
if (waitingForMatchingIllegalClose) {
|
||||||
|
if (null != errors) {
|
||||||
|
errors.append("Offset " + i + ": "
|
||||||
|
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
waitingForMatchingIllegalClose = true;
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
||||||
if (sl-i > 10) {
|
if (inContext.indexOf("\r") >= 0) {
|
||||||
inContext = inContext + "...";
|
inContext = inContext.substring(0, inContext.indexOf("\r"));
|
||||||
|
} else if (inContext.indexOf("\n") >= 0) {
|
||||||
|
inContext = inContext.substring(0, inContext.indexOf("\n"));
|
||||||
|
} else {
|
||||||
|
if (sl-i > 10) {
|
||||||
|
inContext = inContext + "...";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an illegal open square bracket, [ (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open square bracket?\n");
|
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
|
||||||
}
|
}
|
||||||
startOfString = i + 1;
|
startOfString = i + 1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
|
@ -303,10 +426,15 @@ public class ACIPTshegBarScanner {
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
// We look for @N[AB], @NN[AB], @NNN[AB], @NNNN[AB],
|
// We look for {@N{AB}, @NN{AB}, ..., @NNNNNN{AB}},
|
||||||
// @NNNNN[AB], and @NNNNNN[AB] only, that is from one
|
// {@[N{AB}], @[NN{AB}], ..., @[NNNNNN{AB}]},
|
||||||
// to six digits.
|
// {@N{AB}.N, @NN{AB}.N, ..., @NNNNNN{AB}.N}, {@N,
|
||||||
for (int numdigits = 1; numdigits <= 5; numdigits++) {
|
// @NN, ..., @NNNNNN}, and {@{AB}N, @{AB}NN,
|
||||||
|
// ... @{AB}NNNNNN} only, that is from one to six
|
||||||
|
// digits. Each of these folio marker format occurs
|
||||||
|
// in practice.
|
||||||
|
for (int numdigits = 6; numdigits >= 1; numdigits--) {
|
||||||
|
// @NNN{AB} and @NNN{AB}.N cases:
|
||||||
if (i+numdigits+1 < sl
|
if (i+numdigits+1 < sl
|
||||||
&& (s.charAt(i+numdigits+1) == 'A' || s.charAt(i+numdigits+1) == 'B')) {
|
&& (s.charAt(i+numdigits+1) == 'A' || s.charAt(i+numdigits+1) == 'B')) {
|
||||||
boolean allAreNumeric = true;
|
boolean allAreNumeric = true;
|
||||||
|
@ -316,6 +444,73 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if (allAreNumeric) {
|
||||||
|
// Is this "@012B " or "@012B.3 "?
|
||||||
|
int extra;
|
||||||
|
if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') {
|
||||||
|
if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3))
|
||||||
|
&& !isNumeric(s.charAt(i+numdigits+4)))) {
|
||||||
|
al.add(new ACIPString(s.substring(i, i+numdigits+3), ACIPString.ERROR));
|
||||||
|
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
||||||
|
if (inContext.indexOf("\r") >= 0) {
|
||||||
|
inContext = inContext.substring(0, inContext.indexOf("\r"));
|
||||||
|
} else if (inContext.indexOf("\n") >= 0) {
|
||||||
|
inContext = inContext.substring(0, inContext.indexOf("\n"));
|
||||||
|
} else {
|
||||||
|
if (sl-i > 10) {
|
||||||
|
inContext = inContext + "...";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (null != errors)
|
||||||
|
errors.append("Offset " + i + ": "
|
||||||
|
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
|
||||||
|
startOfString = i+numdigits+3;
|
||||||
|
i = startOfString - 1;
|
||||||
|
currentType = ACIPString.ERROR;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) {
|
||||||
|
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||||
|
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
||||||
|
if (inContext.indexOf("\r") >= 0) {
|
||||||
|
inContext = inContext.substring(0, inContext.indexOf("\r"));
|
||||||
|
} else if (inContext.indexOf("\n") >= 0) {
|
||||||
|
inContext = inContext.substring(0, inContext.indexOf("\n"));
|
||||||
|
} else {
|
||||||
|
if (sl-i > 10) {
|
||||||
|
inContext = inContext + "...";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (null != errors)
|
||||||
|
errors.append("Offset " + i + ": "
|
||||||
|
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
|
||||||
|
startOfString = i+1; // DLC FIXME: skip over more?
|
||||||
|
currentType = ACIPString.ERROR;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
extra = 4;
|
||||||
|
} else {
|
||||||
|
extra = 2;
|
||||||
|
}
|
||||||
|
al.add(new ACIPString(s.substring(i, i+numdigits+extra),
|
||||||
|
ACIPString.FOLIO_MARKER));
|
||||||
|
startOfString = i+numdigits+extra;
|
||||||
|
i = startOfString - 1;
|
||||||
|
currentType = ACIPString.ERROR;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// @{AB}NNN case:
|
||||||
|
if (i+numdigits+1 < sl
|
||||||
|
&& (s.charAt(i+1) == 'A' || s.charAt(i+1) == 'B')) {
|
||||||
|
boolean allAreNumeric = true;
|
||||||
|
for (int k = 1; k <= numdigits; k++) {
|
||||||
|
if (!isNumeric(s.charAt(i+1+k))) {
|
||||||
|
allAreNumeric = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (allAreNumeric) {
|
if (allAreNumeric) {
|
||||||
al.add(new ACIPString(s.substring(i, i+numdigits+2),
|
al.add(new ACIPString(s.substring(i, i+numdigits+2),
|
||||||
ACIPString.FOLIO_MARKER));
|
ACIPString.FOLIO_MARKER));
|
||||||
|
@ -325,8 +520,8 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// System.out.println("DLC NOW HERE xxx y:" + (i+numdigits+3 < sl) + " z:" + s.charAt(i+1) + s.charAt(i+numdigits+2) + s.charAt(i+numdigits+3));
|
|
||||||
|
|
||||||
|
// @[NNN{AB}] case:
|
||||||
if (i+numdigits+3 < sl
|
if (i+numdigits+3 < sl
|
||||||
&& s.charAt(i+1) == '[' && s.charAt(i+numdigits+3) == ']'
|
&& s.charAt(i+1) == '[' && s.charAt(i+numdigits+3) == ']'
|
||||||
&& (s.charAt(i+numdigits+2) == 'A' || s.charAt(i+numdigits+2) == 'B')) {
|
&& (s.charAt(i+numdigits+2) == 'A' || s.charAt(i+numdigits+2) == 'B')) {
|
||||||
|
@ -346,12 +541,41 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// This case, @NNN, must come after the @NNN{AB} case.
|
||||||
|
if (i+numdigits+1 < sl && s.charAt(i+numdigits+1) == ' ') {
|
||||||
|
boolean allAreNumeric = true;
|
||||||
|
for (int k = 1; k <= numdigits; k++) {
|
||||||
|
if (!isNumeric(s.charAt(i+k))) {
|
||||||
|
allAreNumeric = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (allAreNumeric) {
|
||||||
|
al.add(new ACIPString(s.substring(i, i+numdigits+1),
|
||||||
|
ACIPString.FOLIO_MARKER));
|
||||||
|
startOfString = i+numdigits+1;
|
||||||
|
i = startOfString - 1;
|
||||||
|
currentType = ACIPString.ERROR;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (startOfString == i) {
|
if (startOfString == i) {
|
||||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||||
|
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
||||||
|
if (inContext.indexOf("\r") >= 0) {
|
||||||
|
inContext = inContext.substring(0, inContext.indexOf("\r"));
|
||||||
|
} else if (inContext.indexOf("\n") >= 0) {
|
||||||
|
inContext = inContext.substring(0, inContext.indexOf("\n"));
|
||||||
|
} else {
|
||||||
|
if (sl-i > 10) {
|
||||||
|
inContext = inContext + "...";
|
||||||
|
}
|
||||||
|
}
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "Found an illegal at sign, @. @012B is an example of a legal folio marker.\n");
|
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
}
|
}
|
||||||
|
@ -391,7 +615,7 @@ public class ACIPTshegBarScanner {
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
}
|
}
|
||||||
|
|
||||||
// DLC support nesting like (NYA (BA))?
|
// We do not support nesting like (NYA (BA)).
|
||||||
|
|
||||||
if (startParenIndex >= 0) {
|
if (startParenIndex >= 0) {
|
||||||
if (ch == '(') {
|
if (ch == '(') {
|
||||||
|
@ -421,7 +645,8 @@ public class ACIPTshegBarScanner {
|
||||||
break; // end '(',')' case
|
break; // end '(',')' case
|
||||||
|
|
||||||
case '?':
|
case '?':
|
||||||
if (bracketTypeStack.empty()) {
|
if (bracketTypeStack.empty() || i+1>=sl
|
||||||
|
|| (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
|
||||||
// The tsheg bar ends here; new token.
|
// The tsheg bar ends here; new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new ACIPString(s.substring(startOfString, i),
|
al.add(new ACIPString(s.substring(startOfString, i),
|
||||||
|
@ -443,18 +668,25 @@ public class ACIPTshegBarScanner {
|
||||||
startOfString = i;
|
startOfString = i;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
}
|
}
|
||||||
// . is used for a non-breaking tsheg, such as in {NGO.,} and {....,DAM}. We give an error unless , or . follows '.'.
|
// . is used for a non-breaking tsheg, such as in
|
||||||
if (i + 1 < sl && (s.charAt(i+1) == '.' || s.charAt(i+1) == ',')) {
|
// {NGO.,} and {....,DAM}. We give an error unless ,
|
||||||
|
// or ., or [A-Za-z] follows '.'.
|
||||||
|
if (lenientPeriods
|
||||||
|
|| (i + 1 < sl
|
||||||
|
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|
||||||
|
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|
||||||
|
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|
||||||
|
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
|
||||||
al.add(new ACIPString(s.substring(i, i+1),
|
al.add(new ACIPString(s.substring(i, i+1),
|
||||||
ACIPString.TIBETAN_PUNCTUATION));
|
ACIPString.TIBETAN_PUNCTUATION));
|
||||||
} else {
|
} else {
|
||||||
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".",
|
al.add(new ACIPString(s.substring(i, i+1),
|
||||||
ACIPString.ERROR));
|
ACIPString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ": "
|
errors.append("Offset " + i + ": "
|
||||||
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\".\n");
|
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
startOfString = i+1;
|
||||||
break; // end '.' case
|
break; // end '.' case
|
||||||
|
|
||||||
// Classic tsheg bar enders:
|
// Classic tsheg bar enders:
|
||||||
|
@ -493,9 +725,15 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
al.add(new ACIPString(s.substring(i, i+1),
|
al.add(new ACIPString(s.substring(i, i+1),
|
||||||
ACIPString.ERROR));
|
ACIPString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors) {
|
||||||
errors.append("Offset " + i + ": "
|
if ((int)ch == 65533) {
|
||||||
+ "Found an illegal character, " + ch + "\n");
|
errors.append("Offset " + i + ": "
|
||||||
|
+ "Found an illegal, unprintable character.\n");
|
||||||
|
} else {
|
||||||
|
errors.append("Offset " + i + ": "
|
||||||
|
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = ACIPString.ERROR;
|
currentType = ACIPString.ERROR;
|
||||||
} else {
|
} else {
|
||||||
|
@ -510,16 +748,24 @@ public class ACIPTshegBarScanner {
|
||||||
al.add(new ACIPString(s.substring(startOfString, sl),
|
al.add(new ACIPString(s.substring(startOfString, sl),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
|
if (waitingForMatchingIllegalClose) {
|
||||||
|
al.add(new ACIPString("UNEXPECTED END OF INPUT",
|
||||||
|
ACIPString.ERROR));
|
||||||
|
if (null != errors) {
|
||||||
|
errors.append("Offset END: "
|
||||||
|
+ "Truly unmatched open bracket found.\n");
|
||||||
|
}
|
||||||
|
}
|
||||||
if (!bracketTypeStack.empty()) {
|
if (!bracketTypeStack.empty()) {
|
||||||
al.add(new ACIPString("UNEXPECTED END OF INPUT",
|
al.add(new ACIPString("UNEXPECTED END OF INPUT",
|
||||||
ACIPString.ERROR));
|
ACIPString.ERROR));
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
if (ACIPString.COMMENT == currentType) {
|
if (ACIPString.COMMENT == currentType) {
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
+ "Unmatched open square bracket, [, found. A comment does not terminate.\n");
|
+ "Unmatched open bracket found. A comment does not terminate.\n");
|
||||||
} else {
|
} else {
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
+ "Unmatched open square bracket, [, found. A correction does not terminate.\n");
|
+ "Unmatched open bracket found. A correction does not terminate.\n");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -545,6 +791,11 @@ public class ACIPTshegBarScanner {
|
||||||
return ch >= '0' && ch <= '9';
|
return ch >= '0' && ch <= '9';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** See implementation. */
|
||||||
|
private static boolean isWhitespace(char ch) {
|
||||||
|
return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
|
||||||
|
}
|
||||||
|
|
||||||
/** See implementation. */
|
/** See implementation. */
|
||||||
private static boolean isAlpha(char ch) {
|
private static boolean isAlpha(char ch) {
|
||||||
return ch == '\'' // 23rd consonant
|
return ch == '\'' // 23rd consonant
|
||||||
|
@ -554,6 +805,8 @@ public class ACIPTshegBarScanner {
|
||||||
|| ch == 'o'
|
|| ch == 'o'
|
||||||
|| ch == 'x'
|
|| ch == 'x'
|
||||||
|| ch == ':'
|
|| ch == ':'
|
||||||
|
|| ch == '^'
|
||||||
|
|| ch == '\\'
|
||||||
|
|
||||||
|| ch == '-'
|
|| ch == '-'
|
||||||
|| ch == '+'
|
|| ch == '+'
|
||||||
|
|
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue