Jskad/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java

1117 lines
57 KiB
Java
Raw Normal View History

/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import java.io.*;
import java.util.ArrayList;
import java.util.Stack;
import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;
/**
* This class is able to break up Strings of ACIP text (for example, an
* entire sutra file) into tsheg bars, comments, etc. Folio markers,
* comments, and the like are segregated (so that consumers can ensure
* that they remain in Latin), and Tibetan passages are broken up into
* tsheg bars.
*
* <p><b>FIXME:</b> We should be handling {KA\n\nKHA} vs. {KA\nKHA} in
* the parser, not here in the lexical analyzer. That'd be cleaner,
* and more like how you'd do things if you used lex and yacc.
*
* @author David Chandler */
public class ACIPTshegBarScanner {
/** Selects how {@link #scan} treats bracketed sections.  When true,
 *  ACIP snippets inside square brackets (e.g., "[THIS]") are passed
 *  through into the output unmodified, retaining the brackets, and
 *  ACIP snippets inside curly brackets (e.g., "{THAT}") are passed
 *  through into the output unmodified, dropping the brackets.  When
 *  false, {@link #scan} instead interprets bracketed text (folio
 *  markers such as "[DD]", comments, corrections, etc.).  (Nesting
 *  of brackets is not allowed regardless.) */
public static final boolean BRACKETED_SECTIONS_PASS_THROUGH_UNMODIFIED
    = true; // Robert Chilton's e-mail from April 2004 calls for 'true'
/** Useful for testing.  Gives error messages on standard output
 *  about why we can't scan the document perfectly and exits with
 *  non-zero return code, or says "Good scan!" otherwise and exits
 *  with code zero.  <p>FIXME: not so efficient; copies the whole
 *  file into memory first.
 *
 *  @param args exactly one element, the name of the ACIP text file
 *  @throws IOException if the ACIP input file cannot be read */
public static void main(String[] args) throws IOException {
    if (args.length != 1) {
        System.out.println("Bad args!  Need just the name of the ACIP text file.");
        System.exit(1);
    }
    StringBuffer errors = new StringBuffer();
    int maxErrors = 1000;
    // The scanner gives up (returns null) as soon as the error count
    // reaches the limit we pass in, so we pass maxErrors itself; that
    // way the "maxErrors or more" wording below is accurate.
    ArrayList al = scanFile(args[0], errors, maxErrors,
                            "true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")),
                            "All" /* memory hog */);
    if (null == al) {
        System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
        System.out.println("Tibetan or English input?");
        System.out.println("");
        System.out.println("First " + maxErrors + " errors scanning ACIP input file: ");
        System.out.println(errors);
        System.out.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
        System.exit(1);
    }
    if (errors.length() > 0) {
        System.out.println("Errors scanning ACIP input file: ");
        System.out.println(errors);
        System.out.println("Exiting; please fix input file and try again.");
        System.exit(1);
    }
    System.out.println("Good scan!");
    System.exit(0);
}
/** Opens the ACIP file named fname and scans its contents into
 *  tsheg bars by delegating to {@link #scanStream}.  If errors is
 *  non-null, error messages will be appended to it.  Warning and
 *  error messages in the result will be long and self-contained
 *  unless shortMessages is true.
 *
 *  <p>FIXME: not so efficient; copies the whole file into memory
 *  first.
 *
 *  @param fname path of the ACIP input file
 *  @param errors if non-null, receives the error messages
 *  @param maxErrors error limit, passed through to the scanner
 *  @param shortMessages true for short error and warning messages
 *  @param warningLevel controls which lexical warnings you will
 *  encounter
 *  @return the scan, a list of TStrings, exactly as returned by
 *  {@link #scanStream}
 *  @throws IOException if we cannot read in the ACIP input file */
public static ArrayList scanFile(String fname, StringBuffer errors,
                                 int maxErrors, boolean shortMessages,
                                 String warningLevel)
    throws IOException
{
    InputStream acipInput = new FileInputStream(fname);
    ArrayList scanned = scanStream(acipInput, errors, maxErrors,
                                   shortMessages, warningLevel);
    return scanned;
}
/** Scans a stream of ACIP into tsheg bars.  If errors is
 *  non-null, error messages will be appended to it.  You can
 *  recover both errors and (optionally) warnings (modulo offset
 *  information) from the result, though.  They will be short
 *  messages iff shortMessages is true.  Returns a list of
 *  TStrings that is the scan, or null if the scanner gave up
 *  because of the maxErrors limit.
 *
 *  <p>The stream is decoded as US-ASCII and is always closed by
 *  this method, whether or not reading succeeds.
 *
 *  <p>FIXME: not so efficient; copies the whole file into memory
 *  first.
 *
 *  @param stream the ACIP input; closed before this method returns
 *  @param errors if non-null, receives the error messages
 *  @param maxErrors error limit, passed through to {@link #scan}
 *  @param shortMessages true for short error and warning messages
 *  @param warningLevel controls which lexical warnings you will
 *  encounter
 *  @return the result of {@link #scan} on the stream's contents
 *  @throws IOException if we cannot read the whole ACIP stream */
public static ArrayList scanStream(InputStream stream, StringBuffer errors,
                                   int maxErrors, boolean shortMessages,
                                   String warningLevel)
    throws IOException
{
    StringBuffer s = new StringBuffer();
    char ch[] = new char[8192];
    BufferedReader in
        = new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
    try {
        int amt;
        while (-1 != (amt = in.read(ch))) {
            s.append(ch, 0, amt);
        }
    } finally {
        // Close even when read() throws, so that a failed read does
        // not leak the underlying stream (e.g. a file handle).
        in.close();
    }
    return scan(s.toString(), errors, maxErrors, shortMessages,
                warningLevel);
}
/** Helper.  Looks backward through al, skipping over WARNING and
 *  ERROR entries, and reports whether the first other entry found
 *  is either of type kind or a TSHEG_BAR_ADORNMENT.  Returns false
 *  if al holds nothing but warnings and errors (or nothing at all).
 *  Here because ACIP {MTHAR%\nKHA} should be treated the same
 *  w.r.t. tsheg insertion regardless of the lex errors and lex
 *  warnings found. */
private static boolean lastNonExceptionalThingWasAdornmentOr(ArrayList al, int kind) {
    for (int idx = al.size() - 1; idx >= 0; idx--) {
        int type = ((TString)al.get(idx)).getType();
        if (type == TString.WARNING || type == TString.ERROR)
            continue; // exceptional; keep looking further back
        return type == kind || type == TString.TSHEG_BAR_ADORNMENT;
    }
    // FIXME: or maybe we should answer true when nothing is found
    return false;
}
/** Helper function that records one lexical error: it adds an
 *  ERROR TString carrying the message for code to the end of al
 *  and, if errors is nonnull, appends an "Offset ..." line to it.
 *  When maxErrors is nonnegative it also increments
 *  numErrorsArray[0] by one (the count is left alone when
 *  maxErrors is negative).  (Nothing else is mutated.)
 *  @param code the error code understood by ErrorsAndWarnings
 *  @param translit the offending text, passed to the message
 *  @param shortMessages true for the short form of the message
 *  @param i offset of the error, or negative for end of input
 *  @param numNewlines number of newlines seen so far; if nonzero
 *  the alternative offset i-numNewlines is reported too
 *  @param maxErrors the error limit, or negative for no limit
 *  @param al the scan so far
 *  @param errors if nonnull, the buffer receiving messages
 *  @param numErrorsArray one-element array holding the error count
 *  @return true if and only if the error count has gone too high
 *  and caller should abort scanning */
private static boolean queueError(int code,
                                  String translit,
                                  boolean shortMessages,
                                  int i,
                                  int numNewlines,
                                  int maxErrors,
                                  ArrayList al,
                                  StringBuffer errors,
                                  int numErrorsArray[]) {
    String errMsg = ErrorsAndWarnings.getMessage(code, shortMessages, translit);
    al.add(new TString("ACIP", errMsg, TString.ERROR));
    if (errors != null) {
        String where;
        if (i < 0) {
            where = "END";
        } else {
            where = "" + i;
        }
        String alternative = "";
        if (numNewlines != 0) {
            alternative = " or maybe " + (i-numNewlines);
        }
        errors.append("Offset " + where + alternative + ": ERROR " + errMsg + "\n");
    }
    if (maxErrors < 0) {
        // No limit in force; never ask the caller to abort.
        return false;
    }
    numErrorsArray[0] = numErrorsArray[0] + 1;
    return numErrorsArray[0] >= maxErrors;
}
// DLC FIXME "H:\n\n" becomes "H: \n\n", wrongly I think. See
// Tibetan! 5.1 section on formatting Tibetan texts.
/** Returns a list of {@link TString TStrings} corresponding
 *  to s, possibly the empty list (when the empty string is the
 *  input).  Each String is either a Latin comment, some Latin
 *  text, a tsheg bar (minus the tsheg or shad or whatever), a
 *  String of inter-tsheg-bar punctuation, etc.
 *
 *  <p>This not only scans; it finds all the errors and warnings a
 *  parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
 *  It puts those in as TStrings with type {@link
 *  TString#ERROR} or {@link TString#WARNING}, and also, if
 *  errors is non-null, appends helpful messages to errors, each
 *  followed by a '\n'.
 *  @param s the ACIP text
 *  @param errors if non-null, the buffer to which to append error
 *  messages (FIXME: kludge, just get this info by scanning
 *  the result for TString.ERROR (and maybe TString.WARNING,
 *  if you care about warnings), but then we'd have to put the
 *  Offset info in the TString)
 *  @param maxErrors if nonnegative, then scanning stops as soon as
 *  the number of errors found reaches maxErrors (so a value of
 *  zero stops at the first error).  In this event, null is
 *  returned.  If negative, there is no limit.
 *  @param shortMessages true iff you want short error and warning
 *  messages instead of long, self-contained error messages
 *  @param warningLevel handed to ErrorsAndWarnings.isEnabled to
 *  decide which lexical warnings are put into the result
 *  @return null if the error limit described under maxErrors is
 *  hit, or the scan otherwise */
public static ArrayList scan(String s, StringBuffer errors, int maxErrors,
                             boolean shortMessages, String warningLevel) {
    // FIXME: Use less memory and time by not adding in the
    // warnings that are below threshold.

    // the size depends on whether it's mostly Tibetan or mostly
    // Latin and a number of other factors.  This is meant to be
    // an underestimate, but not too much of an underestimate.
    ArrayList al = new ArrayList(s.length() / 10);
    int numErrorsArray[] = new int[] { 0 };
    boolean waitingForMatchingIllegalClose = false;
    int sl = s.length();
    int currentType = TString.ERROR;
    int startOfString = 0;
    Stack bracketTypeStack = new Stack();
    int startSlashIndex = -1;
    int startParenIndex = -1;
    int numNewlines = 0;
    for (int i = 0; i < sl; i++) {
        if (i < startOfString) throw new Error("bad reset");
        char ch;
        ch = s.charAt(i);
        if (ch == '\n') ++numNewlines;
        if (TString.COMMENT == currentType && ch != ']') {
            // Inside a comment everything up to ']' is swallowed.
            if ('[' == ch) {
                if (queueError(102, "" + ch,
                               shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                    return null;
            }
            continue;
        }
        switch (ch) {
        case '}': // fall through...
        case ']':
            if (bracketTypeStack.empty()) {
                // Error.
                if (startOfString < i) {
                    al.add(new TString("ACIP", s.substring(startOfString, i),
                                       currentType));
                }
                if (!waitingForMatchingIllegalClose) {
                    if (queueError(103, "" + ch,
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                }
                waitingForMatchingIllegalClose = false;
                if (queueError(BRACKETED_SECTIONS_PASS_THROUGH_UNMODIFIED ? 140 : 104,
                               "" + ch,
                               shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                    return null;
                startOfString = i+1;
                currentType = TString.ERROR;
            } else {
                int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
                int end = startOfString;
                if (TString.CORRECTION_START == stackTop) {
                    // This definitely indicates a new token.
                    char prevCh = s.charAt(i-1);
                    if (prevCh == '?')
                        end = i - 1;
                    else
                        end = i;
                    if (startOfString < end) {
                        al.add(new TString("ACIP", s.substring(startOfString, end),
                                           currentType));
                    }
                    if ('?' != prevCh) {
                        currentType = TString.PROBABLE_CORRECTION;
                    } else {
                        currentType = TString.POSSIBLE_CORRECTION;
                    }
                }
                al.add(new TString("ACIP", s.substring(end, i+1), currentType));
                startOfString = i+1;
                currentType = TString.ERROR;
            }
            break; // end ']','}' case
        case '{': // NOTE WELL: KX0016I.ACT, KD0095M.ACT, and a
                  // host of other ACIP files use {} brackets like
                  // [] brackets.  I treat both the same if
                  // BRACKETED_SECTIONS_PASS_THROUGH_UNMODIFIED
                  // is false.
            // fall through...
        case '[':
            // This definitely indicates a new token.
            if (startOfString < i) {
                al.add(new TString("ACIP", s.substring(startOfString, i),
                                   currentType));
                startOfString = i;
                currentType = TString.ERROR;
            }
            if (BRACKETED_SECTIONS_PASS_THROUGH_UNMODIFIED) {
                int indexPastCloseBracket = i;
                boolean foundClose = false;
                while (++indexPastCloseBracket < sl) {
                    if ((('[' == ch) ? '[' : '{')
                        == s.charAt(indexPastCloseBracket)) { // "[i am [nested], you see]" is not allowed.
                        waitingForMatchingIllegalClose = true;
                        if (queueError(141, "" + ch,
                                       shortMessages, indexPastCloseBracket, numNewlines, maxErrors, al, errors, numErrorsArray))
                            return null;
                    } else if ((('[' == ch) ? ']' : '}') == s.charAt(indexPastCloseBracket)) {
                        // "[X]" keeps its brackets; "{X}" loses them.
                        al.add(new TString("ACIP",
                                           s.substring(startOfString + (('[' == ch) ? 0 : 1),
                                                       indexPastCloseBracket + (('[' == ch) ? 1 : 0)),
                                           TString.LATIN));
                        startOfString = indexPastCloseBracket + 1;
                        i = startOfString - 1;
                        currentType = TString.ERROR;
                        foundClose = true;
                        break;
                    }
                }
                if (!foundClose) {
                    if (queueError(139, contextSnippet(s, i),
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                    if (queueError(117, "-*-END OF FILE-*-",
                                   shortMessages, -1, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                    // we're done here:
                    i = sl;
                    startOfString = sl;
                }
            } else {
                String thingy = null;

                if (i + "[DD]".length() <= sl
                    && (s.substring(i, i + "[DD]".length()).equals("[DD]")
                        || s.substring(i, i + "[DD]".length()).equals("{DD}"))) {
                    thingy = "[DD]";
                    currentType = TString.DD;
                } else if (i + "[DD1]".length() <= sl
                           && (s.substring(i, i + "[DD1]".length()).equals("[DD1]")
                               || s.substring(i, i + "[DD1]".length()).equals("{DD1}"))) {
                    thingy = "[DD1]";
                    currentType = TString.DD;
                } else if (i + "[DD2]".length() <= sl
                           && (s.substring(i, i + "[DD2]".length()).equals("[DD2]")
                               || s.substring(i, i + "[DD2]".length()).equals("{DD2}"))) {
                    thingy = "[DD2]";
                    currentType = TString.DD;
                } else if (i + "[DDD]".length() <= sl
                           && (s.substring(i, i + "[DDD]".length()).equals("[DDD]")
                               || s.substring(i, i + "[DDD]".length()).equals("{DDD}"))) {
                    thingy = "[DDD]";
                    currentType = TString.DD;
                } else if (i + "[DR]".length() <= sl
                           && (s.substring(i, i + "[DR]".length()).equals("[DR]")
                               || s.substring(i, i + "[DR]".length()).equals("{DR}"))) {
                    thingy = "[DR]";
                    currentType = TString.DR;
                } else if (i + "[LS]".length() <= sl
                           && (s.substring(i, i + "[LS]".length()).equals("[LS]")
                               || s.substring(i, i + "[LS]".length()).equals("{LS}"))) {
                    thingy = "[LS]";
                    currentType = TString.LS;
                } else if (i + "[BP]".length() <= sl
                           && (s.substring(i, i + "[BP]".length()).equals("[BP]")
                               || s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
                    thingy = "[BP]";
                    currentType = TString.BP;
                } else if (i + "[BLANK PAGE]".length() <= sl
                           && (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]")
                               || s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) {
                    thingy = "[BLANK PAGE]";
                    currentType = TString.BP;
                } else if (i + "[ BP ]".length() <= sl
                           && (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
                               || s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
                    thingy = "{ BP }"; // found in TD3790E2.ACT
                    currentType = TString.BP;
                } else if (i + "[ DD ]".length() <= sl
                           && (s.substring(i, i + "[ DD ]".length()).equals("[ DD ]")
                               || s.substring(i, i + "[ DD ]".length()).equals("{ DD }"))) {
                    thingy = "{ DD }"; // found in TD3790E2.ACT
                    currentType = TString.DD;
                } else if (i + "[?]".length() <= sl
                           && (s.substring(i, i + "[?]".length()).equals("[?]")
                               || s.substring(i, i + "[?]".length()).equals("{?}"))) {
                    thingy = "[?]";
                    currentType = TString.QUESTION;
                } else {
                    // We see comments appear not as [#COMMENT], but
                    // as [COMMENT] sometimes.  We make special cases
                    // for some English comments.  There's no need to
                    // make this mechanism extensible, because you
                    // can easily edit the ACIP text so that it uses
                    // [#COMMENT] notation instead of [COMMENT].

                    String[] englishComments = new String[] {
                        "FIRST", "SECOND", // S5274I.ACT
                        "Additional verses added by Khen Rinpoche here are", // S0216M.ACT
                        "ADDENDUM: The text of", // S0216M.ACT
                        "END OF ADDENDUM", // S0216M.ACT
                        "Some of the verses added here by Khen Rinpoche include:", // S0216M.ACT
                        "Note that, in the second verse, the {YUL LJONG} was orignally {GANG LJONG},\nand is now recited this way since the ceremony is not only taking place in Tibet.", // S0216M.ACT
                        "Note that, in the second verse, the {YUL LJONG} was orignally {GANG LJONG},\r\nand is now recited this way since the ceremony is not only taking place in Tibet.", // S0216M.ACT
                        "text missing", // S6954E1.ACT
                        "INCOMPLETE", // TD3817I.INC
                        "MISSING PAGE", // S0935m.act
                        "MISSING FOLIO", // S0975I.INC
                        "UNCLEAR LINE", // S0839D1I.INC
                        "THE FOLLOWING TEXT HAS INCOMPLETE SECTIONS, WHICH ARE ON ORDER", // SE6260A.INC
                        "@DATA INCOMPLETE HERE", // SE6260A.INC
                        "@DATA MISSING HERE", // SE6260A.INC
                        "LINE APPARENTLY MISSING THIS PAGE", // TD4035I.INC
                        "DATA INCOMPLETE HERE", // TD4226I2.INC
                        "DATA MISSING HERE", // just being consistent
                        "FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
                        "FOLLOWING SECTION WAS NOT AVAILABLE WHEN THIS EDITION WAS\r\nPRINTED, AND IS SUPPLIED FROM ANOTHER, PROBABLY THE ORIGINAL:", // S0018N.ACT
                        "THESE PAGE NUMBERS RESERVED IN THIS EDITION FOR PAGES\nMISSING FROM ORIGINAL ON WHICH IT WAS BASED", // S0018N.ACT
                        "THESE PAGE NUMBERS RESERVED IN THIS EDITION FOR PAGES\r\nMISSING FROM ORIGINAL ON WHICH IT WAS BASED", // S0018N.ACT
                        "PAGE NUMBERS RESERVED FROM THIS EDITION FOR MISSING\nSECTION SUPPLIED BY PRECEDING", // S0018N.ACT
                        "PAGE NUMBERS RESERVED FROM THIS EDITION FOR MISSING\r\nSECTION SUPPLIED BY PRECEDING", // S0018N.ACT
                        "SW: OK", // S0057M.ACT
                        "m:ok", // S0057M.ACT
                        "A FIRST ONE\nMISSING HERE?", // S0057M.ACT
                        "A FIRST ONE\r\nMISSING HERE?", // S0057M.ACT
                        "THE INITIAL PART OF THIS TEXT WAS INPUT BY THE SERA MEY LIBRARY IN\nTIBETAN FONT AND NEEDS TO BE REDONE BY DOUBLE INPUT", // S0195A1.INC
                        "THE INITIAL PART OF THIS TEXT WAS INPUT BY THE SERA MEY LIBRARY IN\r\nTIBETAN FONT AND NEEDS TO BE REDONE BY DOUBLE INPUT", // S0195A1.INC
                    };
                    boolean foundOne = false;
                    for (int ec = 0; ec < englishComments.length; ec++) {
                        // Accept both [COMMENT] and {COMMENT}, just as
                        // the [DD]/{DD} style markers above do.
                        if (i + 2 + englishComments[ec].length() <= sl
                            && (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
                                || s.substring(i, i + 2 + englishComments[ec].length()).equals("{" + englishComments[ec] + "}"))) {
                            al.add(new TString("ACIP", "[#" + englishComments[ec] + "]",
                                               TString.COMMENT));
                            startOfString = i + 2 + englishComments[ec].length();
                            i = startOfString - 1;
                            foundOne = true;
                            break;
                        }
                    }
                    if (!foundOne && i+1 < sl && s.charAt(i+1) == '*') {
                        // Identify [*LINE BREAK?] as an English
                        // correction.  Every correction not on this
                        // list is considered to be Tibetan.
                        // FIXME: make this extensible via a config
                        // file or at least a System property (which
                        // could be a comma-separated list of these
                        // creatures.

                        // If "LINE" is in the list below, then [*
                        // LINE], [* LINE?], [*LINE], [*LINE?], [*
                        // LINE OUT ?], etc. will be considered
                        // English corrections.  I.e., whitespace
                        // before and anything after doesn't prevent a
                        // match.
                        String[] englishCorrections = new String[] {
                            "LINE", // KD0001I1.ACT
                            "DATA", // KL0009I2.INC
                            "BLANK", // KL0009I2.INC
                            "NOTE", // R0001F.ACM
                            "alternate", // R0018F.ACE
                            "02101-02150 missing", // R1003A3.INC
                            "51501-51550 missing", // R1003A52.ACT
                            "BRTAGS ETC", // S0002N.ACT
                            "TSAN, ETC", // S0015N.ACT
                            "SNYOMS, THROUGHOUT", // S0016N.ACT
                            "KYIS ETC", // S0019N.ACT
                            "MISSING", // S0455M.ACT
                            "this", // S6850I1B.ALT
                            "THIS", // S0057M.ACT
                        };
                        int begin;
                        for (begin = i+2; begin < sl; begin++) {
                            if (!isWhitespace(s.charAt(begin)))
                                break;
                        }
                        int end;
                        for (end = i+2; end < sl; end++) {
                            if (s.charAt(end) == ']')
                                break;
                        }
                        int realEnd = end;
                        if (end < sl && s.charAt(end-1) == '?')
                            --realEnd;
                        if (end < sl && begin < realEnd) {
                            String interestingSubstring
                                = s.substring(begin, realEnd);
                            for (int ec = 0; ec < englishCorrections.length; ec++) {
                                if (interestingSubstring.startsWith(englishCorrections[ec])) {
                                    al.add(new TString("ACIP", s.substring(i, i+2),
                                                       TString.CORRECTION_START));
                                    al.add(new TString("ACIP", s.substring(i+2, realEnd),
                                                       TString.LATIN));
                                    if (s.charAt(end - 1) == '?') {
                                        al.add(new TString("ACIP", s.substring(end-1, end+1),
                                                           TString.POSSIBLE_CORRECTION));
                                    } else {
                                        al.add(new TString("ACIP", s.substring(end, end+1),
                                                           TString.PROBABLE_CORRECTION));
                                    }
                                    foundOne = true;
                                    startOfString = end+1;
                                    i = startOfString - 1;
                                    break;
                                }
                            }
                        }
                    }
                    if (foundOne)
                        break;
                }
                if (null != thingy) {
                    al.add(new TString("ACIP", thingy,
                                       currentType));
                    startOfString = i + thingy.length();
                    i = startOfString - 1;
                } else {
                    if (i + 1 < sl) {
                        char nextCh = s.charAt(i+1);
                        if ('*' == nextCh) {
                            currentType = TString.CORRECTION_START;
                            bracketTypeStack.push(new Integer(currentType));
                            al.add(new TString("ACIP", s.substring(i, i+2),
                                               TString.CORRECTION_START));
                            currentType = TString.ERROR;
                            startOfString = i+2;
                            i = startOfString - 1;
                            break;
                        } else if ('#' == nextCh) {
                            currentType = TString.COMMENT;
                            bracketTypeStack.push(new Integer(currentType));
                            break;
                        }
                    }
                    // This is an error.  Sometimes [COMMENTS APPEAR
                    // WITHOUT # MARKS].  Though "... [" could cause
                    // this too.
                    if (waitingForMatchingIllegalClose) {
                        if (queueError(105, "" + ch,
                                       shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                            return null;
                    }
                    waitingForMatchingIllegalClose = true;
                    if (queueError(106, contextSnippet(s, i),
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                    startOfString = i + 1;
                    currentType = TString.ERROR;
                }
            }
            break; // end '[','{' case
        case '@':
            // This definitely indicates a new token.
            if (startOfString < i) {
                al.add(new TString("ACIP", s.substring(startOfString, i),
                                   currentType));
                startOfString = i;
                currentType = TString.ERROR;
            }
            // We look for {@N{AB}, @NN{AB}, ..., @NNNNNN{AB}},
            // {@[N{AB}], @[NN{AB}], ..., @[NNNNNN{AB}]},
            // {@N{AB}.N, @NN{AB}.N, ..., @NNNNNN{AB}.N}, {@N,
            // @NN, ..., @NNNNNN}, and {@{AB}N, @{AB}NN,
            // ... @{AB}NNNNNN} only, that is from one to six
            // digits.  Each of these folio marker format occurs
            // in practice.
            for (int numdigits = 6; numdigits >= 1; numdigits--) {
                // @NNN{AB} and @NNN{AB}.N cases:
                if (i+numdigits+1 < sl
                    && (s.charAt(i+numdigits+1) == 'A' || s.charAt(i+numdigits+1) == 'B')) {
                    boolean allAreNumeric = true;
                    for (int k = 1; k <= numdigits; k++) {
                        if (!isNumeric(s.charAt(i+k))) {
                            allAreNumeric = false;
                            break;
                        }
                    }
                    if (allAreNumeric) {
                        // Is this "@012B " or "@012B.3 "?
                        int extra;
                        if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') {
                            if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3))
                                  && !isNumeric(s.charAt(i+numdigits+4)))) {
                                if (queueError(107, contextSnippet(s, i),
                                               shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                                    return null;
                                startOfString = i+numdigits+3;
                                i = startOfString - 1;
                                currentType = TString.ERROR;
                                break;
                            }
                            if (i+numdigits+4 < sl
                                && (s.charAt(i+numdigits+4) == '.'
                                    || s.charAt(i+numdigits+4) == 'A'
                                    || s.charAt(i+numdigits+4) == 'B'
                                    || s.charAt(i+numdigits+4) == 'a'
                                    || s.charAt(i+numdigits+4) == 'b'
                                    || isNumeric(s.charAt(i+numdigits+4)))) {
                                if (queueError(108, contextSnippet(s, i),
                                               shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                                    return null;
                                startOfString = i+1; // FIXME: skip over more?  test this code.
                                currentType = TString.ERROR;
                                break;
                            }
                            extra = 4;
                        } else {
                            extra = 2;
                        }
                        al.add(new TString("ACIP", s.substring(i, i+numdigits+extra),
                                           TString.FOLIO_MARKER));
                        startOfString = i+numdigits+extra;
                        i = startOfString - 1;
                        currentType = TString.ERROR;
                        break;
                    }
                }
                // @{AB}NNN case:
                if (i+numdigits+1 < sl
                    && (s.charAt(i+1) == 'A' || s.charAt(i+1) == 'B')) {
                    boolean allAreNumeric = true;
                    for (int k = 1; k <= numdigits; k++) {
                        if (!isNumeric(s.charAt(i+1+k))) {
                            allAreNumeric = false;
                            break;
                        }
                    }
                    if (allAreNumeric) {
                        al.add(new TString("ACIP", s.substring(i, i+numdigits+2),
                                           TString.FOLIO_MARKER));
                        startOfString = i+numdigits+2;
                        i = startOfString - 1;
                        currentType = TString.ERROR;
                        break;
                    }
                }
                // @[NNN{AB}] case:
                if (i+numdigits+3 < sl
                    && s.charAt(i+1) == '[' && s.charAt(i+numdigits+3) == ']'
                    && (s.charAt(i+numdigits+2) == 'A' || s.charAt(i+numdigits+2) == 'B')) {
                    boolean allAreNumeric = true;
                    for (int k = 1; k <= numdigits; k++) {
                        if (!isNumeric(s.charAt(i+1+k))) {
                            allAreNumeric = false;
                            break;
                        }
                    }
                    if (allAreNumeric) {
                        al.add(new TString("ACIP", s.substring(i, i+numdigits+4),
                                           TString.FOLIO_MARKER));
                        startOfString = i+numdigits+4;
                        i = startOfString - 1;
                        currentType = TString.ERROR;
                        break;
                    }
                }
                // This case, @NNN, must come after the @NNN{AB} case.
                if (i+numdigits+1 < sl && (s.charAt(i+numdigits+1) == ' '
                                           || s.charAt(i+numdigits+1) == '\n'
                                           || s.charAt(i+numdigits+1) == '\r')) {
                    boolean allAreNumeric = true;
                    for (int k = 1; k <= numdigits; k++) {
                        if (!isNumeric(s.charAt(i+k))) {
                            allAreNumeric = false;
                            break;
                        }
                    }
                    if (allAreNumeric) {
                        al.add(new TString("ACIP", s.substring(i, i+numdigits+1),
                                           TString.FOLIO_MARKER));
                        startOfString = i+numdigits+1;
                        i = startOfString - 1;
                        currentType = TString.ERROR;
                        break;
                    }
                }
            }
            // If nothing above consumed the '@', it is not a legal
            // folio marker.
            if (startOfString == i) {
                if (queueError(109, contextSnippet(s, i),
                               shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                    return null;
                startOfString = i+1;
                currentType = TString.ERROR;
            }
            break; // end '@' case
        case '/':
            // This definitely indicates a new token.
            if (startOfString < i) {
                al.add(new TString("ACIP", s.substring(startOfString, i),
                                   currentType));
                startOfString = i;
                currentType = TString.ERROR;
            }
            if (startSlashIndex >= 0) {
                if (startSlashIndex + 1 == i) {
                    if (queueError(110, "" + ch,
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                }
                al.add(new TString("ACIP", s.substring(i, i+1),
                                   TString.END_SLASH));
                startOfString = i+1;
                currentType = TString.ERROR;
                startSlashIndex = -1;
            } else {
                startSlashIndex = i;
                al.add(new TString("ACIP", s.substring(i, i+1),
                                   TString.START_SLASH));
                startOfString = i+1;
                currentType = TString.ERROR;
            }
            break; // end '/' case
        case '(':
        case ')':
            // This definitely indicates a new token.
            if (startOfString < i) {
                al.add(new TString("ACIP", s.substring(startOfString, i),
                                   currentType));
                startOfString = i;
                currentType = TString.ERROR;
            }
            // We do not support nesting like (NYA (BA)).
            if (startParenIndex >= 0) {
                if (ch == '(') {
                    if (queueError(111, "" + ch,
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                } else {
                    al.add(new TString("ACIP", s.substring(i, i+1), TString.END_PAREN));
                    startParenIndex = -1;
                }
                startOfString = i+1;
                currentType = TString.ERROR;
            } else {
                if (ch == ')') {
                    if (queueError(112, "" + ch,
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                } else {
                    startParenIndex = i;
                    al.add(new TString("ACIP", s.substring(i, i+1), TString.START_PAREN));
                }
                startOfString = i+1;
                currentType = TString.ERROR;
            }
            break; // end '(',')' case
        case '?':
            if (bracketTypeStack.empty() || i+1>=sl
                || (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
                // The tsheg bar ends here; new token.
                if (startOfString < i) {
                    al.add(new TString("ACIP", s.substring(startOfString, i),
                                       currentType));
                }
                if (queueError(113, "" + ch,
                               shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                    return null;
                startOfString = i+1;
                currentType = TString.ERROR;
            } // else this is [*TR'A ?] or the like.
            break; // end '?' case
        case '.':
            // This definitely indicates a new token.
            if (startOfString < i) {
                al.add(new TString("ACIP", s.substring(startOfString, i),
                                   currentType));
                startOfString = i;
                currentType = TString.ERROR;
            }
            // . is used for a non-breaking tsheg, such as in
            // {NGO.,} and {....,DAM}.  We give a warning unless ,
            // or ., or [A-Za-z] follows '.'.
            al.add(new TString("ACIP", s.substring(i, i+1),
                               TString.TIBETAN_PUNCTUATION));
            if (ErrorsAndWarnings.isEnabled(510, warningLevel)
                && (!(i + 1 < sl
                      && (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
                          || (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
                          || (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
                          || (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z'))))) {
                al.add(new TString("ACIP",
                                   ErrorsAndWarnings.getMessage(510,
                                                                shortMessages,
                                                                "" + ch),
                                   TString.WARNING));
            }
            startOfString = i+1;
            break; // end '.' case
        // Classic tsheg bar enders:
        case ' ':
        case '\t':
        case '\r':
        case '\n':
        case ',':
        case '*':
        case ';':
        case '`':
        case '#':
        case '%':
        case 'x':
        case 'o':
        case '^':
        case '&':
            boolean legalTshegBarAdornment = false;
            // The tsheg bar ends here; new token.
            if (startOfString < i) {
                if (currentType == TString.TIBETAN_NON_PUNCTUATION
                    && isTshegBarAdornment(ch))
                    legalTshegBarAdornment = true;
                al.add(new TString("ACIP", s.substring(startOfString, i),
                                   currentType));
            }
            // Insert a tsheg if necessary.  ACIP files aren't
            // careful, so "KA\r\n" and "GA\n" appear where "KA
            // \r\n" and "GA \n" should appear.
            if (('\r' == ch
                 || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
                && !al.isEmpty()
                && lastNonExceptionalThingWasAdornmentOr(al, TString.TIBETAN_NON_PUNCTUATION)) {
                al.add(new TString("ACIP", " ", TString.TIBETAN_PUNCTUATION));
            }
            // "DANG,\nLHAG" is really "DANG, LHAG".  But always?  Not if you have "MDO,\n\nKA...".
            if (('\r' == ch
                 || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
                && !al.isEmpty()
                && lastNonExceptionalThingWasAdornmentOr(al, TString.TIBETAN_PUNCTUATION)
                && ((TString)al.get(al.size() - 1)).getText().equals(",")
                && s.charAt(i-1) == ','
                && (i + (('\r' == ch) ? 2 : 1) < sl
                    && (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
                al.add(new TString("ACIP", " ", TString.TIBETAN_PUNCTUATION));
            }
            if ('^' == ch) {
                // "^ GONG SA" is the same as "^GONG SA" or
                // "^\r\nGONG SA".  But "^\n\nGONG SA" is
                // different -- that has a true line break in the
                // output between ^ and GONG.  We give an error if
                // ^ isn't followed by an alphabetical character.
                boolean bad = false;
                if (i + 1 < sl && isAlpha(s.charAt(i+1))) {
                    // leave i alone
                } else if (i + 2 < sl && (' ' == s.charAt(i+1)
                                          || '\r' == s.charAt(i+1)
                                          || '\n' == s.charAt(i+1))
                           && isAlpha(s.charAt(i+2))) {
                    ++i;
                } else if (i + 3 < sl && '\r' == s.charAt(i+1)
                           && '\n' == s.charAt(i+2)
                           && isAlpha(s.charAt(i+3))) {
                    i += 2;
                } else {
                    bad = true;
                }
                if (!bad)
                    al.add(new TString("ACIP", "^", TString.TIBETAN_PUNCTUATION));
                else {
                    if (queueError(131, "^",
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                }
            } else {
                // Don't add in a "\r\n" or "\n" unless there's a
                // blank line.
                boolean rn = false;
                boolean realNewline = false;
                if (('\n' != ch && '\r' != ch)
                    || (realNewline
                        = ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
                           || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
                    for (int h = 0; h < (realNewline ? 2 : 1); h++) {
                        if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
                            if (queueError(132, "" + ch,
                                           shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                                return null;
                        } else {
                            al.add(new TString("ACIP", rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
                                               (legalTshegBarAdornment
                                                ? TString.TSHEG_BAR_ADORNMENT
                                                : TString.TIBETAN_PUNCTUATION)));
                        }
                    }
                }
                if ('%' == ch
                    && ErrorsAndWarnings.isEnabled(504, warningLevel)) {
                    al.add(new TString("ACIP",
                                       ErrorsAndWarnings.getMessage(504,
                                                                    shortMessages,
                                                                    "" + ch),
                                       TString.WARNING));
                }
            }
            startOfString = i+1;
            currentType = TString.ERROR;
            break; // end TIBETAN_PUNCTUATION | TSHEG_BAR_ADORNMENT case
        default:
            if (!bracketTypeStack.empty()) {
                int stackTop = ((Integer)bracketTypeStack.peek()).intValue();
                if (TString.CORRECTION_START == stackTop && '?' == ch) {
                    // allow it through...
                    break;
                }
            }
            if (i+1 == sl && 26 == (int)ch)
                // Silently allow the last character to be
                // control-Z (sometimes printed as ^Z), which just
                // marks end of file.
                break;
            if (!(isNumeric(ch) || isAlpha(ch))) {
                if (startOfString < i) {
                    al.add(new TString("ACIP", s.substring(startOfString, i),
                                       currentType));
                }
                if ((int)ch == 65533) {
                    if (queueError(114, "unknown character",
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                } else if ('\\' == ch) {
                    int x = -1;
                    if (!ThdlOptions.getBooleanOption("thdl.tib.text.disallow.unicode.character.escapes.in.acip")
                        && i + 5 < sl && 'u' == s.charAt(i+1)) {
                        try {
                            if (!((x = Integer.parseInt(s.substring(i+2, i+6), 16)) >= 0x0000 && x <= 0xFFFF))
                                x = -1;
                        } catch (NumberFormatException e) {
                            // Though this is unlikely to be
                            // legal, we allow it through.
                            // (FIXME: warn.)
                        }
                    }
                    if (x >= 0) {
                        al.add(new TString("ACIP", new String(new char[] { (char)x }),
                                           TString.UNICODE_CHARACTER));
                        i += "uXXXX".length();
                        startOfString = i+1;
                        break;
                    } else {
                        if (queueError(115, "\\",
                                       shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                            return null;
                    }
                } else {
                    if (queueError(116, "" + ch,
                                   shortMessages, i, numNewlines, maxErrors, al, errors, numErrorsArray))
                        return null;
                }
                startOfString = i+1;
                currentType = TString.ERROR;
            } else {
                // Continue through the loop.
                if (TString.ERROR == currentType)
                    currentType = TString.TIBETAN_NON_PUNCTUATION;
            }
            break; // end default case
        }
    }
    // Flush whatever token was still open when the input ended.
    if (startOfString < sl) {
        al.add(new TString("ACIP", s.substring(startOfString, sl),
                           currentType));
    }
    if (waitingForMatchingIllegalClose) {
        if (queueError(117, "-*-END OF FILE-*-",
                       shortMessages, -1, numNewlines, maxErrors, al, errors, numErrorsArray))
            return null;
    }
    if (!bracketTypeStack.empty()) {
        if (queueError(((TString.COMMENT == currentType) ? 118 : 119),
                       "-*-END OF FILE-*-",
                       shortMessages, -1, numNewlines, maxErrors, al, errors, numErrorsArray))
            return null;
    }
    if (startSlashIndex >= 0) {
        if (queueError(120, "-*-END OF FILE-*-",
                       shortMessages, -1, numNewlines, maxErrors, al, errors, numErrorsArray))
            return null;
    }
    if (startParenIndex >= 0) {
        if (queueError(121, "-*-END OF FILE-*-",
                       shortMessages, -1, numNewlines, maxErrors, al, errors, numErrorsArray))
            return null;
    }
    return al;
}

/** Helper for {@link #scan}.  Returns a short excerpt of s starting
 *  at offset i, for use in error messages: at most ten characters,
 *  cut off before the first '\r' (or, if there is none, before the
 *  first '\n') in that window.  If the window contains no line break
 *  and more than ten characters remain in s, "..." is appended. */
private static String contextSnippet(String s, int i) {
    int remaining = s.length() - i;
    String inContext = s.substring(i, i + Math.min(remaining, 10));
    if (inContext.indexOf("\r") >= 0) {
        inContext = inContext.substring(0, inContext.indexOf("\r"));
    } else if (inContext.indexOf("\n") >= 0) {
        inContext = inContext.substring(0, inContext.indexOf("\n"));
    } else {
        if (remaining > 10) {
            inContext = inContext + "...";
        }
    }
    return inContext;
}
/** Returns true iff ch is one of the ASCII digits '0' through '9'. */
private static boolean isNumeric(char ch) {
    if (ch < '0') {
        return false;
    }
    return ch <= '9';
}
/** Returns true iff ch is a space, tab, carriage return, or
 *  newline. */
private static boolean isWhitespace(char ch) {
    switch (ch) {
    case ' ':
    case '\t':
    case '\r':
    case '\n':
        return true;
    default:
        return false;
    }
}
/** Returns true iff ch is one of the post-adornments '%', 'o', or
 *  'x'. */
private static boolean isTshegBarAdornment(char ch) {
    // ^ is a pre-adornment; these are post-adornments.
    switch (ch) {
    case '%':
    case 'o':
    case 'x':
        return true;
    default:
        return false;
    }
}
/** Returns true iff ch is one of the characters this scanner
 *  accepts inside a tsheg bar besides digits: the apostrophe, the
 *  marks ':', '-', and '+', the lowercase letters m, i, t, h, d, n,
 *  and s, and every uppercase ASCII letter except X, Q, and F. */
private static boolean isAlpha(char ch) {
    switch (ch) {
    case 'X':
    case 'Q':
    case 'F':
        // The only uppercase letters that are not accepted.
        return false;
    case '\'': // 23rd consonant
    // combining punctuation, vowels:
    case 'm':
    case ':':
    // FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on.  Until then, warn.  See bug 838588 || ch == '\\'
    case '-':
    case '+':
    case 'i':
    case 't':
    case 'h':
    case 'd':
    case 'n':
    case 's':
        return true;
    default:
        return ch >= 'A' && ch <= 'Z';
    }
}
}