Andres found that "THAG PA" caused a NullPointerException. That's fixed.

Renamed ACIPString to TString -- we'll use this for EWTS and ACIP both.

TMW->ACIP for TMW9.61 should work now.
This commit is contained in:
dchandler 2003-10-04 01:22:59 +00:00
parent c8927b827c
commit ee50291ed4
4 changed files with 207 additions and 205 deletions

View file

@ -31,7 +31,7 @@ import org.thdl.tib.text.DuffCode;
/** /**
* This class is able to convert an ACIP file into Tibetan Machine Web * This class is able to convert an ACIP file into Tibetan Machine Web
* and an ACIP file into TMW. ACIP->Unicode should yield the same * and an ACIP file into Unicode. ACIP->Unicode should yield the same
* results as ACIP->TMW followed by TMW->Unicode (FIXME: test it!) * results as ACIP->TMW followed by TMW->Unicode (FIXME: test it!)
* @author David Chandler * @author David Chandler
*/ */
@ -225,15 +225,15 @@ public class ACIPConverter {
writeWarningsToOut, warningLevel, false); writeWarningsToOut, warningLevel, false);
} }
private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of ACIPString */ scan, private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of TString */ scan,
int pos) { int pos) {
int sz = scan.size(); int sz = scan.size();
while (pos < sz) { while (pos < sz) {
ACIPString s = (ACIPString)scan.get(pos++); TString s = (TString)scan.get(pos++);
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(" ")) { if (s.getType() == TString.TIBETAN_PUNCTUATION && s.getText().equals(" ")) {
// keep going // keep going
} else { } else {
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(",")) { if (s.getType() == TString.TIBETAN_PUNCTUATION && s.getText().equals(",")) {
return true; return true;
} else { } else {
return false; return false;
@ -286,16 +286,16 @@ public class ACIPConverter {
Color lastColor = Color.BLACK; Color lastColor = Color.BLACK;
Color color = Color.BLACK; Color color = Color.BLACK;
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
ACIPString s = (ACIPString)scan.get(i); TString s = (TString)scan.get(i);
int stype = s.getType(); int stype = s.getType();
if (stype == ACIPString.ERROR) { if (stype == TString.ERROR) {
lastGuyWasNonPunct = false; lastGuyWasNonPunct = false;
lastGuy = null; lastGuy = null;
hasErrors = true; hasErrors = true;
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]"; String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
if (null != writer) writer.write(text); if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text, Color.RED); if (null != tdoc) tdoc.appendRoman(text, Color.RED);
} else if (stype == ACIPString.TSHEG_BAR_ADORNMENT) { } else if (stype == TString.TSHEG_BAR_ADORNMENT) {
if (lastGuyWasNonPunct) { if (lastGuyWasNonPunct) {
String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]"; String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]";
if (null != writer) { if (null != writer) {
@ -322,7 +322,7 @@ public class ACIPConverter {
} }
lastGuyWasNonPunct = true; // this stuff is not really punctuation lastGuyWasNonPunct = true; // this stuff is not really punctuation
lastGuy = null; lastGuy = null;
} else if (stype == ACIPString.WARNING) { } else if (stype == TString.WARNING) {
lastGuyWasNonPunct = false; lastGuyWasNonPunct = false;
lastGuy = null; lastGuy = null;
if (writeWarningsToOut) { if (writeWarningsToOut) {
@ -341,15 +341,15 @@ public class ACIPConverter {
lastGuyWasNonPunct = false; lastGuyWasNonPunct = false;
lastGuy = null; lastGuy = null;
String text String text
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "") = (((stype == TString.FOLIO_MARKER) ? "{" : "")
+ s.getText() + s.getText()
+ ((stype == ACIPString.FOLIO_MARKER) ? "}" : "")); + ((stype == TString.FOLIO_MARKER) ? "}" : ""));
if (null != writer) writer.write(text); if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text, Color.BLACK); if (null != tdoc) tdoc.appendRoman(text, Color.BLACK);
} else { } else {
String unicode = null; String unicode = null;
DuffCode[] duff = null; DuffCode[] duff = null;
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) { if (stype == TString.TIBETAN_NON_PUNCTUATION) {
lastGuyWasNonPunct = true; lastGuyWasNonPunct = true;
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText()); TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
String acipError; String acipError;
@ -424,13 +424,13 @@ public class ACIPConverter {
} }
} else { } else {
color = Color.BLACK; color = Color.BLACK;
if (stype == ACIPString.START_SLASH) { if (stype == TString.START_SLASH) {
if (null != writer) unicode = "\u0F3C"; if (null != writer) unicode = "\u0F3C";
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") }; if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") };
} else if (stype == ACIPString.END_SLASH) { } else if (stype == TString.END_SLASH) {
if (null != writer) unicode = "\u0F3D"; if (null != writer) unicode = "\u0F3D";
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") }; if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
} else if (stype == ACIPString.TIBETAN_PUNCTUATION) { } else if (stype == TString.TIBETAN_PUNCTUATION) {
// For ACIP, tshegs are used as both // For ACIP, tshegs are used as both
// tshegs and whitespace. We treat a // tshegs and whitespace. We treat a
// space as a tsheg if and only if it // space as a tsheg if and only if it
@ -452,7 +452,8 @@ public class ACIPConverter {
// space. // space.
&& ((lpl.get(0).getLeft().equals("G") && ((lpl.get(0).getLeft().equals("G")
|| lpl.get(0).getLeft().equals("K")) || lpl.get(0).getLeft().equals("K"))
&& (lpl.get(0).getRight().indexOf('U') < 0)) && (null == lpl.get(0).getRight()
|| lpl.get(0).getRight().indexOf('U') < 0))
&& &&
// it's (G . anything) // it's (G . anything)
// followed by some number of // followed by some number of
@ -500,12 +501,12 @@ public class ACIPConverter {
} }
} }
} }
} else if (stype == ACIPString.START_PAREN) { } else if (stype == TString.START_PAREN) {
if (null != tdoc) { if (null != tdoc) {
tdoc.setTibetanFontSize(smallFontSize); tdoc.setTibetanFontSize(smallFontSize);
} }
continue; continue;
} else if (stype == ACIPString.END_PAREN) { } else if (stype == TString.END_PAREN) {
if (null != tdoc) { if (null != tdoc) {
tdoc.setTibetanFontSize(regularFontSize); tdoc.setTibetanFontSize(regularFontSize);
} }

View file

@ -174,6 +174,7 @@ public class ACIPRules {
if (null == wylieToACIP) { if (null == wylieToACIP) {
wylieToACIP = new HashMap(75); wylieToACIP = new HashMap(75);
wylieToACIP.put("_", " "); // oddball. wylieToACIP.put("_", " "); // oddball.
wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
} }
wylieToACIP.put(EWTS, ACIP); wylieToACIP.put(EWTS, ACIP);
} }

View file

@ -70,7 +70,7 @@ public class ACIPTshegBarScanner {
/** Scans an ACIP file with path fname into tsheg bars. If errors /** Scans an ACIP file with path fname into tsheg bars. If errors
* is non-null, error messages will be appended to it. Returns a * is non-null, error messages will be appended to it. Returns a
* list of ACIPStrings that is the scan. <p>FIXME: not so * list of TStrings that is the scan. <p>FIXME: not so
* efficient; copies the whole file into memory first. * efficient; copies the whole file into memory first.
* @throws IOException if we cannot read in the ACIP input file */ * @throws IOException if we cannot read in the ACIP input file */
public static ArrayList scanFile(String fname, StringBuffer errors, int maxErrors) public static ArrayList scanFile(String fname, StringBuffer errors, int maxErrors)
@ -83,7 +83,7 @@ public class ACIPTshegBarScanner {
/** Scans a stream of ACIP into tsheg bars. If errors is /** Scans a stream of ACIP into tsheg bars. If errors is
* non-null, error messages will be appended to it. You can * non-null, error messages will be appended to it. You can
* recover both errors and warnings (modulo offset information) * recover both errors and warnings (modulo offset information)
* from the result, though. Returns a list of ACIPStrings that * from the result, though. Returns a list of TStrings that
* is the scan, or null if more than maxErrors occur. <p>FIXME: * is the scan, or null if more than maxErrors occur. <p>FIXME:
* not so efficient; copies the whole file into memory first. * not so efficient; copies the whole file into memory first.
* @throws IOException if we cannot read the whole ACIP stream */ * @throws IOException if we cannot read the whole ACIP stream */
@ -104,7 +104,7 @@ public class ACIPTshegBarScanner {
return scan(s.toString(), errors, maxErrors); return scan(s.toString(), errors, maxErrors);
} }
/** Returns a list of {@link ACIPString ACIPStrings} corresponding /** Returns a list of {@link TString TStrings} corresponding
* to s, possibly the empty list (when the empty string is the * to s, possibly the empty list (when the empty string is the
* input). Each String is either a Latin comment, some Latin * input). Each String is either a Latin comment, some Latin
* text, a tsheg bar (minus the tsheg or shad or whatever), a * text, a tsheg bar (minus the tsheg or shad or whatever), a
@ -112,16 +112,16 @@ public class ACIPTshegBarScanner {
* *
* <p>This not only scans; it finds all the errors and warnings a * <p>This not only scans; it finds all the errors and warnings a
* parser would too, like "NYA x" and "(" and ")" and "/NYA" etc. * parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
* It puts those in as ACIPStrings with type {@link * It puts those in as TStrings with type {@link
* ACIPString#ERROR} or {@link ACIPString#WARNING}, and also, if * TString#ERROR} or {@link TString#WARNING}, and also, if
* errors is non-null, appends helpful messages to errors, each * errors is non-null, appends helpful messages to errors, each
* followed by a '\n'. * followed by a '\n'.
* @param s the ACIP text * @param s the ACIP text
* @param errors if non-null, the buffer to which to append error * @param errors if non-null, the buffer to which to append error
* messages (DLC FIXME: cludge, just get this info by scanning * messages (DLC FIXME: cludge, just get this info by scanning
* the result for ACIPString.ERROR (and maybe ACIPString.WARNING, * the result for TString.ERROR (and maybe TString.WARNING,
* if you care about warnings), but then we'd have to put the * if you care about warnings), but then we'd have to put the
* Offset info in the ACIPString) * Offset info in the TString)
* @param maxErrors if nonnegative, then scanning will stop when * @param maxErrors if nonnegative, then scanning will stop when
* more than maxErrors errors occur. In this event, null is * more than maxErrors errors occur. In this event, null is
* returned. * returned.
@ -138,7 +138,7 @@ public class ACIPTshegBarScanner {
boolean waitingForMatchingIllegalClose = false; boolean waitingForMatchingIllegalClose = false;
int sl = s.length(); int sl = s.length();
int currentType = ACIPString.ERROR; int currentType = TString.ERROR;
int startOfString = 0; int startOfString = 0;
Stack bracketTypeStack = new Stack(); Stack bracketTypeStack = new Stack();
int startSlashIndex = -1; int startSlashIndex = -1;
@ -149,10 +149,10 @@ public class ACIPTshegBarScanner {
char ch; char ch;
ch = s.charAt(i); ch = s.charAt(i);
if (ch == '\n') ++numNewlines; if (ch == '\n') ++numNewlines;
if (ACIPString.COMMENT == currentType && ch != ']') { if (TString.COMMENT == currentType && ch != ']') {
if ('[' == ch) { if ('[' == ch) {
al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n", al.add(new TString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"); + "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
@ -166,12 +166,12 @@ public class ACIPTshegBarScanner {
if (bracketTypeStack.empty()) { if (bracketTypeStack.empty()) {
// Error. // Error.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
} }
if (!waitingForMatchingIllegalClose) { if (!waitingForMatchingIllegalClose) {
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1), al.add(new TString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
ACIPString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched close bracket, ] or }.\n"); + "Found a truly unmatched close bracket, ] or }.\n");
@ -179,19 +179,19 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
waitingForMatchingIllegalClose = false; waitingForMatchingIllegalClose = false;
al.add(new ACIPString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.", al.add(new TString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} else { } else {
int stackTop = ((Integer)bracketTypeStack.pop()).intValue(); int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
int end = startOfString; int end = startOfString;
if (ACIPString.CORRECTION_START == stackTop) { if (TString.CORRECTION_START == stackTop) {
// This definitely indicates a new token. // This definitely indicates a new token.
char prevCh = s.charAt(i-1); char prevCh = s.charAt(i-1);
@ -200,19 +200,19 @@ public class ACIPTshegBarScanner {
else else
end = i; end = i;
if (startOfString < end) { if (startOfString < end) {
al.add(new ACIPString(s.substring(startOfString, end), al.add(new TString(s.substring(startOfString, end),
currentType)); currentType));
} }
if ('?' != prevCh) { if ('?' != prevCh) {
currentType = ACIPString.PROBABLE_CORRECTION; currentType = TString.PROBABLE_CORRECTION;
} else { } else {
currentType = ACIPString.POSSIBLE_CORRECTION; currentType = TString.POSSIBLE_CORRECTION;
} }
} }
al.add(new ACIPString(s.substring(end, i+1), currentType)); al.add(new TString(s.substring(end, i+1), currentType));
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
break; // end ']','}' case break; // end ']','}' case
@ -222,10 +222,10 @@ public class ACIPTshegBarScanner {
case '[': case '[':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
String thingy = null; String thingy = null;
@ -233,57 +233,57 @@ public class ACIPTshegBarScanner {
&& (s.substring(i, i + "[DD]".length()).equals("[DD]") && (s.substring(i, i + "[DD]".length()).equals("[DD]")
|| s.substring(i, i + "[DD]".length()).equals("{DD}"))) { || s.substring(i, i + "[DD]".length()).equals("{DD}"))) {
thingy = "[DD]"; thingy = "[DD]";
currentType = ACIPString.DD; currentType = TString.DD;
} else if (i + "[DD1]".length() <= sl } else if (i + "[DD1]".length() <= sl
&& (s.substring(i, i + "[DD1]".length()).equals("[DD1]") && (s.substring(i, i + "[DD1]".length()).equals("[DD1]")
|| s.substring(i, i + "[DD1]".length()).equals("{DD1}"))) { || s.substring(i, i + "[DD1]".length()).equals("{DD1}"))) {
thingy = "[DD1]"; thingy = "[DD1]";
currentType = ACIPString.DD; currentType = TString.DD;
} else if (i + "[DD2]".length() <= sl } else if (i + "[DD2]".length() <= sl
&& (s.substring(i, i + "[DD2]".length()).equals("[DD2]") && (s.substring(i, i + "[DD2]".length()).equals("[DD2]")
|| s.substring(i, i + "[DD2]".length()).equals("{DD2}"))) { || s.substring(i, i + "[DD2]".length()).equals("{DD2}"))) {
thingy = "[DD2]"; thingy = "[DD2]";
currentType = ACIPString.DD; currentType = TString.DD;
} else if (i + "[DDD]".length() <= sl } else if (i + "[DDD]".length() <= sl
&& (s.substring(i, i + "[DDD]".length()).equals("[DDD]") && (s.substring(i, i + "[DDD]".length()).equals("[DDD]")
|| s.substring(i, i + "[DDD]".length()).equals("{DDD}"))) { || s.substring(i, i + "[DDD]".length()).equals("{DDD}"))) {
thingy = "[DDD]"; thingy = "[DDD]";
currentType = ACIPString.DD; currentType = TString.DD;
} else if (i + "[DR]".length() <= sl } else if (i + "[DR]".length() <= sl
&& (s.substring(i, i + "[DR]".length()).equals("[DR]") && (s.substring(i, i + "[DR]".length()).equals("[DR]")
|| s.substring(i, i + "[DR]".length()).equals("{DR}"))) { || s.substring(i, i + "[DR]".length()).equals("{DR}"))) {
thingy = "[DR]"; thingy = "[DR]";
currentType = ACIPString.DR; currentType = TString.DR;
} else if (i + "[LS]".length() <= sl } else if (i + "[LS]".length() <= sl
&& (s.substring(i, i + "[LS]".length()).equals("[LS]") && (s.substring(i, i + "[LS]".length()).equals("[LS]")
|| s.substring(i, i + "[LS]".length()).equals("{LS}"))) { || s.substring(i, i + "[LS]".length()).equals("{LS}"))) {
thingy = "[LS]"; thingy = "[LS]";
currentType = ACIPString.LS; currentType = TString.LS;
} else if (i + "[BP]".length() <= sl } else if (i + "[BP]".length() <= sl
&& (s.substring(i, i + "[BP]".length()).equals("[BP]") && (s.substring(i, i + "[BP]".length()).equals("[BP]")
|| s.substring(i, i + "[BP]".length()).equals("{BP}"))) { || s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
thingy = "[BP]"; thingy = "[BP]";
currentType = ACIPString.BP; currentType = TString.BP;
} else if (i + "[BLANK PAGE]".length() <= sl } else if (i + "[BLANK PAGE]".length() <= sl
&& (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]") && (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]")
|| s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) { || s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) {
thingy = "[BLANK PAGE]"; thingy = "[BLANK PAGE]";
currentType = ACIPString.BP; currentType = TString.BP;
} else if (i + "[ BP ]".length() <= sl } else if (i + "[ BP ]".length() <= sl
&& (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]") && (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
|| s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) { || s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
thingy = "{ BP }"; // found in TD3790E2.ACT thingy = "{ BP }"; // found in TD3790E2.ACT
currentType = ACIPString.BP; currentType = TString.BP;
} else if (i + "[ DD ]".length() <= sl } else if (i + "[ DD ]".length() <= sl
&& (s.substring(i, i + "[ DD ]".length()).equals("[ DD ]") && (s.substring(i, i + "[ DD ]".length()).equals("[ DD ]")
|| s.substring(i, i + "[ DD ]".length()).equals("{ DD }"))) { || s.substring(i, i + "[ DD ]".length()).equals("{ DD }"))) {
thingy = "{ DD }"; // found in TD3790E2.ACT thingy = "{ DD }"; // found in TD3790E2.ACT
currentType = ACIPString.DD; currentType = TString.DD;
} else if (i + "[?]".length() <= sl } else if (i + "[?]".length() <= sl
&& (s.substring(i, i + "[?]".length()).equals("[?]") && (s.substring(i, i + "[?]".length()).equals("[?]")
|| s.substring(i, i + "[?]".length()).equals("{?}"))) { || s.substring(i, i + "[?]".length()).equals("{?}"))) {
thingy = "[?]"; thingy = "[?]";
currentType = ACIPString.QUESTION; currentType = TString.QUESTION;
} else { } else {
// We see comments appear not as [#COMMENT], but // We see comments appear not as [#COMMENT], but
// as [COMMENT] sometimes. We make special cases // as [COMMENT] sometimes. We make special cases
@ -329,8 +329,8 @@ public class ACIPTshegBarScanner {
if (i + 2 + englishComments[ec].length() <= sl if (i + 2 + englishComments[ec].length() <= sl
&& (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]") && (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
|| s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) { || s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) {
al.add(new ACIPString("[#" + englishComments[ec] + "]", al.add(new TString("[#" + englishComments[ec] + "]",
ACIPString.COMMENT)); TString.COMMENT));
startOfString = i + 2 + englishComments[ec].length(); startOfString = i + 2 + englishComments[ec].length();
i = startOfString - 1; i = startOfString - 1;
foundOne = true; foundOne = true;
@ -386,16 +386,16 @@ public class ACIPTshegBarScanner {
= s.substring(begin, realEnd); = s.substring(begin, realEnd);
for (int ec = 0; ec < englishCorrections.length; ec++) { for (int ec = 0; ec < englishCorrections.length; ec++) {
if (interestingSubstring.startsWith(englishCorrections[ec])) { if (interestingSubstring.startsWith(englishCorrections[ec])) {
al.add(new ACIPString(s.substring(i, i+2), al.add(new TString(s.substring(i, i+2),
ACIPString.CORRECTION_START)); TString.CORRECTION_START));
al.add(new ACIPString(s.substring(i+2, realEnd), al.add(new TString(s.substring(i+2, realEnd),
ACIPString.LATIN)); TString.LATIN));
if (s.charAt(end - 1) == '?') { if (s.charAt(end - 1) == '?') {
al.add(new ACIPString(s.substring(end-1, end+1), al.add(new TString(s.substring(end-1, end+1),
ACIPString.POSSIBLE_CORRECTION)); TString.POSSIBLE_CORRECTION));
} else { } else {
al.add(new ACIPString(s.substring(end, end+1), al.add(new TString(s.substring(end, end+1),
ACIPString.PROBABLE_CORRECTION)); TString.PROBABLE_CORRECTION));
} }
foundOne = true; foundOne = true;
startOfString = end+1; startOfString = end+1;
@ -409,24 +409,24 @@ public class ACIPTshegBarScanner {
break; break;
} }
if (null != thingy) { if (null != thingy) {
al.add(new ACIPString(thingy, al.add(new TString(thingy,
currentType)); currentType));
startOfString = i + thingy.length(); startOfString = i + thingy.length();
i = startOfString - 1; i = startOfString - 1;
} else { } else {
if (i + 1 < sl) { if (i + 1 < sl) {
char nextCh = s.charAt(i+1); char nextCh = s.charAt(i+1);
if ('*' == nextCh) { if ('*' == nextCh) {
currentType = ACIPString.CORRECTION_START; currentType = TString.CORRECTION_START;
bracketTypeStack.push(new Integer(currentType)); bracketTypeStack.push(new Integer(currentType));
al.add(new ACIPString(s.substring(i, i+2), al.add(new TString(s.substring(i, i+2),
ACIPString.CORRECTION_START)); TString.CORRECTION_START));
currentType = ACIPString.ERROR; currentType = TString.ERROR;
startOfString = i+2; startOfString = i+2;
i = startOfString - 1; i = startOfString - 1;
break; break;
} else if ('#' == nextCh) { } else if ('#' == nextCh) {
currentType = ACIPString.COMMENT; currentType = TString.COMMENT;
bracketTypeStack.push(new Integer(currentType)); bracketTypeStack.push(new Integer(currentType));
break; break;
} }
@ -435,8 +435,8 @@ public class ACIPTshegBarScanner {
// WITHOUT # MARKS]. Though "... [" could cause // WITHOUT # MARKS]. Though "... [" could cause
// this too. // this too.
if (waitingForMatchingIllegalClose) { if (waitingForMatchingIllegalClose) {
al.add(new ACIPString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.", al.add(new TString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n"); + "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
@ -455,24 +455,24 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?", al.add(new TString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
ACIPString.ERROR)); TString.ERROR));
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n"); + "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
startOfString = i + 1; startOfString = i + 1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
break; // end '[','{' case break; // end '[','{' case
case '@': case '@':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
// We look for {@N{AB}, @NN{AB}, ..., @NNNNNN{AB}}, // We look for {@N{AB}, @NN{AB}, ..., @NNNNNN{AB}},
@ -509,15 +509,15 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.", al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n"); + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+numdigits+3; startOfString = i+numdigits+3;
i = startOfString - 1; i = startOfString - 1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
break; break;
} }
if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) { if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) {
@ -531,25 +531,25 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.", al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n"); + "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; // DLC FIXME: skip over more? startOfString = i+1; // DLC FIXME: skip over more?
currentType = ACIPString.ERROR; currentType = TString.ERROR;
break; break;
} }
extra = 4; extra = 4;
} else { } else {
extra = 2; extra = 2;
} }
al.add(new ACIPString(s.substring(i, i+numdigits+extra), al.add(new TString(s.substring(i, i+numdigits+extra),
ACIPString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+extra; startOfString = i+numdigits+extra;
i = startOfString - 1; i = startOfString - 1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
break; break;
} }
} }
@ -565,11 +565,11 @@ public class ACIPTshegBarScanner {
} }
} }
if (allAreNumeric) { if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+2), al.add(new TString(s.substring(i, i+numdigits+2),
ACIPString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+2; startOfString = i+numdigits+2;
i = startOfString - 1; i = startOfString - 1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
break; break;
} }
} }
@ -586,11 +586,11 @@ public class ACIPTshegBarScanner {
} }
} }
if (allAreNumeric) { if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+4), al.add(new TString(s.substring(i, i+numdigits+4),
ACIPString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+4; startOfString = i+numdigits+4;
i = startOfString - 1; i = startOfString - 1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
break; break;
} }
} }
@ -607,11 +607,11 @@ public class ACIPTshegBarScanner {
} }
} }
if (allAreNumeric) { if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+1), al.add(new TString(s.substring(i, i+numdigits+1),
ACIPString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+1; startOfString = i+numdigits+1;
i = startOfString - 1; i = startOfString - 1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
break; break;
} }
} }
@ -627,24 +627,24 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.", al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n"); + "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
break; // end '@' case break; // end '@' case
case '/': case '/':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
if (startSlashIndex >= 0) { if (startSlashIndex >= 0) {
@ -653,25 +653,25 @@ public class ACIPTshegBarScanner {
* it means /NYA/. We warn about // for this * it means /NYA/. We warn about // for this
* reason. \\ causes a tsheg-bar error (DLC * reason. \\ causes a tsheg-bar error (DLC
* FIXME: verify this is so). */ * FIXME: verify this is so). */
al.add(new ACIPString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.", al.add(new TString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
ACIPString.ERROR)); TString.ERROR));
if (errors != null) { if (errors != null) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n"); + "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
al.add(new ACIPString(s.substring(i, i+1), al.add(new TString(s.substring(i, i+1),
ACIPString.END_SLASH)); TString.END_SLASH));
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
startSlashIndex = -1; startSlashIndex = -1;
} else { } else {
startSlashIndex = i; startSlashIndex = i;
al.add(new ACIPString(s.substring(i, i+1), al.add(new TString(s.substring(i, i+1),
ACIPString.START_SLASH)); TString.START_SLASH));
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
break; // end '/' case break; // end '/' case
@ -679,42 +679,42 @@ public class ACIPTshegBarScanner {
case ')': case ')':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
// We do not support nesting like (NYA (BA)). // We do not support nesting like (NYA (BA)).
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
if (ch == '(') { if (ch == '(') {
al.add(new ACIPString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.", al.add(new TString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n"); + "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else { } else {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN)); al.add(new TString(s.substring(i, i+1), TString.END_PAREN));
startParenIndex = -1; startParenIndex = -1;
} }
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} else { } else {
if (ch == ')') { if (ch == ')') {
al.add(new ACIPString("Unexpected closing parenthesis, ), found.", al.add(new TString("Unexpected closing parenthesis, ), found.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Unexpected closing parenthesis, ), found.\n"); + "Unexpected closing parenthesis, ), found.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else { } else {
startParenIndex = i; startParenIndex = i;
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN)); al.add(new TString(s.substring(i, i+1), TString.START_PAREN));
} }
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
break; // end '(',')' case break; // end '(',')' case
@ -723,13 +723,13 @@ public class ACIPTshegBarScanner {
|| (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) { || (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
// The tsheg bar ends here; new token. // The tsheg bar ends here; new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
} }
al.add(new ACIPString(s.substring(i, i+1), al.add(new TString(s.substring(i, i+1),
ACIPString.QUESTION)); TString.QUESTION));
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} // else this is [*TR'A ?] or the like. } // else this is [*TR'A ?] or the like.
break; // end '?' case break; // end '?' case
@ -737,23 +737,23 @@ public class ACIPTshegBarScanner {
case '.': case '.':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} }
// . is used for a non-breaking tsheg, such as in // . is used for a non-breaking tsheg, such as in
// {NGO.,} and {....,DAM}. We give a warning unless , // {NGO.,} and {....,DAM}. We give a warning unless ,
// or ., or [A-Za-z] follows '.'. // or ., or [A-Za-z] follows '.'.
al.add(new ACIPString(s.substring(i, i+1), al.add(new TString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION)); TString.TIBETAN_PUNCTUATION));
if (!(i + 1 < sl if (!(i + 1 < sl
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ',' && (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n') || (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z') || (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) { || (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".", al.add(new TString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
ACIPString.WARNING)); TString.WARNING));
} }
startOfString = i+1; startOfString = i+1;
break; // end '.' case break; // end '.' case
@ -775,11 +775,11 @@ public class ACIPTshegBarScanner {
boolean legalTshegBarAdornment = false; boolean legalTshegBarAdornment = false;
// The tsheg bar ends here; new token. // The tsheg bar ends here; new token.
if (startOfString < i) { if (startOfString < i) {
if (currentType == ACIPString.TIBETAN_NON_PUNCTUATION if (currentType == TString.TIBETAN_NON_PUNCTUATION
&& isTshegBarAdornment(ch)) && isTshegBarAdornment(ch))
legalTshegBarAdornment = true; legalTshegBarAdornment = true;
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
} }
// Insert a tsheg if necessary. ACIP files aren't // Insert a tsheg if necessary. ACIP files aren't
@ -788,22 +788,22 @@ public class ACIPTshegBarScanner {
if (('\r' == ch if (('\r' == ch
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r')) || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
&& !al.isEmpty() && !al.isEmpty()
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION && (((TString)al.get(al.size() - 1)).getType() == TString.TIBETAN_NON_PUNCTUATION
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)) { || ((TString)al.get(al.size() - 1)).getType() == TString.TSHEG_BAR_ADORNMENT)) {
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION)); al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
} }
// "DANG,\nLHAG" is really "DANG, LHAG". But always? Not if you have "MDO,\n\nKA...". // "DANG,\nLHAG" is really "DANG, LHAG". But always? Not if you have "MDO,\n\nKA...".
if (('\r' == ch if (('\r' == ch
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r')) || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
&& !al.isEmpty() && !al.isEmpty()
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION && (((TString)al.get(al.size() - 1)).getType() == TString.TIBETAN_PUNCTUATION
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT) || ((TString)al.get(al.size() - 1)).getType() == TString.TSHEG_BAR_ADORNMENT)
&& ((ACIPString)al.get(al.size() - 1)).getText().equals(",") && ((TString)al.get(al.size() - 1)).getText().equals(",")
&& s.charAt(i-1) == ',' && s.charAt(i-1) == ','
&& (i + (('\r' == ch) ? 2 : 1) < sl && (i + (('\r' == ch) ? 2 : 1) < sl
&& (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) { && (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION)); al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
} }
// Don't add in a "\r\n" or "\n" unless there's a // Don't add in a "\r\n" or "\n" unless there's a
@ -816,24 +816,24 @@ public class ACIPTshegBarScanner {
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) { || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
for (int h = 0; h < (realNewline ? 2 : 1); h++) { for (int h = 0; h < (realNewline ? 2 : 1); h++) {
if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) { if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
al.add(new ACIPString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not", al.add(new TString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
ACIPString.ERROR)); TString.ERROR));
} else { } else {
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1), al.add(new TString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
(legalTshegBarAdornment (legalTshegBarAdornment
? ACIPString.TSHEG_BAR_ADORNMENT ? TString.TSHEG_BAR_ADORNMENT
: ACIPString.TIBETAN_PUNCTUATION))); : TString.TIBETAN_PUNCTUATION)));
} }
} }
} }
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
break; // end TIBETAN_PUNCTUATION case break; // end TIBETAN_PUNCTUATION case
default: default:
if (!bracketTypeStack.empty()) { if (!bracketTypeStack.empty()) {
int stackTop = ((Integer)bracketTypeStack.peek()).intValue(); int stackTop = ((Integer)bracketTypeStack.peek()).intValue();
if (ACIPString.CORRECTION_START == stackTop && '?' == ch) { if (TString.CORRECTION_START == stackTop && '?' == ch) {
// allow it through... // allow it through...
break; break;
} }
@ -844,46 +844,46 @@ public class ACIPTshegBarScanner {
break; break;
if (!(isNumeric(ch) || isAlpha(ch))) { if (!(isNumeric(ch) || isAlpha(ch))) {
if (startOfString < i) { if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i), al.add(new TString(s.substring(startOfString, i),
currentType)); currentType));
} }
if ((int)ch == 65533) { if ((int)ch == 65533) {
al.add(new ACIPString("Found an illegal, unprintable character.", al.add(new TString("Found an illegal, unprintable character.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal, unprintable character.\n"); + "Found an illegal, unprintable character.\n");
} else if ('\\' == ch) { } else if ('\\' == ch) {
al.add(new ACIPString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.", al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n"); + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
} else { } else {
al.add(new ACIPString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".", al.add(new TString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": " errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n"); + "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; startOfString = i+1;
currentType = ACIPString.ERROR; currentType = TString.ERROR;
} else { } else {
// Continue through the loop. // Continue through the loop.
if (ACIPString.ERROR == currentType) if (TString.ERROR == currentType)
currentType = ACIPString.TIBETAN_NON_PUNCTUATION; currentType = TString.TIBETAN_NON_PUNCTUATION;
} }
break; // end default case break; // end default case
} }
} }
if (startOfString < sl) { if (startOfString < sl) {
al.add(new ACIPString(s.substring(startOfString, sl), al.add(new TString(s.substring(startOfString, sl),
currentType)); currentType));
} }
if (waitingForMatchingIllegalClose) { if (waitingForMatchingIllegalClose) {
al.add(new ACIPString("UNEXPECTED END OF INPUT", al.add(new TString("UNEXPECTED END OF INPUT",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset END: " errors.append("Offset END: "
+ "Truly unmatched open bracket found.\n"); + "Truly unmatched open bracket found.\n");
@ -891,25 +891,25 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
if (!bracketTypeStack.empty()) { if (!bracketTypeStack.empty()) {
al.add(new ACIPString("Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.", al.add(new TString("Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset END: " errors.append("Offset END: "
+ "Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.\n"); + "Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.\n");
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
if (startSlashIndex >= 0) { if (startSlashIndex >= 0) {
al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.", al.add(new TString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset END: " errors.append("Offset END: "
+ "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); + "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.", al.add(new TString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
ACIPString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset END: " errors.append("Offset END: "
+ "Unmatched open parenthesis, (, found.\n"); + "Unmatched open parenthesis, (, found.\n");

View file

@ -19,18 +19,18 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
/** /**
 * An ACIPString is some Latin text and a type, the type stating * A TString is some Latin text and a type, the type stating whether
* whether said text is Latin (usually English) or transliteration of * said text is Latin (usually English) or transliteration of Tibetan,
* Tibetan and which particular kind. Scanning errors are also encoded * which transliteration system (ACIP or EWTS), and which particular
* as ACIPStrings using a special type. * kind. Scanning errors are also encoded as TStrings using a special
* type.
* *
* @author David Chandler * @author David Chandler */
*/ public class TString {
public class ACIPString {
private int type; private int type;
private String text; private String text;
 /** Returns true if and only if an ACIPString with type type is to /** Returns true if and only if a TString with type type is to
* be converted to Latin, not Tibetan, text. */ * be converted to Latin, not Tibetan, text. */
public static boolean isLatin(int type) { public static boolean isLatin(int type) {
return (type != TIBETAN_NON_PUNCTUATION return (type != TIBETAN_NON_PUNCTUATION
@ -42,45 +42,45 @@ public class ACIPString {
&& type != END_SLASH); && type != END_SLASH);
} }
/** For [#COMMENTS] */ /** For ACIP [#COMMENTS] and EWTS (DLC FIXME) */
public static final int COMMENT = 0; public static final int COMMENT = 0;
/** For Folio markers like @012B */ /** For Folio markers like @012B in ACIP */
public static final int FOLIO_MARKER = 1; public static final int FOLIO_MARKER = 1;
/** For Latin letters and numbers etc. [*LINE BREAK?] uses this, /** For Latin letters and numbers etc. [*LINE BREAK?] uses this,
* for example. */ * for example. Or in EWTS, \f uses this. */
public static final int LATIN = 2; public static final int LATIN = 2;
/** For Tibetan letters and numbers etc. */ /** For Tibetan letters and numbers etc. */
public static final int TIBETAN_NON_PUNCTUATION = 3; public static final int TIBETAN_NON_PUNCTUATION = 3;
/** For tshegs, whitespace and the like, but not combining /** For tshegs, whitespace and the like, but not combining
 * punctuation like %, o, :, m, and x */ * punctuation like ACIP %, o, :, m, and x */
public static final int TIBETAN_PUNCTUATION = 4; public static final int TIBETAN_PUNCTUATION = 4;
/** For the start of a [*probable correction] or [*possible correction?] */ /** For the start of a [*probable correction] or [*possible correction?] in ACIP */
public static final int CORRECTION_START = 5; public static final int CORRECTION_START = 5;
/** Denotes the end of a [*probable correction] */ /** Denotes the end of a [*probable correction] in ACIP */
public static final int PROBABLE_CORRECTION = 6; public static final int PROBABLE_CORRECTION = 6;
 /** Denotes the end of a [*possible correction?] */ /** Denotes the end of a [*possible correction?] in ACIP */
public static final int POSSIBLE_CORRECTION = 7; public static final int POSSIBLE_CORRECTION = 7;
 /** For [BP] -- blank page */ /** For [BP] -- blank page in ACIP */
public static final int BP = 8; public static final int BP = 8;
 /** For [LS] -- Lanycha script on page */ /** For [LS] -- Lanycha script on page in ACIP */
public static final int LS = 9; public static final int LS = 9;
 /** For [DR] -- picture (without caption) on page */ /** For [DR] -- picture (without caption) on page in ACIP */
public static final int DR = 10; public static final int DR = 10;
/** For [DD], [DDD], [DD1], [DD2], et cetera -- picture with caption on page */ /** For [DD], [DDD], [DD1], [DD2], et cetera -- picture with caption on page in ACIP */
public static final int DD = 11; public static final int DD = 11;
/** For [?] */ /** For [?] in ACIP */
public static final int QUESTION = 12; public static final int QUESTION = 12;
/** For the first / in /NYA/ */ /** For the first / in /NYA/ in ACIP */
public static final int START_SLASH = 13; public static final int START_SLASH = 13;
/** For the last / in /NYA/ */ /** For the last / in /NYA/ in ACIP */
public static final int END_SLASH = 14; public static final int END_SLASH = 14;
/** For the opening ( in (NYA) */ /** For the opening ( in (NYA) in ACIP */
public static final int START_PAREN = 15; public static final int START_PAREN = 15;
/** For the closing ) in (NYA) */ /** For the closing ) in (NYA) in ACIP */
public static final int END_PAREN = 16; public static final int END_PAREN = 16;
/** For things that may not be legal syntax, such as {KA . KHA} */ /** For things that may not be legal syntax, such as {KA . KHA} */
public static final int WARNING = 17; public static final int WARNING = 17;
/** For ACIP %, o, and x */ /** For ACIP %, o, and x or EWTS (DLC FIXME) */
public static final int TSHEG_BAR_ADORNMENT = 18; public static final int TSHEG_BAR_ADORNMENT = 18;
/** For things that are not legal syntax, such as a file that /** For things that are not legal syntax, such as a file that
* contains just "[# HALF A COMMEN" */ * contains just "[# HALF A COMMEN" */
@ -112,11 +112,11 @@ public class ACIPString {
} }
/** Don't instantiate me. */ /** Don't instantiate me. */
private ACIPString() { } private TString() { }
/** Creates a new ACIPString with source text <i>text</i> and type /** Creates a new TString with source text <i>text</i> and type
* <i>type</i> being a characterization like {@link #DD}. */ * <i>type</i> being a characterization like {@link #DD}. */
public ACIPString(String text, int type) { public TString(String text, int type) {
setType(type); setType(type);
setText(text); setText(text);
} }