Andres found that "THAG PA" caused a NullPointerException. That's fixed.

Renamed ACIPString to TString -- we'll use this for EWTS and ACIP both.

TMW->ACIP for TMW9.61 should work now.
This commit is contained in:
dchandler 2003-10-04 01:22:59 +00:00
parent c8927b827c
commit ee50291ed4
4 changed files with 207 additions and 205 deletions

View file

@ -31,7 +31,7 @@ import org.thdl.tib.text.DuffCode;
/**
* This class is able to convert an ACIP file into Tibetan Machine Web
* and an ACIP file into TMW. ACIP->Unicode should yield the same
* and an ACIP file into Unicode. ACIP->Unicode should yield the same
* results as ACIP->TMW followed by TMW->Unicode (FIXME: test it!)
* @author David Chandler
*/
@ -225,15 +225,15 @@ public class ACIPConverter {
writeWarningsToOut, warningLevel, false);
}
private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of ACIPString */ scan,
private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of TString */ scan,
int pos) {
int sz = scan.size();
while (pos < sz) {
ACIPString s = (ACIPString)scan.get(pos++);
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(" ")) {
TString s = (TString)scan.get(pos++);
if (s.getType() == TString.TIBETAN_PUNCTUATION && s.getText().equals(" ")) {
// keep going
} else {
if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(",")) {
if (s.getType() == TString.TIBETAN_PUNCTUATION && s.getText().equals(",")) {
return true;
} else {
return false;
@ -286,16 +286,16 @@ public class ACIPConverter {
Color lastColor = Color.BLACK;
Color color = Color.BLACK;
for (int i = 0; i < sz; i++) {
ACIPString s = (ACIPString)scan.get(i);
TString s = (TString)scan.get(i);
int stype = s.getType();
if (stype == ACIPString.ERROR) {
if (stype == TString.ERROR) {
lastGuyWasNonPunct = false;
lastGuy = null;
hasErrors = true;
String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text, Color.RED);
} else if (stype == ACIPString.TSHEG_BAR_ADORNMENT) {
} else if (stype == TString.TSHEG_BAR_ADORNMENT) {
if (lastGuyWasNonPunct) {
String err = "[#ERROR CONVERTING ACIP DOCUMENT: This converter cannot yet convert " + s.getText() + " because the converter's author is unclear what the result should be.]";
if (null != writer) {
@ -322,7 +322,7 @@ public class ACIPConverter {
}
lastGuyWasNonPunct = true; // this stuff is not really punctuation
lastGuy = null;
} else if (stype == ACIPString.WARNING) {
} else if (stype == TString.WARNING) {
lastGuyWasNonPunct = false;
lastGuy = null;
if (writeWarningsToOut) {
@ -341,15 +341,15 @@ public class ACIPConverter {
lastGuyWasNonPunct = false;
lastGuy = null;
String text
= (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
= (((stype == TString.FOLIO_MARKER) ? "{" : "")
+ s.getText()
+ ((stype == ACIPString.FOLIO_MARKER) ? "}" : ""));
+ ((stype == TString.FOLIO_MARKER) ? "}" : ""));
if (null != writer) writer.write(text);
if (null != tdoc) tdoc.appendRoman(text, Color.BLACK);
} else {
String unicode = null;
DuffCode[] duff = null;
if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
if (stype == TString.TIBETAN_NON_PUNCTUATION) {
lastGuyWasNonPunct = true;
TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
String acipError;
@ -424,13 +424,13 @@ public class ACIPConverter {
}
} else {
color = Color.BLACK;
if (stype == ACIPString.START_SLASH) {
if (stype == TString.START_SLASH) {
if (null != writer) unicode = "\u0F3C";
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph("(") };
} else if (stype == ACIPString.END_SLASH) {
} else if (stype == TString.END_SLASH) {
if (null != writer) unicode = "\u0F3D";
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
} else if (stype == ACIPString.TIBETAN_PUNCTUATION) {
} else if (stype == TString.TIBETAN_PUNCTUATION) {
// For ACIP, tshegs are used as both
// tshegs and whitespace. We treat a
// space as a tsheg if and only if it
@ -452,7 +452,8 @@ public class ACIPConverter {
// space.
&& ((lpl.get(0).getLeft().equals("G")
|| lpl.get(0).getLeft().equals("K"))
&& (lpl.get(0).getRight().indexOf('U') < 0))
&& (null == lpl.get(0).getRight()
|| lpl.get(0).getRight().indexOf('U') < 0))
&&
// it's (G . anything)
// followed by some number of
@ -500,12 +501,12 @@ public class ACIPConverter {
}
}
}
} else if (stype == ACIPString.START_PAREN) {
} else if (stype == TString.START_PAREN) {
if (null != tdoc) {
tdoc.setTibetanFontSize(smallFontSize);
}
continue;
} else if (stype == ACIPString.END_PAREN) {
} else if (stype == TString.END_PAREN) {
if (null != tdoc) {
tdoc.setTibetanFontSize(regularFontSize);
}

View file

@ -174,6 +174,7 @@ public class ACIPRules {
if (null == wylieToACIP) {
wylieToACIP = new HashMap(75);
wylieToACIP.put("_", " "); // oddball.
wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
}
wylieToACIP.put(EWTS, ACIP);
}

View file

@ -70,7 +70,7 @@ public class ACIPTshegBarScanner {
/** Scans an ACIP file with path fname into tsheg bars. If errors
* is non-null, error messages will be appended to it. Returns a
* list of ACIPStrings that is the scan. <p>FIXME: not so
* list of TStrings that is the scan. <p>FIXME: not so
* efficient; copies the whole file into memory first.
* @throws IOException if we cannot read in the ACIP input file */
public static ArrayList scanFile(String fname, StringBuffer errors, int maxErrors)
@ -83,7 +83,7 @@ public class ACIPTshegBarScanner {
/** Scans a stream of ACIP into tsheg bars. If errors is
* non-null, error messages will be appended to it. You can
* recover both errors and warnings (modulo offset information)
* from the result, though. Returns a list of ACIPStrings that
* from the result, though. Returns a list of TStrings that
* is the scan, or null if more than maxErrors occur. <p>FIXME:
* not so efficient; copies the whole file into memory first.
* @throws IOException if we cannot read the whole ACIP stream */
@ -104,7 +104,7 @@ public class ACIPTshegBarScanner {
return scan(s.toString(), errors, maxErrors);
}
/** Returns a list of {@link ACIPString ACIPStrings} corresponding
/** Returns a list of {@link TString TStrings} corresponding
* to s, possibly the empty list (when the empty string is the
* input). Each String is either a Latin comment, some Latin
* text, a tsheg bar (minus the tsheg or shad or whatever), a
@ -112,16 +112,16 @@ public class ACIPTshegBarScanner {
*
* <p>This not only scans; it finds all the errors and warnings a
* parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
* It puts those in as ACIPStrings with type {@link
* ACIPString#ERROR} or {@link ACIPString#WARNING}, and also, if
* It puts those in as TStrings with type {@link
* TString#ERROR} or {@link TString#WARNING}, and also, if
* errors is non-null, appends helpful messages to errors, each
* followed by a '\n'.
* @param s the ACIP text
* @param errors if non-null, the buffer to which to append error
* messages (DLC FIXME: kludge, just get this info by scanning
* the result for ACIPString.ERROR (and maybe ACIPString.WARNING,
* the result for TString.ERROR (and maybe TString.WARNING,
* if you care about warnings), but then we'd have to put the
* Offset info in the ACIPString)
* Offset info in the TString)
* @param maxErrors if nonnegative, then scanning will stop when
* more than maxErrors errors occur. In this event, null is
* returned.
@ -138,7 +138,7 @@ public class ACIPTshegBarScanner {
boolean waitingForMatchingIllegalClose = false;
int sl = s.length();
int currentType = ACIPString.ERROR;
int currentType = TString.ERROR;
int startOfString = 0;
Stack bracketTypeStack = new Stack();
int startSlashIndex = -1;
@ -149,10 +149,10 @@ public class ACIPTshegBarScanner {
char ch;
ch = s.charAt(i);
if (ch == '\n') ++numNewlines;
if (ACIPString.COMMENT == currentType && ch != ']') {
if (TString.COMMENT == currentType && ch != ']') {
if ('[' == ch) {
al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
ACIPString.ERROR));
al.add(new TString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
@ -166,12 +166,12 @@ public class ACIPTshegBarScanner {
if (bracketTypeStack.empty()) {
// Error.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
}
if (!waitingForMatchingIllegalClose) {
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
ACIPString.ERROR));
al.add(new TString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
TString.ERROR));
if (null != errors) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched close bracket, ] or }.\n");
@ -179,19 +179,19 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
waitingForMatchingIllegalClose = false;
al.add(new ACIPString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
ACIPString.ERROR));
al.add(new TString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
} else {
int stackTop = ((Integer)bracketTypeStack.pop()).intValue();
int end = startOfString;
if (ACIPString.CORRECTION_START == stackTop) {
if (TString.CORRECTION_START == stackTop) {
// This definitely indicates a new token.
char prevCh = s.charAt(i-1);
@ -200,19 +200,19 @@ public class ACIPTshegBarScanner {
else
end = i;
if (startOfString < end) {
al.add(new ACIPString(s.substring(startOfString, end),
currentType));
al.add(new TString(s.substring(startOfString, end),
currentType));
}
if ('?' != prevCh) {
currentType = ACIPString.PROBABLE_CORRECTION;
currentType = TString.PROBABLE_CORRECTION;
} else {
currentType = ACIPString.POSSIBLE_CORRECTION;
currentType = TString.POSSIBLE_CORRECTION;
}
}
al.add(new ACIPString(s.substring(end, i+1), currentType));
al.add(new TString(s.substring(end, i+1), currentType));
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
break; // end ']','}' case
@ -222,10 +222,10 @@ public class ACIPTshegBarScanner {
case '[':
// This definitely indicates a new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
startOfString = i;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
String thingy = null;
@ -233,57 +233,57 @@ public class ACIPTshegBarScanner {
&& (s.substring(i, i + "[DD]".length()).equals("[DD]")
|| s.substring(i, i + "[DD]".length()).equals("{DD}"))) {
thingy = "[DD]";
currentType = ACIPString.DD;
currentType = TString.DD;
} else if (i + "[DD1]".length() <= sl
&& (s.substring(i, i + "[DD1]".length()).equals("[DD1]")
|| s.substring(i, i + "[DD1]".length()).equals("{DD1}"))) {
thingy = "[DD1]";
currentType = ACIPString.DD;
currentType = TString.DD;
} else if (i + "[DD2]".length() <= sl
&& (s.substring(i, i + "[DD2]".length()).equals("[DD2]")
|| s.substring(i, i + "[DD2]".length()).equals("{DD2}"))) {
thingy = "[DD2]";
currentType = ACIPString.DD;
currentType = TString.DD;
} else if (i + "[DDD]".length() <= sl
&& (s.substring(i, i + "[DDD]".length()).equals("[DDD]")
|| s.substring(i, i + "[DDD]".length()).equals("{DDD}"))) {
thingy = "[DDD]";
currentType = ACIPString.DD;
currentType = TString.DD;
} else if (i + "[DR]".length() <= sl
&& (s.substring(i, i + "[DR]".length()).equals("[DR]")
|| s.substring(i, i + "[DR]".length()).equals("{DR}"))) {
thingy = "[DR]";
currentType = ACIPString.DR;
currentType = TString.DR;
} else if (i + "[LS]".length() <= sl
&& (s.substring(i, i + "[LS]".length()).equals("[LS]")
|| s.substring(i, i + "[LS]".length()).equals("{LS}"))) {
thingy = "[LS]";
currentType = ACIPString.LS;
currentType = TString.LS;
} else if (i + "[BP]".length() <= sl
&& (s.substring(i, i + "[BP]".length()).equals("[BP]")
|| s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
thingy = "[BP]";
currentType = ACIPString.BP;
currentType = TString.BP;
} else if (i + "[BLANK PAGE]".length() <= sl
&& (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]")
|| s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) {
thingy = "[BLANK PAGE]";
currentType = ACIPString.BP;
currentType = TString.BP;
} else if (i + "[ BP ]".length() <= sl
&& (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
|| s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
thingy = "{ BP }"; // found in TD3790E2.ACT
currentType = ACIPString.BP;
currentType = TString.BP;
} else if (i + "[ DD ]".length() <= sl
&& (s.substring(i, i + "[ DD ]".length()).equals("[ DD ]")
|| s.substring(i, i + "[ DD ]".length()).equals("{ DD }"))) {
thingy = "{ DD }"; // found in TD3790E2.ACT
currentType = ACIPString.DD;
currentType = TString.DD;
} else if (i + "[?]".length() <= sl
&& (s.substring(i, i + "[?]".length()).equals("[?]")
|| s.substring(i, i + "[?]".length()).equals("{?}"))) {
thingy = "[?]";
currentType = ACIPString.QUESTION;
currentType = TString.QUESTION;
} else {
// We see comments appear not as [#COMMENT], but
// as [COMMENT] sometimes. We make special cases
@ -329,8 +329,8 @@ public class ACIPTshegBarScanner {
if (i + 2 + englishComments[ec].length() <= sl
&& (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
|| s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) {
al.add(new ACIPString("[#" + englishComments[ec] + "]",
ACIPString.COMMENT));
al.add(new TString("[#" + englishComments[ec] + "]",
TString.COMMENT));
startOfString = i + 2 + englishComments[ec].length();
i = startOfString - 1;
foundOne = true;
@ -386,16 +386,16 @@ public class ACIPTshegBarScanner {
= s.substring(begin, realEnd);
for (int ec = 0; ec < englishCorrections.length; ec++) {
if (interestingSubstring.startsWith(englishCorrections[ec])) {
al.add(new ACIPString(s.substring(i, i+2),
ACIPString.CORRECTION_START));
al.add(new ACIPString(s.substring(i+2, realEnd),
ACIPString.LATIN));
al.add(new TString(s.substring(i, i+2),
TString.CORRECTION_START));
al.add(new TString(s.substring(i+2, realEnd),
TString.LATIN));
if (s.charAt(end - 1) == '?') {
al.add(new ACIPString(s.substring(end-1, end+1),
ACIPString.POSSIBLE_CORRECTION));
al.add(new TString(s.substring(end-1, end+1),
TString.POSSIBLE_CORRECTION));
} else {
al.add(new ACIPString(s.substring(end, end+1),
ACIPString.PROBABLE_CORRECTION));
al.add(new TString(s.substring(end, end+1),
TString.PROBABLE_CORRECTION));
}
foundOne = true;
startOfString = end+1;
@ -409,24 +409,24 @@ public class ACIPTshegBarScanner {
break;
}
if (null != thingy) {
al.add(new ACIPString(thingy,
currentType));
al.add(new TString(thingy,
currentType));
startOfString = i + thingy.length();
i = startOfString - 1;
} else {
if (i + 1 < sl) {
char nextCh = s.charAt(i+1);
if ('*' == nextCh) {
currentType = ACIPString.CORRECTION_START;
currentType = TString.CORRECTION_START;
bracketTypeStack.push(new Integer(currentType));
al.add(new ACIPString(s.substring(i, i+2),
ACIPString.CORRECTION_START));
currentType = ACIPString.ERROR;
al.add(new TString(s.substring(i, i+2),
TString.CORRECTION_START));
currentType = TString.ERROR;
startOfString = i+2;
i = startOfString - 1;
break;
} else if ('#' == nextCh) {
currentType = ACIPString.COMMENT;
currentType = TString.COMMENT;
bracketTypeStack.push(new Integer(currentType));
break;
}
@ -435,8 +435,8 @@ public class ACIPTshegBarScanner {
// WITHOUT # MARKS]. Though "... [" could cause
// this too.
if (waitingForMatchingIllegalClose) {
al.add(new ACIPString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
ACIPString.ERROR));
al.add(new TString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
TString.ERROR));
if (null != errors) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
@ -455,24 +455,24 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
al.add(new ACIPString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
ACIPString.ERROR));
al.add(new TString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
TString.ERROR));
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
startOfString = i + 1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
break; // end '[','{' case
case '@':
// This definitely indicates a new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
startOfString = i;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
// We look for {@N{AB}, @NN{AB}, ..., @NNNNNN{AB}},
@ -509,15 +509,15 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
ACIPString.ERROR));
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+numdigits+3;
i = startOfString - 1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
break;
}
if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) {
@ -531,25 +531,25 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
ACIPString.ERROR));
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1; // DLC FIXME: skip over more?
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
break;
}
extra = 4;
} else {
extra = 2;
}
al.add(new ACIPString(s.substring(i, i+numdigits+extra),
ACIPString.FOLIO_MARKER));
al.add(new TString(s.substring(i, i+numdigits+extra),
TString.FOLIO_MARKER));
startOfString = i+numdigits+extra;
i = startOfString - 1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
break;
}
}
@ -565,11 +565,11 @@ public class ACIPTshegBarScanner {
}
}
if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+2),
ACIPString.FOLIO_MARKER));
al.add(new TString(s.substring(i, i+numdigits+2),
TString.FOLIO_MARKER));
startOfString = i+numdigits+2;
i = startOfString - 1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
break;
}
}
@ -586,11 +586,11 @@ public class ACIPTshegBarScanner {
}
}
if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+4),
ACIPString.FOLIO_MARKER));
al.add(new TString(s.substring(i, i+numdigits+4),
TString.FOLIO_MARKER));
startOfString = i+numdigits+4;
i = startOfString - 1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
break;
}
}
@ -607,11 +607,11 @@ public class ACIPTshegBarScanner {
}
}
if (allAreNumeric) {
al.add(new ACIPString(s.substring(i, i+numdigits+1),
ACIPString.FOLIO_MARKER));
al.add(new TString(s.substring(i, i+numdigits+1),
TString.FOLIO_MARKER));
startOfString = i+numdigits+1;
i = startOfString - 1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
break;
}
}
@ -627,24 +627,24 @@ public class ACIPTshegBarScanner {
inContext = inContext + "...";
}
}
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
ACIPString.ERROR));
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
break; // end '@' case
case '/':
// This definitely indicates a new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
startOfString = i;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
if (startSlashIndex >= 0) {
@ -653,25 +653,25 @@ public class ACIPTshegBarScanner {
* it means /NYA/. We warn about // for this
* reason. \\ causes a tsheg-bar error (DLC
* FIXME: verify this is so). */
al.add(new ACIPString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
ACIPString.ERROR));
al.add(new TString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
TString.ERROR));
if (errors != null) {
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.END_SLASH));
al.add(new TString(s.substring(i, i+1),
TString.END_SLASH));
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
startSlashIndex = -1;
} else {
startSlashIndex = i;
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.START_SLASH));
al.add(new TString(s.substring(i, i+1),
TString.START_SLASH));
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
break; // end '/' case
@ -679,42 +679,42 @@ public class ACIPTshegBarScanner {
case ')':
// This definitely indicates a new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
startOfString = i;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
// We do not support nesting like (NYA (BA)).
if (startParenIndex >= 0) {
if (ch == '(') {
al.add(new ACIPString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
ACIPString.ERROR));
al.add(new TString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else {
al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_PAREN));
al.add(new TString(s.substring(i, i+1), TString.END_PAREN));
startParenIndex = -1;
}
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
} else {
if (ch == ')') {
al.add(new ACIPString("Unexpected closing parenthesis, ), found.",
ACIPString.ERROR));
al.add(new TString("Unexpected closing parenthesis, ), found.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Unexpected closing parenthesis, ), found.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else {
startParenIndex = i;
al.add(new ACIPString(s.substring(i, i+1), ACIPString.START_PAREN));
al.add(new TString(s.substring(i, i+1), TString.START_PAREN));
}
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
break; // end '(',')' case
@ -723,13 +723,13 @@ public class ACIPTshegBarScanner {
|| (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
// The tsheg bar ends here; new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
}
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.QUESTION));
al.add(new TString(s.substring(i, i+1),
TString.QUESTION));
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
} // else this is [*TR'A ?] or the like.
break; // end '?' case
@ -737,23 +737,23 @@ public class ACIPTshegBarScanner {
case '.':
// This definitely indicates a new token.
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
startOfString = i;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
}
// . is used for a non-breaking tsheg, such as in
// {NGO.,} and {....,DAM}. We give a warning unless ,
// or ., or [A-Za-z] follows '.'.
al.add(new ACIPString(s.substring(i, i+1),
ACIPString.TIBETAN_PUNCTUATION));
al.add(new TString(s.substring(i, i+1),
TString.TIBETAN_PUNCTUATION));
if (!(i + 1 < sl
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
ACIPString.WARNING));
al.add(new TString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
TString.WARNING));
}
startOfString = i+1;
break; // end '.' case
@ -775,11 +775,11 @@ public class ACIPTshegBarScanner {
boolean legalTshegBarAdornment = false;
// The tsheg bar ends here; new token.
if (startOfString < i) {
if (currentType == ACIPString.TIBETAN_NON_PUNCTUATION
if (currentType == TString.TIBETAN_NON_PUNCTUATION
&& isTshegBarAdornment(ch))
legalTshegBarAdornment = true;
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
}
// Insert a tsheg if necessary. ACIP files aren't
@ -788,22 +788,22 @@ public class ACIPTshegBarScanner {
if (('\r' == ch
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
&& !al.isEmpty()
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)) {
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
&& (((TString)al.get(al.size() - 1)).getType() == TString.TIBETAN_NON_PUNCTUATION
|| ((TString)al.get(al.size() - 1)).getType() == TString.TSHEG_BAR_ADORNMENT)) {
al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
}
// "DANG,\nLHAG" is really "DANG, LHAG". But always? Not if you have "MDO,\n\nKA...".
if (('\r' == ch
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
&& !al.isEmpty()
&& (((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
|| ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TSHEG_BAR_ADORNMENT)
&& ((ACIPString)al.get(al.size() - 1)).getText().equals(",")
&& (((TString)al.get(al.size() - 1)).getType() == TString.TIBETAN_PUNCTUATION
|| ((TString)al.get(al.size() - 1)).getType() == TString.TSHEG_BAR_ADORNMENT)
&& ((TString)al.get(al.size() - 1)).getText().equals(",")
&& s.charAt(i-1) == ','
&& (i + (('\r' == ch) ? 2 : 1) < sl
&& (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
}
// Don't add in a "\r\n" or "\n" unless there's a
@ -816,24 +816,24 @@ public class ACIPTshegBarScanner {
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
for (int h = 0; h < (realNewline ? 2 : 1); h++) {
if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
al.add(new ACIPString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
ACIPString.ERROR));
al.add(new TString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
TString.ERROR));
} else {
al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
(legalTshegBarAdornment
? ACIPString.TSHEG_BAR_ADORNMENT
: ACIPString.TIBETAN_PUNCTUATION)));
al.add(new TString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
(legalTshegBarAdornment
? TString.TSHEG_BAR_ADORNMENT
: TString.TIBETAN_PUNCTUATION)));
}
}
}
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
break; // end TIBETAN_PUNCTUATION case
default:
if (!bracketTypeStack.empty()) {
int stackTop = ((Integer)bracketTypeStack.peek()).intValue();
if (ACIPString.CORRECTION_START == stackTop && '?' == ch) {
if (TString.CORRECTION_START == stackTop && '?' == ch) {
// allow it through...
break;
}
@ -844,46 +844,46 @@ public class ACIPTshegBarScanner {
break;
if (!(isNumeric(ch) || isAlpha(ch))) {
if (startOfString < i) {
al.add(new ACIPString(s.substring(startOfString, i),
currentType));
al.add(new TString(s.substring(startOfString, i),
currentType));
}
if ((int)ch == 65533) {
al.add(new ACIPString("Found an illegal, unprintable character.",
ACIPString.ERROR));
al.add(new TString("Found an illegal, unprintable character.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal, unprintable character.\n");
} else if ('\\' == ch) {
al.add(new ACIPString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
ACIPString.ERROR));
al.add(new TString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
} else {
al.add(new ACIPString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
ACIPString.ERROR));
al.add(new TString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
TString.ERROR));
if (null != errors)
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
startOfString = i+1;
currentType = ACIPString.ERROR;
currentType = TString.ERROR;
} else {
// Continue through the loop.
if (ACIPString.ERROR == currentType)
currentType = ACIPString.TIBETAN_NON_PUNCTUATION;
if (TString.ERROR == currentType)
currentType = TString.TIBETAN_NON_PUNCTUATION;
}
break; // end default case
}
}
if (startOfString < sl) {
al.add(new ACIPString(s.substring(startOfString, sl),
currentType));
al.add(new TString(s.substring(startOfString, sl),
currentType));
}
if (waitingForMatchingIllegalClose) {
al.add(new ACIPString("UNEXPECTED END OF INPUT",
ACIPString.ERROR));
al.add(new TString("UNEXPECTED END OF INPUT",
TString.ERROR));
if (null != errors) {
errors.append("Offset END: "
+ "Truly unmatched open bracket found.\n");
@ -891,25 +891,25 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
if (!bracketTypeStack.empty()) {
al.add(new ACIPString("Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
ACIPString.ERROR));
al.add(new TString("Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
TString.ERROR));
if (null != errors) {
errors.append("Offset END: "
+ "Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.\n");
+ "Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.\n");
}
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
if (startSlashIndex >= 0) {
al.add(new ACIPString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
ACIPString.ERROR));
al.add(new TString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
TString.ERROR));
if (null != errors)
errors.append("Offset END: "
+ "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
}
if (startParenIndex >= 0) {
al.add(new ACIPString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
ACIPString.ERROR));
al.add(new TString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
TString.ERROR));
if (null != errors)
errors.append("Offset END: "
+ "Unmatched open parenthesis, (, found.\n");

View file

@ -19,18 +19,18 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
/**
* An ACIPString is some Latin text and a type, the type stating
* whether said text is Latin (usually English) or transliteration of
* Tibetan and which particular kind. Scanning errors are also encoded
* as ACIPStrings using a special type.
* A TString is some Latin text and a type, the type stating whether
* said text is Latin (usually English) or transliteration of Tibetan,
* which transliteration system (ACIP or EWTS), and which particular
* kind. Scanning errors are also encoded as TStrings using a special
* type.
*
* @author David Chandler
*/
public class ACIPString {
* @author David Chandler */
public class TString {
private int type;
private String text;
/** Returns true if and only if an ACIPString with type type is to
/** Returns true if and only if a TString with type type is to
* be converted to Latin, not Tibetan, text. */
public static boolean isLatin(int type) {
return (type != TIBETAN_NON_PUNCTUATION
@ -42,45 +42,45 @@ public class ACIPString {
&& type != END_SLASH);
}
/** For [#COMMENTS] */
/** For ACIP [#COMMENTS] and EWTS (DLC FIXME) */
public static final int COMMENT = 0;
/** For Folio markers like @012B */
/** For Folio markers like @012B in ACIP */
public static final int FOLIO_MARKER = 1;
/** For Latin letters and numbers etc. [*LINE BREAK?] uses this,
* for example. */
* for example. Or in EWTS, \f uses this. */
public static final int LATIN = 2;
/** For Tibetan letters and numbers etc. */
public static final int TIBETAN_NON_PUNCTUATION = 3;
/** For tshegs, whitespace and the like, but not combining
* punctutation like %, o, :, m, and x */
* punctuation like ACIP %, o, :, m, and x */
public static final int TIBETAN_PUNCTUATION = 4;
/** For the start of a [*probable correction] or [*possible correction?] */
/** For the start of a [*probable correction] or [*possible correction?] in ACIP */
public static final int CORRECTION_START = 5;
/** Denotes the end of a [*probable correction] */
/** Denotes the end of a [*probable correction] in ACIP */
public static final int PROBABLE_CORRECTION = 6;
/** Denotes the end of a [*possible correction?] */
/** Denotes the end of a [*possible correction?] in ACIP*/
public static final int POSSIBLE_CORRECTION = 7;
/** For [BP] -- blank page */
/** For [BP] -- blank page in ACIP*/
public static final int BP = 8;
/** For [LS] -- Lanycha script on page */
/** For [LS] -- Lanycha script on page in ACIP*/
public static final int LS = 9;
/** For [DR] -- picture (without caption) on page */
/** For [DR] -- picture (without caption) on page in ACIP*/
public static final int DR = 10;
/** For [DD], [DDD], [DD1], [DD2], et cetera -- picture with caption on page */
/** For [DD], [DDD], [DD1], [DD2], et cetera -- picture with caption on page in ACIP */
public static final int DD = 11;
/** For [?] */
/** For [?] in ACIP */
public static final int QUESTION = 12;
/** For the first / in /NYA/ */
/** For the first / in /NYA/ in ACIP */
public static final int START_SLASH = 13;
/** For the last / in /NYA/ */
/** For the last / in /NYA/ in ACIP */
public static final int END_SLASH = 14;
/** For the opening ( in (NYA) */
/** For the opening ( in (NYA) in ACIP */
public static final int START_PAREN = 15;
/** For the closing ) in (NYA) */
/** For the closing ) in (NYA) in ACIP */
public static final int END_PAREN = 16;
/** For things that may not be legal syntax, such as {KA . KHA} */
public static final int WARNING = 17;
/** For ACIP %, o, and x */
/** For ACIP %, o, and x or EWTS (DLC FIXME) */
public static final int TSHEG_BAR_ADORNMENT = 18;
/** For things that are not legal syntax, such as a file that
* contains just "[# HALF A COMMEN" */
@ -112,11 +112,11 @@ public class ACIPString {
}
/** Don't instantiate me. */
private ACIPString() { }
private TString() { }
/** Creates a new ACIPString with source text <i>text</i> and type
/** Creates a new TString with source text <i>text</i> and type
* <i>type</i> being a characterization like {@link #DD}. */
public ACIPString(String text, int type) {
public TString(String text, int type) {
setType(type);
setText(text);
}