TString now tracks which Roman transliteration system it is using. Next up is to make ACIPConverter handle EWTS or ACIP TStrings.

This commit is contained in:
dchandler 2004-01-17 19:28:54 +00:00
parent 48b4c5cb07
commit c69ba26c60
2 changed files with 74 additions and 59 deletions

View file

@ -171,7 +171,7 @@ public class ACIPTshegBarScanner {
if (ch == '\n') ++numNewlines; if (ch == '\n') ++numNewlines;
if (TString.COMMENT == currentType && ch != ']') { if (TString.COMMENT == currentType && ch != ']') {
if ('[' == ch) { if ('[' == ch) {
al.add(new TString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n", al.add(new TString("ACIP", "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -186,11 +186,11 @@ public class ACIPTshegBarScanner {
if (bracketTypeStack.empty()) { if (bracketTypeStack.empty()) {
// Error. // Error.
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
} }
if (!waitingForMatchingIllegalClose) { if (!waitingForMatchingIllegalClose) {
al.add(new TString("Found a truly unmatched close bracket, " + s.substring(i, i+1), al.add(new TString("ACIP", "Found a truly unmatched close bracket, " + s.substring(i, i+1),
TString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -199,7 +199,7 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
waitingForMatchingIllegalClose = false; waitingForMatchingIllegalClose = false;
al.add(new TString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.", al.add(new TString("ACIP", "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -220,7 +220,7 @@ public class ACIPTshegBarScanner {
else else
end = i; end = i;
if (startOfString < end) { if (startOfString < end) {
al.add(new TString(s.substring(startOfString, end), al.add(new TString("ACIP", s.substring(startOfString, end),
currentType)); currentType));
} }
@ -230,7 +230,7 @@ public class ACIPTshegBarScanner {
currentType = TString.POSSIBLE_CORRECTION; currentType = TString.POSSIBLE_CORRECTION;
} }
} }
al.add(new TString(s.substring(end, i+1), currentType)); al.add(new TString("ACIP", s.substring(end, i+1), currentType));
startOfString = i+1; startOfString = i+1;
currentType = TString.ERROR; currentType = TString.ERROR;
} }
@ -244,7 +244,7 @@ public class ACIPTshegBarScanner {
case '[': case '[':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -351,7 +351,7 @@ public class ACIPTshegBarScanner {
if (i + 2 + englishComments[ec].length() <= sl if (i + 2 + englishComments[ec].length() <= sl
&& (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]") && (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
|| s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) { || s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) {
al.add(new TString("[#" + englishComments[ec] + "]", al.add(new TString("ACIP", "[#" + englishComments[ec] + "]",
TString.COMMENT)); TString.COMMENT));
startOfString = i + 2 + englishComments[ec].length(); startOfString = i + 2 + englishComments[ec].length();
i = startOfString - 1; i = startOfString - 1;
@ -408,15 +408,15 @@ public class ACIPTshegBarScanner {
= s.substring(begin, realEnd); = s.substring(begin, realEnd);
for (int ec = 0; ec < englishCorrections.length; ec++) { for (int ec = 0; ec < englishCorrections.length; ec++) {
if (interestingSubstring.startsWith(englishCorrections[ec])) { if (interestingSubstring.startsWith(englishCorrections[ec])) {
al.add(new TString(s.substring(i, i+2), al.add(new TString("ACIP", s.substring(i, i+2),
TString.CORRECTION_START)); TString.CORRECTION_START));
al.add(new TString(s.substring(i+2, realEnd), al.add(new TString("ACIP", s.substring(i+2, realEnd),
TString.LATIN)); TString.LATIN));
if (s.charAt(end - 1) == '?') { if (s.charAt(end - 1) == '?') {
al.add(new TString(s.substring(end-1, end+1), al.add(new TString("ACIP", s.substring(end-1, end+1),
TString.POSSIBLE_CORRECTION)); TString.POSSIBLE_CORRECTION));
} else { } else {
al.add(new TString(s.substring(end, end+1), al.add(new TString("ACIP", s.substring(end, end+1),
TString.PROBABLE_CORRECTION)); TString.PROBABLE_CORRECTION));
} }
foundOne = true; foundOne = true;
@ -431,7 +431,7 @@ public class ACIPTshegBarScanner {
break; break;
} }
if (null != thingy) { if (null != thingy) {
al.add(new TString(thingy, al.add(new TString("ACIP", thingy,
currentType)); currentType));
startOfString = i + thingy.length(); startOfString = i + thingy.length();
i = startOfString - 1; i = startOfString - 1;
@ -441,7 +441,7 @@ public class ACIPTshegBarScanner {
if ('*' == nextCh) { if ('*' == nextCh) {
currentType = TString.CORRECTION_START; currentType = TString.CORRECTION_START;
bracketTypeStack.push(new Integer(currentType)); bracketTypeStack.push(new Integer(currentType));
al.add(new TString(s.substring(i, i+2), al.add(new TString("ACIP", s.substring(i, i+2),
TString.CORRECTION_START)); TString.CORRECTION_START));
currentType = TString.ERROR; currentType = TString.ERROR;
startOfString = i+2; startOfString = i+2;
@ -457,7 +457,7 @@ public class ACIPTshegBarScanner {
// WITHOUT # MARKS]. Though "... [" could cause // WITHOUT # MARKS]. Though "... [" could cause
// this too. // this too.
if (waitingForMatchingIllegalClose) { if (waitingForMatchingIllegalClose) {
al.add(new TString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.", al.add(new TString("ACIP", "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
TString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -477,7 +477,7 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new TString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?", al.add(new TString("ACIP", "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
TString.ERROR)); TString.ERROR));
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n"); + "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
@ -491,7 +491,7 @@ public class ACIPTshegBarScanner {
case '@': case '@':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -531,7 +531,7 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.", al.add(new TString("ACIP", "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -553,7 +553,7 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.", al.add(new TString("ACIP", "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -567,7 +567,7 @@ public class ACIPTshegBarScanner {
} else { } else {
extra = 2; extra = 2;
} }
al.add(new TString(s.substring(i, i+numdigits+extra), al.add(new TString("ACIP", s.substring(i, i+numdigits+extra),
TString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+extra; startOfString = i+numdigits+extra;
i = startOfString - 1; i = startOfString - 1;
@ -587,7 +587,7 @@ public class ACIPTshegBarScanner {
} }
} }
if (allAreNumeric) { if (allAreNumeric) {
al.add(new TString(s.substring(i, i+numdigits+2), al.add(new TString("ACIP", s.substring(i, i+numdigits+2),
TString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+2; startOfString = i+numdigits+2;
i = startOfString - 1; i = startOfString - 1;
@ -608,7 +608,7 @@ public class ACIPTshegBarScanner {
} }
} }
if (allAreNumeric) { if (allAreNumeric) {
al.add(new TString(s.substring(i, i+numdigits+4), al.add(new TString("ACIP", s.substring(i, i+numdigits+4),
TString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+4; startOfString = i+numdigits+4;
i = startOfString - 1; i = startOfString - 1;
@ -629,7 +629,7 @@ public class ACIPTshegBarScanner {
} }
} }
if (allAreNumeric) { if (allAreNumeric) {
al.add(new TString(s.substring(i, i+numdigits+1), al.add(new TString("ACIP", s.substring(i, i+numdigits+1),
TString.FOLIO_MARKER)); TString.FOLIO_MARKER));
startOfString = i+numdigits+1; startOfString = i+numdigits+1;
i = startOfString - 1; i = startOfString - 1;
@ -649,7 +649,7 @@ public class ACIPTshegBarScanner {
inContext = inContext + "..."; inContext = inContext + "...";
} }
} }
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.", al.add(new TString("ACIP", "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -663,7 +663,7 @@ public class ACIPTshegBarScanner {
case '/': case '/':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -674,7 +674,7 @@ public class ACIPTshegBarScanner {
/* //NYA\\ appears in ACIP input, and I think /* //NYA\\ appears in ACIP input, and I think
* it means /NYA/. We warn about // for this * it means /NYA/. We warn about // for this
* reason. \\ causes a tsheg-bar error. */ * reason. \\ causes a tsheg-bar error. */
al.add(new TString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.", al.add(new TString("ACIP", "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
TString.ERROR)); TString.ERROR));
if (errors != null) { if (errors != null) {
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -682,14 +682,14 @@ public class ACIPTshegBarScanner {
} }
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
al.add(new TString(s.substring(i, i+1), al.add(new TString("ACIP", s.substring(i, i+1),
TString.END_SLASH)); TString.END_SLASH));
startOfString = i+1; startOfString = i+1;
currentType = TString.ERROR; currentType = TString.ERROR;
startSlashIndex = -1; startSlashIndex = -1;
} else { } else {
startSlashIndex = i; startSlashIndex = i;
al.add(new TString(s.substring(i, i+1), al.add(new TString("ACIP", s.substring(i, i+1),
TString.START_SLASH)); TString.START_SLASH));
startOfString = i+1; startOfString = i+1;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -700,7 +700,7 @@ public class ACIPTshegBarScanner {
case ')': case ')':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -710,21 +710,21 @@ public class ACIPTshegBarScanner {
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
if (ch == '(') { if (ch == '(') {
al.add(new TString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.", al.add(new TString("ACIP", "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n"); + "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else { } else {
al.add(new TString(s.substring(i, i+1), TString.END_PAREN)); al.add(new TString("ACIP", s.substring(i, i+1), TString.END_PAREN));
startParenIndex = -1; startParenIndex = -1;
} }
startOfString = i+1; startOfString = i+1;
currentType = TString.ERROR; currentType = TString.ERROR;
} else { } else {
if (ch == ')') { if (ch == ')') {
al.add(new TString("Unexpected closing parenthesis, ), found.", al.add(new TString("ACIP", "Unexpected closing parenthesis, ), found.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -732,7 +732,7 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} else { } else {
startParenIndex = i; startParenIndex = i;
al.add(new TString(s.substring(i, i+1), TString.START_PAREN)); al.add(new TString("ACIP", s.substring(i, i+1), TString.START_PAREN));
} }
startOfString = i+1; startOfString = i+1;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -744,10 +744,10 @@ public class ACIPTshegBarScanner {
|| (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) { || (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
// The tsheg bar ends here; new token. // The tsheg bar ends here; new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
} }
al.add(new TString(s.substring(i, i+1), al.add(new TString("ACIP", s.substring(i, i+1),
TString.QUESTION)); TString.QUESTION));
startOfString = i+1; startOfString = i+1;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -758,7 +758,7 @@ public class ACIPTshegBarScanner {
case '.': case '.':
// This definitely indicates a new token. // This definitely indicates a new token.
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
startOfString = i; startOfString = i;
currentType = TString.ERROR; currentType = TString.ERROR;
@ -766,14 +766,14 @@ public class ACIPTshegBarScanner {
// . is used for a non-breaking tsheg, such as in // . is used for a non-breaking tsheg, such as in
// {NGO.,} and {....,DAM}. We give a warning unless , // {NGO.,} and {....,DAM}. We give a warning unless ,
// or ., or [A-Za-z] follows '.'. // or ., or [A-Za-z] follows '.'.
al.add(new TString(s.substring(i, i+1), al.add(new TString("ACIP", s.substring(i, i+1),
TString.TIBETAN_PUNCTUATION)); TString.TIBETAN_PUNCTUATION));
if (!(i + 1 < sl if (!(i + 1 < sl
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ',' && (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n') || (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z') || (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) { || (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
al.add(new TString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".", al.add(new TString("ACIP", "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
TString.WARNING)); TString.WARNING));
} }
startOfString = i+1; startOfString = i+1;
@ -801,7 +801,7 @@ public class ACIPTshegBarScanner {
if (currentType == TString.TIBETAN_NON_PUNCTUATION if (currentType == TString.TIBETAN_NON_PUNCTUATION
&& isTshegBarAdornment(ch)) && isTshegBarAdornment(ch))
legalTshegBarAdornment = true; legalTshegBarAdornment = true;
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
} }
@ -812,7 +812,7 @@ public class ACIPTshegBarScanner {
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r')) || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
&& !al.isEmpty() && !al.isEmpty()
&& lastNonExceptionalThingWasAdornmentOr(al, TString.TIBETAN_NON_PUNCTUATION)) { && lastNonExceptionalThingWasAdornmentOr(al, TString.TIBETAN_NON_PUNCTUATION)) {
al.add(new TString(" ", TString.TIBETAN_PUNCTUATION)); al.add(new TString("ACIP", " ", TString.TIBETAN_PUNCTUATION));
} }
// "DANG,\nLHAG" is really "DANG, LHAG". But always? Not if you have "MDO,\n\nKA...". // "DANG,\nLHAG" is really "DANG, LHAG". But always? Not if you have "MDO,\n\nKA...".
@ -824,7 +824,7 @@ public class ACIPTshegBarScanner {
&& s.charAt(i-1) == ',' && s.charAt(i-1) == ','
&& (i + (('\r' == ch) ? 2 : 1) < sl && (i + (('\r' == ch) ? 2 : 1) < sl
&& (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) { && (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
al.add(new TString(" ", TString.TIBETAN_PUNCTUATION)); al.add(new TString("ACIP", " ", TString.TIBETAN_PUNCTUATION));
} }
if ('^' == ch) { if ('^' == ch) {
@ -850,9 +850,9 @@ public class ACIPTshegBarScanner {
bad = true; bad = true;
} }
if (!bad) if (!bad)
al.add(new TString("^", TString.TIBETAN_PUNCTUATION)); al.add(new TString("ACIP", "^", TString.TIBETAN_PUNCTUATION));
else else
al.add(new TString("The ACIP {^} must precede a tsheg bar.", TString.ERROR)); al.add(new TString("ACIP", "The ACIP {^} must precede a tsheg bar.", TString.ERROR));
} else { } else {
// Don't add in a "\r\n" or "\n" unless there's a // Don't add in a "\r\n" or "\n" unless there's a
// blank line. // blank line.
@ -864,10 +864,10 @@ public class ACIPTshegBarScanner {
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) { || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
for (int h = 0; h < (realNewline ? 2 : 1); h++) { for (int h = 0; h < (realNewline ? 2 : 1); h++) {
if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) { if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
al.add(new TString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not", al.add(new TString("ACIP", "The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
TString.ERROR)); TString.ERROR));
} else { } else {
al.add(new TString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1), al.add(new TString("ACIP", rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
(legalTshegBarAdornment (legalTshegBarAdornment
? TString.TSHEG_BAR_ADORNMENT ? TString.TSHEG_BAR_ADORNMENT
: TString.TIBETAN_PUNCTUATION))); : TString.TIBETAN_PUNCTUATION)));
@ -875,7 +875,7 @@ public class ACIPTshegBarScanner {
} }
} }
if ('%' == ch) { if ('%' == ch) {
al.add(new TString("The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice. To avoid seeing this warning again, change the input to use {\\u0F35} instead of {%}.", al.add(new TString("ACIP", "The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice. To avoid seeing this warning again, change the input to use {\\u0F35} instead of {%}.",
TString.WARNING)); TString.WARNING));
} }
} }
@ -898,11 +898,11 @@ public class ACIPTshegBarScanner {
break; break;
if (!(isNumeric(ch) || isAlpha(ch))) { if (!(isNumeric(ch) || isAlpha(ch))) {
if (startOfString < i) { if (startOfString < i) {
al.add(new TString(s.substring(startOfString, i), al.add(new TString("ACIP", s.substring(startOfString, i),
currentType)); currentType));
} }
if ((int)ch == 65533) { if ((int)ch == 65533) {
al.add(new TString("Found an illegal, unprintable character.", al.add(new TString("ACIP", "Found an illegal, unprintable character.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -921,7 +921,7 @@ public class ACIPTshegBarScanner {
} }
} }
if (x >= 0) { if (x >= 0) {
al.add(new TString(new String(new char[] { (char)x }), al.add(new TString("ACIP", new String(new char[] { (char)x }),
TString.UNICODE_CHARACTER)); TString.UNICODE_CHARACTER));
i += "uXXXX".length(); i += "uXXXX".length();
startOfString = i+1; startOfString = i+1;
@ -929,14 +929,14 @@ public class ACIPTshegBarScanner {
} else { } else {
final String msg final String msg
= "Found a backslash, \\, which the ACIP Tibetan Input Code standard says represents a Sanskrit virama. In practice, though, this is so often misused (to represent U+0F3D) that {\\} always generates this error. If you want a Sanskrit virama, change the input document to use {\\u0F84} instead of {\\}. If you want U+0F3D, use {/NYA/} or {/NYA\\u0F3D}."; = "Found a backslash, \\, which the ACIP Tibetan Input Code standard says represents a Sanskrit virama. In practice, though, this is so often misused (to represent U+0F3D) that {\\} always generates this error. If you want a Sanskrit virama, change the input document to use {\\u0F84} instead of {\\}. If you want U+0F3D, use {/NYA/} or {/NYA\\u0F3D}.";
al.add(new TString(msg, al.add(new TString("ACIP", msg,
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
+ msg + "\n"); + msg + "\n");
} }
} else { } else {
al.add(new TString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".", al.add(new TString("ACIP", "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": " errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
@ -954,11 +954,11 @@ public class ACIPTshegBarScanner {
} }
} }
if (startOfString < sl) { if (startOfString < sl) {
al.add(new TString(s.substring(startOfString, sl), al.add(new TString("ACIP", s.substring(startOfString, sl),
currentType)); currentType));
} }
if (waitingForMatchingIllegalClose) { if (waitingForMatchingIllegalClose) {
al.add(new TString("UNEXPECTED END OF INPUT", al.add(new TString("ACIP", "UNEXPECTED END OF INPUT",
TString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset END: " errors.append("Offset END: "
@ -967,7 +967,7 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
if (!bracketTypeStack.empty()) { if (!bracketTypeStack.empty()) {
al.add(new TString("Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.", al.add(new TString("ACIP", "Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
TString.ERROR)); TString.ERROR));
if (null != errors) { if (null != errors) {
errors.append("Offset END: " errors.append("Offset END: "
@ -976,7 +976,7 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
if (startSlashIndex >= 0) { if (startSlashIndex >= 0) {
al.add(new TString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.", al.add(new TString("ACIP", "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset END: " errors.append("Offset END: "
@ -984,7 +984,7 @@ public class ACIPTshegBarScanner {
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
} }
if (startParenIndex >= 0) { if (startParenIndex >= 0) {
al.add(new TString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.", al.add(new TString("ACIP", "Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
TString.ERROR)); TString.ERROR));
if (null != errors) if (null != errors)
errors.append("Offset END: " errors.append("Offset END: "

View file

@ -36,7 +36,17 @@ import java.io.*;
public class TString { public class TString {
private int type; private int type;
private String text; private String text;
// "EWTS" or "ACIP", interned (for quick '==' equality checking):
private String encoding;
/** Tells which Roman transliteration system this TString's text
    is written in: "EWTS" for EWTS, or "ACIP" for ACIP.  The
    returned string is meant to be interned, so callers can test
    it cheaply with the <code>==</code> operator. */
public String getEncoding() {
    return this.encoding;
}
/** Returns true if and only if an TString with type <i>type</i> /** Returns true if and only if an TString with type <i>type</i>
* is to be converted to something other than Tibetan text. * is to be converted to something other than Tibetan text.
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */ * (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
@ -130,15 +140,20 @@ public class TString {
/** Don't instantiate using this constructor. */ /** Don't instantiate using this constructor. */
private TString() { } private TString() { }
/** Creates a new TString with source text <i>text</i> and type /** Creates a new TString with source text <i>text</i>, encoded
* using the Roman transliteration system specified by
* <i>encoding</i> (see {@link #getEncoding()}) and type
* <i>type</i> being a characterization like {@link #DD}. */ * <i>type</i> being a characterization like {@link #DD}. */
public TString(String text, int type) { public TString(String encoding, String text, int type) {
this.encoding = encoding;
setType(type); setType(type);
String ftext = (TIBETAN_NON_PUNCTUATION == type) String ftext = (TIBETAN_NON_PUNCTUATION == type)
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text) ? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
: text; : text;
// FIXME: assert this // FIXME: assert these
ThdlDebug.verify(type != UNICODE_CHARACTER || text.length() == 1); ThdlDebug.verify(type != UNICODE_CHARACTER || text.length() == 1);
ThdlDebug.verify("EWTS" == encoding || "ACIP" == encoding);
type != UNICODE_CHARACTER || text.length() == 1);
setText(ftext); setText(ftext);
if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type) if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type)
outputTshegBar(ftext); outputTshegBar(ftext);