TString now tracks which Roman transliteration system it is using. Next up is to make ACIPConverter handle EWTS or ACIP TStrings.
This commit is contained in:
parent
48b4c5cb07
commit
c69ba26c60
2 changed files with 74 additions and 59 deletions
|
@ -171,7 +171,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (ch == '\n') ++numNewlines;
|
if (ch == '\n') ++numNewlines;
|
||||||
if (TString.COMMENT == currentType && ch != ']') {
|
if (TString.COMMENT == currentType && ch != ']') {
|
||||||
if ('[' == ch) {
|
if ('[' == ch) {
|
||||||
al.add(new TString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
|
al.add(new TString("ACIP", "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -186,11 +186,11 @@ public class ACIPTshegBarScanner {
|
||||||
if (bracketTypeStack.empty()) {
|
if (bracketTypeStack.empty()) {
|
||||||
// Error.
|
// Error.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
if (!waitingForMatchingIllegalClose) {
|
if (!waitingForMatchingIllegalClose) {
|
||||||
al.add(new TString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
|
al.add(new TString("ACIP", "Found a truly unmatched close bracket, " + s.substring(i, i+1),
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -199,7 +199,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
waitingForMatchingIllegalClose = false;
|
waitingForMatchingIllegalClose = false;
|
||||||
al.add(new TString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
|
al.add(new TString("ACIP", "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -220,7 +220,7 @@ public class ACIPTshegBarScanner {
|
||||||
else
|
else
|
||||||
end = i;
|
end = i;
|
||||||
if (startOfString < end) {
|
if (startOfString < end) {
|
||||||
al.add(new TString(s.substring(startOfString, end),
|
al.add(new TString("ACIP", s.substring(startOfString, end),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -230,7 +230,7 @@ public class ACIPTshegBarScanner {
|
||||||
currentType = TString.POSSIBLE_CORRECTION;
|
currentType = TString.POSSIBLE_CORRECTION;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
al.add(new TString(s.substring(end, i+1), currentType));
|
al.add(new TString("ACIP", s.substring(end, i+1), currentType));
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
}
|
}
|
||||||
|
@ -244,7 +244,7 @@ public class ACIPTshegBarScanner {
|
||||||
case '[':
|
case '[':
|
||||||
// This definitely indicates a new token.
|
// This definitely indicates a new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
startOfString = i;
|
startOfString = i;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -351,7 +351,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (i + 2 + englishComments[ec].length() <= sl
|
if (i + 2 + englishComments[ec].length() <= sl
|
||||||
&& (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
|
&& (s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]")
|
||||||
|| s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) {
|
|| s.substring(i, i + 2 + englishComments[ec].length()).equals("[" + englishComments[ec] + "]"))) {
|
||||||
al.add(new TString("[#" + englishComments[ec] + "]",
|
al.add(new TString("ACIP", "[#" + englishComments[ec] + "]",
|
||||||
TString.COMMENT));
|
TString.COMMENT));
|
||||||
startOfString = i + 2 + englishComments[ec].length();
|
startOfString = i + 2 + englishComments[ec].length();
|
||||||
i = startOfString - 1;
|
i = startOfString - 1;
|
||||||
|
@ -408,15 +408,15 @@ public class ACIPTshegBarScanner {
|
||||||
= s.substring(begin, realEnd);
|
= s.substring(begin, realEnd);
|
||||||
for (int ec = 0; ec < englishCorrections.length; ec++) {
|
for (int ec = 0; ec < englishCorrections.length; ec++) {
|
||||||
if (interestingSubstring.startsWith(englishCorrections[ec])) {
|
if (interestingSubstring.startsWith(englishCorrections[ec])) {
|
||||||
al.add(new TString(s.substring(i, i+2),
|
al.add(new TString("ACIP", s.substring(i, i+2),
|
||||||
TString.CORRECTION_START));
|
TString.CORRECTION_START));
|
||||||
al.add(new TString(s.substring(i+2, realEnd),
|
al.add(new TString("ACIP", s.substring(i+2, realEnd),
|
||||||
TString.LATIN));
|
TString.LATIN));
|
||||||
if (s.charAt(end - 1) == '?') {
|
if (s.charAt(end - 1) == '?') {
|
||||||
al.add(new TString(s.substring(end-1, end+1),
|
al.add(new TString("ACIP", s.substring(end-1, end+1),
|
||||||
TString.POSSIBLE_CORRECTION));
|
TString.POSSIBLE_CORRECTION));
|
||||||
} else {
|
} else {
|
||||||
al.add(new TString(s.substring(end, end+1),
|
al.add(new TString("ACIP", s.substring(end, end+1),
|
||||||
TString.PROBABLE_CORRECTION));
|
TString.PROBABLE_CORRECTION));
|
||||||
}
|
}
|
||||||
foundOne = true;
|
foundOne = true;
|
||||||
|
@ -431,7 +431,7 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (null != thingy) {
|
if (null != thingy) {
|
||||||
al.add(new TString(thingy,
|
al.add(new TString("ACIP", thingy,
|
||||||
currentType));
|
currentType));
|
||||||
startOfString = i + thingy.length();
|
startOfString = i + thingy.length();
|
||||||
i = startOfString - 1;
|
i = startOfString - 1;
|
||||||
|
@ -441,7 +441,7 @@ public class ACIPTshegBarScanner {
|
||||||
if ('*' == nextCh) {
|
if ('*' == nextCh) {
|
||||||
currentType = TString.CORRECTION_START;
|
currentType = TString.CORRECTION_START;
|
||||||
bracketTypeStack.push(new Integer(currentType));
|
bracketTypeStack.push(new Integer(currentType));
|
||||||
al.add(new TString(s.substring(i, i+2),
|
al.add(new TString("ACIP", s.substring(i, i+2),
|
||||||
TString.CORRECTION_START));
|
TString.CORRECTION_START));
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
startOfString = i+2;
|
startOfString = i+2;
|
||||||
|
@ -457,7 +457,7 @@ public class ACIPTshegBarScanner {
|
||||||
// WITHOUT # MARKS]. Though "... [" could cause
|
// WITHOUT # MARKS]. Though "... [" could cause
|
||||||
// this too.
|
// this too.
|
||||||
if (waitingForMatchingIllegalClose) {
|
if (waitingForMatchingIllegalClose) {
|
||||||
al.add(new TString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
|
al.add(new TString("ACIP", "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -477,7 +477,7 @@ public class ACIPTshegBarScanner {
|
||||||
inContext = inContext + "...";
|
inContext = inContext + "...";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
al.add(new TString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
|
al.add(new TString("ACIP", "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
|
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
|
||||||
|
@ -491,7 +491,7 @@ public class ACIPTshegBarScanner {
|
||||||
case '@':
|
case '@':
|
||||||
// This definitely indicates a new token.
|
// This definitely indicates a new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
startOfString = i;
|
startOfString = i;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -531,7 +531,7 @@ public class ACIPTshegBarScanner {
|
||||||
inContext = inContext + "...";
|
inContext = inContext + "...";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
|
al.add(new TString("ACIP", "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -553,7 +553,7 @@ public class ACIPTshegBarScanner {
|
||||||
inContext = inContext + "...";
|
inContext = inContext + "...";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
|
al.add(new TString("ACIP", "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -567,7 +567,7 @@ public class ACIPTshegBarScanner {
|
||||||
} else {
|
} else {
|
||||||
extra = 2;
|
extra = 2;
|
||||||
}
|
}
|
||||||
al.add(new TString(s.substring(i, i+numdigits+extra),
|
al.add(new TString("ACIP", s.substring(i, i+numdigits+extra),
|
||||||
TString.FOLIO_MARKER));
|
TString.FOLIO_MARKER));
|
||||||
startOfString = i+numdigits+extra;
|
startOfString = i+numdigits+extra;
|
||||||
i = startOfString - 1;
|
i = startOfString - 1;
|
||||||
|
@ -587,7 +587,7 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (allAreNumeric) {
|
if (allAreNumeric) {
|
||||||
al.add(new TString(s.substring(i, i+numdigits+2),
|
al.add(new TString("ACIP", s.substring(i, i+numdigits+2),
|
||||||
TString.FOLIO_MARKER));
|
TString.FOLIO_MARKER));
|
||||||
startOfString = i+numdigits+2;
|
startOfString = i+numdigits+2;
|
||||||
i = startOfString - 1;
|
i = startOfString - 1;
|
||||||
|
@ -608,7 +608,7 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (allAreNumeric) {
|
if (allAreNumeric) {
|
||||||
al.add(new TString(s.substring(i, i+numdigits+4),
|
al.add(new TString("ACIP", s.substring(i, i+numdigits+4),
|
||||||
TString.FOLIO_MARKER));
|
TString.FOLIO_MARKER));
|
||||||
startOfString = i+numdigits+4;
|
startOfString = i+numdigits+4;
|
||||||
i = startOfString - 1;
|
i = startOfString - 1;
|
||||||
|
@ -629,7 +629,7 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (allAreNumeric) {
|
if (allAreNumeric) {
|
||||||
al.add(new TString(s.substring(i, i+numdigits+1),
|
al.add(new TString("ACIP", s.substring(i, i+numdigits+1),
|
||||||
TString.FOLIO_MARKER));
|
TString.FOLIO_MARKER));
|
||||||
startOfString = i+numdigits+1;
|
startOfString = i+numdigits+1;
|
||||||
i = startOfString - 1;
|
i = startOfString - 1;
|
||||||
|
@ -649,7 +649,7 @@ public class ACIPTshegBarScanner {
|
||||||
inContext = inContext + "...";
|
inContext = inContext + "...";
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
al.add(new TString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
|
al.add(new TString("ACIP", "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -663,7 +663,7 @@ public class ACIPTshegBarScanner {
|
||||||
case '/':
|
case '/':
|
||||||
// This definitely indicates a new token.
|
// This definitely indicates a new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
startOfString = i;
|
startOfString = i;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -674,7 +674,7 @@ public class ACIPTshegBarScanner {
|
||||||
/* //NYA\\ appears in ACIP input, and I think
|
/* //NYA\\ appears in ACIP input, and I think
|
||||||
* it means /NYA/. We warn about // for this
|
* it means /NYA/. We warn about // for this
|
||||||
* reason. \\ causes a tsheg-bar error. */
|
* reason. \\ causes a tsheg-bar error. */
|
||||||
al.add(new TString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
|
al.add(new TString("ACIP", "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (errors != null) {
|
if (errors != null) {
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -682,14 +682,14 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
al.add(new TString(s.substring(i, i+1),
|
al.add(new TString("ACIP", s.substring(i, i+1),
|
||||||
TString.END_SLASH));
|
TString.END_SLASH));
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
startSlashIndex = -1;
|
startSlashIndex = -1;
|
||||||
} else {
|
} else {
|
||||||
startSlashIndex = i;
|
startSlashIndex = i;
|
||||||
al.add(new TString(s.substring(i, i+1),
|
al.add(new TString("ACIP", s.substring(i, i+1),
|
||||||
TString.START_SLASH));
|
TString.START_SLASH));
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -700,7 +700,7 @@ public class ACIPTshegBarScanner {
|
||||||
case ')':
|
case ')':
|
||||||
// This definitely indicates a new token.
|
// This definitely indicates a new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
startOfString = i;
|
startOfString = i;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -710,21 +710,21 @@ public class ACIPTshegBarScanner {
|
||||||
|
|
||||||
if (startParenIndex >= 0) {
|
if (startParenIndex >= 0) {
|
||||||
if (ch == '(') {
|
if (ch == '(') {
|
||||||
al.add(new TString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
|
al.add(new TString("ACIP", "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
|
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
|
||||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
} else {
|
} else {
|
||||||
al.add(new TString(s.substring(i, i+1), TString.END_PAREN));
|
al.add(new TString("ACIP", s.substring(i, i+1), TString.END_PAREN));
|
||||||
startParenIndex = -1;
|
startParenIndex = -1;
|
||||||
}
|
}
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
} else {
|
} else {
|
||||||
if (ch == ')') {
|
if (ch == ')') {
|
||||||
al.add(new TString("Unexpected closing parenthesis, ), found.",
|
al.add(new TString("ACIP", "Unexpected closing parenthesis, ), found.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -732,7 +732,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
} else {
|
} else {
|
||||||
startParenIndex = i;
|
startParenIndex = i;
|
||||||
al.add(new TString(s.substring(i, i+1), TString.START_PAREN));
|
al.add(new TString("ACIP", s.substring(i, i+1), TString.START_PAREN));
|
||||||
}
|
}
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -744,10 +744,10 @@ public class ACIPTshegBarScanner {
|
||||||
|| (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
|
|| (s.charAt(i+1) != ']' && s.charAt(i+1) != '}')) {
|
||||||
// The tsheg bar ends here; new token.
|
// The tsheg bar ends here; new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
al.add(new TString(s.substring(i, i+1),
|
al.add(new TString("ACIP", s.substring(i, i+1),
|
||||||
TString.QUESTION));
|
TString.QUESTION));
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -758,7 +758,7 @@ public class ACIPTshegBarScanner {
|
||||||
case '.':
|
case '.':
|
||||||
// This definitely indicates a new token.
|
// This definitely indicates a new token.
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
startOfString = i;
|
startOfString = i;
|
||||||
currentType = TString.ERROR;
|
currentType = TString.ERROR;
|
||||||
|
@ -766,14 +766,14 @@ public class ACIPTshegBarScanner {
|
||||||
// . is used for a non-breaking tsheg, such as in
|
// . is used for a non-breaking tsheg, such as in
|
||||||
// {NGO.,} and {....,DAM}. We give a warning unless ,
|
// {NGO.,} and {....,DAM}. We give a warning unless ,
|
||||||
// or ., or [A-Za-z] follows '.'.
|
// or ., or [A-Za-z] follows '.'.
|
||||||
al.add(new TString(s.substring(i, i+1),
|
al.add(new TString("ACIP", s.substring(i, i+1),
|
||||||
TString.TIBETAN_PUNCTUATION));
|
TString.TIBETAN_PUNCTUATION));
|
||||||
if (!(i + 1 < sl
|
if (!(i + 1 < sl
|
||||||
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|
||||||
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|
||||||
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|
||||||
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
|
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
|
||||||
al.add(new TString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
|
al.add(new TString("ACIP", "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
|
||||||
TString.WARNING));
|
TString.WARNING));
|
||||||
}
|
}
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
|
@ -801,7 +801,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (currentType == TString.TIBETAN_NON_PUNCTUATION
|
if (currentType == TString.TIBETAN_NON_PUNCTUATION
|
||||||
&& isTshegBarAdornment(ch))
|
&& isTshegBarAdornment(ch))
|
||||||
legalTshegBarAdornment = true;
|
legalTshegBarAdornment = true;
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -812,7 +812,7 @@ public class ACIPTshegBarScanner {
|
||||||
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
|
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
|
||||||
&& !al.isEmpty()
|
&& !al.isEmpty()
|
||||||
&& lastNonExceptionalThingWasAdornmentOr(al, TString.TIBETAN_NON_PUNCTUATION)) {
|
&& lastNonExceptionalThingWasAdornmentOr(al, TString.TIBETAN_NON_PUNCTUATION)) {
|
||||||
al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
|
al.add(new TString("ACIP", " ", TString.TIBETAN_PUNCTUATION));
|
||||||
}
|
}
|
||||||
|
|
||||||
// "DANG,\nLHAG" is really "DANG, LHAG". But always? Not if you have "MDO,\n\nKA...".
|
// "DANG,\nLHAG" is really "DANG, LHAG". But always? Not if you have "MDO,\n\nKA...".
|
||||||
|
@ -824,7 +824,7 @@ public class ACIPTshegBarScanner {
|
||||||
&& s.charAt(i-1) == ','
|
&& s.charAt(i-1) == ','
|
||||||
&& (i + (('\r' == ch) ? 2 : 1) < sl
|
&& (i + (('\r' == ch) ? 2 : 1) < sl
|
||||||
&& (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
|
&& (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
|
||||||
al.add(new TString(" ", TString.TIBETAN_PUNCTUATION));
|
al.add(new TString("ACIP", " ", TString.TIBETAN_PUNCTUATION));
|
||||||
}
|
}
|
||||||
|
|
||||||
if ('^' == ch) {
|
if ('^' == ch) {
|
||||||
|
@ -850,9 +850,9 @@ public class ACIPTshegBarScanner {
|
||||||
bad = true;
|
bad = true;
|
||||||
}
|
}
|
||||||
if (!bad)
|
if (!bad)
|
||||||
al.add(new TString("^", TString.TIBETAN_PUNCTUATION));
|
al.add(new TString("ACIP", "^", TString.TIBETAN_PUNCTUATION));
|
||||||
else
|
else
|
||||||
al.add(new TString("The ACIP {^} must precede a tsheg bar.", TString.ERROR));
|
al.add(new TString("ACIP", "The ACIP {^} must precede a tsheg bar.", TString.ERROR));
|
||||||
} else {
|
} else {
|
||||||
// Don't add in a "\r\n" or "\n" unless there's a
|
// Don't add in a "\r\n" or "\n" unless there's a
|
||||||
// blank line.
|
// blank line.
|
||||||
|
@ -864,10 +864,10 @@ public class ACIPTshegBarScanner {
|
||||||
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
|
|| ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n')))) {
|
||||||
for (int h = 0; h < (realNewline ? 2 : 1); h++) {
|
for (int h = 0; h < (realNewline ? 2 : 1); h++) {
|
||||||
if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
|
if (isTshegBarAdornment(ch) && !legalTshegBarAdornment) {
|
||||||
al.add(new TString("The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
|
al.add(new TString("ACIP", "The ACIP " + ch + " must be glued to the end of a tsheg bar, but this one was not",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
} else {
|
} else {
|
||||||
al.add(new TString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
|
al.add(new TString("ACIP", rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
|
||||||
(legalTshegBarAdornment
|
(legalTshegBarAdornment
|
||||||
? TString.TSHEG_BAR_ADORNMENT
|
? TString.TSHEG_BAR_ADORNMENT
|
||||||
: TString.TIBETAN_PUNCTUATION)));
|
: TString.TIBETAN_PUNCTUATION)));
|
||||||
|
@ -875,7 +875,7 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if ('%' == ch) {
|
if ('%' == ch) {
|
||||||
al.add(new TString("The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice. To avoid seeing this warning again, change the input to use {\\u0F35} instead of {%}.",
|
al.add(new TString("ACIP", "The ACIP {%} is treated by this converter as U+0F35, but sometimes might represent U+0F14 in practice. To avoid seeing this warning again, change the input to use {\\u0F35} instead of {%}.",
|
||||||
TString.WARNING));
|
TString.WARNING));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -898,11 +898,11 @@ public class ACIPTshegBarScanner {
|
||||||
break;
|
break;
|
||||||
if (!(isNumeric(ch) || isAlpha(ch))) {
|
if (!(isNumeric(ch) || isAlpha(ch))) {
|
||||||
if (startOfString < i) {
|
if (startOfString < i) {
|
||||||
al.add(new TString(s.substring(startOfString, i),
|
al.add(new TString("ACIP", s.substring(startOfString, i),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
if ((int)ch == 65533) {
|
if ((int)ch == 65533) {
|
||||||
al.add(new TString("Found an illegal, unprintable character.",
|
al.add(new TString("ACIP", "Found an illegal, unprintable character.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -921,7 +921,7 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (x >= 0) {
|
if (x >= 0) {
|
||||||
al.add(new TString(new String(new char[] { (char)x }),
|
al.add(new TString("ACIP", new String(new char[] { (char)x }),
|
||||||
TString.UNICODE_CHARACTER));
|
TString.UNICODE_CHARACTER));
|
||||||
i += "uXXXX".length();
|
i += "uXXXX".length();
|
||||||
startOfString = i+1;
|
startOfString = i+1;
|
||||||
|
@ -929,14 +929,14 @@ public class ACIPTshegBarScanner {
|
||||||
} else {
|
} else {
|
||||||
final String msg
|
final String msg
|
||||||
= "Found a backslash, \\, which the ACIP Tibetan Input Code standard says represents a Sanskrit virama. In practice, though, this is so often misused (to represent U+0F3D) that {\\} always generates this error. If you want a Sanskrit virama, change the input document to use {\\u0F84} instead of {\\}. If you want U+0F3D, use {/NYA/} or {/NYA\\u0F3D}.";
|
= "Found a backslash, \\, which the ACIP Tibetan Input Code standard says represents a Sanskrit virama. In practice, though, this is so often misused (to represent U+0F3D) that {\\} always generates this error. If you want a Sanskrit virama, change the input document to use {\\u0F84} instead of {\\}. If you want U+0F3D, use {/NYA/} or {/NYA\\u0F3D}.";
|
||||||
al.add(new TString(msg,
|
al.add(new TString("ACIP", msg,
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
+ msg + "\n");
|
+ msg + "\n");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
al.add(new TString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
|
al.add(new TString("ACIP", "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
errors.append("Offset " + i + ((numNewlines == 0) ? "" : (" or maybe " + (i-numNewlines))) + ": "
|
||||||
|
@ -954,11 +954,11 @@ public class ACIPTshegBarScanner {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (startOfString < sl) {
|
if (startOfString < sl) {
|
||||||
al.add(new TString(s.substring(startOfString, sl),
|
al.add(new TString("ACIP", s.substring(startOfString, sl),
|
||||||
currentType));
|
currentType));
|
||||||
}
|
}
|
||||||
if (waitingForMatchingIllegalClose) {
|
if (waitingForMatchingIllegalClose) {
|
||||||
al.add(new TString("UNEXPECTED END OF INPUT",
|
al.add(new TString("ACIP", "UNEXPECTED END OF INPUT",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
|
@ -967,7 +967,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
if (!bracketTypeStack.empty()) {
|
if (!bracketTypeStack.empty()) {
|
||||||
al.add(new TString("Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
|
al.add(new TString("ACIP", "Unmatched open bracket found. A " + ((TString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors) {
|
if (null != errors) {
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
|
@ -976,7 +976,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
if (startSlashIndex >= 0) {
|
if (startSlashIndex >= 0) {
|
||||||
al.add(new TString("Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
|
al.add(new TString("ACIP", "Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
|
@ -984,7 +984,7 @@ public class ACIPTshegBarScanner {
|
||||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||||
}
|
}
|
||||||
if (startParenIndex >= 0) {
|
if (startParenIndex >= 0) {
|
||||||
al.add(new TString("Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
|
al.add(new TString("ACIP", "Parentheses are supposed to occur in pairs, but the input had an unmatched parenthesis.",
|
||||||
TString.ERROR));
|
TString.ERROR));
|
||||||
if (null != errors)
|
if (null != errors)
|
||||||
errors.append("Offset END: "
|
errors.append("Offset END: "
|
||||||
|
|
|
@ -36,7 +36,17 @@ import java.io.*;
|
||||||
public class TString {
|
public class TString {
|
||||||
private int type;
|
private int type;
|
||||||
private String text;
|
private String text;
|
||||||
|
// "EWTS" or "ACIP", interned (for quick '==' equality checking):
|
||||||
|
private String encoding;
|
||||||
|
|
||||||
|
/** Returns "EWTS" if this TString is encoded in EWTS, or,
|
||||||
|
otherwise, "ACIP" if this TString is encoded in ACIP. Returns
|
||||||
|
an interned string for quick equality checking via the
|
||||||
|
<code>==</code> operator. */
|
||||||
|
public String getEncoding() {
|
||||||
|
return encoding;
|
||||||
|
}
|
||||||
|
|
||||||
/** Returns true if and only if a TString with type <i>type</i>
|
/** Returns true if and only if a TString with type <i>type</i>
|
||||||
* is to be converted to something other than Tibetan text.
|
* is to be converted to something other than Tibetan text.
|
||||||
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
|
* (Chinese Unicode, Latin, etc. all qualify as non-Tibetan.) */
|
||||||
|
@ -130,15 +140,20 @@ public class TString {
|
||||||
/** Don't instantiate using this constructor. */
|
/** Don't instantiate using this constructor. */
|
||||||
private TString() { }
|
private TString() { }
|
||||||
|
|
||||||
/** Creates a new TString with source text <i>text</i> and type
|
/** Creates a new TString with source text <i>text</i>, encoded
|
||||||
|
* using the Roman transliteration system specified by
|
||||||
|
* <i>encoding</i> (see {@link #getEncoding()}) and type
|
||||||
* <i>type</i> being a characterization like {@link #DD}. */
|
* <i>type</i> being a characterization like {@link #DD}. */
|
||||||
public TString(String text, int type) {
|
public TString(String encoding, String text, int type) {
|
||||||
|
this.encoding = encoding;
|
||||||
setType(type);
|
setType(type);
|
||||||
String ftext = (TIBETAN_NON_PUNCTUATION == type)
|
String ftext = (TIBETAN_NON_PUNCTUATION == type)
|
||||||
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
|
? MidLexSubstitution.getFinalValueForTibetanNonPunctuationToken(text)
|
||||||
: text;
|
: text;
|
||||||
// FIXME: assert this
|
// FIXME: assert these
|
||||||
ThdlDebug.verify(type != UNICODE_CHARACTER || text.length() == 1);
|
ThdlDebug.verify(type != UNICODE_CHARACTER || text.length() == 1);
|
||||||
|
ThdlDebug.verify("EWTS" == encoding || "ACIP" == encoding);
|
||||||
|
type != UNICODE_CHARACTER || text.length() == 1);
|
||||||
setText(ftext);
|
setText(ftext);
|
||||||
if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type)
|
if ((outputAllTshegBars || outputUniqueTshegBars) && TIBETAN_NON_PUNCTUATION == type)
|
||||||
outputTshegBar(ftext);
|
outputTshegBar(ftext);
|
||||||
|
|
Loading…
Reference in a new issue