TMW->Wylie conversion now takes advantage of prefix rules, i.e. the rules
that say "ya can take a ga prefix" and so on. The ACIP->Unicode converter now gives warnings (optionally, and by default they appear inline in the output). This converter now produces output even when lexical errors occur, but that output has the errors and warnings inline.
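For reference, here is a minimal sketch of the new warning-aware entry point added by this commit. It is based only on the signature shown in the diff below, convertToUnicode(String, StringBuffer, StringBuffer, boolean); the package path in the import and the sample input are assumptions for illustration, not something this diff confirms.

import org.thdl.tib.text.ttt.ACIPConverter; // assumed package path

public class AcipWarningsSketch {
    public static void main(String[] args) {
        StringBuffer errors = new StringBuffer();
        StringBuffer warnings = new StringBuffer();
        // Final argument: also write warnings inline into the converted output.
        String unicode = ACIPConverter.convertToUnicode("BSKYABS GRO", errors, warnings, true);
        if (unicode == null) {
            // Scanning or conversion errors occurred; details are in 'errors'.
            System.err.println("Errors:\n" + errors);
        } else {
            System.out.println(unicode);
            if (warnings.length() > 0)
                System.err.println("Warnings:\n" + warnings);
        }
    }
}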
parent 21ef657921
commit d5ad760230
14 changed files with 678 additions and 270 deletions
|
@ -58,28 +58,46 @@ public class ACIPConverter {
|
|||
ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);
|
||||
|
||||
if (null == al) {
|
||||
System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
|
||||
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
|
||||
System.err.println("Tibetan or English input?");
|
||||
System.err.println("");
|
||||
System.err.println("First " + maxErrors + " errors scanning ACIP input file: ");
|
||||
System.err.println(errors);
|
||||
System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
|
||||
if (false) {
|
||||
// Nobody wants to see this. FIXME: maybe somebody; have an option.
|
||||
System.err.println("First " + maxErrors + " lexical errors scanning ACIP input file: ");
|
||||
System.err.println(errors);
|
||||
}
|
||||
System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again.");
|
||||
System.exit(1);
|
||||
}
|
||||
final boolean abortUponScanningError = false; // DLC MAKE ME CONFIGURABLE
|
||||
// DLC NOW: BAo isn't converting.
|
||||
if (errors.length() > 0) {
|
||||
System.err.println("Errors scanning ACIP input file: ");
|
||||
System.err.println(errors);
|
||||
System.err.println("Exiting; please fix input file and try again.");
|
||||
System.exit(1);
|
||||
if (abortUponScanningError) {
|
||||
System.err.println("Exiting; please fix input file and try again.");
|
||||
System.exit(1);
|
||||
}
|
||||
}
|
||||
|
||||
convertToUnicode(al, System.out, errors);
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
boolean putWarningsInOutput = true; // DLC make me configurable.
|
||||
convertToUnicode(al, System.out, errors, warnings,
|
||||
putWarningsInOutput);
|
||||
if (errors.length() > 0) {
|
||||
System.err.println("Errors converting ACIP input file: ");
|
||||
System.err.println(errors);
|
||||
System.err.println("The output contains these errors.");
|
||||
System.err.println("Exiting; please fix input file and try again.");
|
||||
System.exit(2);
|
||||
}
|
||||
if (warnings.length() > 0) {
|
||||
System.err.println("Warnings converting ACIP input file: ");
|
||||
System.err.println(warnings);
|
||||
if (putWarningsInOutput)
|
||||
System.err.println("The output contains these warnings.");
|
||||
System.exit(2);
|
||||
}
|
||||
if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
|
||||
System.exit(0);
|
||||
}
|
||||
|
@ -96,19 +114,30 @@ public class ACIPConverter {
|
|||
{
|
||||
throw new Error("DLC UNIMPLEMENTED");
|
||||
}
|
||||
// DLC FIXME: sometimes { } is \u0F0B, and sometimes it is a
|
||||
// space. Treat it as a tsheg only when it appears after a
|
||||
// syllable or another tsheg.
|
||||
|
||||
/** Returns UTF-8 encoded Unicode. A bit indirect, so use this
|
||||
* for testing only if performance is a concern. If errors occur
|
||||
* in scanning the ACIP or in converting a tsheg bar, then they
|
||||
* are appended to errors if errors is non-null. Returns the
|
||||
* are appended to errors if errors is non-null, as well as
|
||||
* written to the result. If warnings occur in scanning the ACIP
|
||||
* or in converting a tsheg bar, then they are appended to
|
||||
* warnings if warnings is non-null, and they are written to the
|
||||
* result if writeWarningsToResult is true. Returns the
|
||||
* conversion upon perfect success, null if errors occurred.
|
||||
*/
|
||||
public static String convertToUnicode(String acip,
|
||||
StringBuffer errors) {
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToResult) {
|
||||
ByteArrayOutputStream sw = new ByteArrayOutputStream();
|
||||
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
|
||||
try {
|
||||
if (null != al && convertToUnicode(al, sw, errors)) {
|
||||
if (null != al
|
||||
&& convertToUnicode(al, sw, errors,
|
||||
warnings, writeWarningsToResult)) {
|
||||
return sw.toString("UTF-8");
|
||||
} else {
|
||||
System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
|
||||
|
@ -119,15 +148,25 @@ public class ACIPConverter {
|
|||
}
|
||||
}
|
||||
|
||||
/** Writes Unicode to out. If errors occur in converting a
|
||||
* tsheg bar, then they are appended to errors if errors is
|
||||
* non-null. Returns true upon perfect success, false if errors
|
||||
* occurred.
|
||||
/** Writes Unicode to out. If errors occur in converting a tsheg
|
||||
* bar, then they are appended to errors if errors is non-null.
|
||||
* Furthermore, errors are written to out. If writeWarningsToOut
|
||||
* is true, then warnings also will be written to out. Returns
|
||||
* true upon perfect success, false if errors occurred.
|
||||
* @param scan result of ACIPTshegBarScanner.scan(..)
|
||||
* @param out stream to which to write converted text
|
||||
* @param errors if non-null, all error messages are appended
|
||||
* @param warnings if non-null, all warning messages are appended
|
||||
* to this
|
||||
* @param writeWarningsToOut if true, then all warning messages
|
||||
* are written to out in the appropriate places
|
||||
* @throws IOException if we cannot write to out
|
||||
*/
|
||||
public static boolean convertToUnicode(ArrayList scan,
|
||||
OutputStream out,
|
||||
StringBuffer errors)
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToOut)
|
||||
throws IOException
|
||||
{
|
||||
int sz = scan.size();
|
||||
|
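The String overload above wraps a lower-level, stream-based pipeline: scan the ACIP into tsheg bars, then convert the scan result to an OutputStream. Below is a sketch of driving that pipeline directly, again based only on the signatures visible in this diff (ACIPTshegBarScanner.scan(String, StringBuffer, boolean, int) and convertToUnicode(ArrayList, OutputStream, StringBuffer, StringBuffer, boolean)); the import paths and sample input are assumptions.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import org.thdl.tib.text.ttt.ACIPConverter;       // assumed package path
import org.thdl.tib.text.ttt.ACIPTshegBarScanner; // assumed package path

public class StreamConversionSketch {
    public static void main(String[] args) throws IOException {
        StringBuffer errors = new StringBuffer();
        StringBuffer warnings = new StringBuffer();
        // Third argument mirrors the 'strict' flag used above; -1 means no cap on errors.
        ArrayList scan = ACIPTshegBarScanner.scan("BSKYABS GRO", errors, true, -1);
        if (scan == null) {
            System.err.println("Too many lexical errors:\n" + errors);
            return;
        }
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // Returns true on perfect success; errors (and, with the final argument
        // true, warnings) are also written inline into 'out'.
        boolean perfect = ACIPConverter.convertToUnicode(scan, out, errors, warnings, true);
        System.out.println(out.toString("UTF-8"));
        if (!perfect) System.err.println("Errors:\n" + errors);
        if (warnings.length() > 0) System.err.println("Warnings:\n" + warnings);
    }
}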
@ -139,7 +178,7 @@ public class ACIPConverter {
|
|||
int stype = s.getType();
|
||||
if (stype == ACIPString.ERROR) {
|
||||
hasErrors = true;
|
||||
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: ");
|
||||
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: ");
|
||||
writer.write(s.getText());
|
||||
writer.write("]");
|
||||
} else {
|
||||
|
@ -179,6 +218,21 @@ public class ACIPConverter {
|
|||
if (null != errors)
|
||||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
String warning
|
||||
= pt.getWarning(false, // DLC: make me configurable
|
||||
pl,
|
||||
s.getText());
|
||||
if (null != warning) {
|
||||
if (writeWarningsToOut) {
|
||||
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: ");
|
||||
writer.write(warning);
|
||||
writer.write("]");
|
||||
}
|
||||
if (null != warnings) {
|
||||
warnings.append(warning);
|
||||
warnings.append('\n');
|
||||
}
|
||||
}
|
||||
unicode = sl.getUnicode();
|
||||
if (null == unicode) throw new Error("DLC: HOW?");
|
||||
}
|
||||
|
|
|
@ -133,16 +133,18 @@ public class ACIPTshegBarScanner {
|
|||
Stack bracketTypeStack = new Stack();
|
||||
int startSlashIndex = -1;
|
||||
int startParenIndex = -1;
|
||||
int numNewlines = 0;
|
||||
for (int i = 0; i < sl; i++) {
|
||||
if (i < startOfString) throw new Error("bad reset");
|
||||
char ch;
|
||||
ch = s.charAt(i);
|
||||
if (ch == '\n') ++numNewlines;
|
||||
if (ACIPString.COMMENT == currentType && ch != ']') {
|
||||
if ('[' == ch) {
|
||||
al.add(new ACIPString("Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
}
|
||||
|
@ -157,17 +159,18 @@ public class ACIPTshegBarScanner {
|
|||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
if (!waitingForMatchingIllegalClose) {
|
||||
if (null != errors) {
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found a truly unmatched close bracket, ] or }.\n");
|
||||
}
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
}
|
||||
waitingForMatchingIllegalClose = false;
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
startOfString = i+1;
|
||||
|
@ -249,6 +252,11 @@ public class ACIPTshegBarScanner {
|
|||
|| s.substring(i, i + "[BP]".length()).equals("{BP}"))) {
|
||||
thingy = "[BP]";
|
||||
currentType = ACIPString.BP;
|
||||
} else if (i + "[BLANK PAGE]".length() <= sl
|
||||
&& (s.substring(i, i + "[BLANK PAGE]".length()).equals("[BLANK PAGE]")
|
||||
|| s.substring(i, i + "[BLANK PAGE]".length()).equals("{BLANK PAGE}"))) {
|
||||
thingy = "[BLANK PAGE]";
|
||||
currentType = ACIPString.BP;
|
||||
} else if (i + "[ BP ]".length() <= sl
|
||||
&& (s.substring(i, i + "[ BP ]".length()).equals("[ BP ]")
|
||||
|| s.substring(i, i + "[ BP ]".length()).equals("{ BP }"))) {
|
||||
|
@ -414,11 +422,11 @@ public class ACIPTshegBarScanner {
|
|||
// This is an error. Sometimes [COMMENTS APPEAR
|
||||
// WITHOUT # MARKS]. Though "... [" could cause
|
||||
// this too.
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
al.add(new ACIPString("Found an illegal open bracket: " + s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
if (waitingForMatchingIllegalClose) {
|
||||
if (null != errors) {
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
|
||||
}
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
|
@ -435,7 +443,7 @@ public class ACIPTshegBarScanner {
|
|||
inContext = inContext + "...";
|
||||
}
|
||||
}
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
}
|
||||
|
@ -477,7 +485,6 @@ public class ACIPTshegBarScanner {
|
|||
if (i+numdigits+2 < sl && s.charAt(i+numdigits+2) == '.') {
|
||||
if (!(i+numdigits+4 < sl && isNumeric(s.charAt(i+numdigits+3))
|
||||
&& !isNumeric(s.charAt(i+numdigits+4)))) {
|
||||
al.add(new ACIPString(s.substring(i, i+numdigits+3), ACIPString.ERROR));
|
||||
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
||||
if (inContext.indexOf("\r") >= 0) {
|
||||
inContext = inContext.substring(0, inContext.indexOf("\r"));
|
||||
|
@ -488,8 +495,10 @@ public class ACIPTshegBarScanner {
|
|||
inContext = inContext + "...";
|
||||
}
|
||||
}
|
||||
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker has a period, '.', at the end of it, which is illegal.\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
startOfString = i+numdigits+3;
|
||||
|
@ -498,7 +507,6 @@ public class ACIPTshegBarScanner {
|
|||
break;
|
||||
}
|
||||
if (i+numdigits+4 < sl && (s.charAt(i+numdigits+4) == '.' || s.charAt(i+numdigits+4) == 'A' || s.charAt(i+numdigits+4) == 'B' || s.charAt(i+numdigits+4) == 'a' || s.charAt(i+numdigits+4) == 'b' || isNumeric(s.charAt(i+numdigits+4)))) {
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
||||
if (inContext.indexOf("\r") >= 0) {
|
||||
inContext = inContext.substring(0, inContext.indexOf("\r"));
|
||||
|
@ -509,8 +517,10 @@ public class ACIPTshegBarScanner {
|
|||
inContext = inContext + "...";
|
||||
}
|
||||
}
|
||||
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). This folio marker is not followed by whitespace, as is expected.\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
startOfString = i+1; // DLC FIXME: skip over more?
|
||||
|
@ -572,7 +582,9 @@ public class ACIPTshegBarScanner {
|
|||
}
|
||||
|
||||
// This case, @NNN, must come after the @NNN{AB} case.
|
||||
if (i+numdigits+1 < sl && s.charAt(i+numdigits+1) == ' ') {
|
||||
if (i+numdigits+1 < sl && (s.charAt(i+numdigits+1) == ' '
|
||||
|| s.charAt(i+numdigits+1) == '\n'
|
||||
|| s.charAt(i+numdigits+1) == '\r')) {
|
||||
boolean allAreNumeric = true;
|
||||
for (int k = 1; k <= numdigits; k++) {
|
||||
if (!isNumeric(s.charAt(i+k))) {
|
||||
|
@ -591,7 +603,6 @@ public class ACIPTshegBarScanner {
|
|||
}
|
||||
}
|
||||
if (startOfString == i) {
|
||||
al.add(new ACIPString(s.substring(i, i+1), ACIPString.ERROR));
|
||||
String inContext = s.substring(i, i+Math.min(sl-i, 10));
|
||||
if (inContext.indexOf("\r") >= 0) {
|
||||
inContext = inContext.substring(0, inContext.indexOf("\r"));
|
||||
|
@ -602,8 +613,10 @@ public class ACIPTshegBarScanner {
|
|||
inContext = inContext + "...";
|
||||
}
|
||||
}
|
||||
al.add(new ACIPString("Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal at sign, @ (in context, this is " + inContext + "). @012B is an example of a legal folio marker.\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
startOfString = i+1;
|
||||
|
@ -626,9 +639,10 @@ public class ACIPTshegBarScanner {
|
|||
* it means /NYA/. We warn about // for this
|
||||
* reason. \\ causes a tsheg-bar error (DLC
|
||||
* FIXME: verify this is so). */
|
||||
al.add(new ACIPString("//", ACIPString.ERROR));
|
||||
al.add(new ACIPString("Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.",
|
||||
ACIPString.ERROR));
|
||||
if (errors != null) {
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
|
||||
}
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
|
@ -661,9 +675,10 @@ public class ACIPTshegBarScanner {
|
|||
|
||||
if (startParenIndex >= 0) {
|
||||
if (ch == '(') {
|
||||
al.add(new ACIPString("Nesting of parentheses () is not allowed", ACIPString.ERROR));
|
||||
al.add(new ACIPString("Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
} else {
|
||||
|
@ -674,9 +689,10 @@ public class ACIPTshegBarScanner {
|
|||
currentType = ACIPString.ERROR;
|
||||
} else {
|
||||
if (ch == ')') {
|
||||
al.add(new ACIPString("Unexpected closing parenthesis )", ACIPString.ERROR));
|
||||
al.add(new ACIPString("Unexpected closing parenthesis, ), found.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Unexpected closing parenthesis, ), found.\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
} else {
|
||||
|
@ -724,10 +740,10 @@ public class ACIPTshegBarScanner {
|
|||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
} else {
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + ": "
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
}
|
||||
|
@ -772,19 +788,24 @@ public class ACIPTshegBarScanner {
|
|||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
if (null != errors) {
|
||||
if ((int)ch == 65533) {
|
||||
errors.append("Offset " + i + ": "
|
||||
if ((int)ch == 65533) {
|
||||
al.add(new ACIPString("Found an illegal, unprintable character.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal, unprintable character.\n");
|
||||
} else if ('\\' == ch) {
|
||||
errors.append("Offset " + i + ": "
|
||||
} else if ('\\' == ch) {
|
||||
al.add(new ACIPString("Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
|
||||
} else {
|
||||
errors.append("Offset " + i + ": "
|
||||
} else {
|
||||
al.add(new ACIPString("Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
|
||||
}
|
||||
}
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
startOfString = i+1;
|
||||
|
|
|
@ -128,7 +128,7 @@ public class PackageTest extends TestCase {
|
|||
}
|
||||
|
||||
{
|
||||
TStackListList legalParses = pt.getUniqueParse();
|
||||
TStackListList legalParses = pt.getUniqueParse(false);
|
||||
boolean goodness2 = (expectedLegalParses == null
|
||||
|| expectedLegalParses.length == legalParses.size());
|
||||
for (int i = 0 ; i < legalParses.size(); i++) {
|
||||
|
@ -139,18 +139,21 @@ public class PackageTest extends TestCase {
|
|||
|| expectedLegalParses.length < i+1
|
||||
|| n.equals(expectedLegalParses[i]));
|
||||
if (!okay || !goodness2)
|
||||
System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is " + expectedLegalParses[i]);
|
||||
System.out.println("Legal parse " + (i) + " (from zero) is " + n + " (toString2=" + n.toString2() + ") and expected is "
|
||||
+ ((i < expectedLegalParses.length)
|
||||
? expectedLegalParses[i]
|
||||
: "not present"));
|
||||
assertTrue(okay);
|
||||
}
|
||||
if (!goodness2)
|
||||
System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses.");
|
||||
System.out.println("You expected " + expectedLegalParses.length + " legal parses, but there were instead " + legalParses.size() + " legal parses for ACIP " + acip + ".");
|
||||
assertTrue(goodness2);
|
||||
TStackListList allLegalParses = pt.getLegalParses();
|
||||
TStackListList decentParses = pt.getNonIllegalParses();
|
||||
if (pt.getBestParse() == null) {
|
||||
if (legalParses.size() == 0) {
|
||||
if (null != expectedBestParse && !"".equals(expectedBestParse)) {
|
||||
System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for acip {" + acip + "}");
|
||||
System.out.print("Expected is that there is a best parse \"" + expectedBestParse + "\" but there is no best parse for ACIP {" + acip + "}");
|
||||
assertTrue(false);
|
||||
}
|
||||
System.out.print("ACIPNoBestParseError: There is no best parse for the ACIP {" + acip + "}; ");
|
||||
|
@ -163,7 +166,7 @@ public class PackageTest extends TestCase {
|
|||
}
|
||||
} else {
|
||||
if (legalParses.size() > 1) {
|
||||
System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for acip " + acip + ": " + legalParses);
|
||||
System.out.println("ACIPTooManyLegalParsesError: see these " + legalParses.size() + " legal parses for ACIP " + acip + ": " + legalParses);
|
||||
assertTrue(legalParses.size() == 2
|
||||
&& (legalParses.get(0).size()
|
||||
== 1 + legalParses.get(1).size()));
|
||||
|
@ -176,7 +179,7 @@ public class PackageTest extends TestCase {
|
|||
if (null != expectedBestParse) {
|
||||
boolean good = pt.getBestParse().equals(expectedBestParse);
|
||||
if (!good) {
|
||||
System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for acip {" + acip + "}");
|
||||
System.out.print("Expected best parse is \"" + expectedBestParse + "\" but the best parse is " + pt.getBestParse() + " for ACIP {" + acip + "}");
|
||||
}
|
||||
assertTrue(good);
|
||||
}
|
||||
|
@ -229,6 +232,116 @@ public class PackageTest extends TestCase {
|
|||
* {@link TPairList#getACIPError()}, and {@link
|
||||
* TPairList#recoverACIP()}. */
|
||||
public void testBreakACIPIntoChunks() {
|
||||
tstHelper("GASN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BARMA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MARDA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BBA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BBLUGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRAG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRA'I"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRAL"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRAN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRANGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDREN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRI"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRIS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDROL"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BDRUG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BLCAG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BLCI"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BLKONG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BLNGA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BLNGAG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BMA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BMYOD"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BSALDA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BSAMS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BSEMS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BTSAMS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("BTSIMS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DDANG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DDAR"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DDRANGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DDRUG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DNAG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DNOGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DRBAN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DRGYU"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DRTOG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DYA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("DYAN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GDRA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GDRIM"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GGAN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GGYUR"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GLTAR"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GLTUNG"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GMA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GMAN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GMON"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRDEGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRDZU"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRGYA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRNAGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRTAN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRTOGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRTZO"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRTZOD"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GRTZON"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GSLA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GSNAD"); // ambiguous with regard to prefix rules
|
||||
tstHelper("GZLA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MBA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MBA'"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MBI'I"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MHA'A"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRDA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRDO"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRDZOGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRGA"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRGAD"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRGAN"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRJES"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRJOD"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRTOGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRTOL"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRTZE'I"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MRTZIGS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MSAM"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MSGRIB"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MSKYES"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MSON"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MSOS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MSTAMS"); // ambiguous with regard to prefix rules
|
||||
tstHelper("MSTAN"); // ambiguous with regard to prefix rules
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// If you're not careful, you'll think GGYES is a legal
|
||||
// Tibetan tsheg bar and parse it as {G}{G+YE}{S}. But it's
|
||||
// Sanskrit, really, because GA doesn't take a GA prefix.
|
||||
// This doesn't occur in ACIP input files that I've seen, but
|
||||
// GGYI (S1000I.INC) and GGYUR (S5275MC4.ACT) do occur.
|
||||
tstHelper("GGYES", "{G}{G}{YE}{S}",
|
||||
new String[] { "{G}{G}{YE}{S}", "{G}{G+YE}{S}", "{G+G}{YE}{S}" },
|
||||
new String[] { },
|
||||
"{G+G}{YE}{S}");
|
||||
|
||||
tstHelper("DRUG", "{D}{RU}{G}",
|
||||
new String[] { "{D}{RU}{G}", "{D+RU}{G}" },
|
||||
new String[] { "{D+RU}{G}" },
|
||||
"{D+RU}{G}");
|
||||
|
||||
|
||||
tstHelper("d+H+d+HA", "{d+}{H+}{d+}{HA}",
|
||||
new String[] { "{d+H+d+HA}" },
|
||||
new String[] { "{d+H+d+HA}" });
|
||||
|
||||
tstHelper("Gd+H+d+HA");
|
||||
|
||||
tstHelper("AUTPA", "{AU}{T}{PA}",
|
||||
new String[] { "{AU}{T}{PA}", "{AU}{T+PA}" },
|
||||
new String[] { },
|
||||
|
@ -249,7 +362,8 @@ public class PackageTest extends TestCase {
|
|||
new String[] { "{G+R+VA}{'I}" });
|
||||
tstHelper("G-RVA'I", "{G-}{R}{VA}{'I}",
|
||||
new String[] { "{G}{R+VA}{'I}" },
|
||||
new String[] { "{G}{R+VA}{'I}" });
|
||||
new String[] { },
|
||||
"{G}{R+VA}{'I}");
|
||||
tstHelper("RVA", "{R}{VA}",
|
||||
new String[] { "{R+VA}" },
|
||||
new String[] { "{R+VA}" });
|
||||
|
@ -6967,8 +7081,8 @@ tstHelper("ZUR");
|
|||
"",
|
||||
"[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME
|
||||
shelp("PAS... LA",
|
||||
"Offset 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n",
|
||||
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
|
||||
"Offset 5 or maybe 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n",
|
||||
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
|
||||
shelp("PAS... LA",
|
||||
"",
|
||||
true,
|
||||
|
@ -6983,28 +7097,28 @@ tstHelper("ZUR");
|
|||
shelp("", "", "[]");
|
||||
shelp("[DD]", "");
|
||||
shelp("[",
|
||||
"Offset 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
|
||||
"Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
|
||||
shelp("{",
|
||||
"Offset 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
|
||||
"Offset 0 or maybe 0: Found an illegal open bracket (in context, this is {). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset END: Truly unmatched open bracket found.\n");
|
||||
shelp("DD", "");
|
||||
shelp("DD]",
|
||||
"Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
"Offset 2 or maybe 2: Found a truly unmatched close bracket, ] or }.\nOffset 2 or maybe 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
|
||||
shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
|
||||
shelp("///NYA", "Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
|
||||
shelp("/NYA/", "");
|
||||
shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", "");
|
||||
shelp("[LS][# A [[[[[COMMENT][LS]",
|
||||
"Offset 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
|
||||
"Offset 9 or maybe 9: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 10 or maybe 10: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 11 or maybe 11: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 12 or maybe 12: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n"
|
||||
+ "Offset 13 or maybe 13: Found an open bracket within a [#COMMENT]-style comment. Brackets may not appear in comments.\n");
|
||||
shelp("[ILLEGAL COMMENT]",
|
||||
"Offset 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
"Offset 0 or maybe 0: Found an illegal open bracket (in context, this is [ILLEGAL C...). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 16 or maybe 16: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
shelp("(BSKYABS GRO)", ""); // DLC WHAT ARE THESE FOR?
|
||||
shelp("BSKYABS GRO)", "Offset 11: Unexpected closing parenthesis, ), found.\n");
|
||||
shelp("BSKYABS GRO)", "Offset 11 or maybe 11: Unexpected closing parenthesis, ), found.\n");
|
||||
shelp("BSKYABS GRO(", "Offset END: Unmatched open parenthesis, (, found.\n");
|
||||
shelp("((NESTAGE))", "Offset 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10: Unexpected closing parenthesis, ), found.\n");
|
||||
shelp("((NESTAGE))", "Offset 1 or maybe 1: Found an illegal open parenthesis, (. Nesting of parentheses is not allowed.\nOffset 10 or maybe 10: Unexpected closing parenthesis, ), found.\n");
|
||||
shelp("(BA)(PA)NYA(CA)", "");
|
||||
shelp("NYAx", "");
|
||||
shelp("NYA x", "");
|
||||
|
@ -7033,9 +7147,9 @@ tstHelper("ZUR");
|
|||
shelp("(NYA ", "Offset END: Unmatched open parenthesis, (, found.\n");
|
||||
shelp("[*NYA ", "Offset END: Unmatched open bracket found. A correction does not terminate.\n");
|
||||
shelp("?", "", "[QUESTION:{?}]");
|
||||
shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n");
|
||||
shelp("KHAN~ BAR ", "Offset 4 or maybe 4: Found an illegal character, ~, with ordinal 126.\n");
|
||||
shelp("[* Correction with []]",
|
||||
"Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
"Offset 5 or maybe 5: Found an illegal character, r, with ordinal 114.\nOffset 6 or maybe 6: Found an illegal character, r, with ordinal 114.\nOffset 7 or maybe 7: Found an illegal character, e, with ordinal 101.\nOffset 8 or maybe 8: Found an illegal character, c, with ordinal 99.\nOffset 14 or maybe 14: Found an illegal character, w, with ordinal 119.\nOffset 19 or maybe 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21 or maybe 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
|
||||
// DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.
|
||||
|
||||
|
@ -7051,8 +7165,8 @@ tstHelper("ZUR");
|
|||
uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b");
|
||||
}
|
||||
shelp("K\\,",
|
||||
"Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
|
||||
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]");
|
||||
"Offset 1 or maybe 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
|
||||
"[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, TIBETAN_PUNCTUATION:{,}]");
|
||||
|
||||
|
||||
shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
|
||||
|
@ -7073,15 +7187,15 @@ tstHelper("ZUR");
|
|||
shelp("@01A.3 ", "", "[FOLIO_MARKER:{@01A.3}, TIBETAN_PUNCTUATION:{ }]");
|
||||
shelp("@001 ", "", "[FOLIO_MARKER:{@001}, TIBETAN_PUNCTUATION:{ }]");
|
||||
shelp("@19-20A",
|
||||
"Offset 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n",
|
||||
"[ERROR:{@}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur.
|
||||
"Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.\n",
|
||||
"[ERROR:{Found an illegal at sign, @ (in context, this is @19-20A). @012B is an example of a legal folio marker.}, TIBETAN_NON_PUNCTUATION:{19-20A}]"); // DLC FIXME: yes it occurs in the kangyur.
|
||||
shelp("@[7B]", "");
|
||||
shelp("@012A.3KA",
|
||||
"",
|
||||
"[FOLIO_MARKER:{@012A.3}, TIBETAN_NON_PUNCTUATION:{KA}]");
|
||||
shelp("@012A.34",
|
||||
"Offset 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n",
|
||||
"[ERROR:{@012A.}, TIBETAN_NON_PUNCTUATION:{34}]");
|
||||
"Offset 0 or maybe 0: Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.\n",
|
||||
"[ERROR:{Found an illegal at sign, @ (in context, this is @012A.34). This folio marker has a period, '.', at the end of it, which is illegal.}, TIBETAN_NON_PUNCTUATION:{34}]");
|
||||
shelp("@[07B]", "");
|
||||
shelp("@[00007B]", "");
|
||||
shelp("@7B", "");
|
||||
|
@ -7097,8 +7211,8 @@ tstHelper("ZUR");
|
|||
shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT
|
||||
shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT
|
||||
shelp("//NYA\\\\",
|
||||
"Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
|
||||
"[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]");
|
||||
"Offset 1 or maybe 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5 or maybe 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6 or maybe 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
|
||||
"[START_SLASH:{/}, ERROR:{Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}, ERROR:{Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.}]");
|
||||
|
||||
}
|
||||
private static void uhelp(String acip) {
|
||||
|
@ -7106,7 +7220,7 @@ tstHelper("ZUR");
|
|||
}
|
||||
private static void uhelp(String acip, String expectedUnicode) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
String unicode = ACIPConverter.convertToUnicode(acip, errors);
|
||||
String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true);
|
||||
if (null == unicode) {
|
||||
if (null != expectedUnicode && "none" != expectedUnicode) {
|
||||
System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
|
||||
|
@ -8729,22 +8843,22 @@ tstHelper("shKA");
|
|||
}
|
||||
/* DLC FIXME: add test cases: from R0021F.ACE: ambiguous Tibetan/Sanskrit:
|
||||
|
||||
BDA' þþþþ
|
||||
B+DA þþþ
|
||||
DBANG þþþ
|
||||
D+BA þþþ
|
||||
DGA' þþþþ
|
||||
D+GA þþþ
|
||||
DGRA þþþ
|
||||
D+GRA þþþ
|
||||
DGYESþþþþþ
|
||||
D+GYA þþþ
|
||||
DMAR þþþþ
|
||||
D+MA þþþ
|
||||
GDA' þþþþ
|
||||
G+DA þþþ
|
||||
GNAD þþþþ
|
||||
G+NA þþþ
|
||||
MNA' þþþþ
|
||||
M+NA þþþ
|
||||
BDA'
|
||||
B+DA
|
||||
DBANG
|
||||
D+BA
|
||||
DGA'
|
||||
D+GA
|
||||
DGRA
|
||||
D+GRA
|
||||
DGYES
|
||||
D+GYA
|
||||
DMAR
|
||||
D+MA
|
||||
GDA'
|
||||
G+DA
|
||||
GNAD
|
||||
G+NA
|
||||
MNA'
|
||||
M+NA
|
||||
*/
|
||||
|
|
|
@ -520,7 +520,8 @@ class TPairList {
|
|||
* corresponds to exactly one Tibetan grapheme cluster (i.e.,
|
||||
* stack). Note that U+0F7F (ACIP {:}) is part of a stack, not a
|
||||
* stack all on its own. */
|
||||
void populateWithTGCPairs(ArrayList pl, ArrayList indexList, int index) {
|
||||
void populateWithTGCPairs(ArrayList pl,
|
||||
ArrayList indexList, int index) {
|
||||
int sz = size();
|
||||
if (sz == 0) {
|
||||
return;
|
||||
|
@ -540,8 +541,8 @@ class TPairList {
|
|||
// The last pair:
|
||||
TPair p = get(i);
|
||||
ThdlDebug.verify(!"+".equals(p.getRight()));
|
||||
int where;
|
||||
boolean add_U0F7F = false;
|
||||
int where;
|
||||
if (p.getRight() != null
|
||||
&& (where = p.getRight().indexOf(':')) >= 0) {
|
||||
// this ':' guy is his own TGCPair.
|
||||
|
@ -579,27 +580,21 @@ class TPairList {
|
|||
}
|
||||
TGCPair tp;
|
||||
indexList.add(new Integer(index));
|
||||
tp = new TGCPair(lWylie.toString()
|
||||
+ (hasNonAVowel
|
||||
? ACIPRules.getWylieForACIPVowel(p.getRight())
|
||||
: ""),
|
||||
tp = new TGCPair(lWylie.toString(),
|
||||
(hasNonAVowel
|
||||
? ACIPRules.getWylieForACIPVowel(p.getRight())
|
||||
: ""),
|
||||
(isNumeric
|
||||
? TGCPair.OTHER
|
||||
: (hasNonAVowel
|
||||
? (isSanskrit
|
||||
? TGCPair.SANSKRIT_WITH_VOWEL
|
||||
: (isTibetan
|
||||
? TGCPair.CONSONANTAL_WITH_VOWEL
|
||||
: TGCPair.OTHER))
|
||||
: (isSanskrit
|
||||
? TGCPair.SANSKRIT_WITHOUT_VOWEL
|
||||
: (isTibetan
|
||||
? TGCPair.CONSONANTAL_WITHOUT_VOWEL
|
||||
: TGCPair.OTHER)))));
|
||||
? TGCPair.TYPE_OTHER
|
||||
: (isSanskrit
|
||||
? TGCPair.TYPE_SANSKRIT
|
||||
: (isTibetan
|
||||
? TGCPair.TYPE_TIBETAN
|
||||
: TGCPair.TYPE_OTHER))));
|
||||
pl.add(tp);
|
||||
if (add_U0F7F) {
|
||||
indexList.add(new Integer(index));
|
||||
pl.add(new TGCPair("H", TGCPair.OTHER));
|
||||
pl.add(new TGCPair("H", null, TGCPair.TYPE_OTHER));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -91,7 +91,7 @@ class TParseTree {
|
|||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
if (sl.isLegalTshegBar().isLegal) {
|
||||
if (sl.isLegalTshegBar(false).isLegal) {
|
||||
sll.add(sl);
|
||||
}
|
||||
}
|
||||
|
@ -118,12 +118,12 @@ class TParseTree {
|
|||
* a unique non-illegal parse, you get it. If there's not a
|
||||
* unique answer, null is returned. */
|
||||
// {TZANDRA} is not solved by this, DLC NOW. Solve PADMA PROBLEM!
|
||||
|
||||
// DLC by using this we can get rid of single-sanskrit-gc, eh?
|
||||
public TStackList getBestParse() {
|
||||
TStackListList up = getUniqueParse();
|
||||
TStackListList up = getUniqueParse(false);
|
||||
if (up.size() == 1)
|
||||
return up.get(0);
|
||||
|
||||
up = getNonIllegalParses();
|
||||
int sz = up.size();
|
||||
if (sz == 1) {
|
||||
|
@ -192,14 +192,17 @@ class TParseTree {
|
|||
* legal parses if there two or more equally good parses. By
|
||||
* "legal", we mean a sequence of stacks that is legal
|
||||
* by the rules of Tibetan tsheg bar syntax (sometimes called
|
||||
* spelling). */
|
||||
public TStackListList getUniqueParse() {
|
||||
* spelling).
|
||||
* @param noPrefixTests true if you want to pretend that every
|
||||
* stack can take every prefix, which is not the case in
|
||||
* reality */
|
||||
public TStackListList getUniqueParse(boolean noPrefixTests) {
|
||||
TStackListList allLegalParses = new TStackListList(2); // save memory
|
||||
TStackListList legalParsesWithVowelOnRoot = new TStackListList(1);
|
||||
ParseIterator pi = getParseIterator();
|
||||
while (pi.hasNext()) {
|
||||
TStackList sl = pi.next();
|
||||
BoolPair bpa = sl.isLegalTshegBar();
|
||||
BoolPair bpa = sl.isLegalTshegBar(noPrefixTests);
|
||||
if (bpa.isLegal) {
|
||||
if (bpa.isLegalAndHasAVowelOnRoot)
|
||||
legalParsesWithVowelOnRoot.add(sl);
|
||||
|
@ -253,13 +256,23 @@ class TParseTree {
|
|||
public String getWarning(boolean paranoid,
|
||||
TPairList pl,
|
||||
String originalACIP) {
|
||||
TStackListList up = getUniqueParse();
|
||||
|
||||
{
|
||||
TStackList bestParse = getBestParse();
|
||||
TStackListList noPrefixTestsUniqueParse = getUniqueParse(true);
|
||||
if (noPrefixTestsUniqueParse.size() == 1
|
||||
&& !noPrefixTestsUniqueParse.get(0).equals(bestParse)) {
|
||||
return "Warning: We're going with " + bestParse + ", but only because our knowledge of prefix rules says that " + noPrefixTestsUniqueParse.get(0) + " is not a legal Tibetan tsheg bar (\"syllable\")";
|
||||
}
|
||||
}
|
||||
|
||||
TStackListList up = getUniqueParse(false);
|
||||
if (null == up || up.size() != 1) {
|
||||
boolean isLastStack[] = new boolean[1];
|
||||
TStackListList nip = getNonIllegalParses();
|
||||
if (nip.size() != 1) {
|
||||
if (null == getBestParse()) {
|
||||
return "There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
return "Warning: There's not even a unique, non-illegal parse for ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
} else {
|
||||
if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) {
|
||||
if (isLastStack[0]) {
|
||||
|
@ -269,7 +282,7 @@ class TParseTree {
|
|||
}
|
||||
}
|
||||
if (paranoid) {
|
||||
return "Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful.";
|
||||
return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful.";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -125,15 +125,17 @@ class TStackList {
|
|||
* Tibetan syntax (sometimes called rules of spelling). If this
|
||||
* is legal, then {@link BoolPair#isLegalAndHasAVowelOnRoot} will
|
||||
* be true if and only if there is an explicit {A} vowel on the
|
||||
* root stack. */
|
||||
public BoolPair isLegalTshegBar() {
|
||||
// DLC handle PADMA and other Tibetanized Sanskrit fellows. Right now we only handle single-stack guys.
|
||||
* root stack.
|
||||
* @param noPrefixTests true if you want to pretend that every
|
||||
* stack can take every prefix, which is not the case in
|
||||
* reality */
|
||||
public BoolPair isLegalTshegBar(boolean noPrefixTests) {
|
||||
// DLC handle PADMA and other Tibetanized Sanskrit fellows consistently. Right now we only treat single-stack Sanskrit guys as legal.
|
||||
|
||||
TTGCList tgcList = new TTGCList(this);
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
String candidateType
|
||||
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings);
|
||||
// System.out.println("DLC: " + toString() + " has candidateType " + candidateType + " and warnings " + warnings);
|
||||
= TibTextUtils.getClassificationOfTshegBar(tgcList, warnings, noPrefixTests);
|
||||
|
||||
// preliminary answer:
|
||||
boolean isLegal = (candidateType != "invalid");
|
||||
|
|