Jskad's converter now has ACIP-to-Unicode built in. There are known
bugs; it is pre-alpha. It is usable, though, and finds tons of errors in ACIP input files, with the user deciding just how pedantic to be. The biggest outstanding bug is a silent one: a space, { }, is treated as a tsheg instead of as whitespace, even in contexts where we ought to know better.
This commit is contained in:
parent
d5ad760230
commit
1982c5847b
11 changed files with 355 additions and 244 deletions
|
@ -38,24 +38,23 @@ public class ACIPConverter {
|
|||
ThdlOptions.setUserPreference("thdl.debug", true);
|
||||
}
|
||||
|
||||
// DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF. Give an ERROR.
|
||||
|
||||
/** Command-line converter. Gives error messages on standard
|
||||
* output about why we can't convert the document perfectly and
|
||||
* exits with non-zero return code, or is silent otherwise and
|
||||
* exits with code zero. <p>FIXME: not so efficient; copies the
|
||||
* whole file into memory first. */
|
||||
public static void main(String[] args)
|
||||
throws IOException // DLC FIXME: give nice error messages
|
||||
throws IOException
|
||||
{
|
||||
boolean verbose = true;
|
||||
boolean strict = true;
|
||||
if (args.length != 2
|
||||
|| (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
|
||||
System.err.println("Bad args! Need '--strict filename' or '--lenient filename'.");
|
||||
System.exit(1);
|
||||
if (args.length != 1) {
|
||||
System.out.println("Bad args! Need just the name of the ACIP text file.");
|
||||
}
|
||||
StringBuffer errors = new StringBuffer();
|
||||
int maxErrors = 250;
|
||||
ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);
|
||||
ArrayList al = ACIPTshegBarScanner.scanFile(args[0], errors, maxErrors - 1);
|
||||
|
||||
if (null == al) {
|
||||
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
|
||||
|
@ -69,7 +68,7 @@ public class ACIPConverter {
|
|||
System.err.println("Exiting with " + maxErrors + " or more lexical errors; please fix input file and try again.");
|
||||
System.exit(1);
|
||||
}
|
||||
final boolean abortUponScanningError = false; // DLC MAKE ME CONFIGURABLE
|
||||
final boolean abortUponScanningError = false;
|
||||
// DLC NOW: BAo isn't converting.
|
||||
if (errors.length() > 0) {
|
||||
System.err.println("Errors scanning ACIP input file: ");
|
||||
|
@ -80,10 +79,15 @@ public class ACIPConverter {
|
|||
}
|
||||
}
|
||||
|
||||
StringBuffer warnings = new StringBuffer();
|
||||
boolean putWarningsInOutput = true; // DLC make me configurable.
|
||||
String warningLevel = "Most"; // DLC make me configurable.
|
||||
StringBuffer warnings = null;
|
||||
boolean putWarningsInOutput = false;
|
||||
if ("None" != warningLevel) {
|
||||
warnings = new StringBuffer();
|
||||
putWarningsInOutput = true;
|
||||
}
|
||||
convertToUnicode(al, System.out, errors, warnings,
|
||||
putWarningsInOutput);
|
||||
putWarningsInOutput, warningLevel);
|
||||
if (errors.length() > 0) {
|
||||
System.err.println("Errors converting ACIP input file: ");
|
||||
System.err.println(errors);
|
||||
|
@ -91,14 +95,14 @@ public class ACIPConverter {
|
|||
System.err.println("Exiting; please fix input file and try again.");
|
||||
System.exit(2);
|
||||
}
|
||||
if (warnings.length() > 0) {
|
||||
if (null != warnings && warnings.length() > 0) {
|
||||
System.err.println("Warnings converting ACIP input file: ");
|
||||
System.err.println(warnings);
|
||||
if (putWarningsInOutput)
|
||||
System.err.println("The output contains these warnings.");
|
||||
System.exit(2);
|
||||
}
|
||||
if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
|
||||
if (verbose) System.err.println("Converted " + args[0] + " perfectly.");
|
||||
System.exit(0);
|
||||
}
|
||||
|
||||
|
@ -131,16 +135,17 @@ public class ACIPConverter {
|
|||
public static String convertToUnicode(String acip,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToResult) {
|
||||
boolean writeWarningsToResult,
|
||||
String warningLevel) {
|
||||
ByteArrayOutputStream sw = new ByteArrayOutputStream();
|
||||
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
|
||||
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1);
|
||||
try {
|
||||
if (null != al
|
||||
&& convertToUnicode(al, sw, errors,
|
||||
warnings, writeWarningsToResult)) {
|
||||
warnings, writeWarningsToResult,
|
||||
warningLevel)) {
|
||||
return sw.toString("UTF-8");
|
||||
} else {
|
||||
System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
|
@ -151,8 +156,8 @@ public class ACIPConverter {
|
|||
/** Writes Unicode to out. If errors occur in converting a tsheg
|
||||
* bar, then they are appended to errors if errors is non-null.
|
||||
* Furthermore, errors are written to out. If writeWarningsToOut
|
||||
* is true, then warnings also will be written to out. Returns
|
||||
* true upon perfect success, false if errors occurred.
|
||||
* is true, then warnings also will be written to out.
|
||||
* @return true upon perfect success, false if errors occurred.
|
||||
* @param scan result of ACIPTshegBarScanner.scan(..)
|
||||
* @param out stream to which to write converted text
|
||||
* @param errors if non-null, all error messages are appended
|
||||
|
@ -166,7 +171,8 @@ public class ACIPConverter {
|
|||
OutputStream out,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToOut)
|
||||
boolean writeWarningsToOut,
|
||||
String warningLevel)
|
||||
throws IOException
|
||||
{
|
||||
int sz = scan.size();
|
||||
|
@ -181,8 +187,18 @@ public class ACIPConverter {
|
|||
writer.write("[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: ");
|
||||
writer.write(s.getText());
|
||||
writer.write("]");
|
||||
} else if (stype == ACIPString.WARNING) {
|
||||
if (writeWarningsToOut) {
|
||||
writer.write("[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: ");
|
||||
writer.write(s.getText());
|
||||
writer.write("]");
|
||||
}
|
||||
if (null != warnings) {
|
||||
warnings.append("Warning: Lexical warning: ");
|
||||
warnings.append(s.getText());
|
||||
warnings.append('\n');
|
||||
}
|
||||
} else {
|
||||
// DLC FIXME: what about 'no A on root stack' and 'no A on such-and-such stack' warnings?
|
||||
if (s.isLatin(stype)) {
|
||||
if (stype == ACIPString.FOLIO_MARKER)
|
||||
writer.write("{");
|
||||
|
@ -219,7 +235,7 @@ public class ACIPConverter {
|
|||
errors.append(errorMessage + "\n");
|
||||
} else {
|
||||
String warning
|
||||
= pt.getWarning(false, // DLC: make me configurable
|
||||
= pt.getWarning(warningLevel,
|
||||
pl,
|
||||
s.getText());
|
||||
if (null != warning) {
|
||||
|
@ -234,7 +250,7 @@ public class ACIPConverter {
|
|||
}
|
||||
}
|
||||
unicode = sl.getUnicode();
|
||||
if (null == unicode) throw new Error("DLC: HOW?");
|
||||
if (null == unicode) throw new Error("FIXME: make this an assertion");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -245,7 +261,7 @@ public class ACIPConverter {
|
|||
unicode = "\u0F3D";
|
||||
else
|
||||
unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null == unicode) throw new Error("DLC: HOW?");
|
||||
if (null == unicode) throw new Error("FIXME: make this an assertion");
|
||||
}
|
||||
if (null != unicode) {
|
||||
writer.write(unicode);
|
||||
|
|
|
@ -75,9 +75,11 @@ public class ACIPString {
|
|||
public static final int START_PAREN = 15;
|
||||
/** For the closing ) in (NYA) */
|
||||
public static final int END_PAREN = 16;
|
||||
/** For things that may not be legal syntax, such as {KA . KHA} */
|
||||
public static final int WARNING = 17;
|
||||
/** For things that are not legal syntax, such as a file that
|
||||
* contains just "[# HALF A COMMEN" */
|
||||
public static final int ERROR = 17;
|
||||
public static final int ERROR = 18;
|
||||
|
||||
/** Returns true if and only if this string is Latin (usually
|
||||
* English). Returns false if this string is transliteration of
|
||||
|
@ -132,6 +134,7 @@ public class ACIPString {
|
|||
if (type == END_SLASH) typeString = "END_SLASH";
|
||||
if (type == START_PAREN) typeString = "START_PAREN";
|
||||
if (type == END_PAREN) typeString = "END_PAREN";
|
||||
if (type == WARNING) typeString = "WARNING";
|
||||
if (type == ERROR) typeString = "ERROR";
|
||||
return typeString + ":{" + getText() + "}";
|
||||
}
|
||||
|
|
|
@ -39,15 +39,13 @@ public class ACIPTshegBarScanner {
|
|||
* with code zero. <p>FIXME: not so efficient; copies the whole
|
||||
* file into memory first. */
|
||||
public static void main(String[] args) throws IOException {
|
||||
boolean strict = true;
|
||||
if (args.length != 2
|
||||
|| (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
|
||||
System.out.println("Bad args! Need '--strict filename' or '--lenient filename'.");
|
||||
if (args.length != 1) {
|
||||
System.out.println("Bad args! Need just the name of the ACIP text file.");
|
||||
System.exit(1);
|
||||
}
|
||||
StringBuffer errors = new StringBuffer();
|
||||
int maxErrors = 250;
|
||||
ArrayList al = scanFile(args[1], errors, strict, maxErrors - 1);
|
||||
ArrayList al = scanFile(args[0], errors, maxErrors - 1);
|
||||
|
||||
if (null == al) {
|
||||
System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
|
||||
|
@ -70,27 +68,39 @@ public class ACIPTshegBarScanner {
|
|||
}
|
||||
|
||||
/** Scans an ACIP file with path fname into tsheg bars. If errors
|
||||
* is non-null, error messages will be appended to it. If strict
|
||||
* is true, then you're more likely to see error
|
||||
* messages. Returns a list of ACIPStrings that is the
|
||||
* scan. <p>FIXME: not so efficient; copies the whole file into
|
||||
* memory first.
|
||||
* is non-null, error messages will be appended to it. Returns a
|
||||
* list of ACIPStrings that is the scan. <p>FIXME: not so
|
||||
* efficient; copies the whole file into memory first.
|
||||
* @throws IOException if we cannot read in the ACIP input file */
|
||||
public static ArrayList scanFile(String fname, StringBuffer errors, boolean strict, int maxErrors)
|
||||
public static ArrayList scanFile(String fname, StringBuffer errors, int maxErrors)
|
||||
throws IOException
|
||||
{
|
||||
return scanStream(new FileInputStream(fname),
|
||||
errors, maxErrors);
|
||||
}
|
||||
|
||||
/** Scans a stream of ACIP into tsheg bars. If errors is
|
||||
* non-null, error messages will be appended to it. You can
|
||||
* recover both errors and warnings (modulo offset information)
|
||||
* from the result, though. Returns a list of ACIPStrings that
|
||||
* is the scan, or null if more than maxErrors occur. <p>FIXME:
|
||||
* not so efficient; copies the whole file into memory first.
|
||||
* @throws IOException if we cannot read the whole ACIP stream */
|
||||
public static ArrayList scanStream(InputStream stream, StringBuffer errors,
|
||||
int maxErrors)
|
||||
throws IOException
|
||||
{
|
||||
StringBuffer s = new StringBuffer();
|
||||
char ch[] = new char[8192];
|
||||
BufferedReader in
|
||||
= new BufferedReader(new InputStreamReader(new FileInputStream(fname),
|
||||
"US-ASCII"));
|
||||
= new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
|
||||
|
||||
int amt;
|
||||
while (-1 != (amt = in.read(ch))) {
|
||||
s.append(ch, 0, amt);
|
||||
}
|
||||
in.close();
|
||||
return scan(s.toString(), errors, !strict, maxErrors);
|
||||
return scan(s.toString(), errors, maxErrors);
|
||||
}
|
||||
|
||||
/** Returns a list of {@link ACIPString ACIPStrings} corresponding
|
||||
|
@ -99,26 +109,25 @@ public class ACIPTshegBarScanner {
|
|||
* text, a tsheg bar (minus the tsheg or shad or whatever), a
|
||||
* String of inter-tsheg-bar punctuation, etc.
|
||||
*
|
||||
* <p>This not only scans; it finds all the errors a parser would
|
||||
* too, like "NYA x" and "(" and ")" and "/NYA" etc. It puts
|
||||
* those in as ACIPStrings with type {@link ACIPString#ERROR},
|
||||
* and also, if errors is non-null, appends helpful messages to
|
||||
* errors, each followed by a '\n'. There is at least one case
|
||||
* where no ERROR ACIPString will appear but errors will be
|
||||
* modified.
|
||||
* <p>This not only scans; it finds all the errors and warnings a
|
||||
* parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
|
||||
* It puts those in as ACIPStrings with type {@link
|
||||
* ACIPString#ERROR} or {@link ACIPString#WARNING}, and also, if
|
||||
* errors is non-null, appends helpful messages to errors, each
|
||||
* followed by a '\n'.
|
||||
* @param s the ACIP text
|
||||
* @param errors if non-null, the buffer to which to append error
|
||||
* messages
|
||||
* @param lenientPeriods if and only if this is true, periods
|
||||
* will never cause errors, even if iffy text like "PAS... LA "
|
||||
* appears.
|
||||
* messages (DLC FIXME: kludge, just get this info by scanning
|
||||
* the result for ACIPString.ERROR (and maybe ACIPString.WARNING,
|
||||
* if you care about warnings), but then we'd have to put the
|
||||
* Offset info in the ACIPString)
|
||||
* @param maxErrors if nonnegative, then scanning will stop when
|
||||
* more than maxErrors errors occur. In this event, null is
|
||||
* returned.
|
||||
* @return null if more than maxErrors errors occur, or the scan
|
||||
* otherwise
|
||||
*/
|
||||
public static ArrayList scan(String s, StringBuffer errors, boolean lenientPeriods, int maxErrors) {
|
||||
public static ArrayList scan(String s, StringBuffer errors, int maxErrors) {
|
||||
|
||||
// the size depends on whether it's mostly Tibetan or mostly
|
||||
// Latin and a number of other factors. This is meant to be
|
||||
|
@ -159,9 +168,9 @@ public class ACIPTshegBarScanner {
|
|||
al.add(new ACIPString(s.substring(startOfString, i),
|
||||
currentType));
|
||||
}
|
||||
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
if (!waitingForMatchingIllegalClose) {
|
||||
al.add(new ACIPString("Found a truly unmatched close bracket, " + s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
if (null != errors) {
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found a truly unmatched close bracket, ] or }.\n");
|
||||
|
@ -169,6 +178,8 @@ public class ACIPTshegBarScanner {
|
|||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
}
|
||||
waitingForMatchingIllegalClose = false;
|
||||
al.add(new ACIPString("Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
|
||||
|
@ -422,9 +433,9 @@ public class ACIPTshegBarScanner {
|
|||
// This is an error. Sometimes [COMMENTS APPEAR
|
||||
// WITHOUT # MARKS]. Though "... [" could cause
|
||||
// this too.
|
||||
al.add(new ACIPString("Found an illegal open bracket: " + s.substring(i, i+1),
|
||||
ACIPString.ERROR));
|
||||
if (waitingForMatchingIllegalClose) {
|
||||
al.add(new ACIPString("Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors) {
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found a truly unmatched open bracket, [ or {, prior to this current illegal open bracket.\n");
|
||||
|
@ -443,6 +454,8 @@ public class ACIPTshegBarScanner {
|
|||
inContext = inContext + "...";
|
||||
}
|
||||
}
|
||||
al.add(new ACIPString("Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?",
|
||||
ACIPString.ERROR));
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "Found an illegal open bracket (in context, this is " + inContext + "). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
|
@ -729,23 +742,17 @@ public class ACIPTshegBarScanner {
|
|||
currentType = ACIPString.ERROR;
|
||||
}
|
||||
// . is used for a non-breaking tsheg, such as in
|
||||
// {NGO.,} and {....,DAM}. We give an error unless ,
|
||||
// {NGO.,} and {....,DAM}. We give a warning unless ,
|
||||
// or ., or [A-Za-z] follows '.'.
|
||||
if (lenientPeriods
|
||||
|| (i + 1 < sl
|
||||
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|
||||
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|
||||
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|
||||
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
} else {
|
||||
al.add(new ACIPString(s.substring(i, i+1),
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
if (!(i + 1 < sl
|
||||
&& (s.charAt(i+1) == '.' || s.charAt(i+1) == ','
|
||||
|| (s.charAt(i+1) == '\r' || s.charAt(i+1) == '\n')
|
||||
|| (s.charAt(i+1) >= 'a' && s.charAt(i+1) <= 'z')
|
||||
|| (s.charAt(i+1) >= 'A' && s.charAt(i+1) <= 'Z')))) {
|
||||
al.add(new ACIPString("A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors)
|
||||
errors.append("Offset " + i + " or maybe " + (i-numNewlines) + ": "
|
||||
+ "A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n");
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
ACIPString.WARNING));
|
||||
}
|
||||
startOfString = i+1;
|
||||
break; // end '.' case
|
||||
|
@ -832,16 +839,11 @@ public class ACIPTshegBarScanner {
|
|||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
}
|
||||
if (!bracketTypeStack.empty()) {
|
||||
al.add(new ACIPString("UNEXPECTED END OF INPUT",
|
||||
al.add(new ACIPString("Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.",
|
||||
ACIPString.ERROR));
|
||||
if (null != errors) {
|
||||
if (ACIPString.COMMENT == currentType) {
|
||||
errors.append("Offset END: "
|
||||
+ "Unmatched open bracket found. A comment does not terminate.\n");
|
||||
} else {
|
||||
errors.append("Offset END: "
|
||||
+ "Unmatched open bracket found. A correction does not terminate.\n");
|
||||
}
|
||||
errors.append("Offset END: "
|
||||
+ "Unmatched open bracket found. A " + ((ACIPString.COMMENT == currentType) ? "comment" : "correction") + " does not terminate.\n");
|
||||
}
|
||||
if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
|
||||
}
|
||||
|
|
|
@ -102,10 +102,10 @@ public class PackageTest extends TestCase {
|
|||
assertTrue(null == expectedLegalParses || expectedLegalParses.length == 0);
|
||||
return;
|
||||
} else {
|
||||
if (pt.getWarning(false, l, acip) != null) {
|
||||
System.out.println(pt.getWarning(false, l, acip));
|
||||
} else if (pt.getWarning(true, l, acip) != null)
|
||||
if (sdebug || debug) System.out.println("Paranoiac warning is this: " + pt.getWarning(true, l, acip));
|
||||
if (pt.getWarning("Most", l, acip) != null) {
|
||||
System.out.println(pt.getWarning("Most", l, acip));
|
||||
} else if (pt.getWarning("All", l, acip) != null)
|
||||
if (sdebug || debug) System.out.println("Paranoiac warning is this: " + pt.getWarning("All", l, acip));
|
||||
}
|
||||
int np = pt.numberOfParses();
|
||||
boolean goodness = expectedParses == null || expectedParses.length == np;
|
||||
|
@ -7049,12 +7049,8 @@ tstHelper("ZUR");
|
|||
}
|
||||
|
||||
private static void shelp(String s, String expectedErrors, String expectedScan) {
|
||||
shelp(s, expectedErrors, false, expectedScan);
|
||||
}
|
||||
|
||||
private static void shelp(String s, String expectedErrors, boolean lenientPeriods, String expectedScan) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
ArrayList al = ACIPTshegBarScanner.scan(s, errors, lenientPeriods, -1);
|
||||
ArrayList al = ACIPTshegBarScanner.scan(s, errors, -1);
|
||||
if (null != expectedScan) {
|
||||
if (!al.toString().equals(expectedScan)) {
|
||||
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
|
||||
|
@ -7075,18 +7071,14 @@ tstHelper("ZUR");
|
|||
}
|
||||
}
|
||||
|
||||
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, boolean, int)}. */
|
||||
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, int)}. */
|
||||
public void testScanner() {
|
||||
shelp("LA...SGRUB",
|
||||
"",
|
||||
"[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]"); // DLC FIXME
|
||||
shelp("PAS... LA",
|
||||
"Offset 5 or maybe 5: A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".\n",
|
||||
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, ERROR:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
|
||||
"[TIBETAN_NON_PUNCTUATION:{LA}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_NON_PUNCTUATION:{SGRUB}]");
|
||||
shelp("PAS... LA",
|
||||
"",
|
||||
true,
|
||||
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
|
||||
"[TIBETAN_NON_PUNCTUATION:{PAS}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, TIBETAN_PUNCTUATION:{.}, WARNING:{A non-breaking tsheg, '.', appeared, but not like \"...,\" or \".,\" or \".dA\" or \".DA\".}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{LA}]");
|
||||
shelp("^GONG SA,",
|
||||
"",
|
||||
"[TIBETAN_NON_PUNCTUATION:{^GONG}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{SA}, TIBETAN_PUNCTUATION:{,}]");
|
||||
|
@ -7220,7 +7212,7 @@ tstHelper("ZUR");
|
|||
}
|
||||
private static void uhelp(String acip, String expectedUnicode) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true);
|
||||
String unicode = ACIPConverter.convertToUnicode(acip, errors, null, true, "Most");
|
||||
if (null == unicode) {
|
||||
if (null != expectedUnicode && "none" != expectedUnicode) {
|
||||
System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
|
||||
|
|
|
@ -139,7 +139,7 @@ class TParseTree {
|
|||
|
||||
// We give a warning about these, optionally, so that
|
||||
// users can produce output that even a dumb ACIP reader
|
||||
// can understand. See getWarning(true, ..).
|
||||
// can understand. See getWarning("All", ..).
|
||||
|
||||
// if j is in this list, then up.get(j) is still a
|
||||
// potential winner.
|
||||
|
@ -246,16 +246,24 @@ class TParseTree {
|
|||
|
||||
/** Returns null if this parse tree is perfectly legal and valid.
|
||||
* Returns a warning for users otherwise. If and only if
|
||||
* paranoid is true, then even unambiguous ACIP like PADMA, which
|
||||
* could be improved by being written as PAD+MA, will cause a
|
||||
* warning.
|
||||
* @param paranoid true if you do not mind a lot of warnings
|
||||
* warningLevel is "All", then even unambiguous ACIP like PADMA,
|
||||
* which could be improved by being written as PAD+MA, will cause
|
||||
* a warning.
|
||||
* @param warningLevel "All" if you're paranoid, "Most" to see
|
||||
* warnings about lacking vowels on final stacks, "Some" to see
|
||||
* warnings about lacking vowels on non-final stacks and also
|
||||
* warnings about when prefix rules affect you, "None" if you
|
||||
* like to see IllegalArgumentExceptions.
|
||||
* @param pl the pair list from which this parse tree originated
|
||||
* @param originalACIP the original ACIP, or null if you want
|
||||
* this parse tree to make a best guess. */
|
||||
public String getWarning(boolean paranoid,
|
||||
public String getWarning(String warningLevel,
|
||||
TPairList pl,
|
||||
String originalACIP) {
|
||||
if (warningLevel != "Some"
|
||||
&& warningLevel != "Most"
|
||||
&& warningLevel != "All")
|
||||
throw new IllegalArgumentException("warning level bad: is it interned?");
|
||||
|
||||
{
|
||||
TStackList bestParse = getBestParse();
|
||||
|
@ -276,19 +284,21 @@ class TParseTree {
|
|||
} else {
|
||||
if (getBestParse().hasStackWithoutVowel(pl, isLastStack)) {
|
||||
if (isLastStack[0]) {
|
||||
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
if (warningLevel == "All" || warningLevel == "Most")
|
||||
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
} else {
|
||||
return "Warning: There is a stack, before the last stack, without a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
}
|
||||
}
|
||||
if (paranoid) {
|
||||
if ("All" == warningLevel) {
|
||||
return "Warning: Though the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "} is unambiguous, it would be more computer-friendly if + signs were used to stack things because there are two (or more) ways to interpret this ACIP if you're not careful.";
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (nip.get(0).hasStackWithoutVowel(pl, isLastStack)) {
|
||||
if (isLastStack[0]) {
|
||||
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
if (warningLevel == "All" || warningLevel == "Most")
|
||||
return "Warning: The last stack does not have a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
} else {
|
||||
return "Warning: There is a stack, before the last stack, without a vowel in the ACIP {" + ((null != originalACIP) ? originalACIP : recoverACIP()) + "}";
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue