Refactored so that there can be an EWTS scanner and an ACIP scanner.

This commit is contained in:
dchandler 2005-02-21 05:37:01 +00:00
parent 7854e4fd93
commit 4c268c5ea2
6 changed files with 171 additions and 109 deletions

View file

@ -297,11 +297,11 @@ public class TibetanConverter implements FontConverterConstants {
if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) {
try {
ArrayList al
= ACIPTshegBarScanner.scanStream(in, null,
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
1000 - 1),
shortMessages,
warningLevel);
= ACIPTshegBarScanner.instance().scanStream(in, null,
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
1000 - 1),
shortMessages,
warningLevel);
if (null == al)
return 47;
boolean embeddedWarnings = (warningLevel != "None");

View file

@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants {
{
StringBuffer errors = new StringBuffer();
String warningLevel = withWarnings ? "All" : "None";
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, 500, false,
warningLevel);
ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500,
false, warningLevel);
if (null == al || errors.length() > 0) {
if (errors.length() > 0)
throw new InvalidACIPException(errors.toString());

View file

@ -26,18 +26,18 @@ import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;
/**
* This class is able to break up Strings of ACIP text (for example, an
* entire sutra file) into tsheg bars, comments, etc. Folio markers,
* comments, and the like are segregated (so that consumers can ensure
* that they remain in Latin), and Tibetan passages are broken up into
* tsheg bars.
* This singleton class is able to break up Strings of ACIP text (for
* example, an entire sutra file) into tsheg bars, comments, etc. Folio
* markers, comments, and the like are segregated (so that consumers
* can ensure that they remain in Latin), and Tibetan passages are
* broken up into tsheg bars.
*
* <p><b>FIXME:</b> We should be handling {KA\n\nKHA} vs. {KA\nKHA} in
* the parser, not here in the lexical analyzer. That'd be cleaner,
* and more like how you'd do things if you used lex and yacc.
*
* @author David Chandler */
public class ACIPTshegBarScanner {
public class ACIPTshegBarScanner extends TTshegBarScanner {
/** True if those ACIP snippets inside square brackets (e.g.,
"[THIS]") are to be passed through into the output unmodified
while retaining the brackets and if those ACIP snippets inside
@ -59,9 +59,9 @@ public class ACIPTshegBarScanner {
}
StringBuffer errors = new StringBuffer();
int maxErrors = 1000;
ArrayList al = scanFile(args[0], errors, maxErrors - 1,
"true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")),
"All" /* memory hog */);
ArrayList al = instance().scanFile(args[0], errors, maxErrors - 1,
"true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")),
"All" /* memory hog */);
if (null == al) {
System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
@ -83,63 +83,6 @@ public class ACIPTshegBarScanner {
System.exit(0);
}
/** Opens the ACIP file named by fname and scans its contents into
 *  tsheg bars by handing the opened stream to scanStream.
 *
 *  <p>FIXME: not so efficient; the whole file is copied into memory
 *  before scanning starts.
 *
 *  @param fname path of the ACIP input file
 *  @param errors if non-null, error messages are appended to it
 *  @param maxErrors passed through unchanged to scanStream
 *  @param shortMessages if true, warning and error messages in the
 *  result are short; otherwise they are long and self-contained
 *  @param warningLevel controls which lexical warnings you will
 *  encounter
 *  @return a list of TStrings that is the scan
 *  @throws IOException if we cannot read in the ACIP input file */
public static ArrayList scanFile(String fname, StringBuffer errors,
                                 int maxErrors, boolean shortMessages,
                                 String warningLevel)
    throws IOException
{
    InputStream acipInput = new FileInputStream(fname);
    return scanStream(acipInput, errors, maxErrors, shortMessages,
                      warningLevel);
}
/** Scans a stream of ACIP into tsheg bars.  If errors is non-null,
 *  error messages will be appended to it.  You can recover both
 *  errors and (optionally) warnings (modulo offset information) from
 *  the result, though.  They will be short messages iff
 *  shortMessages is true.
 *
 *  <p>The stream is decoded as US-ASCII and is always closed before
 *  this method returns, whether reading succeeds or fails.
 *
 *  <p>FIXME: not so efficient; copies the whole stream into memory
 *  first.
 *
 *  @param stream the ACIP input; closed by this method
 *  @param errors if non-null, the buffer to which error messages are
 *  appended
 *  @param maxErrors passed through unchanged to scan
 *  @param shortMessages true iff you want short error and warning
 *  messages
 *  @param warningLevel controls which lexical warnings you will
 *  encounter
 *  @return a list of TStrings that is the scan, or null if more than
 *  maxErrors errors occur
 *  @throws IOException if we cannot read the whole ACIP stream */
public static ArrayList scanStream(InputStream stream, StringBuffer errors,
                                   int maxErrors, boolean shortMessages,
                                   String warningLevel)
    throws IOException
{
    StringBuffer s = new StringBuffer();
    char ch[] = new char[8192];
    BufferedReader in
        = new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
    try {
        int amt;
        while (-1 != (amt = in.read(ch))) {
            s.append(ch, 0, amt);
        }
    } finally {
        // Close on the failure path too; previously a read error
        // left the underlying stream open.
        in.close();
    }
    return scan(s.toString(), errors, maxErrors, shortMessages,
                warningLevel);
}
/** Helper. Here because ACIP {MTHAR%\nKHA} should be treated the
same w.r.t. tsheg insertion regardless of the lex errors and
lex warnings found. */
@ -190,33 +133,11 @@ public class ACIPTshegBarScanner {
// DLC FIXME "H:\n\n" becomes "H: \n\n", wrongly I think. See
// Tibetan! 5.1 section on formatting Tibetan texts.
/** Returns a list of {@link TString TStrings} corresponding
* to s, possibly the empty list (when the empty string is the
* input). Each String is either a Latin comment, some Latin
* text, a tsheg bar (minus the tsheg or shad or whatever), a
* String of inter-tsheg-bar punctuation, etc.
*
* <p>This not only scans; it finds all the errors and warnings a
* parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
* It puts those in as TStrings with type {@link
* TString#ERROR} or {@link TString#WARNING}, and also, if
* errors is non-null, appends helpful messages to errors, each
* followed by a '\n'.
* @param s the ACIP text
* @param errors if non-null, the buffer to which to append error
* messages (FIXME: kludge, just get this info by scanning
* the result for TString.ERROR (and maybe TString.WARNING,
* if you care about warnings), but then we'd have to put the
* Offset info in the TString)
* @param maxErrors if nonnegative, then scanning will stop when
* more than maxErrors errors occur. In this event, null is
* returned.
* @param shortMessages true iff you want short error and warning
* messages instead of long, self-contained error messages
* @return null if more than maxErrors errors occur, or the scan
* otherwise */
public static ArrayList scan(String s, StringBuffer errors, int maxErrors,
boolean shortMessages, String warningLevel) {
/** See the comment in TTshegBarScanner. And note that this not
* only scans; it finds all the errors and warnings a parser
* would too, like "NYA x" and "(" and ")" and "/NYA" etc. */
public ArrayList scan(String s, StringBuffer errors, int maxErrors,
boolean shortMessages, String warningLevel) {
// FIXME: Use less memory and time by not adding in the
// warnings that are below threshold.
@ -1113,4 +1034,15 @@ public class ACIPTshegBarScanner {
|| ch == 's'
|| ch == 'h';
}
/** Non-public because this is a singleton; callers obtain the sole
 *  instance through {@link #instance()}.  Declared protected rather
 *  than private, so a subclass can still invoke it. */
protected ACIPTshegBarScanner() { }
private static ACIPTshegBarScanner singleton = null;
/** Returns the sole instance of this class. */
public synchronized static ACIPTshegBarScanner instance() {
if (null == singleton) {
singleton = new ACIPTshegBarScanner();
}
return singleton;
}
}

View file

@ -202,7 +202,8 @@ public class PackageTest extends TestCase {
message. */
static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.scan(ACIP, errors, -1, false, "None");
ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1,
false, "None");
if (null == al || errors.length() > 0)
return null;
org.thdl.tib.text.TibetanDocument tdoc
@ -7357,7 +7358,8 @@ tstHelper("ZUR");
private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) {
StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.scan(s, errors, -1, false, warningLevel);
ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false,
warningLevel);
if (null != expectedScan) {
if (!al.toString().equals(expectedScan)) {
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");

View file

@ -29,6 +29,7 @@ import org.thdl.tib.text.TibetanDocument;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.DuffCode;
// TODO(DLC)[EWTS->Tibetan]: This class is broken for ewts. But kill this class unless it needs to exist.
/**
* This class is able to convert an ACIP file into Tibetan Machine Web
* and an ACIP file into Unicode. ACIP->Unicode should yield the same
@ -68,9 +69,10 @@ public class TConverter {
boolean shortMessages = false;
String warningLevel = "Most";
ArrayList al
= ACIPTshegBarScanner.scanFile(args[0], errors,
maxErrors - 1, shortMessages,
warningLevel);
= ACIPTshegBarScanner.instance().scanFile(args[0], errors,
maxErrors - 1,
shortMessages,
warningLevel);
if (null == al) {
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
@ -208,8 +210,9 @@ public class TConverter {
throw new IllegalArgumentException("Unsupported transliteration");
}
ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1, shortMessages,
warningLevel);
ArrayList al
= ACIPTshegBarScanner.instance().scan(acip, errors, -1,
shortMessages, warningLevel);
try {
if (null != al) {
convertToUnicodeText(al, sw, errors,
@ -301,9 +304,9 @@ public class TConverter {
{
try {
if (null != tdoc && (toUnicode && !toRTF))
throw new Error("Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go.");
throw new IllegalArgumentException("Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go.");
if (toUnicode && toRTF)
throw new Error("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591.");
throw new IllegalArgumentException("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591.");
if (!toUnicode && !toRTF)
throw new IllegalArgumentException("ACIP->Uni.rtf, ACIP->Uni.txt, and ACIP->TMW.rtf are supported, but not ACIP->TMW.txt");
if (toUnicode && toRTF && null == tdoc)

View file

@ -0,0 +1,125 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import java.io.*;
import java.util.ArrayList;
import java.util.Stack;
import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;
/**
* A TTshegBarScanner is able to break up Strings of transliterated
* Tibetan text (for example, an entire sutra) into bite-sized
* components like tsheg bars. This is an abstract class.
*
* @author David Chandler */
public abstract class TTshegBarScanner {
    /** Default constructor. */
    public TTshegBarScanner() { }

    /** Scans a transliteration file with path fname into tsheg bars.
     *  If errors is non-null, error messages will be appended to it.
     *  Warning and error messages in the result will be long and
     *  self-contained unless shortMessages is true.
     *
     *  <p>This is not so efficient; copies the whole file into memory
     *  first.
     *
     *  @param fname path of the input file
     *  @param errors if non-null, the buffer to which error messages
     *  are appended
     *  @param maxErrors passed through unchanged to {@link
     *  #scanStream}
     *  @param shortMessages true iff you want short error and warning
     *  messages instead of long, self-contained ones
     *  @param warningLevel controls which lexical warnings you will
     *  encounter
     *  @return a list of TStrings that is the scan, or whatever
     *  {@link #scanStream} returns for the file's contents
     *  @throws IOException if we cannot read in the input file */
    public final ArrayList scanFile(String fname, StringBuffer errors,
                                    int maxErrors, boolean shortMessages,
                                    String warningLevel)
        throws IOException
    {
        // scanStream takes ownership of the stream and closes it on
        // both the success and the failure path.
        return scanStream(new FileInputStream(fname),
                          errors, maxErrors, shortMessages, warningLevel);
    }

    /** Scans a stream of transliteration into tsheg bars.  If errors
     *  is non-null, error messages will be appended to it.  You can
     *  recover both errors and (optionally) warnings (modulo offset
     *  information) from the result, though.  They will be short
     *  messages iff shortMessages is true.
     *
     *  <p>The stream is decoded as US-ASCII and is always closed
     *  before this method returns, whether reading succeeds or
     *  fails.
     *
     *  <p>This is not so efficient; copies the whole stream into
     *  memory first.
     *
     *  @param stream the transliterated input; closed by this method
     *  @param errors if non-null, the buffer to which error messages
     *  are appended
     *  @param maxErrors passed through unchanged to {@link #scan}
     *  @param shortMessages true iff you want short error and warning
     *  messages instead of long, self-contained ones
     *  @param warningLevel controls which lexical warnings you will
     *  encounter
     *  @return a list of TStrings that is the scan, or null if more
     *  than maxErrors errors occur
     *  @throws IOException if we cannot read the whole stream */
    public final ArrayList scanStream(InputStream stream,
                                      StringBuffer errors,
                                      int maxErrors,
                                      boolean shortMessages,
                                      String warningLevel)
        throws IOException
    {
        StringBuffer s = new StringBuffer();
        char ch[] = new char[8192];
        BufferedReader in
            = new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
        try {
            int amt;
            while (-1 != (amt = in.read(ch))) {
                s.append(ch, 0, amt);
            }
        } finally {
            // Close on the failure path too; previously a read error
            // left the underlying stream (e.g. the FileInputStream
            // opened by scanFile) open.
            in.close();
        }
        return scan(s.toString(), errors, maxErrors, shortMessages,
                    warningLevel);
    }

    /** Returns a list of {@link TString TStrings} corresponding
     *  to s, possibly the empty list (when the empty string is the
     *  input).  Each String is either a Latin comment, some Latin
     *  text, a tsheg bar (minus the tsheg or shad or whatever), a
     *  String of inter-tsheg-bar punctuation, etc.
     *
     *  <p>This may do more than scan; it may find some errors and
     *  warnings you'd normally think of a parser (not a scanner)
     *  finding.  If so, it puts those in as TStrings with type {@link
     *  TString#ERROR} or {@link TString#WARNING}, and also, if errors
     *  is non-null, appends helpful messages to errors, each followed
     *  by a '\n'.
     *  @param s the transliterated text
     *  @param errors if non-null, the buffer to which to append error
     *  messages (FIXME: kludge, just get this info by scanning
     *  the result for TString.ERROR (and maybe TString.WARNING,
     *  if you care about warnings), but then we'd have to put the
     *  Offset info in the TString)
     *  @param maxErrors if nonnegative, then scanning will stop when
     *  more than maxErrors errors occur.  In this event, null is
     *  returned.
     *  @param shortMessages true iff you want short error and warning
     *  messages instead of long, self-contained error messages
     *  @param warningLevel controls which lexical warnings you will
     *  encounter
     *  @return null if more than maxErrors errors occur, or the scan
     *  otherwise */
    public abstract ArrayList scan(String s, StringBuffer errors, int maxErrors,
                                   boolean shortMessages, String warningLevel);
}