Refactored so that there can be an EWTS scanner and an ACIP scanner.
This commit is contained in:
parent
7854e4fd93
commit
4c268c5ea2
6 changed files with 171 additions and 109 deletions
|
@ -297,11 +297,11 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) {
|
||||
try {
|
||||
ArrayList al
|
||||
= ACIPTshegBarScanner.scanStream(in, null,
|
||||
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
|
||||
1000 - 1),
|
||||
shortMessages,
|
||||
warningLevel);
|
||||
= ACIPTshegBarScanner.instance().scanStream(in, null,
|
||||
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
|
||||
1000 - 1),
|
||||
shortMessages,
|
||||
warningLevel);
|
||||
if (null == al)
|
||||
return 47;
|
||||
boolean embeddedWarnings = (warningLevel != "None");
|
||||
|
|
|
@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
{
|
||||
StringBuffer errors = new StringBuffer();
|
||||
String warningLevel = withWarnings ? "All" : "None";
|
||||
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, 500, false,
|
||||
warningLevel);
|
||||
ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500,
|
||||
false, warningLevel);
|
||||
if (null == al || errors.length() > 0) {
|
||||
if (errors.length() > 0)
|
||||
throw new InvalidACIPException(errors.toString());
|
||||
|
|
|
@ -26,18 +26,18 @@ import org.thdl.util.ThdlDebug;
|
|||
import org.thdl.util.ThdlOptions;
|
||||
|
||||
/**
|
||||
* This class is able to break up Strings of ACIP text (for example, an
|
||||
* entire sutra file) into tsheg bars, comments, etc. Folio markers,
|
||||
* comments, and the like are segregated (so that consumers can ensure
|
||||
* that they remain in Latin), and Tibetan passages are broken up into
|
||||
* tsheg bars.
|
||||
* This singleton class is able to break up Strings of ACIP text (for
|
||||
* example, an entire sutra file) into tsheg bars, comments, etc. Folio
|
||||
* markers, comments, and the like are segregated (so that consumers
|
||||
* can ensure that they remain in Latin), and Tibetan passages are
|
||||
* broken up into tsheg bars.
|
||||
*
|
||||
* <p><b>FIXME:</b> We should be handling {KA\n\nKHA} vs. {KA\nKHA} in
|
||||
* the parser, not here in the lexical analyzer. That'd be cleaner,
|
||||
* and more like how you'd do things if you used lex and yacc.
|
||||
*
|
||||
* @author David Chandler */
|
||||
public class ACIPTshegBarScanner {
|
||||
public class ACIPTshegBarScanner extends TTshegBarScanner {
|
||||
/** True if those ACIP snippets inside square brackets (e.g.,
|
||||
"[THIS]") are to be passed through into the output unmodified
|
||||
while retaining the brackets and if those ACIP snippets inside
|
||||
|
@ -59,9 +59,9 @@ public class ACIPTshegBarScanner {
|
|||
}
|
||||
StringBuffer errors = new StringBuffer();
|
||||
int maxErrors = 1000;
|
||||
ArrayList al = scanFile(args[0], errors, maxErrors - 1,
|
||||
"true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")),
|
||||
"All" /* memory hog */);
|
||||
ArrayList al = instance().scanFile(args[0], errors, maxErrors - 1,
|
||||
"true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")),
|
||||
"All" /* memory hog */);
|
||||
|
||||
if (null == al) {
|
||||
System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
|
||||
|
@ -83,63 +83,6 @@ public class ACIPTshegBarScanner {
|
|||
System.exit(0);
|
||||
}
|
||||
|
||||
/** Scans an ACIP file with path fname into tsheg bars. If errors
|
||||
* is non-null, error messages will be appended to it. Returns a
|
||||
* list of TStrings that is the scan. Warning and error messages
|
||||
* in the result will be long and self-contained unless
|
||||
* shortMessagse is true.
|
||||
*
|
||||
* <p>FIXME: not so efficient; copies the whole file into memory
|
||||
* first.
|
||||
*
|
||||
* @param warningLevel controls which lexical warnings you will
|
||||
* encounter
|
||||
*
|
||||
* @throws IOException if we cannot read in the ACIP input file
|
||||
* */
|
||||
public static ArrayList scanFile(String fname, StringBuffer errors,
|
||||
int maxErrors, boolean shortMessages,
|
||||
String warningLevel)
|
||||
throws IOException
|
||||
{
|
||||
return scanStream(new FileInputStream(fname),
|
||||
errors, maxErrors, shortMessages, warningLevel);
|
||||
}
|
||||
|
||||
/** Scans a stream of ACIP into tsheg bars. If errors is
|
||||
* non-null, error messages will be appended to it. You can
|
||||
* recover both errors and (optionally) warnings (modulo offset
|
||||
* information) from the result, though. They will be short
|
||||
* messages iff shortMessages is true. Returns a list of
|
||||
* TStrings that is the scan, or null if more than maxErrors
|
||||
* occur.
|
||||
*
|
||||
* <p>FIXME: not so efficient; copies the whole file into memory
|
||||
* first.
|
||||
*
|
||||
* @param warningLevel controls which lexical warnings you will
|
||||
* encounter
|
||||
*
|
||||
* @throws IOException if we cannot read the whole ACIP stream */
|
||||
public static ArrayList scanStream(InputStream stream, StringBuffer errors,
|
||||
int maxErrors, boolean shortMessages,
|
||||
String warningLevel)
|
||||
throws IOException
|
||||
{
|
||||
StringBuffer s = new StringBuffer();
|
||||
char ch[] = new char[8192];
|
||||
BufferedReader in
|
||||
= new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
|
||||
|
||||
int amt;
|
||||
while (-1 != (amt = in.read(ch))) {
|
||||
s.append(ch, 0, amt);
|
||||
}
|
||||
in.close();
|
||||
return scan(s.toString(), errors, maxErrors, shortMessages,
|
||||
warningLevel);
|
||||
}
|
||||
|
||||
/** Helper. Here because ACIP {MTHAR%\nKHA} should be treated the
|
||||
same w.r.t. tsheg insertion regardless of the lex errors and
|
||||
lex warnings found. */
|
||||
|
@ -190,33 +133,11 @@ public class ACIPTshegBarScanner {
|
|||
// DLC FIXME "H:\n\n" becomes "H: \n\n", wrongly I think. See
|
||||
// Tibetan! 5.1 section on formatting Tibetan texts.
|
||||
|
||||
/** Returns a list of {@link TString TStrings} corresponding
|
||||
* to s, possibly the empty list (when the empty string is the
|
||||
* input). Each String is either a Latin comment, some Latin
|
||||
* text, a tsheg bar (minus the tsheg or shad or whatever), a
|
||||
* String of inter-tsheg-bar punctuation, etc.
|
||||
*
|
||||
* <p>This not only scans; it finds all the errors and warnings a
|
||||
* parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
|
||||
* It puts those in as TStrings with type {@link
|
||||
* TString#ERROR} or {@link TString#WARNING}, and also, if
|
||||
* errors is non-null, appends helpful messages to errors, each
|
||||
* followed by a '\n'.
|
||||
* @param s the ACIP text
|
||||
* @param errors if non-null, the buffer to which to append error
|
||||
* messages (FIXME: kludge, just get this info by scanning
|
||||
* the result for TString.ERROR (and maybe TString.WARNING,
|
||||
* if you care about warnings), but then we'd have to put the
|
||||
* Offset info in the TString)
|
||||
* @param maxErrors if nonnegative, then scanning will stop when
|
||||
* more than maxErrors errors occur. In this event, null is
|
||||
* returned.
|
||||
* @param shortMessages true iff you want short error and warning
|
||||
* messages instead of long, self-contained error messages
|
||||
* @return null if more than maxErrors errors occur, or the scan
|
||||
* otherwise */
|
||||
public static ArrayList scan(String s, StringBuffer errors, int maxErrors,
|
||||
boolean shortMessages, String warningLevel) {
|
||||
/** See the comment in TTshegBarScanner. And note that this not
|
||||
* only scans; it finds all the errors and warnings a parser
|
||||
* would too, like "NYA x" and "(" and ")" and "/NYA" etc. */
|
||||
public ArrayList scan(String s, StringBuffer errors, int maxErrors,
|
||||
boolean shortMessages, String warningLevel) {
|
||||
// FIXME: Use less memory and time by not adding in the
|
||||
// warnings that are below threshold.
|
||||
|
||||
|
@ -1113,4 +1034,15 @@ public class ACIPTshegBarScanner {
|
|||
|| ch == 's'
|
||||
|| ch == 'h';
|
||||
}
|
||||
|
||||
/** non-public because this is a singleton */
|
||||
protected ACIPTshegBarScanner() { }
|
||||
private static ACIPTshegBarScanner singleton = null;
|
||||
/** Returns the sole instance of this class. */
|
||||
public synchronized static ACIPTshegBarScanner instance() {
|
||||
if (null == singleton) {
|
||||
singleton = new ACIPTshegBarScanner();
|
||||
}
|
||||
return singleton;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -202,7 +202,8 @@ public class PackageTest extends TestCase {
|
|||
message. */
|
||||
static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
ArrayList al = ACIPTshegBarScanner.scan(ACIP, errors, -1, false, "None");
|
||||
ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1,
|
||||
false, "None");
|
||||
if (null == al || errors.length() > 0)
|
||||
return null;
|
||||
org.thdl.tib.text.TibetanDocument tdoc
|
||||
|
@ -7357,7 +7358,8 @@ tstHelper("ZUR");
|
|||
|
||||
private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
ArrayList al = ACIPTshegBarScanner.scan(s, errors, -1, false, warningLevel);
|
||||
ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false,
|
||||
warningLevel);
|
||||
if (null != expectedScan) {
|
||||
if (!al.toString().equals(expectedScan)) {
|
||||
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
|
||||
|
|
|
@ -29,6 +29,7 @@ import org.thdl.tib.text.TibetanDocument;
|
|||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
|
||||
// TODO(DLC)[EWTS->Tibetan]: This class is broken for EWTS. But kill this class unless it needs to exist.
|
||||
/**
|
||||
* This class is able to convert an ACIP file into Tibetan Machine Web
|
||||
* and an ACIP file into Unicode. ACIP->Unicode should yield the same
|
||||
|
@ -68,9 +69,10 @@ public class TConverter {
|
|||
boolean shortMessages = false;
|
||||
String warningLevel = "Most";
|
||||
ArrayList al
|
||||
= ACIPTshegBarScanner.scanFile(args[0], errors,
|
||||
maxErrors - 1, shortMessages,
|
||||
warningLevel);
|
||||
= ACIPTshegBarScanner.instance().scanFile(args[0], errors,
|
||||
maxErrors - 1,
|
||||
shortMessages,
|
||||
warningLevel);
|
||||
|
||||
if (null == al) {
|
||||
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
|
||||
|
@ -208,8 +210,9 @@ public class TConverter {
|
|||
throw new IllegalArgumentException("Unsupported transliteration");
|
||||
}
|
||||
ByteArrayOutputStream sw = new ByteArrayOutputStream();
|
||||
ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1, shortMessages,
|
||||
warningLevel);
|
||||
ArrayList al
|
||||
= ACIPTshegBarScanner.instance().scan(acip, errors, -1,
|
||||
shortMessages, warningLevel);
|
||||
try {
|
||||
if (null != al) {
|
||||
convertToUnicodeText(al, sw, errors,
|
||||
|
@ -301,9 +304,9 @@ public class TConverter {
|
|||
{
|
||||
try {
|
||||
if (null != tdoc && (toUnicode && !toRTF))
|
||||
throw new Error("Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go.");
|
||||
throw new IllegalArgumentException("Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go.");
|
||||
if (toUnicode && toRTF)
|
||||
throw new Error("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591.");
|
||||
throw new IllegalArgumentException("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591.");
|
||||
if (!toUnicode && !toRTF)
|
||||
throw new IllegalArgumentException("ACIP->Uni.rtf, ACIP->Uni.txt, and ACIP->TMW.rtf are supported, but not ACIP->TMW.txt");
|
||||
if (toUnicode && toRTF && null == tdoc)
|
||||
|
|
125
source/org/thdl/tib/text/ttt/TTshegBarScanner.java
Normal file
125
source/org/thdl/tib/text/ttt/TTshegBarScanner.java
Normal file
|
@ -0,0 +1,125 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.io.*;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Stack;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
import org.thdl.util.ThdlOptions;
|
||||
|
||||
/**
|
||||
* A TTshegBarScanner is able to break up Strings of transliterated
|
||||
* Tibetan text (for example, an entire sutra) into bite-sized
|
||||
* components like tsheg bars. This is an abstract class.
|
||||
*
|
||||
* @author David Chandler */
|
||||
public abstract class TTshegBarScanner {

    /** Default constructor. */
    public TTshegBarScanner() { }

    /** Scans a transliteration file with path fname into tsheg bars.
     *  If errors is non-null, error messages will be appended to it.
     *  Warning and error messages in the result will be long and
     *  self-contained unless shortMessages is true.
     *
     *  <p>This is not so efficient; copies the whole file into memory
     *  first.
     *
     *  @param fname path of the input file
     *  @param errors if non-null, the buffer to which error messages
     *  are appended
     *  @param maxErrors passed through to {@link #scan} unchanged
     *  @param shortMessages true iff you want short error and warning
     *  messages
     *  @param warningLevel controls which lexical warnings you will
     *  encounter
     *  @return a list of TStrings that is the scan, as returned by
     *  {@link #scanStream}
     *  @throws IOException if we cannot read in the input file */
    public final ArrayList scanFile(String fname, StringBuffer errors,
                                    int maxErrors, boolean shortMessages,
                                    String warningLevel)
        throws IOException
    {
        // scanStream takes ownership of the stream and closes it.
        return scanStream(new FileInputStream(fname),
                          errors, maxErrors, shortMessages, warningLevel);
    }

    /** Scans a stream of transliteration into tsheg bars.  If errors
     *  is non-null, error messages will be appended to it.  You can
     *  recover both errors and (optionally) warnings (modulo offset
     *  information) from the result, though.  They will be short
     *  messages iff shortMessages is true.
     *
     *  <p>The stream is decoded as US-ASCII, read completely, and
     *  then closed -- even when reading fails -- before the text is
     *  handed to {@link #scan}.
     *
     *  <p>This is not so efficient; copies the whole stream into
     *  memory first.
     *
     *  @param stream the input; it is always closed by this method
     *  @param errors if non-null, the buffer to which error messages
     *  are appended
     *  @param maxErrors passed through to {@link #scan} unchanged
     *  @param shortMessages true iff you want short error and warning
     *  messages
     *  @param warningLevel controls which lexical warnings you will
     *  encounter
     *  @return a list of TStrings that is the scan, or null if more
     *  than maxErrors occur
     *  @throws IOException if we cannot read the whole stream */
    public final ArrayList scanStream(InputStream stream,
                                      StringBuffer errors,
                                      int maxErrors,
                                      boolean shortMessages,
                                      String warningLevel)
        throws IOException
    {
        StringBuffer s = new StringBuffer();
        char ch[] = new char[8192];
        BufferedReader in
            = new BufferedReader(new InputStreamReader(stream, "US-ASCII"));

        // Close in a finally block so that a failed read does not
        // leak the underlying stream (often a FileInputStream).
        try {
            int amt;
            while (-1 != (amt = in.read(ch))) {
                s.append(ch, 0, amt);
            }
        } finally {
            in.close();
        }
        return scan(s.toString(), errors, maxErrors, shortMessages,
                    warningLevel);
    }

    /** Returns a list of {@link TString TStrings} corresponding
     *  to s, possibly the empty list (when the empty string is the
     *  input).  Each String is either a Latin comment, some Latin
     *  text, a tsheg bar (minus the tsheg or shad or whatever), a
     *  String of inter-tsheg-bar punctuation, etc.
     *
     *  <p>This may do more than scan; it may find some errors and
     *  warnings you'd normally think of a parser (not a scanner)
     *  finding.  If so, it puts those in as TStrings with type {@link
     *  TString#ERROR} or {@link TString#WARNING}, and also, if errors
     *  is non-null, appends helpful messages to errors, each followed
     *  by a '\n'.
     *  @param s the transliterated text
     *  @param errors if non-null, the buffer to which to append error
     *  messages (FIXME: kludge, just get this info by scanning
     *  the result for TString.ERROR (and maybe TString.WARNING,
     *  if you care about warnings), but then we'd have to put the
     *  Offset info in the TString)
     *  @param maxErrors if nonnegative, then scanning will stop when
     *  more than maxErrors errors occur.  In this event, null is
     *  returned.
     *  @param shortMessages true iff you want short error and warning
     *  messages instead of long, self-contained error messages
     *  @param warningLevel controls which lexical warnings you will
     *  encounter
     *  @return null if more than maxErrors errors occur, or the scan
     *  otherwise */
    public abstract ArrayList scan(String s, StringBuffer errors, int maxErrors,
                                   boolean shortMessages, String warningLevel);
}
|
Loading…
Reference in a new issue