diff --git a/source/org/thdl/tib/input/TibetanConverter.java b/source/org/thdl/tib/input/TibetanConverter.java index 6c50aeb..a19a6b9 100644 --- a/source/org/thdl/tib/input/TibetanConverter.java +++ b/source/org/thdl/tib/input/TibetanConverter.java @@ -297,11 +297,11 @@ public class TibetanConverter implements FontConverterConstants { if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) { try { ArrayList al - = ACIPTshegBarScanner.scanStream(in, null, - ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have", - 1000 - 1), - shortMessages, - warningLevel); + = ACIPTshegBarScanner.instance().scanStream(in, null, + ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have", + 1000 - 1), + shortMessages, + warningLevel); if (null == al) return 47; boolean embeddedWarnings = (warningLevel != "None"); diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java index 3526f34..1327fb5 100644 --- a/source/org/thdl/tib/text/TibTextUtils.java +++ b/source/org/thdl/tib/text/TibTextUtils.java @@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants { { StringBuffer errors = new StringBuffer(); String warningLevel = withWarnings ? 
"All" : "None"; - ArrayList al = ACIPTshegBarScanner.scan(acip, errors, 500, false, - warningLevel); + ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500, + false, warningLevel); if (null == al || errors.length() > 0) { if (errors.length() > 0) throw new InvalidACIPException(errors.toString()); diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java index 17ea094..bb6eb74 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java @@ -26,18 +26,18 @@ import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; /** -* This class is able to break up Strings of ACIP text (for example, an -* entire sutra file) into tsheg bars, comments, etc. Folio markers, -* comments, and the like are segregated (so that consumers can ensure -* that they remain in Latin), and Tibetan passages are broken up into -* tsheg bars. +* This singleton class is able to break up Strings of ACIP text (for +* example, an entire sutra file) into tsheg bars, comments, etc. Folio +* markers, comments, and the like are segregated (so that consumers +* can ensure that they remain in Latin), and Tibetan passages are +* broken up into tsheg bars. * *
FIXME: We should be handling {KA\n\nKHA} vs. {KA\nKHA} in * the parser, not here in the lexical analyzer. That'd be cleaner, * and more like how you'd do things if you used lex and yacc. * * @author David Chandler */ -public class ACIPTshegBarScanner { +public class ACIPTshegBarScanner extends TTshegBarScanner { /** True if those ACIP snippets inside square brackets (e.g., "[THIS]") are to be passed through into the output unmodified while retaining the brackets and if those ACIP snippets inside @@ -59,9 +59,9 @@ public class ACIPTshegBarScanner { } StringBuffer errors = new StringBuffer(); int maxErrors = 1000; - ArrayList al = scanFile(args[0], errors, maxErrors - 1, - "true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")), - "All" /* memory hog */); + ArrayList al = instance().scanFile(args[0], errors, maxErrors - 1, + "true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")), + "All" /* memory hog */); if (null == al) { System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this"); @@ -83,63 +83,6 @@ public class ACIPTshegBarScanner { System.exit(0); } - /** Scans an ACIP file with path fname into tsheg bars. If errors - * is non-null, error messages will be appended to it. Returns a - * list of TStrings that is the scan. Warning and error messages - * in the result will be long and self-contained unless - * shortMessagse is true. - * - *
FIXME: not so efficient; copies the whole file into memory - * first. - * - * @param warningLevel controls which lexical warnings you will - * encounter - * - * @throws IOException if we cannot read in the ACIP input file - * */ - public static ArrayList scanFile(String fname, StringBuffer errors, - int maxErrors, boolean shortMessages, - String warningLevel) - throws IOException - { - return scanStream(new FileInputStream(fname), - errors, maxErrors, shortMessages, warningLevel); - } - - /** Scans a stream of ACIP into tsheg bars. If errors is - * non-null, error messages will be appended to it. You can - * recover both errors and (optionally) warnings (modulo offset - * information) from the result, though. They will be short - * messages iff shortMessages is true. Returns a list of - * TStrings that is the scan, or null if more than maxErrors - * occur. - * - *
FIXME: not so efficient; copies the whole file into memory - * first. - * - * @param warningLevel controls which lexical warnings you will - * encounter - * - * @throws IOException if we cannot read the whole ACIP stream */ - public static ArrayList scanStream(InputStream stream, StringBuffer errors, - int maxErrors, boolean shortMessages, - String warningLevel) - throws IOException - { - StringBuffer s = new StringBuffer(); - char ch[] = new char[8192]; - BufferedReader in - = new BufferedReader(new InputStreamReader(stream, "US-ASCII")); - - int amt; - while (-1 != (amt = in.read(ch))) { - s.append(ch, 0, amt); - } - in.close(); - return scan(s.toString(), errors, maxErrors, shortMessages, - warningLevel); - } - /** Helper. Here because ACIP {MTHAR%\nKHA} should be treated the same w.r.t. tsheg insertion regardless of the lex errors and lex warnings found. */ @@ -190,33 +133,11 @@ public class ACIPTshegBarScanner { // DLC FIXME "H:\n\n" becomes "H: \n\n", wrongly I think. See // Tibetan! 5.1 section on formatting Tibetan texts. - /** Returns a list of {@link TString TStrings} corresponding - * to s, possibly the empty list (when the empty string is the - * input). Each String is either a Latin comment, some Latin - * text, a tsheg bar (minus the tsheg or shad or whatever), a - * String of inter-tsheg-bar punctuation, etc. - * - *
This not only scans; it finds all the errors and warnings a - * parser would too, like "NYA x" and "(" and ")" and "/NYA" etc. - * It puts those in as TStrings with type {@link - * TString#ERROR} or {@link TString#WARNING}, and also, if - * errors is non-null, appends helpful messages to errors, each - * followed by a '\n'. - * @param s the ACIP text - * @param errors if non-null, the buffer to which to append error - * messages (FIXME: kludge, just get this info by scanning - * the result for TString.ERROR (and maybe TString.WARNING, - * if you care about warnings), but then we'd have to put the - * Offset info in the TString) - * @param maxErrors if nonnegative, then scanning will stop when - * more than maxErrors errors occur. In this event, null is - * returned. - * @param shortMessages true iff you want short error and warning - * messages instead of long, self-contained error messages - * @return null if more than maxErrors errors occur, or the scan - * otherwise */ - public static ArrayList scan(String s, StringBuffer errors, int maxErrors, - boolean shortMessages, String warningLevel) { + /** See the comment in TTshegBarScanner. And note that this not + * only scans; it finds all the errors and warnings a parser + * would too, like "NYA x" and "(" and ")" and "/NYA" etc. */ + public ArrayList scan(String s, StringBuffer errors, int maxErrors, + boolean shortMessages, String warningLevel) { // FIXME: Use less memory and time by not adding in the // warnings that are below threshold. @@ -1113,4 +1034,15 @@ public class ACIPTshegBarScanner { || ch == 's' || ch == 'h'; } + + /** non-public because this is a singleton */ + protected ACIPTshegBarScanner() { } + private static ACIPTshegBarScanner singleton = null; + /** Returns the sole instance of this class. 
*/ + public synchronized static ACIPTshegBarScanner instance() { + if (null == singleton) { + singleton = new ACIPTshegBarScanner(); + } + return singleton; + } } diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index 6246868..e8dde5b 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -202,7 +202,8 @@ public class PackageTest extends TestCase { message. */ static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) { StringBuffer errors = new StringBuffer(); - ArrayList al = ACIPTshegBarScanner.scan(ACIP, errors, -1, false, "None"); + ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1, + false, "None"); if (null == al || errors.length() > 0) return null; org.thdl.tib.text.TibetanDocument tdoc @@ -7357,7 +7358,8 @@ tstHelper("ZUR"); private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) { StringBuffer errors = new StringBuffer(); - ArrayList al = ACIPTshegBarScanner.scan(s, errors, -1, false, warningLevel); + ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false, + warningLevel); if (null != expectedScan) { if (!al.toString().equals(expectedScan)) { System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:"); diff --git a/source/org/thdl/tib/text/ttt/TConverter.java b/source/org/thdl/tib/text/ttt/TConverter.java index d0fd55e..9bbe07f 100644 --- a/source/org/thdl/tib/text/ttt/TConverter.java +++ b/source/org/thdl/tib/text/ttt/TConverter.java @@ -29,6 +29,7 @@ import org.thdl.tib.text.TibetanDocument; import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.tib.text.DuffCode; +// TODO(DLC)[EWTS->Tibetan]: This class is broken for EWTS. But kill this class unless it needs to exist. /** * This class is able to convert an ACIP file into Tibetan Machine Web * and an ACIP file into Unicode. 
ACIP->Unicode should yield the same @@ -68,9 +69,10 @@ public class TConverter { boolean shortMessages = false; String warningLevel = "Most"; ArrayList al - = ACIPTshegBarScanner.scanFile(args[0], errors, - maxErrors - 1, shortMessages, - warningLevel); + = ACIPTshegBarScanner.instance().scanFile(args[0], errors, + maxErrors - 1, + shortMessages, + warningLevel); if (null == al) { System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this"); @@ -208,8 +210,9 @@ public class TConverter { throw new IllegalArgumentException("Unsupported transliteration"); } ByteArrayOutputStream sw = new ByteArrayOutputStream(); - ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1, shortMessages, - warningLevel); + ArrayList al + = ACIPTshegBarScanner.instance().scan(acip, errors, -1, + shortMessages, warningLevel); try { if (null != al) { convertToUnicodeText(al, sw, errors, @@ -301,9 +304,9 @@ public class TConverter { { try { if (null != tdoc && (toUnicode && !toRTF)) - throw new Error("Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go."); + throw new IllegalArgumentException("Doing both at once might work, but it's not been tested. I bet some 'continue;' statements will need to go."); if (toUnicode && toRTF) - throw new Error("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. See RFE 838591."); + throw new IllegalArgumentException("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes. 
See RFE 838591."); if (!toUnicode && !toRTF) throw new IllegalArgumentException("ACIP->Uni.rtf, ACIP->Uni.txt, and ACIP->TMW.rtf are supported, but not ACIP->TMW.txt"); if (toUnicode && toRTF && null == tdoc) diff --git a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java new file mode 100644 index 0000000..f1a94f1 --- /dev/null +++ b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java @@ -0,0 +1,125 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.ttt; + +import java.io.*; +import java.util.ArrayList; +import java.util.Stack; + +import org.thdl.util.ThdlDebug; +import org.thdl.util.ThdlOptions; + +/** +* A TTshegBarScanner is able to break up Strings of transliterated +* Tibetan text (for example, an entire sutra) into bite-sized +* components like tsheg bars. This is an abstract class. +* +* @author David Chandler */ +public abstract class TTshegBarScanner { + + /** Default constructor. */ + public TTshegBarScanner() { } + + /** Scans a transliteration file with path fname into tsheg bars. + * If errors is non-null, error messages will be appended to it. + * Returns a list of TStrings that is the scan. Warning and + * error messages in the result will be long and self-contained + * unless shortMessages is true. + * + *
This is not so efficient; copies the whole file into memory + * first. + * + * @param warningLevel controls which lexical warnings you will + * encounter + * + * @throws IOException if we cannot read in the input file + * */ + public final ArrayList scanFile(String fname, StringBuffer errors, + int maxErrors, boolean shortMessages, + String warningLevel) + throws IOException + { + return scanStream(new FileInputStream(fname), + errors, maxErrors, shortMessages, warningLevel); + } + + /** Scans a stream of transliteration into tsheg bars. If errors is + * non-null, error messages will be appended to it. You can + * recover both errors and (optionally) warnings (modulo offset + * information) from the result, though. They will be short + * messages iff shortMessages is true. Returns a list of + * TStrings that is the scan, or null if more than maxErrors + * occur. + * + *
This is not so efficient; copies the whole stream into + * memory first. + * + * @param warningLevel controls which lexical warnings you will + * encounter + * + * @throws IOException if we cannot read the whole stream */ + public final ArrayList scanStream(InputStream stream, + StringBuffer errors, + int maxErrors, + boolean shortMessages, + String warningLevel) + throws IOException + { + StringBuffer s = new StringBuffer(); + char ch[] = new char[8192]; + BufferedReader in + = new BufferedReader(new InputStreamReader(stream, "US-ASCII")); + + int amt; + while (-1 != (amt = in.read(ch))) { + s.append(ch, 0, amt); + } + in.close(); + return scan(s.toString(), errors, maxErrors, shortMessages, + warningLevel); + } + + /** Returns a list of {@link TString TStrings} corresponding + * to s, possibly the empty list (when the empty string is the + * input). Each String is either a Latin comment, some Latin + * text, a tsheg bar (minus the tsheg or shad or whatever), a + * String of inter-tsheg-bar punctuation, etc. + * + *
This may do more than scan; it may find some errors and + * warnings you'd normally think of a parser (not a scanner) + * finding. If so, it puts those in as TStrings with type {@link + * TString#ERROR} or {@link TString#WARNING}, and also, if errors + * is non-null, appends helpful messages to errors, each followed + * by a '\n'. + * @param s the transliterated text + * @param errors if non-null, the buffer to which to append error + * messages (FIXME: kludge, just get this info by scanning + * the result for TString.ERROR (and maybe TString.WARNING, + * if you care about warnings), but then we'd have to put the + * Offset info in the TString) + * @param maxErrors if nonnegative, then scanning will stop when + * more than maxErrors errors occur. In this event, null is + * returned. + * @param shortMessages true iff you want short error and warning + * messages instead of long, self-contained error messages + * @return null if more than maxErrors errors occur, or the scan + * otherwise */ + public abstract ArrayList scan(String s, StringBuffer errors, int maxErrors, + boolean shortMessages, String warningLevel); +}