Refactored so that there can be an EWTS scanner and an ACIP scanner.

2005-02-21 05:37:01 +00:00 · 2005-02-21 05:37:01 +00:00 · 4c268c5ea2
commit 4c268c5ea2
parent 7854e4fd93
6 changed files with 171 additions and 109 deletions
--- a/source/org/thdl/tib/input/TibetanConverter.java
+++ b/source/org/thdl/tib/input/TibetanConverter.java
@@ -297,11 +297,11 @@ public class TibetanConverter implements FontConverterConstants {
        if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) {
            try {
                ArrayList al
-                    = ACIPTshegBarScanner.scanStream(in, null,
-                                                     ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
-                                                                                  1000 - 1),
-                                                     shortMessages,
-                                                     warningLevel);
+                    = ACIPTshegBarScanner.instance().scanStream(in, null,
+                                                                ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
+                                                                                             1000 - 1),
+                                                                shortMessages,
+                                                                warningLevel);
                if (null == al)
                    return 47;
                boolean embeddedWarnings = (warningLevel != "None");
--- a/source/org/thdl/tib/text/TibTextUtils.java
+++ b/source/org/thdl/tib/text/TibTextUtils.java
@@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants {
    {
        StringBuffer errors = new StringBuffer();
        String warningLevel = withWarnings ? "All" : "None";
-        ArrayList al = ACIPTshegBarScanner.scan(acip, errors, 500, false,
-                                                warningLevel);
+        ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500,
+                                                           false, warningLevel);
        if (null == al || errors.length() > 0) {
            if (errors.length() > 0)
                throw new InvalidACIPException(errors.toString());
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@@ -26,18 +26,18 @@ import org.thdl.util.ThdlDebug;
 import org.thdl.util.ThdlOptions;

 /**
-* This class is able to break up Strings of ACIP text (for example, an
-* entire sutra file) into tsheg bars, comments, etc. Folio markers,
-* comments, and the like are segregated (so that consumers can ensure
-* that they remain in Latin), and Tibetan passages are broken up into
-* tsheg bars.
+* This singleton class is able to break up Strings of ACIP text (for
+* example, an entire sutra file) into tsheg bars, comments, etc. Folio
+* markers, comments, and the like are segregated (so that consumers
+* can ensure that they remain in Latin), and Tibetan passages are
+* broken up into tsheg bars.
 *
 * <p><b>FIXME:</b> We should be handling {KA\n\nKHA} vs. {KA\nKHA} in
 * the parser, not here in the lexical analyzer.  That'd be cleaner,
 * and more like how you'd do things if you used lex and yacc.
 *
 * @author David Chandler */
-public class ACIPTshegBarScanner {
+public class ACIPTshegBarScanner extends TTshegBarScanner {
    /** True if those ACIP snippets inside square brackets (e.g.,
        "[THIS]") are to be passed through into the output unmodified
        while retaining the brackets and if those ACIP snippets inside
@@ -59,9 +59,9 @@ public class ACIPTshegBarScanner {
        }
        StringBuffer errors = new StringBuffer();
        int maxErrors = 1000;
-        ArrayList al = scanFile(args[0], errors, maxErrors - 1,
-                                "true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")),
-                                "All" /* memory hog */);
+        ArrayList al = instance().scanFile(args[0], errors, maxErrors - 1,
+                                           "true".equals(System.getProperty("org.thdl.tib.text.ttt.ACIPTshegBarScanner.shortMessages")),
+                                           "All" /* memory hog */);

        if (null == al) {
            System.out.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
@@ -83,63 +83,6 @@ public class ACIPTshegBarScanner {
        System.exit(0);
    }

-    /** Scans an ACIP file with path fname into tsheg bars.  If errors
-     *  is non-null, error messages will be appended to it.  Returns a
-     *  list of TStrings that is the scan.  Warning and error messages
-     *  in the result will be long and self-contained unless
-     *  shortMessagse is true.
-     *
-     *  <p>FIXME: not so efficient; copies the whole file into memory
-     *  first.
-     *
-     *  @param warningLevel controls which lexical warnings you will
-     *  encounter
-     *
-     *  @throws IOException if we cannot read in the ACIP input file
-     *  */
-    public static ArrayList scanFile(String fname, StringBuffer errors,
-                                     int maxErrors, boolean shortMessages,
-                                     String warningLevel)
-        throws IOException
-    {
-        return scanStream(new FileInputStream(fname),
-                          errors, maxErrors, shortMessages, warningLevel);
-    }
-
-    /** Scans a stream of ACIP into tsheg bars.  If errors is
-     *  non-null, error messages will be appended to it.  You can
-     *  recover both errors and (optionally) warnings (modulo offset
-     *  information) from the result, though.  They will be short
-     *  messages iff shortMessages is true.  Returns a list of
-     *  TStrings that is the scan, or null if more than maxErrors
-     *  occur.
-     *
-     *  <p>FIXME: not so efficient; copies the whole file into memory
-     *  first.
-     *
-     *  @param warningLevel controls which lexical warnings you will
-     *  encounter
-     *
-     *  @throws IOException if we cannot read the whole ACIP stream */
-    public static ArrayList scanStream(InputStream stream, StringBuffer errors,
-                                       int maxErrors, boolean shortMessages,
-                                       String warningLevel)
-        throws IOException
-    {
-        StringBuffer s = new StringBuffer();
-        char ch[] = new char[8192];
-        BufferedReader in
-            = new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
-
-        int amt;
-        while (-1 != (amt = in.read(ch))) {
-            s.append(ch, 0, amt);
-        }
-        in.close();
-        return scan(s.toString(), errors, maxErrors, shortMessages,
-                    warningLevel);
-    }
-
    /** Helper.  Here because ACIP {MTHAR%\nKHA} should be treated the
        same w.r.t. tsheg insertion regardless of the lex errors and
        lex warnings found. */
@@ -190,33 +133,11 @@ public class ACIPTshegBarScanner {
    // DLC FIXME "H:\n\n" becomes "H: \n\n", wrongly I think.  See
    // Tibetan! 5.1 section on formatting Tibetan texts.

-    /** Returns a list of {@link TString TStrings} corresponding
-     *  to s, possibly the empty list (when the empty string is the
-     *  input).  Each String is either a Latin comment, some Latin
-     *  text, a tsheg bar (minus the tsheg or shad or whatever), a
-     *  String of inter-tsheg-bar punctuation, etc.
-     *
-     *  <p>This not only scans; it finds all the errors and warnings a
-     *  parser would too, like "NYA x" and "(" and ")" and "/NYA" etc.
-     *  It puts those in as TStrings with type {@link
-     *  TString#ERROR} or {@link TString#WARNING}, and also, if
-     *  errors is non-null, appends helpful messages to errors, each
-     *  followed by a '\n'.
-     *  @param s the ACIP text
-     *  @param errors if non-null, the buffer to which to append error
-     *  messages (FIXME: kludge, just get this info by scanning
-     *  the result for TString.ERROR (and maybe TString.WARNING,
-     *  if you care about warnings), but then we'd have to put the
-     *  Offset info in the TString)
-     *  @param maxErrors if nonnegative, then scanning will stop when
-     *  more than maxErrors errors occur.  In this event, null is
-     *  returned.
-     *  @param shortMessages true iff you want short error and warning
-     *  messages instead of long, self-contained error messages
-     *  @return null if more than maxErrors errors occur, or the scan
-     *  otherwise */
-    public static ArrayList scan(String s, StringBuffer errors, int maxErrors,
-                                 boolean shortMessages, String warningLevel) {
+    /** See the comment in TTshegBarScanner.  And note that this not
+     *  only scans; it finds all the errors and warnings a parser
+     *  would too, like "NYA x" and "(" and ")" and "/NYA" etc.  */
+    public ArrayList scan(String s, StringBuffer errors, int maxErrors,
+                          boolean shortMessages, String warningLevel) {
        // FIXME: Use less memory and time by not adding in the
        // warnings that are below threshold.

@@ -1113,4 +1034,15 @@ public class ACIPTshegBarScanner {
            || ch == 's'
            || ch == 'h';
    }
+
+    /** non-public because this is a singleton */
+    protected ACIPTshegBarScanner() { }
+    private static ACIPTshegBarScanner singleton = null;
+    /** Returns the sole instance of this class. */
+    public synchronized static ACIPTshegBarScanner instance() {
+        if (null == singleton) {
+            singleton = new ACIPTshegBarScanner();
+        }
+        return singleton;
+    }
 }
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@@ -202,7 +202,8 @@ public class PackageTest extends TestCase {
        message. */
    static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
        StringBuffer errors = new StringBuffer();
-        ArrayList al = ACIPTshegBarScanner.scan(ACIP, errors, -1, false, "None");
+        ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1,
+                                                           false, "None");
        if (null == al || errors.length() > 0)
            return null;
        org.thdl.tib.text.TibetanDocument tdoc
@@ -7357,7 +7358,8 @@ tstHelper("ZUR");

    private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) {
        StringBuffer errors = new StringBuffer();
-        ArrayList al = ACIPTshegBarScanner.scan(s, errors, -1, false, warningLevel);
+        ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false,
+                                                           warningLevel);
        if (null != expectedScan) {
            if (!al.toString().equals(expectedScan)) {
                System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
--- a/source/org/thdl/tib/text/ttt/TConverter.java
+++ b/source/org/thdl/tib/text/ttt/TConverter.java
@@ -29,6 +29,7 @@ import org.thdl.tib.text.TibetanDocument;
 import org.thdl.tib.text.TibetanMachineWeb;
 import org.thdl.tib.text.DuffCode;

+// TODO(DLC)[EWTS->Tibetan]: This class is broken for EWTS.  But kill this class unless it needs to exist.
 /**
 * This class is able to convert an ACIP file into Tibetan Machine Web
 * and an ACIP file into Unicode.  ACIP->Unicode should yield the same
@@ -68,9 +69,10 @@ public class TConverter {
        boolean shortMessages = false;
        String warningLevel = "Most";
        ArrayList al
-            = ACIPTshegBarScanner.scanFile(args[0], errors,
-                                           maxErrors - 1, shortMessages,
-                                           warningLevel);
+            = ACIPTshegBarScanner.instance().scanFile(args[0], errors,
+                                                      maxErrors - 1,
+                                                      shortMessages,
+                                                      warningLevel);

        if (null == al) {
            System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
@@ -208,8 +210,9 @@ public class TConverter {
            throw new IllegalArgumentException("Unsupported transliteration");
        }
        ByteArrayOutputStream sw = new ByteArrayOutputStream();
-        ArrayList al = ACIPTshegBarScanner.scan(acip, errors, -1, shortMessages,
-                                                warningLevel);
+        ArrayList al
+            = ACIPTshegBarScanner.instance().scan(acip, errors, -1,
+                                                  shortMessages, warningLevel);
        try {
            if (null != al) {
                convertToUnicodeText(al, sw, errors,
@@ -301,9 +304,9 @@ public class TConverter {
    {
        try {
        if (null != tdoc && (toUnicode && !toRTF))
-            throw new Error("Doing both at once might work, but it's not been tested.  I bet some 'continue;' statements will need to go.");
+            throw new IllegalArgumentException("Doing both at once might work, but it's not been tested.  I bet some 'continue;' statements will need to go.");
        if (toUnicode && toRTF)
-            throw new Error("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes.  See RFE 838591.");
+            throw new IllegalArgumentException("FIXME: support this ACIP->Unicode.rtf mode so that KA (GA) shows up in two different font sizes.  See RFE 838591.");
        if (!toUnicode && !toRTF)
            throw new IllegalArgumentException("ACIP->Uni.rtf, ACIP->Uni.txt, and ACIP->TMW.rtf are supported, but not ACIP->TMW.txt");
        if (toUnicode && toRTF && null == tdoc)
--- a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
@@ -0,0 +1,125 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site 
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis, 
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the 
+License for the specific terms governing rights and limitations under the 
+License. 
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved. 
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.ttt;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.Stack;
+
+import org.thdl.util.ThdlDebug;
+import org.thdl.util.ThdlOptions;
+
+/**
+* A TTshegBarScanner is able to break up Strings of transliterated
+* Tibetan text (for example, an entire sutra) into bite-sized
+* components like tsheg bars.  This is an abstract class.
+*
+* @author David Chandler */
+public abstract class TTshegBarScanner {
+
+    /** Default constructor. */
+    public TTshegBarScanner() { }
+
+    /** Scans a transliteration file with path fname into tsheg bars.
+     *  If errors is non-null, error messages will be appended to it.
+     *  Returns a list of TStrings that is the scan.  Warning and
+     *  error messages in the result will be long and self-contained
+     *  unless shortMessages is true.
+     *
+     *  <p>This is not so efficient; copies the whole file into memory
+     *  first.
+     *
+     *  @param warningLevel controls which lexical warnings you will
+     *  encounter
+     *
+     *  @throws IOException if we cannot read in the input file
+     *  */
+    public final ArrayList scanFile(String fname, StringBuffer errors,
+                                    int maxErrors, boolean shortMessages,
+                                    String warningLevel)
+        throws IOException
+    {
+        return scanStream(new FileInputStream(fname),
+                          errors, maxErrors, shortMessages, warningLevel);
+    }
+
+    /** Scans a stream of transliteration into tsheg bars.  If errors is
+     *  non-null, error messages will be appended to it.  You can
+     *  recover both errors and (optionally) warnings (modulo offset
+     *  information) from the result, though.  They will be short
+     *  messages iff shortMessages is true.  Returns a list of
+     *  TStrings that is the scan, or null if more than maxErrors
+     *  occur.
+     *
+     *  <p>This is not so efficient; copies the whole stream into
+     *  memory first.
+     *
+     *  @param warningLevel controls which lexical warnings you will
+     *  encounter
+     *
+     *  @throws IOException if we cannot read the whole stream */
+    public final ArrayList scanStream(InputStream stream,
+                                      StringBuffer errors,
+                                      int maxErrors,
+                                      boolean shortMessages,
+                                      String warningLevel)
+        throws IOException
+    {
+        StringBuffer s = new StringBuffer();
+        char ch[] = new char[8192];
+        BufferedReader in
+            = new BufferedReader(new InputStreamReader(stream, "US-ASCII"));
+
+        int amt;
+        while (-1 != (amt = in.read(ch))) {
+            s.append(ch, 0, amt);
+        }
+        in.close();
+        return scan(s.toString(), errors, maxErrors, shortMessages,
+                    warningLevel);
+    }
+
+    /** Returns a list of {@link TString TStrings} corresponding
+     *  to s, possibly the empty list (when the empty string is the
+     *  input).  Each String is either a Latin comment, some Latin
+     *  text, a tsheg bar (minus the tsheg or shad or whatever), a
+     *  String of inter-tsheg-bar punctuation, etc.
+     *
+     *  <p>This may do more than scan; it may also report errors and
+     *  warnings that you'd normally expect a parser (not a scanner)
+     *  to find.  If so, it puts those in as TStrings with type {@link
+     *  TString#ERROR} or {@link TString#WARNING}, and also, if errors
+     *  is non-null, appends helpful messages to errors, each followed
+     *  by a '\n'.
+     *  @param s the transliterated text
+     *  @param errors if non-null, the buffer to which to append error
+     *  messages (FIXME: kludge, just get this info by scanning
+     *  the result for TString.ERROR (and maybe TString.WARNING,
+     *  if you care about warnings), but then we'd have to put the
+     *  Offset info in the TString)
+     *  @param maxErrors if nonnegative, then scanning will stop when
+     *  more than maxErrors errors occur.  In this event, null is
+     *  returned.
+     *  @param shortMessages true iff you want short error and warning
+     *  messages instead of long, self-contained error messages
+     *  @return null if more than maxErrors errors occur, or the scan
+     *  otherwise */
+    public abstract ArrayList scan(String s, StringBuffer errors, int maxErrors,
+                                   boolean shortMessages, String warningLevel);
+}