ACIP->TMW and ACIP->Unicode are now smart about when a newline is

really a newline and when a space is really a tsheg. The space in {KA ,MDO} is a tsheg, but the space in {GA ,MDO} is not.
2003-09-04 04:04:21 +00:00 · 2003-09-04 04:04:21 +00:00 · d2749cecd0
commit d2749cecd0
parent 72e531e515
2 changed files with 100 additions and 18 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@ -35,13 +35,6 @@ import org.thdl.tib.text.DuffCode;
 * @author David Chandler
 */
 public class ACIPConverter {
-    static {
-        // We don't want to load the TM or TMW font files ourselves:
-        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
-        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
-        ThdlOptions.setUserPreference("thdl.debug", true);
-    }
-
    // DLC NOW: (KA)'s info is lost when you convert to Unicode text instead of Unicode RTF.  Give an ERROR.

    /** Command-line converter.  Gives error messages on standard
@ -52,6 +45,11 @@ public class ACIPConverter {
    public static void main(String[] args)
        throws IOException
    {
+        // We don't want to load the TM or TMW font files ourselves:
+        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
+        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
+        ThdlOptions.setUserPreference("thdl.debug", true);
+
        boolean verbose = true;
        if (args.length != 1) {
            System.out.println("Bad args!  Need just the name of the ACIP text file.");
@ -216,6 +214,24 @@ public class ACIPConverter {
                         writeWarningsToOut, warningLevel);
    }

+    private static boolean peekaheadFindsSpacesAndComma(ArrayList /* of ACIPString */ scan,
+                                                        int pos) {
+        int sz = scan.size();
+        while (pos < sz) {
+            ACIPString s = (ACIPString)scan.get(pos++);
+            if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(" ")) {
+                // keep going
+            } else {
+                if (s.getType() == ACIPString.TIBETAN_PUNCTUATION && s.getText().equals(",")) {
+                    return true;
+                } else {
+                    return false;
+                }
+            }
+        }
+        return false;
+    }
+
    private static boolean convertTo(boolean toUnicode, // else to TMW
                                     ArrayList scan,
                                     OutputStream out, // for toUnicode mode
@ -232,15 +248,21 @@ public class ACIPConverter {
        if (toUnicode)
            writer
                = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
+        boolean lastGuyWasNonPunct = false;
+        TStackList lastGuy = null;
        for (int i = 0; i < sz; i++) {
            ACIPString s = (ACIPString)scan.get(i);
            int stype = s.getType();
            if (stype == ACIPString.ERROR) {
+                lastGuyWasNonPunct = false;
+                lastGuy = null;
                hasErrors = true;
                String text = "[#ERROR CONVERTING ACIP DOCUMENT: Lexical error: " + s.getText() + "]";
                if (null != writer) writer.write(text);
                if (null != tdoc) tdoc.appendRoman(text);
            } else if (stype == ACIPString.WARNING) {
+                lastGuyWasNonPunct = false;
+                lastGuy = null;
                if (writeWarningsToOut) {
                    String text = "[#WARNING CONVERTING ACIP DOCUMENT: Lexical warning: " + s.getText() + "]";
                    if (null != writer) writer.write(text);
@ -255,6 +277,8 @@ public class ACIPConverter {
                }
            } else {
                if (s.isLatin(stype)) {
+                    lastGuyWasNonPunct = false;
+                    lastGuy = null;
                    String text
                        = (((stype == ACIPString.FOLIO_MARKER) ? "{" : "")
                           + s.getText()
@ -265,6 +289,7 @@ public class ACIPConverter {
                    String unicode = null;
                    DuffCode[] duff = null;
                    if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
+                        lastGuyWasNonPunct = true;
                        TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
                        String acipError;

@ -294,6 +319,7 @@ public class ACIPConverter {
                                    if (null != errors)
                                        errors.append(errorMessage + "\n");
                                } else {
+                                    lastGuy = sl;
                                    String warning
                                        = pt.getWarning(warningLevel,
                                                        pl,
@ -332,16 +358,50 @@ public class ACIPConverter {
                            if (null != writer) unicode = "\u0F3D";
                            if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
                        } else {
-                            if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
-                            if (null != tdoc) {
-                                if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
-                                    tdoc.appendRoman(s.getText());
-                                    continue;
+                            // For ACIP, tshegs are used as both
+                            // tshegs and whitespace.  We treat a
+                            // space as a tsheg if and only if it
+                            // occurs after TIBETAN_NON_PUNCTUATION.
+                            // But "SHIG ,MDO" is an example of a
+                            // special case, needed because a tsheg is
+                            // not used after a GA in Tibetan
+                            // typesetting.
+                            boolean done = false;
+                            // DLC what about after numbers?  marks?
+                            if (s.getText().equals(" ")) {
+                                TPairList lpl = null;
+                                if (!lastGuyWasNonPunct
+                                    || (null != lastGuy
+                                        && (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
+                                        && lpl.get(0).getLeft().equals("G")
+                                        && // it's (G . anything)
+                                           // followed by some number
+                                           // of spaces (at least one,
+                                           // this one) and then a
+                                           // comma:
+                                        peekaheadFindsSpacesAndComma(scan, i+1))) {
+                                    if (null != writer) {
+                                        unicode = "    ";
+                                        done = true;
+                                    }
+                                    if (null != tdoc) {
+                                        tdoc.appendRoman("    ");
+                                        continue;
+                                    }
                                }
-                                else {
-                                    String wy = ACIPRules.getWylieForACIPOther(s.getText());
-                                    if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
-                                    duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
+                            }
+                            if (!done) {
+                                if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
+                                if (null != tdoc) {
+                                    if (s.getText().equals("\r") || s.getText().equals("\t") || s.getText().equals("\n")) {
+                                        tdoc.appendRoman(s.getText());
+                                        continue;
+                                    }
+                                    else {
+                                        String wy = ACIPRules.getWylieForACIPOther(s.getText());
+                                        if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
+                                        duff = new DuffCode[] { TibetanMachineWeb.getGlyph(wy) };
+                                    }
                                }
                            }
                        }
@ -349,6 +409,8 @@ public class ACIPConverter {
                            throw new Error("FIXME: make this an assertion 1");
                        if (null != tdoc && (null == duff || 0 == duff.length))
                            throw new Error("FIXME: make this an assertion 2");
+                        lastGuyWasNonPunct = false;
+                        lastGuy = null;
                    }
                    if (null != writer && null != unicode) writer.write(unicode);
                    if (null != tdoc) {
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@ -767,13 +767,33 @@ public class ACIPTshegBarScanner {
            case ';':
            case '`':
            case '#':
+
                // The tsheg bar ends here; new token.
                if (startOfString < i) {
                    al.add(new ACIPString(s.substring(startOfString, i),
                                          currentType));
                }
-                al.add(new ACIPString(s.substring(i, i+1),
-                                      ACIPString.TIBETAN_PUNCTUATION));
+
+                // Insert a tsheg if necessary.  ACIP files aren't
+                // careful, so "KA\r\n" and "GA\n" appear where "KA
+                // \r\n" and "GA \n" should appear.
+                if (('\r' == ch
+                     || '\n' == ch)
+                    && !al.isEmpty()
+                    && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
+                    al.add(new ACIPString(" ",
+                                          ACIPString.TIBETAN_PUNCTUATION));
+                }
+
+                // Don't add in a "\r\n" or "\n" unless there's a
+                // blank line.
+                boolean rn = false;
+                if (('\n' != ch && '\r' != ch)
+                    || ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
+                        || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n'))) {
+                    al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
+                                          ACIPString.TIBETAN_PUNCTUATION));
+                }
                startOfString = i+1;
                currentType = ACIPString.ERROR;
                break; // end TIBETAN_PUNCTUATION case