Fixed ACIP->Unicode spaces/tshegs and newlines, especially with shads.

"NGA," becomes "NGA-tsheg-," automatically now.
2003-09-05 05:08:47 +00:00 · 2003-09-05 05:08:47 +00:00 · 717c3b94f3
commit 717c3b94f3
parent 5c240ac072
8 changed files with 151 additions and 107 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPConverter.java
+++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java
@ -132,7 +132,10 @@ public class ACIPConverter {
        throws IOException
    {
        TibetanDocument tdoc = new TibetanDocument();
-		tdoc.setRomanAttributeSet("Courier", 20); // DLC make me configurable.
+        tdoc.setRomanAttributeSet(ThdlOptions.getStringOption("thdl.acip.to.x.latin.font",
+                                                              "Courier New"),
+                                  ThdlOptions.getIntegerOption("thdl.acip.to.x.latin.font.size",
+                                                               20));
        boolean rv
            = convertToTMW(scan, tdoc, errors, warnings,
                           writeWarningsToResult, warningLevel);
@ -357,7 +360,7 @@ public class ACIPConverter {
                        } else if (stype == ACIPString.END_SLASH) {
                            if (null != writer) unicode = "\u0F3D";
                            if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
-                        } else {
+                        } else if (stype == ACIPString.TIBETAN_PUNCTUATION) {
                            // For ACIP, tshegs are used as both
                            // tshegs and whitespace.  We treat a
                            // space as a tsheg if and only if it
@ -368,8 +371,8 @@ public class ACIPConverter {
                            // typesetting.
                            boolean done = false;
                            // DLC what about after numbers?  marks?
+                            TPairList lpl = null;
                            if (s.getText().equals(" ")) {
-                                TPairList lpl = null;
                                if (!lastGuyWasNonPunct
                                    || (null != lastGuy
                                        && (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
@ -389,7 +392,16 @@ public class ACIPConverter {
                                        continue;
                                    }
                                }
+                            } else if (s.getText().equals(",")
+                                       && lastGuyWasNonPunct
+                                       && null != lastGuy
+                                       && (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
+                                       && lpl.get(0).getLeft().equals("NG")) {
+                                DuffCode tshegDuff = TibetanMachineWeb.getGlyph(" ");
+                                if (null == tshegDuff) throw new Error("tsheg duff");
+                                tdoc.appendDuffCodes(new DuffCode[] { tshegDuff });
                            }
+
                            if (!done) {
                                if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
                                if (null != tdoc) {
@ -406,6 +418,8 @@ public class ACIPConverter {
                                    }
                                }
                            }
+                        } else {
+                            throw new Error("forgot a case");
                        }
                        if (null != writer && null == unicode)
                            throw new Error("FIXME: make this an assertion 1");
--- a/source/org/thdl/tib/text/ttt/ACIPRules.java
+++ b/source/org/thdl/tib/text/ttt/ACIPRules.java
@ -21,9 +21,12 @@ package org.thdl.tib.text.ttt;
 import java.util.HashSet;
 import java.util.ArrayList;
 import java.util.HashMap;
+import java.util.List;

 import org.thdl.tib.text.DuffCode;
+import org.thdl.tib.text.THDLWylieConstants;
 import org.thdl.tib.text.TibetanMachineWeb;
+import org.thdl.tib.text.TibTextUtils;

 /** Canonizes some facts regarding the ACIP transcription system.
 *  @author David Chandler */
@ -460,38 +463,41 @@ class ACIPRules {



-    /** DLC DOC: Gets the duffcodes for vowel, such that they look good with hashKey, and appends them to r. */
-    static void getDuffForACIPVowel(ArrayList r, String hashKey, String vowel) {
+    /** Gets the duffcodes for vowel, such that they look good with
+     *  the DuffCode preceding (the stack's glyph), and appends them to r. */
+    static void getDuffForACIPVowel(ArrayList r, DuffCode preceding, String vowel) {
        if (null == vowel) return;
        if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion!  Use assert.
            throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
-        if (!TibetanMachineWeb.isKnownHashKey(hashKey)) // FIXME: expensive assertion!  Use assert.
-            throw new IllegalArgumentException("bad hashKey");

        // Order matters here.
-        if (vowel.indexOf("'U") >= 0)
-            r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_U));
-        else {
+        if (vowel.startsWith("A")) {
+            TibTextUtils.getVowel(r, preceding, THDLWylieConstants.WYLIE_aVOWEL);
+        } else if (vowel.indexOf("'U") >= 0) {
+            TibTextUtils.getVowel(r, preceding, "U");
+        } else {
            if (vowel.indexOf('\'') >= 0)
-                r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_A));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.A_VOWEL);
            if (vowel.indexOf("EE") >= 0)
-                r.add(TibetanMachineWeb.getGlyph("ai"));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.ai_VOWEL);
            else if (vowel.indexOf('E') >= 0)
-                r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_e));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.e_VOWEL);
            if (vowel.indexOf("OO") >= 0)
-                r.add(TibetanMachineWeb.getGlyph("au"));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.au_VOWEL);
            else if (vowel.indexOf('O') >= 0)
-                r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_o));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.o_VOWEL);
            if (vowel.indexOf('I') >= 0)
-                r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_i));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.i_VOWEL);
            if (vowel.indexOf('U') >= 0)
-                r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_u));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.u_VOWEL);
            if (vowel.indexOf('i') >= 0)
-                r.add(TibetanMachineWeb.getGlyph("-i"));
+                TibTextUtils.getVowel(r, preceding, THDLWylieConstants.reverse_i_VOWEL);
        }
+
        if (vowel.indexOf('m') >= 0)
            r.add(TibetanMachineWeb.getGlyph("M"));
        if (vowel.indexOf(':') >= 0)
            r.add(TibetanMachineWeb.getGlyph("H"));
+
    }
 }
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@ -778,11 +778,22 @@ public class ACIPTshegBarScanner {
                // careful, so "KA\r\n" and "GA\n" appear where "KA
                // \r\n" and "GA \n" should appear.
                if (('\r' == ch
-                     || '\n' == ch)
+                     || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
                    && !al.isEmpty()
                    && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
-                    al.add(new ACIPString(" ",
-                                          ACIPString.TIBETAN_PUNCTUATION));
+                    al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
+                }
+
+                // "DANG,\nLHAG" is really "DANG, LHAG".  But always?  Not if you have "MDO,\n\nKA...".
+                if (('\r' == ch
+                     || ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
+                    && !al.isEmpty()
+                    && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
+                    && ((ACIPString)al.get(al.size() - 1)).getText().equals(",")
+                    && s.charAt(i-1) == ','
+                    && (i + (('\r' == ch) ? 2 : 1) < sl
+                        && (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
+                    al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
                }

                // Don't add in a "\r\n" or "\n" unless there's a
--- a/source/org/thdl/tib/text/ttt/TPairList.java
+++ b/source/org/thdl/tib/text/ttt/TPairList.java
@ -19,6 +19,7 @@ Contributor(s): ______________________________________.
 package org.thdl.tib.text.ttt;

 import org.thdl.tib.text.TibetanMachineWeb;
+import org.thdl.tib.text.DuffCode;
 import org.thdl.tib.text.TGCPair;
 import org.thdl.util.ThdlDebug;

@ -612,6 +613,7 @@ class TPairList {
    /** Appends the DuffCodes that correspond to this grapheme cluster
     *  to duff.  Assumes this is one grapheme cluster. */
    void getDuff(ArrayList duff) {
+        int previousSize = duff.size();
        StringBuffer wylieForConsonant = new StringBuffer();
        for (int x = 0; x + 1 < size(); x++) {
            wylieForConsonant.append(get(x).getWylie(false));
@ -625,8 +627,15 @@ class TPairList {
                throw new Error("How did this happen?");
            }
        }
-        duff.add(TibetanMachineWeb.getGlyph(hashKey));
-        ACIPRules.getDuffForACIPVowel(duff, hashKey, lastPair.getRight());
+        if (lastPair.getRight() == null || lastPair.equals("-")) {
+            duff.add(TibetanMachineWeb.getGlyph(hashKey));
+        } else {
+            ACIPRules.getDuffForACIPVowel(duff,
+                                          TibetanMachineWeb.getGlyph(hashKey),
+                                          lastPair.getRight());
+        }
+        if (previousSize == duff.size())
+            throw new Error("TPairList with no duffs? " + toString()); // DLC FIXME: change to assertion.
    }
 }
 // DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.
--- a/source/org/thdl/tib/text/ttt/TStackList.java
+++ b/source/org/thdl/tib/text/ttt/TStackList.java
@ -217,7 +217,7 @@ class TStackList {
        }
        return u.toString();
    }
-    /** DLC DOC */
+    /** Returns the DuffCodes corresponding to this stack list. */
    DuffCode[] getDuff() {
        ArrayList al = new ArrayList(size()*2); // rough estimate
        int count = 0;