ACIP->TMW and ACIP->Unicode are now smart about when a newline is really a newline and when a space is really a tsheg.

The space in {KA ,MDO} is a tsheg, but the space in {GA ,MDO} is not.
2003-09-04 04:04:21 +00:00 · 2003-09-04 04:04:21 +00:00 · d2749cecd0
commit d2749cecd0
parent 72e531e515
2 changed files with 100 additions and 18 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@@ -767,13 +767,33 @@ public class ACIPTshegBarScanner {
            case ';':
            case '`':
            case '#':
+
                // The tsheg bar ends here; new token.
                if (startOfString < i) {
                    al.add(new ACIPString(s.substring(startOfString, i),
                                          currentType));
                }
-                al.add(new ACIPString(s.substring(i, i+1),
-                                      ACIPString.TIBETAN_PUNCTUATION));
+
+                // Insert a tsheg if necessary.  ACIP files aren't
+                // careful, so "KA\r\n" and "GA\n" appear where "KA
+                // \r\n" and "GA \n" should appear.
+                if (('\r' == ch
+                     || '\n' == ch)
+                    && !al.isEmpty()
+                    && ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
+                    al.add(new ACIPString(" ",
+                                          ACIPString.TIBETAN_PUNCTUATION));
+                }
+
+                // Don't add in a "\r\n" or "\n" unless there's a
+                // blank line.
+                boolean rn = false;
+                if (('\n' != ch && '\r' != ch)
+                    || ((rn = ('\n' == ch && i >= 3 && s.charAt(i-3) == '\r' && s.charAt(i-2) == '\n' && s.charAt(i-1) == '\r'))
+                        || ('\n' == ch && i >= 1 && s.charAt(i-1) == '\n'))) {
+                    al.add(new ACIPString(rn ? s.substring(i - 1, i+1) : s.substring(i, i+1),
+                                          ACIPString.TIBETAN_PUNCTUATION));
+                }
                startOfString = i+1;
                currentType = ACIPString.ERROR;
                break; // end TIBETAN_PUNCTUATION case