Fixed ACIP->Unicode spaces/tshegs and newlines, especially with shads.
"NGA," becomes "NGA-tsheg-," automatically now.
This commit is contained in:
parent
5c240ac072
commit
717c3b94f3
8 changed files with 151 additions and 107 deletions
|
@ -132,7 +132,10 @@ public class ACIPConverter {
|
|||
throws IOException
|
||||
{
|
||||
TibetanDocument tdoc = new TibetanDocument();
|
||||
tdoc.setRomanAttributeSet("Courier", 20); // DLC make me configurable.
|
||||
tdoc.setRomanAttributeSet(ThdlOptions.getStringOption("thdl.acip.to.x.latin.font",
|
||||
"Courier New"),
|
||||
ThdlOptions.getIntegerOption("thdl.acip.to.x.latin.font.size",
|
||||
20));
|
||||
boolean rv
|
||||
= convertToTMW(scan, tdoc, errors, warnings,
|
||||
writeWarningsToResult, warningLevel);
|
||||
|
@ -357,7 +360,7 @@ public class ACIPConverter {
|
|||
} else if (stype == ACIPString.END_SLASH) {
|
||||
if (null != writer) unicode = "\u0F3D";
|
||||
if (null != tdoc) duff = new DuffCode[] { TibetanMachineWeb.getGlyph(")") };
|
||||
} else {
|
||||
} else if (stype == ACIPString.TIBETAN_PUNCTUATION) {
|
||||
// For ACIP, tshegs are used as both
|
||||
// tshegs and whitespace. We treat a
|
||||
// space as a tsheg if and only if it
|
||||
|
@ -368,8 +371,8 @@ public class ACIPConverter {
|
|||
// typesetting.
|
||||
boolean done = false;
|
||||
// DLC what about after numbers? marks?
|
||||
TPairList lpl = null;
|
||||
if (s.getText().equals(" ")) {
|
||||
TPairList lpl = null;
|
||||
if (!lastGuyWasNonPunct
|
||||
|| (null != lastGuy
|
||||
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
|
||||
|
@ -389,7 +392,16 @@ public class ACIPConverter {
|
|||
continue;
|
||||
}
|
||||
}
|
||||
} else if (s.getText().equals(",")
|
||||
&& lastGuyWasNonPunct
|
||||
&& null != lastGuy
|
||||
&& (lpl = lastGuy.get(lastGuy.size() - 1)).size() == 1
|
||||
&& lpl.get(0).getLeft().equals("NG")) {
|
||||
DuffCode tshegDuff = TibetanMachineWeb.getGlyph(" ");
|
||||
if (null == tshegDuff) throw new Error("tsheg duff");
|
||||
tdoc.appendDuffCodes(new DuffCode[] { tshegDuff });
|
||||
}
|
||||
|
||||
if (!done) {
|
||||
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null != tdoc) {
|
||||
|
@ -406,6 +418,8 @@ public class ACIPConverter {
|
|||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
throw new Error("forgot a case");
|
||||
}
|
||||
if (null != writer && null == unicode)
|
||||
throw new Error("FIXME: make this an assertion 1");
|
||||
|
|
|
@ -21,9 +21,12 @@ package org.thdl.tib.text.ttt;
|
|||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
import org.thdl.tib.text.THDLWylieConstants;
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.TibTextUtils;
|
||||
|
||||
/** Canonizes some facts regarding the ACIP transcription system.
|
||||
* @author David Chandler */
|
||||
|
@ -460,38 +463,41 @@ class ACIPRules {
|
|||
|
||||
|
||||
|
||||
/** DLC DOC: Gets the duffcodes for vowel, such that they look good with hashKey, and appends them to r. */
|
||||
static void getDuffForACIPVowel(ArrayList r, String hashKey, String vowel) {
|
||||
/** Gets the duffcodes for vowel, such that they look good with
|
||||
* the glyph given by preceding, and appends them to r. */
|
||||
static void getDuffForACIPVowel(ArrayList r, DuffCode preceding, String vowel) {
|
||||
if (null == vowel) return;
|
||||
if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert.
|
||||
throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
|
||||
if (!TibetanMachineWeb.isKnownHashKey(hashKey)) // FIXME: expensive assertion! Use assert.
|
||||
throw new IllegalArgumentException("bad hashKey");
|
||||
|
||||
// Order matters here.
|
||||
if (vowel.indexOf("'U") >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_U));
|
||||
else {
|
||||
if (vowel.startsWith("A")) {
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.WYLIE_aVOWEL);
|
||||
} else if (vowel.indexOf("'U") >= 0) {
|
||||
TibTextUtils.getVowel(r, preceding, "U");
|
||||
} else {
|
||||
if (vowel.indexOf('\'') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_A));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.A_VOWEL);
|
||||
if (vowel.indexOf("EE") >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("ai"));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.ai_VOWEL);
|
||||
else if (vowel.indexOf('E') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_e));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.e_VOWEL);
|
||||
if (vowel.indexOf("OO") >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("au"));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.au_VOWEL);
|
||||
else if (vowel.indexOf('O') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_o));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.o_VOWEL);
|
||||
if (vowel.indexOf('I') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_i));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.i_VOWEL);
|
||||
if (vowel.indexOf('U') >= 0)
|
||||
r.add(TibetanMachineWeb.getVowel(hashKey, TibetanMachineWeb.VOWEL_u));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.u_VOWEL);
|
||||
if (vowel.indexOf('i') >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("-i"));
|
||||
TibTextUtils.getVowel(r, preceding, THDLWylieConstants.reverse_i_VOWEL);
|
||||
}
|
||||
|
||||
if (vowel.indexOf('m') >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("M"));
|
||||
if (vowel.indexOf(':') >= 0)
|
||||
r.add(TibetanMachineWeb.getGlyph("H"));
|
||||
|
||||
}
|
||||
}
|
||||
|
|
|
@ -778,11 +778,22 @@ public class ACIPTshegBarScanner {
|
|||
// careful, so "KA\r\n" and "GA\n" appear where "KA
|
||||
// \r\n" and "GA \n" should appear.
|
||||
if (('\r' == ch
|
||||
|| '\n' == ch)
|
||||
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
|
||||
&& !al.isEmpty()
|
||||
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_NON_PUNCTUATION) {
|
||||
al.add(new ACIPString(" ",
|
||||
ACIPString.TIBETAN_PUNCTUATION));
|
||||
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
|
||||
}
|
||||
|
||||
// "DANG,\nLHAG" is really "DANG, LHAG", but not always: in "MDO,\n\nKA..." the line break is followed by another one, so the space is added only when it is not.
|
||||
if (('\r' == ch
|
||||
|| ('\n' == ch && i > 0 && s.charAt(i - 1) != '\r'))
|
||||
&& !al.isEmpty()
|
||||
&& ((ACIPString)al.get(al.size() - 1)).getType() == ACIPString.TIBETAN_PUNCTUATION
|
||||
&& ((ACIPString)al.get(al.size() - 1)).getText().equals(",")
|
||||
&& s.charAt(i-1) == ','
|
||||
&& (i + (('\r' == ch) ? 2 : 1) < sl
|
||||
&& (s.charAt(i+(('\r' == ch) ? 2 : 1)) != ch))) {
|
||||
al.add(new ACIPString(" ", ACIPString.TIBETAN_PUNCTUATION));
|
||||
}
|
||||
|
||||
// Don't add in a "\r\n" or "\n" unless there's a
|
||||
|
|
|
@ -19,6 +19,7 @@ Contributor(s): ______________________________________.
|
|||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
import org.thdl.tib.text.TGCPair;
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
|
@ -612,6 +613,7 @@ class TPairList {
|
|||
/** Appends the DuffCodes that correspond to this grapheme cluster
|
||||
* to duff. Assumes this is one grapheme cluster. */
|
||||
void getDuff(ArrayList duff) {
|
||||
int previousSize = duff.size();
|
||||
StringBuffer wylieForConsonant = new StringBuffer();
|
||||
for (int x = 0; x + 1 < size(); x++) {
|
||||
wylieForConsonant.append(get(x).getWylie(false));
|
||||
|
@ -625,8 +627,15 @@ class TPairList {
|
|||
throw new Error("How did this happen?");
|
||||
}
|
||||
}
|
||||
duff.add(TibetanMachineWeb.getGlyph(hashKey));
|
||||
ACIPRules.getDuffForACIPVowel(duff, hashKey, lastPair.getRight());
|
||||
if (lastPair.getRight() == null || lastPair.equals("-")) {
|
||||
duff.add(TibetanMachineWeb.getGlyph(hashKey));
|
||||
} else {
|
||||
ACIPRules.getDuffForACIPVowel(duff,
|
||||
TibetanMachineWeb.getGlyph(hashKey),
|
||||
lastPair.getRight());
|
||||
}
|
||||
if (previousSize == duff.size())
|
||||
throw new Error("TPairList with no duffs? " + toString()); // DLC FIXME: change to assertion.
|
||||
}
|
||||
}
|
||||
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.
|
||||
|
|
|
@ -217,7 +217,7 @@ class TStackList {
|
|||
}
|
||||
return u.toString();
|
||||
}
|
||||
/** DLC DOC */
|
||||
/** Returns the DuffCodes corresponding to this stack list. */
|
||||
DuffCode[] getDuff() {
|
||||
ArrayList al = new ArrayList(size()*2); // rough estimate
|
||||
int count = 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue