diff --git a/source/org/thdl/tib/input/DuffPaneTest.java b/source/org/thdl/tib/input/DuffPaneTest.java
index da8eba1..50c4e48 100644
--- a/source/org/thdl/tib/input/DuffPaneTest.java
+++ b/source/org/thdl/tib/input/DuffPaneTest.java
@@ -969,6 +969,22 @@ public class DuffPaneTest extends DuffPaneTestBase {
ensureKeysGiveCorrectWylie("'gas");
+ /* Chris Fynn's e-mail on Feb 21 2005 leads to these test
+ cases: */
+ {
+ ensureKeysGiveCorrectWylie("dgas");
+ ensureKeysGiveCorrectWylie("'gas");
+ ensureKeysGiveCorrectWylie("dngas");
+ ensureKeysGiveCorrectWylie("gnad");
+ ensureKeysGiveCorrectWylie("mnad");
+ ensureKeysGiveCorrectWylie("bags");
+ ensureKeysGiveCorrectWylie("dbas");
+ ensureKeysGiveCorrectWylie("'bas");
+ ensureKeysGiveCorrectWylie("mags");
+ ensureKeysGiveCorrectWylie("mangs");
+ ensureKeysGiveCorrectWylie("dmas");
+ }
+
ensureKeysGiveCorrectWylie("gangs");
ensureKeysGiveCorrectWylie("gnags");
diff --git a/source/org/thdl/tib/input/TibetanConverter.java b/source/org/thdl/tib/input/TibetanConverter.java
index a19a6b9..425a44a 100644
--- a/source/org/thdl/tib/input/TibetanConverter.java
+++ b/source/org/thdl/tib/input/TibetanConverter.java
@@ -27,7 +27,7 @@ import org.thdl.util.*;
import org.thdl.tib.text.*;
import org.thdl.tib.text.ttt.TConverter;
-import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
+import org.thdl.tib.text.ttt.ACIPTraits;
import java.util.ArrayList;
/** TibetanConverter is a command-line utility for converting to and
@@ -297,17 +297,18 @@ public class TibetanConverter implements FontConverterConstants {
if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) {
try {
ArrayList al
- = ACIPTshegBarScanner.instance().scanStream(in, null,
- ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
- 1000 - 1),
- shortMessages,
- warningLevel);
+ = ACIPTraits.instance().scanner().scanStream(in, null,
+ ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
+ 1000 - 1),
+ shortMessages,
+ warningLevel);
if (null == al)
return 47;
boolean embeddedWarnings = (warningLevel != "None");
boolean hasWarnings[] = new boolean[] { false };
if (ACIP_TO_UNI_TEXT == ct) {
- if (!TConverter.convertToUnicodeText(al, out, null,
+ if (!TConverter.convertToUnicodeText(ACIPTraits.instance(),
+ al, out, null,
null, hasWarnings,
embeddedWarnings,
warningLevel,
@@ -315,7 +316,8 @@ public class TibetanConverter implements FontConverterConstants {
return 46;
} else {
if (ct != ACIP_TO_TMW) throw new Error("badness");
- if (!TConverter.convertToTMW(al, out, null, null,
+ if (!TConverter.convertToTMW(ACIPTraits.instance(),
+ al, out, null, null,
hasWarnings,
embeddedWarnings,
warningLevel, shortMessages,
diff --git a/source/org/thdl/tib/text/TGCPair.java b/source/org/thdl/tib/text/TGCPair.java
index 1ba11d6..9276dd7 100644
--- a/source/org/thdl/tib/text/TGCPair.java
+++ b/source/org/thdl/tib/text/TGCPair.java
@@ -137,7 +137,7 @@ public class TGCPair implements THDLWylieConstants {
consonantACIP = "V";
else
consonantACIP
- = org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie);
+ = org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(consonantWylie);
if (null == consonantACIP) {
if (null != consonantWylie && consonantWylie.startsWith("R+"))
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
@@ -160,7 +160,7 @@ public class TGCPair implements THDLWylieConstants {
}
if (vowelWylie != null) {
String vowelACIP
- = org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(vowelWylie);
+ = org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(vowelWylie);
if (null == vowelACIP) {
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, "");
} else {
diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java
index 1327fb5..6c7f77e 100644
--- a/source/org/thdl/tib/text/TibTextUtils.java
+++ b/source/org/thdl/tib/text/TibTextUtils.java
@@ -25,7 +25,7 @@ import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import org.thdl.util.ThdlDebug;
-import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
+import org.thdl.tib.text.ttt.ACIPTraits;
import org.thdl.tib.text.ttt.TConverter;
import org.thdl.tib.text.tshegbar.LegalTshegBar;
import org.thdl.tib.text.tshegbar.UnicodeConstants;
@@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants {
{
StringBuffer errors = new StringBuffer();
String warningLevel = withWarnings ? "All" : "None";
- ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500,
- false, warningLevel);
+ ArrayList al = ACIPTraits.instance().scanner().scan(acip, errors, 500,
+ false, warningLevel);
if (null == al || errors.length() > 0) {
if (errors.length() > 0)
throw new InvalidACIPException(errors.toString());
@@ -348,8 +348,8 @@ public class TibTextUtils implements THDLWylieConstants {
}
try {
int tloc[] = new int[] { loc };
- TConverter.convertToTMW(al, tdoc, null, null, null,
- putWarningsInOutput, warningLevel,
+ TConverter.convertToTMW(ACIPTraits.instance(), al, tdoc, null, null,
+ null, putWarningsInOutput, warningLevel,
false, colors, tloc);
return tloc[0] - loc;
} catch (IOException e) {
@@ -1430,6 +1430,53 @@ public class TibTextUtils implements THDLWylieConstants {
candidateType = getCandidateTypeModuloAppendage(candidateType);
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
+ /* Update: Chris Fynn wrote this in response to an
+e-mail from David Chapman on Feb 21, 2005:
+
+
+When working out the rules for Tibetan and Dzongkha
+collation in Bhutan we came up with the following sequences
+that could be ambiguous:
+
+0F51 0F42 0F66
+0F60 0F42 0F66
+0F51 0F44 0F66
+0F42 0F53 0F51
+0F58 0F53 0F51
+0F56 0F42 0F66
+0F51 0F56 0F66
+0F60 0F56 0F66
+0F58 0F42 0F66
+0F58 0F44 0F66
+0F51 0F58 0F66
+
+After much consultation with experts in Bhutan it was
+decided these should always be read as follows:
+
+0F51 0F42 0F66 dgas
+0F60 0F42 0F66 'gas
+0F51 0F44 0F66 dngas *
+0F42 0F53 0F51 gnad
+0F58 0F53 0F51 mnad *
+0F56 0F42 0F66 bags
+0F51 0F56 0F66 dbas
+0F60 0F56 0F66 'bas *
+0F58 0F42 0F66 mags
+0F58 0F44 0F66 mangs
+0F51 0F58 0F66 dmas
+
+In most cases it was found that only one of the two possible
+readings actually existed as words. 0F51 0F44 0F66 , 0F58
+0F53 0F51, and 0F60 0F56 0F66 were not found as syllables in
+any known words, but the experts felt that *if* they
+occurred in Tibetan or Dzongkha text then dngas, mnad, and
+'bas would be the most likely reading.
+
+
+
+
+ Because of this e-mail, "d" followed by "b" (dbas) and "d" followed by
+ "ng" (dngas) were added to the list of exceptions below. */
/* Yes, this is ambiguous. How do we handle it? See
* this from Andres (but note that only 4 of the 14 in
* the second list are ambiguous because ra na sa and
@@ -1480,7 +1527,9 @@ public class TibTextUtils implements THDLWylieConstants {
|| wylie2.equals("n")
|| wylie2.equals("s")))
|| (wylie1.equals("d") && (wylie2.equals("g")
- || wylie2.equals("m")))
+ || wylie2.equals("m")
+ || wylie2.equals("b")
+ || wylie2.equals("ng")))
|| (wylie1.equals("b") && wylie2.equals("d"))
|| (wylie1.equals("m") && wylie2.equals("d"))
|| (wylie1.equals("'") && (wylie2.equals("g")
diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java
index e525663..c55a852 100644
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@@ -1988,7 +1988,7 @@ private static String acipForGlyph(String hashKey) {
// ~X is a special case because the EWTS is 2 characters in
// length
|| "~X".equals(hashKey)) // hard-coded EWTS value
- return org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(hashKey);
+ return org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(hashKey);
else
// else we are not be able to use it because it's not smart
// about stacks (e.g., W+W)
@@ -2116,7 +2116,7 @@ public static String getACIPForGlyph(DuffCode dc1,
// DLC FIXME: TMW.53 is probably going to come out all wrong (VA
// vs. WA) from this function, but
- // ACIPRules.getACIPForEWTS(String) seems to come through... will
+ // ACIPTraits.getACIPForEWTS(String) seems to come through... will
// it always?
String hashKey = getHashKeyForGlyph(dc1);
diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini
index 05a0eaa..beb52e3 100644
--- a/source/org/thdl/tib/text/tibwn.ini
+++ b/source/org/thdl/tib/text/tibwn.ini
@@ -9,9 +9,9 @@
// - blank lines should be ignored
// - marks a command
//
-// If you change the Wylie here, it can break the ACIP->TMW and
-// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be
-// sure to run 'ant clean check' after your change.
+// If you change the EWTS transliteration here, it can break the
+// ACIP->TMW and ACIP->Unicode conversion. So keep ACIPTraits in sync
+// with this, and be sure to run 'ant clean check' after your change.
//
// Note that some glyphs have EWTS \uF021-\uF0FF inclusive. These do
// not have anything in the Unicode column, though, because this is
@@ -37,7 +37,7 @@
// by the way.
//
// If EWTS changes, then ACIP->TMW and ACIP->Unicode will break --
-// modify ACIPRules and test test test.
+// modify ACIPTraits and test test test.
//_~32,1~0,32
@@ -645,7 +645,7 @@ r+m+m~51,4~~7,59~1,110~8,121~1,123~1,125~8,107~8,114~f62,fa8,fa8
// Note that TPairList.java's unicodeExceptionsMap must be updated if
// we change who uses U+0F6A.
R+Y~52,4~~7,60~1,110~8,120~1,123~1,125~8,106~8,113~f6a,fbb
-// R+W is mentioned in ACIPRules.java:
+// R+W is mentioned in ACIPTraits.java:
R+W~196,4~~7,61~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fba
R+sh~53,4~~7,62~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fb4
R+sh+y~54,4~~7,63~1,109~8,122~1,123~1,125~8,108~8,115~f6a,fb4,fb1
@@ -667,7 +667,7 @@ l+h+w~197,4~~7,78~1,109~8,121~1,123~1,125~8,106~8,113~f63,fb7,fad
w+y~69,4~~7,79~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb1
w+r~70,4~~7,80~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb2
w+n~195,4~~7,81~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fa3
-// w+W is mentioned in ACIPRules.java:
+// w+W is mentioned in ACIPTraits.java:
w+W~194,4~~7,82~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fba
sh+ts~71,4~~7,83~1,109~8,120~1,123~1,125~8,106~8,113~f64,fa9
sh+ts+y~72,4~~7,84~1,109~8,122~1,123~1,125~8,108~8,115~f64,fa9,fb1
diff --git a/source/org/thdl/tib/text/ttt/ACIPRules.java b/source/org/thdl/tib/text/ttt/ACIPRules.java
deleted file mode 100644
index c6c9986..0000000
--- a/source/org/thdl/tib/text/ttt/ACIPRules.java
+++ /dev/null
@@ -1,658 +0,0 @@
-/*
-The contents of this file are subject to the THDL Open Community License
-Version 1.0 (the "License"); you may not use this file except in compliance
-with the License. You may obtain a copy of the License on the THDL web site
-(http://www.thdl.org/).
-
-Software distributed under the License is distributed on an "AS IS" basis,
-WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
-License for the specific terms governing rights and limitations under the
-License.
-
-The Initial Developer of this software is the Tibetan and Himalayan Digital
-Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
-All Rights Reserved.
-
-Contributor(s): ______________________________________.
-*/
-
-package org.thdl.tib.text.ttt;
-
-import java.util.HashSet;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.StringTokenizer;
-import java.util.List;
-
-import org.thdl.util.ThdlOptions;
-import org.thdl.tib.text.DuffCode;
-import org.thdl.tib.text.THDLWylieConstants;
-import org.thdl.tib.text.TibetanMachineWeb;
-import org.thdl.tib.text.TibTextUtils;
-
-// TODO(DLC)[EWTS->Tibetan]: this and ACIPTraits -- unify?
-
-/** Canonizes some facts regarding the ACIP transcription system.
- * @author David Chandler */
-public class ACIPRules {
- /** {Ksh}, the longest consonant, has 3 characters, so this is
- * three. */
- public static int MAX_CONSONANT_LENGTH = 3;
-
- /** {'EEm:}, the longest wowel, has 5 characters, so this is
- * five. */
- public static int MAX_WOWEL_LENGTH = 5;
-
- /** For O(1) {@link #isWowel(String)} calls. */
- private static HashSet acipVowels = null;
-
- private static String[][] baseVowels = new String[][] {
- // { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel
- // numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.)
- // for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]}
- { "A", "a", "A" },
- { "I", "i", "I" },
- { "U", "u", "U" },
- { "E", "e", "Ae" },
- { "O", "o", "Ao" },
- { "EE", "ai", "Aai" },
- { "OO", "au", "Aau" },
- { "i", "-i", "A-i" }
- };
-
- /** Returns true if and only if s is an ACIP wowel. You can't
- * just call this any time -- A is both a consonant and a vowel
- * in ACIP, so you have to call this in the right context. */
- public static boolean isWowel(String s) {
- if (null == acipVowels) {
- acipVowels = new HashSet(baseVowels.length * 8);
- for (int i = 0; i < baseVowels.length; i++) {
- // I'm on my own with 'O and 'E and 'OO and 'EE, but
- // GANG'O appears and I wonder... so here they are.
- // It's consistent with 'I and 'A and 'U, at least:
- // all the vowels may appear as K'vowel. DLC FIXME:
- // ask.
-
- acipVowels.add(baseVowels[i][0]);
- acipVowels.add('\'' + baseVowels[i][0]);
- acipVowels.add(baseVowels[i][0] + 'm');
- acipVowels.add('\'' + baseVowels[i][0] + 'm');
- acipVowels.add(baseVowels[i][0] + ':');
- acipVowels.add('\'' + baseVowels[i][0] + ':');
- acipVowels.add(baseVowels[i][0] + "m:");
- acipVowels.add('\'' + baseVowels[i][0] + "m:");
-
- // Keep this code in sync with getUnicodeFor.
-
- // Keep this code in sync with getWylieForACIPVowel.
- }
- // {Pm} is treated just like {PAm}; {P:} is treated just
- // like {PA:}; {Pm:} is treated just like {PAm:}. But
- // that happens thanks to
- }
- return (acipVowels.contains(s));
- }
-
- /** For O(1) {@link #isConsonant(String)} calls. */
- private static HashSet consonants = null;
-
- /** Returns true if and only if acip is an ACIP consonant (without
- * a vowel). For example, returns true for "K", but not for
- * "KA" or "X". */
- public static boolean isConsonant(String acip) {
- if (consonants == null) {
- consonants = new HashSet();
- consonants.add("V");
- consonants.add("K");
- consonants.add("KH");
- consonants.add("G");
- consonants.add("NG");
- consonants.add("C");
- consonants.add("CH");
- consonants.add("J");
- consonants.add("NY");
- consonants.add("T");
- consonants.add("TH");
- consonants.add("D");
- consonants.add("N");
- consonants.add("P");
- consonants.add("PH");
- consonants.add("B");
- consonants.add("M");
- consonants.add("TZ");
- consonants.add("TS");
- consonants.add("DZ");
- consonants.add("W");
- consonants.add("ZH");
- consonants.add("Z");
- consonants.add("Y");
- consonants.add("R");
- consonants.add("L");
- consonants.add("SH");
- consonants.add("S");
- consonants.add("H");
- consonants.add("t");
- consonants.add("th");
- consonants.add("d");
- consonants.add("n");
- consonants.add("sh");
- consonants.add("dH");
- consonants.add("DH");
- consonants.add("BH");
- consonants.add("DZH"); // longest, MAX_CONSONANT_LENGTH characters
- consonants.add("Ksh"); // longest, MAX_CONSONANT_LENGTH characters
- consonants.add("GH");
- consonants.add("'");
- consonants.add("A");
- }
- return consonants.contains(acip);
- }
-
- /** A map from wylie to ACIP. Note that the Wylie "w" maps to
- both "V" and "W". */
- private static HashMap wylieToACIP = null;
- /** Returns the ACIP transliteration corresponding to the THDL
- Extended Wylie atom EWTS, or null if EWTS is not
- recognized. */
- public static String getACIPForEWTS(String EWTS) {
- getWylieForACIPConsonant(null);
- getWylieForACIPOther(null);
- getWylieForACIPVowel(null);
- String ans = (String)wylieToACIP.get(EWTS);
- boolean useCapitalW = false;
- if (EWTS.startsWith("w"))
- useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA.
- if (null == ans) {
- StringBuffer finalAns = new StringBuffer(EWTS.length());
- StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
- while (sTok.hasMoreTokens()) {
- String part, tok = sTok.nextToken();
- if (tok.equals("-") || tok.equals("+"))
- part = tok;
- else {
- if ("w".equals(tok)) {
- // There are only two stacks in TMW that have
- // U+0FBA: R+Wa and w+Wa. TMW->ACIP fails for
- // these unless we handle it here. (FIXME:
- // add an automated test for this).
- if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
- part = "W";
- } else {
- part = "V";
- }
- } else {
- part = (String)wylieToACIP.get(tok);
- }
- }
- if (null == part) return null;
- finalAns.append(part);
- }
- if (useCapitalW)
- finalAns.setCharAt(0, 'W');
- return finalAns.toString();
- }
- if (useCapitalW)
- return "W" + ans.substring(1);
- else
- return ans;
- }
-
- /** Registers acip->wylie mappings in toWylie; registers
- wylie->acip mappings in {@link #wylieToACIP}. */
- private static void putMapping(HashMap toWylie, String ACIP, String EWTS) {
- toWylie.put(ACIP, EWTS);
- if (null == wylieToACIP) {
- wylieToACIP = new HashMap(75);
-
- // We don't want to put "/" in toWylie:
- wylieToACIP.put("(", "/");
- wylieToACIP.put(")", "/");
- wylieToACIP.put("?", "\\");
-
- wylieToACIP.put("_", " "); // oddball.
- wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
- }
- wylieToACIP.put(EWTS, ACIP);
- }
-
- /** Returns true if and only if s is an ACIP consonant. */
- static final boolean isACIPConsonant(String s) {
- return (null != ACIPRules.getWylieForACIPConsonant(s));
- }
-
- private static HashMap acipConsonant2wylie = null;
- /** Returns the EWTS corresponding to the given ACIP consonant
- * (without the "A" vowel). Returns null if there is no such
- * EWTS.
- *
- * <p>Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y",
- * even though sometimes the EWTS for those is "w", "R", or "Y".
- * Handle that in the caller. */
- static final String getWylieForACIPConsonant(String acip) {
- if (acipConsonant2wylie == null) {
- acipConsonant2wylie = new HashMap(37);
-
- // oddball:
- putMapping(acipConsonant2wylie, "V", "w");
-
- // more oddballs:
- putMapping(acipConsonant2wylie, "DH", "d+h");
- putMapping(acipConsonant2wylie, "BH", "b+h");
- putMapping(acipConsonant2wylie, "dH", "D+h");
- putMapping(acipConsonant2wylie, "DZH", "dz+h");
- putMapping(acipConsonant2wylie, "Ksh", "k+Sh");
- putMapping(acipConsonant2wylie, "GH", "g+h");
-
-
- putMapping(acipConsonant2wylie, "K", "k");
- putMapping(acipConsonant2wylie, "KH", "kh");
- putMapping(acipConsonant2wylie, "G", "g");
- putMapping(acipConsonant2wylie, "NG", "ng");
- putMapping(acipConsonant2wylie, "C", "c");
- putMapping(acipConsonant2wylie, "CH", "ch");
- putMapping(acipConsonant2wylie, "J", "j");
- putMapping(acipConsonant2wylie, "NY", "ny");
- putMapping(acipConsonant2wylie, "T", "t");
- putMapping(acipConsonant2wylie, "TH", "th");
- putMapping(acipConsonant2wylie, "D", "d");
- putMapping(acipConsonant2wylie, "N", "n");
- putMapping(acipConsonant2wylie, "P", "p");
- putMapping(acipConsonant2wylie, "PH", "ph");
- putMapping(acipConsonant2wylie, "B", "b");
- putMapping(acipConsonant2wylie, "M", "m");
- putMapping(acipConsonant2wylie, "TZ", "ts");
- putMapping(acipConsonant2wylie, "TS", "tsh");
- putMapping(acipConsonant2wylie, "DZ", "dz");
- putMapping(acipConsonant2wylie, "W", "W"
- /* NOTE WELL: sometimes "w", sometimes "W".
- Handle this in the caller.
-
- Reasoning for "W" instead of "w": r-w and
- r+w are both known hash keys. We sort 'em
- out this way. (They are the only things
- like this according to bug report #800166.) */
- );
- putMapping(acipConsonant2wylie, "ZH", "zh");
- putMapping(acipConsonant2wylie, "Z", "z");
- putMapping(acipConsonant2wylie, "'", "'");
- putMapping(acipConsonant2wylie, "Y", "y");
- putMapping(acipConsonant2wylie, "R", "r");
- putMapping(acipConsonant2wylie, "L", "l");
- putMapping(acipConsonant2wylie, "SH", "sh");
- putMapping(acipConsonant2wylie, "S", "s");
- putMapping(acipConsonant2wylie, "H", "h");
- putMapping(acipConsonant2wylie, "A", "a");
- putMapping(acipConsonant2wylie, "t", "T");
- putMapping(acipConsonant2wylie, "th", "Th");
- putMapping(acipConsonant2wylie, "d", "D");
- putMapping(acipConsonant2wylie, "n", "N");
- putMapping(acipConsonant2wylie, "sh", "Sh");
- }
- return (String)acipConsonant2wylie.get(acip);
- }
-
- private static HashMap acipVowel2wylie = null;
- /** Returns the EWTS corresponding to the given ACIP "vowel".
- * Returns null if there is no such EWTS. */
- static final String getWylieForACIPVowel(String acip) {
- if (acipVowel2wylie == null) {
- acipVowel2wylie = new HashMap(baseVowels.length * 4);
-
- for (int i = 0; i < baseVowels.length; i++) {
- putMapping(acipVowel2wylie, baseVowels[i][0], baseVowels[i][1]);
- putMapping(acipVowel2wylie, '\'' + baseVowels[i][0], baseVowels[i][2]);
- putMapping(acipVowel2wylie, baseVowels[i][0] + 'm', baseVowels[i][1] + 'M');
- putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + 'm', baseVowels[i][2] + 'M');
- putMapping(acipVowel2wylie, baseVowels[i][0] + ':', baseVowels[i][1] + 'H');
- putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + ':', baseVowels[i][2] + 'H');
- putMapping(acipVowel2wylie, baseVowels[i][0] + "m:", baseVowels[i][1] + "MH");
- putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + "m:", baseVowels[i][2] + "MH");
- }
- // {Pm} is treated just like {PAm}; {P:} is treated just
- // like {PA:}; {Pm:} is treated just like {PAm:}. But
- // that happens thanks to
- // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
- }
- return (String)acipVowel2wylie.get(acip);
- }
-
- private static HashMap acipOther2wylie = null;
- /** Returns the EWTS corresponding to the given ACIP puncuation or
- * mark. Returns null if there is no such EWTS. */
- static final String getWylieForACIPOther(String acip) {
- if (acipOther2wylie == null) {
- acipOther2wylie = new HashMap(20);
-
- // don't use putMapping for this. We don't want TMW->ACIP
- // to produce "." for a U+0F0C because ACIP doesn't say
- // that "." means U+0F0C. It just seems to in practice
- // for ACIP Release IV texts.
- acipOther2wylie.put(".", "*");
-
- putMapping(acipOther2wylie, "m", "M");
- putMapping(acipOther2wylie, ":", "H");
- putMapping(acipOther2wylie, ",", "/");
- putMapping(acipOther2wylie, " ", " ");
- putMapping(acipOther2wylie, ";", "|");
- putMapping(acipOther2wylie, "`", "!");
- putMapping(acipOther2wylie, "*", "@#");
- // There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##");
- putMapping(acipOther2wylie, "%", "~X");
- putMapping(acipOther2wylie, "o", "X");
- putMapping(acipOther2wylie, "&", "&");
- putMapping(acipOther2wylie, "^", "\\u0F38");
-
- putMapping(acipOther2wylie, "0", "0");
- putMapping(acipOther2wylie, "1", "1");
- putMapping(acipOther2wylie, "2", "2");
- putMapping(acipOther2wylie, "3", "3");
- putMapping(acipOther2wylie, "4", "4");
- putMapping(acipOther2wylie, "5", "5");
- putMapping(acipOther2wylie, "6", "6");
- putMapping(acipOther2wylie, "7", "7");
- putMapping(acipOther2wylie, "8", "8");
- putMapping(acipOther2wylie, "9", "9");
- }
- return (String)acipOther2wylie.get(acip);
- }
-
- private static HashMap superACIP2unicode = null;
- private static HashMap subACIP2unicode = null;
- /** If acip is an ACIP consonant or vowel or punctuation mark,
- * then this returns the Unicode for it. The Unicode for the
- * subscribed form of the glyph is returned if subscribed is
- * true. Returns null if acip is unknown. */
- static String getUnicodeFor(String acip, boolean subscribed) {
- if (superACIP2unicode == null) {
- final boolean compactUnicode
- = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
- superACIP2unicode = new HashMap(144);
- subACIP2unicode = new HashMap(42);
-
- // oddball:
- subACIP2unicode.put("V", "\u0FAD");
-
- superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
- subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
- superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
- subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
- superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
- subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
- superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
- subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
- superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
- subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
- superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
- subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
- superACIP2unicode.put("K", "\u0F40");
- subACIP2unicode.put("K", "\u0F90");
- superACIP2unicode.put("KH", "\u0F41");
- subACIP2unicode.put("KH", "\u0F91");
- superACIP2unicode.put("G", "\u0F42");
- subACIP2unicode.put("G", "\u0F92");
- superACIP2unicode.put("NG", "\u0F44");
- subACIP2unicode.put("NG", "\u0F94");
- superACIP2unicode.put("C", "\u0F45");
- subACIP2unicode.put("C", "\u0F95");
- superACIP2unicode.put("CH", "\u0F46");
- subACIP2unicode.put("CH", "\u0F96");
- superACIP2unicode.put("J", "\u0F47");
- subACIP2unicode.put("J", "\u0F97");
- superACIP2unicode.put("NY", "\u0F49");
- subACIP2unicode.put("NY", "\u0F99");
- superACIP2unicode.put("T", "\u0F4F");
- subACIP2unicode.put("T", "\u0F9F");
- superACIP2unicode.put("TH", "\u0F50");
- subACIP2unicode.put("TH", "\u0FA0");
- superACIP2unicode.put("D", "\u0F51");
- subACIP2unicode.put("D", "\u0FA1");
- superACIP2unicode.put("N", "\u0F53");
- subACIP2unicode.put("N", "\u0FA3");
- superACIP2unicode.put("P", "\u0F54");
- subACIP2unicode.put("P", "\u0FA4");
- superACIP2unicode.put("PH", "\u0F55");
- subACIP2unicode.put("PH", "\u0FA5");
- superACIP2unicode.put("B", "\u0F56");
- subACIP2unicode.put("B", "\u0FA6");
- superACIP2unicode.put("M", "\u0F58");
- subACIP2unicode.put("M", "\u0FA8");
- superACIP2unicode.put("TZ", "\u0F59");
- subACIP2unicode.put("TZ", "\u0FA9");
- superACIP2unicode.put("TS", "\u0F5A");
- subACIP2unicode.put("TS", "\u0FAA");
- superACIP2unicode.put("DZ", "\u0F5B");
- subACIP2unicode.put("DZ", "\u0FAB");
- superACIP2unicode.put("W", "\u0F5D");
- subACIP2unicode.put("W", "\u0FBA"); // oddball
- superACIP2unicode.put("ZH", "\u0F5E");
- subACIP2unicode.put("ZH", "\u0FAE");
- superACIP2unicode.put("Z", "\u0F5F");
- subACIP2unicode.put("Z", "\u0FAF");
- superACIP2unicode.put("'", "\u0F60");
- subACIP2unicode.put("'", "\u0FB0");
- superACIP2unicode.put("Y", "\u0F61");
- subACIP2unicode.put("Y", "\u0FB1");
- superACIP2unicode.put("R", "\u0F62");
- subACIP2unicode.put("R", "\u0FB2");
- superACIP2unicode.put("L", "\u0F63");
- subACIP2unicode.put("L", "\u0FB3");
- superACIP2unicode.put("SH", "\u0F64");
- subACIP2unicode.put("SH", "\u0FB4");
- superACIP2unicode.put("S", "\u0F66");
- subACIP2unicode.put("S", "\u0FB6");
- superACIP2unicode.put("H", "\u0F67");
- subACIP2unicode.put("H", "\u0FB7");
- superACIP2unicode.put("A", "\u0F68");
- subACIP2unicode.put("A", "\u0FB8");
- superACIP2unicode.put("t", "\u0F4A");
- subACIP2unicode.put("t", "\u0F9A");
- superACIP2unicode.put("th", "\u0F4B");
- subACIP2unicode.put("th", "\u0F9B");
- superACIP2unicode.put("d", "\u0F4C");
- subACIP2unicode.put("d", "\u0F9C");
- superACIP2unicode.put("n", "\u0F4E");
- subACIP2unicode.put("n", "\u0F9E");
- superACIP2unicode.put("sh", "\u0F65");
- subACIP2unicode.put("sh", "\u0FB5");
-
- superACIP2unicode.put("I", "\u0F72");
- superACIP2unicode.put("E", "\u0F7A");
- superACIP2unicode.put("O", "\u0F7C");
- superACIP2unicode.put("U", "\u0F74");
- superACIP2unicode.put("OO", "\u0F7D");
- superACIP2unicode.put("EE", "\u0F7B");
- superACIP2unicode.put("i", "\u0F80");
- superACIP2unicode.put("'A", "\u0F71");
- superACIP2unicode.put("'I", "\u0F71\u0F72");
- superACIP2unicode.put("'E", "\u0F71\u0F7A");
- superACIP2unicode.put("'O", "\u0F71\u0F7C");
- superACIP2unicode.put("'U", "\u0F71\u0F74");
- superACIP2unicode.put("'OO", "\u0F71\u0F7D");
- superACIP2unicode.put("'EE", "\u0F71\u0F7B");
- superACIP2unicode.put("'i", "\u0F71\u0F80");
-
- superACIP2unicode.put("Im", "\u0F72\u0F7E");
- superACIP2unicode.put("Em", "\u0F7A\u0F7E");
- superACIP2unicode.put("Om", "\u0F7C\u0F7E");
- superACIP2unicode.put("Um", "\u0F74\u0F7E");
- superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
- superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
- superACIP2unicode.put("im", "\u0F80\u0F7E");
- superACIP2unicode.put("'Am", "\u0F71\u0F7E");
- superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
- superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
- superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
- superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
- superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
- superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
- superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
-
- superACIP2unicode.put("I:", "\u0F72\u0F7F");
- superACIP2unicode.put("E:", "\u0F7A\u0F7F");
- superACIP2unicode.put("O:", "\u0F7C\u0F7F");
- superACIP2unicode.put("U:", "\u0F74\u0F7F");
- superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
- superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
- superACIP2unicode.put("i:", "\u0F80\u0F7F");
- superACIP2unicode.put("'A:", "\u0F71\u0F7F");
- superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
- superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
- superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
- superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
- superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
- superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
- superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
-
- superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
- superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
- superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
- superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
- superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
- superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
- superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
- superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
- superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
- superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
- superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
- superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
- superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
- superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
- superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
- // :m does not appear, though you'd think it's as valid as m:.
-
- superACIP2unicode.put("m", "\u0F7E");
- superACIP2unicode.put(":", "\u0F7F");
- superACIP2unicode.put("m:", "\u0F7E\u0F7F");
-
- superACIP2unicode.put("Am", "\u0F7E");
- superACIP2unicode.put("A:", "\u0F7F");
- superACIP2unicode.put("Am:", "\u0F7E\u0F7F");
-
- superACIP2unicode.put("0", "\u0F20");
- superACIP2unicode.put("1", "\u0F21");
- superACIP2unicode.put("2", "\u0F22");
- superACIP2unicode.put("3", "\u0F23");
- superACIP2unicode.put("4", "\u0F24");
- superACIP2unicode.put("5", "\u0F25");
- superACIP2unicode.put("6", "\u0F26");
- superACIP2unicode.put("7", "\u0F27");
- superACIP2unicode.put("8", "\u0F28");
- superACIP2unicode.put("9", "\u0F29");
-
- // punctuation
- superACIP2unicode.put("&", "\u0F85");
- superACIP2unicode.put(",", "\u0F0D");
- superACIP2unicode.put(" ", "\u0F0B");
- superACIP2unicode.put(".", "\u0F0C");
- superACIP2unicode.put("`", "\u0F08");
- superACIP2unicode.put("`", "\u0F08");
- superACIP2unicode.put("*", "\u0F04\u0F05");
- superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
- superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn.
- superACIP2unicode.put("o", "\u0F37");
- superACIP2unicode.put(";", "\u0F11");
- superACIP2unicode.put("\r", "\r");
- superACIP2unicode.put("\t", "\t");
- superACIP2unicode.put("\r\n", "\r\n");
- superACIP2unicode.put("\n", "\n");
- superACIP2unicode.put("\\", "\u0F84");
- superACIP2unicode.put("^", "\u0F38");
-
- // DLC FIXME: "^ GONG" is "^GONG", right?
- // DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode?
- }
- if (subscribed) {
- String u = (String)subACIP2unicode.get(acip);
- if (null != u) return u;
- }
- return (String)superACIP2unicode.get(acip);
- }
-
-
-
- /** Gets the duffcodes for vowel, such that they look good with
- * the stack with hash key hashKey, and appends them to r. */
- static void getDuffForACIPVowel(ArrayList duff, DuffCode preceding, String vowel) {
- if (null == vowel) return;
- if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert.
- throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
-
- // Order matters here.
- boolean context_added[] = new boolean[] { false };
- if (vowel.startsWith("A")) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
- } else if (vowel.indexOf("'U") >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
- } else if (vowel.indexOf("'I") >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
- } else {
- if (vowel.indexOf('\'') >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
- }
- if (vowel.indexOf("EE") >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
- } else if (vowel.indexOf('E') >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
- }
- if (vowel.indexOf("OO") >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
- } else if (vowel.indexOf('O') >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
- }
- if (vowel.indexOf('I') >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
- }
- if (vowel.indexOf('U') >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
- }
- if (vowel.indexOf('i') >= 0) {
- TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
- }
- }
- // FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
-
- if (vowel.indexOf('m') >= 0) {
- DuffCode last = (DuffCode)duff.get(duff.size() - 1);
- duff.remove(duff.size() - 1); // getBindu will add it back...
- TibTextUtils.getBindu(duff, last);
- }
- if (vowel.indexOf(':') >= 0)
- duff.add(TibetanMachineWeb.getGlyph("H"));
- }
-
- /** Returns true if and only if l is the ACIP representation of a
- letter that can be a suffix. Note that all postsuffixes are
- also suffixes. l must not have an "A" -- use "S", not "SA",
- that is. */
- public static boolean isACIPSuffix(String l) {
- return ("S".equals(l)
- || "G".equals(l)
- || "D".equals(l)
- || "M".equals(l)
- || "'".equals(l)
- || "B".equals(l)
- || "NG".equals(l)
- || "N".equals(l)
- || "L".equals(l)
- || "R".equals(l));
- }
-
- /** Returns true if and only if l is the ACIP representation of a
- letter that can be a prefix. l must not have an "A" -- use
- "D", not "DA", that is. */
- public static boolean isACIPPrefix(String l) {
- return ("'".equals(l)
- || "M".equals(l)
- || "B".equals(l)
- || "D".equals(l)
- || "G".equals(l));
- }
-
- /** Returns true if and only if l is the ACIP representation of a
- letter that can be a postsuffix. l must not have an "A" --
- use "D", not "DA", that is. */
- public static boolean isACIPPostsuffix(String l) {
- return ("S".equals(l)
- || "D".equals(l));
- }
-}
diff --git a/source/org/thdl/tib/text/ttt/ACIPTraits.java b/source/org/thdl/tib/text/ttt/ACIPTraits.java
index 036b197..dd4abec 100644
--- a/source/org/thdl/tib/text/ttt/ACIPTraits.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTraits.java
@@ -18,11 +18,25 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
+import java.util.HashSet;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.StringTokenizer;
+import java.util.List;
+
+import org.thdl.util.ThdlOptions;
+import org.thdl.tib.text.DuffCode;
+import org.thdl.tib.text.THDLWylieConstants;
+import org.thdl.tib.text.TibetanMachineWeb;
+import org.thdl.tib.text.TibTextUtils;
+
+
/** A singleton class that should contain (but due to laziness and
* ignorance probably does not contain) all the traits that make ACIP
- * transliteration different from other (say, EWTS)
- * transliterations. */
-final class ACIPTraits implements TTraits {
+ * transliteration scheme different from other (say, EWTS)
+ * transliteration schemes. This is not safe to use in concurrent
+ * programs but it would be easy to make it so. */
+public final class ACIPTraits implements TTraits {
/** sole instance of this class */
private static ACIPTraits singleton = null;
@@ -30,7 +44,7 @@ final class ACIPTraits implements TTraits {
private ACIPTraits() { }
/** Returns the singleton instance of this class. */
- public static ACIPTraits instance() {
+ public static /* synchronized */ ACIPTraits instance() {
if (null == singleton) {
singleton = new ACIPTraits();
}
@@ -43,15 +57,536 @@ final class ACIPTraits implements TTraits {
/** Returns '-'. */
public char disambiguatorChar() { return '-'; }
- public int maxConsonantLength() { return ACIPRules.MAX_CONSONANT_LENGTH; }
+ public int maxConsonantLength() { return MAX_CONSONANT_LENGTH; }
- public int maxWowelLength() { return ACIPRules.MAX_WOWEL_LENGTH; }
-
- public boolean isConsonant(String s) { return ACIPRules.isConsonant(s); }
-
- public boolean isWowel(String s) { return ACIPRules.isWowel(s); }
+ public int maxWowelLength() { return MAX_WOWEL_LENGTH; }
public boolean hasSimpleError(TPair p) {
return ("A".equals(p.getLeft()) && null == p.getRight());
}
+
+ public String aVowel() { return "A"; }
+
+ public boolean isPostsuffix(String l) {
+ return ("S".equals(l)
+ || "D".equals(l));
+ }
+
+ public boolean isSuffix(String l) {
+ return ("S".equals(l)
+ || "G".equals(l)
+ || "D".equals(l)
+ || "M".equals(l)
+ || "'".equals(l)
+ || "B".equals(l)
+ || "NG".equals(l)
+ || "N".equals(l)
+ || "L".equals(l)
+ || "R".equals(l));
+ }
+
+ public boolean isPrefix(String l) {
+ return ("'".equals(l)
+ || "M".equals(l)
+ || "B".equals(l)
+ || "D".equals(l)
+ || "G".equals(l));
+ }
+
+ private HashMap superACIP2unicode = null;
+ private HashMap subACIP2unicode = null;
+ public /* synchronized */ String getUnicodeFor(String acip, boolean subscribed) {
+ if (superACIP2unicode == null) {
+ final boolean compactUnicode
+ = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
+ superACIP2unicode = new HashMap(144);
+ subACIP2unicode = new HashMap(42);
+
+ // oddball:
+ subACIP2unicode.put("V", "\u0FAD");
+
+ superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
+ subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
+ superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
+ subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
+ superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
+ subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
+ superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
+ subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
+ superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
+ subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
+ superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
+ subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
+ superACIP2unicode.put("K", "\u0F40");
+ subACIP2unicode.put("K", "\u0F90");
+ superACIP2unicode.put("KH", "\u0F41");
+ subACIP2unicode.put("KH", "\u0F91");
+ superACIP2unicode.put("G", "\u0F42");
+ subACIP2unicode.put("G", "\u0F92");
+ superACIP2unicode.put("NG", "\u0F44");
+ subACIP2unicode.put("NG", "\u0F94");
+ superACIP2unicode.put("C", "\u0F45");
+ subACIP2unicode.put("C", "\u0F95");
+ superACIP2unicode.put("CH", "\u0F46");
+ subACIP2unicode.put("CH", "\u0F96");
+ superACIP2unicode.put("J", "\u0F47");
+ subACIP2unicode.put("J", "\u0F97");
+ superACIP2unicode.put("NY", "\u0F49");
+ subACIP2unicode.put("NY", "\u0F99");
+ superACIP2unicode.put("T", "\u0F4F");
+ subACIP2unicode.put("T", "\u0F9F");
+ superACIP2unicode.put("TH", "\u0F50");
+ subACIP2unicode.put("TH", "\u0FA0");
+ superACIP2unicode.put("D", "\u0F51");
+ subACIP2unicode.put("D", "\u0FA1");
+ superACIP2unicode.put("N", "\u0F53");
+ subACIP2unicode.put("N", "\u0FA3");
+ superACIP2unicode.put("P", "\u0F54");
+ subACIP2unicode.put("P", "\u0FA4");
+ superACIP2unicode.put("PH", "\u0F55");
+ subACIP2unicode.put("PH", "\u0FA5");
+ superACIP2unicode.put("B", "\u0F56");
+ subACIP2unicode.put("B", "\u0FA6");
+ superACIP2unicode.put("M", "\u0F58");
+ subACIP2unicode.put("M", "\u0FA8");
+ superACIP2unicode.put("TZ", "\u0F59");
+ subACIP2unicode.put("TZ", "\u0FA9");
+ superACIP2unicode.put("TS", "\u0F5A");
+ subACIP2unicode.put("TS", "\u0FAA");
+ superACIP2unicode.put("DZ", "\u0F5B");
+ subACIP2unicode.put("DZ", "\u0FAB");
+ superACIP2unicode.put("W", "\u0F5D");
+ subACIP2unicode.put("W", "\u0FBA"); // oddball
+ superACIP2unicode.put("ZH", "\u0F5E");
+ subACIP2unicode.put("ZH", "\u0FAE");
+ superACIP2unicode.put("Z", "\u0F5F");
+ subACIP2unicode.put("Z", "\u0FAF");
+ superACIP2unicode.put("'", "\u0F60");
+ subACIP2unicode.put("'", "\u0FB0");
+ superACIP2unicode.put("Y", "\u0F61");
+ subACIP2unicode.put("Y", "\u0FB1");
+ superACIP2unicode.put("R", "\u0F62");
+ subACIP2unicode.put("R", "\u0FB2");
+ superACIP2unicode.put("L", "\u0F63");
+ subACIP2unicode.put("L", "\u0FB3");
+ superACIP2unicode.put("SH", "\u0F64");
+ subACIP2unicode.put("SH", "\u0FB4");
+ superACIP2unicode.put("S", "\u0F66");
+ subACIP2unicode.put("S", "\u0FB6");
+ superACIP2unicode.put("H", "\u0F67");
+ subACIP2unicode.put("H", "\u0FB7");
+ superACIP2unicode.put("A", "\u0F68");
+ subACIP2unicode.put("A", "\u0FB8");
+ superACIP2unicode.put("t", "\u0F4A");
+ subACIP2unicode.put("t", "\u0F9A");
+ superACIP2unicode.put("th", "\u0F4B");
+ subACIP2unicode.put("th", "\u0F9B");
+ superACIP2unicode.put("d", "\u0F4C");
+ subACIP2unicode.put("d", "\u0F9C");
+ superACIP2unicode.put("n", "\u0F4E");
+ subACIP2unicode.put("n", "\u0F9E");
+ superACIP2unicode.put("sh", "\u0F65");
+ subACIP2unicode.put("sh", "\u0FB5");
+
+ superACIP2unicode.put("I", "\u0F72");
+ superACIP2unicode.put("E", "\u0F7A");
+ superACIP2unicode.put("O", "\u0F7C");
+ superACIP2unicode.put("U", "\u0F74");
+ superACIP2unicode.put("OO", "\u0F7D");
+ superACIP2unicode.put("EE", "\u0F7B");
+ superACIP2unicode.put("i", "\u0F80");
+ superACIP2unicode.put("'A", "\u0F71");
+ superACIP2unicode.put("'I", "\u0F71\u0F72");
+ superACIP2unicode.put("'E", "\u0F71\u0F7A");
+ superACIP2unicode.put("'O", "\u0F71\u0F7C");
+ superACIP2unicode.put("'U", "\u0F71\u0F74");
+ superACIP2unicode.put("'OO", "\u0F71\u0F7D");
+ superACIP2unicode.put("'EE", "\u0F71\u0F7B");
+ superACIP2unicode.put("'i", "\u0F71\u0F80");
+
+ superACIP2unicode.put("Im", "\u0F72\u0F7E");
+ superACIP2unicode.put("Em", "\u0F7A\u0F7E");
+ superACIP2unicode.put("Om", "\u0F7C\u0F7E");
+ superACIP2unicode.put("Um", "\u0F74\u0F7E");
+ superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
+ superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
+ superACIP2unicode.put("im", "\u0F80\u0F7E");
+ superACIP2unicode.put("'Am", "\u0F71\u0F7E");
+ superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
+ superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
+ superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
+ superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
+ superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
+ superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
+ superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
+
+ superACIP2unicode.put("I:", "\u0F72\u0F7F");
+ superACIP2unicode.put("E:", "\u0F7A\u0F7F");
+ superACIP2unicode.put("O:", "\u0F7C\u0F7F");
+ superACIP2unicode.put("U:", "\u0F74\u0F7F");
+ superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
+ superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
+ superACIP2unicode.put("i:", "\u0F80\u0F7F");
+ superACIP2unicode.put("'A:", "\u0F71\u0F7F");
+ superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
+ superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
+ superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
+ superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
+ superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
+ superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
+ superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
+
+ superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
+ superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
+ superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
+ superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
+ superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
+ superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
+ superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
+ superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
+ superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
+ superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
+ superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
+ superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
+ superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
+ superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
+ superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
+ // :m does not appear, though you'd think it's as valid as m:.
+
+ superACIP2unicode.put("m", "\u0F7E");
+ superACIP2unicode.put(":", "\u0F7F");
+ superACIP2unicode.put("m:", "\u0F7E\u0F7F");
+
+ superACIP2unicode.put("Am", "\u0F7E");
+ superACIP2unicode.put("A:", "\u0F7F");
+ superACIP2unicode.put("Am:", "\u0F7E\u0F7F");
+
+ superACIP2unicode.put("0", "\u0F20");
+ superACIP2unicode.put("1", "\u0F21");
+ superACIP2unicode.put("2", "\u0F22");
+ superACIP2unicode.put("3", "\u0F23");
+ superACIP2unicode.put("4", "\u0F24");
+ superACIP2unicode.put("5", "\u0F25");
+ superACIP2unicode.put("6", "\u0F26");
+ superACIP2unicode.put("7", "\u0F27");
+ superACIP2unicode.put("8", "\u0F28");
+ superACIP2unicode.put("9", "\u0F29");
+
+ // punctuation
+ superACIP2unicode.put("&", "\u0F85");
+ superACIP2unicode.put(",", "\u0F0D");
+ superACIP2unicode.put(" ", "\u0F0B");
+ superACIP2unicode.put(".", "\u0F0C");
+ superACIP2unicode.put("`", "\u0F08");
+ superACIP2unicode.put("`", "\u0F08");
+ superACIP2unicode.put("*", "\u0F04\u0F05");
+ superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
+ superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn.
+ superACIP2unicode.put("o", "\u0F37");
+ superACIP2unicode.put(";", "\u0F11");
+ superACIP2unicode.put("\r", "\r");
+ superACIP2unicode.put("\t", "\t");
+ superACIP2unicode.put("\r\n", "\r\n");
+ superACIP2unicode.put("\n", "\n");
+ superACIP2unicode.put("\\", "\u0F84");
+ superACIP2unicode.put("^", "\u0F38");
+
+ // DLC FIXME: "^ GONG" is "^GONG", right?
+ // DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode?
+ }
+ if (subscribed) {
+ String u = (String)subACIP2unicode.get(acip);
+ if (null != u) return u;
+ }
+ return (String)superACIP2unicode.get(acip);
+ }
+
+ private HashMap acipOther2wylie = null;
+ public /* synchronized */ String getEwtsForOther(String acip) {
+ if (acipOther2wylie == null) {
+ acipOther2wylie = new HashMap(20);
+
+ // don't use putMapping for this. We don't want TMW->ACIP
+ // to produce "." for a U+0F0C because ACIP doesn't say
+ // that "." means U+0F0C. It just seems to in practice
+ // for ACIP Release IV texts.
+ acipOther2wylie.put(".", "*");
+
+ putMapping(acipOther2wylie, "m", "M");
+ putMapping(acipOther2wylie, ":", "H");
+ putMapping(acipOther2wylie, ",", "/");
+ putMapping(acipOther2wylie, " ", " ");
+ putMapping(acipOther2wylie, ";", "|");
+ putMapping(acipOther2wylie, "`", "!");
+ putMapping(acipOther2wylie, "*", "@#");
+ // There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##");
+ putMapping(acipOther2wylie, "%", "~X");
+ putMapping(acipOther2wylie, "o", "X");
+ putMapping(acipOther2wylie, "&", "&");
+ putMapping(acipOther2wylie, "^", "\\u0F38");
+
+ putMapping(acipOther2wylie, "0", "0");
+ putMapping(acipOther2wylie, "1", "1");
+ putMapping(acipOther2wylie, "2", "2");
+ putMapping(acipOther2wylie, "3", "3");
+ putMapping(acipOther2wylie, "4", "4");
+ putMapping(acipOther2wylie, "5", "5");
+ putMapping(acipOther2wylie, "6", "6");
+ putMapping(acipOther2wylie, "7", "7");
+ putMapping(acipOther2wylie, "8", "8");
+ putMapping(acipOther2wylie, "9", "9");
+ }
+ return (String)acipOther2wylie.get(acip);
+ }
+
+ public TTshegBarScanner scanner() { return ACIPTshegBarScanner.instance(); }
+
+ /** Registers acip->wylie mappings in toWylie; registers
+ wylie->acip mappings in {@link #wylieToACIP}. */
+ private /* synchronized */ void putMapping(HashMap toWylie, String ACIP, String EWTS) {
+ toWylie.put(ACIP, EWTS);
+ if (null == wylieToACIP) {
+ wylieToACIP = new HashMap(75);
+
+ // We don't want to put "/" in toWylie:
+ wylieToACIP.put("(", "/");
+ wylieToACIP.put(")", "/");
+ wylieToACIP.put("?", "\\");
+
+ wylieToACIP.put("_", " "); // oddball.
+ wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
+ }
+ wylieToACIP.put(EWTS, ACIP);
+ }
+
+ /** A map from EWTS to ACIP. Note that the EWTS "w" maps to both
+ "V" and "W" in reality but this map will only give one or the
+ other. */
+ private HashMap wylieToACIP = null;
+ /** Returns the ACIP transliteration corresponding to the THDL
+ Extended Wylie atom EWTS, or null if EWTS is not
+ recognized. */
+ public String getACIPForEWTS(String EWTS) {
+ getEwtsForConsonant(null); // inits wylieToACIP
+ getEwtsForOther(null); // inits wylieToACIP
+ getEwtsForWowel(null); // inits wylieToACIP
+ String ans = (String)wylieToACIP.get(EWTS);
+ boolean useCapitalW = false;
+ if (EWTS.startsWith("w"))
+ useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA.
+ if (null == ans) {
+ StringBuffer finalAns = new StringBuffer(EWTS.length());
+ StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
+ while (sTok.hasMoreTokens()) {
+ String part, tok = sTok.nextToken();
+ if (tok.equals("-") || tok.equals("+"))
+ part = tok;
+ else {
+ if ("w".equals(tok)) {
+ // There are only two stacks in TMW that have
+ // U+0FBA: R+Wa and w+Wa. TMW->ACIP fails for
+ // these unless we handle it here. (FIXME:
+ // add an automated test for this).
+ if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
+ part = "W";
+ } else {
+ part = "V";
+ }
+ } else {
+ part = (String)wylieToACIP.get(tok);
+ }
+ }
+ if (null == part) return null;
+ finalAns.append(part);
+ }
+ if (useCapitalW)
+ finalAns.setCharAt(0, 'W');
+ return finalAns.toString();
+ }
+ if (useCapitalW)
+ return "W" + ans.substring(1);
+ else
+ return ans;
+ }
+
+ private HashMap acipConsonant2wylie = null;
+ /** Returns "W" for ACIP "W", "r" for ACIP "R", "y" for ACIP "Y",
+ * even though sometimes the EWTS for those is "w", "R", or "Y".
+ * Handle that in the caller. */
+ public /* synchronized */ String getEwtsForConsonant(String acip) {
+ if (acipConsonant2wylie == null) {
+ acipConsonant2wylie = new HashMap(37);
+
+ // oddball:
+ putMapping(acipConsonant2wylie, "V", "w");
+
+ // more oddballs:
+ putMapping(acipConsonant2wylie, "DH", "d+h");
+ putMapping(acipConsonant2wylie, "BH", "b+h");
+ putMapping(acipConsonant2wylie, "dH", "D+h");
+ putMapping(acipConsonant2wylie, "DZH", "dz+h"); // longest, MAX_CONSONANT_LENGTH characters
+ putMapping(acipConsonant2wylie, "Ksh", "k+Sh"); // longest, MAX_CONSONANT_LENGTH characters
+ putMapping(acipConsonant2wylie, "GH", "g+h");
+
+
+ putMapping(acipConsonant2wylie, "K", "k");
+ putMapping(acipConsonant2wylie, "KH", "kh");
+ putMapping(acipConsonant2wylie, "G", "g");
+ putMapping(acipConsonant2wylie, "NG", "ng");
+ putMapping(acipConsonant2wylie, "C", "c");
+ putMapping(acipConsonant2wylie, "CH", "ch");
+ putMapping(acipConsonant2wylie, "J", "j");
+ putMapping(acipConsonant2wylie, "NY", "ny");
+ putMapping(acipConsonant2wylie, "T", "t");
+ putMapping(acipConsonant2wylie, "TH", "th");
+ putMapping(acipConsonant2wylie, "D", "d");
+ putMapping(acipConsonant2wylie, "N", "n");
+ putMapping(acipConsonant2wylie, "P", "p");
+ putMapping(acipConsonant2wylie, "PH", "ph");
+ putMapping(acipConsonant2wylie, "B", "b");
+ putMapping(acipConsonant2wylie, "M", "m");
+ putMapping(acipConsonant2wylie, "TZ", "ts");
+ putMapping(acipConsonant2wylie, "TS", "tsh");
+ putMapping(acipConsonant2wylie, "DZ", "dz");
+ putMapping(acipConsonant2wylie, "W", "W"
+ /* NOTE WELL: sometimes "w", sometimes "W".
+ Handle this in the caller.
+
+ Reasoning for "W" instead of "w": r-w and
+ r+w are both known hash keys. We sort 'em
+ out this way. (They are the only things
+ like this according to bug report #800166.) */
+ );
+ putMapping(acipConsonant2wylie, "ZH", "zh");
+ putMapping(acipConsonant2wylie, "Z", "z");
+ putMapping(acipConsonant2wylie, "'", "'");
+ putMapping(acipConsonant2wylie, "Y", "y");
+ putMapping(acipConsonant2wylie, "R", "r");
+ putMapping(acipConsonant2wylie, "L", "l");
+ putMapping(acipConsonant2wylie, "SH", "sh");
+ putMapping(acipConsonant2wylie, "S", "s");
+ putMapping(acipConsonant2wylie, "H", "h");
+ putMapping(acipConsonant2wylie, "A", "a");
+ putMapping(acipConsonant2wylie, "t", "T");
+ putMapping(acipConsonant2wylie, "th", "Th");
+ putMapping(acipConsonant2wylie, "d", "D");
+ putMapping(acipConsonant2wylie, "n", "N");
+ putMapping(acipConsonant2wylie, "sh", "Sh");
+ }
+ return (String)acipConsonant2wylie.get(acip);
+ }
+
+ private HashMap acipWowel2wylie = null;
+ public /* synchronized */ String getEwtsForWowel(String acip) {
+ if (acipWowel2wylie == null) {
+ acipWowel2wylie = new HashMap(baseVowels.length * 4);
+
+ for (int i = 0; i < baseVowels.length; i++) {
+ putMapping(acipWowel2wylie, baseVowels[i][0], baseVowels[i][1]);
+ putMapping(acipWowel2wylie, '\'' + baseVowels[i][0], baseVowels[i][2]);
+ putMapping(acipWowel2wylie, baseVowels[i][0] + 'm', baseVowels[i][1] + 'M');
+ putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + 'm', baseVowels[i][2] + 'M');
+ putMapping(acipWowel2wylie, baseVowels[i][0] + ':', baseVowels[i][1] + 'H');
+ putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + ':', baseVowels[i][2] + 'H');
+ putMapping(acipWowel2wylie, baseVowels[i][0] + "m:", baseVowels[i][1] + "MH");
+ putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + "m:", baseVowels[i][2] + "MH");
+ }
+ // {Pm} is treated just like {PAm}; {P:} is treated just
+ // like {PA:}; {Pm:} is treated just like {PAm:}. But
+ // that happens thanks to
+ // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
+
+ // Keep this code in sync with getUnicodeFor.
+ }
+ return (String)acipWowel2wylie.get(acip);
+ }
+
+ /** {Ksh} and {DZH}, the longest consonants, have 3 characters
+ * each, so this is three. */
+ private static final int MAX_CONSONANT_LENGTH = 3;
+
+ /** {'EEm:} and {'OOm:}, the longest wowels, have 5 characters
+ * each, so this is five. */
+ private static final int MAX_WOWEL_LENGTH = 5;
+
+ private static String[][] baseVowels = new String[][] {
+ // Each row is { ACIP vowel, EWTS for that vowel, EWTS for the
+ // ACIP vowel when it is prefixed with an apostrophe, i.e. for
+ // ACIP {'\'' + baseVowels[i][0]} }.
+ { "A", "a", "A" },
+ { "I", "i", "I" },
+ { "U", "u", "U" },
+ { "E", "e", "Ae" },
+ { "O", "o", "Ao" },
+ { "EE", "ai", "Aai" },
+ { "OO", "au", "Aau" },
+ { "i", "-i", "A-i" }
+ };
+
+ /** Returns true if and only if s is an ACIP wowel. You can't
+ * just call this any time -- A is both a consonant and a vowel
+ * in ACIP, so you have to call this in the right context. */
+ public boolean isWowel(String s) {
+ // I'm on my own with 'O and 'E and 'OO and 'EE, but GANG'O
+ // appears and I wonder... so here they are. It's consistent
+ // with 'I and 'A and 'U, at least: all the vowels may appear
+ // as K'vowel. DLC FIXME: ask.
+ return (null != getEwtsForWowel(s));
+ }
+
+ /** Returns true if and only if s is an ACIP consonant. */
+ public boolean isConsonant(String s) {
+ return (null != getEwtsForConsonant(s));
+ }
+
+ /** Gets the duffcodes for wowel, such that they look good with
+ * the preceding glyph, and appends them to duff. */
+ public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
+ if (null == wowel) return;
+ if (null == getEwtsForWowel(wowel)) // FIXME: expensive assertion! Use assert.
+ throw new IllegalArgumentException("Wowel " + wowel + " isn't in the small set of wowels we handle correctly.");
+
+ // Order matters here.
+ boolean context_added[] = new boolean[] { false };
+ if (wowel.startsWith("A")) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
+ } else if (wowel.indexOf("'U") >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
+ } else if (wowel.indexOf("'I") >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
+ } else {
+ if (wowel.indexOf('\'') >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
+ }
+ if (wowel.indexOf("EE") >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
+ } else if (wowel.indexOf('E') >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
+ }
+ if (wowel.indexOf("OO") >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
+ } else if (wowel.indexOf('O') >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
+ }
+ if (wowel.indexOf('I') >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
+ }
+ if (wowel.indexOf('U') >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
+ }
+ if (wowel.indexOf('i') >= 0) {
+ TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
+ }
+ }
+ // FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
+
+ if (wowel.indexOf('m') >= 0) {
+ DuffCode last = (DuffCode)duff.get(duff.size() - 1);
+ duff.remove(duff.size() - 1); // getBindu will add it back...
+ TibTextUtils.getBindu(duff, last);
+ }
+ if (wowel.indexOf(':') >= 0)
+ duff.add(TibetanMachineWeb.getGlyph(getEwtsForOther(":")));
+ }
}
+
diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
index bb6eb74..9a750c5 100644
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@@ -18,11 +18,10 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
-import java.io.*;
+import java.io.IOException;
import java.util.ArrayList;
import java.util.Stack;
-import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;
/**
@@ -36,8 +35,10 @@ import org.thdl.util.ThdlOptions;
* the parser, not here in the lexical analyzer. That'd be cleaner,
* and more like how you'd do things if you used lex and yacc.
*
+* This is not public because you should use {@link ACIPTraits#scanner()}.
+*
* @author David Chandler */
-public class ACIPTshegBarScanner extends TTshegBarScanner {
+class ACIPTshegBarScanner extends TTshegBarScanner {
/** True if those ACIP snippets inside square brackets (e.g.,
"[THIS]") are to be passed through into the output unmodified
while retaining the brackets and if those ACIP snippets inside
diff --git a/source/org/thdl/tib/text/ttt/EWTSTraits.java b/source/org/thdl/tib/text/ttt/EWTSTraits.java
index 7027622..bfef618 100644
--- a/source/org/thdl/tib/text/ttt/EWTSTraits.java
+++ b/source/org/thdl/tib/text/ttt/EWTSTraits.java
@@ -18,11 +18,14 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
+import java.util.ArrayList;
+import org.thdl.tib.text.DuffCode;
+
/** A singleton class that should contain (but due to laziness and
* ignorance probably does not contain) all the traits that make EWTS
* transliteration different from other (say, ACIP) transliteration
* schemes. */
-final class EWTSTraits implements TTraits {
+public final class EWTSTraits implements TTraits {
/** sole instance of this class */
private static EWTSTraits singleton = null;
@@ -30,7 +33,7 @@ final class EWTSTraits implements TTraits {
private EWTSTraits() { }
/** */
- public static EWTSTraits instance() {
+ public static synchronized EWTSTraits instance() {
if (null == singleton) {
singleton = new EWTSTraits();
}
@@ -79,4 +82,48 @@ final class EWTSTraits implements TTraits {
|| "H".equals(s)
|| "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:???
}
+
+ public String aVowel() { return "a"; }
+
+ public boolean isPostsuffix(String s) {
+ return ("s".equals(s) || "d".equals(s));
+ }
+
+ public boolean isPrefix(String l) {
+ return ("'".equals(l)
+ || "m".equals(l)
+ || "b".equals(l)
+ || "d".equals(l)
+ || "g".equals(l));
+ }
+
+ public boolean isSuffix(String l) {
+ return ("s".equals(l)
+ || "g".equals(l)
+ || "d".equals(l)
+ || "m".equals(l)
+ || "'".equals(l)
+ || "b".equals(l)
+ || "ng".equals(l)
+ || "n".equals(l)
+ || "l".equals(l)
+ || "r".equals(l));
+ }
+
+ /** Returns l, since this is EWTS's traits class. */
+ public String getEwtsForConsonant(String l) { return l; }
+
+ /** Returns l, since this is EWTS's traits class. */
+ public String getEwtsForOther(String l) { return l; }
+
+ /** Returns l, since this is EWTS's traits class. */
+ public String getEwtsForWowel(String l) { return l; }
+
+ public TTshegBarScanner scanner() { return EWTSTshegBarScanner.instance(); }
+
+ public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
+ throw new Error("TODO(DLC)[EWTS->Tibetan]");
+ }
+
+ public String getUnicodeFor(String l, boolean subscribed) { throw new Error("TODO(DLC)[EWTS->Tibetan]"); }
}
diff --git a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
new file mode 100644
index 0000000..7315675
--- /dev/null
+++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
@@ -0,0 +1,56 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text.ttt;
+
+import java.util.ArrayList;
+
+/**
+* This singleton class is able to break up Strings of EWTS text (for
+* example, an entire sutra file) into tsheg bars, comments, etc.
+* Non-Tibetan parts are segregated (so that consumers can ensure that
+* they remain non-Tibetan), and Tibetan passages are broken up into
+* tsheg bars.
+*
+* This is not public because you should use {@link EWTSTraits#scanner()}.
+*
+* @author David Chandler */
+class EWTSTshegBarScanner extends TTshegBarScanner {
+ /** See the comment in TTshegBarScanner. EWTS scanning is not
+ implemented yet, so this always throws an Error. TODO(DLC):
+ decide whether this should find parser-style errors. */
+ public ArrayList scan(String s, StringBuffer errors, int maxErrors,
+ boolean shortMessages, String warningLevel) {
+ // When this is implemented, presize the result list with an
+ // underestimate such as s.length() / 10; the best size
+ // depends on whether the text is mostly Tibetan or mostly
+ // Latin and a number of other factors.
+ throw new Error("DLC unimplemented");
+ }
+
+ /** non-public because this is a singleton */
+ protected EWTSTshegBarScanner() { }
+ private static EWTSTshegBarScanner singleton = null;
+ /** Returns the sole instance of this class. */
+ public synchronized static EWTSTshegBarScanner instance() {
+ if (null == singleton) {
+ singleton = new EWTSTshegBarScanner();
+ }
+ return singleton;
+ }
+}
diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java
index e8dde5b..eff8d50 100644
--- a/source/org/thdl/tib/text/ttt/PackageTest.java
+++ b/source/org/thdl/tib/text/ttt/PackageTest.java
@@ -202,15 +202,16 @@ public class PackageTest extends TestCase {
message. */
static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
StringBuffer errors = new StringBuffer();
- ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1,
- false, "None");
+ ArrayList al = ACIPTraits.instance().scanner().scan(ACIP, errors, -1,
+ false, "None");
if (null == al || errors.length() > 0)
return null;
org.thdl.tib.text.TibetanDocument tdoc
= new org.thdl.tib.text.TibetanDocument();
int loc[] = new int[] { 0 };
try {
- if (!TConverter.convertToTMW(al,
+ if (!TConverter.convertToTMW(ACIPTraits.instance(),
+ al,
tdoc,
null,
null,
@@ -7358,8 +7359,8 @@ tstHelper("ZUR");
private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) {
StringBuffer errors = new StringBuffer();
- ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false,
- warningLevel);
+ ArrayList al = ACIPTraits.instance().scanner().scan(s, errors, -1, false,
+ warningLevel);
if (null != expectedScan) {
if (!al.toString().equals(expectedScan)) {
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
@@ -7392,7 +7393,7 @@ tstHelper("ZUR");
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer,
int, boolean)}. */
- public void testScanner() {
+ public void testAcipScanner() {
shelp("Pm KA", "", "[TIBETAN_NON_PUNCTUATION:{Pm}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{KA}]");
shelp("KA (KHA\nGA)", "", "[TIBETAN_NON_PUNCTUATION:{KA}, TIBETAN_PUNCTUATION:{ }, START_PAREN:{(}, TIBETAN_NON_PUNCTUATION:{KHA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GA}, END_PAREN:{)}]");
@@ -7682,7 +7683,8 @@ tstHelper("ZUR");
private static void uhelp(String acip, String expectedUnicode,
String warningLevel, boolean shortMessages) {
StringBuffer errors = new StringBuffer();
- String unicode = TConverter.convertToUnicodeText("ACIP", acip, errors,
+ String unicode = TConverter.convertToUnicodeText(ACIPTraits.instance(),
+ acip, errors,
null, true,
warningLevel,
shortMessages);
diff --git a/source/org/thdl/tib/text/ttt/TConverter.java b/source/org/thdl/tib/text/ttt/TConverter.java
index 9bbe07f..bd889dc 100644
--- a/source/org/thdl/tib/text/ttt/TConverter.java
+++ b/source/org/thdl/tib/text/ttt/TConverter.java
@@ -69,10 +69,10 @@ public class TConverter {
boolean shortMessages = false;
String warningLevel = "Most";
ArrayList al
- = ACIPTshegBarScanner.instance().scanFile(args[0], errors,
- maxErrors - 1,
- shortMessages,
- warningLevel);
+ = ACIPTraits.instance().scanner().scanFile(args[0], errors,
+ maxErrors - 1,
+ shortMessages,
+ warningLevel);
if (null == al) {
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
@@ -103,8 +103,9 @@ public class TConverter {
warnings = new StringBuffer();
putWarningsInOutput = true;
}
- convertToTMW(al, System.out, errors, warnings, null,
- putWarningsInOutput, warningLevel, shortMessages, colors);
+ convertToTMW(ACIPTraits.instance(), al, System.out, errors, warnings,
+ null, putWarningsInOutput, warningLevel, shortMessages,
+ colors);
int retCode = 0;
if (errors.length() > 0) {
System.err.println("Errors converting ACIP input file: ");
@@ -139,7 +140,8 @@ public class TConverter {
* prefix rules in another
* @throws IOException if we cannot write to out
*/
- public static boolean convertToTMW(ArrayList scan,
+ public static boolean convertToTMW(TTraits ttraits,
+ ArrayList scan,
OutputStream out,
StringBuffer errors,
StringBuffer warnings,
@@ -152,7 +154,8 @@ public class TConverter {
{
TibetanDocument tdoc = new TibetanDocument();
boolean rv
- = convertToTMW(scan, tdoc, errors, warnings, hasWarnings,
+ = convertToTMW(ttraits,
+ scan, tdoc, errors, warnings, hasWarnings,
writeWarningsToResult, warningLevel,
shortMessages, colors,
new int[] { tdoc.getLength() });
@@ -169,7 +172,8 @@ public class TConverter {
offset from zero inside tdoc at which conversion results will
be placed. On output, loc[0] is one past the offset of the
last of the conversion results. */
- public static boolean convertToTMW(ArrayList scan,
+ public static boolean convertToTMW(TTraits ttraits,
+ ArrayList scan,
TibetanDocument tdoc,
StringBuffer errors,
StringBuffer warnings,
@@ -181,7 +185,8 @@ public class TConverter {
int[] loc)
throws IOException
{
- return convertTo(false, true, scan, null, tdoc, errors, warnings,
+ return convertTo(false, true,
+ ttraits, scan, null, tdoc, errors, warnings,
hasWarnings, writeWarningsToResult, warningLevel,
shortMessages, colors, loc,
loc[0] == tdoc.getLength());
@@ -189,33 +194,30 @@ public class TConverter {
/** Returns UTF-8 encoded Unicode. A bit indirect, so use this
* for testing only if performance is a concern. If errors occur
- * in scanning the ACIP or in converting a tsheg bar, then they
- * are appended to errors if errors is non-null, as well as
- * written to the result. If warnings occur in scanning the ACIP
- * or in converting a tsheg bar, then they are appended to
- * warnings if warnings is non-null, and they are written to the
- * result if writeWarningsToResult is true. Error and warning
- * messages are long and self-contained unless shortMessages is
- * true. Returns the conversion upon perfect success or if there
- * were merely warnings, null if errors occurred. */
- public static String convertToUnicodeText(String transliteration,
- String acip,
+ * in scanning the transliteration or in converting a tsheg bar,
+ * then they are appended to errors if errors is non-null, as
+ * well as written to the result. If warnings occur in scanning
+ * the transliteration or in converting a tsheg bar, then they
+ * are appended to warnings if warnings is non-null, and they are
+ * written to the result if writeWarningsToResult is true. Error
+ * and warning messages are long and self-contained unless
+ * shortMessages is true. Returns the conversion upon perfect
+ * success or if there were merely warnings, null if errors
+ * occurred. */
+ public static String convertToUnicodeText(TTraits ttraits,
+ String translit,
StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToResult,
String warningLevel,
boolean shortMessages) {
- if (transliteration != "ACIP") {
- ThdlDebug.noteIffyCode();
- throw new IllegalArgumentException("Unsupported transliteration");
- }
ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al
- = ACIPTshegBarScanner.instance().scan(acip, errors, -1,
- shortMessages, warningLevel);
+ = ttraits.scanner().scan(translit, errors, -1, shortMessages,
+ warningLevel);
try {
if (null != al) {
- convertToUnicodeText(al, sw, errors,
+ convertToUnicodeText(ttraits, al, sw, errors,
warnings, null, writeWarningsToResult,
warningLevel, shortMessages);
return sw.toString("UTF-8");
@@ -236,7 +238,8 @@ public class TConverter {
* writeWarningsToOut is true, then warnings also will be written
* to out.
* @return true upon perfect success, false if errors occurred.
- * @param scan result of ACIPTshegBarScanner.scan(..)
+ * @param scan result of using ttraits.scanner() to break up the
+ * original string of transliteration
* @param out stream to which to write converted text
* @param errors if non-null, all error messages are appended
* @param warnings if non-null, all warning messages appropriate
@@ -246,9 +249,9 @@ public class TConverter {
* false otherwise
* @param writeWarningsToOut if true, then all warning messages
* are written to out in the appropriate places
- * @throws IOException if we cannot write to out
- */
- public static boolean convertToUnicodeText(ArrayList scan,
+ * @throws IOException if we cannot write to out */
+ public static boolean convertToUnicodeText(TTraits ttraits,
+ ArrayList scan,
OutputStream out,
StringBuffer errors,
StringBuffer warnings,
@@ -258,7 +261,8 @@ public class TConverter {
boolean shortMessages)
throws IOException
{
- return convertTo(true, false, scan, out, null, errors, warnings,
+ return convertTo(true, false,
+ ttraits, scan, out, null, errors, warnings,
hasWarnings, writeWarningsToOut, warningLevel,
shortMessages, false, new int[] { -1 } , true);
}
@@ -283,6 +287,7 @@ public class TConverter {
private static boolean convertTo(boolean toUnicode, // else to TMW
boolean toRTF, // else to UTF-8-encoded text
+ TTraits ttraits,
ArrayList scan,
OutputStream out, // for (toUnicode && !toRTF) mode
TibetanDocument tdoc, // for !toUnicode mode or (toUnicode && toRTF) mode
@@ -368,7 +373,7 @@ public class TConverter {
if (lastGuyWasNonPunct) {
String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText()) + "]";
if (null != writer) {
- String uni = ACIPRules.getUnicodeFor(s.getText(), false);
+ String uni = ttraits.getUnicodeFor(s.getText(), false);
if (null == uni) {
hasErrors = true;
uni = err;
@@ -377,7 +382,7 @@ public class TConverter {
}
if (null != tdoc) {
String wylie
- = ACIPRules.getWylieForACIPOther(s.getText());
+ = ttraits.getEwtsForOther(s.getText());
if (null == wylie) {
hasErrors = true;
tdoc.appendRoman(tdocLocation[0], err, Color.RED);
@@ -658,7 +663,7 @@ public class TConverter {
}
if (!done) {
- if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
+ if (null != writer) unicode = ttraits.getUnicodeFor(s.getText(), false);
if (null != tdoc) {
if (s.getText().equals("\r")
|| s.getText().equals("\t")
@@ -675,7 +680,7 @@ public class TConverter {
TibetanMachineWeb.getGlyph("#")
}; // hard-coded EWTS values
} else {
- String wy = ACIPRules.getWylieForACIPOther(s.getText());
+ String wy = ttraits.getEwtsForOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
}
diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java
index a21181d..8814493 100644
--- a/source/org/thdl/tib/text/ttt/TPair.java
+++ b/source/org/thdl/tib/text/ttt/TPair.java
@@ -26,22 +26,27 @@ import java.util.ArrayList;
/** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The
* left side is the consonant or empty; the right side is either the
- * vowel or '+' (indicating stacking) or a disambiguator (i.e., '-'
- * in ACIP or '.' in EWTS).
+ * vowel or '+' (indicating stacking in both ACIP and EWTS) or a
+ * disambiguator (e.g., '-' in ACIP or '.' in EWTS).
* @author David Chandler */
/* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */
class TPair {
- /** The left side, or null if there is no left side. That is, the
- * non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
+ /** the part that knows ACIP from EWTS */
+ private TTraits traits;
+
+ /** Returns the part that knows ACIP from EWTS. */
+ public TTraits getTraits() { return traits; }
+
+ /** The left side, or null if there is no left side. I.e., the
+ * non-wowel, non-disambiguator, non-'+' guy. */
private String l;
String getLeft() {
ThdlDebug.verify(!"".equals(l));
return l;
}
- /** The right side. That is, the vowel, with 'm' or ':' "vowel"
- * after it if appropriate, or "-" (disambiguator), or "+"
- * (stacking), or null otherwise. */
+ /** The right side. That is, the wowel or disambiguator or "+"
+ * (for stacking) or null otherwise. */
private String r;
String getRight() {
ThdlDebug.verify(!"".equals(r));
@@ -50,13 +55,14 @@ class TPair {
/** Constructs a new TPair with left side l and right side r.
* Use null or the empty string to represent an absence. */
- TPair(String l, String r) {
+ TPair(TTraits traits, String l, String r) {
// Normalize:
if (null != l && l.equals("")) l = null;
if (null != r && r.equals("")) r = null;
this.l = l;
this.r = r;
+ this.traits = traits;
}
/** Returns a nice String representation. Returns "(D . E)" for
@@ -67,8 +73,8 @@ class TPair {
+ ((null == r) ? "" : r) + ")";
}
- /** Returns the number of ACIP characters that make up this
- * TPair. */
+ /** Returns the number of transliteration characters that make up
+ * this TPair. */
int size() {
return (((l == null) ? 0 : l.length())
+ ((r == null) ? 0 : r.length()));
@@ -98,18 +104,18 @@ class TPair {
sz = l.length();
newL = l.substring(0, sz - N);
}
- return new TPair(newL, newR);
+ return new TPair(traits, newL, newR);
}
- /** Returns true if and only if this is nonempty and is l, if
- * present, is a legal ACIP consonant, and is r, if present, is a
- * legal ACIP vowel. */
+ /** Returns true if and only if this is nonempty and if l, if
+ * present, is a legal consonant, and if r, if present, is a
+ * legal wowel. */
boolean isLegal() {
if (size() < 1)
return false;
- if (null != l && !ACIPRules.isConsonant(l))
+ if (null != l && !traits.isConsonant(l))
return false;
- if (null != r && !ACIPRules.isWowel(r))
+ if (null != r && !traits.isWowel(r))
return false;
return true;
}
@@ -119,9 +125,9 @@ class TPair {
boolean isPrefix() {
return (null != l
&& ((null == r || "".equals(r))
- || "-".equals(r) // TODO(DLC)[EWTS->Tibetan]
- || "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
- && ACIPRules.isACIPPrefix(l));
+ || traits.disambiguator().equals(r)
+ || traits.aVowel().equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
+ && traits.isPrefix(l));
}
/** Returns true if and only if this pair could be a Tibetan
@@ -129,25 +135,25 @@ class TPair {
boolean isPostSuffix() {
return (null != l
&& ((null == r || "".equals(r))
- || "-".equals(r)
- || "A".equals(r)) // FIXME: though warn about GAMASA vs. GAMS
- && ACIPRules.isACIPPostsuffix(l));
+ || traits.disambiguator().equals(r)
+ || traits.aVowel().equals(r)) // FIXME: though warn about GAMASA vs. GAMS
+ && traits.isPostsuffix(l));
}
/** Returns true if and only if this pair could be a Tibetan
- * suffix. FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */
+ * suffix. */
boolean isSuffix() {
return (null != l
&& ((null == r || "".equals(r))
- || "-".equals(r)
- || "A".equals(r))
- && ACIPRules.isACIPSuffix(l));
+ || traits.disambiguator().equals(r)
+ || traits.aVowel().equals(r))
+ && traits.isSuffix(l));
}
/** Returns true if and only if this pair is merely a
* disambiguator. */
boolean isDisambiguator() {
- return ("-".equals(r) && getLeft() == null);
+ return (traits.disambiguator().equals(r) && getLeft() == null);
}
/** Yep, this works for TPairs. */
@@ -160,16 +166,16 @@ class TPair {
return false;
}
- /** Returns a TPair that is like this pair except that it has
- * a "+" on the right if this pair is empty on the right and is
- * empty on the right if this pair has a disambiguator (i.e., a
- * '-') on the right. May return itself (but never mutates this
+ /** Returns a TPair that is like this pair except that it has a
+ * "+" on the right if this pair is empty on the right and is
+ * empty on the right if this pair has a disambiguator on the
+ * right. May return itself (but never mutates this
* instance). */
TPair insideStack() {
if (null == getRight())
- return new TPair(getLeft(), "+");
- else if ("-".equals(getRight()))
- return new TPair(getLeft(), null);
+ return new TPair(traits, getLeft(), "+");
+ else if (traits.disambiguator().equals(getRight()))
+ return new TPair(traits, getLeft(), null);
else
return this;
}
@@ -194,7 +200,7 @@ class TPair {
String getWylie(boolean justLeft) {
String leftWylie = null;
if (getLeft() != null) {
- leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
+ leftWylie = traits.getEwtsForConsonant(getLeft());
if (leftWylie == null) {
if (isNumeric())
leftWylie = getLeft();
@@ -208,7 +214,7 @@ class TPair {
else if ("+".equals(getRight()))
rightWylie = "+";
else if (getRight() != null)
- rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
+ rightWylie = traits.getEwtsForWowel(getRight());
if (null == rightWylie) rightWylie = "";
return leftWylie + rightWylie;
}
@@ -227,18 +233,19 @@ class TPair {
void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
boolean subscribed) {
if (null != getLeft()) {
- String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
+ String x = traits.getUnicodeFor(getLeft(), subscribed);
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
consonantSB.append(x);
}
if (null != getRight()
&& !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
- String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
+ String x = traits.getUnicodeFor(getRight(), subscribed);
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
vowelSB.append(x);
}
}
+ // TODO(DLC)[EWTS->Tibetan]
/** Returns true if this pair is surely the last pair in an ACIP
* stack. Stacking continues through (* . ) and (* . +), but
* stops anywhere else. */
diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java
index 5b83de5..2452a51 100644
--- a/source/org/thdl/tib/text/ttt/TPairList.java
+++ b/source/org/thdl/tib/text/ttt/TPairList.java
@@ -33,6 +33,9 @@ import java.util.ArrayList;
*
* @author David Chandler */
class TPairList {
+ /** the part that knows ACIP from EWTS */
+ private TTraits traits;
+
/** FIXME: change me and see if performance improves. */
private static final int INITIAL_SIZE = 1;
@@ -41,17 +44,20 @@ class TPairList {
/** Creates a new list containing just p. */
public TPairList(TPair p) {
+ this.traits = p.getTraits();
al = new ArrayList(1);
add(p);
}
/** Creates an empty list. */
- public TPairList() {
+ public TPairList(TTraits traits) {
+ this.traits = traits;
al = new ArrayList(INITIAL_SIZE);
}
/** Creates an empty list with the capacity to hold N items. */
- public TPairList(int N) {
+ public TPairList(TTraits traits, int N) {
+ this.traits = traits;
al = new ArrayList(N);
}
@@ -181,7 +187,7 @@ class TPairList {
return ErrorsAndWarnings.getMessage(125, shortMessages, translit);
} else if ((null == p.getLeft() && !"-".equals(p.getRight()))
|| (null != p.getLeft()
- && !ACIPRules.isConsonant(p.getLeft())
+ && !traits.isConsonant(p.getLeft())
&& !p.isNumeric())) {
// FIXME: stop handling this outside of ErrorsAndWarnings:
if (null == p.getLeft()) {
@@ -406,12 +412,12 @@ class TPairList {
// and only if b1 is one, etc.
for (int counter = 0; counter < (1< If both EWTS and ACIP transliterations have a property in
+ * common, then it's likely encoded in a manner that's hard to
+ * modify. But if they differ in some respect, then that difference
+ * should be encoded in a TTraits object.
*
* It is very likely that classes that implement this interface
* will choose to use the design pattern 'singleton'. */
@@ -62,9 +68,63 @@ interface TTraits {
/** Returns true if and only if s is a stretch of
* transliteration corresponding to a Tibetan wowel (without any
* [achen or other] consonant) */
- boolean isWowel(String s);
+ boolean isWowel(String s); // TODO(DLC)[EWTS->Tibetan]: what about "m:" as opposed to "m" or ":"
/** Returns true if and only if the pair given has a simple error
* other than being a mere disambiguator. */
boolean hasSimpleError(TPair p);
+
+ /** The implicit 'ahhh' vowel, the one you see when you write the
+ human-friendly transliteration for "\u0f40\u0f0b". */
+ String aVowel();
+
+ /** Returns true if s is a valid postsuffix. s must not have a
+ wowel on it. */
+ boolean isPostsuffix(String s);
+
+ /** Returns true if and only if l is the representation of a
+ letter that can be a suffix. Note that all postsuffixes are
+ also suffixes. l should not have a wowel. */
+ boolean isSuffix(String l);
+
+ /** Returns true if and only if l is the representation of a
+ letter that can be a prefix. l should not have a wowel. */
+ boolean isPrefix(String l);
+
+ /** Returns the EWTS transliteration corresponding to the
+ * consonant l, which should not have a vowel. Returns null if
+ * there is no such EWTS.
+ *
+ * May return "W" instead of "w", "r" instead of "R", and "y"
+ * instead of "Y" because we sometimes don't have enough context
+ * to decide.
+ *
+ * The reasoning for "W" instead of "w" is that r-w and r+w
+ * are both known hash keys (as {@link
+ * org.thdl.tib.text.TibetanMachineWeb} would call them). We
+ * sort 'em out this way. (They are the only things like this
+ * according to bug report #800166.) */
+ String getEwtsForConsonant(String l);
+
+ /** Returns the EWTS corresponding to the given punctuation or
+ * mark. Returns null if there is no such EWTS. */
+ String getEwtsForOther(String l);
+
+ /** Returns the EWTS corresponding to the given "wowel". Returns
+ * null if there is no such EWTS. */
+ String getEwtsForWowel(String l);
+
+ /** If l is a consonant or vowel or punctuation mark, then this
+ * returns the Unicode for it. The Unicode for the subscribed
+ * form of the glyph is returned if subscribed is true. Returns
+ * null if l is unknown. */
+ String getUnicodeFor(String l, boolean subscribed);
+
+ /** Returns a scanner that can break up a string of
+ transliteration. */
+ TTshegBarScanner scanner();
+
+ /** Gets the duffcodes for wowel, such that they look good with
+ * the preceding glyph, and appends them to duff. */
+ void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel);
}
diff --git a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
index f1a94f1..0835a3b 100644
--- a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java
@@ -18,7 +18,11 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
-import java.io.*;
+import java.io.IOException;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
+import java.io.InputStream;
+import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Stack;
@@ -40,7 +44,7 @@ public abstract class TTshegBarScanner {
* If errors is non-null, error messages will be appended to it.
* Returns a list of TStrings that is the scan. Warning and
* error messages in the result will be long and self-contained
- * unless shortMessagse is true.
+ * unless shortMessages is true.
*
* This is not so efficient; copies the whole file into memory
* first.