From c16f633ecf7ae1e087f5798ec55df1e617abc1db Mon Sep 17 00:00:00 2001 From: dchandler Date: Tue, 22 Feb 2005 04:36:54 +0000 Subject: [PATCH] Two things: One, TMW->EWTS gives dbas and dngas instead of dabs and dangs because Chris Fynn's e-mail from today has dbas and dngas. Second, Down with ACIPRules. Long live ACIPTraits. EWTS->Tibetan conversion is closer still. --- source/org/thdl/tib/input/DuffPaneTest.java | 16 + .../org/thdl/tib/input/TibetanConverter.java | 18 +- source/org/thdl/tib/text/TGCPair.java | 4 +- source/org/thdl/tib/text/TibTextUtils.java | 61 +- .../org/thdl/tib/text/TibetanMachineWeb.java | 4 +- source/org/thdl/tib/text/tibwn.ini | 12 +- source/org/thdl/tib/text/ttt/ACIPRules.java | 658 ------------------ source/org/thdl/tib/text/ttt/ACIPTraits.java | 555 ++++++++++++++- .../tib/text/ttt/ACIPTshegBarScanner.java | 7 +- source/org/thdl/tib/text/ttt/EWTSTraits.java | 51 +- .../tib/text/ttt/EWTSTshegBarScanner.java | 56 ++ source/org/thdl/tib/text/ttt/PackageTest.java | 16 +- source/org/thdl/tib/text/ttt/TConverter.java | 79 ++- source/org/thdl/tib/text/ttt/TPair.java | 83 ++- source/org/thdl/tib/text/ttt/TPairList.java | 42 +- .../thdl/tib/text/ttt/TPairListFactory.java | 26 +- source/org/thdl/tib/text/ttt/TTraits.java | 72 +- .../thdl/tib/text/ttt/TTshegBarScanner.java | 8 +- 18 files changed, 950 insertions(+), 818 deletions(-) delete mode 100644 source/org/thdl/tib/text/ttt/ACIPRules.java create mode 100644 source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java diff --git a/source/org/thdl/tib/input/DuffPaneTest.java b/source/org/thdl/tib/input/DuffPaneTest.java index da8eba1..50c4e48 100644 --- a/source/org/thdl/tib/input/DuffPaneTest.java +++ b/source/org/thdl/tib/input/DuffPaneTest.java @@ -969,6 +969,22 @@ public class DuffPaneTest extends DuffPaneTestBase { ensureKeysGiveCorrectWylie("'gas"); + /* Chris Fynn's e-mail on Feb 21 2005 leads to these test + cases: */ + { + ensureKeysGiveCorrectWylie("dgas"); + ensureKeysGiveCorrectWylie("'gas"); + ensureKeysGiveCorrectWylie("dngas"); + ensureKeysGiveCorrectWylie("gnad"); + ensureKeysGiveCorrectWylie("mnad"); + ensureKeysGiveCorrectWylie("bags"); + ensureKeysGiveCorrectWylie("dbas"); + ensureKeysGiveCorrectWylie("'bas"); + ensureKeysGiveCorrectWylie("mags"); + ensureKeysGiveCorrectWylie("mangs"); + ensureKeysGiveCorrectWylie("dmas"); + } + ensureKeysGiveCorrectWylie("gangs"); ensureKeysGiveCorrectWylie("gnags"); diff --git a/source/org/thdl/tib/input/TibetanConverter.java b/source/org/thdl/tib/input/TibetanConverter.java index a19a6b9..425a44a 100644 --- a/source/org/thdl/tib/input/TibetanConverter.java +++ b/source/org/thdl/tib/input/TibetanConverter.java @@ -27,7 +27,7 @@ import org.thdl.util.*; import org.thdl.tib.text.*; import org.thdl.tib.text.ttt.TConverter; -import org.thdl.tib.text.ttt.ACIPTshegBarScanner; +import org.thdl.tib.text.ttt.ACIPTraits; import java.util.ArrayList; /** TibetanConverter is a command-line utility for converting to and @@ -297,17 +297,18 @@ public class TibetanConverter implements FontConverterConstants { if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) { try { ArrayList al - = ACIPTshegBarScanner.instance().scanStream(in, null, - ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have", - 1000 - 1), - shortMessages, - warningLevel); + = ACIPTraits.instance().scanner().scanStream(in, null, + ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have", + 1000 - 1), + shortMessages, + warningLevel); if (null == al) return 47; boolean embeddedWarnings = (warningLevel != "None"); boolean hasWarnings[] = new boolean[] { false }; if (ACIP_TO_UNI_TEXT == ct) { - if (!TConverter.convertToUnicodeText(al, out, null, + if (!TConverter.convertToUnicodeText(ACIPTraits.instance(), + al, out, null, null, hasWarnings, embeddedWarnings, warningLevel, @@ -315,7 +316,8 @@ public class TibetanConverter implements FontConverterConstants { return 46; } else { if (ct != ACIP_TO_TMW) throw new Error("badness"); - if (!TConverter.convertToTMW(al, out, null, null, + if (!TConverter.convertToTMW(ACIPTraits.instance(), + al, out, null, null, hasWarnings, embeddedWarnings, warningLevel, shortMessages, diff --git a/source/org/thdl/tib/text/TGCPair.java b/source/org/thdl/tib/text/TGCPair.java index 1ba11d6..9276dd7 100644 --- a/source/org/thdl/tib/text/TGCPair.java +++ b/source/org/thdl/tib/text/TGCPair.java @@ -137,7 +137,7 @@ public class TGCPair implements THDLWylieConstants { consonantACIP = "V"; else consonantACIP - = org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie); + = org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(consonantWylie); if (null == consonantACIP) { if (null != consonantWylie && consonantWylie.startsWith("R+")) return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)"); @@ -160,7 +160,7 @@ public class TGCPair implements THDLWylieConstants { } if (vowelWylie != null) { String vowelACIP - = org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(vowelWylie); + = org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(vowelWylie); if (null == vowelACIP) { return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, ""); } else { diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java index 1327fb5..6c7f77e 100644 --- a/source/org/thdl/tib/text/TibTextUtils.java +++ b/source/org/thdl/tib/text/TibTextUtils.java @@ -25,7 +25,7 @@ import javax.swing.text.rtf.RTFEditorKit; import java.io.*; import org.thdl.util.ThdlDebug; -import org.thdl.tib.text.ttt.ACIPTshegBarScanner; +import org.thdl.tib.text.ttt.ACIPTraits; import org.thdl.tib.text.ttt.TConverter; import org.thdl.tib.text.tshegbar.LegalTshegBar; import org.thdl.tib.text.tshegbar.UnicodeConstants; @@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants { { StringBuffer errors = new StringBuffer(); String warningLevel = withWarnings ? "All" : "None"; - ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500, - false, warningLevel); + ArrayList al = ACIPTraits.instance().scanner().scan(acip, errors, 500, + false, warningLevel); if (null == al || errors.length() > 0) { if (errors.length() > 0) throw new InvalidACIPException(errors.toString()); @@ -348,8 +348,8 @@ public class TibTextUtils implements THDLWylieConstants { } try { int tloc[] = new int[] { loc }; - TConverter.convertToTMW(al, tdoc, null, null, null, - putWarningsInOutput, warningLevel, + TConverter.convertToTMW(ACIPTraits.instance(), al, tdoc, null, null, + null, putWarningsInOutput, warningLevel, false, colors, tloc); return tloc[0] - loc; } catch (IOException e) { @@ -1430,6 +1430,53 @@ public class TibTextUtils implements THDLWylieConstants { candidateType = getCandidateTypeModuloAppendage(candidateType); if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) { + /* Update: Chris Fynn wrote this in response to an +e-mail from David Chapman on Feb 21, 2005: + + +When working out the rules for Tibetan and Dzongkha +collation in Bhutan we came up with the following sequences +that could be ambiguous: + +0F51 0F42 0F66 +0F60 0F42 0F66 +0F51 0F44 0F66 +0F42 0F53 0F51 +0F58 0F53 0F51 +0F56 0F42 0F66 +0F51 0F56 0F66 +0F60 0F56 0F66 +0F58 0F42 0F66 +0F58 0F44 0F66 +0F51 0F58 0F66 + +After much consultation with experts in Bhutan it was +decided these should always be read as follows: + +0F51 0F42 0F66 dgas +0F60 0F42 0F66 'gas +0F51 0F44 0F66 dngas * +0F42 0F53 0F51 gnad +0F58 0F53 0F51 mnad * +0F56 0F42 0F66 bags +0F51 0F56 0F66 dbas +0F60 0F56 0F66 'bas * +0F58 0F42 0F66 mags +0F58 0F44 0F66 mangs +0F51 0F58 0F66 dmas + +In most cases it was found that only one of the two possible +readings actually existed as words. 0F51 0F44 0F66 , 0F58 +0F53 0F51, and 0F60 0F56 0F66 were not found as syllables in +any known words, but the experts felt that *if* they +occurred in Tibetan or Dzongkha text then dngas, mnad, and +'bas would be the most likely reading. + + + + + Because of this e-mail, dbas and dngas were added to the list of + exceptions. */ /* Yes, this is ambiguous. How do we handle it? See * this from Andres (but note that only 4 of the 14 in * the second list are ambiguous because ra na sa and @@ -1480,7 +1527,9 @@ public class TibTextUtils implements THDLWylieConstants { || wylie2.equals("n") || wylie2.equals("s"))) || (wylie1.equals("d") && (wylie2.equals("g") - || wylie2.equals("m"))) + || wylie2.equals("m") + || wylie2.equals("b") + || wylie2.equals("ng"))) || (wylie1.equals("b") && wylie2.equals("d")) || (wylie1.equals("m") && wylie2.equals("d")) || (wylie1.equals("'") && (wylie2.equals("g") diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index e525663..c55a852 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -1988,7 +1988,7 @@ private static String acipForGlyph(String hashKey) { // ~X is a special case because the EWTS is 2 characters in // length || "~X".equals(hashKey)) // hard-coded EWTS value - return org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(hashKey); + return org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(hashKey); else // else we are not be able to use it because it's not smart // about stacks (e.g., W+W) @@ -2116,7 +2116,7 @@ public static String getACIPForGlyph(DuffCode dc1, // DLC FIXME: TMW.53 is probably going to come out all wrong (VA // vs. WA) from this function, but - // ACIPRules.getACIPForEWTS(String) seems to come through... will + // ACIPTraits.getACIPForEWTS(String) seems to come through... will // it always? String hashKey = getHashKeyForGlyph(dc1); diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index 05a0eaa..beb52e3 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -9,9 +9,9 @@ // - blank lines should be ignored // - marks a command // -// If you change the Wylie here, it can break the ACIP->TMW and -// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be -// sure to run 'ant clean check' after your change. +// If you change the EWTS transliteration here, it can break the +// ACIP->TMW and ACIP->Unicode conversion. So keep ACIPTraits in sync +// with this, and be sure to run 'ant clean check' after your change. // // Note that some glyphs have EWTS \uF021-\uF0FF inclusive. These do // not have anything in the Unicode column, though, because this is @@ -37,7 +37,7 @@ // by the way. // // If EWTS changes, then ACIP->TMW and ACIP->Unicode will break -- -// modify ACIPRules and test test test. +// modify ACIPTraits and test test test. //_~32,1~0,32 @@ -645,7 +645,7 @@ r+m+m~51,4~~7,59~1,110~8,121~1,123~1,125~8,107~8,114~f62,fa8,fa8 // Note that TPairList.java's unicodeExceptionsMap must be updated if // we change who uses U+0F6A. R+Y~52,4~~7,60~1,110~8,120~1,123~1,125~8,106~8,113~f6a,fbb -// R+W is mentioned in ACIPRules.java: +// R+W is mentioned in ACIPTraits.java: R+W~196,4~~7,61~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fba R+sh~53,4~~7,62~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fb4 R+sh+y~54,4~~7,63~1,109~8,122~1,123~1,125~8,108~8,115~f6a,fb4,fb1 @@ -667,7 +667,7 @@ l+h+w~197,4~~7,78~1,109~8,121~1,123~1,125~8,106~8,113~f63,fb7,fad w+y~69,4~~7,79~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb1 w+r~70,4~~7,80~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb2 w+n~195,4~~7,81~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fa3 -// w+W is mentioned in ACIPRules.java: +// w+W is mentioned in ACIPTraits.java: w+W~194,4~~7,82~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fba sh+ts~71,4~~7,83~1,109~8,120~1,123~1,125~8,106~8,113~f64,fa9 sh+ts+y~72,4~~7,84~1,109~8,122~1,123~1,125~8,108~8,115~f64,fa9,fb1 diff --git a/source/org/thdl/tib/text/ttt/ACIPRules.java b/source/org/thdl/tib/text/ttt/ACIPRules.java deleted file mode 100644 index c6c9986..0000000 --- a/source/org/thdl/tib/text/ttt/ACIPRules.java +++ /dev/null @@ -1,658 +0,0 @@ -/* -The contents of this file are subject to the THDL Open Community License -Version 1.0 (the "License"); you may not use this file except in compliance -with the License. You may obtain a copy of the License on the THDL web site -(http://www.thdl.org/). - -Software distributed under the License is distributed on an "AS IS" basis, -WITHOUT WARRANTY OF ANY KIND, either express or implied. See the -License for the specific terms governing rights and limitations under the -License. - -The Initial Developer of this software is the Tibetan and Himalayan Digital -Library (THDL). Portions created by the THDL are Copyright 2003 THDL. -All Rights Reserved. - -Contributor(s): ______________________________________. -*/ - -package org.thdl.tib.text.ttt; - -import java.util.HashSet; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.StringTokenizer; -import java.util.List; - -import org.thdl.util.ThdlOptions; -import org.thdl.tib.text.DuffCode; -import org.thdl.tib.text.THDLWylieConstants; -import org.thdl.tib.text.TibetanMachineWeb; -import org.thdl.tib.text.TibTextUtils; - -// TODO(DLC)[EWTS->Tibetan]: this and ACIPTraits -- unify? - -/** Canonizes some facts regarding the ACIP transcription system. - * @author David Chandler */ -public class ACIPRules { - /** {Ksh}, the longest consonant, has 3 characters, so this is - * three. */ - public static int MAX_CONSONANT_LENGTH = 3; - - /** {'EEm:}, the longest wowel, has 5 characters, so this is - * five. */ - public static int MAX_WOWEL_LENGTH = 5; - - /** For O(1) {@link #isWowel(String)} calls. */ - private static HashSet acipVowels = null; - - private static String[][] baseVowels = new String[][] { - // { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel - // numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.) - // for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]} - { "A", "a", "A" }, - { "I", "i", "I" }, - { "U", "u", "U" }, - { "E", "e", "Ae" }, - { "O", "o", "Ao" }, - { "EE", "ai", "Aai" }, - { "OO", "au", "Aau" }, - { "i", "-i", "A-i" } - }; - - /** Returns true if and only if s is an ACIP wowel. You can't - * just call this any time -- A is both a consonant and a vowel - * in ACIP, so you have to call this in the right context. */ - public static boolean isWowel(String s) { - if (null == acipVowels) { - acipVowels = new HashSet(baseVowels.length * 8); - for (int i = 0; i < baseVowels.length; i++) { - // I'm on my own with 'O and 'E and 'OO and 'EE, but - // GANG'O appears and I wonder... so here they are. - // It's consistent with 'I and 'A and 'U, at least: - // all the vowels may appear as K'vowel. DLC FIXME: - // ask. - - acipVowels.add(baseVowels[i][0]); - acipVowels.add('\'' + baseVowels[i][0]); - acipVowels.add(baseVowels[i][0] + 'm'); - acipVowels.add('\'' + baseVowels[i][0] + 'm'); - acipVowels.add(baseVowels[i][0] + ':'); - acipVowels.add('\'' + baseVowels[i][0] + ':'); - acipVowels.add(baseVowels[i][0] + "m:"); - acipVowels.add('\'' + baseVowels[i][0] + "m:"); - - // Keep this code in sync with getUnicodeFor. - - // Keep this code in sync with getWylieForACIPVowel. - } - // {Pm} is treated just like {PAm}; {P:} is treated just - // like {PA:}; {Pm:} is treated just like {PAm:}. But - // that happens thanks to - } - return (acipVowels.contains(s)); - } - - /** For O(1) {@link #isConsonant(String)} calls. */ - private static HashSet consonants = null; - - /** Returns true if and only if acip is an ACIP consonant (without - * a vowel). For example, returns true for "K", but not for - * "KA" or "X". */ - public static boolean isConsonant(String acip) { - if (consonants == null) { - consonants = new HashSet(); - consonants.add("V"); - consonants.add("K"); - consonants.add("KH"); - consonants.add("G"); - consonants.add("NG"); - consonants.add("C"); - consonants.add("CH"); - consonants.add("J"); - consonants.add("NY"); - consonants.add("T"); - consonants.add("TH"); - consonants.add("D"); - consonants.add("N"); - consonants.add("P"); - consonants.add("PH"); - consonants.add("B"); - consonants.add("M"); - consonants.add("TZ"); - consonants.add("TS"); - consonants.add("DZ"); - consonants.add("W"); - consonants.add("ZH"); - consonants.add("Z"); - consonants.add("Y"); - consonants.add("R"); - consonants.add("L"); - consonants.add("SH"); - consonants.add("S"); - consonants.add("H"); - consonants.add("t"); - consonants.add("th"); - consonants.add("d"); - consonants.add("n"); - consonants.add("sh"); - consonants.add("dH"); - consonants.add("DH"); - consonants.add("BH"); - consonants.add("DZH"); // longest, MAX_CONSONANT_LENGTH characters - consonants.add("Ksh"); // longest, MAX_CONSONANT_LENGTH characters - consonants.add("GH"); - consonants.add("'"); - consonants.add("A"); - } - return consonants.contains(acip); - } - - /** A map from wylie to ACIP. Note that the Wylie "w" maps to - both "V" and "W". */ - private static HashMap wylieToACIP = null; - /** Returns the ACIP transliteration corresponding to the THDL - Extended Wylie atom EWTS, or null if EWTS is not - recognized. */ - public static String getACIPForEWTS(String EWTS) { - getWylieForACIPConsonant(null); - getWylieForACIPOther(null); - getWylieForACIPVowel(null); - String ans = (String)wylieToACIP.get(EWTS); - boolean useCapitalW = false; - if (EWTS.startsWith("w")) - useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA. - if (null == ans) { - StringBuffer finalAns = new StringBuffer(EWTS.length()); - StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true); - while (sTok.hasMoreTokens()) { - String part, tok = sTok.nextToken(); - if (tok.equals("-") || tok.equals("+")) - part = tok; - else { - if ("w".equals(tok)) { - // There are only two stacks in TMW that have - // U+0FBA: R+Wa and w+Wa. TMW->ACIP fails for - // these unless we handle it here. (FIXME: - // add an automated test for this). - if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) { - part = "W"; - } else { - part = "V"; - } - } else { - part = (String)wylieToACIP.get(tok); - } - } - if (null == part) return null; - finalAns.append(part); - } - if (useCapitalW) - finalAns.setCharAt(0, 'W'); - return finalAns.toString(); - } - if (useCapitalW) - return "W" + ans.substring(1); - else - return ans; - } - - /** Registers acip->wylie mappings in toWylie; registers - wylie->acip mappings in {@link #wylieToACIP}. */ - private static void putMapping(HashMap toWylie, String ACIP, String EWTS) { - toWylie.put(ACIP, EWTS); - if (null == wylieToACIP) { - wylieToACIP = new HashMap(75); - - // We don't want to put "/" in toWylie: - wylieToACIP.put("(", "/"); - wylieToACIP.put(")", "/"); - wylieToACIP.put("?", "\\"); - - wylieToACIP.put("_", " "); // oddball. - wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61. - } - wylieToACIP.put(EWTS, ACIP); - } - - /** Returns true if and only if s is an ACIP consonant. */ - static final boolean isACIPConsonant(String s) { - return (null != ACIPRules.getWylieForACIPConsonant(s)); - } - - private static HashMap acipConsonant2wylie = null; - /** Returns the EWTS corresponding to the given ACIP consonant - * (without the "A" vowel). Returns null if there is no such - * EWTS. - * - *

Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y", - * even though sometimes the EWTS for those is "w", "R", or "Y". - * Handle that in the caller. */ - static final String getWylieForACIPConsonant(String acip) { - if (acipConsonant2wylie == null) { - acipConsonant2wylie = new HashMap(37); - - // oddball: - putMapping(acipConsonant2wylie, "V", "w"); - - // more oddballs: - putMapping(acipConsonant2wylie, "DH", "d+h"); - putMapping(acipConsonant2wylie, "BH", "b+h"); - putMapping(acipConsonant2wylie, "dH", "D+h"); - putMapping(acipConsonant2wylie, "DZH", "dz+h"); - putMapping(acipConsonant2wylie, "Ksh", "k+Sh"); - putMapping(acipConsonant2wylie, "GH", "g+h"); - - - putMapping(acipConsonant2wylie, "K", "k"); - putMapping(acipConsonant2wylie, "KH", "kh"); - putMapping(acipConsonant2wylie, "G", "g"); - putMapping(acipConsonant2wylie, "NG", "ng"); - putMapping(acipConsonant2wylie, "C", "c"); - putMapping(acipConsonant2wylie, "CH", "ch"); - putMapping(acipConsonant2wylie, "J", "j"); - putMapping(acipConsonant2wylie, "NY", "ny"); - putMapping(acipConsonant2wylie, "T", "t"); - putMapping(acipConsonant2wylie, "TH", "th"); - putMapping(acipConsonant2wylie, "D", "d"); - putMapping(acipConsonant2wylie, "N", "n"); - putMapping(acipConsonant2wylie, "P", "p"); - putMapping(acipConsonant2wylie, "PH", "ph"); - putMapping(acipConsonant2wylie, "B", "b"); - putMapping(acipConsonant2wylie, "M", "m"); - putMapping(acipConsonant2wylie, "TZ", "ts"); - putMapping(acipConsonant2wylie, "TS", "tsh"); - putMapping(acipConsonant2wylie, "DZ", "dz"); - putMapping(acipConsonant2wylie, "W", "W" - /* NOTE WELL: sometimes "w", sometimes "W". - Handle this in the caller. - - Reasoning for "W" instead of "w": r-w and - r+w are both known hash keys. We sort 'em - out this way. (They are the only things - like this according to bug report #800166.) */ - ); - putMapping(acipConsonant2wylie, "ZH", "zh"); - putMapping(acipConsonant2wylie, "Z", "z"); - putMapping(acipConsonant2wylie, "'", "'"); - putMapping(acipConsonant2wylie, "Y", "y"); - putMapping(acipConsonant2wylie, "R", "r"); - putMapping(acipConsonant2wylie, "L", "l"); - putMapping(acipConsonant2wylie, "SH", "sh"); - putMapping(acipConsonant2wylie, "S", "s"); - putMapping(acipConsonant2wylie, "H", "h"); - putMapping(acipConsonant2wylie, "A", "a"); - putMapping(acipConsonant2wylie, "t", "T"); - putMapping(acipConsonant2wylie, "th", "Th"); - putMapping(acipConsonant2wylie, "d", "D"); - putMapping(acipConsonant2wylie, "n", "N"); - putMapping(acipConsonant2wylie, "sh", "Sh"); - } - return (String)acipConsonant2wylie.get(acip); - } - - private static HashMap acipVowel2wylie = null; - /** Returns the EWTS corresponding to the given ACIP "vowel". - * Returns null if there is no such EWTS. */ - static final String getWylieForACIPVowel(String acip) { - if (acipVowel2wylie == null) { - acipVowel2wylie = new HashMap(baseVowels.length * 4); - - for (int i = 0; i < baseVowels.length; i++) { - putMapping(acipVowel2wylie, baseVowels[i][0], baseVowels[i][1]); - putMapping(acipVowel2wylie, '\'' + baseVowels[i][0], baseVowels[i][2]); - putMapping(acipVowel2wylie, baseVowels[i][0] + 'm', baseVowels[i][1] + 'M'); - putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + 'm', baseVowels[i][2] + 'M'); - putMapping(acipVowel2wylie, baseVowels[i][0] + ':', baseVowels[i][1] + 'H'); - putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + ':', baseVowels[i][2] + 'H'); - putMapping(acipVowel2wylie, baseVowels[i][0] + "m:", baseVowels[i][1] + "MH"); - putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + "m:", baseVowels[i][2] + "MH"); - } - // {Pm} is treated just like {PAm}; {P:} is treated just - // like {PA:}; {Pm:} is treated just like {PAm:}. But - // that happens thanks to - // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]). - } - return (String)acipVowel2wylie.get(acip); - } - - private static HashMap acipOther2wylie = null; - /** Returns the EWTS corresponding to the given ACIP puncuation or - * mark. Returns null if there is no such EWTS. */ - static final String getWylieForACIPOther(String acip) { - if (acipOther2wylie == null) { - acipOther2wylie = new HashMap(20); - - // don't use putMapping for this. We don't want TMW->ACIP - // to produce "." for a U+0F0C because ACIP doesn't say - // that "." means U+0F0C. It just seems to in practice - // for ACIP Release IV texts. - acipOther2wylie.put(".", "*"); - - putMapping(acipOther2wylie, "m", "M"); - putMapping(acipOther2wylie, ":", "H"); - putMapping(acipOther2wylie, ",", "/"); - putMapping(acipOther2wylie, " ", " "); - putMapping(acipOther2wylie, ";", "|"); - putMapping(acipOther2wylie, "`", "!"); - putMapping(acipOther2wylie, "*", "@#"); - // There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##"); - putMapping(acipOther2wylie, "%", "~X"); - putMapping(acipOther2wylie, "o", "X"); - putMapping(acipOther2wylie, "&", "&"); - putMapping(acipOther2wylie, "^", "\\u0F38"); - - putMapping(acipOther2wylie, "0", "0"); - putMapping(acipOther2wylie, "1", "1"); - putMapping(acipOther2wylie, "2", "2"); - putMapping(acipOther2wylie, "3", "3"); - putMapping(acipOther2wylie, "4", "4"); - putMapping(acipOther2wylie, "5", "5"); - putMapping(acipOther2wylie, "6", "6"); - putMapping(acipOther2wylie, "7", "7"); - putMapping(acipOther2wylie, "8", "8"); - putMapping(acipOther2wylie, "9", "9"); - } - return (String)acipOther2wylie.get(acip); - } - - private static HashMap superACIP2unicode = null; - private static HashMap subACIP2unicode = null; - /** If acip is an ACIP consonant or vowel or punctuation mark, - * then this returns the Unicode for it. The Unicode for the - * subscribed form of the glyph is returned if subscribed is - * true. Returns null if acip is unknown. */ - static String getUnicodeFor(String acip, boolean subscribed) { - if (superACIP2unicode == null) { - final boolean compactUnicode - = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera"); - superACIP2unicode = new HashMap(144); - subACIP2unicode = new HashMap(42); - - // oddball: - subACIP2unicode.put("V", "\u0FAD"); - - superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7")); - subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7")); - superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7")); - subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7")); - superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7")); - subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7")); - superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7")); - subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7")); - superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5")); - subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5")); - superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7")); - subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7")); - superACIP2unicode.put("K", "\u0F40"); - subACIP2unicode.put("K", "\u0F90"); - superACIP2unicode.put("KH", "\u0F41"); - subACIP2unicode.put("KH", "\u0F91"); - superACIP2unicode.put("G", "\u0F42"); - subACIP2unicode.put("G", "\u0F92"); - superACIP2unicode.put("NG", "\u0F44"); - subACIP2unicode.put("NG", "\u0F94"); - superACIP2unicode.put("C", "\u0F45"); - subACIP2unicode.put("C", "\u0F95"); - superACIP2unicode.put("CH", "\u0F46"); - subACIP2unicode.put("CH", "\u0F96"); - superACIP2unicode.put("J", "\u0F47"); - subACIP2unicode.put("J", "\u0F97"); - superACIP2unicode.put("NY", "\u0F49"); - subACIP2unicode.put("NY", "\u0F99"); - superACIP2unicode.put("T", "\u0F4F"); - subACIP2unicode.put("T", "\u0F9F"); - superACIP2unicode.put("TH", "\u0F50"); - subACIP2unicode.put("TH", "\u0FA0"); - superACIP2unicode.put("D", "\u0F51"); - subACIP2unicode.put("D", "\u0FA1"); - superACIP2unicode.put("N", "\u0F53"); - subACIP2unicode.put("N", "\u0FA3"); - superACIP2unicode.put("P", "\u0F54"); - subACIP2unicode.put("P", "\u0FA4"); - superACIP2unicode.put("PH", "\u0F55"); - subACIP2unicode.put("PH", "\u0FA5"); - superACIP2unicode.put("B", "\u0F56"); - subACIP2unicode.put("B", "\u0FA6"); - superACIP2unicode.put("M", "\u0F58"); - subACIP2unicode.put("M", "\u0FA8"); - superACIP2unicode.put("TZ", "\u0F59"); - subACIP2unicode.put("TZ", "\u0FA9"); - superACIP2unicode.put("TS", "\u0F5A"); - subACIP2unicode.put("TS", "\u0FAA"); - superACIP2unicode.put("DZ", "\u0F5B"); - subACIP2unicode.put("DZ", "\u0FAB"); - superACIP2unicode.put("W", "\u0F5D"); - subACIP2unicode.put("W", "\u0FBA"); // oddball - superACIP2unicode.put("ZH", "\u0F5E"); - subACIP2unicode.put("ZH", "\u0FAE"); - superACIP2unicode.put("Z", "\u0F5F"); - subACIP2unicode.put("Z", "\u0FAF"); - superACIP2unicode.put("'", "\u0F60"); - subACIP2unicode.put("'", "\u0FB0"); - superACIP2unicode.put("Y", "\u0F61"); - subACIP2unicode.put("Y", "\u0FB1"); - superACIP2unicode.put("R", "\u0F62"); - subACIP2unicode.put("R", "\u0FB2"); - superACIP2unicode.put("L", "\u0F63"); - subACIP2unicode.put("L", "\u0FB3"); - superACIP2unicode.put("SH", "\u0F64"); - subACIP2unicode.put("SH", "\u0FB4"); - superACIP2unicode.put("S", "\u0F66"); - subACIP2unicode.put("S", "\u0FB6"); - superACIP2unicode.put("H", "\u0F67"); - subACIP2unicode.put("H", "\u0FB7"); - superACIP2unicode.put("A", "\u0F68"); - subACIP2unicode.put("A", "\u0FB8"); - superACIP2unicode.put("t", "\u0F4A"); - subACIP2unicode.put("t", "\u0F9A"); - superACIP2unicode.put("th", "\u0F4B"); - subACIP2unicode.put("th", "\u0F9B"); - superACIP2unicode.put("d", "\u0F4C"); - subACIP2unicode.put("d", "\u0F9C"); - superACIP2unicode.put("n", "\u0F4E"); - subACIP2unicode.put("n", "\u0F9E"); - superACIP2unicode.put("sh", "\u0F65"); - subACIP2unicode.put("sh", "\u0FB5"); - - superACIP2unicode.put("I", "\u0F72"); - superACIP2unicode.put("E", "\u0F7A"); - superACIP2unicode.put("O", "\u0F7C"); - superACIP2unicode.put("U", "\u0F74"); - superACIP2unicode.put("OO", "\u0F7D"); - superACIP2unicode.put("EE", "\u0F7B"); - superACIP2unicode.put("i", "\u0F80"); - superACIP2unicode.put("'A", "\u0F71"); - superACIP2unicode.put("'I", "\u0F71\u0F72"); - superACIP2unicode.put("'E", "\u0F71\u0F7A"); - superACIP2unicode.put("'O", "\u0F71\u0F7C"); - superACIP2unicode.put("'U", "\u0F71\u0F74"); - superACIP2unicode.put("'OO", "\u0F71\u0F7D"); - superACIP2unicode.put("'EE", "\u0F71\u0F7B"); - superACIP2unicode.put("'i", "\u0F71\u0F80"); - - superACIP2unicode.put("Im", "\u0F72\u0F7E"); - superACIP2unicode.put("Em", "\u0F7A\u0F7E"); - superACIP2unicode.put("Om", "\u0F7C\u0F7E"); - superACIP2unicode.put("Um", "\u0F74\u0F7E"); - superACIP2unicode.put("OOm", "\u0F7D\u0F7E"); - superACIP2unicode.put("EEm", "\u0F7B\u0F7E"); - superACIP2unicode.put("im", "\u0F80\u0F7E"); - superACIP2unicode.put("'Am", "\u0F71\u0F7E"); - superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E"); - superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E"); - superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E"); - superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E"); - superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E"); - superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E"); - superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E"); - - superACIP2unicode.put("I:", "\u0F72\u0F7F"); - superACIP2unicode.put("E:", "\u0F7A\u0F7F"); - superACIP2unicode.put("O:", "\u0F7C\u0F7F"); - superACIP2unicode.put("U:", "\u0F74\u0F7F"); - superACIP2unicode.put("OO:", "\u0F7D\u0F7F"); - superACIP2unicode.put("EE:", "\u0F7B\u0F7F"); - superACIP2unicode.put("i:", "\u0F80\u0F7F"); - superACIP2unicode.put("'A:", "\u0F71\u0F7F"); - superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F"); - superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F"); - superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F"); - superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F"); - superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F"); - superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F"); - superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F"); - - superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F"); - superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F"); - superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F"); - superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F"); - superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F"); - superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F"); - superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F"); - superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F"); - superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F"); - superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F"); - superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F"); - superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F"); - superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F"); - superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F"); - superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F"); - // :m does not appear, though you'd think it's as valid as m:. - - superACIP2unicode.put("m", "\u0F7E"); - superACIP2unicode.put(":", "\u0F7F"); - superACIP2unicode.put("m:", "\u0F7E\u0F7F"); - - superACIP2unicode.put("Am", "\u0F7E"); - superACIP2unicode.put("A:", "\u0F7F"); - superACIP2unicode.put("Am:", "\u0F7E\u0F7F"); - - superACIP2unicode.put("0", "\u0F20"); - superACIP2unicode.put("1", "\u0F21"); - superACIP2unicode.put("2", "\u0F22"); - superACIP2unicode.put("3", "\u0F23"); - superACIP2unicode.put("4", "\u0F24"); - superACIP2unicode.put("5", "\u0F25"); - superACIP2unicode.put("6", "\u0F26"); - superACIP2unicode.put("7", "\u0F27"); - superACIP2unicode.put("8", "\u0F28"); - superACIP2unicode.put("9", "\u0F29"); - - // punctuation - superACIP2unicode.put("&", "\u0F85"); - superACIP2unicode.put(",", "\u0F0D"); - superACIP2unicode.put(" ", "\u0F0B"); - superACIP2unicode.put(".", "\u0F0C"); - superACIP2unicode.put("`", "\u0F08"); - superACIP2unicode.put("`", "\u0F08"); - superACIP2unicode.put("*", "\u0F04\u0F05"); - superACIP2unicode.put("#", "\u0F04\u0F05\u0F05"); - superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn. - superACIP2unicode.put("o", "\u0F37"); - superACIP2unicode.put(";", "\u0F11"); - superACIP2unicode.put("\r", "\r"); - superACIP2unicode.put("\t", "\t"); - superACIP2unicode.put("\r\n", "\r\n"); - superACIP2unicode.put("\n", "\n"); - superACIP2unicode.put("\\", "\u0F84"); - superACIP2unicode.put("^", "\u0F38"); - - // DLC FIXME: "^ GONG" is "^GONG", right? - // DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode? - } - if (subscribed) { - String u = (String)subACIP2unicode.get(acip); - if (null != u) return u; - } - return (String)superACIP2unicode.get(acip); - } - - - - /** Gets the duffcodes for vowel, such that they look good with - * the stack with hash key hashKey, and appends them to r. */ - static void getDuffForACIPVowel(ArrayList duff, DuffCode preceding, String vowel) { - if (null == vowel) return; - if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert. - throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly."); - - // Order matters here. - boolean context_added[] = new boolean[] { false }; - if (vowel.startsWith("A")) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added); - } else if (vowel.indexOf("'U") >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added); - } else if (vowel.indexOf("'I") >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added); - } else { - if (vowel.indexOf('\'') >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added); - } - if (vowel.indexOf("EE") >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added); - } else if (vowel.indexOf('E') >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added); - } - if (vowel.indexOf("OO") >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added); - } else if (vowel.indexOf('O') >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added); - } - if (vowel.indexOf('I') >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added); - } - if (vowel.indexOf('U') >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added); - } - if (vowel.indexOf('i') >= 0) { - TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added); - } - } - // FIXME: Use TMW9.61, the "o'i" special combination, when appropriate. - - if (vowel.indexOf('m') >= 0) { - DuffCode last = (DuffCode)duff.get(duff.size() - 1); - duff.remove(duff.size() - 1); // getBindu will add it back... - TibTextUtils.getBindu(duff, last); - } - if (vowel.indexOf(':') >= 0) - duff.add(TibetanMachineWeb.getGlyph("H")); - } - - /** Returns true if and only if l is the ACIP representation of a - letter that can be a suffix. Note that all postsuffixes are - also suffixes. l must not have an "A" -- use "S", not "SA", - that is. */ - public static boolean isACIPSuffix(String l) { - return ("S".equals(l) - || "G".equals(l) - || "D".equals(l) - || "M".equals(l) - || "'".equals(l) - || "B".equals(l) - || "NG".equals(l) - || "N".equals(l) - || "L".equals(l) - || "R".equals(l)); - } - - /** Returns true if and only if l is the ACIP representation of a - letter that can be a prefix. l must not have an "A" -- use - "D", not "DA", that is. */ - public static boolean isACIPPrefix(String l) { - return ("'".equals(l) - || "M".equals(l) - || "B".equals(l) - || "D".equals(l) - || "G".equals(l)); - } - - /** Returns true if and only if l is the ACIP representation of a - letter that can be a postsuffix. l must not have an "A" -- - use "D", not "DA", that is. */ - public static boolean isACIPPostsuffix(String l) { - return ("S".equals(l) - || "D".equals(l)); - } -} diff --git a/source/org/thdl/tib/text/ttt/ACIPTraits.java b/source/org/thdl/tib/text/ttt/ACIPTraits.java index 036b197..dd4abec 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTraits.java +++ b/source/org/thdl/tib/text/ttt/ACIPTraits.java @@ -18,11 +18,25 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; +import java.util.HashSet; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.StringTokenizer; +import java.util.List; + +import org.thdl.util.ThdlOptions; +import org.thdl.tib.text.DuffCode; +import org.thdl.tib.text.THDLWylieConstants; +import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.tib.text.TibTextUtils; + + /** A singleton class that should contain (but due to laziness and * ignorance probably does not contain) all the traits that make ACIP - * transliteration different from other (say, EWTS) - * transliterations. */ -final class ACIPTraits implements TTraits { + * transliteration scheme different from other (say, EWTS) + * transliteration schemes. This is not safe to use in concurrent + * programs but it would be easy to make it so. */ +public final class ACIPTraits implements TTraits { /** sole instance of this class */ private static ACIPTraits singleton = null; @@ -30,7 +44,7 @@ final class ACIPTraits implements TTraits { private ACIPTraits() { } /** Returns the singleton instance of this class. */ - public static ACIPTraits instance() { + public static /* synchronized */ ACIPTraits instance() { if (null == singleton) { singleton = new ACIPTraits(); } @@ -43,15 +57,536 @@ final class ACIPTraits implements TTraits { /** Returns '-'. */ public char disambiguatorChar() { return '-'; } - public int maxConsonantLength() { return ACIPRules.MAX_CONSONANT_LENGTH; } + public int maxConsonantLength() { return MAX_CONSONANT_LENGTH; } - public int maxWowelLength() { return ACIPRules.MAX_WOWEL_LENGTH; } - - public boolean isConsonant(String s) { return ACIPRules.isConsonant(s); } - - public boolean isWowel(String s) { return ACIPRules.isWowel(s); } + public int maxWowelLength() { return MAX_WOWEL_LENGTH; } public boolean hasSimpleError(TPair p) { return ("A".equals(p.getLeft()) && null == p.getRight()); } + + public String aVowel() { return "A"; } + + public boolean isPostsuffix(String l) { + return ("S".equals(l) + || "D".equals(l)); + } + + public boolean isSuffix(String l) { + return ("S".equals(l) + || "G".equals(l) + || "D".equals(l) + || "M".equals(l) + || "'".equals(l) + || "B".equals(l) + || "NG".equals(l) + || "N".equals(l) + || "L".equals(l) + || "R".equals(l)); + } + + public boolean isPrefix(String l) { + return ("'".equals(l) + || "M".equals(l) + || "B".equals(l) + || "D".equals(l) + || "G".equals(l)); + } + + private HashMap superACIP2unicode = null; + private HashMap subACIP2unicode = null; + public /* synchronized */ String getUnicodeFor(String acip, boolean subscribed) { + if (superACIP2unicode == null) { + final boolean compactUnicode + = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera"); + superACIP2unicode = new HashMap(144); + subACIP2unicode = new HashMap(42); + + // oddball: + subACIP2unicode.put("V", "\u0FAD"); + + superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7")); + subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7")); + superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7")); + subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7")); + superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7")); + subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7")); + superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7")); + subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7")); + superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5")); + subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5")); + superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7")); + subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7")); + superACIP2unicode.put("K", "\u0F40"); + subACIP2unicode.put("K", "\u0F90"); + superACIP2unicode.put("KH", "\u0F41"); + subACIP2unicode.put("KH", "\u0F91"); + superACIP2unicode.put("G", "\u0F42"); + subACIP2unicode.put("G", "\u0F92"); + superACIP2unicode.put("NG", "\u0F44"); + subACIP2unicode.put("NG", "\u0F94"); + superACIP2unicode.put("C", "\u0F45"); + subACIP2unicode.put("C", "\u0F95"); + superACIP2unicode.put("CH", "\u0F46"); + subACIP2unicode.put("CH", "\u0F96"); + superACIP2unicode.put("J", "\u0F47"); + subACIP2unicode.put("J", "\u0F97"); + superACIP2unicode.put("NY", "\u0F49"); + subACIP2unicode.put("NY", "\u0F99"); + superACIP2unicode.put("T", "\u0F4F"); + subACIP2unicode.put("T", "\u0F9F"); + superACIP2unicode.put("TH", "\u0F50"); + subACIP2unicode.put("TH", "\u0FA0"); + superACIP2unicode.put("D", "\u0F51"); + subACIP2unicode.put("D", "\u0FA1"); + superACIP2unicode.put("N", "\u0F53"); + subACIP2unicode.put("N", "\u0FA3"); + superACIP2unicode.put("P", "\u0F54"); + subACIP2unicode.put("P", "\u0FA4"); + superACIP2unicode.put("PH", "\u0F55"); + subACIP2unicode.put("PH", "\u0FA5"); + superACIP2unicode.put("B", "\u0F56"); + subACIP2unicode.put("B", "\u0FA6"); + superACIP2unicode.put("M", "\u0F58"); + subACIP2unicode.put("M", "\u0FA8"); + superACIP2unicode.put("TZ", "\u0F59"); + subACIP2unicode.put("TZ", "\u0FA9"); + superACIP2unicode.put("TS", "\u0F5A"); + subACIP2unicode.put("TS", "\u0FAA"); + superACIP2unicode.put("DZ", "\u0F5B"); + subACIP2unicode.put("DZ", "\u0FAB"); + superACIP2unicode.put("W", "\u0F5D"); + subACIP2unicode.put("W", "\u0FBA"); // oddball + superACIP2unicode.put("ZH", "\u0F5E"); + subACIP2unicode.put("ZH", "\u0FAE"); + superACIP2unicode.put("Z", "\u0F5F"); + subACIP2unicode.put("Z", "\u0FAF"); + superACIP2unicode.put("'", "\u0F60"); + subACIP2unicode.put("'", "\u0FB0"); + superACIP2unicode.put("Y", "\u0F61"); + subACIP2unicode.put("Y", "\u0FB1"); + superACIP2unicode.put("R", "\u0F62"); + subACIP2unicode.put("R", "\u0FB2"); + superACIP2unicode.put("L", "\u0F63"); + subACIP2unicode.put("L", "\u0FB3"); + superACIP2unicode.put("SH", "\u0F64"); + subACIP2unicode.put("SH", "\u0FB4"); + superACIP2unicode.put("S", "\u0F66"); + subACIP2unicode.put("S", "\u0FB6"); + superACIP2unicode.put("H", "\u0F67"); + subACIP2unicode.put("H", "\u0FB7"); + superACIP2unicode.put("A", "\u0F68"); + subACIP2unicode.put("A", "\u0FB8"); + superACIP2unicode.put("t", "\u0F4A"); + subACIP2unicode.put("t", "\u0F9A"); + superACIP2unicode.put("th", "\u0F4B"); + subACIP2unicode.put("th", "\u0F9B"); + superACIP2unicode.put("d", "\u0F4C"); + subACIP2unicode.put("d", "\u0F9C"); + superACIP2unicode.put("n", "\u0F4E"); + subACIP2unicode.put("n", "\u0F9E"); + superACIP2unicode.put("sh", "\u0F65"); + subACIP2unicode.put("sh", "\u0FB5"); + + superACIP2unicode.put("I", "\u0F72"); + superACIP2unicode.put("E", "\u0F7A"); + superACIP2unicode.put("O", "\u0F7C"); + superACIP2unicode.put("U", "\u0F74"); + superACIP2unicode.put("OO", "\u0F7D"); + superACIP2unicode.put("EE", "\u0F7B"); + superACIP2unicode.put("i", "\u0F80"); + superACIP2unicode.put("'A", "\u0F71"); + superACIP2unicode.put("'I", "\u0F71\u0F72"); + superACIP2unicode.put("'E", "\u0F71\u0F7A"); + superACIP2unicode.put("'O", "\u0F71\u0F7C"); + superACIP2unicode.put("'U", "\u0F71\u0F74"); + superACIP2unicode.put("'OO", "\u0F71\u0F7D"); + superACIP2unicode.put("'EE", "\u0F71\u0F7B"); + superACIP2unicode.put("'i", "\u0F71\u0F80"); + + superACIP2unicode.put("Im", "\u0F72\u0F7E"); + superACIP2unicode.put("Em", "\u0F7A\u0F7E"); + superACIP2unicode.put("Om", "\u0F7C\u0F7E"); + superACIP2unicode.put("Um", "\u0F74\u0F7E"); + superACIP2unicode.put("OOm", "\u0F7D\u0F7E"); + superACIP2unicode.put("EEm", "\u0F7B\u0F7E"); + superACIP2unicode.put("im", "\u0F80\u0F7E"); + superACIP2unicode.put("'Am", "\u0F71\u0F7E"); + superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E"); + superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E"); + superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E"); + superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E"); + superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E"); + superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E"); + superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E"); + + superACIP2unicode.put("I:", "\u0F72\u0F7F"); + superACIP2unicode.put("E:", "\u0F7A\u0F7F"); + superACIP2unicode.put("O:", "\u0F7C\u0F7F"); + superACIP2unicode.put("U:", "\u0F74\u0F7F"); + superACIP2unicode.put("OO:", "\u0F7D\u0F7F"); + superACIP2unicode.put("EE:", "\u0F7B\u0F7F"); + superACIP2unicode.put("i:", "\u0F80\u0F7F"); + superACIP2unicode.put("'A:", "\u0F71\u0F7F"); + superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F"); + superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F"); + superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F"); + superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F"); + superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F"); + superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F"); + superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F"); + + superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F"); + superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F"); + superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F"); + superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F"); + superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F"); + superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F"); + superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F"); + superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F"); + superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F"); + superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F"); + superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F"); + superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F"); + superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F"); + superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F"); + superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F"); + // :m does not appear, though you'd think it's as valid as m:. + + superACIP2unicode.put("m", "\u0F7E"); + superACIP2unicode.put(":", "\u0F7F"); + superACIP2unicode.put("m:", "\u0F7E\u0F7F"); + + superACIP2unicode.put("Am", "\u0F7E"); + superACIP2unicode.put("A:", "\u0F7F"); + superACIP2unicode.put("Am:", "\u0F7E\u0F7F"); + + superACIP2unicode.put("0", "\u0F20"); + superACIP2unicode.put("1", "\u0F21"); + superACIP2unicode.put("2", "\u0F22"); + superACIP2unicode.put("3", "\u0F23"); + superACIP2unicode.put("4", "\u0F24"); + superACIP2unicode.put("5", "\u0F25"); + superACIP2unicode.put("6", "\u0F26"); + superACIP2unicode.put("7", "\u0F27"); + superACIP2unicode.put("8", "\u0F28"); + superACIP2unicode.put("9", "\u0F29"); + + // punctuation + superACIP2unicode.put("&", "\u0F85"); + superACIP2unicode.put(",", "\u0F0D"); + superACIP2unicode.put(" ", "\u0F0B"); + superACIP2unicode.put(".", "\u0F0C"); + superACIP2unicode.put("`", "\u0F08"); + superACIP2unicode.put("`", "\u0F08"); + superACIP2unicode.put("*", "\u0F04\u0F05"); + superACIP2unicode.put("#", "\u0F04\u0F05\u0F05"); + superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn. + superACIP2unicode.put("o", "\u0F37"); + superACIP2unicode.put(";", "\u0F11"); + superACIP2unicode.put("\r", "\r"); + superACIP2unicode.put("\t", "\t"); + superACIP2unicode.put("\r\n", "\r\n"); + superACIP2unicode.put("\n", "\n"); + superACIP2unicode.put("\\", "\u0F84"); + superACIP2unicode.put("^", "\u0F38"); + + // DLC FIXME: "^ GONG" is "^GONG", right? + // DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode? + } + if (subscribed) { + String u = (String)subACIP2unicode.get(acip); + if (null != u) return u; + } + return (String)superACIP2unicode.get(acip); + } + + private HashMap acipOther2wylie = null; + public /* synchronized */ String getEwtsForOther(String acip) { + if (acipOther2wylie == null) { + acipOther2wylie = new HashMap(20); + + // don't use putMapping for this. We don't want TMW->ACIP + // to produce "." for a U+0F0C because ACIP doesn't say + // that "." means U+0F0C. It just seems to in practice + // for ACIP Release IV texts. + acipOther2wylie.put(".", "*"); + + putMapping(acipOther2wylie, "m", "M"); + putMapping(acipOther2wylie, ":", "H"); + putMapping(acipOther2wylie, ",", "/"); + putMapping(acipOther2wylie, " ", " "); + putMapping(acipOther2wylie, ";", "|"); + putMapping(acipOther2wylie, "`", "!"); + putMapping(acipOther2wylie, "*", "@#"); + // There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##"); + putMapping(acipOther2wylie, "%", "~X"); + putMapping(acipOther2wylie, "o", "X"); + putMapping(acipOther2wylie, "&", "&"); + putMapping(acipOther2wylie, "^", "\\u0F38"); + + putMapping(acipOther2wylie, "0", "0"); + putMapping(acipOther2wylie, "1", "1"); + putMapping(acipOther2wylie, "2", "2"); + putMapping(acipOther2wylie, "3", "3"); + putMapping(acipOther2wylie, "4", "4"); + putMapping(acipOther2wylie, "5", "5"); + putMapping(acipOther2wylie, "6", "6"); + putMapping(acipOther2wylie, "7", "7"); + putMapping(acipOther2wylie, "8", "8"); + putMapping(acipOther2wylie, "9", "9"); + } + return (String)acipOther2wylie.get(acip); + } + + public TTshegBarScanner scanner() { return ACIPTshegBarScanner.instance(); } + + /** Registers acip->wylie mappings in toWylie; registers + wylie->acip mappings in {@link #wylieToACIP}. */ + private /* synchronized */ void putMapping(HashMap toWylie, String ACIP, String EWTS) { + toWylie.put(ACIP, EWTS); + if (null == wylieToACIP) { + wylieToACIP = new HashMap(75); + + // We don't want to put "/" in toWylie: + wylieToACIP.put("(", "/"); + wylieToACIP.put(")", "/"); + wylieToACIP.put("?", "\\"); + + wylieToACIP.put("_", " "); // oddball. + wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61. + } + wylieToACIP.put(EWTS, ACIP); + } + + /** A map from EWTS to ACIP. Note that the EWTS "w" maps to both + "V" and "W" in reality but this map will only give one or the + other. */ + private HashMap wylieToACIP = null; + /** Returns the ACIP transliteration corresponding to the THDL + Extended Wylie atom EWTS, or null if EWTS is not + recognized. */ + public String getACIPForEWTS(String EWTS) { + getEwtsForConsonant(null); // inits wylieToACIP + getEwtsForOther(null); // inits wylieToACIP + getEwtsForWowel(null); // inits wylieToACIP + String ans = (String)wylieToACIP.get(EWTS); + boolean useCapitalW = false; + if (EWTS.startsWith("w")) + useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA. + if (null == ans) { + StringBuffer finalAns = new StringBuffer(EWTS.length()); + StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true); + while (sTok.hasMoreTokens()) { + String part, tok = sTok.nextToken(); + if (tok.equals("-") || tok.equals("+")) + part = tok; + else { + if ("w".equals(tok)) { + // There are only two stacks in TMW that have + // U+0FBA: R+Wa and w+Wa. TMW->ACIP fails for + // these unless we handle it here. (FIXME: + // add an automated test for this). + if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) { + part = "W"; + } else { + part = "V"; + } + } else { + part = (String)wylieToACIP.get(tok); + } + } + if (null == part) return null; + finalAns.append(part); + } + if (useCapitalW) + finalAns.setCharAt(0, 'W'); + return finalAns.toString(); + } + if (useCapitalW) + return "W" + ans.substring(1); + else + return ans; + } + + private HashMap acipConsonant2wylie = null; + /** Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y", + * even though sometimes the EWTS for those is "w", "R", or "Y". + * Handle that in the caller. */ + public /* synchronized */ String getEwtsForConsonant(String acip) { + if (acipConsonant2wylie == null) { + acipConsonant2wylie = new HashMap(37); + + // oddball: + putMapping(acipConsonant2wylie, "V", "w"); + + // more oddballs: + putMapping(acipConsonant2wylie, "DH", "d+h"); + putMapping(acipConsonant2wylie, "BH", "b+h"); + putMapping(acipConsonant2wylie, "dH", "D+h"); + putMapping(acipConsonant2wylie, "DZH", "dz+h"); // longest, MAX_CONSONANT_LENGTH characters + putMapping(acipConsonant2wylie, "Ksh", "k+Sh"); // longest, MAX_CONSONANT_LENGTH characters + putMapping(acipConsonant2wylie, "GH", "g+h"); + + + putMapping(acipConsonant2wylie, "K", "k"); + putMapping(acipConsonant2wylie, "KH", "kh"); + putMapping(acipConsonant2wylie, "G", "g"); + putMapping(acipConsonant2wylie, "NG", "ng"); + putMapping(acipConsonant2wylie, "C", "c"); + putMapping(acipConsonant2wylie, "CH", "ch"); + putMapping(acipConsonant2wylie, "J", "j"); + putMapping(acipConsonant2wylie, "NY", "ny"); + putMapping(acipConsonant2wylie, "T", "t"); + putMapping(acipConsonant2wylie, "TH", "th"); + putMapping(acipConsonant2wylie, "D", "d"); + putMapping(acipConsonant2wylie, "N", "n"); + putMapping(acipConsonant2wylie, "P", "p"); + putMapping(acipConsonant2wylie, "PH", "ph"); + putMapping(acipConsonant2wylie, "B", "b"); + putMapping(acipConsonant2wylie, "M", "m"); + putMapping(acipConsonant2wylie, "TZ", "ts"); + putMapping(acipConsonant2wylie, "TS", "tsh"); + putMapping(acipConsonant2wylie, "DZ", "dz"); + putMapping(acipConsonant2wylie, "W", "W" + /* NOTE WELL: sometimes "w", sometimes "W". + Handle this in the caller. + + Reasoning for "W" instead of "w": r-w and + r+w are both known hash keys. We sort 'em + out this way. (They are the only things + like this according to bug report #800166.) */ + ); + putMapping(acipConsonant2wylie, "ZH", "zh"); + putMapping(acipConsonant2wylie, "Z", "z"); + putMapping(acipConsonant2wylie, "'", "'"); + putMapping(acipConsonant2wylie, "Y", "y"); + putMapping(acipConsonant2wylie, "R", "r"); + putMapping(acipConsonant2wylie, "L", "l"); + putMapping(acipConsonant2wylie, "SH", "sh"); + putMapping(acipConsonant2wylie, "S", "s"); + putMapping(acipConsonant2wylie, "H", "h"); + putMapping(acipConsonant2wylie, "A", "a"); + putMapping(acipConsonant2wylie, "t", "T"); + putMapping(acipConsonant2wylie, "th", "Th"); + putMapping(acipConsonant2wylie, "d", "D"); + putMapping(acipConsonant2wylie, "n", "N"); + putMapping(acipConsonant2wylie, "sh", "Sh"); + } + return (String)acipConsonant2wylie.get(acip); + } + + private HashMap acipWowel2wylie = null; + public /* synchronized */ String getEwtsForWowel(String acip) { + if (acipWowel2wylie == null) { + acipWowel2wylie = new HashMap(baseVowels.length * 4); + + for (int i = 0; i < baseVowels.length; i++) { + putMapping(acipWowel2wylie, baseVowels[i][0], baseVowels[i][1]); + putMapping(acipWowel2wylie, '\'' + baseVowels[i][0], baseVowels[i][2]); + putMapping(acipWowel2wylie, baseVowels[i][0] + 'm', baseVowels[i][1] + 'M'); + putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + 'm', baseVowels[i][2] + 'M'); + putMapping(acipWowel2wylie, baseVowels[i][0] + ':', baseVowels[i][1] + 'H'); + putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + ':', baseVowels[i][2] + 'H'); + putMapping(acipWowel2wylie, baseVowels[i][0] + "m:", baseVowels[i][1] + "MH"); + putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + "m:", baseVowels[i][2] + "MH"); + } + // {Pm} is treated just like {PAm}; {P:} is treated just + // like {PA:}; {Pm:} is treated just like {PAm:}. But + // that happens thanks to + // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]). + + // Keep this code in sync with getUnicodeFor. + } + return (String)acipWowel2wylie.get(acip); + } + + /** {Ksh}, the longest consonant, has 3 characters, so this is + * three. */ + private static int MAX_CONSONANT_LENGTH = 3; + + /** {'EEm:}, the longest wowel, has 5 characters, so this is + * five. */ + private static int MAX_WOWEL_LENGTH = 5; + + private static String[][] baseVowels = new String[][] { + // { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel + // numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.) + // for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]} + { "A", "a", "A" }, + { "I", "i", "I" }, + { "U", "u", "U" }, + { "E", "e", "Ae" }, + { "O", "o", "Ao" }, + { "EE", "ai", "Aai" }, + { "OO", "au", "Aau" }, + { "i", "-i", "A-i" } + }; + + /** Returns true if and only if s is an ACIP wowel. You can't + * just call this any time -- A is both a consonant and a vowel + * in ACIP, so you have to call this in the right context. */ + public boolean isWowel(String s) { + // I'm on my own with 'O and 'E and 'OO and 'EE, but GANG'O + // appears and I wonder... so here they are. It's consistent + // with 'I and 'A and 'U, at least: all the vowels may appear + // as K'vowel. DLC FIXME: ask. + return (null != getEwtsForWowel(s)); + } + + /** Returns true if and only if s is an ACIP consonant. */ + public boolean isConsonant(String s) { + return (null != getEwtsForConsonant(s)); + } + + /** Gets the duffcodes for wowel, such that they look good with + * the preceding glyph, and appends them to duff. */ + public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) { + if (null == wowel) return; + if (null == getEwtsForWowel(wowel)) // FIXME: expensive assertion! Use assert. + throw new IllegalArgumentException("Wowel " + wowel + " isn't in the small set of wowels we handle correctly."); + + // Order matters here. + boolean context_added[] = new boolean[] { false }; + if (wowel.startsWith("A")) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added); + } else if (wowel.indexOf("'U") >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added); + } else if (wowel.indexOf("'I") >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added); + } else { + if (wowel.indexOf('\'') >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added); + } + if (wowel.indexOf("EE") >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added); + } else if (wowel.indexOf('E') >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added); + } + if (wowel.indexOf("OO") >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added); + } else if (wowel.indexOf('O') >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added); + } + if (wowel.indexOf('I') >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added); + } + if (wowel.indexOf('U') >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added); + } + if (wowel.indexOf('i') >= 0) { + TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added); + } + } + // FIXME: Use TMW9.61, the "o'i" special combination, when appropriate. + + if (wowel.indexOf('m') >= 0) { + DuffCode last = (DuffCode)duff.get(duff.size() - 1); + duff.remove(duff.size() - 1); // getBindu will add it back... + TibTextUtils.getBindu(duff, last); + } + if (wowel.indexOf(':') >= 0) + duff.add(TibetanMachineWeb.getGlyph(getEwtsForOther(":"))); + } } + diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java index bb6eb74..9a750c5 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java @@ -18,11 +18,10 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import java.io.*; +import java.io.IOException; import java.util.ArrayList; import java.util.Stack; -import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; /** @@ -36,8 +35,10 @@ import org.thdl.util.ThdlOptions; * the parser, not here in the lexical analyzer. That'd be cleaner, * and more like how you'd do things if you used lex and yacc. * +* This is not public because you should use {@link ACIPTraits#scanner()}. +* * @author David Chandler */ -public class ACIPTshegBarScanner extends TTshegBarScanner { +class ACIPTshegBarScanner extends TTshegBarScanner { /** True if those ACIP snippets inside square brackets (e.g., "[THIS]") are to be passed through into the output unmodified while retaining the brackets and if those ACIP snippets inside diff --git a/source/org/thdl/tib/text/ttt/EWTSTraits.java b/source/org/thdl/tib/text/ttt/EWTSTraits.java index 7027622..bfef618 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTraits.java +++ b/source/org/thdl/tib/text/ttt/EWTSTraits.java @@ -18,11 +18,14 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; +import java.util.ArrayList; +import org.thdl.tib.text.DuffCode; + /** A singleton class that should contain (but due to laziness and * ignorance probably does not contain) all the traits that make EWTS * transliteration different from other (say, ACIP) transliteration * schemes. */ -final class EWTSTraits implements TTraits { +public final class EWTSTraits implements TTraits { /** sole instance of this class */ private static EWTSTraits singleton = null; @@ -30,7 +33,7 @@ final class EWTSTraits implements TTraits { private EWTSTraits() { } /** */ - public static EWTSTraits instance() { + public static synchronized EWTSTraits instance() { if (null == singleton) { singleton = new EWTSTraits(); } @@ -79,4 +82,48 @@ final class EWTSTraits implements TTraits { || "H".equals(s) || "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:??? } + + public String aVowel() { return "a"; } + + public boolean isPostsuffix(String s) { + return ("s".equals(s) || "d".equals(s)); + } + + public boolean isPrefix(String l) { + return ("'".equals(l) + || "m".equals(l) + || "b".equals(l) + || "d".equals(l) + || "g".equals(l)); + } + + public boolean isSuffix(String l) { + return ("s".equals(l) + || "g".equals(l) + || "d".equals(l) + || "m".equals(l) + || "'".equals(l) + || "b".equals(l) + || "ng".equals(l) + || "n".equals(l) + || "l".equals(l) + || "r".equals(l)); + } + + /** Returns l, since this is EWTS's traits class. */ + public String getEwtsForConsonant(String l) { return l; } + + /** Returns l, since this is EWTS's traits class. */ + public String getEwtsForOther(String l) { return l; } + + /** Returns l, since this is EWTS's traits class. */ + public String getEwtsForWowel(String l) { return l; } + + public TTshegBarScanner scanner() { return EWTSTshegBarScanner.instance(); } + + public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) { + throw new Error("TODO(DLC)[EWTS->Tibetan]"); + } + + public String getUnicodeFor(String l, boolean subscribed) { throw new Error("TODO(DLC)[EWTS->Tibetan]"); } } diff --git a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java new file mode 100644 index 0000000..7315675 --- /dev/null +++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java @@ -0,0 +1,56 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.ttt; + +import java.util.ArrayList; + +/** +* This singleton class is able to break up Strings of EWTS text (for +* example, an entire sutra file) into tsheg bars, comments, etc. +* Non-Tibetan parts are segregated (so that consumers can ensure that +* they remain non-Tibetan), and Tibetan passages are broken up into +* tsheg bars. +* +* This is not public because you should use {@link EWTSTraits#scanner()}. +* +* @author David Chandler */ +class EWTSTshegBarScanner extends TTshegBarScanner { + /** See the comment in TTshegBarScanner. This does not find + errors and warnings that you'd think of a parser finding (DLC + DOES IT?). */ + public ArrayList scan(String s, StringBuffer errors, int maxErrors, + boolean shortMessages, String warningLevel) { + // the size depends on whether it's mostly Tibetan or mostly + // Latin and a number of other factors. This is meant to be + // an underestimate, but not too much of an underestimate. + ArrayList al = new ArrayList(s.length() / 10); + throw new Error("DLC unimplemented"); + } + + /** non-public because this is a singleton */ + protected EWTSTshegBarScanner() { } + private static EWTSTshegBarScanner singleton = null; + /** Returns the sole instance of this class. */ + public synchronized static EWTSTshegBarScanner instance() { + if (null == singleton) { + singleton = new EWTSTshegBarScanner(); + } + return singleton; + } +} diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index e8dde5b..eff8d50 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -202,15 +202,16 @@ public class PackageTest extends TestCase { message. */ static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) { StringBuffer errors = new StringBuffer(); - ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1, - false, "None"); + ArrayList al = ACIPTraits.instance().scanner().scan(ACIP, errors, -1, + false, "None"); if (null == al || errors.length() > 0) return null; org.thdl.tib.text.TibetanDocument tdoc = new org.thdl.tib.text.TibetanDocument(); int loc[] = new int[] { 0 }; try { - if (!TConverter.convertToTMW(al, + if (!TConverter.convertToTMW(ACIPTraits.instance(), + al, tdoc, null, null, @@ -7358,8 +7359,8 @@ tstHelper("ZUR"); private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) { StringBuffer errors = new StringBuffer(); - ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false, - warningLevel); + ArrayList al = ACIPTraits.instance().scanner().scan(s, errors, -1, false, + warningLevel); if (null != expectedScan) { if (!al.toString().equals(expectedScan)) { System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:"); @@ -7392,7 +7393,7 @@ tstHelper("ZUR"); /** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, int, boolean)}. */ - public void testScanner() { + public void testAcipScanner() { shelp("Pm KA", "", "[TIBETAN_NON_PUNCTUATION:{Pm}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{KA}]"); shelp("KA (KHA\nGA)", "", "[TIBETAN_NON_PUNCTUATION:{KA}, TIBETAN_PUNCTUATION:{ }, START_PAREN:{(}, TIBETAN_NON_PUNCTUATION:{KHA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GA}, END_PAREN:{)}]"); @@ -7682,7 +7683,8 @@ tstHelper("ZUR"); private static void uhelp(String acip, String expectedUnicode, String warningLevel, boolean shortMessages) { StringBuffer errors = new StringBuffer(); - String unicode = TConverter.convertToUnicodeText("ACIP", acip, errors, + String unicode = TConverter.convertToUnicodeText(ACIPTraits.instance(), + acip, errors, null, true, warningLevel, shortMessages); diff --git a/source/org/thdl/tib/text/ttt/TConverter.java b/source/org/thdl/tib/text/ttt/TConverter.java index 9bbe07f..bd889dc 100644 --- a/source/org/thdl/tib/text/ttt/TConverter.java +++ b/source/org/thdl/tib/text/ttt/TConverter.java @@ -69,10 +69,10 @@ public class TConverter { boolean shortMessages = false; String warningLevel = "Most"; ArrayList al - = ACIPTshegBarScanner.instance().scanFile(args[0], errors, - maxErrors - 1, - shortMessages, - warningLevel); + = ACIPTraits.instance().scanner().scanFile(args[0], errors, + maxErrors - 1, + shortMessages, + warningLevel); if (null == al) { System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this"); @@ -103,8 +103,9 @@ public class TConverter { warnings = new StringBuffer(); putWarningsInOutput = true; } - convertToTMW(al, System.out, errors, warnings, null, - putWarningsInOutput, warningLevel, shortMessages, colors); + convertToTMW(ACIPTraits.instance(), al, System.out, errors, warnings, + null, putWarningsInOutput, warningLevel, shortMessages, + colors); int retCode = 0; if (errors.length() > 0) { System.err.println("Errors converting ACIP input file: "); @@ -139,7 +140,8 @@ public class TConverter { * prefix rules in another * @throws IOException if we cannot write to out */ - public static boolean convertToTMW(ArrayList scan, + public static boolean convertToTMW(TTraits ttraits, + ArrayList scan, OutputStream out, StringBuffer errors, StringBuffer warnings, @@ -152,7 +154,8 @@ public class TConverter { { TibetanDocument tdoc = new TibetanDocument(); boolean rv - = convertToTMW(scan, tdoc, errors, warnings, hasWarnings, + = convertToTMW(ttraits, + scan, tdoc, errors, warnings, hasWarnings, writeWarningsToResult, warningLevel, shortMessages, colors, new int[] { tdoc.getLength() }); @@ -169,7 +172,8 @@ public class TConverter { offset from zero inside tdoc at which conversion results will be placed. On output, loc[0] is one past the offset of the last of the conversion results. */ - public static boolean convertToTMW(ArrayList scan, + public static boolean convertToTMW(TTraits ttraits, + ArrayList scan, TibetanDocument tdoc, StringBuffer errors, StringBuffer warnings, @@ -181,7 +185,8 @@ public class TConverter { int[] loc) throws IOException { - return convertTo(false, true, scan, null, tdoc, errors, warnings, + return convertTo(false, true, + ttraits, scan, null, tdoc, errors, warnings, hasWarnings, writeWarningsToResult, warningLevel, shortMessages, colors, loc, loc[0] == tdoc.getLength()); @@ -189,33 +194,30 @@ public class TConverter { /** Returns UTF-8 encoded Unicode. A bit indirect, so use this * for testing only if performance is a concern. If errors occur - * in scanning the ACIP or in converting a tsheg bar, then they - * are appended to errors if errors is non-null, as well as - * written to the result. If warnings occur in scanning the ACIP - * or in converting a tsheg bar, then they are appended to - * warnings if warnings is non-null, and they are written to the - * result if writeWarningsToResult is true. Error and warning - * messages are long and self-contained unless shortMessages is - * true. Returns the conversion upon perfect success or if there - * were merely warnings, null if errors occurred. */ - public static String convertToUnicodeText(String transliteration, - String acip, + * in scanning the transliteration or in converting a tsheg bar, + * then they are appended to errors if errors is non-null, as + * well as written to the result. If warnings occur in scanning + * the transliteration or in converting a tsheg bar, then they + * are appended to warnings if warnings is non-null, and they are + * written to the result if writeWarningsToResult is true. Error + * and warning messages are long and self-contained unless + * shortMessages is true. Returns the conversion upon perfect + * success or if there were merely warnings, null if errors + * occurred. */ + public static String convertToUnicodeText(TTraits ttraits, + String translit, StringBuffer errors, StringBuffer warnings, boolean writeWarningsToResult, String warningLevel, boolean shortMessages) { - if (transliteration != "ACIP") { - ThdlDebug.noteIffyCode(); - throw new IllegalArgumentException("Unsupported transliteration"); - } ByteArrayOutputStream sw = new ByteArrayOutputStream(); ArrayList al - = ACIPTshegBarScanner.instance().scan(acip, errors, -1, - shortMessages, warningLevel); + = ttraits.scanner().scan(translit, errors, -1, shortMessages, + warningLevel); try { if (null != al) { - convertToUnicodeText(al, sw, errors, + convertToUnicodeText(ttraits, al, sw, errors, warnings, null, writeWarningsToResult, warningLevel, shortMessages); return sw.toString("UTF-8"); @@ -236,7 +238,8 @@ public class TConverter { * writeWarningsToOut is true, then warnings also will be written * to out. * @return true upon perfect success, false if errors occurred. - * @param scan result of ACIPTshegBarScanner.scan(..) + * @param scan result of using ttraits.scanner() to break up the + * original string of transliteration * @param out stream to which to write converted text * @param errors if non-null, all error messages are appended * @param warnings if non-null, all warning messages appropriate @@ -246,9 +249,9 @@ public class TConverter { * false otherwise * @param writeWarningsToOut if true, then all warning messages * are written to out in the appropriate places - * @throws IOException if we cannot write to out - */ - public static boolean convertToUnicodeText(ArrayList scan, + * @throws IOException if we cannot write to out */ + public static boolean convertToUnicodeText(TTraits ttraits, + ArrayList scan, OutputStream out, StringBuffer errors, StringBuffer warnings, @@ -258,7 +261,8 @@ public class TConverter { boolean shortMessages) throws IOException { - return convertTo(true, false, scan, out, null, errors, warnings, + return convertTo(true, false, + ttraits, scan, out, null, errors, warnings, hasWarnings, writeWarningsToOut, warningLevel, shortMessages, false, new int[] { -1 } , true); } @@ -283,6 +287,7 @@ public class TConverter { private static boolean convertTo(boolean toUnicode, // else to TMW boolean toRTF, // else to UTF-8-encoded text + TTraits ttraits, ArrayList scan, OutputStream out, // for (toUnicode && !toRTF) mode TibetanDocument tdoc, // for !toUnicode mode or (toUnicode && toRTF) mode @@ -368,7 +373,7 @@ public class TConverter { if (lastGuyWasNonPunct) { String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText()) + "]"; if (null != writer) { - String uni = ACIPRules.getUnicodeFor(s.getText(), false); + String uni = ttraits.getUnicodeFor(s.getText(), false); if (null == uni) { hasErrors = true; uni = err; @@ -377,7 +382,7 @@ public class TConverter { } if (null != tdoc) { String wylie - = ACIPRules.getWylieForACIPOther(s.getText()); + = ttraits.getEwtsForOther(s.getText()); if (null == wylie) { hasErrors = true; tdoc.appendRoman(tdocLocation[0], err, Color.RED); @@ -658,7 +663,7 @@ public class TConverter { } if (!done) { - if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false); + if (null != writer) unicode = ttraits.getUnicodeFor(s.getText(), false); if (null != tdoc) { if (s.getText().equals("\r") || s.getText().equals("\t") @@ -675,7 +680,7 @@ public class TConverter { TibetanMachineWeb.getGlyph("#") }; // hard-coded EWTS values } else { - String wy = ACIPRules.getWylieForACIPOther(s.getText()); + String wy = ttraits.getEwtsForOther(s.getText()); if (null == wy) throw new Error("No wylie for ACIP " + s.getText()); duff = new Object[] { TibetanMachineWeb.getGlyph(wy) }; } diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java index a21181d..8814493 100644 --- a/source/org/thdl/tib/text/ttt/TPair.java +++ b/source/org/thdl/tib/text/ttt/TPair.java @@ -26,22 +26,27 @@ import java.util.ArrayList; /** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The * left side is the consonant or empty; the right side is either the - * vowel or '+' (indicating stacking) or a disambiguator (i.e., '-' - * in ACIP or '.' in EWTS). + * vowel or '+' (indicating stacking in both ACIP and EWTS) or a + * disambiguator (e.g., '-' in ACIP or '.' in EWTS). * @author David Chandler */ /* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */ class TPair { - /** The left side, or null if there is no left side. That is, the - * non-vowel, non-'m', non-':', non-'-', non-'+' guy. */ + /** the part that knows ACIP from EWTS */ + private TTraits traits; + + /** Returns the part that knows ACIP from EWTS. */ + public TTraits getTraits() { return traits; } + + /** The left side, or null if there is no left side. I.e., the + * non-wowel, non-disambiguator, non-'+' guy. */ private String l; String getLeft() { ThdlDebug.verify(!"".equals(l)); return l; } - /** The right side. That is, the vowel, with 'm' or ':' "vowel" - * after it if appropriate, or "-" (disambiguator), or "+" - * (stacking), or null otherwise. */ + /** The right side. That is, the wowel or disambiguator or "+" + * (for stacking) or null otherwise. */ private String r; String getRight() { ThdlDebug.verify(!"".equals(r)); @@ -50,13 +55,14 @@ class TPair { /** Constructs a new TPair with left side l and right side r. * Use null or the empty string to represent an absence. */ - TPair(String l, String r) { + TPair(TTraits traits, String l, String r) { // Normalize: if (null != l && l.equals("")) l = null; if (null != r && r.equals("")) r = null; this.l = l; this.r = r; + this.traits = traits; } /** Returns a nice String representation. Returns "(D . E)" for @@ -67,8 +73,8 @@ class TPair { + ((null == r) ? "" : r) + ")"; } - /** Returns the number of ACIP characters that make up this - * TPair. */ + /** Returns the number of transliteration characters that make up + * this TPair. */ int size() { return (((l == null) ? 0 : l.length()) + ((r == null) ? 0 : r.length())); @@ -98,18 +104,18 @@ class TPair { sz = l.length(); newL = l.substring(0, sz - N); } - return new TPair(newL, newR); + return new TPair(traits, newL, newR); } - /** Returns true if and only if this is nonempty and is l, if - * present, is a legal ACIP consonant, and is r, if present, is a - * legal ACIP vowel. */ + /** Returns true if and only if this is nonempty and if l, if + * present, is a legal consonant, and if r, if present, is a + * legal wowel. */ boolean isLegal() { if (size() < 1) return false; - if (null != l && !ACIPRules.isConsonant(l)) + if (null != l && !traits.isConsonant(l)) return false; - if (null != r && !ACIPRules.isWowel(r)) + if (null != r && !traits.isWowel(r)) return false; return true; } @@ -119,9 +125,9 @@ class TPair { boolean isPrefix() { return (null != l && ((null == r || "".equals(r)) - || "-".equals(r) // TODO(DLC)[EWTS->Tibetan] - || "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common - && ACIPRules.isACIPPrefix(l)); + || traits.disambiguator().equals(r) + || traits.aVowel().equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common + && traits.isPrefix(l)); } /** Returns true if and only if this pair could be a Tibetan @@ -129,25 +135,25 @@ class TPair { boolean isPostSuffix() { return (null != l && ((null == r || "".equals(r)) - || "-".equals(r) - || "A".equals(r)) // FIXME: though warn about GAMASA vs. GAMS - && ACIPRules.isACIPPostsuffix(l)); + || traits.disambiguator().equals(r) + || traits.aVowel().equals(r)) // FIXME: though warn about GAMASA vs. GAMS + && traits.isPostsuffix(l)); } /** Returns true if and only if this pair could be a Tibetan - * suffix. FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */ + * suffix. */ boolean isSuffix() { return (null != l && ((null == r || "".equals(r)) - || "-".equals(r) - || "A".equals(r)) - && ACIPRules.isACIPSuffix(l)); + || traits.disambiguator().equals(r) + || traits.aVowel().equals(r)) + && traits.isSuffix(l)); } /** Returns true if and only if this pair is merely a * disambiguator. */ boolean isDisambiguator() { - return ("-".equals(r) && getLeft() == null); + return (traits.disambiguator().equals(r) && getLeft() == null); } /** Yep, this works for TPairs. */ @@ -160,16 +166,16 @@ class TPair { return false; } - /** Returns a TPair that is like this pair except that it has - * a "+" on the right if this pair is empty on the right and is - * empty on the right if this pair has a disambiguator (i.e., a - * '-') on the right. May return itself (but never mutates this + /** Returns a TPair that is like this pair except that it has a + * "+" on the right if this pair is empty on the right and is + * empty on the right if this pair has a disambiguator on the + * right. May return itself (but never mutates this * instance). */ TPair insideStack() { if (null == getRight()) - return new TPair(getLeft(), "+"); - else if ("-".equals(getRight())) - return new TPair(getLeft(), null); + return new TPair(traits, getLeft(), "+"); + else if (traits.disambiguator().equals(getRight())) + return new TPair(traits, getLeft(), null); else return this; } @@ -194,7 +200,7 @@ class TPair { String getWylie(boolean justLeft) { String leftWylie = null; if (getLeft() != null) { - leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft()); + leftWylie = traits.getEwtsForConsonant(getLeft()); if (leftWylie == null) { if (isNumeric()) leftWylie = getLeft(); @@ -208,7 +214,7 @@ class TPair { else if ("+".equals(getRight())) rightWylie = "+"; else if (getRight() != null) - rightWylie = ACIPRules.getWylieForACIPVowel(getRight()); + rightWylie = traits.getEwtsForWowel(getRight()); if (null == rightWylie) rightWylie = ""; return leftWylie + rightWylie; } @@ -227,18 +233,19 @@ class TPair { void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB, boolean subscribed) { if (null != getLeft()) { - String x = ACIPRules.getUnicodeFor(getLeft(), subscribed); + String x = traits.getUnicodeFor(getLeft(), subscribed); if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni"); consonantSB.append(x); } if (null != getRight() && !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) { - String x = ACIPRules.getUnicodeFor(getRight(), subscribed); + String x = traits.getUnicodeFor(getRight(), subscribed); if (null == x) throw new Error("TPair: " + getRight() + " has no Uni"); vowelSB.append(x); } } + // TODO(DLC)[EWTS->Tibetan] /** Returns true if this pair is surely the last pair in an ACIP * stack. Stacking continues through (* . ) and (* . +), but * stops anywhere else. */ diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 5b83de5..2452a51 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -33,6 +33,9 @@ import java.util.ArrayList; * * @author David Chandler */ class TPairList { + /** the part that knows ACIP from EWTS */ + private TTraits traits; + /** FIXME: change me and see if performance improves. */ private static final int INITIAL_SIZE = 1; @@ -41,17 +44,20 @@ class TPairList { /** Creates a new list containing just p. */ public TPairList(TPair p) { + this.traits = p.getTraits(); al = new ArrayList(1); add(p); } /** Creates an empty list. */ - public TPairList() { + public TPairList(TTraits traits) { + this.traits = traits; al = new ArrayList(INITIAL_SIZE); } /** Creates an empty list with the capacity to hold N items. */ - public TPairList(int N) { + public TPairList(TTraits traits, int N) { + this.traits = traits; al = new ArrayList(N); } @@ -181,7 +187,7 @@ class TPairList { return ErrorsAndWarnings.getMessage(125, shortMessages, translit); } else if ((null == p.getLeft() && !"-".equals(p.getRight())) || (null != p.getLeft() - && !ACIPRules.isConsonant(p.getLeft()) + && !traits.isConsonant(p.getLeft()) && !p.isNumeric())) { // FIXME: stop handling this outside of ErrorsAndWarnings: if (null == p.getLeft()) { @@ -406,12 +412,12 @@ class TPairList { // and only if b1 is one, etc. for (int counter = 0; counter < (1< 0) { for (int j = 0; breakStart+j < 3; j++) { @@ -427,7 +433,7 @@ class TPairList { && 1 == ((counter >> j) & 1)) { if (!currentStack.isEmpty()) sl.add(currentStack.asStack()); - currentStack = new TPairList(); + currentStack = new TPairList(traits); break; // shouldn't matter, but you never know } } @@ -460,9 +466,9 @@ class TPairList { if (!isEmpty()) { TPair lastPair = get(size() - 1); if ("+".equals(lastPair.getRight())) - al.set(size() - 1, new TPair(lastPair.getLeft(), null)); + al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null)); else if ("-".equals(lastPair.getRight())) - al.set(size() - 1, new TPair(lastPair.getLeft(), null)); + al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null)); } return this; } @@ -506,10 +512,10 @@ class TPairList { add_U0F7F = true; StringBuffer rr = new StringBuffer(p.getRight()); rr.deleteCharAt(where); - p = new TPair(p.getLeft(), rr.toString()); + p = new TPair(traits, p.getLeft(), rr.toString()); } boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight()); - String thislWylie = ACIPRules.getWylieForACIPConsonant(p.getLeft()); + String thislWylie = traits.getEwtsForConsonant(p.getLeft()); if (thislWylie == null) { char ch; if (p.isNumeric()) { @@ -528,21 +534,21 @@ class TPairList { boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString()); boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString()); if (ddebug && !isTibetan && !isSanskrit && !isNumeric) { - System.out.println("OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight()); + System.out.println("OTHER for " + lWylie + " with vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight()); } if (isTibetan && isSanskrit) { // RVA, e.g. It must be Tibetan because RWA is what // you'd use for RA over fixed-form WA. isSanskrit = false; } - if (ddebug && hasNonAVowel && ACIPRules.getWylieForACIPVowel(p.getRight()) == null) { - System.out.println("vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight()); + if (ddebug && hasNonAVowel && traits.getEwtsForWowel(p.getRight()) == null) { + System.out.println("vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight()); } TGCPair tp; indexList.add(new Integer(index)); tp = new TGCPair(lWylie.toString(), (hasNonAVowel - ? ACIPRules.getWylieForACIPVowel(p.getRight()) + ? traits.getEwtsForWowel(p.getRight()) : ""), (isNumeric ? TGCPair.TYPE_OTHER @@ -697,9 +703,9 @@ class TPairList { if (lastPair.getRight() == null || lastPair.equals("-")) { duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey)); } else { - ACIPRules.getDuffForACIPVowel(duffsAndErrors, - TibetanMachineWeb.getGlyph(hashKey), - lastPair.getRight()); + traits.getDuffForWowel(duffsAndErrors, + TibetanMachineWeb.getGlyph(hashKey), + lastPair.getRight()); } if (previousSize == duffsAndErrors.size()) throw new Error("TPairList with no duffs? " + toString()); // FIXME: change to assertion. diff --git a/source/org/thdl/tib/text/ttt/TPairListFactory.java b/source/org/thdl/tib/text/ttt/TPairListFactory.java index 73aeea3..6d79136 100644 --- a/source/org/thdl/tib/text/ttt/TPairListFactory.java +++ b/source/org/thdl/tib/text/ttt/TPairListFactory.java @@ -121,7 +121,7 @@ class TPairListFactory { // base case for our recursion: if ("".equals(acip)) - return new TPairList(); + return new TPairList(ttraits); StringBuffer acipBuf = new StringBuffer(acip); int howMuchBuf[] = new int[1]; @@ -131,9 +131,9 @@ class TPairListFactory { && null != head.getLeft() && null != head.getRight() && weHaveSeenVowelAlready - && ACIPRules.isACIPSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}. + && ttraits.isSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}. && head.getRight().startsWith("'")) { - head = new TPair(head.getLeft(), + head = new TPair(ttraits, head.getLeft(), // Without this disambiguator, we are // less efficient (8 parses, not 4) and // we can't handle PA'AM'ANG etc. @@ -177,11 +177,11 @@ class TPairListFactory { } // TODO(DLC)[EWTS->Tibetan]: doc - private static TPairList breakHelperEWTS(String ewts, TTraits ttraits /* TODO(DLC)[EWTS->Tibetan]: use */) { + private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) { // base case for our recursion: if ("".equals(ewts)) - return new TPairList(); + return new TPairList(ttraits); StringBuffer ewtsBuf = new StringBuffer(ewts); int howMuchBuf[] = new int[1]; @@ -238,11 +238,11 @@ class TPairListFactory { int i, xl = acip.length(); if (0 == xl) { howMuch[0] = 0; - return new TPair(null, null); + return new TPair(ttraits, null, null); } if (acip.charAt(0) == ttraits.disambiguatorChar()) { howMuch[0] = 1; - return new TPair(null, ttraits.disambiguator()); + return new TPair(ttraits, null, ttraits.disambiguator()); } char ch = acip.charAt(0); @@ -250,7 +250,7 @@ class TPairListFactory { // like seeing 1-2-3-4. if (ch >= '0' && ch <= '9') { howMuch[0] = 1; // not 2... - return new TPair(acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); + return new TPair(ttraits, acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); } String l = null, r = null; @@ -264,11 +264,11 @@ class TPairListFactory { int ll = (null == l) ? 0 : l.length(); if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) { howMuch[0] = l.length() + 1; - return new TPair(l, ttraits.disambiguator()); + return new TPair(ttraits, l, ttraits.disambiguator()); } if (null != l && xl > ll && acip.charAt(ll) == '+') { howMuch[0] = l.length() + 1; - return new TPair(l, "+"); + return new TPair(ttraits, l, "+"); } for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) { String t = null; @@ -289,7 +289,7 @@ class TPairListFactory { && acip.charAt(z) == '+') { acip.deleteCharAt(z-1); howMuch[0] = l.length() + 1; - return new TPair(l, "+"); + return new TPair(ttraits, l, "+"); } // Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */ @@ -305,14 +305,14 @@ class TPairListFactory { if (null == l && null == r) { howMuch[0] = 1; // not 2... // add a disambiguator to avoid exponential running time: - return new TPair(acip.substring(0, 1), + return new TPair(ttraits, acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); } howMuch[0] = (((l == null) ? 0 : l.length()) + ((r == null) ? 0 : r.length()) + mod); - return new TPair(l, r); + return new TPair(ttraits, l, r); } // TODO(DLC)[EWTS->Tibetan]: } diff --git a/source/org/thdl/tib/text/ttt/TTraits.java b/source/org/thdl/tib/text/ttt/TTraits.java index 41bcb66..d6eac0a 100644 --- a/source/org/thdl/tib/text/ttt/TTraits.java +++ b/source/org/thdl/tib/text/ttt/TTraits.java @@ -18,12 +18,18 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; +import java.util.ArrayList; +import org.thdl.tib.text.DuffCode; + /** A TTraits object encapsulates all the things that make a - * particular Roman transliteration scheme unique. If both EWTS and - * ACIP transliterations have a property in common, then it's likely - * encoded in a manner that's hard to modify. But if they differ in - * some respect, then that difference should be encoded in a TTraits - * object. + * particular Roman transliteration scheme unique. For the most + * part, this difference is expressed at the finest granularity + * possible -- often single characters of Roman transliteration. + * + *

If both EWTS and ACIP transliterations have a property in + * common, then it's likely encoded in a manner that's hard to + * modify. But if they differ in some respect, then that difference + * should be encoded in a TTraits object. * *

It is very likely that classes that implement this interface * will choose to use the design pattern 'singleton'. */ @@ -62,9 +68,63 @@ interface TTraits { /** Returns true if and only if s is a stretch of * transliteration corresponding to a Tibetan wowel (without any * [achen or other] consonant) */ - boolean isWowel(String s); + boolean isWowel(String s); // TODO(DLC)[EWTS->Tibetan]: what about "m:" as opposed to "m" or ":" /** Returns true if and only if the pair given has a simple error * other than being a mere disambiguator. */ boolean hasSimpleError(TPair p); + + /** The implicit 'ahhh' vowel, the one you see when you write the + human-friendly transliteration for "\u0f40\u0f0b". */ + String aVowel(); + + /** Returns true if s is a valid postsuffix. s must not have a + wowel on it. */ + boolean isPostsuffix(String s); + + /** Returns true if and only if l is the representation of a + letter that can be a suffix. Note that all postsuffixes are + also suffixes. l should not have a wowel. */ + boolean isSuffix(String l); + + /** Returns true if and only if l is the representation of a + letter that can be a prefix. l should not have a wowel. */ + boolean isPrefix(String l); + + /** Returns the EWTS transliteration corresponding to the + * consonant l, which should not have a vowel. Returns null if + * there is no such EWTS. + * + *

May return "W" instead of "w", "r" instead of "R", and "y" + * instead of "Y" because we sometimes don't have enough context + * to decide. + * + *

The reasoning for "W" instead of "w" is that r-w and r+w + * are both known hash keys (as {@link + * org.thdl.tib.text#TibetanMachineWeb} would call them). We + * sort 'em out this way. (They are the only things like this + * according to bug report #800166.) */ + String getEwtsForConsonant(String l); + + /** Returns the EWTS corresponding to the given punctuation or + * mark. Returns null if there is no such EWTS. */ + String getEwtsForOther(String l); + + /** Returns the EWTS corresponding to the given "wowel". Returns + * null if there is no such EWTS. */ + String getEwtsForWowel(String l); + + /** If l is a consonant or vowel or punctuation mark, then this + * returns the Unicode for it. The Unicode for the subscribed + * form of the glyph is returned if subscribed is true. Returns + * null if l is unknown. */ + String getUnicodeFor(String l, boolean subscribed); + + /** Returns a scanner that can break up a string of + transliteration. */ + TTshegBarScanner scanner(); + + /** Gets the duffcodes for wowel, such that they look good with + * the preceding glyph, and appends them to duff. */ + void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel); } diff --git a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java index f1a94f1..0835a3b 100644 --- a/source/org/thdl/tib/text/ttt/TTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/TTshegBarScanner.java @@ -18,7 +18,11 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.ttt; -import java.io.*; +import java.io.IOException; +import java.io.FileInputStream; +import java.io.InputStreamReader; +import java.io.InputStream; +import java.io.BufferedReader; import java.util.ArrayList; import java.util.Stack; @@ -40,7 +44,7 @@ public abstract class TTshegBarScanner { * If errors is non-null, error messages will be appended to it. * Returns a list of TStrings that is the scan. Warning and * error messages in the result will be long and self-contained - * unless shortMessagse is true. + * unless shortMessages is true. * *

This is not so efficient; copies the whole file into memory * first.