Two things:
First, TMW->EWTS now gives dbas and dngas instead of dabs and dangs, because Chris Fynn's e-mail from today has dbas and dngas. Second, down with ACIPRules; long live ACIPTraits. EWTS->Tibetan conversion is closer still.
This commit is contained in:
parent
82c6047cc2
commit
c16f633ecf
18 changed files with 950 additions and 818 deletions
|
@ -969,6 +969,22 @@ public class DuffPaneTest extends DuffPaneTestBase {
|
|||
|
||||
ensureKeysGiveCorrectWylie("'gas");
|
||||
|
||||
/* Chris Fynn's e-mail on Feb 21 2005 leads to these test
|
||||
cases: */
|
||||
{
|
||||
ensureKeysGiveCorrectWylie("dgas");
|
||||
ensureKeysGiveCorrectWylie("'gas");
|
||||
ensureKeysGiveCorrectWylie("dngas");
|
||||
ensureKeysGiveCorrectWylie("gnad");
|
||||
ensureKeysGiveCorrectWylie("mnad");
|
||||
ensureKeysGiveCorrectWylie("bags");
|
||||
ensureKeysGiveCorrectWylie("dbas");
|
||||
ensureKeysGiveCorrectWylie("'bas");
|
||||
ensureKeysGiveCorrectWylie("mags");
|
||||
ensureKeysGiveCorrectWylie("mangs");
|
||||
ensureKeysGiveCorrectWylie("dmas");
|
||||
}
|
||||
|
||||
ensureKeysGiveCorrectWylie("gangs");
|
||||
|
||||
ensureKeysGiveCorrectWylie("gnags");
|
||||
|
|
|
@ -27,7 +27,7 @@ import org.thdl.util.*;
|
|||
import org.thdl.tib.text.*;
|
||||
|
||||
import org.thdl.tib.text.ttt.TConverter;
|
||||
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
|
||||
import org.thdl.tib.text.ttt.ACIPTraits;
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** TibetanConverter is a command-line utility for converting to and
|
||||
|
@ -297,17 +297,18 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) {
|
||||
try {
|
||||
ArrayList al
|
||||
= ACIPTshegBarScanner.instance().scanStream(in, null,
|
||||
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
|
||||
1000 - 1),
|
||||
shortMessages,
|
||||
warningLevel);
|
||||
= ACIPTraits.instance().scanner().scanStream(in, null,
|
||||
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
|
||||
1000 - 1),
|
||||
shortMessages,
|
||||
warningLevel);
|
||||
if (null == al)
|
||||
return 47;
|
||||
boolean embeddedWarnings = (warningLevel != "None");
|
||||
boolean hasWarnings[] = new boolean[] { false };
|
||||
if (ACIP_TO_UNI_TEXT == ct) {
|
||||
if (!TConverter.convertToUnicodeText(al, out, null,
|
||||
if (!TConverter.convertToUnicodeText(ACIPTraits.instance(),
|
||||
al, out, null,
|
||||
null, hasWarnings,
|
||||
embeddedWarnings,
|
||||
warningLevel,
|
||||
|
@ -315,7 +316,8 @@ public class TibetanConverter implements FontConverterConstants {
|
|||
return 46;
|
||||
} else {
|
||||
if (ct != ACIP_TO_TMW) throw new Error("badness");
|
||||
if (!TConverter.convertToTMW(al, out, null, null,
|
||||
if (!TConverter.convertToTMW(ACIPTraits.instance(),
|
||||
al, out, null, null,
|
||||
hasWarnings,
|
||||
embeddedWarnings,
|
||||
warningLevel, shortMessages,
|
||||
|
|
|
@ -137,7 +137,7 @@ public class TGCPair implements THDLWylieConstants {
|
|||
consonantACIP = "V";
|
||||
else
|
||||
consonantACIP
|
||||
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie);
|
||||
= org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(consonantWylie);
|
||||
if (null == consonantACIP) {
|
||||
if (null != consonantWylie && consonantWylie.startsWith("R+"))
|
||||
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
|
||||
|
@ -160,7 +160,7 @@ public class TGCPair implements THDLWylieConstants {
|
|||
}
|
||||
if (vowelWylie != null) {
|
||||
String vowelACIP
|
||||
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(vowelWylie);
|
||||
= org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(vowelWylie);
|
||||
if (null == vowelACIP) {
|
||||
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, "");
|
||||
} else {
|
||||
|
|
|
@ -25,7 +25,7 @@ import javax.swing.text.rtf.RTFEditorKit;
|
|||
import java.io.*;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
|
||||
import org.thdl.tib.text.ttt.ACIPTraits;
|
||||
import org.thdl.tib.text.ttt.TConverter;
|
||||
import org.thdl.tib.text.tshegbar.LegalTshegBar;
|
||||
import org.thdl.tib.text.tshegbar.UnicodeConstants;
|
||||
|
@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
{
|
||||
StringBuffer errors = new StringBuffer();
|
||||
String warningLevel = withWarnings ? "All" : "None";
|
||||
ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500,
|
||||
false, warningLevel);
|
||||
ArrayList al = ACIPTraits.instance().scanner().scan(acip, errors, 500,
|
||||
false, warningLevel);
|
||||
if (null == al || errors.length() > 0) {
|
||||
if (errors.length() > 0)
|
||||
throw new InvalidACIPException(errors.toString());
|
||||
|
@ -348,8 +348,8 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
}
|
||||
try {
|
||||
int tloc[] = new int[] { loc };
|
||||
TConverter.convertToTMW(al, tdoc, null, null, null,
|
||||
putWarningsInOutput, warningLevel,
|
||||
TConverter.convertToTMW(ACIPTraits.instance(), al, tdoc, null, null,
|
||||
null, putWarningsInOutput, warningLevel,
|
||||
false, colors, tloc);
|
||||
return tloc[0] - loc;
|
||||
} catch (IOException e) {
|
||||
|
@ -1430,6 +1430,53 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
candidateType = getCandidateTypeModuloAppendage(candidateType);
|
||||
|
||||
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
|
||||
/* Update: Chris Fynn wrote this in response to an
|
||||
e-mail from David Chapman on Feb 21, 2005:
|
||||
|
||||
<quote Chris Fynn feb 21 2005>
|
||||
When working out the rules for Tibetan and Dzongkha
|
||||
collation in Bhutan we came up with the following sequences
|
||||
that could be ambiguous:
|
||||
|
||||
0F51 0F42 0F66
|
||||
0F60 0F42 0F66
|
||||
0F51 0F44 0F66
|
||||
0F42 0F53 0F51
|
||||
0F58 0F53 0F51
|
||||
0F56 0F42 0F66
|
||||
0F51 0F56 0F66
|
||||
0F60 0F56 0F66
|
||||
0F58 0F42 0F66
|
||||
0F58 0F44 0F66
|
||||
0F51 0F58 0F66
|
||||
|
||||
After much consultation with experts in Bhutan it was
|
||||
decided these should always be read as follows:
|
||||
|
||||
0F51 0F42 0F66 dgas
|
||||
0F60 0F42 0F66 'gas
|
||||
0F51 0F44 0F66 dngas *
|
||||
0F42 0F53 0F51 gnad
|
||||
0F58 0F53 0F51 mnad *
|
||||
0F56 0F42 0F66 bags
|
||||
0F51 0F56 0F66 dbas
|
||||
0F60 0F56 0F66 'bas *
|
||||
0F58 0F42 0F66 mags
|
||||
0F58 0F44 0F66 mangs
|
||||
0F51 0F58 0F66 dmas
|
||||
|
||||
In most cases it was found that only one of the two possible
|
||||
readings actually existed as words. 0F51 0F44 0F66 , 0F58
|
||||
0F53 0F51, and 0F60 0F56 0F66 were not found as syllables in
|
||||
any known words, but the experts felt that *if* they
|
||||
occurred in Tibetan or Dzongkha text then dngas, mnad, and
|
||||
'bas would be the most likely reading.
|
||||
</quote>
|
||||
|
||||
|
||||
|
||||
Because of this e-mail, dbas and dngas were added to the list of
|
||||
exceptions. */
|
||||
/* Yes, this is ambiguous. How do we handle it? See
|
||||
* this from Andres (but note that only 4 of the 14 in
|
||||
* the second list are ambiguous because ra na sa and
|
||||
|
@ -1480,7 +1527,9 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
|| wylie2.equals("n")
|
||||
|| wylie2.equals("s")))
|
||||
|| (wylie1.equals("d") && (wylie2.equals("g")
|
||||
|| wylie2.equals("m")))
|
||||
|| wylie2.equals("m")
|
||||
|| wylie2.equals("b")
|
||||
|| wylie2.equals("ng")))
|
||||
|| (wylie1.equals("b") && wylie2.equals("d"))
|
||||
|| (wylie1.equals("m") && wylie2.equals("d"))
|
||||
|| (wylie1.equals("'") && (wylie2.equals("g")
|
||||
|
|
|
@ -1988,7 +1988,7 @@ private static String acipForGlyph(String hashKey) {
|
|||
// ~X is a special case because the EWTS is 2 characters in
|
||||
// length
|
||||
|| "~X".equals(hashKey)) // hard-coded EWTS value
|
||||
return org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(hashKey);
|
||||
return org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(hashKey);
|
||||
else
|
||||
// else we are not able to use it because it's not smart
|
||||
// about stacks (e.g., W+W)
|
||||
|
@ -2116,7 +2116,7 @@ public static String getACIPForGlyph(DuffCode dc1,
|
|||
|
||||
// DLC FIXME: TMW.53 is probably going to come out all wrong (VA
|
||||
// vs. WA) from this function, but
|
||||
// ACIPRules.getACIPForEWTS(String) seems to come through... will
|
||||
// ACIPTraits.getACIPForEWTS(String) seems to come through... will
|
||||
// it always?
|
||||
|
||||
String hashKey = getHashKeyForGlyph(dc1);
|
||||
|
|
|
@ -9,9 +9,9 @@
|
|||
// - blank lines should be ignored
|
||||
// - <?x?> marks a command
|
||||
//
|
||||
// If you change the Wylie here, it can break the ACIP->TMW and
|
||||
// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be
|
||||
// sure to run 'ant clean check' after your change.
|
||||
// If you change the EWTS transliteration here, it can break the
|
||||
// ACIP->TMW and ACIP->Unicode conversion. So keep ACIPTraits in sync
|
||||
// with this, and be sure to run 'ant clean check' after your change.
|
||||
//
|
||||
// Note that some glyphs have EWTS \uF021-\uF0FF inclusive. These do
|
||||
// not have anything in the Unicode column, though, because this is
|
||||
|
@ -37,7 +37,7 @@
|
|||
// by the way.
|
||||
//
|
||||
// If EWTS changes, then ACIP->TMW and ACIP->Unicode will break --
|
||||
// modify ACIPRules and test test test.
|
||||
// modify ACIPTraits and test test test.
|
||||
|
||||
<?Input:Punctuation?>
|
||||
//_~32,1~0,32
|
||||
|
@ -645,7 +645,7 @@ r+m+m~51,4~~7,59~1,110~8,121~1,123~1,125~8,107~8,114~f62,fa8,fa8
|
|||
// Note that TPairList.java's unicodeExceptionsMap must be updated if
|
||||
// we change who uses U+0F6A.
|
||||
R+Y~52,4~~7,60~1,110~8,120~1,123~1,125~8,106~8,113~f6a,fbb
|
||||
// R+W is mentioned in ACIPRules.java:
|
||||
// R+W is mentioned in ACIPTraits.java:
|
||||
R+W~196,4~~7,61~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fba
|
||||
R+sh~53,4~~7,62~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fb4
|
||||
R+sh+y~54,4~~7,63~1,109~8,122~1,123~1,125~8,108~8,115~f6a,fb4,fb1
|
||||
|
@ -667,7 +667,7 @@ l+h+w~197,4~~7,78~1,109~8,121~1,123~1,125~8,106~8,113~f63,fb7,fad
|
|||
w+y~69,4~~7,79~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb1
|
||||
w+r~70,4~~7,80~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb2
|
||||
w+n~195,4~~7,81~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fa3
|
||||
// w+W is mentioned in ACIPRules.java:
|
||||
// w+W is mentioned in ACIPTraits.java:
|
||||
w+W~194,4~~7,82~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fba
|
||||
sh+ts~71,4~~7,83~1,109~8,120~1,123~1,125~8,106~8,113~f64,fa9
|
||||
sh+ts+y~72,4~~7,84~1,109~8,122~1,123~1,125~8,108~8,115~f64,fa9,fb1
|
||||
|
|
|
@ -1,658 +0,0 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.List;
|
||||
|
||||
import org.thdl.util.ThdlOptions;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
import org.thdl.tib.text.THDLWylieConstants;
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.TibTextUtils;
|
||||
|
||||
// TODO(DLC)[EWTS->Tibetan]: this and ACIPTraits -- unify?
|
||||
|
||||
/** Canonizes some facts regarding the ACIP transcription system.
|
||||
* @author David Chandler */
|
||||
public class ACIPRules {
|
||||
/** {Ksh}, the longest consonant, has 3 characters, so this is
|
||||
* three. */
|
||||
public static int MAX_CONSONANT_LENGTH = 3;
|
||||
|
||||
/** {'EEm:}, the longest wowel, has 5 characters, so this is
|
||||
* five. */
|
||||
public static int MAX_WOWEL_LENGTH = 5;
|
||||
|
||||
/** For O(1) {@link #isWowel(String)} calls. */
|
||||
private static HashSet acipVowels = null;
|
||||
|
||||
private static String[][] baseVowels = new String[][] {
|
||||
// { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel
|
||||
// numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.)
|
||||
// for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]}
|
||||
{ "A", "a", "A" },
|
||||
{ "I", "i", "I" },
|
||||
{ "U", "u", "U" },
|
||||
{ "E", "e", "Ae" },
|
||||
{ "O", "o", "Ao" },
|
||||
{ "EE", "ai", "Aai" },
|
||||
{ "OO", "au", "Aau" },
|
||||
{ "i", "-i", "A-i" }
|
||||
};
|
||||
|
||||
/** Returns true if and only if s is an ACIP wowel. You can't
|
||||
* just call this any time -- A is both a consonant and a vowel
|
||||
* in ACIP, so you have to call this in the right context. */
|
||||
public static boolean isWowel(String s) {
|
||||
if (null == acipVowels) {
|
||||
acipVowels = new HashSet(baseVowels.length * 8);
|
||||
for (int i = 0; i < baseVowels.length; i++) {
|
||||
// I'm on my own with 'O and 'E and 'OO and 'EE, but
|
||||
// GANG'O appears and I wonder... so here they are.
|
||||
// It's consistent with 'I and 'A and 'U, at least:
|
||||
// all the vowels may appear as K'vowel. DLC FIXME:
|
||||
// ask.
|
||||
|
||||
acipVowels.add(baseVowels[i][0]);
|
||||
acipVowels.add('\'' + baseVowels[i][0]);
|
||||
acipVowels.add(baseVowels[i][0] + 'm');
|
||||
acipVowels.add('\'' + baseVowels[i][0] + 'm');
|
||||
acipVowels.add(baseVowels[i][0] + ':');
|
||||
acipVowels.add('\'' + baseVowels[i][0] + ':');
|
||||
acipVowels.add(baseVowels[i][0] + "m:");
|
||||
acipVowels.add('\'' + baseVowels[i][0] + "m:");
|
||||
|
||||
// Keep this code in sync with getUnicodeFor.
|
||||
|
||||
// Keep this code in sync with getWylieForACIPVowel.
|
||||
}
|
||||
// {Pm} is treated just like {PAm}; {P:} is treated just
|
||||
// like {PA:}; {Pm:} is treated just like {PAm:}. But
|
||||
// that happens thanks to
|
||||
}
|
||||
return (acipVowels.contains(s));
|
||||
}
|
||||
|
||||
/** For O(1) {@link #isConsonant(String)} calls. */
|
||||
private static HashSet consonants = null;
|
||||
|
||||
/** Returns true if and only if acip is an ACIP consonant (without
|
||||
* a vowel). For example, returns true for "K", but not for
|
||||
* "KA" or "X". */
|
||||
public static boolean isConsonant(String acip) {
|
||||
if (consonants == null) {
|
||||
consonants = new HashSet();
|
||||
consonants.add("V");
|
||||
consonants.add("K");
|
||||
consonants.add("KH");
|
||||
consonants.add("G");
|
||||
consonants.add("NG");
|
||||
consonants.add("C");
|
||||
consonants.add("CH");
|
||||
consonants.add("J");
|
||||
consonants.add("NY");
|
||||
consonants.add("T");
|
||||
consonants.add("TH");
|
||||
consonants.add("D");
|
||||
consonants.add("N");
|
||||
consonants.add("P");
|
||||
consonants.add("PH");
|
||||
consonants.add("B");
|
||||
consonants.add("M");
|
||||
consonants.add("TZ");
|
||||
consonants.add("TS");
|
||||
consonants.add("DZ");
|
||||
consonants.add("W");
|
||||
consonants.add("ZH");
|
||||
consonants.add("Z");
|
||||
consonants.add("Y");
|
||||
consonants.add("R");
|
||||
consonants.add("L");
|
||||
consonants.add("SH");
|
||||
consonants.add("S");
|
||||
consonants.add("H");
|
||||
consonants.add("t");
|
||||
consonants.add("th");
|
||||
consonants.add("d");
|
||||
consonants.add("n");
|
||||
consonants.add("sh");
|
||||
consonants.add("dH");
|
||||
consonants.add("DH");
|
||||
consonants.add("BH");
|
||||
consonants.add("DZH"); // longest, MAX_CONSONANT_LENGTH characters
|
||||
consonants.add("Ksh"); // longest, MAX_CONSONANT_LENGTH characters
|
||||
consonants.add("GH");
|
||||
consonants.add("'");
|
||||
consonants.add("A");
|
||||
}
|
||||
return consonants.contains(acip);
|
||||
}
|
||||
|
||||
/** A map from wylie to ACIP. Note that the Wylie "w" maps to
|
||||
both "V" and "W". */
|
||||
private static HashMap wylieToACIP = null;
|
||||
/** Returns the ACIP transliteration corresponding to the THDL
|
||||
Extended Wylie <em>atom</em> EWTS, or null if EWTS is not
|
||||
recognized. */
|
||||
public static String getACIPForEWTS(String EWTS) {
|
||||
getWylieForACIPConsonant(null);
|
||||
getWylieForACIPOther(null);
|
||||
getWylieForACIPVowel(null);
|
||||
String ans = (String)wylieToACIP.get(EWTS);
|
||||
boolean useCapitalW = false;
|
||||
if (EWTS.startsWith("w"))
|
||||
useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA.
|
||||
if (null == ans) {
|
||||
StringBuffer finalAns = new StringBuffer(EWTS.length());
|
||||
StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
|
||||
while (sTok.hasMoreTokens()) {
|
||||
String part, tok = sTok.nextToken();
|
||||
if (tok.equals("-") || tok.equals("+"))
|
||||
part = tok;
|
||||
else {
|
||||
if ("w".equals(tok)) {
|
||||
// There are only two stacks in TMW that have
|
||||
// U+0FBA: R+Wa and w+Wa. TMW->ACIP fails for
|
||||
// these unless we handle it here. (FIXME:
|
||||
// add an automated test for this).
|
||||
if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
|
||||
part = "W";
|
||||
} else {
|
||||
part = "V";
|
||||
}
|
||||
} else {
|
||||
part = (String)wylieToACIP.get(tok);
|
||||
}
|
||||
}
|
||||
if (null == part) return null;
|
||||
finalAns.append(part);
|
||||
}
|
||||
if (useCapitalW)
|
||||
finalAns.setCharAt(0, 'W');
|
||||
return finalAns.toString();
|
||||
}
|
||||
if (useCapitalW)
|
||||
return "W" + ans.substring(1);
|
||||
else
|
||||
return ans;
|
||||
}
|
||||
|
||||
/** Registers acip->wylie mappings in toWylie; registers
|
||||
wylie->acip mappings in {@link #wylieToACIP}. */
|
||||
private static void putMapping(HashMap toWylie, String ACIP, String EWTS) {
|
||||
toWylie.put(ACIP, EWTS);
|
||||
if (null == wylieToACIP) {
|
||||
wylieToACIP = new HashMap(75);
|
||||
|
||||
// We don't want to put "/" in toWylie:
|
||||
wylieToACIP.put("(", "/");
|
||||
wylieToACIP.put(")", "/");
|
||||
wylieToACIP.put("?", "\\");
|
||||
|
||||
wylieToACIP.put("_", " "); // oddball.
|
||||
wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
|
||||
}
|
||||
wylieToACIP.put(EWTS, ACIP);
|
||||
}
|
||||
|
||||
/** Returns true if and only if s is an ACIP consonant. */
|
||||
static final boolean isACIPConsonant(String s) {
|
||||
return (null != ACIPRules.getWylieForACIPConsonant(s));
|
||||
}
|
||||
|
||||
private static HashMap acipConsonant2wylie = null;
|
||||
/** Returns the EWTS corresponding to the given ACIP consonant
|
||||
* (without the "A" vowel). Returns null if there is no such
|
||||
* EWTS.
|
||||
*
|
||||
* <p>Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y",
|
||||
* even though sometimes the EWTS for those is "w", "R", or "Y".
|
||||
* Handle that in the caller. */
|
||||
static final String getWylieForACIPConsonant(String acip) {
|
||||
if (acipConsonant2wylie == null) {
|
||||
acipConsonant2wylie = new HashMap(37);
|
||||
|
||||
// oddball:
|
||||
putMapping(acipConsonant2wylie, "V", "w");
|
||||
|
||||
// more oddballs:
|
||||
putMapping(acipConsonant2wylie, "DH", "d+h");
|
||||
putMapping(acipConsonant2wylie, "BH", "b+h");
|
||||
putMapping(acipConsonant2wylie, "dH", "D+h");
|
||||
putMapping(acipConsonant2wylie, "DZH", "dz+h");
|
||||
putMapping(acipConsonant2wylie, "Ksh", "k+Sh");
|
||||
putMapping(acipConsonant2wylie, "GH", "g+h");
|
||||
|
||||
|
||||
putMapping(acipConsonant2wylie, "K", "k");
|
||||
putMapping(acipConsonant2wylie, "KH", "kh");
|
||||
putMapping(acipConsonant2wylie, "G", "g");
|
||||
putMapping(acipConsonant2wylie, "NG", "ng");
|
||||
putMapping(acipConsonant2wylie, "C", "c");
|
||||
putMapping(acipConsonant2wylie, "CH", "ch");
|
||||
putMapping(acipConsonant2wylie, "J", "j");
|
||||
putMapping(acipConsonant2wylie, "NY", "ny");
|
||||
putMapping(acipConsonant2wylie, "T", "t");
|
||||
putMapping(acipConsonant2wylie, "TH", "th");
|
||||
putMapping(acipConsonant2wylie, "D", "d");
|
||||
putMapping(acipConsonant2wylie, "N", "n");
|
||||
putMapping(acipConsonant2wylie, "P", "p");
|
||||
putMapping(acipConsonant2wylie, "PH", "ph");
|
||||
putMapping(acipConsonant2wylie, "B", "b");
|
||||
putMapping(acipConsonant2wylie, "M", "m");
|
||||
putMapping(acipConsonant2wylie, "TZ", "ts");
|
||||
putMapping(acipConsonant2wylie, "TS", "tsh");
|
||||
putMapping(acipConsonant2wylie, "DZ", "dz");
|
||||
putMapping(acipConsonant2wylie, "W", "W"
|
||||
/* NOTE WELL: sometimes "w", sometimes "W".
|
||||
Handle this in the caller.
|
||||
|
||||
Reasoning for "W" instead of "w": r-w and
|
||||
r+w are both known hash keys. We sort 'em
|
||||
out this way. (They are the only things
|
||||
like this according to bug report #800166.) */
|
||||
);
|
||||
putMapping(acipConsonant2wylie, "ZH", "zh");
|
||||
putMapping(acipConsonant2wylie, "Z", "z");
|
||||
putMapping(acipConsonant2wylie, "'", "'");
|
||||
putMapping(acipConsonant2wylie, "Y", "y");
|
||||
putMapping(acipConsonant2wylie, "R", "r");
|
||||
putMapping(acipConsonant2wylie, "L", "l");
|
||||
putMapping(acipConsonant2wylie, "SH", "sh");
|
||||
putMapping(acipConsonant2wylie, "S", "s");
|
||||
putMapping(acipConsonant2wylie, "H", "h");
|
||||
putMapping(acipConsonant2wylie, "A", "a");
|
||||
putMapping(acipConsonant2wylie, "t", "T");
|
||||
putMapping(acipConsonant2wylie, "th", "Th");
|
||||
putMapping(acipConsonant2wylie, "d", "D");
|
||||
putMapping(acipConsonant2wylie, "n", "N");
|
||||
putMapping(acipConsonant2wylie, "sh", "Sh");
|
||||
}
|
||||
return (String)acipConsonant2wylie.get(acip);
|
||||
}
|
||||
|
||||
private static HashMap acipVowel2wylie = null;
|
||||
/** Returns the EWTS corresponding to the given ACIP "vowel".
|
||||
* Returns null if there is no such EWTS. */
|
||||
static final String getWylieForACIPVowel(String acip) {
|
||||
if (acipVowel2wylie == null) {
|
||||
acipVowel2wylie = new HashMap(baseVowels.length * 4);
|
||||
|
||||
for (int i = 0; i < baseVowels.length; i++) {
|
||||
putMapping(acipVowel2wylie, baseVowels[i][0], baseVowels[i][1]);
|
||||
putMapping(acipVowel2wylie, '\'' + baseVowels[i][0], baseVowels[i][2]);
|
||||
putMapping(acipVowel2wylie, baseVowels[i][0] + 'm', baseVowels[i][1] + 'M');
|
||||
putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + 'm', baseVowels[i][2] + 'M');
|
||||
putMapping(acipVowel2wylie, baseVowels[i][0] + ':', baseVowels[i][1] + 'H');
|
||||
putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + ':', baseVowels[i][2] + 'H');
|
||||
putMapping(acipVowel2wylie, baseVowels[i][0] + "m:", baseVowels[i][1] + "MH");
|
||||
putMapping(acipVowel2wylie, '\'' + baseVowels[i][0] + "m:", baseVowels[i][2] + "MH");
|
||||
}
|
||||
// {Pm} is treated just like {PAm}; {P:} is treated just
|
||||
// like {PA:}; {Pm:} is treated just like {PAm:}. But
|
||||
// that happens thanks to
|
||||
// TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
|
||||
}
|
||||
return (String)acipVowel2wylie.get(acip);
|
||||
}
|
||||
|
||||
private static HashMap acipOther2wylie = null;
|
||||
/** Returns the EWTS corresponding to the given ACIP punctuation or
|
||||
* mark. Returns null if there is no such EWTS. */
|
||||
static final String getWylieForACIPOther(String acip) {
|
||||
if (acipOther2wylie == null) {
|
||||
acipOther2wylie = new HashMap(20);
|
||||
|
||||
// don't use putMapping for this. We don't want TMW->ACIP
|
||||
// to produce "." for a U+0F0C because ACIP doesn't say
|
||||
// that "." means U+0F0C. It just seems to in practice
|
||||
// for ACIP Release IV texts.
|
||||
acipOther2wylie.put(".", "*");
|
||||
|
||||
putMapping(acipOther2wylie, "m", "M");
|
||||
putMapping(acipOther2wylie, ":", "H");
|
||||
putMapping(acipOther2wylie, ",", "/");
|
||||
putMapping(acipOther2wylie, " ", " ");
|
||||
putMapping(acipOther2wylie, ";", "|");
|
||||
putMapping(acipOther2wylie, "`", "!");
|
||||
putMapping(acipOther2wylie, "*", "@#");
|
||||
// There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##");
|
||||
putMapping(acipOther2wylie, "%", "~X");
|
||||
putMapping(acipOther2wylie, "o", "X");
|
||||
putMapping(acipOther2wylie, "&", "&");
|
||||
putMapping(acipOther2wylie, "^", "\\u0F38");
|
||||
|
||||
putMapping(acipOther2wylie, "0", "0");
|
||||
putMapping(acipOther2wylie, "1", "1");
|
||||
putMapping(acipOther2wylie, "2", "2");
|
||||
putMapping(acipOther2wylie, "3", "3");
|
||||
putMapping(acipOther2wylie, "4", "4");
|
||||
putMapping(acipOther2wylie, "5", "5");
|
||||
putMapping(acipOther2wylie, "6", "6");
|
||||
putMapping(acipOther2wylie, "7", "7");
|
||||
putMapping(acipOther2wylie, "8", "8");
|
||||
putMapping(acipOther2wylie, "9", "9");
|
||||
}
|
||||
return (String)acipOther2wylie.get(acip);
|
||||
}
|
||||
|
||||
private static HashMap superACIP2unicode = null;
|
||||
private static HashMap subACIP2unicode = null;
|
||||
/** If acip is an ACIP consonant or vowel or punctuation mark,
|
||||
* then this returns the Unicode for it. The Unicode for the
|
||||
* subscribed form of the glyph is returned if subscribed is
|
||||
* true. Returns null if acip is unknown. */
|
||||
static String getUnicodeFor(String acip, boolean subscribed) {
|
||||
if (superACIP2unicode == null) {
|
||||
final boolean compactUnicode
|
||||
= ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
|
||||
superACIP2unicode = new HashMap(144);
|
||||
subACIP2unicode = new HashMap(42);
|
||||
|
||||
// oddball:
|
||||
subACIP2unicode.put("V", "\u0FAD");
|
||||
|
||||
superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
|
||||
subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
|
||||
superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
|
||||
subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
|
||||
superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
|
||||
subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
|
||||
superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
|
||||
subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
|
||||
superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
|
||||
subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
|
||||
superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
|
||||
subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
|
||||
superACIP2unicode.put("K", "\u0F40");
|
||||
subACIP2unicode.put("K", "\u0F90");
|
||||
superACIP2unicode.put("KH", "\u0F41");
|
||||
subACIP2unicode.put("KH", "\u0F91");
|
||||
superACIP2unicode.put("G", "\u0F42");
|
||||
subACIP2unicode.put("G", "\u0F92");
|
||||
superACIP2unicode.put("NG", "\u0F44");
|
||||
subACIP2unicode.put("NG", "\u0F94");
|
||||
superACIP2unicode.put("C", "\u0F45");
|
||||
subACIP2unicode.put("C", "\u0F95");
|
||||
superACIP2unicode.put("CH", "\u0F46");
|
||||
subACIP2unicode.put("CH", "\u0F96");
|
||||
superACIP2unicode.put("J", "\u0F47");
|
||||
subACIP2unicode.put("J", "\u0F97");
|
||||
superACIP2unicode.put("NY", "\u0F49");
|
||||
subACIP2unicode.put("NY", "\u0F99");
|
||||
superACIP2unicode.put("T", "\u0F4F");
|
||||
subACIP2unicode.put("T", "\u0F9F");
|
||||
superACIP2unicode.put("TH", "\u0F50");
|
||||
subACIP2unicode.put("TH", "\u0FA0");
|
||||
superACIP2unicode.put("D", "\u0F51");
|
||||
subACIP2unicode.put("D", "\u0FA1");
|
||||
superACIP2unicode.put("N", "\u0F53");
|
||||
subACIP2unicode.put("N", "\u0FA3");
|
||||
superACIP2unicode.put("P", "\u0F54");
|
||||
subACIP2unicode.put("P", "\u0FA4");
|
||||
superACIP2unicode.put("PH", "\u0F55");
|
||||
subACIP2unicode.put("PH", "\u0FA5");
|
||||
superACIP2unicode.put("B", "\u0F56");
|
||||
subACIP2unicode.put("B", "\u0FA6");
|
||||
superACIP2unicode.put("M", "\u0F58");
|
||||
subACIP2unicode.put("M", "\u0FA8");
|
||||
superACIP2unicode.put("TZ", "\u0F59");
|
||||
subACIP2unicode.put("TZ", "\u0FA9");
|
||||
superACIP2unicode.put("TS", "\u0F5A");
|
||||
subACIP2unicode.put("TS", "\u0FAA");
|
||||
superACIP2unicode.put("DZ", "\u0F5B");
|
||||
subACIP2unicode.put("DZ", "\u0FAB");
|
||||
superACIP2unicode.put("W", "\u0F5D");
|
||||
subACIP2unicode.put("W", "\u0FBA"); // oddball
|
||||
superACIP2unicode.put("ZH", "\u0F5E");
|
||||
subACIP2unicode.put("ZH", "\u0FAE");
|
||||
superACIP2unicode.put("Z", "\u0F5F");
|
||||
subACIP2unicode.put("Z", "\u0FAF");
|
||||
superACIP2unicode.put("'", "\u0F60");
|
||||
subACIP2unicode.put("'", "\u0FB0");
|
||||
superACIP2unicode.put("Y", "\u0F61");
|
||||
subACIP2unicode.put("Y", "\u0FB1");
|
||||
superACIP2unicode.put("R", "\u0F62");
|
||||
subACIP2unicode.put("R", "\u0FB2");
|
||||
superACIP2unicode.put("L", "\u0F63");
|
||||
subACIP2unicode.put("L", "\u0FB3");
|
||||
superACIP2unicode.put("SH", "\u0F64");
|
||||
subACIP2unicode.put("SH", "\u0FB4");
|
||||
superACIP2unicode.put("S", "\u0F66");
|
||||
subACIP2unicode.put("S", "\u0FB6");
|
||||
superACIP2unicode.put("H", "\u0F67");
|
||||
subACIP2unicode.put("H", "\u0FB7");
|
||||
superACIP2unicode.put("A", "\u0F68");
|
||||
subACIP2unicode.put("A", "\u0FB8");
|
||||
superACIP2unicode.put("t", "\u0F4A");
|
||||
subACIP2unicode.put("t", "\u0F9A");
|
||||
superACIP2unicode.put("th", "\u0F4B");
|
||||
subACIP2unicode.put("th", "\u0F9B");
|
||||
superACIP2unicode.put("d", "\u0F4C");
|
||||
subACIP2unicode.put("d", "\u0F9C");
|
||||
superACIP2unicode.put("n", "\u0F4E");
|
||||
subACIP2unicode.put("n", "\u0F9E");
|
||||
superACIP2unicode.put("sh", "\u0F65");
|
||||
subACIP2unicode.put("sh", "\u0FB5");
|
||||
|
||||
superACIP2unicode.put("I", "\u0F72");
|
||||
superACIP2unicode.put("E", "\u0F7A");
|
||||
superACIP2unicode.put("O", "\u0F7C");
|
||||
superACIP2unicode.put("U", "\u0F74");
|
||||
superACIP2unicode.put("OO", "\u0F7D");
|
||||
superACIP2unicode.put("EE", "\u0F7B");
|
||||
superACIP2unicode.put("i", "\u0F80");
|
||||
superACIP2unicode.put("'A", "\u0F71");
|
||||
superACIP2unicode.put("'I", "\u0F71\u0F72");
|
||||
superACIP2unicode.put("'E", "\u0F71\u0F7A");
|
||||
superACIP2unicode.put("'O", "\u0F71\u0F7C");
|
||||
superACIP2unicode.put("'U", "\u0F71\u0F74");
|
||||
superACIP2unicode.put("'OO", "\u0F71\u0F7D");
|
||||
superACIP2unicode.put("'EE", "\u0F71\u0F7B");
|
||||
superACIP2unicode.put("'i", "\u0F71\u0F80");
|
||||
|
||||
superACIP2unicode.put("Im", "\u0F72\u0F7E");
|
||||
superACIP2unicode.put("Em", "\u0F7A\u0F7E");
|
||||
superACIP2unicode.put("Om", "\u0F7C\u0F7E");
|
||||
superACIP2unicode.put("Um", "\u0F74\u0F7E");
|
||||
superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
|
||||
superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
|
||||
superACIP2unicode.put("im", "\u0F80\u0F7E");
|
||||
superACIP2unicode.put("'Am", "\u0F71\u0F7E");
|
||||
superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
|
||||
superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
|
||||
superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
|
||||
superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
|
||||
superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
|
||||
superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
|
||||
superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
|
||||
|
||||
superACIP2unicode.put("I:", "\u0F72\u0F7F");
|
||||
superACIP2unicode.put("E:", "\u0F7A\u0F7F");
|
||||
superACIP2unicode.put("O:", "\u0F7C\u0F7F");
|
||||
superACIP2unicode.put("U:", "\u0F74\u0F7F");
|
||||
superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
|
||||
superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
|
||||
superACIP2unicode.put("i:", "\u0F80\u0F7F");
|
||||
superACIP2unicode.put("'A:", "\u0F71\u0F7F");
|
||||
superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
|
||||
superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
|
||||
superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
|
||||
superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
|
||||
superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
|
||||
superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
|
||||
superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
|
||||
|
||||
superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
|
||||
// :m does not appear, though you'd think it's as valid as m:.
|
||||
|
||||
superACIP2unicode.put("m", "\u0F7E");
|
||||
superACIP2unicode.put(":", "\u0F7F");
|
||||
superACIP2unicode.put("m:", "\u0F7E\u0F7F");
|
||||
|
||||
superACIP2unicode.put("Am", "\u0F7E");
|
||||
superACIP2unicode.put("A:", "\u0F7F");
|
||||
superACIP2unicode.put("Am:", "\u0F7E\u0F7F");
|
||||
|
||||
superACIP2unicode.put("0", "\u0F20");
|
||||
superACIP2unicode.put("1", "\u0F21");
|
||||
superACIP2unicode.put("2", "\u0F22");
|
||||
superACIP2unicode.put("3", "\u0F23");
|
||||
superACIP2unicode.put("4", "\u0F24");
|
||||
superACIP2unicode.put("5", "\u0F25");
|
||||
superACIP2unicode.put("6", "\u0F26");
|
||||
superACIP2unicode.put("7", "\u0F27");
|
||||
superACIP2unicode.put("8", "\u0F28");
|
||||
superACIP2unicode.put("9", "\u0F29");
|
||||
|
||||
// punctuation
|
||||
superACIP2unicode.put("&", "\u0F85");
|
||||
superACIP2unicode.put(",", "\u0F0D");
|
||||
superACIP2unicode.put(" ", "\u0F0B");
|
||||
superACIP2unicode.put(".", "\u0F0C");
|
||||
superACIP2unicode.put("`", "\u0F08");
|
||||
superACIP2unicode.put("`", "\u0F08");
|
||||
superACIP2unicode.put("*", "\u0F04\u0F05");
|
||||
superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
|
||||
superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn.
|
||||
superACIP2unicode.put("o", "\u0F37");
|
||||
superACIP2unicode.put(";", "\u0F11");
|
||||
superACIP2unicode.put("\r", "\r");
|
||||
superACIP2unicode.put("\t", "\t");
|
||||
superACIP2unicode.put("\r\n", "\r\n");
|
||||
superACIP2unicode.put("\n", "\n");
|
||||
superACIP2unicode.put("\\", "\u0F84");
|
||||
superACIP2unicode.put("^", "\u0F38");
|
||||
|
||||
// DLC FIXME: "^ GONG" is "^GONG", right?
|
||||
// DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode?
|
||||
}
|
||||
if (subscribed) {
|
||||
String u = (String)subACIP2unicode.get(acip);
|
||||
if (null != u) return u;
|
||||
}
|
||||
return (String)superACIP2unicode.get(acip);
|
||||
}
|
||||
|
||||
|
||||
|
||||
/** Gets the duffcodes for vowel, such that they look good with
|
||||
* the stack with hash key hashKey, and appends them to r. */
|
||||
static void getDuffForACIPVowel(ArrayList duff, DuffCode preceding, String vowel) {
|
||||
if (null == vowel) return;
|
||||
if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert.
|
||||
throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
|
||||
|
||||
// Order matters here.
|
||||
boolean context_added[] = new boolean[] { false };
|
||||
if (vowel.startsWith("A")) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
|
||||
} else if (vowel.indexOf("'U") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
|
||||
} else if (vowel.indexOf("'I") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
|
||||
} else {
|
||||
if (vowel.indexOf('\'') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
|
||||
}
|
||||
if (vowel.indexOf("EE") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
|
||||
} else if (vowel.indexOf('E') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
|
||||
}
|
||||
if (vowel.indexOf("OO") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
|
||||
} else if (vowel.indexOf('O') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
|
||||
}
|
||||
if (vowel.indexOf('I') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
|
||||
}
|
||||
if (vowel.indexOf('U') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
|
||||
}
|
||||
if (vowel.indexOf('i') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
|
||||
}
|
||||
}
|
||||
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
|
||||
|
||||
if (vowel.indexOf('m') >= 0) {
|
||||
DuffCode last = (DuffCode)duff.get(duff.size() - 1);
|
||||
duff.remove(duff.size() - 1); // getBindu will add it back...
|
||||
TibTextUtils.getBindu(duff, last);
|
||||
}
|
||||
if (vowel.indexOf(':') >= 0)
|
||||
duff.add(TibetanMachineWeb.getGlyph("H"));
|
||||
}
|
||||
|
||||
/** Returns true if and only if l is the ACIP representation of a
|
||||
letter that can be a suffix. Note that all postsuffixes are
|
||||
also suffixes. l must not have an "A" -- use "S", not "SA",
|
||||
that is. */
|
||||
public static boolean isACIPSuffix(String l) {
|
||||
return ("S".equals(l)
|
||||
|| "G".equals(l)
|
||||
|| "D".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "'".equals(l)
|
||||
|| "B".equals(l)
|
||||
|| "NG".equals(l)
|
||||
|| "N".equals(l)
|
||||
|| "L".equals(l)
|
||||
|| "R".equals(l));
|
||||
}
|
||||
|
||||
/** Returns true if and only if l is the ACIP representation of a
|
||||
letter that can be a prefix. l must not have an "A" -- use
|
||||
"D", not "DA", that is. */
|
||||
public static boolean isACIPPrefix(String l) {
|
||||
return ("'".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "B".equals(l)
|
||||
|| "D".equals(l)
|
||||
|| "G".equals(l));
|
||||
}
|
||||
|
||||
/** Returns true if and only if l is the ACIP representation of a
|
||||
letter that can be a postsuffix. l must not have an "A" --
|
||||
use "D", not "DA", that is. */
|
||||
public static boolean isACIPPostsuffix(String l) {
|
||||
return ("S".equals(l)
|
||||
|| "D".equals(l));
|
||||
}
|
||||
}
|
|
@ -18,11 +18,25 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.HashSet;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.StringTokenizer;
|
||||
import java.util.List;
|
||||
|
||||
import org.thdl.util.ThdlOptions;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
import org.thdl.tib.text.THDLWylieConstants;
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.TibTextUtils;
|
||||
|
||||
|
||||
/** A singleton class that should contain (but due to laziness and
|
||||
* ignorance probably does not contain) all the traits that make ACIP
|
||||
* transliteration different from other (say, EWTS)
|
||||
* transliterations. */
|
||||
final class ACIPTraits implements TTraits {
|
||||
* transliteration scheme different from other (say, EWTS)
|
||||
* transliteration schemes. This is not safe to use in concurrent
|
||||
* programs but it would be easy to make it so. */
|
||||
public final class ACIPTraits implements TTraits {
|
||||
/** sole instance of this class */
|
||||
private static ACIPTraits singleton = null;
|
||||
|
||||
|
@ -30,7 +44,7 @@ final class ACIPTraits implements TTraits {
|
|||
private ACIPTraits() { }
|
||||
|
||||
/** Returns the singleton instance of this class. */
|
||||
public static ACIPTraits instance() {
|
||||
public static /* synchronized */ ACIPTraits instance() {
|
||||
if (null == singleton) {
|
||||
singleton = new ACIPTraits();
|
||||
}
|
||||
|
@ -43,15 +57,536 @@ final class ACIPTraits implements TTraits {
|
|||
/** Returns '-'. */
|
||||
public char disambiguatorChar() { return '-'; }
|
||||
|
||||
public int maxConsonantLength() { return ACIPRules.MAX_CONSONANT_LENGTH; }
|
||||
public int maxConsonantLength() { return MAX_CONSONANT_LENGTH; }
|
||||
|
||||
public int maxWowelLength() { return ACIPRules.MAX_WOWEL_LENGTH; }
|
||||
|
||||
public boolean isConsonant(String s) { return ACIPRules.isConsonant(s); }
|
||||
|
||||
public boolean isWowel(String s) { return ACIPRules.isWowel(s); }
|
||||
public int maxWowelLength() { return MAX_WOWEL_LENGTH; }
|
||||
|
||||
public boolean hasSimpleError(TPair p) {
|
||||
return ("A".equals(p.getLeft()) && null == p.getRight());
|
||||
}
|
||||
|
||||
/** Returns "A". */
public String aVowel() { return "A"; }
|
||||
|
||||
public boolean isPostsuffix(String l) {
|
||||
return ("S".equals(l)
|
||||
|| "D".equals(l));
|
||||
}
|
||||
|
||||
public boolean isSuffix(String l) {
|
||||
return ("S".equals(l)
|
||||
|| "G".equals(l)
|
||||
|| "D".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "'".equals(l)
|
||||
|| "B".equals(l)
|
||||
|| "NG".equals(l)
|
||||
|| "N".equals(l)
|
||||
|| "L".equals(l)
|
||||
|| "R".equals(l));
|
||||
}
|
||||
|
||||
public boolean isPrefix(String l) {
|
||||
return ("'".equals(l)
|
||||
|| "M".equals(l)
|
||||
|| "B".equals(l)
|
||||
|| "D".equals(l)
|
||||
|| "G".equals(l));
|
||||
}
|
||||
|
||||
private HashMap superACIP2unicode = null;
|
||||
private HashMap subACIP2unicode = null;
|
||||
public /* synchronized */ String getUnicodeFor(String acip, boolean subscribed) {
|
||||
if (superACIP2unicode == null) {
|
||||
final boolean compactUnicode
|
||||
= ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
|
||||
superACIP2unicode = new HashMap(144);
|
||||
subACIP2unicode = new HashMap(42);
|
||||
|
||||
// oddball:
|
||||
subACIP2unicode.put("V", "\u0FAD");
|
||||
|
||||
superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
|
||||
subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
|
||||
superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
|
||||
subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
|
||||
superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
|
||||
subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
|
||||
superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
|
||||
subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
|
||||
superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
|
||||
subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
|
||||
superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
|
||||
subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
|
||||
superACIP2unicode.put("K", "\u0F40");
|
||||
subACIP2unicode.put("K", "\u0F90");
|
||||
superACIP2unicode.put("KH", "\u0F41");
|
||||
subACIP2unicode.put("KH", "\u0F91");
|
||||
superACIP2unicode.put("G", "\u0F42");
|
||||
subACIP2unicode.put("G", "\u0F92");
|
||||
superACIP2unicode.put("NG", "\u0F44");
|
||||
subACIP2unicode.put("NG", "\u0F94");
|
||||
superACIP2unicode.put("C", "\u0F45");
|
||||
subACIP2unicode.put("C", "\u0F95");
|
||||
superACIP2unicode.put("CH", "\u0F46");
|
||||
subACIP2unicode.put("CH", "\u0F96");
|
||||
superACIP2unicode.put("J", "\u0F47");
|
||||
subACIP2unicode.put("J", "\u0F97");
|
||||
superACIP2unicode.put("NY", "\u0F49");
|
||||
subACIP2unicode.put("NY", "\u0F99");
|
||||
superACIP2unicode.put("T", "\u0F4F");
|
||||
subACIP2unicode.put("T", "\u0F9F");
|
||||
superACIP2unicode.put("TH", "\u0F50");
|
||||
subACIP2unicode.put("TH", "\u0FA0");
|
||||
superACIP2unicode.put("D", "\u0F51");
|
||||
subACIP2unicode.put("D", "\u0FA1");
|
||||
superACIP2unicode.put("N", "\u0F53");
|
||||
subACIP2unicode.put("N", "\u0FA3");
|
||||
superACIP2unicode.put("P", "\u0F54");
|
||||
subACIP2unicode.put("P", "\u0FA4");
|
||||
superACIP2unicode.put("PH", "\u0F55");
|
||||
subACIP2unicode.put("PH", "\u0FA5");
|
||||
superACIP2unicode.put("B", "\u0F56");
|
||||
subACIP2unicode.put("B", "\u0FA6");
|
||||
superACIP2unicode.put("M", "\u0F58");
|
||||
subACIP2unicode.put("M", "\u0FA8");
|
||||
superACIP2unicode.put("TZ", "\u0F59");
|
||||
subACIP2unicode.put("TZ", "\u0FA9");
|
||||
superACIP2unicode.put("TS", "\u0F5A");
|
||||
subACIP2unicode.put("TS", "\u0FAA");
|
||||
superACIP2unicode.put("DZ", "\u0F5B");
|
||||
subACIP2unicode.put("DZ", "\u0FAB");
|
||||
superACIP2unicode.put("W", "\u0F5D");
|
||||
subACIP2unicode.put("W", "\u0FBA"); // oddball
|
||||
superACIP2unicode.put("ZH", "\u0F5E");
|
||||
subACIP2unicode.put("ZH", "\u0FAE");
|
||||
superACIP2unicode.put("Z", "\u0F5F");
|
||||
subACIP2unicode.put("Z", "\u0FAF");
|
||||
superACIP2unicode.put("'", "\u0F60");
|
||||
subACIP2unicode.put("'", "\u0FB0");
|
||||
superACIP2unicode.put("Y", "\u0F61");
|
||||
subACIP2unicode.put("Y", "\u0FB1");
|
||||
superACIP2unicode.put("R", "\u0F62");
|
||||
subACIP2unicode.put("R", "\u0FB2");
|
||||
superACIP2unicode.put("L", "\u0F63");
|
||||
subACIP2unicode.put("L", "\u0FB3");
|
||||
superACIP2unicode.put("SH", "\u0F64");
|
||||
subACIP2unicode.put("SH", "\u0FB4");
|
||||
superACIP2unicode.put("S", "\u0F66");
|
||||
subACIP2unicode.put("S", "\u0FB6");
|
||||
superACIP2unicode.put("H", "\u0F67");
|
||||
subACIP2unicode.put("H", "\u0FB7");
|
||||
superACIP2unicode.put("A", "\u0F68");
|
||||
subACIP2unicode.put("A", "\u0FB8");
|
||||
superACIP2unicode.put("t", "\u0F4A");
|
||||
subACIP2unicode.put("t", "\u0F9A");
|
||||
superACIP2unicode.put("th", "\u0F4B");
|
||||
subACIP2unicode.put("th", "\u0F9B");
|
||||
superACIP2unicode.put("d", "\u0F4C");
|
||||
subACIP2unicode.put("d", "\u0F9C");
|
||||
superACIP2unicode.put("n", "\u0F4E");
|
||||
subACIP2unicode.put("n", "\u0F9E");
|
||||
superACIP2unicode.put("sh", "\u0F65");
|
||||
subACIP2unicode.put("sh", "\u0FB5");
|
||||
|
||||
superACIP2unicode.put("I", "\u0F72");
|
||||
superACIP2unicode.put("E", "\u0F7A");
|
||||
superACIP2unicode.put("O", "\u0F7C");
|
||||
superACIP2unicode.put("U", "\u0F74");
|
||||
superACIP2unicode.put("OO", "\u0F7D");
|
||||
superACIP2unicode.put("EE", "\u0F7B");
|
||||
superACIP2unicode.put("i", "\u0F80");
|
||||
superACIP2unicode.put("'A", "\u0F71");
|
||||
superACIP2unicode.put("'I", "\u0F71\u0F72");
|
||||
superACIP2unicode.put("'E", "\u0F71\u0F7A");
|
||||
superACIP2unicode.put("'O", "\u0F71\u0F7C");
|
||||
superACIP2unicode.put("'U", "\u0F71\u0F74");
|
||||
superACIP2unicode.put("'OO", "\u0F71\u0F7D");
|
||||
superACIP2unicode.put("'EE", "\u0F71\u0F7B");
|
||||
superACIP2unicode.put("'i", "\u0F71\u0F80");
|
||||
|
||||
superACIP2unicode.put("Im", "\u0F72\u0F7E");
|
||||
superACIP2unicode.put("Em", "\u0F7A\u0F7E");
|
||||
superACIP2unicode.put("Om", "\u0F7C\u0F7E");
|
||||
superACIP2unicode.put("Um", "\u0F74\u0F7E");
|
||||
superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
|
||||
superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
|
||||
superACIP2unicode.put("im", "\u0F80\u0F7E");
|
||||
superACIP2unicode.put("'Am", "\u0F71\u0F7E");
|
||||
superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
|
||||
superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
|
||||
superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
|
||||
superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
|
||||
superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
|
||||
superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
|
||||
superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
|
||||
|
||||
superACIP2unicode.put("I:", "\u0F72\u0F7F");
|
||||
superACIP2unicode.put("E:", "\u0F7A\u0F7F");
|
||||
superACIP2unicode.put("O:", "\u0F7C\u0F7F");
|
||||
superACIP2unicode.put("U:", "\u0F74\u0F7F");
|
||||
superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
|
||||
superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
|
||||
superACIP2unicode.put("i:", "\u0F80\u0F7F");
|
||||
superACIP2unicode.put("'A:", "\u0F71\u0F7F");
|
||||
superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
|
||||
superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
|
||||
superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
|
||||
superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
|
||||
superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
|
||||
superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
|
||||
superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
|
||||
|
||||
superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
|
||||
superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
|
||||
// :m does not appear, though you'd think it's as valid as m:.
|
||||
|
||||
superACIP2unicode.put("m", "\u0F7E");
|
||||
superACIP2unicode.put(":", "\u0F7F");
|
||||
superACIP2unicode.put("m:", "\u0F7E\u0F7F");
|
||||
|
||||
superACIP2unicode.put("Am", "\u0F7E");
|
||||
superACIP2unicode.put("A:", "\u0F7F");
|
||||
superACIP2unicode.put("Am:", "\u0F7E\u0F7F");
|
||||
|
||||
superACIP2unicode.put("0", "\u0F20");
|
||||
superACIP2unicode.put("1", "\u0F21");
|
||||
superACIP2unicode.put("2", "\u0F22");
|
||||
superACIP2unicode.put("3", "\u0F23");
|
||||
superACIP2unicode.put("4", "\u0F24");
|
||||
superACIP2unicode.put("5", "\u0F25");
|
||||
superACIP2unicode.put("6", "\u0F26");
|
||||
superACIP2unicode.put("7", "\u0F27");
|
||||
superACIP2unicode.put("8", "\u0F28");
|
||||
superACIP2unicode.put("9", "\u0F29");
|
||||
|
||||
// punctuation
|
||||
superACIP2unicode.put("&", "\u0F85");
|
||||
superACIP2unicode.put(",", "\u0F0D");
|
||||
superACIP2unicode.put(" ", "\u0F0B");
|
||||
superACIP2unicode.put(".", "\u0F0C");
|
||||
superACIP2unicode.put("`", "\u0F08");
|
||||
superACIP2unicode.put("`", "\u0F08");
|
||||
superACIP2unicode.put("*", "\u0F04\u0F05");
|
||||
superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
|
||||
superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn.
|
||||
superACIP2unicode.put("o", "\u0F37");
|
||||
superACIP2unicode.put(";", "\u0F11");
|
||||
superACIP2unicode.put("\r", "\r");
|
||||
superACIP2unicode.put("\t", "\t");
|
||||
superACIP2unicode.put("\r\n", "\r\n");
|
||||
superACIP2unicode.put("\n", "\n");
|
||||
superACIP2unicode.put("\\", "\u0F84");
|
||||
superACIP2unicode.put("^", "\u0F38");
|
||||
|
||||
// DLC FIXME: "^ GONG" is "^GONG", right?
|
||||
// DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode?
|
||||
}
|
||||
if (subscribed) {
|
||||
String u = (String)subACIP2unicode.get(acip);
|
||||
if (null != u) return u;
|
||||
}
|
||||
return (String)superACIP2unicode.get(acip);
|
||||
}
|
||||
|
||||
private HashMap acipOther2wylie = null;
|
||||
public /* synchronized */ String getEwtsForOther(String acip) {
|
||||
if (acipOther2wylie == null) {
|
||||
acipOther2wylie = new HashMap(20);
|
||||
|
||||
// don't use putMapping for this. We don't want TMW->ACIP
|
||||
// to produce "." for a U+0F0C because ACIP doesn't say
|
||||
// that "." means U+0F0C. It just seems to in practice
|
||||
// for ACIP Release IV texts.
|
||||
acipOther2wylie.put(".", "*");
|
||||
|
||||
putMapping(acipOther2wylie, "m", "M");
|
||||
putMapping(acipOther2wylie, ":", "H");
|
||||
putMapping(acipOther2wylie, ",", "/");
|
||||
putMapping(acipOther2wylie, " ", " ");
|
||||
putMapping(acipOther2wylie, ";", "|");
|
||||
putMapping(acipOther2wylie, "`", "!");
|
||||
putMapping(acipOther2wylie, "*", "@#");
|
||||
// There is no glyph in TMW with the EWTS @##, so we don't do this: putMapping(acipOther2wylie, "#", "@##");
|
||||
putMapping(acipOther2wylie, "%", "~X");
|
||||
putMapping(acipOther2wylie, "o", "X");
|
||||
putMapping(acipOther2wylie, "&", "&");
|
||||
putMapping(acipOther2wylie, "^", "\\u0F38");
|
||||
|
||||
putMapping(acipOther2wylie, "0", "0");
|
||||
putMapping(acipOther2wylie, "1", "1");
|
||||
putMapping(acipOther2wylie, "2", "2");
|
||||
putMapping(acipOther2wylie, "3", "3");
|
||||
putMapping(acipOther2wylie, "4", "4");
|
||||
putMapping(acipOther2wylie, "5", "5");
|
||||
putMapping(acipOther2wylie, "6", "6");
|
||||
putMapping(acipOther2wylie, "7", "7");
|
||||
putMapping(acipOther2wylie, "8", "8");
|
||||
putMapping(acipOther2wylie, "9", "9");
|
||||
}
|
||||
return (String)acipOther2wylie.get(acip);
|
||||
}
|
||||
|
||||
/** Returns the scanner given by ACIPTshegBarScanner.instance(). */
public TTshegBarScanner scanner() { return ACIPTshegBarScanner.instance(); }
|
||||
|
||||
/** Registers acip->wylie mappings in toWylie; registers
|
||||
wylie->acip mappings in {@link #wylieToACIP}. */
|
||||
private /* synchronized */ void putMapping(HashMap toWylie, String ACIP, String EWTS) {
|
||||
toWylie.put(ACIP, EWTS);
|
||||
if (null == wylieToACIP) {
|
||||
wylieToACIP = new HashMap(75);
|
||||
|
||||
// We don't want to put "/" in toWylie:
|
||||
wylieToACIP.put("(", "/");
|
||||
wylieToACIP.put(")", "/");
|
||||
wylieToACIP.put("?", "\\");
|
||||
|
||||
wylieToACIP.put("_", " "); // oddball.
|
||||
wylieToACIP.put("o'i", "O'I"); // oddball for TMW9.61.
|
||||
}
|
||||
wylieToACIP.put(EWTS, ACIP);
|
||||
}
|
||||
|
||||
/** A map from EWTS to ACIP. Note that the EWTS "w" maps to both
|
||||
"V" and "W" in reality but this map will only give one or the
|
||||
other. */
|
||||
private HashMap wylieToACIP = null;
|
||||
/** Returns the ACIP transliteration corresponding to the THDL
|
||||
Extended Wylie <em>atom</em> EWTS, or null if EWTS is not
|
||||
recognized. */
|
||||
public String getACIPForEWTS(String EWTS) {
|
||||
getEwtsForConsonant(null); // inits wylieToACIP
|
||||
getEwtsForOther(null); // inits wylieToACIP
|
||||
getEwtsForWowel(null); // inits wylieToACIP
|
||||
String ans = (String)wylieToACIP.get(EWTS);
|
||||
boolean useCapitalW = false;
|
||||
if (EWTS.startsWith("w"))
|
||||
useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA.
|
||||
if (null == ans) {
|
||||
StringBuffer finalAns = new StringBuffer(EWTS.length());
|
||||
StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
|
||||
while (sTok.hasMoreTokens()) {
|
||||
String part, tok = sTok.nextToken();
|
||||
if (tok.equals("-") || tok.equals("+"))
|
||||
part = tok;
|
||||
else {
|
||||
if ("w".equals(tok)) {
|
||||
// There are only two stacks in TMW that have
|
||||
// U+0FBA: R+Wa and w+Wa. TMW->ACIP fails for
|
||||
// these unless we handle it here. (FIXME:
|
||||
// add an automated test for this).
|
||||
if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
|
||||
part = "W";
|
||||
} else {
|
||||
part = "V";
|
||||
}
|
||||
} else {
|
||||
part = (String)wylieToACIP.get(tok);
|
||||
}
|
||||
}
|
||||
if (null == part) return null;
|
||||
finalAns.append(part);
|
||||
}
|
||||
if (useCapitalW)
|
||||
finalAns.setCharAt(0, 'W');
|
||||
return finalAns.toString();
|
||||
}
|
||||
if (useCapitalW)
|
||||
return "W" + ans.substring(1);
|
||||
else
|
||||
return ans;
|
||||
}
|
||||
|
||||
private HashMap acipConsonant2wylie = null;
|
||||
/** Returns "W" for ACIP "W", "r" for ACIP "R", y for ACIP "Y",
|
||||
* even though sometimes the EWTS for those is "w", "R", or "Y".
|
||||
* Handle that in the caller. */
|
||||
public /* synchronized */ String getEwtsForConsonant(String acip) {
|
||||
if (acipConsonant2wylie == null) {
|
||||
acipConsonant2wylie = new HashMap(37);
|
||||
|
||||
// oddball:
|
||||
putMapping(acipConsonant2wylie, "V", "w");
|
||||
|
||||
// more oddballs:
|
||||
putMapping(acipConsonant2wylie, "DH", "d+h");
|
||||
putMapping(acipConsonant2wylie, "BH", "b+h");
|
||||
putMapping(acipConsonant2wylie, "dH", "D+h");
|
||||
putMapping(acipConsonant2wylie, "DZH", "dz+h"); // longest, MAX_CONSONANT_LENGTH characters
|
||||
putMapping(acipConsonant2wylie, "Ksh", "k+Sh"); // longest, MAX_CONSONANT_LENGTH characters
|
||||
putMapping(acipConsonant2wylie, "GH", "g+h");
|
||||
|
||||
|
||||
putMapping(acipConsonant2wylie, "K", "k");
|
||||
putMapping(acipConsonant2wylie, "KH", "kh");
|
||||
putMapping(acipConsonant2wylie, "G", "g");
|
||||
putMapping(acipConsonant2wylie, "NG", "ng");
|
||||
putMapping(acipConsonant2wylie, "C", "c");
|
||||
putMapping(acipConsonant2wylie, "CH", "ch");
|
||||
putMapping(acipConsonant2wylie, "J", "j");
|
||||
putMapping(acipConsonant2wylie, "NY", "ny");
|
||||
putMapping(acipConsonant2wylie, "T", "t");
|
||||
putMapping(acipConsonant2wylie, "TH", "th");
|
||||
putMapping(acipConsonant2wylie, "D", "d");
|
||||
putMapping(acipConsonant2wylie, "N", "n");
|
||||
putMapping(acipConsonant2wylie, "P", "p");
|
||||
putMapping(acipConsonant2wylie, "PH", "ph");
|
||||
putMapping(acipConsonant2wylie, "B", "b");
|
||||
putMapping(acipConsonant2wylie, "M", "m");
|
||||
putMapping(acipConsonant2wylie, "TZ", "ts");
|
||||
putMapping(acipConsonant2wylie, "TS", "tsh");
|
||||
putMapping(acipConsonant2wylie, "DZ", "dz");
|
||||
putMapping(acipConsonant2wylie, "W", "W"
|
||||
/* NOTE WELL: sometimes "w", sometimes "W".
|
||||
Handle this in the caller.
|
||||
|
||||
Reasoning for "W" instead of "w": r-w and
|
||||
r+w are both known hash keys. We sort 'em
|
||||
out this way. (They are the only things
|
||||
like this according to bug report #800166.) */
|
||||
);
|
||||
putMapping(acipConsonant2wylie, "ZH", "zh");
|
||||
putMapping(acipConsonant2wylie, "Z", "z");
|
||||
putMapping(acipConsonant2wylie, "'", "'");
|
||||
putMapping(acipConsonant2wylie, "Y", "y");
|
||||
putMapping(acipConsonant2wylie, "R", "r");
|
||||
putMapping(acipConsonant2wylie, "L", "l");
|
||||
putMapping(acipConsonant2wylie, "SH", "sh");
|
||||
putMapping(acipConsonant2wylie, "S", "s");
|
||||
putMapping(acipConsonant2wylie, "H", "h");
|
||||
putMapping(acipConsonant2wylie, "A", "a");
|
||||
putMapping(acipConsonant2wylie, "t", "T");
|
||||
putMapping(acipConsonant2wylie, "th", "Th");
|
||||
putMapping(acipConsonant2wylie, "d", "D");
|
||||
putMapping(acipConsonant2wylie, "n", "N");
|
||||
putMapping(acipConsonant2wylie, "sh", "Sh");
|
||||
}
|
||||
return (String)acipConsonant2wylie.get(acip);
|
||||
}
|
||||
|
||||
private HashMap acipWowel2wylie = null;
|
||||
public /* synchronized */ String getEwtsForWowel(String acip) {
|
||||
if (acipWowel2wylie == null) {
|
||||
acipWowel2wylie = new HashMap(baseVowels.length * 4);
|
||||
|
||||
for (int i = 0; i < baseVowels.length; i++) {
|
||||
putMapping(acipWowel2wylie, baseVowels[i][0], baseVowels[i][1]);
|
||||
putMapping(acipWowel2wylie, '\'' + baseVowels[i][0], baseVowels[i][2]);
|
||||
putMapping(acipWowel2wylie, baseVowels[i][0] + 'm', baseVowels[i][1] + 'M');
|
||||
putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + 'm', baseVowels[i][2] + 'M');
|
||||
putMapping(acipWowel2wylie, baseVowels[i][0] + ':', baseVowels[i][1] + 'H');
|
||||
putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + ':', baseVowels[i][2] + 'H');
|
||||
putMapping(acipWowel2wylie, baseVowels[i][0] + "m:", baseVowels[i][1] + "MH");
|
||||
putMapping(acipWowel2wylie, '\'' + baseVowels[i][0] + "m:", baseVowels[i][2] + "MH");
|
||||
}
|
||||
// {Pm} is treated just like {PAm}; {P:} is treated just
|
||||
// like {PA:}; {Pm:} is treated just like {PAm:}. But
|
||||
// that happens thanks to
|
||||
// TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
|
||||
|
||||
// Keep this code in sync with getUnicodeFor.
|
||||
}
|
||||
return (String)acipWowel2wylie.get(acip);
|
||||
}
|
||||
|
||||
/** {Ksh}, the longest consonant, has 3 characters, so this is
|
||||
* three. */
|
||||
private static int MAX_CONSONANT_LENGTH = 3;
|
||||
|
||||
/** {'EEm:}, the longest wowel, has 5 characters, so this is
|
||||
* five. */
|
||||
private static int MAX_WOWEL_LENGTH = 5;
|
||||
|
||||
private static String[][] baseVowels = new String[][] {
|
||||
// { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]}, vowel
|
||||
// numbers (see TibetanMachineWeb's VOWEL_A, VOWEL_o, etc.)
|
||||
// for ACIP, vowel numbers for ACIP {'\'' + baseVowels[][0]}
|
||||
{ "A", "a", "A" },
|
||||
{ "I", "i", "I" },
|
||||
{ "U", "u", "U" },
|
||||
{ "E", "e", "Ae" },
|
||||
{ "O", "o", "Ao" },
|
||||
{ "EE", "ai", "Aai" },
|
||||
{ "OO", "au", "Aau" },
|
||||
{ "i", "-i", "A-i" }
|
||||
};
|
||||
|
||||
/** Returns true if and only if s is an ACIP wowel. You can't
|
||||
* just call this any time -- A is both a consonant and a vowel
|
||||
* in ACIP, so you have to call this in the right context. */
|
||||
public boolean isWowel(String s) {
|
||||
// I'm on my own with 'O and 'E and 'OO and 'EE, but GANG'O
|
||||
// appears and I wonder... so here they are. It's consistent
|
||||
// with 'I and 'A and 'U, at least: all the vowels may appear
|
||||
// as K'vowel. DLC FIXME: ask.
|
||||
return (null != getEwtsForWowel(s));
|
||||
}
|
||||
|
||||
/** Returns true if and only if s is an ACIP consonant. */
|
||||
public boolean isConsonant(String s) {
|
||||
return (null != getEwtsForConsonant(s));
|
||||
}
|
||||
|
||||
/** Gets the duffcodes for wowel, such that they look good with
|
||||
* the preceding glyph, and appends them to duff. */
|
||||
public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
|
||||
if (null == wowel) return;
|
||||
if (null == getEwtsForWowel(wowel)) // FIXME: expensive assertion! Use assert.
|
||||
throw new IllegalArgumentException("Wowel " + wowel + " isn't in the small set of wowels we handle correctly.");
|
||||
|
||||
// Order matters here.
|
||||
boolean context_added[] = new boolean[] { false };
|
||||
if (wowel.startsWith("A")) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
|
||||
} else if (wowel.indexOf("'U") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
|
||||
} else if (wowel.indexOf("'I") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
|
||||
} else {
|
||||
if (wowel.indexOf('\'') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
|
||||
}
|
||||
if (wowel.indexOf("EE") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
|
||||
} else if (wowel.indexOf('E') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
|
||||
}
|
||||
if (wowel.indexOf("OO") >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
|
||||
} else if (wowel.indexOf('O') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
|
||||
}
|
||||
if (wowel.indexOf('I') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
|
||||
}
|
||||
if (wowel.indexOf('U') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
|
||||
}
|
||||
if (wowel.indexOf('i') >= 0) {
|
||||
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
|
||||
}
|
||||
}
|
||||
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
|
||||
|
||||
if (wowel.indexOf('m') >= 0) {
|
||||
DuffCode last = (DuffCode)duff.get(duff.size() - 1);
|
||||
duff.remove(duff.size() - 1); // getBindu will add it back...
|
||||
TibTextUtils.getBindu(duff, last);
|
||||
}
|
||||
if (wowel.indexOf(':') >= 0)
|
||||
duff.add(TibetanMachineWeb.getGlyph(getEwtsForOther(":")));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -18,11 +18,10 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Stack;
|
||||
|
||||
import org.thdl.util.ThdlDebug;
|
||||
import org.thdl.util.ThdlOptions;
|
||||
|
||||
/**
|
||||
|
@ -36,8 +35,10 @@ import org.thdl.util.ThdlOptions;
|
|||
* the parser, not here in the lexical analyzer. That'd be cleaner,
|
||||
* and more like how you'd do things if you used lex and yacc.
|
||||
*
|
||||
* This is not public because you should use {@link ACIPTraits#scanner()}.
|
||||
*
|
||||
* @author David Chandler */
|
||||
public class ACIPTshegBarScanner extends TTshegBarScanner {
|
||||
class ACIPTshegBarScanner extends TTshegBarScanner {
|
||||
/** True if those ACIP snippets inside square brackets (e.g.,
|
||||
"[THIS]") are to be passed through into the output unmodified
|
||||
while retaining the brackets and if those ACIP snippets inside
|
||||
|
|
|
@ -18,11 +18,14 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
|
||||
/** A singleton class that should contain (but due to laziness and
|
||||
* ignorance probably does not contain) all the traits that make EWTS
|
||||
* transliteration different from other (say, ACIP) transliteration
|
||||
* schemes. */
|
||||
final class EWTSTraits implements TTraits {
|
||||
public final class EWTSTraits implements TTraits {
|
||||
/** sole instance of this class */
|
||||
private static EWTSTraits singleton = null;
|
||||
|
||||
|
@ -30,7 +33,7 @@ final class EWTSTraits implements TTraits {
|
|||
private EWTSTraits() { }
|
||||
|
||||
/** */
|
||||
public static EWTSTraits instance() {
|
||||
public static synchronized EWTSTraits instance() {
|
||||
if (null == singleton) {
|
||||
singleton = new EWTSTraits();
|
||||
}
|
||||
|
@ -79,4 +82,48 @@ final class EWTSTraits implements TTraits {
|
|||
|| "H".equals(s)
|
||||
|| "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:???
|
||||
}
|
||||
|
||||
/**
 * Returns the transliteration of the "a" vowel, which in EWTS is
 * the lowercase string "a".
 */
public String aVowel() {
    return "a";
}
|
||||
|
||||
public boolean isPostsuffix(String s) {
|
||||
return ("s".equals(s) || "d".equals(s));
|
||||
}
|
||||
|
||||
public boolean isPrefix(String l) {
|
||||
return ("'".equals(l)
|
||||
|| "m".equals(l)
|
||||
|| "b".equals(l)
|
||||
|| "d".equals(l)
|
||||
|| "g".equals(l));
|
||||
}
|
||||
|
||||
public boolean isSuffix(String l) {
|
||||
return ("s".equals(l)
|
||||
|| "g".equals(l)
|
||||
|| "d".equals(l)
|
||||
|| "m".equals(l)
|
||||
|| "'".equals(l)
|
||||
|| "b".equals(l)
|
||||
|| "ng".equals(l)
|
||||
|| "n".equals(l)
|
||||
|| "l".equals(l)
|
||||
|| "r".equals(l));
|
||||
}
|
||||
|
||||
/** Returns l, since this is EWTS's traits class. */
|
||||
public String getEwtsForConsonant(String l) { return l; }
|
||||
|
||||
/** Returns l, since this is EWTS's traits class. */
|
||||
public String getEwtsForOther(String l) { return l; }
|
||||
|
||||
/** Returns l, since this is EWTS's traits class. */
|
||||
public String getEwtsForWowel(String l) { return l; }
|
||||
|
||||
/**
 * Returns the scanner that goes with this traits class, namely the
 * instance handed out by EWTSTshegBarScanner.instance().
 */
public TTshegBarScanner scanner() {
    final TTshegBarScanner ewtsScanner = EWTSTshegBarScanner.instance();
    return ewtsScanner;
}
|
||||
|
||||
public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
|
||||
throw new Error("TODO(DLC)[EWTS->Tibetan]");
|
||||
}
|
||||
|
||||
/**
 * Not yet implemented for EWTS.  Every call fails with an Error whose
 * message is the TODO tag below; neither argument is examined.
 */
public String getUnicodeFor(String l, boolean subscribed) {
    final String todoTag = "TODO(DLC)[EWTS->Tibetan]";
    throw new Error(todoTag);
}
|
||||
}
|
||||
|
|
56
source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
Normal file
56
source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java
Normal file
|
@ -0,0 +1,56 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/**
|
||||
* This singleton class is able to break up Strings of EWTS text (for
|
||||
* example, an entire sutra file) into tsheg bars, comments, etc.
|
||||
* Non-Tibetan parts are segregated (so that consumers can ensure that
|
||||
* they remain non-Tibetan), and Tibetan passages are broken up into
|
||||
* tsheg bars.
|
||||
*
|
||||
* This is not public because you should use {@link EWTSTraits#scanner()}.
|
||||
*
|
||||
* @author David Chandler */
|
||||
class EWTSTshegBarScanner extends TTshegBarScanner {
|
||||
/** See the comment in TTshegBarScanner. This does not find
|
||||
errors and warnings that you'd think of a parser finding (DLC
|
||||
DOES IT?). */
|
||||
public ArrayList scan(String s, StringBuffer errors, int maxErrors,
|
||||
boolean shortMessages, String warningLevel) {
|
||||
// the size depends on whether it's mostly Tibetan or mostly
|
||||
// Latin and a number of other factors. This is meant to be
|
||||
// an underestimate, but not too much of an underestimate.
|
||||
ArrayList al = new ArrayList(s.length() / 10);
|
||||
throw new Error("DLC unimplemented");
|
||||
}
|
||||
|
||||
/** non-public because this is a singleton */
|
||||
protected EWTSTshegBarScanner() { }
|
||||
private static EWTSTshegBarScanner singleton = null;
|
||||
/** Returns the sole instance of this class. */
|
||||
public synchronized static EWTSTshegBarScanner instance() {
|
||||
if (null == singleton) {
|
||||
singleton = new EWTSTshegBarScanner();
|
||||
}
|
||||
return singleton;
|
||||
}
|
||||
}
|
|
@ -202,15 +202,16 @@ public class PackageTest extends TestCase {
|
|||
message. */
|
||||
static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1,
|
||||
false, "None");
|
||||
ArrayList al = ACIPTraits.instance().scanner().scan(ACIP, errors, -1,
|
||||
false, "None");
|
||||
if (null == al || errors.length() > 0)
|
||||
return null;
|
||||
org.thdl.tib.text.TibetanDocument tdoc
|
||||
= new org.thdl.tib.text.TibetanDocument();
|
||||
int loc[] = new int[] { 0 };
|
||||
try {
|
||||
if (!TConverter.convertToTMW(al,
|
||||
if (!TConverter.convertToTMW(ACIPTraits.instance(),
|
||||
al,
|
||||
tdoc,
|
||||
null,
|
||||
null,
|
||||
|
@ -7358,8 +7359,8 @@ tstHelper("ZUR");
|
|||
|
||||
private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false,
|
||||
warningLevel);
|
||||
ArrayList al = ACIPTraits.instance().scanner().scan(s, errors, -1, false,
|
||||
warningLevel);
|
||||
if (null != expectedScan) {
|
||||
if (!al.toString().equals(expectedScan)) {
|
||||
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
|
||||
|
@ -7392,7 +7393,7 @@ tstHelper("ZUR");
|
|||
|
||||
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer,
|
||||
int, boolean, String)}. */
|
||||
public void testScanner() {
|
||||
public void testAcipScanner() {
|
||||
shelp("Pm KA", "", "[TIBETAN_NON_PUNCTUATION:{Pm}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{KA}]");
|
||||
|
||||
shelp("KA (KHA\nGA)", "", "[TIBETAN_NON_PUNCTUATION:{KA}, TIBETAN_PUNCTUATION:{ }, START_PAREN:{(}, TIBETAN_NON_PUNCTUATION:{KHA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GA}, END_PAREN:{)}]");
|
||||
|
@ -7682,7 +7683,8 @@ tstHelper("ZUR");
|
|||
private static void uhelp(String acip, String expectedUnicode,
|
||||
String warningLevel, boolean shortMessages) {
|
||||
StringBuffer errors = new StringBuffer();
|
||||
String unicode = TConverter.convertToUnicodeText("ACIP", acip, errors,
|
||||
String unicode = TConverter.convertToUnicodeText(ACIPTraits.instance(),
|
||||
acip, errors,
|
||||
null, true,
|
||||
warningLevel,
|
||||
shortMessages);
|
||||
|
|
|
@ -69,10 +69,10 @@ public class TConverter {
|
|||
boolean shortMessages = false;
|
||||
String warningLevel = "Most";
|
||||
ArrayList al
|
||||
= ACIPTshegBarScanner.instance().scanFile(args[0], errors,
|
||||
maxErrors - 1,
|
||||
shortMessages,
|
||||
warningLevel);
|
||||
= ACIPTraits.instance().scanner().scanFile(args[0], errors,
|
||||
maxErrors - 1,
|
||||
shortMessages,
|
||||
warningLevel);
|
||||
|
||||
if (null == al) {
|
||||
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
|
||||
|
@ -103,8 +103,9 @@ public class TConverter {
|
|||
warnings = new StringBuffer();
|
||||
putWarningsInOutput = true;
|
||||
}
|
||||
convertToTMW(al, System.out, errors, warnings, null,
|
||||
putWarningsInOutput, warningLevel, shortMessages, colors);
|
||||
convertToTMW(ACIPTraits.instance(), al, System.out, errors, warnings,
|
||||
null, putWarningsInOutput, warningLevel, shortMessages,
|
||||
colors);
|
||||
int retCode = 0;
|
||||
if (errors.length() > 0) {
|
||||
System.err.println("Errors converting ACIP input file: ");
|
||||
|
@ -139,7 +140,8 @@ public class TConverter {
|
|||
* prefix rules in another
|
||||
* @throws IOException if we cannot write to out
|
||||
*/
|
||||
public static boolean convertToTMW(ArrayList scan,
|
||||
public static boolean convertToTMW(TTraits ttraits,
|
||||
ArrayList scan,
|
||||
OutputStream out,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
|
@ -152,7 +154,8 @@ public class TConverter {
|
|||
{
|
||||
TibetanDocument tdoc = new TibetanDocument();
|
||||
boolean rv
|
||||
= convertToTMW(scan, tdoc, errors, warnings, hasWarnings,
|
||||
= convertToTMW(ttraits,
|
||||
scan, tdoc, errors, warnings, hasWarnings,
|
||||
writeWarningsToResult, warningLevel,
|
||||
shortMessages, colors,
|
||||
new int[] { tdoc.getLength() });
|
||||
|
@ -169,7 +172,8 @@ public class TConverter {
|
|||
offset from zero inside tdoc at which conversion results will
|
||||
be placed. On output, loc[0] is one past the offset of the
|
||||
last of the conversion results. */
|
||||
public static boolean convertToTMW(ArrayList scan,
|
||||
public static boolean convertToTMW(TTraits ttraits,
|
||||
ArrayList scan,
|
||||
TibetanDocument tdoc,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
|
@ -181,7 +185,8 @@ public class TConverter {
|
|||
int[] loc)
|
||||
throws IOException
|
||||
{
|
||||
return convertTo(false, true, scan, null, tdoc, errors, warnings,
|
||||
return convertTo(false, true,
|
||||
ttraits, scan, null, tdoc, errors, warnings,
|
||||
hasWarnings, writeWarningsToResult, warningLevel,
|
||||
shortMessages, colors, loc,
|
||||
loc[0] == tdoc.getLength());
|
||||
|
@ -189,33 +194,30 @@ public class TConverter {
|
|||
|
||||
/** Returns UTF-8 encoded Unicode. A bit indirect, so use this
|
||||
* for testing only if performance is a concern. If errors occur
|
||||
* in scanning the ACIP or in converting a tsheg bar, then they
|
||||
* are appended to errors if errors is non-null, as well as
|
||||
* written to the result. If warnings occur in scanning the ACIP
|
||||
* or in converting a tsheg bar, then they are appended to
|
||||
* warnings if warnings is non-null, and they are written to the
|
||||
* result if writeWarningsToResult is true. Error and warning
|
||||
* messages are long and self-contained unless shortMessages is
|
||||
* true. Returns the conversion upon perfect success or if there
|
||||
* were merely warnings, null if errors occurred. */
|
||||
public static String convertToUnicodeText(String transliteration,
|
||||
String acip,
|
||||
* in scanning the transliteration or in converting a tsheg bar,
|
||||
* then they are appended to errors if errors is non-null, as
|
||||
* well as written to the result. If warnings occur in scanning
|
||||
* the transliteration or in converting a tsheg bar, then they
|
||||
* are appended to warnings if warnings is non-null, and they are
|
||||
* written to the result if writeWarningsToResult is true. Error
|
||||
* and warning messages are long and self-contained unless
|
||||
* shortMessages is true. Returns the conversion upon perfect
|
||||
* success or if there were merely warnings, null if errors
|
||||
* occurred. */
|
||||
public static String convertToUnicodeText(TTraits ttraits,
|
||||
String translit,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
boolean writeWarningsToResult,
|
||||
String warningLevel,
|
||||
boolean shortMessages) {
|
||||
if (transliteration != "ACIP") {
|
||||
ThdlDebug.noteIffyCode();
|
||||
throw new IllegalArgumentException("Unsupported transliteration");
|
||||
}
|
||||
ByteArrayOutputStream sw = new ByteArrayOutputStream();
|
||||
ArrayList al
|
||||
= ACIPTshegBarScanner.instance().scan(acip, errors, -1,
|
||||
shortMessages, warningLevel);
|
||||
= ttraits.scanner().scan(translit, errors, -1, shortMessages,
|
||||
warningLevel);
|
||||
try {
|
||||
if (null != al) {
|
||||
convertToUnicodeText(al, sw, errors,
|
||||
convertToUnicodeText(ttraits, al, sw, errors,
|
||||
warnings, null, writeWarningsToResult,
|
||||
warningLevel, shortMessages);
|
||||
return sw.toString("UTF-8");
|
||||
|
@ -236,7 +238,8 @@ public class TConverter {
|
|||
* writeWarningsToOut is true, then warnings also will be written
|
||||
* to out.
|
||||
* @return true upon perfect success, false if errors occurred.
|
||||
* @param scan result of ACIPTshegBarScanner.scan(..)
|
||||
* @param scan result of using ttraits.scanner() to break up the
|
||||
* original string of transliteration
|
||||
* @param out stream to which to write converted text
|
||||
* @param errors if non-null, all error messages are appended
|
||||
* @param warnings if non-null, all warning messages appropriate
|
||||
|
@ -246,9 +249,9 @@ public class TConverter {
|
|||
* false otherwise
|
||||
* @param writeWarningsToOut if true, then all warning messages
|
||||
* are written to out in the appropriate places
|
||||
* @throws IOException if we cannot write to out
|
||||
*/
|
||||
public static boolean convertToUnicodeText(ArrayList scan,
|
||||
* @throws IOException if we cannot write to out */
|
||||
public static boolean convertToUnicodeText(TTraits ttraits,
|
||||
ArrayList scan,
|
||||
OutputStream out,
|
||||
StringBuffer errors,
|
||||
StringBuffer warnings,
|
||||
|
@ -258,7 +261,8 @@ public class TConverter {
|
|||
boolean shortMessages)
|
||||
throws IOException
|
||||
{
|
||||
return convertTo(true, false, scan, out, null, errors, warnings,
|
||||
return convertTo(true, false,
|
||||
ttraits, scan, out, null, errors, warnings,
|
||||
hasWarnings, writeWarningsToOut, warningLevel,
|
||||
shortMessages, false, new int[] { -1 } , true);
|
||||
}
|
||||
|
@ -283,6 +287,7 @@ public class TConverter {
|
|||
|
||||
private static boolean convertTo(boolean toUnicode, // else to TMW
|
||||
boolean toRTF, // else to UTF-8-encoded text
|
||||
TTraits ttraits,
|
||||
ArrayList scan,
|
||||
OutputStream out, // for (toUnicode && !toRTF) mode
|
||||
TibetanDocument tdoc, // for !toUnicode mode or (toUnicode && toRTF) mode
|
||||
|
@ -368,7 +373,7 @@ public class TConverter {
|
|||
if (lastGuyWasNonPunct) {
|
||||
String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText()) + "]";
|
||||
if (null != writer) {
|
||||
String uni = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
String uni = ttraits.getUnicodeFor(s.getText(), false);
|
||||
if (null == uni) {
|
||||
hasErrors = true;
|
||||
uni = err;
|
||||
|
@ -377,7 +382,7 @@ public class TConverter {
|
|||
}
|
||||
if (null != tdoc) {
|
||||
String wylie
|
||||
= ACIPRules.getWylieForACIPOther(s.getText());
|
||||
= ttraits.getEwtsForOther(s.getText());
|
||||
if (null == wylie) {
|
||||
hasErrors = true;
|
||||
tdoc.appendRoman(tdocLocation[0], err, Color.RED);
|
||||
|
@ -658,7 +663,7 @@ public class TConverter {
|
|||
}
|
||||
|
||||
if (!done) {
|
||||
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
|
||||
if (null != writer) unicode = ttraits.getUnicodeFor(s.getText(), false);
|
||||
if (null != tdoc) {
|
||||
if (s.getText().equals("\r")
|
||||
|| s.getText().equals("\t")
|
||||
|
@ -675,7 +680,7 @@ public class TConverter {
|
|||
TibetanMachineWeb.getGlyph("#")
|
||||
}; // hard-coded EWTS values
|
||||
} else {
|
||||
String wy = ACIPRules.getWylieForACIPOther(s.getText());
|
||||
String wy = ttraits.getEwtsForOther(s.getText());
|
||||
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
|
||||
duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
|
||||
}
|
||||
|
|
|
@ -26,22 +26,27 @@ import java.util.ArrayList;
|
|||
|
||||
/** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The
|
||||
* left side is the consonant or empty; the right side is either the
|
||||
* vowel or '+' (indicating stacking) or a disambiguator (i.e., '-'
|
||||
* in ACIP or '.' in EWTS).
|
||||
* vowel or '+' (indicating stacking in both ACIP and EWTS) or a
|
||||
* disambiguator (e.g., '-' in ACIP or '.' in EWTS).
|
||||
* @author David Chandler */
|
||||
/* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */
|
||||
class TPair {
|
||||
/** The left side, or null if there is no left side. That is, the
|
||||
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
|
||||
/** the part that knows ACIP from EWTS */
|
||||
private TTraits traits;
|
||||
|
||||
/** Returns the part that knows ACIP from EWTS. */
|
||||
public TTraits getTraits() { return traits; }
|
||||
|
||||
/** The left side, or null if there is no left side. I.e., the
|
||||
* non-wowel, non-disambiguator, non-'+' guy. */
|
||||
private String l;
|
||||
String getLeft() {
|
||||
ThdlDebug.verify(!"".equals(l));
|
||||
return l;
|
||||
}
|
||||
|
||||
/** The right side. That is, the vowel, with 'm' or ':' "vowel"
|
||||
* after it if appropriate, or "-" (disambiguator), or "+"
|
||||
* (stacking), or null otherwise. */
|
||||
/** The right side. That is, the wowel or disambiguator or "+"
|
||||
* (for stacking) or null otherwise. */
|
||||
private String r;
|
||||
String getRight() {
|
||||
ThdlDebug.verify(!"".equals(r));
|
||||
|
@ -50,13 +55,14 @@ class TPair {
|
|||
|
||||
/** Constructs a new TPair with left side l and right side r.
|
||||
* Use null or the empty string to represent an absence. */
|
||||
TPair(String l, String r) {
|
||||
TPair(TTraits traits, String l, String r) {
|
||||
// Normalize:
|
||||
if (null != l && l.equals("")) l = null;
|
||||
if (null != r && r.equals("")) r = null;
|
||||
|
||||
this.l = l;
|
||||
this.r = r;
|
||||
this.traits = traits;
|
||||
}
|
||||
|
||||
/** Returns a nice String representation. Returns "(D . E)" for
|
||||
|
@ -67,8 +73,8 @@ class TPair {
|
|||
+ ((null == r) ? "" : r) + ")";
|
||||
}
|
||||
|
||||
/** Returns the number of ACIP characters that make up this
|
||||
* TPair. */
|
||||
/** Returns the number of transliteration characters that make up
|
||||
* this TPair. */
|
||||
int size() {
|
||||
return (((l == null) ? 0 : l.length())
|
||||
+ ((r == null) ? 0 : r.length()));
|
||||
|
@ -98,18 +104,18 @@ class TPair {
|
|||
sz = l.length();
|
||||
newL = l.substring(0, sz - N);
|
||||
}
|
||||
return new TPair(newL, newR);
|
||||
return new TPair(traits, newL, newR);
|
||||
}
|
||||
|
||||
/** Returns true if and only if this is nonempty and is l, if
|
||||
* present, is a legal ACIP consonant, and is r, if present, is a
|
||||
* legal ACIP vowel. */
|
||||
/** Returns true if and only if this is nonempty and if l, if
|
||||
* present, is a legal consonant, and if r, if present, is a
|
||||
* legal wowel. */
|
||||
boolean isLegal() {
|
||||
if (size() < 1)
|
||||
return false;
|
||||
if (null != l && !ACIPRules.isConsonant(l))
|
||||
if (null != l && !traits.isConsonant(l))
|
||||
return false;
|
||||
if (null != r && !ACIPRules.isWowel(r))
|
||||
if (null != r && !traits.isWowel(r))
|
||||
return false;
|
||||
return true;
|
||||
}
|
||||
|
@ -119,9 +125,9 @@ class TPair {
|
|||
boolean isPrefix() {
|
||||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r) // TODO(DLC)[EWTS->Tibetan]
|
||||
|| "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
|
||||
&& ACIPRules.isACIPPrefix(l));
|
||||
|| traits.disambiguator().equals(r)
|
||||
|| traits.aVowel().equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
|
||||
&& traits.isPrefix(l));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair could be a Tibetan
|
||||
|
@ -129,25 +135,25 @@ class TPair {
|
|||
boolean isPostSuffix() {
|
||||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r)
|
||||
|| "A".equals(r)) // FIXME: though warn about GAMASA vs. GAMS
|
||||
&& ACIPRules.isACIPPostsuffix(l));
|
||||
|| traits.disambiguator().equals(r)
|
||||
|| traits.aVowel().equals(r)) // FIXME: though warn about GAMASA vs. GAMS
|
||||
&& traits.isPostsuffix(l));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair could be a Tibetan
|
||||
* suffix. FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */
|
||||
* suffix. */
|
||||
boolean isSuffix() {
|
||||
return (null != l
|
||||
&& ((null == r || "".equals(r))
|
||||
|| "-".equals(r)
|
||||
|| "A".equals(r))
|
||||
&& ACIPRules.isACIPSuffix(l));
|
||||
|| traits.disambiguator().equals(r)
|
||||
|| traits.aVowel().equals(r))
|
||||
&& traits.isSuffix(l));
|
||||
}
|
||||
|
||||
/** Returns true if and only if this pair is merely a
|
||||
* disambiguator. */
|
||||
boolean isDisambiguator() {
|
||||
return ("-".equals(r) && getLeft() == null);
|
||||
return (traits.disambiguator().equals(r) && getLeft() == null);
|
||||
}
|
||||
|
||||
/** Yep, this works for TPairs. */
|
||||
|
@ -160,16 +166,16 @@ class TPair {
|
|||
return false;
|
||||
}
|
||||
|
||||
/** Returns a TPair that is like this pair except that it has
|
||||
* a "+" on the right if this pair is empty on the right and is
|
||||
* empty on the right if this pair has a disambiguator (i.e., a
|
||||
* '-') on the right. May return itself (but never mutates this
|
||||
/** Returns a TPair that is like this pair except that it has a
|
||||
* "+" on the right if this pair is empty on the right and is
|
||||
* empty on the right if this pair has a disambiguator on the
|
||||
* right. May return itself (but never mutates this
|
||||
* instance). */
|
||||
TPair insideStack() {
|
||||
if (null == getRight())
|
||||
return new TPair(getLeft(), "+");
|
||||
else if ("-".equals(getRight()))
|
||||
return new TPair(getLeft(), null);
|
||||
return new TPair(traits, getLeft(), "+");
|
||||
else if (traits.disambiguator().equals(getRight()))
|
||||
return new TPair(traits, getLeft(), null);
|
||||
else
|
||||
return this;
|
||||
}
|
||||
|
@ -194,7 +200,7 @@ class TPair {
|
|||
String getWylie(boolean justLeft) {
|
||||
String leftWylie = null;
|
||||
if (getLeft() != null) {
|
||||
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
|
||||
leftWylie = traits.getEwtsForConsonant(getLeft());
|
||||
if (leftWylie == null) {
|
||||
if (isNumeric())
|
||||
leftWylie = getLeft();
|
||||
|
@ -208,7 +214,7 @@ class TPair {
|
|||
else if ("+".equals(getRight()))
|
||||
rightWylie = "+";
|
||||
else if (getRight() != null)
|
||||
rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
|
||||
rightWylie = traits.getEwtsForWowel(getRight());
|
||||
if (null == rightWylie) rightWylie = "";
|
||||
return leftWylie + rightWylie;
|
||||
}
|
||||
|
@ -227,18 +233,19 @@ class TPair {
|
|||
void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
|
||||
boolean subscribed) {
|
||||
if (null != getLeft()) {
|
||||
String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
|
||||
String x = traits.getUnicodeFor(getLeft(), subscribed);
|
||||
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
|
||||
consonantSB.append(x);
|
||||
}
|
||||
if (null != getRight()
|
||||
&& !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
|
||||
String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
|
||||
String x = traits.getUnicodeFor(getRight(), subscribed);
|
||||
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
|
||||
vowelSB.append(x);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO(DLC)[EWTS->Tibetan]
|
||||
/** Returns true if this pair is surely the last pair in an ACIP
|
||||
* stack. Stacking continues through (* . ) and (* . +), but
|
||||
* stops anywhere else. */
|
||||
|
|
|
@ -33,6 +33,9 @@ import java.util.ArrayList;
|
|||
*
|
||||
* @author David Chandler */
|
||||
class TPairList {
|
||||
/** the part that knows ACIP from EWTS */
|
||||
private TTraits traits;
|
||||
|
||||
/** FIXME: change me and see if performance improves. */
|
||||
private static final int INITIAL_SIZE = 1;
|
||||
|
||||
|
@ -41,17 +44,20 @@ class TPairList {
|
|||
|
||||
/** Creates a new list containing just p. */
|
||||
public TPairList(TPair p) {
|
||||
this.traits = p.getTraits();
|
||||
al = new ArrayList(1);
|
||||
add(p);
|
||||
}
|
||||
|
||||
/** Creates an empty list. */
|
||||
public TPairList() {
|
||||
public TPairList(TTraits traits) {
|
||||
this.traits = traits;
|
||||
al = new ArrayList(INITIAL_SIZE);
|
||||
}
|
||||
|
||||
/** Creates an empty list with the capacity to hold N items. */
|
||||
public TPairList(int N) {
|
||||
public TPairList(TTraits traits, int N) {
|
||||
this.traits = traits;
|
||||
al = new ArrayList(N);
|
||||
}
|
||||
|
||||
|
@ -181,7 +187,7 @@ class TPairList {
|
|||
return ErrorsAndWarnings.getMessage(125, shortMessages, translit);
|
||||
} else if ((null == p.getLeft() && !"-".equals(p.getRight()))
|
||||
|| (null != p.getLeft()
|
||||
&& !ACIPRules.isConsonant(p.getLeft())
|
||||
&& !traits.isConsonant(p.getLeft())
|
||||
&& !p.isNumeric())) {
|
||||
// FIXME: stop handling this outside of ErrorsAndWarnings:
|
||||
if (null == p.getLeft()) {
|
||||
|
@ -406,12 +412,12 @@ class TPairList {
|
|||
// and only if b1 is one, etc.
|
||||
for (int counter = 0; counter < (1<<numBreaks); counter++) {
|
||||
TStackList sl = new TStackList();
|
||||
TPairList currentStack = new TPairList();
|
||||
TPairList currentStack = new TPairList(traits);
|
||||
for (int k = startLoc; k <= i; k++) {
|
||||
if (!get(k).isDisambiguator()) {
|
||||
if (get(k).isNumeric()
|
||||
|| (get(k).getLeft() != null
|
||||
&& ACIPRules.isConsonant(get(k).getLeft())))
|
||||
&& traits.isConsonant(get(k).getLeft())))
|
||||
currentStack.add(get(k).insideStack());
|
||||
else
|
||||
return null; // sA, for example, is illegal.
|
||||
|
@ -419,7 +425,7 @@ class TPairList {
|
|||
if (k == i || get(k).endsACIPStack()) {
|
||||
if (!currentStack.isEmpty())
|
||||
sl.add(currentStack.asStack());
|
||||
currentStack = new TPairList();
|
||||
currentStack = new TPairList(traits);
|
||||
} else {
|
||||
if (numBreaks > 0) {
|
||||
for (int j = 0; breakStart+j < 3; j++) {
|
||||
|
@ -427,7 +433,7 @@ class TPairList {
|
|||
&& 1 == ((counter >> j) & 1)) {
|
||||
if (!currentStack.isEmpty())
|
||||
sl.add(currentStack.asStack());
|
||||
currentStack = new TPairList();
|
||||
currentStack = new TPairList(traits);
|
||||
break; // shouldn't matter, but you never know
|
||||
}
|
||||
}
|
||||
|
@ -460,9 +466,9 @@ class TPairList {
|
|||
if (!isEmpty()) {
|
||||
TPair lastPair = get(size() - 1);
|
||||
if ("+".equals(lastPair.getRight()))
|
||||
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
|
||||
al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
|
||||
else if ("-".equals(lastPair.getRight()))
|
||||
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
|
||||
al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
|
||||
}
|
||||
return this;
|
||||
}
|
||||
|
@ -506,10 +512,10 @@ class TPairList {
|
|||
add_U0F7F = true;
|
||||
StringBuffer rr = new StringBuffer(p.getRight());
|
||||
rr.deleteCharAt(where);
|
||||
p = new TPair(p.getLeft(), rr.toString());
|
||||
p = new TPair(traits, p.getLeft(), rr.toString());
|
||||
}
|
||||
boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight());
|
||||
String thislWylie = ACIPRules.getWylieForACIPConsonant(p.getLeft());
|
||||
String thislWylie = traits.getEwtsForConsonant(p.getLeft());
|
||||
if (thislWylie == null) {
|
||||
char ch;
|
||||
if (p.isNumeric()) {
|
||||
|
@ -528,21 +534,21 @@ class TPairList {
|
|||
boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString());
|
||||
boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString());
|
||||
if (ddebug && !isTibetan && !isSanskrit && !isNumeric) {
|
||||
System.out.println("OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
System.out.println("OTHER for " + lWylie + " with vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
}
|
||||
if (isTibetan && isSanskrit) {
|
||||
// RVA, e.g. It must be Tibetan because RWA is what
|
||||
// you'd use for RA over fixed-form WA.
|
||||
isSanskrit = false;
|
||||
}
|
||||
if (ddebug && hasNonAVowel && ACIPRules.getWylieForACIPVowel(p.getRight()) == null) {
|
||||
System.out.println("vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
if (ddebug && hasNonAVowel && traits.getEwtsForWowel(p.getRight()) == null) {
|
||||
System.out.println("vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight());
|
||||
}
|
||||
TGCPair tp;
|
||||
indexList.add(new Integer(index));
|
||||
tp = new TGCPair(lWylie.toString(),
|
||||
(hasNonAVowel
|
||||
? ACIPRules.getWylieForACIPVowel(p.getRight())
|
||||
? traits.getEwtsForWowel(p.getRight())
|
||||
: ""),
|
||||
(isNumeric
|
||||
? TGCPair.TYPE_OTHER
|
||||
|
@ -697,9 +703,9 @@ class TPairList {
|
|||
if (lastPair.getRight() == null || lastPair.equals("-")) {
|
||||
duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey));
|
||||
} else {
|
||||
ACIPRules.getDuffForACIPVowel(duffsAndErrors,
|
||||
TibetanMachineWeb.getGlyph(hashKey),
|
||||
lastPair.getRight());
|
||||
traits.getDuffForWowel(duffsAndErrors,
|
||||
TibetanMachineWeb.getGlyph(hashKey),
|
||||
lastPair.getRight());
|
||||
}
|
||||
if (previousSize == duffsAndErrors.size())
|
||||
throw new Error("TPairList with no duffs? " + toString()); // FIXME: change to assertion.
|
||||
|
|
|
@ -121,7 +121,7 @@ class TPairListFactory {
|
|||
|
||||
// base case for our recursion:
|
||||
if ("".equals(acip))
|
||||
return new TPairList();
|
||||
return new TPairList(ttraits);
|
||||
|
||||
StringBuffer acipBuf = new StringBuffer(acip);
|
||||
int howMuchBuf[] = new int[1];
|
||||
|
@ -131,9 +131,9 @@ class TPairListFactory {
|
|||
&& null != head.getLeft()
|
||||
&& null != head.getRight()
|
||||
&& weHaveSeenVowelAlready
|
||||
&& ACIPRules.isACIPSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
|
||||
&& ttraits.isSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
|
||||
&& head.getRight().startsWith("'")) {
|
||||
head = new TPair(head.getLeft(),
|
||||
head = new TPair(ttraits, head.getLeft(),
|
||||
// Without this disambiguator, we are
|
||||
// less efficient (8 parses, not 4) and
|
||||
// we can't handle PA'AM'ANG etc.
|
||||
|
@ -177,11 +177,11 @@ class TPairListFactory {
|
|||
}
|
||||
|
||||
// TODO(DLC)[EWTS->Tibetan]: doc
|
||||
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits /* TODO(DLC)[EWTS->Tibetan]: use */) {
|
||||
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) {
|
||||
|
||||
// base case for our recursion:
|
||||
if ("".equals(ewts))
|
||||
return new TPairList();
|
||||
return new TPairList(ttraits);
|
||||
|
||||
StringBuffer ewtsBuf = new StringBuffer(ewts);
|
||||
int howMuchBuf[] = new int[1];
|
||||
|
@ -238,11 +238,11 @@ class TPairListFactory {
|
|||
int i, xl = acip.length();
|
||||
if (0 == xl) {
|
||||
howMuch[0] = 0;
|
||||
return new TPair(null, null);
|
||||
return new TPair(ttraits, null, null);
|
||||
}
|
||||
if (acip.charAt(0) == ttraits.disambiguatorChar()) {
|
||||
howMuch[0] = 1;
|
||||
return new TPair(null, ttraits.disambiguator());
|
||||
return new TPair(ttraits, null, ttraits.disambiguator());
|
||||
}
|
||||
char ch = acip.charAt(0);
|
||||
|
||||
|
@ -250,7 +250,7 @@ class TPairListFactory {
|
|||
// like seeing 1-2-3-4.
|
||||
if (ch >= '0' && ch <= '9') {
|
||||
howMuch[0] = 1; // not 2...
|
||||
return new TPair(acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
|
||||
return new TPair(ttraits, acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
|
||||
}
|
||||
|
||||
String l = null, r = null;
|
||||
|
@ -264,11 +264,11 @@ class TPairListFactory {
|
|||
int ll = (null == l) ? 0 : l.length();
|
||||
if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) {
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, ttraits.disambiguator());
|
||||
return new TPair(ttraits, l, ttraits.disambiguator());
|
||||
}
|
||||
if (null != l && xl > ll && acip.charAt(ll) == '+') {
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "+");
|
||||
return new TPair(ttraits, l, "+");
|
||||
}
|
||||
for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) {
|
||||
String t = null;
|
||||
|
@ -289,7 +289,7 @@ class TPairListFactory {
|
|||
&& acip.charAt(z) == '+') {
|
||||
acip.deleteCharAt(z-1);
|
||||
howMuch[0] = l.length() + 1;
|
||||
return new TPair(l, "+");
|
||||
return new TPair(ttraits, l, "+");
|
||||
}
|
||||
|
||||
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */
|
||||
|
@ -305,14 +305,14 @@ class TPairListFactory {
|
|||
if (null == l && null == r) {
|
||||
howMuch[0] = 1; // not 2...
|
||||
// add a disambiguator to avoid exponential running time:
|
||||
return new TPair(acip.substring(0, 1),
|
||||
return new TPair(ttraits, acip.substring(0, 1),
|
||||
(xl == 1) ? null : ttraits.disambiguator());
|
||||
}
|
||||
|
||||
howMuch[0] = (((l == null) ? 0 : l.length())
|
||||
+ ((r == null) ? 0 : r.length())
|
||||
+ mod);
|
||||
return new TPair(l, r);
|
||||
return new TPair(ttraits, l, r);
|
||||
} // TODO(DLC)[EWTS->Tibetan]:
|
||||
}
|
||||
|
||||
|
|
|
@ -18,12 +18,18 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import org.thdl.tib.text.DuffCode;
|
||||
|
||||
/** A TTraits object encapsulates all the things that make a
|
||||
* particular Roman transliteration scheme unique. If both EWTS and
|
||||
* ACIP transliterations have a property in common, then it's likely
|
||||
* encoded in a manner that's hard to modify. But if they differ in
|
||||
* some respect, then that difference should be encoded in a TTraits
|
||||
* object.
|
||||
* particular Roman transliteration scheme unique. For the most
|
||||
* part, this difference is expressed at the finest granularity
|
||||
* possible -- often single characters of Roman transliteration.
|
||||
*
|
||||
* <p>If both EWTS and ACIP transliterations have a property in
|
||||
* common, then it's likely encoded in a manner that's hard to
|
||||
* modify. But if they differ in some respect, then that difference
|
||||
* should be encoded in a TTraits object.
|
||||
*
|
||||
* <p>It is very likely that classes that implement this interface
|
||||
* will choose to use the design pattern 'singleton'. */
|
||||
|
@ -62,9 +68,63 @@ interface TTraits {
|
|||
/** Returns true if and only if <em>s</em> is a stretch of
|
||||
* transliteration corresponding to a Tibetan wowel (without any
|
||||
* [achen or other] consonant) */
|
||||
boolean isWowel(String s);
|
||||
boolean isWowel(String s); // TODO(DLC)[EWTS->Tibetan]: what about "m:" as opposed to "m" or ":"
|
||||
|
||||
/** Returns true if and only if the pair given has a simple error
|
||||
* other than being a mere disambiguator. */
|
||||
boolean hasSimpleError(TPair p);
|
||||
|
||||
/** The implicit 'ahhh' vowel, the one you see when you write the
|
||||
human-friendly transliteration for "\u0f40\u0f0b". */
|
||||
String aVowel();
|
||||
|
||||
/** Returns true if s is a valid postsuffix. s must not have a
|
||||
wowel on it. */
|
||||
boolean isPostsuffix(String s);
|
||||
|
||||
/** Returns true if and only if l is the representation of a
|
||||
letter that can be a suffix. Note that all postsuffixes are
|
||||
also suffixes. l should not have a wowel. */
|
||||
boolean isSuffix(String l);
|
||||
|
||||
/** Returns true if and only if l is the representation of a
|
||||
letter that can be a prefix. l should not have a wowel. */
|
||||
boolean isPrefix(String l);
|
||||
|
||||
/** Returns the EWTS transliteration corresponding to the
|
||||
* consonant l, which should not have a vowel. Returns null if
|
||||
* there is no such EWTS.
|
||||
*
|
||||
* <p>May return "W" instead of "w", "r" instead of "R", and "y"
|
||||
* instead of "Y" because we sometimes don't have enough context
|
||||
* to decide.
|
||||
*
|
||||
* <p>The reasoning for "W" instead of "w" is that r-w and r+w
|
||||
* are both known hash keys (as {@link
|
||||
* org.thdl.tib.text.TibetanMachineWeb} would call them). We
|
||||
* sort 'em out this way. (They are the only things like this
|
||||
* according to bug report #800166.) */
|
||||
String getEwtsForConsonant(String l);
|
||||
|
||||
/** Returns the EWTS corresponding to the given punctuation or
|
||||
* mark. Returns null if there is no such EWTS. */
|
||||
String getEwtsForOther(String l);
|
||||
|
||||
/** Returns the EWTS corresponding to the given "wowel". Returns
|
||||
* null if there is no such EWTS. */
|
||||
String getEwtsForWowel(String l);
|
||||
|
||||
/** If l is a consonant or vowel or punctuation mark, then this
|
||||
* returns the Unicode for it. The Unicode for the subscribed
|
||||
* form of the glyph is returned if subscribed is true. Returns
|
||||
* null if l is unknown. */
|
||||
String getUnicodeFor(String l, boolean subscribed);
|
||||
|
||||
/** Returns a scanner that can break up a string of
|
||||
transliteration. */
|
||||
TTshegBarScanner scanner();
|
||||
|
||||
/** Gets the duffcodes for wowel, such that they look good with
|
||||
* the preceding glyph, and appends them to duff. */
|
||||
void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel);
|
||||
}
|
||||
|
|
|
@ -18,7 +18,11 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.ttt;
|
||||
|
||||
import java.io.*;
|
||||
import java.io.IOException;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.InputStream;
|
||||
import java.io.BufferedReader;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Stack;
|
||||
|
||||
|
@ -40,7 +44,7 @@ public abstract class TTshegBarScanner {
|
|||
* If errors is non-null, error messages will be appended to it.
|
||||
* Returns a list of TStrings that is the scan. Warning and
|
||||
* error messages in the result will be long and self-contained
|
||||
* unless shortMessagse is true.
|
||||
* unless shortMessages is true.
|
||||
*
|
||||
* <p>This is not so efficient; copies the whole file into memory
|
||||
* first.
|
||||
|
|
Loading…
Reference in a new issue