Two things:

First, TMW->EWTS gives dbas and dngas instead of dabs and dangs
because Chris Fynn's e-mail from today has dbas and dngas.

Second, Down with ACIPRules.  Long live ACIPTraits.  EWTS->Tibetan
conversion is closer still.
This commit is contained in:
dchandler 2005-02-22 04:36:54 +00:00
parent 82c6047cc2
commit c16f633ecf
18 changed files with 950 additions and 818 deletions

View file

@ -969,6 +969,22 @@ public class DuffPaneTest extends DuffPaneTestBase {
ensureKeysGiveCorrectWylie("'gas");
/* Chris Fynn's e-mail on Feb 21 2005 leads to these test
cases: */
{
ensureKeysGiveCorrectWylie("dgas");
ensureKeysGiveCorrectWylie("'gas");
ensureKeysGiveCorrectWylie("dngas");
ensureKeysGiveCorrectWylie("gnad");
ensureKeysGiveCorrectWylie("mnad");
ensureKeysGiveCorrectWylie("bags");
ensureKeysGiveCorrectWylie("dbas");
ensureKeysGiveCorrectWylie("'bas");
ensureKeysGiveCorrectWylie("mags");
ensureKeysGiveCorrectWylie("mangs");
ensureKeysGiveCorrectWylie("dmas");
}
ensureKeysGiveCorrectWylie("gangs");
ensureKeysGiveCorrectWylie("gnags");

View file

@ -27,7 +27,7 @@ import org.thdl.util.*;
import org.thdl.tib.text.*;
import org.thdl.tib.text.ttt.TConverter;
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
import org.thdl.tib.text.ttt.ACIPTraits;
import java.util.ArrayList;
/** TibetanConverter is a command-line utility for converting to and
@ -297,17 +297,18 @@ public class TibetanConverter implements FontConverterConstants {
if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) {
try {
ArrayList al
= ACIPTshegBarScanner.instance().scanStream(in, null,
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
1000 - 1),
shortMessages,
warningLevel);
= ACIPTraits.instance().scanner().scanStream(in, null,
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
1000 - 1),
shortMessages,
warningLevel);
if (null == al)
return 47;
boolean embeddedWarnings = (warningLevel != "None");
boolean hasWarnings[] = new boolean[] { false };
if (ACIP_TO_UNI_TEXT == ct) {
if (!TConverter.convertToUnicodeText(al, out, null,
if (!TConverter.convertToUnicodeText(ACIPTraits.instance(),
al, out, null,
null, hasWarnings,
embeddedWarnings,
warningLevel,
@ -315,7 +316,8 @@ public class TibetanConverter implements FontConverterConstants {
return 46;
} else {
if (ct != ACIP_TO_TMW) throw new Error("badness");
if (!TConverter.convertToTMW(al, out, null, null,
if (!TConverter.convertToTMW(ACIPTraits.instance(),
al, out, null, null,
hasWarnings,
embeddedWarnings,
warningLevel, shortMessages,

View file

@ -137,7 +137,7 @@ public class TGCPair implements THDLWylieConstants {
consonantACIP = "V";
else
consonantACIP
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie);
= org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(consonantWylie);
if (null == consonantACIP) {
if (null != consonantWylie && consonantWylie.startsWith("R+"))
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
@ -160,7 +160,7 @@ public class TGCPair implements THDLWylieConstants {
}
if (vowelWylie != null) {
String vowelACIP
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(vowelWylie);
= org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(vowelWylie);
if (null == vowelACIP) {
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, "");
} else {

View file

@ -25,7 +25,7 @@ import javax.swing.text.rtf.RTFEditorKit;
import java.io.*;
import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.ttt.ACIPTshegBarScanner;
import org.thdl.tib.text.ttt.ACIPTraits;
import org.thdl.tib.text.ttt.TConverter;
import org.thdl.tib.text.tshegbar.LegalTshegBar;
import org.thdl.tib.text.tshegbar.UnicodeConstants;
@ -333,8 +333,8 @@ public class TibTextUtils implements THDLWylieConstants {
{
StringBuffer errors = new StringBuffer();
String warningLevel = withWarnings ? "All" : "None";
ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500,
false, warningLevel);
ArrayList al = ACIPTraits.instance().scanner().scan(acip, errors, 500,
false, warningLevel);
if (null == al || errors.length() > 0) {
if (errors.length() > 0)
throw new InvalidACIPException(errors.toString());
@ -348,8 +348,8 @@ public class TibTextUtils implements THDLWylieConstants {
}
try {
int tloc[] = new int[] { loc };
TConverter.convertToTMW(al, tdoc, null, null, null,
putWarningsInOutput, warningLevel,
TConverter.convertToTMW(ACIPTraits.instance(), al, tdoc, null, null,
null, putWarningsInOutput, warningLevel,
false, colors, tloc);
return tloc[0] - loc;
} catch (IOException e) {
@ -1430,6 +1430,53 @@ public class TibTextUtils implements THDLWylieConstants {
candidateType = getCandidateTypeModuloAppendage(candidateType);
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
/* Update: Chris Fynn wrote this in response to an
e-mail from David Chapman on Feb 21, 2005:
<quote Chris Fynn feb 21 2005>
When working out the rules for Tibetan and Dzongkha
collation in Bhutan we came up with the following sequences
that could be ambiguous:
0F51 0F42 0F66
0F60 0F42 0F66
0F51 0F44 0F66
0F42 0F53 0F51
0F58 0F53 0F51
0F56 0F42 0F66
0F51 0F56 0F66
0F60 0F56 0F66
0F58 0F42 0F66
0F58 0F44 0F66
0F51 0F58 0F66
After much consultation with experts in Bhutan it was
decided these should always be read as follows:
0F51 0F42 0F66 dgas
0F60 0F42 0F66 'gas
0F51 0F44 0F66 dngas *
0F42 0F53 0F51 gnad
0F58 0F53 0F51 mnad *
0F56 0F42 0F66 bags
0F51 0F56 0F66 dbas
0F60 0F56 0F66 'bas *
0F58 0F42 0F66 mags
0F58 0F44 0F66 mangs
0F51 0F58 0F66 dmas
In most cases it was found that only one of the two possible
readings actually existed as words. 0F51 0F44 0F66 , 0F58
0F53 0F51, and 0F60 0F56 0F66 were not found as syllables in
any known words, but the experts felt that *if* they
occurred in Tibetan or Dzongkha text then dngas, mnad, and
'bas would be the most likely reading.
</quote>
Because of this e-mail, dbas and dngas were added to the list of
exceptions. */
/* Yes, this is ambiguous. How do we handle it? See
* this from Andres (but note that only 4 of the 14 in
* the second list are ambiguous because ra na sa and
@ -1480,7 +1527,9 @@ public class TibTextUtils implements THDLWylieConstants {
|| wylie2.equals("n")
|| wylie2.equals("s")))
|| (wylie1.equals("d") && (wylie2.equals("g")
|| wylie2.equals("m")))
|| wylie2.equals("m")
|| wylie2.equals("b")
|| wylie2.equals("ng")))
|| (wylie1.equals("b") && wylie2.equals("d"))
|| (wylie1.equals("m") && wylie2.equals("d"))
|| (wylie1.equals("'") && (wylie2.equals("g")

View file

@ -1988,7 +1988,7 @@ private static String acipForGlyph(String hashKey) {
// ~X is a special case because the EWTS is 2 characters in
// length
|| "~X".equals(hashKey)) // hard-coded EWTS value
return org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(hashKey);
return org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(hashKey);
else
// else we are not able to use it because it's not smart
// about stacks (e.g., W+W)
@ -2116,7 +2116,7 @@ public static String getACIPForGlyph(DuffCode dc1,
// DLC FIXME: TMW.53 is probably going to come out all wrong (VA
// vs. WA) from this function, but
// ACIPRules.getACIPForEWTS(String) seems to come through... will
// ACIPTraits.getACIPForEWTS(String) seems to come through... will
// it always?
String hashKey = getHashKeyForGlyph(dc1);

View file

@ -9,9 +9,9 @@
// - blank lines should be ignored
// - <?x?> marks a command
//
// If you change the Wylie here, it can break the ACIP->TMW and
// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be
// sure to run 'ant clean check' after your change.
// If you change the EWTS transliteration here, it can break the
// ACIP->TMW and ACIP->Unicode conversion. So keep ACIPTraits in sync
// with this, and be sure to run 'ant clean check' after your change.
//
// Note that some glyphs have EWTS \uF021-\uF0FF inclusive. These do
// not have anything in the Unicode column, though, because this is
@ -37,7 +37,7 @@
// by the way.
//
// If EWTS changes, then ACIP->TMW and ACIP->Unicode will break --
// modify ACIPRules and test test test.
// modify ACIPTraits and test test test.
<?Input:Punctuation?>
//_~32,1~0,32
@ -645,7 +645,7 @@ r+m+m~51,4~~7,59~1,110~8,121~1,123~1,125~8,107~8,114~f62,fa8,fa8
// Note that TPairList.java's unicodeExceptionsMap must be updated if
// we change who uses U+0F6A.
R+Y~52,4~~7,60~1,110~8,120~1,123~1,125~8,106~8,113~f6a,fbb
// R+W is mentioned in ACIPRules.java:
// R+W is mentioned in ACIPTraits.java:
R+W~196,4~~7,61~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fba
R+sh~53,4~~7,62~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fb4
R+sh+y~54,4~~7,63~1,109~8,122~1,123~1,125~8,108~8,115~f6a,fb4,fb1
@ -667,7 +667,7 @@ l+h+w~197,4~~7,78~1,109~8,121~1,123~1,125~8,106~8,113~f63,fb7,fad
w+y~69,4~~7,79~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb1
w+r~70,4~~7,80~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb2
w+n~195,4~~7,81~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fa3
// w+W is mentioned in ACIPRules.java:
// w+W is mentioned in ACIPTraits.java:
w+W~194,4~~7,82~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fba
sh+ts~71,4~~7,83~1,109~8,120~1,123~1,125~8,106~8,113~f64,fa9
sh+ts+y~72,4~~7,84~1,109~8,122~1,123~1,125~8,108~8,115~f64,fa9,fb1

View file

@ -1,658 +0,0 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.List;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.TibTextUtils;
// TODO(DLC)[EWTS->Tibetan]: this and ACIPTraits -- unify?
/** Canonizes some facts regarding the ACIP transcription system.
* @author David Chandler */
public class ACIPRules {
/** The number of characters in the longest ACIP consonant.  {Ksh}
 *  and {DZH} are the longest, with three characters each, so this
 *  is three.  This is a constant; it is final so that no caller
 *  can reassign it by accident. */
public static final int MAX_CONSONANT_LENGTH = 3;
/** The number of characters in the longest ACIP wowel.  {'EEm:}
 *  and {'OOm:} are the longest, with five characters each, so this
 *  is five.  This is a constant; it is final so that no caller
 *  can reassign it by accident. */
public static final int MAX_WOWEL_LENGTH = 5;
/** Backing set for {@link #isWowel(String)}, built on first use so
 *  that each later call is a single hash lookup. */
private static HashSet acipVowels = null;
/** The eight base ACIP vowels.  Each row holds three strings: the
 *  ACIP vowel, its EWTS, and the EWTS for the same ACIP vowel
 *  written with a leading apostrophe. */
private static String[][] baseVowels = new String[][] {
    { "A", "a", "A" },
    { "I", "i", "I" },
    { "U", "u", "U" },
    { "E", "e", "Ae" },
    { "O", "o", "Ao" },
    { "EE", "ai", "Aai" },
    { "OO", "au", "Aau" },
    { "i", "-i", "A-i" }
};
/** Tells whether s is an ACIP wowel: one of the base vowels,
 *  optionally preceded by an apostrophe and optionally followed by
 *  "m", ":", or "m:".  Beware that "A" is both a consonant and a
 *  vowel in ACIP, so the answer is only meaningful when the caller
 *  already knows it is looking at a vowel position.
 *  @param s the candidate ACIP wowel
 *  @return true if and only if s is an ACIP wowel */
public static boolean isWowel(String s) {
    if (null != acipVowels)
        return acipVowels.contains(s);

    acipVowels = new HashSet(baseVowels.length * 8);
    final String[] endings = { "", "m", ":", "m:" };
    for (int v = 0; v < baseVowels.length; v++) {
        final String plain = baseVowels[v][0];
        // The apostrophe forms of O, E, OO, and EE are a guess, but
        // GANG'O appears in practice and it is consistent with 'I,
        // 'A, and 'U: every vowel may appear as K'vowel.  DLC FIXME:
        // ask.
        for (int e = 0; e < endings.length; e++) {
            acipVowels.add(plain + endings[e]);
            acipVowels.add('\'' + plain + endings[e]);
        }
        // Keep this code in sync with getUnicodeFor.
        // Keep this code in sync with getWylieForACIPVowel.
    }
    // {Pm} is treated just like {PAm}; {P:} is treated just like
    // {PA:}; {Pm:} is treated just like {PAm:}.  That is not done
    // here, though; it happens thanks to
    // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
    return acipVowels.contains(s);
}
/** Backing set for {@link #isConsonant(String)}, built on first use
 *  so that each later call is a single hash lookup. */
private static HashSet consonants = null;
/** Tells whether acip is a bare ACIP consonant, i.e. one written
 *  without any vowel.  "K" qualifies; "KA" and "X" do not.
 *  @param acip the candidate consonant
 *  @return true if and only if acip is an ACIP consonant */
public static boolean isConsonant(String acip) {
    if (null == consonants) {
        final String[] all = {
            "V", "K", "KH", "G", "NG", "C", "CH", "J", "NY",
            "T", "TH", "D", "N", "P", "PH", "B", "M",
            "TZ", "TS", "DZ", "W", "ZH", "Z", "Y", "R", "L",
            "SH", "S", "H",
            "t", "th", "d", "n", "sh",
            "dH", "DH", "BH",
            "DZH", // longest, MAX_CONSONANT_LENGTH characters
            "Ksh", // longest, MAX_CONSONANT_LENGTH characters
            "GH", "'", "A"
        };
        consonants = new HashSet();
        for (int i = 0; i < all.length; i++)
            consonants.add(all[i]);
    }
    return consonants.contains(acip);
}
/** A map from wylie to ACIP.  Note that the Wylie "w" maps to
    both "V" and "W".  It is created and filled by {@link
    #putMapping(HashMap,String,String)}. */
private static HashMap wylieToACIP = null;
/** Returns the ACIP transliteration corresponding to the THDL
    Extended Wylie <em>atom</em> EWTS, or null if EWTS is not
    recognized.  A null EWTS is never recognized, so null is
    returned for it rather than a NullPointerException being thrown.
    An atom that is not in the table as a whole is split on "-" and
    "+" and translated piece by piece; if any piece is unknown,
    null is returned.
    @param EWTS the EWTS atom to translate, possibly null
    @return the ACIP for EWTS, or null if there is none */
public static String getACIPForEWTS(String EWTS) {
    if (null == EWTS)
        return null;

    // These calls are made only for their side effect: each one
    // fills its part of wylieToACIP the first time it runs.
    getWylieForACIPConsonant(null);
    getWylieForACIPOther(null);
    getWylieForACIPVowel(null);

    // We want W+NA, not V+NA; we want WA, not VA.
    final boolean useCapitalW = EWTS.startsWith("w");

    final String ans = (String)wylieToACIP.get(EWTS);
    if (null != ans)
        return useCapitalW ? "W" + ans.substring(1) : ans;

    StringBuffer finalAns = new StringBuffer(EWTS.length());
    StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
    while (sTok.hasMoreTokens()) {
        String part, tok = sTok.nextToken();
        if (tok.equals("-") || tok.equals("+")) {
            // Delimiters are copied through unchanged.
            part = tok;
        } else if ("w".equals(tok)) {
            // There are only two stacks in TMW that have
            // U+0FBA: R+Wa and w+Wa.  TMW->ACIP fails for
            // these unless we handle it here.  (FIXME:
            // add an automated test for this).
            if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
                part = "W";
            } else {
                part = "V";
            }
        } else {
            part = (String)wylieToACIP.get(tok);
        }
        if (null == part) return null;
        finalAns.append(part);
    }
    if (useCapitalW)
        finalAns.setCharAt(0, 'W');
    return finalAns.toString();
}
/** Records one ACIP/EWTS pair in both directions: ACIP->EWTS goes
    into toWylie, and EWTS->ACIP goes into {@link #wylieToACIP}.
    The first call also creates {@link #wylieToACIP} and seeds it
    with a few reverse-only entries that have no forward mapping.
    @param toWylie the forward (ACIP->EWTS) map to add to
    @param ACIP the ACIP transliteration
    @param EWTS the corresponding EWTS transliteration */
private static void putMapping(HashMap toWylie, String ACIP, String EWTS) {
    toWylie.put(ACIP, EWTS);
    if (wylieToACIP == null) {
        final HashMap reverse = new HashMap(75);
        // We don't want to put "/" in toWylie:
        reverse.put("(", "/");
        reverse.put(")", "/");
        reverse.put("?", "\\");
        reverse.put("_", " "); // oddball.
        reverse.put("o'i", "O'I"); // oddball for TMW9.61.
        wylieToACIP = reverse;
    }
    wylieToACIP.put(EWTS, ACIP);
}
/** Tells whether s is an ACIP consonant, which is the case exactly
 *  when {@link #getWylieForACIPConsonant(String)} knows an EWTS
 *  for it.
 *  @param s the candidate consonant
 *  @return true if and only if s is an ACIP consonant */
static final boolean isACIPConsonant(String s) {
    final String ewts = getWylieForACIPConsonant(s);
    return ewts != null;
}
/** ACIP consonant -> EWTS, filled on the first call to {@link
 *  #getWylieForACIPConsonant(String)}. */
private static HashMap acipConsonant2wylie = null;
/** Looks up the EWTS for a bare ACIP consonant (one without the
 *  "A" vowel).
 *
 *  <p>The answer is "W" for ACIP "W", "r" for ACIP "R", and "y" for
 *  ACIP "Y", even though sometimes the EWTS for those is "w", "R",
 *  or "Y".  Handle that in the caller.
 *  @param acip the ACIP consonant
 *  @return the EWTS for acip, or null if there is no such EWTS */
static final String getWylieForACIPConsonant(String acip) {
    if (null == acipConsonant2wylie) {
        // Each row is { ACIP, EWTS }; the rows are registered in
        // this order.
        final String[][] pairs = {
            // oddball:
            { "V", "w" },
            // more oddballs:
            { "DH", "d+h" },
            { "BH", "b+h" },
            { "dH", "D+h" },
            { "DZH", "dz+h" },
            { "Ksh", "k+Sh" },
            { "GH", "g+h" },

            { "K", "k" },
            { "KH", "kh" },
            { "G", "g" },
            { "NG", "ng" },
            { "C", "c" },
            { "CH", "ch" },
            { "J", "j" },
            { "NY", "ny" },
            { "T", "t" },
            { "TH", "th" },
            { "D", "d" },
            { "N", "n" },
            { "P", "p" },
            { "PH", "ph" },
            { "B", "b" },
            { "M", "m" },
            { "TZ", "ts" },
            { "TS", "tsh" },
            { "DZ", "dz" },
            /* NOTE WELL: sometimes "w", sometimes "W".  Handle
               this in the caller.  Reasoning for "W" instead of
               "w": r-w and r+w are both known hash keys.  We sort
               'em out this way.  (They are the only things like
               this according to bug report #800166.) */
            { "W", "W" },
            { "ZH", "zh" },
            { "Z", "z" },
            { "'", "'" },
            { "Y", "y" },
            { "R", "r" },
            { "L", "l" },
            { "SH", "sh" },
            { "S", "s" },
            { "H", "h" },
            { "A", "a" },
            { "t", "T" },
            { "th", "Th" },
            { "d", "D" },
            { "n", "N" },
            { "sh", "Sh" }
        };
        acipConsonant2wylie = new HashMap(37);
        for (int i = 0; i < pairs.length; i++)
            putMapping(acipConsonant2wylie, pairs[i][0], pairs[i][1]);
    }
    return (String)acipConsonant2wylie.get(acip);
}
/** ACIP "vowel" -> EWTS, filled on the first call to {@link
 *  #getWylieForACIPVowel(String)}. */
private static HashMap acipVowel2wylie = null;
/** Looks up the EWTS for an ACIP "vowel": a base vowel, optionally
 *  preceded by an apostrophe and optionally followed by "m", ":",
 *  or "m:".
 *  @param acip the ACIP vowel
 *  @return the EWTS for acip, or null if there is no such EWTS */
static final String getWylieForACIPVowel(String acip) {
    if (null == acipVowel2wylie) {
        acipVowel2wylie = new HashMap(baseVowels.length * 4);
        // Parallel arrays: the ACIP ending at index k becomes the
        // EWTS ending at index k.
        final String[] acipEndings = { "", "m", ":", "m:" };
        final String[] ewtsEndings = { "", "M", "H", "MH" };
        for (int v = 0; v < baseVowels.length; v++) {
            final String plain = baseVowels[v][0];
            final String plainEwts = baseVowels[v][1];
            final String apostropheEwts = baseVowels[v][2];
            for (int k = 0; k < acipEndings.length; k++) {
                putMapping(acipVowel2wylie,
                           plain + acipEndings[k],
                           plainEwts + ewtsEndings[k]);
                putMapping(acipVowel2wylie,
                           '\'' + plain + acipEndings[k],
                           apostropheEwts + ewtsEndings[k]);
            }
        }
        // {Pm} is treated just like {PAm}; {P:} is treated just
        // like {PA:}; {Pm:} is treated just like {PAm:}.  But
        // that happens thanks to
        // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
    }
    return (String)acipVowel2wylie.get(acip);
}
/** ACIP punctuation or mark -> EWTS, filled on the first call to
 *  {@link #getWylieForACIPOther(String)}. */
private static HashMap acipOther2wylie = null;
/** Looks up the EWTS for an ACIP punctuation character, mark, or
 *  digit.
 *  @param acip the ACIP punctuation, mark, or digit
 *  @return the EWTS for acip, or null if there is no such EWTS */
static final String getWylieForACIPOther(String acip) {
    if (null == acipOther2wylie) {
        acipOther2wylie = new HashMap(20);

        // don't use putMapping for this.  We don't want TMW->ACIP
        // to produce "." for a U+0F0C because ACIP doesn't say
        // that "." means U+0F0C.  It just seems to in practice
        // for ACIP Release IV texts.
        acipOther2wylie.put(".", "*");

        // Each row is { ACIP, EWTS }; the rows are registered in
        // this order.
        final String[][] pairs = {
            { "m", "M" },
            { ":", "H" },
            { ",", "/" },
            { " ", " " },
            { ";", "|" },
            { "`", "!" },
            { "*", "@#" },
            // There is no glyph in TMW with the EWTS @##, so we
            // don't register a mapping from "#" to "@##".
            { "%", "~X" },
            { "o", "X" },
            { "&", "&" },
            { "^", "\\u0F38" }
        };
        for (int i = 0; i < pairs.length; i++)
            putMapping(acipOther2wylie, pairs[i][0], pairs[i][1]);

        // The digits "0" through "9" map to themselves.
        for (int d = 0; d <= 9; d++) {
            final String digit = String.valueOf(d);
            putMapping(acipOther2wylie, digit, digit);
        }
    }
    return (String)acipOther2wylie.get(acip);
}
/** ACIP -> Unicode for the full (non-subscribed) forms, plus all
 *  vowels, marks, digits, punctuation, and whitespace.  Filled on
 *  the first call to {@link #getUnicodeFor(String,boolean)}. */
private static HashMap superACIP2unicode = null;
/** ACIP consonant -> Unicode for the subscribed forms.  Filled on
 *  the first call to {@link #getUnicodeFor(String,boolean)}. */
private static HashMap subACIP2unicode = null;
/** Looks up the Unicode for an ACIP consonant, vowel, or
 *  punctuation mark.  When subscribed is true and a subscribed form
 *  is known, that form is returned; otherwise the lookup falls back
 *  to the full form.
 *  @param acip the ACIP to translate
 *  @param subscribed true if the subscribed form is wanted
 *  @return the Unicode for acip, or null if acip is unknown */
static String getUnicodeFor(String acip, boolean subscribed) {
    if (superACIP2unicode == null) {
        final boolean compactUnicode
            = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
        superACIP2unicode = new HashMap(144);
        subACIP2unicode = new HashMap(42);

        // oddball: V is registered in its subscribed form only.
        subACIP2unicode.put("V", "\u0FAD");

        // Consonants whose Unicode depends on the option read above.
        // Each row is { ACIP, full form if compact, full form
        // otherwise, subscribed form if compact, subscribed form
        // otherwise }.
        final String[][] optionDependent = {
            { "DH", "\u0F52", "\u0F51\u0FB7", "\u0FA2", "\u0FA1\u0FB7" },
            { "BH", "\u0F57", "\u0F56\u0FB7", "\u0FA7", "\u0FA6\u0FB7" },
            { "dH", "\u0F4D", "\u0F4C\u0FB7", "\u0F9D", "\u0F9C\u0FB7" },
            { "DZH", "\u0F5C", "\u0F5B\u0FB7", "\u0FAC", "\u0FAB\u0FB7" },
            { "Ksh", "\u0F69", "\u0F40\u0FB5", "\u0FB9", "\u0F90\u0FB5" },
            { "GH", "\u0F43", "\u0F42\u0FB7", "\u0F93", "\u0F92\u0FB7" }
        };
        for (int i = 0; i < optionDependent.length; i++) {
            final String[] row = optionDependent[i];
            superACIP2unicode.put(row[0], compactUnicode ? row[1] : row[2]);
            subACIP2unicode.put(row[0], compactUnicode ? row[3] : row[4]);
        }

        // Ordinary consonants.  Each row is { ACIP, full form,
        // subscribed form }.
        final String[][] consonantForms = {
            { "K", "\u0F40", "\u0F90" },
            { "KH", "\u0F41", "\u0F91" },
            { "G", "\u0F42", "\u0F92" },
            { "NG", "\u0F44", "\u0F94" },
            { "C", "\u0F45", "\u0F95" },
            { "CH", "\u0F46", "\u0F96" },
            { "J", "\u0F47", "\u0F97" },
            { "NY", "\u0F49", "\u0F99" },
            { "T", "\u0F4F", "\u0F9F" },
            { "TH", "\u0F50", "\u0FA0" },
            { "D", "\u0F51", "\u0FA1" },
            { "N", "\u0F53", "\u0FA3" },
            { "P", "\u0F54", "\u0FA4" },
            { "PH", "\u0F55", "\u0FA5" },
            { "B", "\u0F56", "\u0FA6" },
            { "M", "\u0F58", "\u0FA8" },
            { "TZ", "\u0F59", "\u0FA9" },
            { "TS", "\u0F5A", "\u0FAA" },
            { "DZ", "\u0F5B", "\u0FAB" },
            { "W", "\u0F5D", "\u0FBA" }, // oddball
            { "ZH", "\u0F5E", "\u0FAE" },
            { "Z", "\u0F5F", "\u0FAF" },
            { "'", "\u0F60", "\u0FB0" },
            { "Y", "\u0F61", "\u0FB1" },
            { "R", "\u0F62", "\u0FB2" },
            { "L", "\u0F63", "\u0FB3" },
            { "SH", "\u0F64", "\u0FB4" },
            { "S", "\u0F66", "\u0FB6" },
            { "H", "\u0F67", "\u0FB7" },
            { "A", "\u0F68", "\u0FB8" },
            { "t", "\u0F4A", "\u0F9A" },
            { "th", "\u0F4B", "\u0F9B" },
            { "d", "\u0F4C", "\u0F9C" },
            { "n", "\u0F4E", "\u0F9E" },
            { "sh", "\u0F65", "\u0FB5" }
        };
        for (int i = 0; i < consonantForms.length; i++) {
            superACIP2unicode.put(consonantForms[i][0], consonantForms[i][1]);
            subACIP2unicode.put(consonantForms[i][0], consonantForms[i][2]);
        }

        // Vowels.  Each row is { ACIP base vowel, Unicode }.  The
        // inherent vowel A contributes no Unicode of its own.
        final String[][] vowelBases = {
            { "A", "" },
            { "I", "\u0F72" },
            { "E", "\u0F7A" },
            { "O", "\u0F7C" },
            { "U", "\u0F74" },
            { "OO", "\u0F7D" },
            { "EE", "\u0F7B" },
            { "i", "\u0F80" }
        };
        // Each row is { ACIP ending, Unicode appended for it }.
        // :m does not appear, though you'd think it's as valid as m:.
        final String[][] vowelEndings = {
            { "", "" },
            { "m", "\u0F7E" },
            { ":", "\u0F7F" },
            { "m:", "\u0F7E\u0F7F" }
        };
        for (int v = 0; v < vowelBases.length; v++) {
            for (int e = 0; e < vowelEndings.length; e++) {
                final String name = vowelBases[v][0] + vowelEndings[e][0];
                final String code = vowelBases[v][1] + vowelEndings[e][1];
                // A bare "A" is the consonant registered above, so
                // it must not be overwritten with a vowel entry.
                if (!"A".equals(name))
                    superACIP2unicode.put(name, code);
                // The apostrophe form prepends U+0F71.
                superACIP2unicode.put('\'' + name, "\u0F71" + code);
            }
        }
        // The endings are also valid with no vowel written at all.
        for (int e = 1; e < vowelEndings.length; e++)
            superACIP2unicode.put(vowelEndings[e][0], vowelEndings[e][1]);

        // The digits "0" through "9" are U+0F20 through U+0F29.
        for (int d = 0; d <= 9; d++)
            superACIP2unicode.put(String.valueOf(d),
                                  String.valueOf((char)(0x0F20 + d)));

        // Punctuation and whitespace.  Each row is { ACIP, Unicode }.
        final String[][] others = {
            { "&", "\u0F85" },
            { ",", "\u0F0D" },
            { " ", "\u0F0B" },
            { ".", "\u0F0C" },
            { "`", "\u0F08" },
            { "*", "\u0F04\u0F05" },
            { "#", "\u0F04\u0F05\u0F05" },
            { "%", "\u0F35" }, // but might be U+0F14, so we warn.
            { "o", "\u0F37" },
            { ";", "\u0F11" },
            { "\r", "\r" },
            { "\t", "\t" },
            { "\r\n", "\r\n" },
            { "\n", "\n" },
            { "\\", "\u0F84" },
            { "^", "\u0F38" }
        };
        for (int i = 0; i < others.length; i++)
            superACIP2unicode.put(others[i][0], others[i][1]);

        // DLC FIXME: "^ GONG" is "^GONG", right?
        // DLC FIXME: what's the Unicode for x?  RC said there is none in plain-text Unicode for x.  But what about in RTF Unicode?
    }

    String answer = subscribed ? (String)subACIP2unicode.get(acip) : null;
    if (null == answer)
        answer = (String)superACIP2unicode.get(acip);
    return answer;
}
/** Appends to duff the glyphs for the ACIP vowel named by vowel,
* passing preceding to the TibTextUtils helpers that pick the
* glyphs.  Does nothing if vowel is null.
* @param duff the list that the glyphs are appended to
* @param preceding handed unchanged to TibTextUtils.getVowel;
* presumably the glyph the vowel attaches to -- TODO confirm
* @param vowel an ACIP vowel known to {@link
* #getWylieForACIPVowel(String)}, or null
* @throws IllegalArgumentException if vowel is non-null but {@link
* #getWylieForACIPVowel(String)} has no EWTS for it */
static void getDuffForACIPVowel(ArrayList duff, DuffCode preceding, String vowel) {
if (null == vowel) return;
if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert.
throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
// Order matters here.
// context_added is shared by every getVowel call below.
boolean context_added[] = new boolean[] { false };
if (vowel.startsWith("A")) {
// Plain A, with or without a trailing m and/or colon.
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
} else if (vowel.indexOf("'U") >= 0) {
// 'U and 'I each have a single combined constant, so they are
// tested before the general apostrophe case below.
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
} else if (vowel.indexOf("'I") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
} else {
// Any other vowel: an apostrophe contributes A_VOWEL first, and
// then the base vowel is added.
if (vowel.indexOf('\'') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
}
// The two-letter vowels are tested before their one-letter
// counterparts because "EE" contains 'E' and "OO" contains 'O'.
if (vowel.indexOf("EE") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
} else if (vowel.indexOf('E') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
}
if (vowel.indexOf("OO") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
} else if (vowel.indexOf('O') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
}
if (vowel.indexOf('I') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
}
if (vowel.indexOf('U') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
}
if (vowel.indexOf('i') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
}
}
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
if (vowel.indexOf('m') >= 0) {
// NOTE(review): this reads the last element of duff, so it
// assumes duff is not empty at this point -- confirm that this
// holds for every vowel containing 'm'.
DuffCode last = (DuffCode)duff.get(duff.size() - 1);
duff.remove(duff.size() - 1); // getBindu will add it back...
TibTextUtils.getBindu(duff, last);
}
// A colon adds the glyph whose key is "H".
if (vowel.indexOf(':') >= 0)
duff.add(TibetanMachineWeb.getGlyph("H"));
}
/** Tells whether l is the ACIP representation of a letter that may
 *  act as a suffix.  Every postsuffix is also a suffix.  Pass the
 *  bare letter without an "A", i.e. "S" rather than "SA".
 *  @param l the ACIP letter to test; null yields false
 *  @return true if and only if l is one of the ten suffix letters */
public static boolean isACIPSuffix(String l) {
    final String[] suffixes = {
        "S", "G", "D", "M", "'", "B", "NG", "N", "L", "R"
    };
    for (int i = 0; i < suffixes.length; i++) {
        if (suffixes[i].equals(l)) {
            return true;
        }
    }
    return false;
}
/** Tells whether l is the ACIP representation of a letter that may
 *  act as a prefix.  Pass the bare letter without an "A", i.e. "D"
 *  rather than "DA".
 *  @param l the ACIP letter to test; null yields false
 *  @return true if and only if l is one of the five prefix letters */
public static boolean isACIPPrefix(String l) {
    final String[] prefixes = { "'", "M", "B", "D", "G" };
    for (int i = 0; i < prefixes.length; i++) {
        if (prefixes[i].equals(l)) {
            return true;
        }
    }
    return false;
}
/** Tells whether l is the ACIP representation of a letter that may
 *  act as a postsuffix.  Pass the bare letter without an "A", i.e.
 *  "D" rather than "DA".
 *  @param l the ACIP letter to test; null yields false
 *  @return true if and only if l is "S" or "D" */
public static boolean isACIPPostsuffix(String l) {
    if ("S".equals(l)) {
        return true;
    }
    if ("D".equals(l)) {
        return true;
    }
    return false;
}
}

View file

@ -18,11 +18,25 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.List;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.TibTextUtils;
/** A singleton class that should contain (but due to laziness and
* ignorance probably does not contain) all the traits that make ACIP
* transliteration different from other (say, EWTS)
* transliterations. */
final class ACIPTraits implements TTraits {
* transliteration scheme different from other (say, EWTS)
* transliteration schemes. This is not safe to use in concurrent
* programs but it would be easy to make it so. */
public final class ACIPTraits implements TTraits {
/** sole instance of this class */
private static ACIPTraits singleton = null;
@ -30,7 +44,7 @@ final class ACIPTraits implements TTraits {
private ACIPTraits() { }
/** Returns the singleton instance of this class. */
public static ACIPTraits instance() {
public static /* synchronized */ ACIPTraits instance() {
if (null == singleton) {
singleton = new ACIPTraits();
}
@ -43,15 +57,536 @@ final class ACIPTraits implements TTraits {
/** Returns the hyphen, '-'. */
public char disambiguatorChar() {
    final char hyphen = '-';
    return hyphen;
}
public int maxConsonantLength() { return ACIPRules.MAX_CONSONANT_LENGTH; }
/** Returns the value of MAX_CONSONANT_LENGTH. */
public int maxConsonantLength() {
    final int longest = MAX_CONSONANT_LENGTH;
    return longest;
}
public int maxWowelLength() { return ACIPRules.MAX_WOWEL_LENGTH; }
public boolean isConsonant(String s) { return ACIPRules.isConsonant(s); }
public boolean isWowel(String s) { return ACIPRules.isWowel(s); }
/** Returns the value of MAX_WOWEL_LENGTH. */
public int maxWowelLength() {
    final int longest = MAX_WOWEL_LENGTH;
    return longest;
}
/** Reports whether p consists of a left side equal to "A" with no
 *  right side at all.
 *  @param p the pair to inspect; must not be null
 *  @return true if and only if p's left is "A" and p's right is null */
public boolean hasSimpleError(TPair p) {
    if (!"A".equals(p.getLeft())) {
        return false;
    }
    return null == p.getRight();
}
/** Returns "A". */
public String aVowel() {
    final String a = "A";
    return a;
}
/** Tells whether l is one of the two ACIP postsuffix letters.
 *  @param l the ACIP letter to test; null yields false
 *  @return true if and only if l is "S" or "D" */
public boolean isPostsuffix(String l) {
    if ("S".equals(l)) {
        return true;
    }
    if ("D".equals(l)) {
        return true;
    }
    return false;
}
/** Tells whether l is one of the ten ACIP suffix letters.
 *  @param l the ACIP letter to test; null yields false
 *  @return true if and only if l is S, G, D, M, ', B, NG, N, L or R */
public boolean isSuffix(String l) {
    final String[] suffixes = {
        "S", "G", "D", "M", "'", "B", "NG", "N", "L", "R"
    };
    for (int i = 0; i < suffixes.length; i++) {
        if (suffixes[i].equals(l)) {
            return true;
        }
    }
    return false;
}
/** Tells whether l is one of the five ACIP prefix letters.
 *  @param l the ACIP letter to test; null yields false
 *  @return true if and only if l is ', M, B, D or G */
public boolean isPrefix(String l) {
    final String[] prefixes = { "'", "M", "B", "D", "G" };
    for (int i = 0; i < prefixes.length; i++) {
        if (prefixes[i].equals(l)) {
            return true;
        }
    }
    return false;
}
/** Lazily built map from an ACIP string to its Unicode when not
 *  subscribed.  Besides consonants it also holds the vowel, digit,
 *  punctuation and whitespace entries.  Null until getUnicodeFor
 *  is first called. */
private HashMap superACIP2unicode = null;
/** Lazily built map from an ACIP consonant to the Unicode used
 *  when subscribed is true.  Null until getUnicodeFor is first
 *  called. */
private HashMap subACIP2unicode = null;
/** Returns the Unicode for the ACIP string acip, or null if acip is
 *  in neither table.  The tables are built on the first call.  The
 *  boolean option
 *  "thdl.acip.to.unicode.conversions.use.0F52.et.cetera" is read at
 *  that time and selects, for DH, BH, dH, DZH, Ksh and GH, between
 *  a single code point (e.g. U+0F52) and a two-code-point sequence
 *  (e.g. U+0F51 U+0FB7).
 *  @param acip the ACIP consonant, wowel, digit or punctuation to
 *  look up
 *  @param subscribed if true, the subscribed table is consulted
 *  first and the non-subscribed table is used only when it has no
 *  entry for acip
 *  @return the Unicode string, or null if acip is not in the
 *  tables */
public /* synchronized */ String getUnicodeFor(String acip, boolean subscribed) {
if (superACIP2unicode == null) {
final boolean compactUnicode
= ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
superACIP2unicode = new HashMap(144);
subACIP2unicode = new HashMap(42);
// oddball:
// ("V" has a subscribed form only; there is no superACIP2unicode
// entry for it.)
subACIP2unicode.put("V", "\u0FAD");
// Letters whose encoding depends on compactUnicode:
superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
// Remaining consonants, each with a non-subscribed and a
// subscribed entry:
superACIP2unicode.put("K", "\u0F40");
subACIP2unicode.put("K", "\u0F90");
superACIP2unicode.put("KH", "\u0F41");
subACIP2unicode.put("KH", "\u0F91");
superACIP2unicode.put("G", "\u0F42");
subACIP2unicode.put("G", "\u0F92");
superACIP2unicode.put("NG", "\u0F44");
subACIP2unicode.put("NG", "\u0F94");
superACIP2unicode.put("C", "\u0F45");
subACIP2unicode.put("C", "\u0F95");
superACIP2unicode.put("CH", "\u0F46");
subACIP2unicode.put("CH", "\u0F96");
superACIP2unicode.put("J", "\u0F47");
subACIP2unicode.put("J", "\u0F97");
superACIP2unicode.put("NY", "\u0F49");
subACIP2unicode.put("NY", "\u0F99");
superACIP2unicode.put("T", "\u0F4F");
subACIP2unicode.put("T", "\u0F9F");
superACIP2unicode.put("TH", "\u0F50");
subACIP2unicode.put("TH", "\u0FA0");
superACIP2unicode.put("D", "\u0F51");
subACIP2unicode.put("D", "\u0FA1");
superACIP2unicode.put("N", "\u0F53");
subACIP2unicode.put("N", "\u0FA3");
superACIP2unicode.put("P", "\u0F54");
subACIP2unicode.put("P", "\u0FA4");
superACIP2unicode.put("PH", "\u0F55");
subACIP2unicode.put("PH", "\u0FA5");
superACIP2unicode.put("B", "\u0F56");
subACIP2unicode.put("B", "\u0FA6");
superACIP2unicode.put("M", "\u0F58");
subACIP2unicode.put("M", "\u0FA8");
superACIP2unicode.put("TZ", "\u0F59");
subACIP2unicode.put("TZ", "\u0FA9");
superACIP2unicode.put("TS", "\u0F5A");
subACIP2unicode.put("TS", "\u0FAA");
superACIP2unicode.put("DZ", "\u0F5B");
subACIP2unicode.put("DZ", "\u0FAB");
superACIP2unicode.put("W", "\u0F5D");
subACIP2unicode.put("W", "\u0FBA"); // oddball
superACIP2unicode.put("ZH", "\u0F5E");
subACIP2unicode.put("ZH", "\u0FAE");
superACIP2unicode.put("Z", "\u0F5F");
subACIP2unicode.put("Z", "\u0FAF");
superACIP2unicode.put("'", "\u0F60");
subACIP2unicode.put("'", "\u0FB0");
superACIP2unicode.put("Y", "\u0F61");
subACIP2unicode.put("Y", "\u0FB1");
superACIP2unicode.put("R", "\u0F62");
subACIP2unicode.put("R", "\u0FB2");
superACIP2unicode.put("L", "\u0F63");
subACIP2unicode.put("L", "\u0FB3");
superACIP2unicode.put("SH", "\u0F64");
subACIP2unicode.put("SH", "\u0FB4");
superACIP2unicode.put("S", "\u0F66");
subACIP2unicode.put("S", "\u0FB6");
superACIP2unicode.put("H", "\u0F67");
subACIP2unicode.put("H", "\u0FB7");
superACIP2unicode.put("A", "\u0F68");
subACIP2unicode.put("A", "\u0FB8");
superACIP2unicode.put("t", "\u0F4A");
subACIP2unicode.put("t", "\u0F9A");
superACIP2unicode.put("th", "\u0F4B");
subACIP2unicode.put("th", "\u0F9B");
superACIP2unicode.put("d", "\u0F4C");
subACIP2unicode.put("d", "\u0F9C");
superACIP2unicode.put("n", "\u0F4E");
subACIP2unicode.put("n", "\u0F9E");
superACIP2unicode.put("sh", "\u0F65");
subACIP2unicode.put("sh", "\u0FB5");
// Wowels.  These live only in superACIP2unicode, so a lookup
// with subscribed == true falls through to them.
superACIP2unicode.put("I", "\u0F72");
superACIP2unicode.put("E", "\u0F7A");
superACIP2unicode.put("O", "\u0F7C");
superACIP2unicode.put("U", "\u0F74");
superACIP2unicode.put("OO", "\u0F7D");
superACIP2unicode.put("EE", "\u0F7B");
superACIP2unicode.put("i", "\u0F80");
// The same with a leading apostrophe: U+0F71 is prepended.
superACIP2unicode.put("'A", "\u0F71");
superACIP2unicode.put("'I", "\u0F71\u0F72");
superACIP2unicode.put("'E", "\u0F71\u0F7A");
superACIP2unicode.put("'O", "\u0F71\u0F7C");
superACIP2unicode.put("'U", "\u0F71\u0F74");
superACIP2unicode.put("'OO", "\u0F71\u0F7D");
superACIP2unicode.put("'EE", "\u0F71\u0F7B");
superACIP2unicode.put("'i", "\u0F71\u0F80");
// With a trailing "m": U+0F7E is appended.
superACIP2unicode.put("Im", "\u0F72\u0F7E");
superACIP2unicode.put("Em", "\u0F7A\u0F7E");
superACIP2unicode.put("Om", "\u0F7C\u0F7E");
superACIP2unicode.put("Um", "\u0F74\u0F7E");
superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
superACIP2unicode.put("im", "\u0F80\u0F7E");
superACIP2unicode.put("'Am", "\u0F71\u0F7E");
superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
// With a trailing ":": U+0F7F is appended.
superACIP2unicode.put("I:", "\u0F72\u0F7F");
superACIP2unicode.put("E:", "\u0F7A\u0F7F");
superACIP2unicode.put("O:", "\u0F7C\u0F7F");
superACIP2unicode.put("U:", "\u0F74\u0F7F");
superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
superACIP2unicode.put("i:", "\u0F80\u0F7F");
superACIP2unicode.put("'A:", "\u0F71\u0F7F");
superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
// With a trailing "m:": U+0F7E U+0F7F are appended.
superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
// :m does not appear, though you'd think it's as valid as m:.
superACIP2unicode.put("m", "\u0F7E");
superACIP2unicode.put(":", "\u0F7F");
superACIP2unicode.put("m:", "\u0F7E\u0F7F");
// "A" followed by m and/or ":" maps to the same Unicode as the
// bare m and/or ":" entries just above.
superACIP2unicode.put("Am", "\u0F7E");
superACIP2unicode.put("A:", "\u0F7F");
superACIP2unicode.put("Am:", "\u0F7E\u0F7F");
// digits
superACIP2unicode.put("0", "\u0F20");
superACIP2unicode.put("1", "\u0F21");
superACIP2unicode.put("2", "\u0F22");
superACIP2unicode.put("3", "\u0F23");
superACIP2unicode.put("4", "\u0F24");
superACIP2unicode.put("5", "\u0F25");
superACIP2unicode.put("6", "\u0F26");
superACIP2unicode.put("7", "\u0F27");
superACIP2unicode.put("8", "\u0F28");
superACIP2unicode.put("9", "\u0F29");
// punctuation
superACIP2unicode.put("&", "\u0F85");
superACIP2unicode.put(",", "\u0F0D");
superACIP2unicode.put(" ", "\u0F0B");
superACIP2unicode.put(".", "\u0F0C");
superACIP2unicode.put("`", "\u0F08");
// NOTE(review): the next line repeats the previous put verbatim;
// it overwrites the entry with the same value, so it is harmless.
superACIP2unicode.put("`", "\u0F08");
superACIP2unicode.put("*", "\u0F04\u0F05");
superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn.
superACIP2unicode.put("o", "\u0F37");
superACIP2unicode.put(";", "\u0F11");
// Line breaks and tabs map to themselves.
superACIP2unicode.put("\r", "\r");
superACIP2unicode.put("\t", "\t");
superACIP2unicode.put("\r\n", "\r\n");
superACIP2unicode.put("\n", "\n");
superACIP2unicode.put("\\", "\u0F84");
superACIP2unicode.put("^", "\u0F38");
// DLC FIXME: "^ GONG" is "^GONG", right?
// DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode?
}
// Prefer the subscribed form when asked for one, but fall back to
// the non-subscribed table if there is none.
if (subscribed) {
String u = (String)subACIP2unicode.get(acip);
if (null != u) return u;
}
return (String)superACIP2unicode.get(acip);
}
/** Lazily built map from ACIP punctuation, digits and the like to
 *  EWTS.  Null until getEwtsForOther is first called. */
private HashMap acipOther2wylie = null;
/** Returns the EWTS for the ACIP punctuation mark, digit, "m" or
 *  ":" given, building the lookup table on the first call.
 *  @param acip the ACIP string to look up
 *  @return the EWTS equivalent, or null if acip is not in the table */
public /* synchronized */ String getEwtsForOther(String acip) {
    if (null == acipOther2wylie) {
        acipOther2wylie = new HashMap(20);

        // don't use putMapping for this.  We don't want TMW->ACIP
        // to produce "." for a U+0F0C because ACIP doesn't say
        // that "." means U+0F0C.  It just seems to in practice
        // for ACIP Release IV texts.
        acipOther2wylie.put(".", "*");

        // { ACIP, EWTS } pairs that are registered in both
        // directions, in this order.
        final String[][] twoWay = {
            { "m", "M" },
            { ":", "H" },
            { ",", "/" },
            { " ", " " },
            { ";", "|" },
            { "`", "!" },
            { "*", "@#" },
            // There is no glyph in TMW with the EWTS @##, so ACIP
            // "#" is deliberately absent from this table.
            { "%", "~X" },
            { "o", "X" },
            { "&", "&" },
            { "^", "\\u0F38" },
            { "0", "0" },
            { "1", "1" },
            { "2", "2" },
            { "3", "3" },
            { "4", "4" },
            { "5", "5" },
            { "6", "6" },
            { "7", "7" },
            { "8", "8" },
            { "9", "9" }
        };
        for (int k = 0; k < twoWay.length; k++) {
            putMapping(acipOther2wylie, twoWay[k][0], twoWay[k][1]);
        }
    }
    Object ewts = acipOther2wylie.get(acip);
    return (String)ewts;
}
/** Returns the ACIPTshegBarScanner instance. */
public TTshegBarScanner scanner() {
    final TTshegBarScanner acipScanner = ACIPTshegBarScanner.instance();
    return acipScanner;
}
/** Records ACIP->EWTS in toWylie and the reverse, EWTS->ACIP, in
 *  {@link #wylieToACIP}.  The reverse map is created, and seeded
 *  with a handful of one-directional entries, the first time this
 *  is called.
 *  @param toWylie the forward map to update; must not be null
 *  @param ACIP the ACIP transliteration
 *  @param EWTS the corresponding EWTS transliteration */
private /* synchronized */ void putMapping(HashMap toWylie, String ACIP, String EWTS) {
    toWylie.put(ACIP, EWTS);

    HashMap reverse = wylieToACIP;
    if (reverse == null) {
        reverse = new HashMap(75);
        wylieToACIP = reverse;

        // We don't want to put "/" in toWylie:
        reverse.put("(", "/");
        reverse.put(")", "/");

        reverse.put("?", "\\");
        reverse.put("_", " "); // oddball.
        reverse.put("o'i", "O'I"); // oddball for TMW9.61.
    }
    reverse.put(EWTS, ACIP);
}
/** A map from EWTS to ACIP. Note that the EWTS "w" maps to both
"V" and "W" in reality but this map will only give one or the
other. */
private HashMap wylieToACIP = null;
/** Returns the ACIP transliteration corresponding to the THDL
    Extended Wylie atom EWTS, or null if EWTS is not recognized.
    An atom that is not in the reverse map as a whole is split at
    '-' and '+' and translated piece by piece.
    @param EWTS the EWTS atom; must not be null
    @return the ACIP, or null if some piece is unknown */
public String getACIPForEWTS(String EWTS) {
    // Each of these calls lazily fills wylieToACIP.
    getEwtsForConsonant(null);
    getEwtsForOther(null);
    getEwtsForWowel(null);

    final String whole = (String)wylieToACIP.get(EWTS);

    // We want W+NA, not V+NA; we want WA, not VA.
    final boolean leadingW = EWTS.startsWith("w");

    if (null != whole) {
        return leadingW ? "W" + whole.substring(1) : whole;
    }

    StringBuffer acip = new StringBuffer(EWTS.length());
    StringTokenizer pieces = new StringTokenizer(EWTS, "-+", true);
    while (pieces.hasMoreTokens()) {
        final String piece = pieces.nextToken();
        final String translated;
        if (piece.equals("-") || piece.equals("+")) {
            translated = piece;
        } else if (!"w".equals(piece)) {
            translated = (String)wylieToACIP.get(piece);
        } else if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
            // There are only two stacks in TMW that have
            // U+0FBA: R+Wa and w+Wa.  TMW->ACIP fails for
            // these unless we handle it here.  (FIXME:
            // add an automated test for this).
            translated = "W";
        } else {
            translated = "V";
        }
        if (null == translated) {
            return null;
        }
        acip.append(translated);
    }
    if (leadingW) {
        acip.setCharAt(0, 'W');
    }
    return acip.toString();
}
/** Lazily built map from ACIP consonant to EWTS.  Null until
 *  getEwtsForConsonant is first called. */
private HashMap acipConsonant2wylie = null;
/** Returns the EWTS for the ACIP consonant acip, or null if acip is
 *  not a known consonant.  Returns "W" for ACIP "W", "r" for ACIP
 *  "R", and "y" for ACIP "Y", even though sometimes the EWTS for
 *  those is "w", "R", or "Y".  Handle that in the caller.
 *  @param acip the ACIP consonant to look up
 *  @return the EWTS equivalent, or null */
public /* synchronized */ String getEwtsForConsonant(String acip) {
    if (null == acipConsonant2wylie) {
        acipConsonant2wylie = new HashMap(37);

        // { ACIP, EWTS } pairs, registered in this order.
        final String[][] consonants = {
            // oddball:
            { "V", "w" },
            // more oddballs:
            { "DH", "d+h" },
            { "BH", "b+h" },
            { "dH", "D+h" },
            { "DZH", "dz+h" }, // longest, MAX_CONSONANT_LENGTH characters
            { "Ksh", "k+Sh" }, // longest, MAX_CONSONANT_LENGTH characters
            { "GH", "g+h" },

            { "K", "k" },
            { "KH", "kh" },
            { "G", "g" },
            { "NG", "ng" },
            { "C", "c" },
            { "CH", "ch" },
            { "J", "j" },
            { "NY", "ny" },
            { "T", "t" },
            { "TH", "th" },
            { "D", "d" },
            { "N", "n" },
            { "P", "p" },
            { "PH", "ph" },
            { "B", "b" },
            { "M", "m" },
            { "TZ", "ts" },
            { "TS", "tsh" },
            { "DZ", "dz" },
            /* NOTE WELL: sometimes "w", sometimes "W".
               Handle this in the caller.

               Reasoning for "W" instead of "w": r-w and
               r+w are both known hash keys.  We sort 'em
               out this way.  (They are the only things
               like this according to bug report #800166.) */
            { "W", "W" },
            { "ZH", "zh" },
            { "Z", "z" },
            { "'", "'" },
            { "Y", "y" },
            { "R", "r" },
            { "L", "l" },
            { "SH", "sh" },
            { "S", "s" },
            { "H", "h" },
            { "A", "a" },
            { "t", "T" },
            { "th", "Th" },
            { "d", "D" },
            { "n", "N" },
            { "sh", "Sh" }
        };
        for (int k = 0; k < consonants.length; k++) {
            putMapping(acipConsonant2wylie, consonants[k][0], consonants[k][1]);
        }
    }
    Object ewts = acipConsonant2wylie.get(acip);
    return (String)ewts;
}
/** Lazily built map from ACIP wowel to EWTS.  Null until
 *  getEwtsForWowel is first called. */
private HashMap acipWowel2wylie = null;
/** Returns the EWTS for the ACIP wowel acip, or null if acip is not
 *  a known wowel.  The table is generated on the first call from
 *  baseVowels: every base vowel, with and without a leading
 *  apostrophe, combined with each of the endings "", "m", ":" and
 *  "m:".
 *  @param acip the ACIP wowel to look up
 *  @return the EWTS equivalent, or null */
public /* synchronized */ String getEwtsForWowel(String acip) {
    if (null == acipWowel2wylie) {
        acipWowel2wylie = new HashMap(baseVowels.length * 4);

        // Parallel arrays: ACIP ending and the EWTS ending it becomes.
        final String[] acipEndings = { "", "m", ":", "m:" };
        final String[] ewtsEndings = { "", "M", "H", "MH" };

        for (int v = 0; v < baseVowels.length; v++) {
            final String acipVowel = baseVowels[v][0];
            final String ewtsPlain = baseVowels[v][1];
            final String ewtsWithApostrophe = baseVowels[v][2];
            for (int e = 0; e < acipEndings.length; e++) {
                putMapping(acipWowel2wylie,
                           acipVowel + acipEndings[e],
                           ewtsPlain + ewtsEndings[e]);
                putMapping(acipWowel2wylie,
                           "'" + acipVowel + acipEndings[e],
                           ewtsWithApostrophe + ewtsEndings[e]);
            }
        }
        // {Pm} is treated just like {PAm}; {P:} is treated just
        // like {PA:}; {Pm:} is treated just like {PAm:}.  But
        // that happens thanks to
        // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).

        // Keep this code in sync with getUnicodeFor.
    }
    Object ewts = acipWowel2wylie.get(acip);
    return (String)ewts;
}
/** {Ksh}, the longest consonant, has 3 characters, so this is
 * three.  Declared final: it is a constant and must never be
 * reassigned. */
private static final int MAX_CONSONANT_LENGTH = 3;
/** {'EEm:}, the longest wowel, has 5 characters, so this is
 * five.  Declared final: it is a constant and must never be
 * reassigned. */
private static final int MAX_WOWEL_LENGTH = 5;
private static String[][] baseVowels = new String[][] {
// Each row has exactly three entries:
// { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]} }
// i.e. the ACIP base vowel, its EWTS, and the EWTS for the same
// ACIP vowel preceded by an apostrophe.
{ "A", "a", "A" },
{ "I", "i", "I" },
{ "U", "u", "U" },
{ "E", "e", "Ae" },
{ "O", "o", "Ao" },
{ "EE", "ai", "Aai" },
{ "OO", "au", "Aau" },
{ "i", "-i", "A-i" }
};
/** Tells whether s is an ACIP wowel, i.e. whether getEwtsForWowel
 *  knows it.  You can't just call this any time -- A is both a
 *  consonant and a vowel in ACIP, so you have to call this in the
 *  right context.
 *  @param s the candidate wowel
 *  @return true if and only if s is an ACIP wowel */
public boolean isWowel(String s) {
    // I'm on my own with 'O and 'E and 'OO and 'EE, but GANG'O
    // appears and I wonder... so here they are.  It's consistent
    // with 'I and 'A and 'U, at least: all the vowels may appear
    // as K'vowel.  DLC FIXME: ask.
    final String ewts = getEwtsForWowel(s);
    return ewts != null;
}
/** Tells whether s is an ACIP consonant, i.e. whether
 *  getEwtsForConsonant knows it.
 *  @param s the candidate consonant
 *  @return true if and only if s is an ACIP consonant */
public boolean isConsonant(String s) {
    final String ewts = getEwtsForConsonant(s);
    return ewts != null;
}
/** Gets the duffcodes for wowel, such that they look good with
 * the preceding glyph, and appends them to duff.  Does nothing if
 * wowel is null.
 * @param duff list that receives the wowel glyphs; when wowel
 * contains 'm', its last element is removed and handed to
 * TibTextUtils.getBindu
 * @param preceding passed to TibTextUtils.getVowel together with
 * duff
 * @param wowel an ACIP wowel, or null
 * @throws IllegalArgumentException if getEwtsForWowel returns null
 * for wowel */
public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
if (null == wowel) return;
if (null == getEwtsForWowel(wowel)) // FIXME: expensive assertion! Use assert.
throw new IllegalArgumentException("Wowel " + wowel + " isn't in the small set of wowels we handle correctly.");
// Order matters here.
// context_added is a one-element array shared by every getVowel
// call made below for this wowel.
boolean context_added[] = new boolean[] { false };
// A wowel starting with "A", or containing "'U" or "'I", results
// in exactly one getVowel call; anything else falls through to the
// combining branch.
if (wowel.startsWith("A")) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
} else if (wowel.indexOf("'U") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
} else if (wowel.indexOf("'I") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
} else {
// Several marks may be emitted here, always in this sequence:
// apostrophe (A_VOWEL), then EE or E, then OO or O, then I, U,
// and finally lowercase i.
if (wowel.indexOf('\'') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
}
// "EE" must be tested before 'E' because "EE" also contains 'E'.
if (wowel.indexOf("EE") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
} else if (wowel.indexOf('E') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
}
// Likewise "OO" before 'O'.
if (wowel.indexOf("OO") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
} else if (wowel.indexOf('O') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
}
if (wowel.indexOf('I') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
}
if (wowel.indexOf('U') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
}
if (wowel.indexOf('i') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
}
}
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
if (wowel.indexOf('m') >= 0) {
// NOTE(review): this assumes duff is non-empty at this point;
// with an empty list duff.get(-1) would throw.  Confirm that
// callers always pass a list that already holds a glyph.
DuffCode last = (DuffCode)duff.get(duff.size() - 1);
duff.remove(duff.size() - 1); // getBindu will add it back...
TibTextUtils.getBindu(duff, last);
}
// A ':' in the wowel appends the glyph for whatever EWTS
// getEwtsForOther gives for ":".
if (wowel.indexOf(':') >= 0)
duff.add(TibetanMachineWeb.getGlyph(getEwtsForOther(":")));
}
}

View file

@ -18,11 +18,10 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
import java.io.*;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Stack;
import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;
/**
@ -36,8 +35,10 @@ import org.thdl.util.ThdlOptions;
* the parser, not here in the lexical analyzer. That'd be cleaner,
* and more like how you'd do things if you used lex and yacc.
*
* This is not public because you should use {@link ACIPTraits#scanner()}.
*
* @author David Chandler */
public class ACIPTshegBarScanner extends TTshegBarScanner {
class ACIPTshegBarScanner extends TTshegBarScanner {
/** True if those ACIP snippets inside square brackets (e.g.,
"[THIS]") are to be passed through into the output unmodified
while retaining the brackets and if those ACIP snippets inside

View file

@ -18,11 +18,14 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import org.thdl.tib.text.DuffCode;
/** A singleton class that should contain (but due to laziness and
* ignorance probably does not contain) all the traits that make EWTS
* transliteration different from other (say, ACIP) transliteration
* schemes. */
final class EWTSTraits implements TTraits {
public final class EWTSTraits implements TTraits {
/** sole instance of this class */
private static EWTSTraits singleton = null;
@ -30,7 +33,7 @@ final class EWTSTraits implements TTraits {
private EWTSTraits() { }
/** */
public static EWTSTraits instance() {
public static synchronized EWTSTraits instance() {
if (null == singleton) {
singleton = new EWTSTraits();
}
@ -79,4 +82,48 @@ final class EWTSTraits implements TTraits {
|| "H".equals(s)
|| "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:???
}
/** Returns "a". */
public String aVowel() {
    final String a = "a";
    return a;
}
/** Tells whether s is one of the two EWTS postsuffix letters.
 *  @param s the EWTS letter to test; null yields false
 *  @return true if and only if s is "s" or "d" */
public boolean isPostsuffix(String s) {
    if ("s".equals(s)) {
        return true;
    }
    if ("d".equals(s)) {
        return true;
    }
    return false;
}
/** Tells whether l is one of the five EWTS prefix letters.
 *  @param l the EWTS letter to test; null yields false
 *  @return true if and only if l is ', m, b, d or g */
public boolean isPrefix(String l) {
    final String[] prefixes = { "'", "m", "b", "d", "g" };
    for (int i = 0; i < prefixes.length; i++) {
        if (prefixes[i].equals(l)) {
            return true;
        }
    }
    return false;
}
/** Tells whether l is one of the ten EWTS suffix letters.
 *  @param l the EWTS letter to test; null yields false
 *  @return true if and only if l is s, g, d, m, ', b, ng, n, l or r */
public boolean isSuffix(String l) {
    final String[] suffixes = {
        "s", "g", "d", "m", "'", "b", "ng", "n", "l", "r"
    };
    for (int i = 0; i < suffixes.length; i++) {
        if (suffixes[i].equals(l)) {
            return true;
        }
    }
    return false;
}
/** Returns l unchanged, since this is EWTS's traits class.
 *  @param l an EWTS consonant
 *  @return l itself */
public String getEwtsForConsonant(String l) {
    final String sameAsInput = l;
    return sameAsInput;
}
/** Returns l unchanged, since this is EWTS's traits class.
 *  @param l an EWTS string
 *  @return l itself */
public String getEwtsForOther(String l) {
    final String sameAsInput = l;
    return sameAsInput;
}
/** Returns l unchanged, since this is EWTS's traits class.
 *  @param l an EWTS wowel
 *  @return l itself */
public String getEwtsForWowel(String l) {
    final String sameAsInput = l;
    return sameAsInput;
}
/** Returns the EWTSTshegBarScanner instance. */
public TTshegBarScanner scanner() {
    final TTshegBarScanner ewtsScanner = EWTSTshegBarScanner.instance();
    return ewtsScanner;
}
/** Not implemented yet for EWTS: always throws.
 *  @throws Error unconditionally */
public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
    final String todo = "TODO(DLC)[EWTS->Tibetan]";
    throw new Error(todo);
}
/** Not implemented yet for EWTS: always throws.
 *  @throws Error unconditionally */
public String getUnicodeFor(String l, boolean subscribed) {
    final String todo = "TODO(DLC)[EWTS->Tibetan]";
    throw new Error(todo);
}
}

View file

@ -0,0 +1,56 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import java.util.ArrayList;
/**
 * A singleton scanner whose job is to split Strings of EWTS text
 * (an entire sutra file, for example) into tsheg bars, comments,
 * and so on.  Non-Tibetan parts are kept apart (so that consumers
 * can ensure that they remain non-Tibetan), and Tibetan passages
 * are broken up into tsheg bars.  Scanning is not implemented yet.
 *
 * This is not public because you should use {@link EWTSTraits#scanner()}.
 *
 * @author David Chandler */
class EWTSTshegBarScanner extends TTshegBarScanner {
    /** the sole instance, created on the first call to instance() */
    private static EWTSTshegBarScanner singleton = null;

    /** non-public because this is a singleton */
    protected EWTSTshegBarScanner() { }

    /** Returns the sole instance of this class. */
    public static synchronized EWTSTshegBarScanner instance() {
        if (singleton == null)
            singleton = new EWTSTshegBarScanner();
        return singleton;
    }

    /** See the comment in TTshegBarScanner.  This does not find
        errors and warnings that you'd think of a parser finding (DLC
        DOES IT?).  At present this always throws an Error. */
    public ArrayList scan(String s, StringBuffer errors, int maxErrors,
                          boolean shortMessages, String warningLevel) {
        // the size depends on whether it's mostly Tibetan or mostly
        // Latin and a number of other factors.  This is meant to be
        // an underestimate, but not too much of an underestimate.
        final int sizeGuess = s.length() / 10;
        ArrayList tokens = new ArrayList(sizeGuess);
        throw new Error("DLC unimplemented");
    }
}

View file

@ -202,15 +202,16 @@ public class PackageTest extends TestCase {
message. */
static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1,
false, "None");
ArrayList al = ACIPTraits.instance().scanner().scan(ACIP, errors, -1,
false, "None");
if (null == al || errors.length() > 0)
return null;
org.thdl.tib.text.TibetanDocument tdoc
= new org.thdl.tib.text.TibetanDocument();
int loc[] = new int[] { 0 };
try {
if (!TConverter.convertToTMW(al,
if (!TConverter.convertToTMW(ACIPTraits.instance(),
al,
tdoc,
null,
null,
@ -7358,8 +7359,8 @@ tstHelper("ZUR");
private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) {
StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false,
warningLevel);
ArrayList al = ACIPTraits.instance().scanner().scan(s, errors, -1, false,
warningLevel);
if (null != expectedScan) {
if (!al.toString().equals(expectedScan)) {
System.out.println("Scanning " + s + " into tsheg bars was expected to cause the following scan:");
@ -7392,7 +7393,7 @@ tstHelper("ZUR");
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer,
int, boolean)}. */
public void testScanner() {
public void testAcipScanner() {
shelp("Pm KA", "", "[TIBETAN_NON_PUNCTUATION:{Pm}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{KA}]");
shelp("KA (KHA\nGA)", "", "[TIBETAN_NON_PUNCTUATION:{KA}, TIBETAN_PUNCTUATION:{ }, START_PAREN:{(}, TIBETAN_NON_PUNCTUATION:{KHA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GA}, END_PAREN:{)}]");
@ -7682,7 +7683,8 @@ tstHelper("ZUR");
private static void uhelp(String acip, String expectedUnicode,
String warningLevel, boolean shortMessages) {
StringBuffer errors = new StringBuffer();
String unicode = TConverter.convertToUnicodeText("ACIP", acip, errors,
String unicode = TConverter.convertToUnicodeText(ACIPTraits.instance(),
acip, errors,
null, true,
warningLevel,
shortMessages);

View file

@ -69,10 +69,10 @@ public class TConverter {
boolean shortMessages = false;
String warningLevel = "Most";
ArrayList al
= ACIPTshegBarScanner.instance().scanFile(args[0], errors,
maxErrors - 1,
shortMessages,
warningLevel);
= ACIPTraits.instance().scanner().scanFile(args[0], errors,
maxErrors - 1,
shortMessages,
warningLevel);
if (null == al) {
System.err.println(maxErrors + " or more lexical errors occurred while scanning ACIP input file; is this");
@ -103,8 +103,9 @@ public class TConverter {
warnings = new StringBuffer();
putWarningsInOutput = true;
}
convertToTMW(al, System.out, errors, warnings, null,
putWarningsInOutput, warningLevel, shortMessages, colors);
convertToTMW(ACIPTraits.instance(), al, System.out, errors, warnings,
null, putWarningsInOutput, warningLevel, shortMessages,
colors);
int retCode = 0;
if (errors.length() > 0) {
System.err.println("Errors converting ACIP input file: ");
@ -139,7 +140,8 @@ public class TConverter {
* prefix rules in another
* @throws IOException if we cannot write to out
*/
public static boolean convertToTMW(ArrayList scan,
public static boolean convertToTMW(TTraits ttraits,
ArrayList scan,
OutputStream out,
StringBuffer errors,
StringBuffer warnings,
@ -152,7 +154,8 @@ public class TConverter {
{
TibetanDocument tdoc = new TibetanDocument();
boolean rv
= convertToTMW(scan, tdoc, errors, warnings, hasWarnings,
= convertToTMW(ttraits,
scan, tdoc, errors, warnings, hasWarnings,
writeWarningsToResult, warningLevel,
shortMessages, colors,
new int[] { tdoc.getLength() });
@ -169,7 +172,8 @@ public class TConverter {
offset from zero inside tdoc at which conversion results will
be placed. On output, loc[0] is one past the offset of the
last of the conversion results. */
public static boolean convertToTMW(ArrayList scan,
public static boolean convertToTMW(TTraits ttraits,
ArrayList scan,
TibetanDocument tdoc,
StringBuffer errors,
StringBuffer warnings,
@ -181,7 +185,8 @@ public class TConverter {
int[] loc)
throws IOException
{
return convertTo(false, true, scan, null, tdoc, errors, warnings,
return convertTo(false, true,
ttraits, scan, null, tdoc, errors, warnings,
hasWarnings, writeWarningsToResult, warningLevel,
shortMessages, colors, loc,
loc[0] == tdoc.getLength());
@ -189,33 +194,30 @@ public class TConverter {
/** Returns UTF-8 encoded Unicode. A bit indirect, so use this
* for testing only if performance is a concern. If errors occur
* in scanning the ACIP or in converting a tsheg bar, then they
* are appended to errors if errors is non-null, as well as
* written to the result. If warnings occur in scanning the ACIP
* or in converting a tsheg bar, then they are appended to
* warnings if warnings is non-null, and they are written to the
* result if writeWarningsToResult is true. Error and warning
* messages are long and self-contained unless shortMessages is
* true. Returns the conversion upon perfect success or if there
* were merely warnings, null if errors occurred. */
public static String convertToUnicodeText(String transliteration,
String acip,
* in scanning the transliteration or in converting a tsheg bar,
* then they are appended to errors if errors is non-null, as
* well as written to the result. If warnings occur in scanning
* the transliteration or in converting a tsheg bar, then they
* are appended to warnings if warnings is non-null, and they are
* written to the result if writeWarningsToResult is true. Error
* and warning messages are long and self-contained unless
* shortMessages is true. Returns the conversion upon perfect
* success or if there were merely warnings, null if errors
* occurred. */
public static String convertToUnicodeText(TTraits ttraits,
String translit,
StringBuffer errors,
StringBuffer warnings,
boolean writeWarningsToResult,
String warningLevel,
boolean shortMessages) {
if (transliteration != "ACIP") {
ThdlDebug.noteIffyCode();
throw new IllegalArgumentException("Unsupported transliteration");
}
ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al
= ACIPTshegBarScanner.instance().scan(acip, errors, -1,
shortMessages, warningLevel);
= ttraits.scanner().scan(translit, errors, -1, shortMessages,
warningLevel);
try {
if (null != al) {
convertToUnicodeText(al, sw, errors,
convertToUnicodeText(ttraits, al, sw, errors,
warnings, null, writeWarningsToResult,
warningLevel, shortMessages);
return sw.toString("UTF-8");
@ -236,7 +238,8 @@ public class TConverter {
* writeWarningsToOut is true, then warnings also will be written
* to out.
* @return true upon perfect success, false if errors occurred.
* @param scan result of ACIPTshegBarScanner.scan(..)
* @param scan result of using ttraits.scanner() to break up the
* original string of transliteration
* @param out stream to which to write converted text
* @param errors if non-null, all error messages are appended
* @param warnings if non-null, all warning messages appropriate
@ -246,9 +249,9 @@ public class TConverter {
* false otherwise
* @param writeWarningsToOut if true, then all warning messages
* are written to out in the appropriate places
* @throws IOException if we cannot write to out
*/
public static boolean convertToUnicodeText(ArrayList scan,
* @throws IOException if we cannot write to out */
public static boolean convertToUnicodeText(TTraits ttraits,
ArrayList scan,
OutputStream out,
StringBuffer errors,
StringBuffer warnings,
@ -258,7 +261,8 @@ public class TConverter {
boolean shortMessages)
throws IOException
{
return convertTo(true, false, scan, out, null, errors, warnings,
return convertTo(true, false,
ttraits, scan, out, null, errors, warnings,
hasWarnings, writeWarningsToOut, warningLevel,
shortMessages, false, new int[] { -1 } , true);
}
@ -283,6 +287,7 @@ public class TConverter {
private static boolean convertTo(boolean toUnicode, // else to TMW
boolean toRTF, // else to UTF-8-encoded text
TTraits ttraits,
ArrayList scan,
OutputStream out, // for (toUnicode && !toRTF) mode
TibetanDocument tdoc, // for !toUnicode mode or (toUnicode && toRTF) mode
@ -368,7 +373,7 @@ public class TConverter {
if (lastGuyWasNonPunct) {
String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText()) + "]";
if (null != writer) {
String uni = ACIPRules.getUnicodeFor(s.getText(), false);
String uni = ttraits.getUnicodeFor(s.getText(), false);
if (null == uni) {
hasErrors = true;
uni = err;
@ -377,7 +382,7 @@ public class TConverter {
}
if (null != tdoc) {
String wylie
= ACIPRules.getWylieForACIPOther(s.getText());
= ttraits.getEwtsForOther(s.getText());
if (null == wylie) {
hasErrors = true;
tdoc.appendRoman(tdocLocation[0], err, Color.RED);
@ -658,7 +663,7 @@ public class TConverter {
}
if (!done) {
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false);
if (null != writer) unicode = ttraits.getUnicodeFor(s.getText(), false);
if (null != tdoc) {
if (s.getText().equals("\r")
|| s.getText().equals("\t")
@ -675,7 +680,7 @@ public class TConverter {
TibetanMachineWeb.getGlyph("#")
}; // hard-coded EWTS values
} else {
String wy = ACIPRules.getWylieForACIPOther(s.getText());
String wy = ttraits.getEwtsForOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
}

View file

@ -26,22 +26,27 @@ import java.util.ArrayList;
/** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The
* left side is the consonant or empty; the right side is either the
* vowel or '+' (indicating stacking) or a disambiguator (i.e., '-'
* in ACIP or '.' in EWTS).
* vowel or '+' (indicating stacking in both ACIP and EWTS) or a
* disambiguator (e.g., '-' in ACIP or '.' in EWTS).
* @author David Chandler */
/* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */
class TPair {
/** The left side, or null if there is no left side. That is, the
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */
/** the part that knows ACIP from EWTS */
private TTraits traits;
/** Returns the part that knows ACIP from EWTS. */
public TTraits getTraits() { return traits; }
/** The left side, or null if there is no left side. I.e., the
* non-wowel, non-disambiguator, non-'+' guy. */
private String l;
String getLeft() {
ThdlDebug.verify(!"".equals(l));
return l;
}
/** The right side. That is, the vowel, with 'm' or ':' "vowel"
* after it if appropriate, or "-" (disambiguator), or "+"
* (stacking), or null otherwise. */
/** The right side. That is, the wowel or disambiguator or "+"
* (for stacking) or null otherwise. */
private String r;
String getRight() {
ThdlDebug.verify(!"".equals(r));
@ -50,13 +55,14 @@ class TPair {
/** Constructs a new TPair with left side l and right side r.
* Use null or the empty string to represent an absence. */
TPair(String l, String r) {
TPair(TTraits traits, String l, String r) {
// Normalize:
if (null != l && l.equals("")) l = null;
if (null != r && r.equals("")) r = null;
this.l = l;
this.r = r;
this.traits = traits;
}
/** Returns a nice String representation. Returns "(D . E)" for
@ -67,8 +73,8 @@ class TPair {
+ ((null == r) ? "" : r) + ")";
}
/** Returns the number of ACIP characters that make up this
* TPair. */
/** Returns the number of transliteration characters that make up
* this TPair. */
int size() {
return (((l == null) ? 0 : l.length())
+ ((r == null) ? 0 : r.length()));
@ -98,18 +104,18 @@ class TPair {
sz = l.length();
newL = l.substring(0, sz - N);
}
return new TPair(newL, newR);
return new TPair(traits, newL, newR);
}
/** Returns true if and only if this is nonempty and is l, if
* present, is a legal ACIP consonant, and is r, if present, is a
* legal ACIP vowel. */
/** Returns true if and only if this is nonempty and if l, if
* present, is a legal consonant, and if r, if present, is a
* legal wowel. */
boolean isLegal() {
if (size() < 1)
return false;
if (null != l && !ACIPRules.isConsonant(l))
if (null != l && !traits.isConsonant(l))
return false;
if (null != r && !ACIPRules.isWowel(r))
if (null != r && !traits.isWowel(r))
return false;
return true;
}
@ -119,9 +125,9 @@ class TPair {
boolean isPrefix() {
return (null != l
&& ((null == r || "".equals(r))
|| "-".equals(r) // TODO(DLC)[EWTS->Tibetan]
|| "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
&& ACIPRules.isACIPPrefix(l));
|| traits.disambiguator().equals(r)
|| traits.aVowel().equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
&& traits.isPrefix(l));
}
/** Returns true if and only if this pair could be a Tibetan
@ -129,25 +135,25 @@ class TPair {
boolean isPostSuffix() {
return (null != l
&& ((null == r || "".equals(r))
|| "-".equals(r)
|| "A".equals(r)) // FIXME: though warn about GAMASA vs. GAMS
&& ACIPRules.isACIPPostsuffix(l));
|| traits.disambiguator().equals(r)
|| traits.aVowel().equals(r)) // FIXME: though warn about GAMASA vs. GAMS
&& traits.isPostsuffix(l));
}
/** Returns true if and only if this pair could be a Tibetan
* suffix. FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */
* suffix. */
boolean isSuffix() {
return (null != l
&& ((null == r || "".equals(r))
|| "-".equals(r)
|| "A".equals(r))
&& ACIPRules.isACIPSuffix(l));
|| traits.disambiguator().equals(r)
|| traits.aVowel().equals(r))
&& traits.isSuffix(l));
}
/** Returns true if and only if this pair is merely a
* disambiguator. */
boolean isDisambiguator() {
return ("-".equals(r) && getLeft() == null);
return (traits.disambiguator().equals(r) && getLeft() == null);
}
/** Yep, this works for TPairs. */
@ -160,16 +166,16 @@ class TPair {
return false;
}
/** Returns a TPair that is like this pair except that it has
* a "+" on the right if this pair is empty on the right and is
* empty on the right if this pair has a disambiguator (i.e., a
* '-') on the right. May return itself (but never mutates this
/** Returns a TPair that is like this pair except that it has a
* "+" on the right if this pair is empty on the right and is
* empty on the right if this pair has a disambiguator on the
* right. May return itself (but never mutates this
* instance). */
TPair insideStack() {
if (null == getRight())
return new TPair(getLeft(), "+");
else if ("-".equals(getRight()))
return new TPair(getLeft(), null);
return new TPair(traits, getLeft(), "+");
else if (traits.disambiguator().equals(getRight()))
return new TPair(traits, getLeft(), null);
else
return this;
}
@ -194,7 +200,7 @@ class TPair {
String getWylie(boolean justLeft) {
String leftWylie = null;
if (getLeft() != null) {
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft());
leftWylie = traits.getEwtsForConsonant(getLeft());
if (leftWylie == null) {
if (isNumeric())
leftWylie = getLeft();
@ -208,7 +214,7 @@ class TPair {
else if ("+".equals(getRight()))
rightWylie = "+";
else if (getRight() != null)
rightWylie = ACIPRules.getWylieForACIPVowel(getRight());
rightWylie = traits.getEwtsForWowel(getRight());
if (null == rightWylie) rightWylie = "";
return leftWylie + rightWylie;
}
@ -227,18 +233,19 @@ class TPair {
void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
boolean subscribed) {
if (null != getLeft()) {
String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
String x = traits.getUnicodeFor(getLeft(), subscribed);
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
consonantSB.append(x);
}
if (null != getRight()
&& !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
String x = traits.getUnicodeFor(getRight(), subscribed);
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
vowelSB.append(x);
}
}
// TODO(DLC)[EWTS->Tibetan]
/** Returns true if this pair is surely the last pair in an ACIP
* stack. Stacking continues through (* . ) and (* . +), but
* stops anywhere else. */

View file

@ -33,6 +33,9 @@ import java.util.ArrayList;
*
* @author David Chandler */
class TPairList {
/** the part that knows ACIP from EWTS */
private TTraits traits;
/** FIXME: change me and see if performance improves. */
private static final int INITIAL_SIZE = 1;
@ -41,17 +44,20 @@ class TPairList {
/** Creates a new list containing just p. */
public TPairList(TPair p) {
this.traits = p.getTraits();
al = new ArrayList(1);
add(p);
}
/** Creates an empty list. */
public TPairList() {
public TPairList(TTraits traits) {
this.traits = traits;
al = new ArrayList(INITIAL_SIZE);
}
/** Creates an empty list with the capacity to hold N items. */
public TPairList(int N) {
public TPairList(TTraits traits, int N) {
this.traits = traits;
al = new ArrayList(N);
}
@ -181,7 +187,7 @@ class TPairList {
return ErrorsAndWarnings.getMessage(125, shortMessages, translit);
} else if ((null == p.getLeft() && !"-".equals(p.getRight()))
|| (null != p.getLeft()
&& !ACIPRules.isConsonant(p.getLeft())
&& !traits.isConsonant(p.getLeft())
&& !p.isNumeric())) {
// FIXME: stop handling this outside of ErrorsAndWarnings:
if (null == p.getLeft()) {
@ -406,12 +412,12 @@ class TPairList {
// and only if b1 is one, etc.
for (int counter = 0; counter < (1<<numBreaks); counter++) {
TStackList sl = new TStackList();
TPairList currentStack = new TPairList();
TPairList currentStack = new TPairList(traits);
for (int k = startLoc; k <= i; k++) {
if (!get(k).isDisambiguator()) {
if (get(k).isNumeric()
|| (get(k).getLeft() != null
&& ACIPRules.isConsonant(get(k).getLeft())))
&& traits.isConsonant(get(k).getLeft())))
currentStack.add(get(k).insideStack());
else
return null; // sA, for example, is illegal.
@ -419,7 +425,7 @@ class TPairList {
if (k == i || get(k).endsACIPStack()) {
if (!currentStack.isEmpty())
sl.add(currentStack.asStack());
currentStack = new TPairList();
currentStack = new TPairList(traits);
} else {
if (numBreaks > 0) {
for (int j = 0; breakStart+j < 3; j++) {
@ -427,7 +433,7 @@ class TPairList {
&& 1 == ((counter >> j) & 1)) {
if (!currentStack.isEmpty())
sl.add(currentStack.asStack());
currentStack = new TPairList();
currentStack = new TPairList(traits);
break; // shouldn't matter, but you never know
}
}
@ -460,9 +466,9 @@ class TPairList {
if (!isEmpty()) {
TPair lastPair = get(size() - 1);
if ("+".equals(lastPair.getRight()))
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
else if ("-".equals(lastPair.getRight()))
al.set(size() - 1, new TPair(lastPair.getLeft(), null));
al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
}
return this;
}
@ -506,10 +512,10 @@ class TPairList {
add_U0F7F = true;
StringBuffer rr = new StringBuffer(p.getRight());
rr.deleteCharAt(where);
p = new TPair(p.getLeft(), rr.toString());
p = new TPair(traits, p.getLeft(), rr.toString());
}
boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight());
String thislWylie = ACIPRules.getWylieForACIPConsonant(p.getLeft());
String thislWylie = traits.getEwtsForConsonant(p.getLeft());
if (thislWylie == null) {
char ch;
if (p.isNumeric()) {
@ -528,21 +534,21 @@ class TPairList {
boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString());
boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString());
if (ddebug && !isTibetan && !isSanskrit && !isNumeric) {
System.out.println("OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
System.out.println("OTHER for " + lWylie + " with vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight());
}
if (isTibetan && isSanskrit) {
// RVA, e.g. It must be Tibetan because RWA is what
// you'd use for RA over fixed-form WA.
isSanskrit = false;
}
if (ddebug && hasNonAVowel && ACIPRules.getWylieForACIPVowel(p.getRight()) == null) {
System.out.println("vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight());
if (ddebug && hasNonAVowel && traits.getEwtsForWowel(p.getRight()) == null) {
System.out.println("vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight());
}
TGCPair tp;
indexList.add(new Integer(index));
tp = new TGCPair(lWylie.toString(),
(hasNonAVowel
? ACIPRules.getWylieForACIPVowel(p.getRight())
? traits.getEwtsForWowel(p.getRight())
: ""),
(isNumeric
? TGCPair.TYPE_OTHER
@ -697,9 +703,9 @@ class TPairList {
if (lastPair.getRight() == null || lastPair.equals("-")) {
duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey));
} else {
ACIPRules.getDuffForACIPVowel(duffsAndErrors,
TibetanMachineWeb.getGlyph(hashKey),
lastPair.getRight());
traits.getDuffForWowel(duffsAndErrors,
TibetanMachineWeb.getGlyph(hashKey),
lastPair.getRight());
}
if (previousSize == duffsAndErrors.size())
throw new Error("TPairList with no duffs? " + toString()); // FIXME: change to assertion.

View file

@ -121,7 +121,7 @@ class TPairListFactory {
// base case for our recursion:
if ("".equals(acip))
return new TPairList();
return new TPairList(ttraits);
StringBuffer acipBuf = new StringBuffer(acip);
int howMuchBuf[] = new int[1];
@ -131,9 +131,9 @@ class TPairListFactory {
&& null != head.getLeft()
&& null != head.getRight()
&& weHaveSeenVowelAlready
&& ACIPRules.isACIPSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
&& ttraits.isSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
&& head.getRight().startsWith("'")) {
head = new TPair(head.getLeft(),
head = new TPair(ttraits, head.getLeft(),
// Without this disambiguator, we are
// less efficient (8 parses, not 4) and
// we can't handle PA'AM'ANG etc.
@ -177,11 +177,11 @@ class TPairListFactory {
}
// TODO(DLC)[EWTS->Tibetan]: doc
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits /* TODO(DLC)[EWTS->Tibetan]: use */) {
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) {
// base case for our recursion:
if ("".equals(ewts))
return new TPairList();
return new TPairList(ttraits);
StringBuffer ewtsBuf = new StringBuffer(ewts);
int howMuchBuf[] = new int[1];
@ -238,11 +238,11 @@ class TPairListFactory {
int i, xl = acip.length();
if (0 == xl) {
howMuch[0] = 0;
return new TPair(null, null);
return new TPair(ttraits, null, null);
}
if (acip.charAt(0) == ttraits.disambiguatorChar()) {
howMuch[0] = 1;
return new TPair(null, ttraits.disambiguator());
return new TPair(ttraits, null, ttraits.disambiguator());
}
char ch = acip.charAt(0);
@ -250,7 +250,7 @@ class TPairListFactory {
// like seeing 1-2-3-4.
if (ch >= '0' && ch <= '9') {
howMuch[0] = 1; // not 2...
return new TPair(acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
return new TPair(ttraits, acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
}
String l = null, r = null;
@ -264,11 +264,11 @@ class TPairListFactory {
int ll = (null == l) ? 0 : l.length();
if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) {
howMuch[0] = l.length() + 1;
return new TPair(l, ttraits.disambiguator());
return new TPair(ttraits, l, ttraits.disambiguator());
}
if (null != l && xl > ll && acip.charAt(ll) == '+') {
howMuch[0] = l.length() + 1;
return new TPair(l, "+");
return new TPair(ttraits, l, "+");
}
for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) {
String t = null;
@ -289,7 +289,7 @@ class TPairListFactory {
&& acip.charAt(z) == '+') {
acip.deleteCharAt(z-1);
howMuch[0] = l.length() + 1;
return new TPair(l, "+");
return new TPair(ttraits, l, "+");
}
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */
@ -305,14 +305,14 @@ class TPairListFactory {
if (null == l && null == r) {
howMuch[0] = 1; // not 2...
// add a disambiguator to avoid exponential running time:
return new TPair(acip.substring(0, 1),
return new TPair(ttraits, acip.substring(0, 1),
(xl == 1) ? null : ttraits.disambiguator());
}
howMuch[0] = (((l == null) ? 0 : l.length())
+ ((r == null) ? 0 : r.length())
+ mod);
return new TPair(l, r);
return new TPair(ttraits, l, r);
} // TODO(DLC)[EWTS->Tibetan]:
}

View file

@ -18,12 +18,18 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import org.thdl.tib.text.DuffCode;
/** A TTraits object encapsulates all the things that make a
* particular Roman transliteration scheme unique. If both EWTS and
* ACIP transliterations have a property in common, then it's likely
* encoded in a manner that's hard to modify. But if they differ in
* some respect, then that difference should be encoded in a TTraits
* object.
* particular Roman transliteration scheme unique. For the most
* part, this difference is expressed at the finest granularity
* possible -- often single characters of Roman transliteration.
*
* <p>If both EWTS and ACIP transliterations have a property in
* common, then it's likely encoded in a manner that's hard to
* modify. But if they differ in some respect, then that difference
* should be encoded in a TTraits object.
*
* <p>It is very likely that classes that implement this interface
* will choose to use the design pattern 'singleton'. */
@ -62,9 +68,63 @@ interface TTraits {
/** Returns true if and only if <em>s</em> is a stretch of
* transliteration corresponding to a Tibetan wowel (without any
* [achen or other] consonant) */
boolean isWowel(String s);
boolean isWowel(String s); // TODO(DLC)[EWTS->Tibetan]: what about "m:" as opposed to "m" or ":"
/** Returns true if and only if the pair given has a simple error
* other than being a mere disambiguator. */
boolean hasSimpleError(TPair p);
/** The implicit 'ahhh' vowel, the one you see when you write the
human-friendly transliteration for "\u0f40\u0f0b". */
String aVowel();
/** Returns true if s is a valid postsuffix. s must not have a
wowel on it. */
boolean isPostsuffix(String s);
/** Returns true if and only if l is the representation of a
letter that can be a suffix. Note that all postsuffixes are
also suffixes. l should not have a wowel. */
boolean isSuffix(String l);
/** Returns true if and only if l is the representation of a
letter that can be a prefix. l should not have a wowel. */
boolean isPrefix(String l);
/** Returns the EWTS transliteration corresponding to the
* consonant l, which should not have a vowel. Returns null if
* there is no such EWTS.
*
* <p>May return "W" instead of "w", "r" instead of "R", and "y"
* instead of "Y" because we sometimes don't have enough context
* to decide.
*
* <p>The reasoning for "W" instead of "w" is that r-w and r+w
* are both known hash keys (as {@link
* org.thdl.tib.text.TibetanMachineWeb} would call them). We
* sort 'em out this way. (They are the only things like this
* according to bug report #800166.) */
String getEwtsForConsonant(String l);
/** Returns the EWTS corresponding to the given punctuation or
* mark. Returns null if there is no such EWTS. */
String getEwtsForOther(String l);
/** Returns the EWTS corresponding to the given "wowel". Returns
* null if there is no such EWTS. */
String getEwtsForWowel(String l);
/** If l is a consonant or vowel or punctuation mark, then this
* returns the Unicode for it. The Unicode for the subscribed
* form of the glyph is returned if subscribed is true. Returns
* null if l is unknown. */
String getUnicodeFor(String l, boolean subscribed);
/** Returns a scanner that can break up a string of
transliteration. */
TTshegBarScanner scanner();
/** Gets the duffcodes for wowel, such that they look good with
* the preceding glyph, and appends them to duff. */
void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel);
}

View file

@ -18,7 +18,11 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt;
import java.io.*;
import java.io.IOException;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.InputStream;
import java.io.BufferedReader;
import java.util.ArrayList;
import java.util.Stack;
@ -40,7 +44,7 @@ public abstract class TTshegBarScanner {
* If errors is non-null, error messages will be appended to it.
* Returns a list of TStrings that is the scan. Warning and
* error messages in the result will be long and self-contained
* unless shortMessagse is true.
* unless shortMessages is true.
*
* <p>This is not so efficient; copies the whole file into memory
* first.