Jskad/source/org/thdl/tib/text/TibetanMachineWeb.java

2450 lines
96 KiB
Java

/*
// give B+DE to be very friendly to machines.The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
import java.awt.Font;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import javax.swing.text.SimpleAttributeSet;
import javax.swing.text.StyleConstants;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;
import org.thdl.util.Trie;
import org.thdl.tib.scanner.BasicTibetanTranscriptionConverter;
/**
* Interfaces between Extended Wylie and the TibetanMachineWeb fonts.
* To do this this must first read the code table, which lives in
* "tibwn.ini", and which must be found in the same directory as this
* class. Note that WylieWord has its own copy of this file, so edit
* both or neither.
*
* <p>In addition, this class optionally loads the TibetanMachineWeb
* fonts manually via {@link #readInTMWFontFiles()}. When we do that,
* it means that users don't have to install the fonts on their
* systems, so installation of Jskad becomes easier.
* @author Edward Garrett, Tibetan and Himalayan Digital Library
* @author David Chandler
*/
public class TibetanMachineWeb implements THDLWylieConstants {
/** This addresses bug 624133, "Input freezes after impossible
* character". The input sequences that are valid in Extended
* Wylie. For example, "Sh" will be in this container, but "S"
* will not be. */
private static Trie validInputSequences = new Trie();
/** needed because a Trie cannot have a null value associated with
* a key */
private final static String anyOldObjectWillDo
= "this placeholder is useful for debugging; we need a nonnull Object anyway";
private static TibetanKeyboard keyboard = null;
private static Set charSet = null;
private static Set tibSet = null;
private static Set sanskritStackSet = null;
private static Set numberSet = null;
private static Set vowelSet = null;
private static int maxEwtsVowelLength = -1;
private static Set puncSet = null;
private static Set topSet = null;
private static Set leftSet = null;
private static Set rightSet = null;
private static Set farRightSet = null;
private static Map tibHash = new HashMap();
private static Map binduMap = new HashMap();
private static String[][] toHashKey = new String[11][95]; //note: toHashKey[0][..] is not used
private static DuffCode[][] TMtoTMW = new DuffCode[5][255-32]; // ordinal 255 doesn't occur in TM
private static DuffCode[][] TMWtoTM = new DuffCode[10][127-32]; // ordinal 127 doesn't occur in TMW
private static String[][] TMWtoUnicode = new String[10][127-32]; // ordinal 127 doesn't occur in TMW
/** For mapping single codepoints U+0F00..U+0FFF to TMW. This
won't handle 0F00, 0F02, 0F03, or 0F0E, which are made by
using multiple glyphs from TMW, but it handles all the rest.
It handles U+0F90-U+0FBC rather poorly, in that you have to
use special formatting to get those right (FIXME: warn
whenever they're used). */
private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
/** For mapping codepoints U+F021..U+0FFF to TMW. */
private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1];
private static String fileName = "tibwn.ini";
private static final String DELIMITER = "~";
/** vowels that appear over the glyph: */
private static Set top_vowels;
/** the font we use when we convert TMW->Unicode: */
private static SimpleAttributeSet defaultUnicodeFontAttributeSet = null;
/** a way of encoding the choice of TibetanMachineWeb font from
that family of 10 fonts: */
private static SimpleAttributeSet[] webFontAttributeSet = new SimpleAttributeSet[11];
/** a way of encoding the choice of TibetanMachine font from
that family of 5 fonts: */
private static SimpleAttributeSet[] normFontAttributeSet = new SimpleAttributeSet[6];
private static boolean hasDisambiguatingKey; //to disambiguate gy and g.y=
private static char disambiguating_key;
private static boolean hasSanskritStackingKey; //for stacking Sanskrit
private static boolean hasTibetanStackingKey; //for stacking Tibetan
private static boolean isStackingMedial; //ie g+y, not +gy
private static char stacking_key;
private static boolean isAChenRequiredBeforeVowel;
private static boolean isAChungConsonant;
private static boolean hasAVowel;
private static String aVowel;
// We use .intern() explicitly here so the code is easier to
// understand, but all string literals are interned.
public static final String[] tmFontNames = {
null,
"TibetanMachine".intern(),
"TibetanMachineSkt1".intern(),
"TibetanMachineSkt2".intern(),
"TibetanMachineSkt3".intern(),
"TibetanMachineSkt4".intern()
};
public static final String[] tmwFontNames = {
null,
"TibetanMachineWeb".intern(),
"TibetanMachineWeb1".intern(),
"TibetanMachineWeb2".intern(),
"TibetanMachineWeb3".intern(),
"TibetanMachineWeb4".intern(),
"TibetanMachineWeb5".intern(),
"TibetanMachineWeb6".intern(),
"TibetanMachineWeb7".intern(),
"TibetanMachineWeb8".intern(),
"TibetanMachineWeb9".intern()
};
/**
* represents where in an array of DuffCodes you
* find the TibetanMachine equivalence of a glyph
*/
public static final int TM = 0;
/**
* represents where in an array of DuffCodes you
* find the reduced character equivalent of a TMW glyph
*/
public static final int REDUCED_C = 1;
/**
* represents where in an array of DuffCodes you
* find the TibetanMachineWeb glyph
*/
public static final int TMW = 2;
/**
* represents where in an array of DuffCodes you
* find the gigu value for a given glyph
*/
public static final int VOWEL_i = 3;
/**
* represents where in an array of DuffCodes you
* find the zhebju value for a given glyph
*/
public static final int VOWEL_u = 4;
/**
* represents where in an array of DuffCodes you
* find the drengbu value for a given glyph
*/
public static final int VOWEL_e = 5;
/**
* represents where in an array of DuffCodes you
* find the naro value for a given glyph
*/
public static final int VOWEL_o = 6;
/**
* represents where in an array of DuffCodes you
* find the achung value for a given glyph
*/
public static final int VOWEL_A = 7;
/**
* represents where in an array of DuffCodes you
* find the achung + zhebju value for a given glyph
*/
public static final int VOWEL_U = 8;
/**
* represents where in an array of DuffCodes you
* find the Unicode equivalence of a given glyph
*/
public static final int UNICODE = 9;
/**
* represents where in an array of DuffCodes you
* find the half height equivalence of a given glyph
*/
public static final int HALF_C = 10;
// NOTE WELL: if you delete from tibetanConsonants,
// otherConsonants, numbers, vowels, or others, you'll change the
// way Jskad's Extended Wylie keyboard works, yes, but you'll also
// change TMW->Wylie.
/** comma-delimited list of supported Tibetan consonants: */
private static final String tibetanConsonants
= "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a";
/** comma-delimited list of supported non-Tibetan consonants, such
* as Sanskrit consonants: */
private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit. // TODO(DLC)[EWTS->Tibetan]: now are v and f in EWTS?
= "T,Th,D,N,Sh,v,f";
/** comma-delimited list of supported numbers (superscribed,
subscribed, normal, half-numerals): */
private static final String numbers
= "0,1,2,3,4,5,6,7,8,9";
/** comma-delimited list of supported punctuation and
miscellaneous characters: */
private static final String others
= "_, ,/,|,!,:,;,@,#,$,%,(,),H,M,&,@#,?,=,{,},*,~X,X"; // FIXME: not yet supporting all these...
/** comma-delimited list of supported vowels: */
private static final String vowels
= "a,i,u,e,o,I,U,ai,au,A,-i,-I";
/** comma-delimited list of head letters (superscribed letters) */
private static final String tops = "r,s,l";
/** comma-delimited list of prefixes */
private static final String lefts = "g,d,b,m,'";
/** comma-delimited list of suffixes */
private static final String rights = "g,ng,d,n,b,m,r,l,s,',T";
/** comma-delimited list of postsuffixes. nga was here in the
* past, according to Edward, to handle cases like ya'ng. pa'am
* wasn't considered, but had it been, ma probably would've gone
* here too. We now handle 'am, 'ang, etc. specially, so now
* this set is now just the postsuffixes. */
private static final String farrights = "d,s";
static {
readData();
/* Initialize to Extended Wylie keyboard. The preferences
* mechanism will switch this to the preferred keyboard. */
setKeyboard(keyboard);
}
/** If the TMW font files are resources associated with this
* class, those font files are loaded. This means that the user
* need not install the fonts on their system, but it does make
* the JAR bigger and takes time at startup.
* @return true upon successful loading, false otherwise */
private static boolean readInTMWFontFiles() {
/* Note the leading slashes on these paths: */
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn1.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn2.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn3.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn4.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn5.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn6.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn7.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn8.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachineWeb/timwn9.ttf")) return false;
return true;
}
/** If the TM font files are resources associated with this
* class, those font files are loaded. This means that the user
* need not install the fonts on their system, but it does make
* the JAR bigger and takes time at startup.
* @return true upon successful loading, false otherwise */
private static boolean readInTMFontFiles() {
/* Note the leading slashes on these paths: */
if (!readInFontFile("/Fonts/TibetanMachine/Timn.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachine/Tims1.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachine/Tims2.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachine/Tims3.ttf")) return false;
if (!readInFontFile("/Fonts/TibetanMachine/Tims4.ttf")) return false;
return true;
}
/** If the TMW font file at the given path is a resource
* associated with this class, that font file is loaded.
* @param path a path within the JAR containing this class file
* @return true upon successful loading, false otherwise */
private static boolean readInFontFile(String path) {
// Note that the TM and TMW fonts do not have hanging
// baselines. They have Roman baselines. Tony Duff said this
// is subtly necessary and that only an OpenType font can
// support baselines properly.
try {
InputStream is = TibetanMachineWeb.class.getResourceAsStream(path);
if (null == is) {
return false;
}
Font.createFont(Font.TRUETYPE_FONT, is);
} catch( Exception e ) {
e.printStackTrace();
ThdlDebug.noteIffyCode();
return false;
}
return true;
}
/** Returns the next token in st with the first occurrence of
__TILDE__ replaced with ~. Needed because the DELIMITER is ~.
Appends the escaped token to sb iff an escape sequence
occurred. */
private static String getEscapedToken(StringTokenizer st,
StringBuffer sb) {
String unescaped = st.nextToken();
int start;
if ((start = unescaped.indexOf("__TILDE__")) >= 0) {
StringBuffer x = new StringBuffer(unescaped);
x.replace(start, "__TILDE__".length(), "~");
sb.append(x.toString());
return x.toString();
} else {
return unescaped;
}
}
/**
* This method reads the data file ("tibwn.ini"), constructs
* the character, punctuation, and vowel lists, as well as
* performing other acts of initialization.
*/
private static void readData() {
if (!ThdlOptions.getBooleanOption("thdl.rely.on.system.tmw.fonts")) {
readInTMWFontFiles();
}
if (!ThdlOptions.getBooleanOption("thdl.rely.on.system.tm.fonts")) {
readInTMFontFiles();
}
defaultUnicodeFontAttributeSet = new SimpleAttributeSet();
StyleConstants.setFontFamily(defaultUnicodeFontAttributeSet,
"Tibetan Machine Uni");
webFontAttributeSet[0] = null;
for (int i=1; i<webFontAttributeSet.length; i++) {
webFontAttributeSet[i] = new SimpleAttributeSet();
StyleConstants.setFontFamily(webFontAttributeSet[i],tmwFontNames[i]);
}
normFontAttributeSet[0] = null;
for (int i=1; i<normFontAttributeSet.length; i++) {
normFontAttributeSet[i] = new SimpleAttributeSet();
StyleConstants.setFontFamily(normFontAttributeSet[i],tmFontNames[i]);
}
StringTokenizer sTok;
topSet = new HashSet();
sTok = new StringTokenizer(tops, ",");
while (sTok.hasMoreTokens())
topSet.add(sTok.nextToken());
leftSet = new HashSet();
sTok = new StringTokenizer(lefts, ",");
while (sTok.hasMoreTokens())
leftSet.add(sTok.nextToken());
rightSet = new HashSet();
sTok = new StringTokenizer(rights, ",");
while (sTok.hasMoreTokens())
rightSet.add(sTok.nextToken());
farRightSet = new HashSet();
sTok = new StringTokenizer(farrights, ",");
while (sTok.hasMoreTokens())
farRightSet.add(sTok.nextToken());
vowelSet = new HashSet();
sTok = new StringTokenizer(vowels, ",");
while (sTok.hasMoreTokens()) {
String ntk;
vowelSet.add(ntk = sTok.nextToken());
if (maxEwtsVowelLength < ntk.length())
maxEwtsVowelLength = ntk.length();
validInputSequences.put(ntk, anyOldObjectWillDo);
}
puncSet = new HashSet();
sTok = new StringTokenizer(others, ",");
while (sTok.hasMoreTokens()) {
String ntk;
puncSet.add(ntk = sTok.nextToken());
validInputSequences.put(ntk, anyOldObjectWillDo);
}
charSet = new HashSet();
tibSet = new HashSet();
sTok = new StringTokenizer(tibetanConsonants, ",");
while (sTok.hasMoreTokens()) {
String ntk;
charSet.add(ntk = sTok.nextToken());
tibSet.add(ntk);
validInputSequences.put(ntk, anyOldObjectWillDo);
}
sanskritStackSet = new HashSet();
sTok = new StringTokenizer(otherConsonants, ",");
while (sTok.hasMoreTokens()) {
String ntk;
charSet.add(ntk = sTok.nextToken());
sanskritStackSet.add(ntk);
validInputSequences.put(ntk, anyOldObjectWillDo);
}
numberSet = new HashSet();
sTok = new StringTokenizer(numbers, ",");
while (sTok.hasMoreTokens()) {
// DLC FIXME: don't add it to numberSet and charSet here;
// do it in <?Input:Numbers?> so that Jskad has the same
// TMW->Wylie conversion regardless of whether or not it
// chooses to support inputting numbers. Likewise for
// tibetanConsonants, otherConsonants, others, and vowels.
String ntk;
charSet.add(ntk = sTok.nextToken());
numberSet.add(ntk);
validInputSequences.put(ntk, anyOldObjectWillDo);
}
charSet.add("Y");
charSet.add("R");
charSet.add("W");
// Useless because none is two or more characters, but for
// completeness we put these in since EWTS allows w+W etc.
validInputSequences.put("Y", anyOldObjectWillDo);
validInputSequences.put("R", anyOldObjectWillDo);
validInputSequences.put("W", anyOldObjectWillDo);
sTok = null;
top_vowels = new HashSet();
top_vowels.add(i_VOWEL);
top_vowels.add(e_VOWEL);
top_vowels.add(o_VOWEL);
top_vowels.add(ai_VOWEL);
top_vowels.add(au_VOWEL);
top_vowels.add(reverse_i_VOWEL);
try {
URL url = TibetanMachineWeb.class.getResource(fileName);
if (url == null) {
System.err.println("Cannot find " + fileName + "; aborting.");
System.exit(1);
}
InputStreamReader isr = new InputStreamReader(url.openStream());
BufferedReader in = new BufferedReader(isr);
if (ThdlOptions.getBooleanOption("thdl.verbose")) {
System.out.println("Reading Tibetan Machine Web code table "
+ fileName);
}
String line;
boolean hashOn = false;
// is this a Tibetan consonant or consonant stack?
boolean isTibetan = false;
// is this a Sanskrit consonant stack?
boolean isSanskrit = false;
boolean ignore = false;
while ((line = in.readLine()) != null) {
if (line.startsWith("<?")) { //line is command
if (line.equalsIgnoreCase("<?Consonants?>")) {
isSanskrit = false;
isTibetan = true;
hashOn = false;
ignore = false;
do {
line = in.readLine();
} while (line.startsWith("//") || line.equals(""));
// use tibSet or charSet; ignore this.
}
else if (line.equalsIgnoreCase("<?Numbers?>")) {
// FIXME: for historical reasons, numbers go
// in both charSet and numberSet.
isSanskrit = false;
isTibetan = false;
hashOn = false;
ignore = false;
do {
line = in.readLine();
} while (line.startsWith("//") || line.equals(""));
// use numberSet or charSet; ignore this.
}
else if (line.equalsIgnoreCase("<?Vowels?>")) {
isSanskrit = false;
isTibetan = false;
hashOn = false;
ignore = false;
do {
line = in.readLine();
} while (line.startsWith("//") || line.equals(""));
// use vowelSet; ignore this.
}
else if (line.equalsIgnoreCase("<?Other?>")) {
isSanskrit = false;
isTibetan = false;
hashOn = false;
ignore = false;
do {
line = in.readLine();
} while (line.startsWith("//") || line.equals(""));
// use puncSet; ignore this.
}
else if (line.equalsIgnoreCase("<?Input:Punctuation?>")
|| line.equalsIgnoreCase("<?Input:Vowels?>")) {
isSanskrit = false;
isTibetan = false;
hashOn = true;
ignore = false;
}
else if (line.equalsIgnoreCase("<?Input:Tibetan?>")) {
isSanskrit = false;
isTibetan = true;
hashOn = true;
ignore = false;
}
else if (line.equalsIgnoreCase("<?Input:Numbers?>")) {
isSanskrit = false;
isTibetan = false;
hashOn = true;
ignore = false;
}
else if (line.equalsIgnoreCase("<?Input:Sanskrit?>")) {
isSanskrit = true;
isTibetan = false;
hashOn = true;
ignore = false;
}
else if (line.equalsIgnoreCase("<?ToWylie?>")) {
isSanskrit = false;
isTibetan = false;
hashOn = false;
ignore = false;
}
else if (line.equalsIgnoreCase("<?Ignore?>")) {
isSanskrit = false;
ignore = true;
}
}
else if (line.startsWith("//")) { //comment
;
}
else if (line.equals("")) {//empty string
;
}
else {
StringTokenizer st = new StringTokenizer(line,DELIMITER,true);
String wylie = null;
DuffCode[] duffCodes;
duffCodes = new DuffCode[11];
int k = 0;
StringBuffer escapedToken = new StringBuffer("");
ThdlDebug.verify(escapedToken.length() == 0);
while (st.hasMoreTokens()) {
String val = getEscapedToken(st, escapedToken);
if (val.equals(DELIMITER)
&& escapedToken.length() == 0) {
k++;
} else if (!val.equals("")) {
if (escapedToken.length() != 0) {
escapedToken = new StringBuffer("");
ThdlDebug.verify(escapedToken.length() == 0);
}
switch (k) {
case 0: //wylie key
wylie = val;
break;
case 1: // Tibetan Machine glyph
duffCodes[TM] = new DuffCode(val,false);
break;
case 2: //reduced-size character if there is one
if (!ignore) {
duffCodes[REDUCED_C] = new DuffCode(val,true);
}
break;
case 3: //TibetanMachineWeb code
duffCodes[TMW] = new DuffCode(val,true);
// TibetanMachineWeb7.91, for
// example, has no TM(win32)
// equivalent (though it has a
// TM(dos) equivalent), so we must
// test for null here:
if (null != duffCodes[TM]) {
TMtoTMW[duffCodes[TM].getFontNum()-1][duffCodes[TM].getCharNum()-32]
= duffCodes[TMW]; // TM->TMW mapping
}
// but no null test is necessary
// here for either the TMW or the
// TM glyph (though the TM glyph
// could well be null):
if (null != TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32])
throw new Error("tibwn.ini is supposed to use the TibetanMachineWeb glyph as the unique key, but "
+ val
+ " appears two or more times.");
TMWtoTM[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
= duffCodes[TM]; // TMW->TM mapping
if (wylie.toLowerCase().startsWith("\\uf0")) {
int x = Integer.parseInt(wylie.substring("\\u".length()), 16);
ThdlDebug.verify((x >= 0xF000
&& x <= 0xF0FF));
NonUnicodeToTMW[x - '\uF000']
= new DuffCode[] { duffCodes[TMW] };
}
break;
// Vowels etc. to use with this glyph:
case 4:
case 5:
case 6:
case 7:
case 8:
case 9:
if (!ignore) {
try {
duffCodes[k-1] = new DuffCode(val,true);
} catch (Exception e) {
System.err.println("Couldn't make a DuffCode out of " + val + "; line is " + line + "; k is " + k);
}
}
break;
case 10: //Unicode:
if (!val.equals("none")) {
StringBuffer unicodeBuffer = new StringBuffer();
StringTokenizer uTok = new StringTokenizer(val, ",");
while (uTok.hasMoreTokens()) {
String subval = uTok.nextToken();
ThdlDebug.verify(subval.length() == 4 || subval.length() == 3);
try {
int x = Integer.parseInt(subval, 16);
ThdlDebug.verify((x >= 0x0F00
&& x <= 0x0FFF)
|| x == 0x5350
|| x == 0x534D
|| x == 0x0020
|| x == 0x00A0
|| x == 0x2003);
unicodeBuffer.append((char)x);
} catch (NumberFormatException e) {
ThdlDebug.verify(false);
}
}
TMWtoUnicode[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
= unicodeBuffer.toString(); // TMW->Unicode mapping
char ch;
if (unicodeBuffer.length() == 1
&& UnicodeUtils.isInTibetanRange(ch = unicodeBuffer.charAt(0))) {
if (null != UnicodeToTMW[ch - '\u0F00'][0]
&& '\u0F00' != ch
&& '\u0F02' != ch
&& '\u0F03' != ch
&& '\u0F0B' != ch // any will do...
&& '\u0F0E' != ch
&& '\u0F40' != ch
&& '\u0F42' != ch
&& '\u0F49' != ch
&& '\u0F4F' != ch
&& '\u0F51' != ch
&& '\u0F53' != ch
&& '\u0F5E' != ch
&& '\u0F62' != ch
&& '\u0F64' != ch
&& '\u0F67' != ch
&& '\u0F6A' != ch
&& '\u0F71' != ch // any will do...
&& '\u0F72' != ch // any will do...
&& '\u0F73' != ch
&& '\u0F74' != ch // any will do...
&& '\u0F75' != ch // any will do...
&& '\u0F76' != ch
&& '\u0F77' != ch
&& '\u0F78' != ch
&& '\u0F79' != ch
&& '\u0F7A' != ch // any will do...
&& '\u0F7C' != ch // any will do...
&& '\u0F7E' != ch
&& '\u0F81' != ch) {
throw new Error("tibwn.ini has more than one TMW fellow listed that has the Unicode " + val + ", but it's not on the list of specially handled glyphs");
}
UnicodeToTMW[ch - '\u0F00'][0]
= duffCodes[TMW]; // Unicode->TMW mapping
}
// For V&V:
// DLC FIXME: also check for ^[90-bc]. and ^.+[40-6a]
// StringBuffer wylie_minus_plusses_buf
// = org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(unicodeBuffer.toString());
// String wylie_minus_plusses
// = ((wylie_minus_plusses_buf == null)
// ? null
// : wylie_minus_plusses_buf.toString().replaceAll("(.)\\+","$1"));
// if (null == wylie
// || null == wylie_minus_plusses
// || !(wylie.replaceAll("(.)\\+","$1").equals(wylie_minus_plusses)
// || wylie.replaceAll("(.)-","$1").equals(wylie_minus_plusses)
// || wylie.equals(wylie_minus_plusses))) {
// System.out.println("wylie: " + wylie + "; wylie_minus_plusses: " + wylie_minus_plusses);
// }
}
break;
case 11: //half-height character if there is one
if (!ignore) {
duffCodes[HALF_C] = new DuffCode(val,true);
}
break;
case 12: //special bindu-value if vowel+bindu are one glyph
if (!ignore) {
DuffCode binduCode = new DuffCode(val,true);
binduMap.put(duffCodes[TMW],binduCode);
}
break;
case 13:
throw new Error("tibwn.ini has only 13 columns, you tried to use a 14th column.");
}
} else {
if (k == 10) {
throw new Error("needed none or some unicode; line is " + line);
}
}
}
if (k < 10) {
throw new Error("needed none or some unicode; line is " + line);
}
if (!ignore) {
if (null == wylie)
throw new Error(fileName
+ " has a line ^"
+ DELIMITER
+ " which means that no Wylie is assigned. That isn't supported.");
if (hashOn) {
tibHash.put(wylie, duffCodes);
}
if (isTibetan) {
// Delete the dashes:
StringBuffer wylieWithoutDashes = new StringBuffer(wylie);
for (int wl = 0; wl < wylieWithoutDashes.length(); wl++) {
if (wylieWithoutDashes.charAt(wl) == '-') {
wylieWithoutDashes.deleteCharAt(wl);
--wl;
}
}
tibSet.add(wylieWithoutDashes.toString());
}
if (isSanskrit) {
sanskritStackSet.add(wylie);
}
if (null == duffCodes[TMW])
throw new Error(fileName
+ " has a line with wylie " + wylie + " but no TMW; that's not allowed");
int font = duffCodes[TMW].getFontNum();
int code = duffCodes[TMW].getCharNum()-32;
toHashKey[font][code] = wylie;
}
}
}
}
catch (IOException e) {
System.out.println("file Disappeared");
ThdlDebug.noteIffyCode();
}
}
/**
* (Re-)sets the keyboard.
* @param kb the keyboard to be installed. If null, then the
* Extended Wylie keyboard is installed
* @return true if the keyboard was successfully set, false
* if there was an error
*/
public static boolean setKeyboard(TibetanKeyboard kb) {
keyboard = kb;
if (currentKeyboardIsExtendedWylie()) { //wylie keyboard
hasDisambiguatingKey = true;
disambiguating_key = WYLIE_DISAMBIGUATING_KEY;
hasSanskritStackingKey = true;
hasTibetanStackingKey = false;
isStackingMedial = true;
stacking_key = WYLIE_SANSKRIT_STACKING_KEY;
isAChenRequiredBeforeVowel = false;
isAChungConsonant = false;
hasAVowel = true;
aVowel = WYLIE_aVOWEL;
if (!vowelSet.contains(WYLIE_aVOWEL)) {
ThdlDebug.noteIffyCode();
// iffy because vowels contains 'a' and because
// maxEwtsVowelLength better be correct if this branch is
// ever taken
vowelSet.add(WYLIE_aVOWEL);
validInputSequences.put(WYLIE_aVOWEL, anyOldObjectWillDo);
}
}
else {
hasDisambiguatingKey = keyboard.hasDisambiguatingKey();
if (hasDisambiguatingKey)
disambiguating_key = keyboard.getDisambiguatingKey();
hasSanskritStackingKey = keyboard.hasSanskritStackingKey();
hasTibetanStackingKey = keyboard.hasTibetanStackingKey();
if (hasSanskritStackingKey || hasTibetanStackingKey) {
isStackingMedial = keyboard.isStackingMedial();
stacking_key = keyboard.getStackingKey();
}
isAChenRequiredBeforeVowel = keyboard.isAChenRequiredBeforeVowel();
isAChungConsonant = keyboard.isAChungConsonant();
hasAVowel = keyboard.hasAVowel();
}
return true;
}
/**
* (Re-)sets the keyboard.
* @param url the URL of the keyboard to be installed.
* If null, then the Extended Wylie keyboard is
* installed
* @return true if the keyboard was successfully set, false
* if there was an error
*/
public static boolean setKeyboard(URL url) {
try {
TibetanKeyboard kb = new TibetanKeyboard(url);
if (setKeyboard(kb))
return true;
else
return false;
}
catch (TibetanKeyboard.InvalidKeyboardException ike) {
System.out.println("can't create the keyboard associated with " + url);
ThdlDebug.noteIffyCode();
return false;
}
}
/**
* Gets the AttributeSet for the given TibetanMachineWeb font.
* This information is required in order to be able to put styled
* text into {@link TibetanDocument TibetanDocument}.
* @param font the number of the TibetanMachineWeb font for which
* you want the SimpleAttributeSet: TibetanMachineWeb = 1,
* TibetanMachineWeb1 = 2, TibetanMachineWeb = 3, etc. up to 10
* @return a SimpleAttributeSet for the given font - that is,
* a way of encoding the font itself
*/
public static SimpleAttributeSet getAttributeSet(int font) {
if (font > -1 && font < webFontAttributeSet.length)
return webFontAttributeSet[font];
else
return null;
}
/**
* Gets the AttributeSet for the font we use for the Unicode we create
* in our TMW->Unicode conversion. This information is required in
* order to be able to put styled text into {@link TibetanDocument
* TibetanDocument}.
* @param unicodeFont the interned name of the Unicode font to use;
* defaults to Tibetan Machine Uni if null
* @return a SimpleAttributeSet for the Unicode font - that is, a way
* of encoding the font itself */
public static SimpleAttributeSet getUnicodeAttributeSet(String unicodeFont) {
if (null == unicodeFont
|| "Tibetan Machine Uni" == unicodeFont)
return defaultUnicodeFontAttributeSet;
else {
SimpleAttributeSet cached
= (SimpleAttributeSet)unicodeAttributeSets.get(unicodeFont);
if (null == cached) {
cached = new SimpleAttributeSet();
StyleConstants.setFontFamily(cached, unicodeFont);
unicodeAttributeSets.put(unicodeFont, cached);
}
return cached;
}
}
private static HashMap unicodeAttributeSets = new HashMap();
/**
* Gets the AttributeSet for the given TibetanMachine font.
* This information is required in order to be able to put styled
* text into {@link TibetanDocument TibetanDocument}.
* @param font the number of the TibetanMachineWeb font for which
* you want the SimpleAttributeSet: TibetanMachine = 1,
* TibetanMachineSkt1 = 2, etc. up to 5
* @return a SimpleAttributeSet for the given font - that is,
* a way of encoding the font itself
*/
public static SimpleAttributeSet getAttributeSetTM(int font) {
if (font > -1 && font < normFontAttributeSet.length)
return normFontAttributeSet[font];
else
return null;
}
/**
* Says whether or not the character is formatting.
* @param c the character to be checked
* @return true if c is formatting (TAB or
* ENTER), false if not
*/
public static boolean isFormatting(char c) {
if (c < 32 || c > 126)
return true;
else
return false;
/*
if ( c == KeyEvent.VK_TAB
|| c == KeyEvent.VK_ENTER)
return true;
else
return false;
*/
}
/**
* Checks to see if the passed string is a character (a single
* [possibly Sanskrit or va or fa] consonant or a number [possibly
* super- or subscribed]) in the installed keyboard.
*
* @param s the string you want to check
* @return true if s is a character in the current keyboard, false if
* not */
public static boolean isChar(String s) {
if (currentKeyboardIsExtendedWylie())
return charSet.contains(s);
else
return keyboard.isChar(s);
}
/**
* Checks to see if the passed string is a character (a single
* [possibly Sanskrit or va or fa] consonant or a number [possibly
* super- or subscribed]) in Extended Wylie.
* @param s the string to be checked
* @return true if s is a character in Extended Wylie transliteration,
* false if not */
public static boolean isWylieChar(String s) {
return charSet.contains(s);
}
/**
* Checks to see if the passed string is a consonant or unadorned
* consonant stack in Extended Wylie. The string shouldn't have any
* '+' or '.' characters in it if you wnat this to return true.
* @param s the string to be checked
* @return true if s is such in Extended Wylie transliteration, false
* if not */
public static boolean isWylieTibetanConsonantOrConsonantStack(String s) {
return tibSet.contains(s);
}
/**
* Returns true if and only if s is necessarily the THDL Extended Wylie
* for a Sanskrit (non-Tibetan, to be more correct) multi-consonant
* stack. If s is "w", then it might be the EWTS for TWM7.69, and that
* glyph is only used in non-Tibetan stacks, but "w" also stands for
* TMW.53, which is Tibetan, so this will return false for such a
* glyph. */
public static boolean isWylieSanskritConsonantStack(String s) {
return sanskritStackSet.contains(s);
}
/** Returns true if and only if s is the THDL Extended Wylie
representation of a legal tsheg-bar appendage like 'i, 'u, 'am,
etc. The word le'u (chapter) contains such an appendage, e.g. */
public static boolean isWylieAchungAppendage(String s) {
// THIS LIST IS DOCUMENTED IN THE ACIP->TIBETAN CONVERTERS USER GUIDE.
return (s.equals("'e") // Appears in transcription of foreign words at the very least. The ACIP LEM'E is {LE}{M'E} if we don't think of 'E as "legal"... tricky, tricky (DLC FIXME: think again, verify it's doing what we want for both ACIP-> and EWTS-> and TMW->)
|| s.equals("'i")
|| s.equals("'o")
|| s.equals("'u")
|| s.equals("'us")
|| s.equals("'ur")
|| s.equals("'um")
|| s.equals("'ong") // This isn't an appendage I know of, but 'ongs is, and we need this in here or 'ongs won't work. DLC FIXME: cludge
|| s.equals("'ongs")
|| s.equals("'os")
|| s.equals("'is")
|| s.equals("'ung")
|| s.equals("'ang")
|| s.equals("'am"));
}
/**
* Checks to see if the passed string is a number [possibly super- or
* subscribed]) in Extended Wylie.
* @param s the string to be checked
* @return true if s is a number in Extended Wylie transliteration,
* false if not */
public static boolean isWylieNumber(String s) {
return numberSet.contains(s);
}
/**
* Checks to see if the passed string
* is punctuation in the installed keyboard.
* @param s the string you want to check
* @return true if s is punctuation in the current
* keyboard, false if not
*/
public static boolean isPunc(String s) {
if (currentKeyboardIsExtendedWylie())
return puncSet.contains(s);
else
return keyboard.isPunc(s);
}
/**
* This method checks to see if the passed string
* is punctuation in Extended Wylie.
* @param s the string to be checked
* @return true if s is punctuation in
* Extended Wylie transliteration, false if not
*/
public static boolean isWyliePunc(String s) {
return puncSet.contains(s);
}
/**
* Checks to see if the passed string
* is a vowel in the installed keyboard.
* @param s the string you want to check
* @return true if s is a vowel in the current
* keyboard, false if not
*/
public static boolean isVowel(String s) {
if (currentKeyboardIsExtendedWylie())
return vowelSet.contains(s);
else
return keyboard.isVowel(s);
}
/** By example, this returns true for le, lA-i, lA-iM, luM, l-i, etc.,
and for l, but false for lc, lj, lt, ld, lp, lb, and lh. Thus,
this is useful for seeing if something is truly disambiguous,
because blta is unambiguous but blan/b.lan is ambiguous. */
private static boolean isAmbHelper(String y) {
// Wylie vowels are 1 or two (Roman) characters long, so this
// works. But we make it work for three- and four-character long
// vowels, just in case someone adds them.
return (y.length() == 1
|| isWylieVowel(y.substring(1,2))
|| (y.length() > 2 && isWylieVowel(y.substring(1,3)))
|| (y.length() > 3 && isWylieVowel(y.substring(1,4)))
|| (y.length() > 4 && isWylieVowel(y.substring(1,5))));
}
/**
* Checks to see if the concatenation of x and y is ambiguous in
* Extended Wylie. gya and g.ya, bla and b.la, and bra and b.ra are
* the only syntactically legal ambiguous fellows, as stacks like blha,
* blda, brla, brkya, brgya, brka, etc. are unambiguous. However, we
* generate b.lha instead of blha because it's easier
* implementation-wise.
* @param x the prefix
* @param y the root stack
* @return true if x + y is ambiguous in the Extended Wylie
* transliteration, false if not */
public static boolean isAmbiguousWylie(String x, String y) {
// What about ambiguity between wa-zur and wa? dwa vs. d.wa, e.g.?
// Some would say it doesn't matter, because that's illegal. wa
// doesn't take any prefixes. But I want even illegal stuff to
// work well end-to-end (i.e., converting tibetan X to wylie Y to
// tibetan Z should get you X==Z in a perfect world), and it
// doesn't confuse the legal stuff.
// How do I know this list is complete? Because tibwn.ini has a
// list of all legal Tibetan (not Sanskrit) glyphs, and searching
// for the regular expressions ^d-, ^m-, ^'-, ^g-, and ^b- shows
// you all the fellows that could be ambiguous.
return (("g".equals(x) && y.startsWith("y") && isAmbHelper(y))
|| ("g".equals(x) && y.startsWith("w") && isAmbHelper(y))
|| ("d".equals(x) && y.startsWith("w") && isAmbHelper(y))
|| ("d".equals(x) && y.startsWith("z") && isAmbHelper(y))
|| ("b".equals(x) && y.startsWith("l") && isAmbHelper(y))
|| ("b".equals(x) && y.startsWith("r") && isAmbHelper(y))
|| ("m".equals(x) && y.startsWith("y") && isAmbHelper(y))
|| ("b".equals(x) && y.startsWith("y") && isAmbHelper(y))
|| ("g".equals(x) && y.startsWith("rw"))
|| ("d".equals(x) && y.startsWith("rw"))
// Because we wouldn't want to generate de for d.e (if
// achen took a da prefix): [This is a HIGH-CLASS WORRY
// because achen doesn't take any prefixes. But I'm
// thorough when I think to be.]
|| isWylieVowel(y)
);
}
/** Returns the length in characters of the longest EWTS vowel. */
public static int getMaxEwtsVowelLength() {
ThdlDebug.verify(maxEwtsVowelLength > 0);
return maxEwtsVowelLength;
}
/**
* Checks to see if the passed string
* is a vowel in Extended Wylie.
* @param s the string to be checked
* @return true if s is a vowel in
* Extended Wylie transliteration, false if not
*/
public static boolean isWylieVowel(String s) {
return vowelSet.contains(s);
}
/**
* Checks to see if the passed string begins with an EWTS vowel.
* @param s the string to be checked
* @return true if s is a vowel in
* Extended Wylie transliteration, false if not */
public static boolean startsWithWylieVowelSequence(String s) {
for (int i = 0; i < maxEwtsVowelLength; i++) {
if (i == s.length())
return false;
if (isWylieVowel(s.substring(0, i + 1)))
return true;
}
return false;
}
/** Returns true if and only if wylie is the THDL Extended Wylie for
an adornment. An adornment is something that is part of a stack
but is not a consonant, such as a Tibetan or Sanskrit vowel or a
bindu. Note that an adornment might be both an adornment and a
vowel, or an adornment and punctuation. */
public static boolean isWylieAdornment(String wylie) {
return (vowelSet.contains(wylie)
|| (wylie.equals("M") /* U+0F7E */
|| wylie.equals("~M") /* U+0F83 */
|| wylie.equals("~M`") /* U+0F82 */
|| wylie.equals("iM")
|| wylie.equals("-iM")
|| wylie.equals("eM")
|| wylie.equals("aiM")
|| wylie.equals("oM")
|| wylie.equals("auM")));
}
/** Returns true if and only if wylie is the THDL Extended Wylie for
an adornment {@link #isWylieAdornment(String)} that contains a
vowel within it. */
public static boolean isWylieAdornmentAndContainsVowel(String wylie) {
return (isWylieAdornment(wylie) &&
!wylie.equals("M") /* U+0F7E */
&& !wylie.equals("~M") /* U+0F83 */
&& !wylie.equals("~M`") /* U+0F82 */);
}
/**
* Returns true iff this Wylie is valid as a leftmost character in a
* Tibetan syllable. For example, in the syllable 'brgyad', 'b' is the
* leftmost character. Valid leftmost characters include g, d, b, ',
* and m.
* @param s the (Wylie) string to be checked
* @return true if s is a possible leftmost character in a Tibetan
* syllable, false if not. */
public static boolean isWylieLeft(String s) {
return leftSet.contains(s);
}
/**
* Returns true iff this Wylie is valid as a suffix (i.e., a right
* (post-vowel) character) in a Tibetan syllable. For example, in the
* syllable 'lags', 'g' is in the right character position. Valid right
* characters include g, ng, d, n, b, m, r, l, s, ', and T.
* @param s the (Wylie) string to be checked
* @return true if s is a possible right character in a Tibetan
* syllable, false if not. */
public static boolean isWylieRight(String s) {
return rightSet.contains(s);
}
/**
* Returns true iff this Wylie is valid as a postsuffix in a
* Tibetan syllable.
* @param s the string to be checked
* @return true if s is a possible postsuffix in a Tibetan
* syllable, false if not. */
public static boolean isWylieFarRight(String s) {
return farRightSet.contains(s);
}
/**
* Returns true iff this Wylie is valid as a head letter in a Tibetan
* syllable.
* @param s the string to be checked
* @return true if s is a possible superscribed letter in a Tibetan
* syllable, false if not. */
public static boolean isWylieTop(String s) {
return topSet.contains(s);
}
/**
* Converts character to its Extended Wylie correspondence.
* This assumes that the passed string is a character
* in the current keyboard.
* @param s the string to be converted
* @return the Wylie character corresponding to
* s, or null if there is no such character
* @see TibetanKeyboard
*/
public static String getWylieForChar(String s) {
if (currentKeyboardIsExtendedWylie())
return s;
return keyboard.getWylieForChar(s);
}
/** Returns true iff the currently active keyboard is the
* built-in, extended Wylie keyboard. */
public static boolean currentKeyboardIsExtendedWylie() {
return (null == getKeyboard());
}
/**
* Returns the current keyboard, or, if the current keyboard is the
* Extended Wylie keyboard, null. */
public static TibetanKeyboard getKeyboard() {
return keyboard;
}
/**
* Converts punctuation to its Extended Wylie correspondence.
* This assumes that the passed string is punctuation
* in the current keyboard.
* @param s the string to be converted
* @return the Wylie punctuation corresponding to
* s, or null if there is no such punctuation
* @see TibetanKeyboard
*/
public static String getWylieForPunc(String s) {
if (currentKeyboardIsExtendedWylie())
return s;
return keyboard.getWylieForPunc(s);
}
/**
* Converts vowel to its Extended Wylie correspondence.
* This assumes that the passed string is a vowel
* in the current keyboard.
* @param s the string to be converted
* @return the Wylie vowel corresponding to
* s, or null if there is no such vowel
* @see TibetanKeyboard
*/
public static String getWylieForVowel(String s) {
if (currentKeyboardIsExtendedWylie())
return s;
return keyboard.getWylieForVowel(s);
}
/**
* Gets the DuffCode required for a vowel, if affixed to the given
* hashKey. Not as pretty as {@link
* TibTextUtils#getVowel(List,DuffCode,DuffCode,String)}.
* @param hashKey the key for the character the vowel is to be affixed
* to; see {@link #getGlyph(String)} to learn about hash keys.
* @param vowel the vowel you want the DuffCode for
* @return the DuffCode for the vowel in the given
* context, or null if there is no such vowel in
* the context
* @see DuffCode
* @see TibTextUtils#getVowel(List,DuffCode,DuffCode,String) */
public static DuffCode getVowel(String hashKey, int vowel) {
DuffCode[] dc = (DuffCode[])tibHash.get(hashKey);
if (null == dc)
return null;
return dc[vowel]; //either a vowel or null
}
/**
* Checks to see if a glyph exists for this hash key.
* @param hashKey the key to be checked; see {@link #getGlyph(String)}
* to learn about hash keys.
* @return true if there is a glyph corresponding to
* hashKey, false if not
*/
public static boolean hasGlyph(String hashKey) {
if (tibHash.get(hashKey)==null)
return false;
else
return true;
}
/** Returns the Unicode correspondence for the Wylie wylie, which must
* be Wylie returned by getWylieForGlyph(int, int, boolean[]).
* Returns null if the Unicode correspondence is nonexistent or
* unknown. */
public static String getUnicodeForWylieForGlyph(String wylie) {
DuffCode dc = getGlyph(wylie);
return mapTMWtoUnicode(dc.getFontNum() - 1, dc.getCharNum());
}
/**
* Returns true if and only if hashKey is a known hash key from tibwn.ini.
*/
public static boolean isKnownHashKey(String hashKey) {
DuffCode[] dc = (DuffCode[])tibHash.get(hashKey);
return (null != dc);
}
/**
* Gets a glyph for this hash key. Hash keys are not identical to Extended
* Wylie. The hash key for a Tibetan stack separates the members of the stack
* with '-', for example, 's-g-r'. In Sanskrit stacks, '+' is used, e.g. 'g+h+g+h'.
* @param hashKey the key for which you want a DuffCode
* @return the TibetanMachineWeb DuffCode value for hashKey
* @see DuffCode
*/
public static DuffCode getGlyph(String hashKey) {
DuffCode dc = maybeGetGlyph(hashKey);
if (null == dc)
throw new Error("Hash key " + hashKey + " not found; it is likely that you misconfigured tibwn.ini such that, say, M is expected (i.e., it is listed as, e.g. punctuation), but no 'M~...' line appears.");
return dc;
}
/**
* Gets a glyph for this hash key if possible; returns null
* otherwise.
* @see #getGlyph(String)
*/
public static DuffCode maybeGetGlyph(String hashKey) {
DuffCode[] dc = (DuffCode[])tibHash.get(hashKey);
if (null == dc)
return null;
return dc[TMW];
}
/**
* Gets the half height character for this hash key.
* @param hashKey the key you want a half height glyph for; see {@link
* #getGlyph(String)} to learn about hash keys.
* @return the TibetanMachineWeb DuffCode of hashKey's
* reduced height glyph, or null if there is no such glyph
* @see DuffCode
*/
public static DuffCode getHalfHeightGlyph(String hashKey) {
DuffCode[] dc = (DuffCode[])tibHash.get(hashKey);
if (dc == null)
return null;
return dc[REDUCED_C];
}
private static final DuffCode TMW_cr = new DuffCode(1, '\r');
private static final DuffCode TMW_lf = new DuffCode(1, '\n');
private static final DuffCode TMW_tab = new DuffCode(1, '\t');
/** An identity function used merely for testing. */
public static DuffCode mapTMWtoItself(int font, int ordinal, int suggestedFont) {
if (font < 0 || font > 9)
return null;
if (ordinal >= 255) {
throw new Error("I didn't know that ever happened.");
}
if (ordinal < 32) {
if (ordinal == (int)'\r') {
if (0 == suggestedFont)
return TMW_cr;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else if (ordinal == (int)'\n') {
if (0 == suggestedFont)
return TMW_lf;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else if (ordinal == (int)'\t') {
if (0 == suggestedFont)
return TMW_tab;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else {
// for robustness, just return font 1, char ordinal.
ThdlDebug.noteIffyCode();
return null;
}
}
return new DuffCode(font + 1, (char)ordinal);
}
/** Returns the DuffCode for the TibetanMachineWeb glyph corresponding
to the given TibetanMachine font
(0=norm,1=Skt1,2=Skt2,3=Skt3,4=Skt4) and character(32-254).
Null is never returned for an existing TibetanMachine glyph,
because every TibetanMachine glyph has a corresponding
TibetanMachineWeb glyph. Null is returned if the input isn't
valid.
Only a few control characters are supported: '\r' (carriage
return), '\n' (line feed), and '\t' (tab).
If suggestedFont is not zero, then any ordinals that are the same
in all fonts ('\n', '-', ' ', '\r', and '\t') will be converted to
the font named tmwFontNames[suggestedFont].
*/
public static DuffCode mapTMtoTMW(int font, int ordinal, int suggestedFont) {
if (font < 0 || font > 4)
return null;
if (ordinal >= 255) {
/* \'9e, etc. */
DuffCode rv = getUnusualTMtoTMW(font, ordinal);
if (null != rv && !ThdlOptions.getBooleanOption("thdl.do.not.fix.rtf.hex.escapes"))
throw new Error("oddballs still found after fixing RTF hex escapes");
/* \bullet, etc. */
if (null == rv)
rv = getOtherUnusualTMtoTMW(font, ordinal);
return rv;
}
if (ordinal < 32) {
if (ordinal == (int)'\r') {
if (0 == suggestedFont)
return TMW_cr;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else if (ordinal == (int)'\n') {
if (0 == suggestedFont)
return TMW_lf;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else if (ordinal == (int)'\t') {
if (0 == suggestedFont)
return TMW_tab;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else {
// for robustness, just return font 1, char ordinal.
ThdlDebug.noteIffyCode();
return null;
}
}
// We map TibetanMachineSkt1.45, TibetanMachineSkt1.45,
// TibetanMachineSkt1.45, and TibetanMachineSkt1.45 to
// TibetanMachineWeb*.45, even though they're actually just
// garbage, since TibetanMachine.45 is the only tsheg in TM. We
// assume that a machine goofed along the way. (FIXME: optionally
// warn.)
if ((0 != suggestedFont) && (32 == ordinal || 45 == ordinal)) {
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
}
return TMtoTMW[font][ordinal-32];
}
private static final DuffCode TM_cr = new DuffCode(1, '\r');
private static final DuffCode TM_lf = new DuffCode(1, '\n');
private static final DuffCode TM_tab = new DuffCode(1, '\t');
/** Returns the DuffCode for the TibetanMachine glyph corresponding to
the given TibetanMachineWeb font
(0=TibetanMachineWeb,1=TibetanMachineWeb1,...) and character(32-127).
Null is returned for an existing TibetanMachineWeb glyph only if
that glyph is TibetanMachineWeb7.91, because every other
TibetanMachineWeb glyph has a corresponding TibetanMachine glyph.
Null is returned if the input isn't valid.
Only a few control characters are supported: '\r' (carriage
return), '\n' (line feed), and '\t' (tab).
If suggestedFont is not zero, then any ordinals that are the same
in all fonts ('\n', '-', ' ', '\r', and '\t') will be converted to
the font named tmwFontNames[suggestedFont].
*/
public static DuffCode mapTMWtoTM(int font, int ordinal, int suggestedFont) {
if (font < 0 || font > 9)
return null;
if (ordinal >= 127)
return null;
if (ordinal < 32) {
if (ordinal == (int)'\r') {
if (0 == suggestedFont)
return TM_cr;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else if (ordinal == (int)'\n') {
if (0 == suggestedFont)
return TM_lf;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else if (ordinal == (int)'\t') {
if (0 == suggestedFont)
return TM_tab;
else
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
} else {
// for robustness, just return font 1, char ordinal.
ThdlDebug.noteIffyCode();
return null;
}
}
if (45 == ordinal) {
// TibetanMachine.45 is a tsheg, but TibetanMachineSkt2.45
// etc. are not tshegs.
return new DuffCode(1, (char)ordinal);
}
if ((0 != suggestedFont) && (32 == ordinal)) {
return new DuffCode(suggestedFont, (char)ordinal); // FIXME: don't create a new one each time; it needlessly taxes the GC
}
DuffCode ans = TMWtoTM[font][ordinal-32];
return ans;
}
/** Tests the TMW-&gt;TM and TM-&gt;TMW mappings. */
public static void main(String[] args) {
int font, ord, count;
count = 0;
for (font = 0; font < 5; font++) {
for (ord = 32; ord < 255; ord++) {
if (mapTMtoTMW(font, ord, 0) != null) {
count++;
}
}
System.out.println("Found " + count + " TM->TMW mappings (thus far).");
}
count = 0;
for (font = 0; font < 10; font++) {
for (ord = 32; ord < 127; ord++) {
if (mapTMWtoTM(font, ord, 0) != null) {
count++;
}
}
System.out.println("Found " + count + " TMW->TM mappings (thus far).");
}
System.out.println("TMWtoTM: ");
for (font = 0; font < 10; font++) {
for (ord = 32; ord < 127; ord++) {
DuffCode dc;
if ((dc = mapTMWtoTM(font, ord, 0)) != null) {
System.out.println(dc.getCharNum() + " "
+ (dc.getFontNum()-1) + " "
+ font + " "
+ ord);
}
}
}
System.out.println("TMtoTMW: (use sort -g -k 3 -k 4): ");
for (font = 0; font < 5; font++) {
for (ord = 32; ord < 255; ord++) {
DuffCode dc;
if ((dc = mapTMtoTMW(font, ord, 0)) != null) {
System.out.println(ord + " " + font + " "
+ (dc.getFontNum()-1) + " "
+ dc.getCharNum());
}
}
}
}
/** A horrible kludge. A kludge is needed because javax.swing.rtf is
quite busted.
<p>If you open up a file that MS Word has saved (not TibetDoc, I
don't think), it will appear, thanks to Java's bugs, to have weird
RTF where you see TibetanMachine.8211 etc. The highest possible
glyph value should be 255, but that's not what appears. The bug,
precisely, is that the RTF <code>\endash</code> is not treated
like the RTF <code>\u0150</code>, as it should be, but is instead
turned into something akin to <code>\u8211</code>. This is Java's
fault, not MS Word's. This happens for \bullet, \emdash, \endash,
\lquote, \rquote, \ldblquote, and \rdblquote.
@return non-null if (font, code) identify an oddball we know.
*/
private static DuffCode getOtherUnusualTMtoTMW(int font, int code) {
if (code > 254) {
switch (code) {
case 8211: // \endash, following number assumes this is a
// Windows or OS/2 RTF file, not a Mac RTF file:
return TMtoTMW[font][150 - 32];
case 8212: // \emdash, following number assumes this is a
// Windows or OS/2 RTF file, not a Mac RTF file:
return TMtoTMW[font][151 - 32];
case 8216: // \lquote, following number assumes this is a
// Windows or OS/2 RTF file, not a Mac RTF file:
return TMtoTMW[font][145 - 32];
case 8217: // \rquote, following number assumes this is a
// Windows or OS/2 RTF file, not a Mac RTF file:
return TMtoTMW[font][146 - 32];
case 8220: // \ldblquote, following number assumes this is a
// Windows or OS/2 RTF file, not a Mac RTF file:
return TMtoTMW[font][147 - 32];
case 8221: // \rdblquote, following number assumes this is a
// Windows or OS/2 RTF file, not a Mac RTF file:
return TMtoTMW[font][148 - 32];
case 8226: // \bullet, following number assumes this is a
// Windows or OS/2 RTF file, not a Mac RTF file:
return TMtoTMW[font][149 - 32];
default:
return null;
}
} else {
return null;
}
}
/** A horrible kludge. A kludge is needed because javax.swing.rtf is
quite busted. As you'll see below though, this kludge does not
suffice.
<p>If you open up a file that TibetDoc has saved, it will appear,
thanks to Java's bugs, to have weird RTF where you see
TibetanMachine.8225 etc. The highest possible glyph value should
be 255, but that's not what appears. The bug, precisely, is that
the RTF <code>\'9c</code> is not treated like the RTF
<code>\u0156</code>, as it should be, but is instead turned into
something akin to <code>\u0347</code>. This is Java's fault, not
TibetDoc's. I think it happens for glyphs that are like \'8X and
\'9X, not for \'7X, \'aX and the rest. Thus, there are 32 guys to
worry about, and 158, \'9e, and 142, \'8e, are not used by any TM
fonts, leaving us with 30 to worry about. Unfortunately, 145
(\'91), 147 (\'93), 148 (\'94), 150 (\'96), 151 (\'97), and 152
(\'98) simply DISAPPEAR from the input document.
@return non-null if (font, code) identify an oddball we know.
@deprecated This list is thought to be as many of the 30 as are
possible to get. But we cannot give you (char)145, etc. ever
because they are simply NOT THERE. So if you are using this
method, you are LOSING INFORMATION. Do not use this method. */
private static DuffCode getUnusualTMtoTMW(int font, int code) {
// FIXME: don't use this! Do a search and replace through the RTF
// file instead.
if (code > 254) {
switch (code) {
case 346: // 0=ga-wazur
return TMtoTMW[font][140 - 32];
case 347: // 0=reduced-height ha
return TMtoTMW[font][156 - 32];
case 352: // 1=dz-wazur, 0=k-wazur
return TMtoTMW[font][138 - 32];
case 356: // 0=ca-wazur
return TMtoTMW[font][141 - 32];
case 357: // 2=b-t
return TMtoTMW[font][157 - 32];
case 353: // 0=d-r-w
return TMtoTMW[font][154 - 32];
case 377: // 0=t-w
return TMtoTMW[font][143 - 32];
case 378: // 1=reverse-ta--reverse-ta
return TMtoTMW[font][159 - 32];
case 381: // unused, here for completeness
return TMtoTMW[font][142 - 32];
case 382: // unused, here for completeness
return TMtoTMW[font][158 - 32];
case 402: // 1=dz-ny 2=n-r 3=h-y
return TMtoTMW[font][131 - 32];
case 710: // 0=s-b-r
return TMtoTMW[font][136 - 32];
case 1026: // 0=s-g-y
return TMtoTMW[font][128 - 32];
case 1027: // 0=s-p-y
return TMtoTMW[font][129 - 32];
case 1106: // 0=d-w
return TMtoTMW[font][144 - 32];
case 8117: // 0=tsh-w
return TMtoTMW[font][146 - 32];
case 8126: // 0=r-w
return TMtoTMW[font][149 - 32];
case 8218: // 0=s-b-y 2=n-y
return TMtoTMW[font][130 - 32];
case 8222: // 0=s-k-r
return TMtoTMW[font][132 - 32];
case 8224: // 0=s-n-r
return TMtoTMW[font][134 - 32];
case 8225: // 0=s-p-r
return TMtoTMW[font][135 - 32];
case 8230: // 0=s-g-r
return TMtoTMW[font][133 - 32];
case 8240: // 0=s-m-r 1=dz-r
return TMtoTMW[font][137 - 32];
case 8249: // 0=kh-wazur 1=dz-h
return TMtoTMW[font][139 - 32];
case 8250: // 0=ph-y-wazur
return TMtoTMW[font][155 - 32];
case 8482: // 0=g-r-w
return TMtoTMW[font][153 - 32];
default:
return null;
}
} else {
return null;
}
}
private static final String Unicode_cr = "\r";
private static final String Unicode_lf = "\n";
private static final String Unicode_tab = "\t";
private static final DuffCode[] tmwFor0F00
= new DuffCode[] { new DuffCode(1, (char)63), new DuffCode(8, (char)102) };
private static final DuffCode[] tmwFor0F02
= new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(8, (char)92) };
private static final DuffCode[] tmwFor0F03
= new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(1, (char)105) };
private static final DuffCode[] tmwFor0F0E
= new DuffCode[] { new DuffCode(1, (char)107), new DuffCode(1, (char)107) };
// for 0F40, use the full-height, not the reduced-height, form
private static final DuffCode[] tmwFor0F40
= new DuffCode[] { new DuffCode(1, (char)92) };
private static final DuffCode[] tmwFor0F42
= new DuffCode[] { new DuffCode(1, (char)93) };
private static final DuffCode[] tmwFor0F49
= new DuffCode[] { new DuffCode(1, (char)94) };
private static final DuffCode[] tmwFor0F4F
= new DuffCode[] { new DuffCode(1, (char)95) };
private static final DuffCode[] tmwFor0F51
= new DuffCode[] { new DuffCode(1, (char)96) };
private static final DuffCode[] tmwFor0F53
= new DuffCode[] { new DuffCode(1, (char)97) };
private static final DuffCode[] tmwFor0F5E
= new DuffCode[] { new DuffCode(1, (char)98) };
private static final DuffCode[] tmwFor0F62
= new DuffCode[] { new DuffCode(8, (char)66) }; // not the full-form, use \u0F6A for that...
private static final DuffCode[] tmwFor0F64
= new DuffCode[] { new DuffCode(1, (char)99) };
private static final DuffCode[] tmwFor0F67
= new DuffCode[] { new DuffCode(1, (char)100) };
private static final DuffCode[] tmwFor0F6A
= new DuffCode[] { new DuffCode(1, (char)58) };
private static final DuffCode[] tmwFor0F73
= new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(1, (char)109) };
private static final DuffCode[] tmwFor0F75
= new DuffCode[] { new DuffCode(10, (char)126) };
private static final DuffCode[] tmwFor0F76
= new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F77
= new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F78
= new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F79
= new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F7E
= new DuffCode[] { new DuffCode(8, (char)91) }; // the one that lines up better -- i.e., not (8, (char)90)
private static final DuffCode[] tmwFor0F81
= new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
/** Returns an array of one, two, three, or four DuffCodes that
together represent the Tibetan Unicode character <i>ch</i>.
Returns null if there is no mapping for <i>ch</i>. For
certain codepoints, multiple TMW glyphs are appropriate, and
we return an arbitrary one. */
public static DuffCode[] mapUnicodeToTMW(char ch) {
// FIXME WARN WHENEVER AN ESCAPE IS USED FOR: f71, f72, f73, f74, f75, f76, f77, f78, f79, f7a, f7c, f81
// For U+0F71, U+0F72, U+0F74, U+0F75, U+0F7A, and U+0F7C,
// you'll get one of the possible TMW glyphs, maybe not the
// one that is most beautiful.
if ('\u0F00' == ch) {
return tmwFor0F00;
} else if ('\u0F02' == ch) {
return tmwFor0F02;
} else if ('\u0F03' == ch) {
return tmwFor0F03;
} else if ('\u0F0E' == ch) {
return tmwFor0F0E;
} else if ('\u0F40' == ch) {
return tmwFor0F40;
} else if ('\u0F42' == ch) {
return tmwFor0F42;
} else if ('\u0F49' == ch) {
return tmwFor0F49;
} else if ('\u0F4F' == ch) {
return tmwFor0F4F;
} else if ('\u0F51' == ch) {
return tmwFor0F51;
} else if ('\u0F53' == ch) {
return tmwFor0F53;
} else if ('\u0F5E' == ch) {
return tmwFor0F5E;
} else if ('\u0F62' == ch) {
return tmwFor0F62;
} else if ('\u0F64' == ch) {
return tmwFor0F64;
} else if ('\u0F67' == ch) {
return tmwFor0F67;
} else if ('\u0F6A' == ch) {
return tmwFor0F6A;
} else if ('\u0F73' == ch) {
return tmwFor0F73;
} else if ('\u0F75' == ch) {
return tmwFor0F75;
} else if ('\u0F76' == ch) {
return tmwFor0F76;
} else if ('\u0F77' == ch) {
return tmwFor0F77;
} else if ('\u0F78' == ch) {
return tmwFor0F78;
} else if ('\u0F79' == ch) {
return tmwFor0F79;
} else if ('\u0F7E' == ch) {
return tmwFor0F7E;
} else if ('\u0F81' == ch) {
return tmwFor0F81;
} else {
if (ch >= '\u0F00' && ch <= '\u0FFF') {
DuffCode[] x = UnicodeToTMW[ch - '\u0F00'];
if (null != x[0]) return x;
} else if (ch >= '\uF021' && ch <= '\uF0FF') {
DuffCode[] x = NonUnicodeToTMW[ch - '\uF000'];
if (null != x[0]) return x;
}
return null;
}
}
/** Returns the sequence of Unicode corresponding to the given
TibetanMachineWeb font
(0=TibetanMachineWeb,1=TibetanMachineWeb1,...) and
character(32-127).
Null is returned for an existing TibetanMachineWeb glyph if and
only if that glyph has no corresponding Unicode mapping. Null is
returned if the input isn't valid.
Only a few control characters are supported: '\r' (carriage
return), '\n' (line feed), and '\t' (tab).
*/
public static String mapTMWtoUnicode(int font, int ordinal) {
if (font < 0 || font > 9)
return null;
if (ordinal > 127)
return null;
if (ordinal < 32) {
if (ordinal == (int)'\r')
return Unicode_cr;
else if (ordinal == (int)'\n')
return Unicode_lf;
else if (ordinal == (int)'\t')
return Unicode_tab;
else {
// for robustness, just return a String consisting of the
// character which has the ordinal 'ordinal'.
ThdlDebug.noteIffyCode();
return null;
}
}
return TMWtoUnicode[font][ordinal-32];
}
/**
* Gets the TibetanMachine font number for this font name.
* @param name a font name
* @return between 1 and 5 if the font is one
* of the TibetanMachine fonts, otherwise 0 */
public static int getTMFontNumber(String name) {
String internedName = name.intern();
for (int i=1; i<tmFontNames.length; i++) {
if (internedName == tmFontNames[i])
return i;
}
return 0;
}
/**
* Gets the TibetanMachineWeb font number for this font name.
* @param name a font name
* @return between 1 and 10 if the font is one
* of the TibetanMachineWeb fonts, otherwise 0
*/
public static int getTMWFontNumber(String name) {
String internedName = name.intern();
for (int i=1; i<tmwFontNames.length; i++) {
// Thanks to interning, we can use == rather than .equals().
if (internedName == tmwFontNames[i])
return i;
}
return 0;
}
/**
* Gets the hash key associated with this glyph.
* @param font a TibetanMachineWeb font number
* @param code an ASCII character code minus 32
* @return the hashKey corresponding to the character at font, code;
* see {@link #getGlyph(String)} to learn about hash keys.
*/
public static String getHashKeyForGlyph(int font, int code) {
code = code - 32;
return toHashKey[font][code];
}
/**
* Gets the hash key associated with this glyph, or null if there is
* none (probably because this glyph has no THDL Extended Wylie
* transcription).
* @param dc a DuffCode denoting a TibetanMachineWeb glyph
* @return the hashKey corresponding to the character at dc; see {@link
* #getGlyph(String)} to learn about hash keys. */
public static String getHashKeyForGlyph(DuffCode dc) {
int font = dc.getFontNum();
int code = dc.getCharNum()-32;
if (code < 0) return null;
return toHashKey[font][code];
}
/**
* Gets the Extended Wylie for a given hash key.
* The hash keys in charList and so on may include separating
* characters. For example, Wylie 'sgr' has the hash key 's-g-r'.
* This method takes a hash key and converts it its correct
* Wylie value, and therefore is useful in conversions from
* TibetanMachineWeb to Wylie.
* @param hashKey the hash key for a glyph; see {@link
* #getGlyph(String)} to learn about hash keys.
* @return the Wylie value of that hash key
*/
public static String wylieForGlyph(String hashKey) {
if (hashKey.indexOf(WYLIE_SANSKRIT_STACKING_KEY) > -1)
return hashKey; //because '+' remains part of Extended Wylie for Sanskrit stacks
if (hashKey.charAt(0) == '-')
return hashKey; //because must be '-i' or '-I' vowels
StringTokenizer st = new StringTokenizer(hashKey, "-");
StringBuffer sb = new StringBuffer();
while (st.hasMoreTokens())
sb.append(st.nextToken());
return sb.toString();
}
/** Returns the ACIP transliteration for a glyph with hash key
hashKey, or returns null if there is none. */
private static String acipForGlyph(String hashKey) {
if (1 == hashKey.length()
// ~X is a special case because the EWTS is 2 characters in
// length
|| "~X".equals(hashKey)) // hard-coded EWTS value
return org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(hashKey);
else
// else we are not be able to use it because it's not smart
// about stacks (e.g., W+W)
return BasicTibetanTranscriptionConverter.wylieToAcip(hashKey);
}
/** Error that appears in a document when some TMW cannot be
* transcribed in THDL Extended Wylie. This error message is
* documented in www/htdocs/TMW_RTF_TO_THDL_WYLIE.html, so change
* them both when you change this. */
private static String getTMWToWylieErrorString(DuffCode dc) {
return "<<[[JSKAD_TMW_TO_WYLIE_ERROR_NO_SUCH_WYLIE: Cannot convert "
+ dc.toString(true)
+ " to THDL Extended Wylie. Please see the documentation for the TM or TMW font and transcribe this yourself.]]>>";
}
/** Error that appears in a document when some TMW cannot be
* transcribed in ACIP. This error message is documented in
* www/htdocs/TMW_or_TM_To_X_Converters.html, so change them both
* when you change this. */
static String getTMWToACIPErrorString(String it, String explanation) {
return "[# JSKAD_TMW_TO_ACIP_ERROR_NO_SUCH_ACIP: Cannot convert " + it + " to ACIP" + explanation + ". Please transcribe this yourself.]";
}
private static String getTMWToACIPErrorString(DuffCode dc, String explanation) {
return getTMWToACIPErrorString(dc.toString(true), explanation);
}
/**
* Gets the Extended Wylie value for this glyph.
* @param font the font of the TibetanMachineWeb
* glyph you want the Wylie of
* @param code the ordinal, minus 32, of the TibetanMachineWeb glyph
* you want the Wylie of
* @param noSuchWylie an array which will not be touched if this is
* successful; however, if there is no THDL Extended Wylie
* corresponding to the glyph, then noSuchWylie[0] will be set to true
* @return the Wylie value corresponding to the
* glyph denoted by font, code
*/
public static String getWylieForGlyph(int font, int code,
boolean noSuchWylie[]) {
String hashKey = getHashKeyForGlyph(font, code);
if (hashKey == null) {
noSuchWylie[0] = true;
return getTMWToWylieErrorString(new DuffCode(font, (char)code));
}
return wylieForGlyph(hashKey);
}
/**
* Gets the Extended Wylie value for this glyph.
* @param dc the DuffCode of the glyph you want
* the Wylie of
* @param noSuchWylie an array which will not be touched if this is
* successful; however, if there is no THDL Extended Wylie
* corresponding to the glyph, then noSuchWylie[0] will be set to true
* @return the Wylie value corresponding to the
* glyph denoted by dc */
public static String getWylieForGlyph(DuffCode dc, boolean noSuchWylie[]) {
String hashKey = getHashKeyForGlyph(dc);
if (hashKey == null) {
noSuchWylie[0] = true;
return getTMWToWylieErrorString(dc);
}
return wylieForGlyph(hashKey);
}
/** Returns ACIP transliteration or an error message stating why no
ACIP transliteration exists for one, two, or three TMW glyphs.
This gobbles up three TMW glyphs when and only when "#" is
returned; this gobbles up two TMW glyphs when and only when "@" is
returned; this gobbles up one TMW glyph otherwise. The number
gobbled is stored into howManyGlyphsUsed[0]. Always pass in as
many glyphs as possible.
<p>noSuchACIP[0] will be set (to true) if and only if there is no
ACIP representation; in that case, an error message is returned
rather than valid ACIP. dc2 and/or dc3 should be null if there is
no context information available (i.e., if dc1 or dc2 is the last
DuffCode being converted from TMW to ACIP). Otherwise, dc2 should
be the DuffCode following dc1 and dc3 should be the DuffCode
following dc2. If the ACIP (or error message) returned captures
both dc1 and the (nonnull) dc2 and the (nonnull) dc3, then
howManyGlyphsUsed[0] will be set to 3. If the ACIP (or error
message) returned captures both dc1 and the nonnull dc2, then
howManyGlyphsUsed[0] will be set to 2. Otherwise it will be set
to 1.
<p>This would be more straightforward if it were not the case that
a TMW-&gt;ACIP conversion requires context information in the case
of U+0F04 and U+0F05. Because it does, three DuffCodes, not one,
must be passed in whenever possible.
<p>We opt to treat a lone U+0F05 or U+0F04 as an error in
TMW-&gt;ACIP conversions rather than return the pseudo-ACIP
Unicode character escape. After all, the conversion is
TMW-&gt;ACIP, not TMW-&gt;pseudo-ACIP.
@return error message or valid ACIP, never pseudo-ACIP like
Unicode character escapes
@param dc1 the leftmost TMW DuffCode if dc2 is nonnull,
or the sole TMW DuffCode
@param dc2 null if dc1 is the last (rightmost) TMW DuffCode in the
sequence, or the TMW DuffCode following dc1. If you pass in dc1
equal to the TMW DuffCode for U+0F04, and dc2 null, then "*" will
be returned, so don't leave this out unless dc1 is the rightmost
TMW DuffCode.
@param dc3 null if dc2 is null or is the last (rightmost) TMW
DuffCode in the sequence, or the TMW DuffCode following dc2
otherwise.
@param noSuchACIP an array whose first element will be set to true
if and only if an error message is returned instead of valid ACIP;
the first element is never set to false, so nominally caller will
initialize the first element to false
@param howManyGlyphsUsed an array whose first element will be set
to 3 if valid ACIP that desribes dc1, dc2, and dc3 is returned, to
2 if valid ACIP that describes both dc1 and dc2 is returned, or to
1 otherwise */
public static String getACIPForGlyph(DuffCode dc1,
DuffCode dc2,
DuffCode dc3,
boolean noSuchACIP[],
int howManyGlyphsUsed[]) {
// DLC FIXME: TMW.53 is probably going to come out all wrong (VA
// vs. WA) from this function, but
// ACIPTraits.getACIPForEWTS(String) seems to come through... will
// it always?
String hashKey = getHashKeyForGlyph(dc1);
if (null != hashKey && hashKey.equals("@")) { // hard-coded EWTS value
String nextHashKey
= ((null == dc2)
? null : getHashKeyForGlyph(dc2));
if (null != nextHashKey && nextHashKey.equals("#")) { // hard-coded EWTS value
String nextNextHashKey
= ((null == dc3)
? null : getHashKeyForGlyph(dc3));
if (null != nextNextHashKey && nextNextHashKey.equals("#")) { // hard-coded EWTS value
howManyGlyphsUsed[0] = 3;
return "#"; // hard-coded ACIP value
}
howManyGlyphsUsed[0] = 2;
return "*"; // hard-coded ACIP value
} // else fall through
}
if (null != hashKey && hashKey.equals("@#")) { // hard-coded EWTS value
String nextHashKey
= ((null == dc2)
? null : getHashKeyForGlyph(dc2));
if (null != nextHashKey && nextHashKey.equals("#")) { // hard-coded EWTS value
howManyGlyphsUsed[0] = 2; // not 3
return "#"; // hard-coded ACIP value
}
howManyGlyphsUsed[0] = 1; // not 2
return "*"; // hard-coded ACIP value
}
howManyGlyphsUsed[0] = 1;
String ans = (hashKey == null) ? null : acipForGlyph(hashKey);
if (null == ans) {
noSuchACIP[0] = true;
if (null != hashKey && hashKey.startsWith("R+"))
return getTMWToACIPErrorString(dc1, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
return getTMWToACIPErrorString(dc1, "");
}
return ans;
}
/** This addresses bug 624133, "Input freezes after impossible
* character". Returns true iff s is a proper prefix of some
* legal input for this keyboard. In the extended Wylie
* keyboard, hasInputPrefix("S") is true because "Sh" is legal
* input. hasInputPrefix("Sh") is false because though "Sh" is
* legal input, ("Sh" + y) is not valid input for any non-empty
* String y. */
public static boolean hasInputPrefix(String s) {
if (!currentKeyboardIsExtendedWylie()) {
return keyboard.hasInputPrefix(s);
} else {
return validInputSequences.hasPrefix(s);
}
}
/**
* Says whether or not this glyph involves a Sanskrit stack.
* @param font the font of a TibetanMachineWeb glyph
* @param code the ASCII value of a TibetanMachineWeb glyph minus 32
* @return true if this glyph is a Sanskrit stack,
* false if not
*/
public static boolean isSanskritStack(int font, int code) {
String val = toHashKey[font][code];
if (val.indexOf(WYLIE_SANSKRIT_STACKING_KEY) == -1)
return false;
else
return true;
}
/**
* Says whether or not this glyph involves a Sanskrit stack.
* @param dc the DuffCode of a TibetanMachineWeb glyph
* @return true if this glyph is a Sanskrit stack,
* false if not
*/
public static boolean isSanskritStack(DuffCode dc) {
int font = dc.getFontNum();
int code = dc.getCharNum()-32;
if (isSanskritStack(font, code))
return true;
else
return false;
}
/**
* Says whether or not this glyph involves a Tibetan stack.
* @param font the font of a TibetanMachineWeb glyph
* @param code the ASCII value of a TibetanMachineWeb glyph minus 32
* @return true if this glyph is a Tibetan stack,
* false if not
*/
public static boolean isStack(int font, int code) {
String val = toHashKey[font][code];
if (val.indexOf('-') < 1) //we allow '-i' and '-I' in as vowels
return false;
else
return true;
}
/**
* Says whether or not this glyph involves a Tibetan stack.
* @param dc the DuffCode of a TibetanMachineWeb glyph
* @return true if this glyph is a Tibetan stack,
* false if not
*/
public static boolean isStack(DuffCode dc) {
int font = dc.getFontNum();
int code = dc.getCharNum()-32;
return isStack(font, code);
}
/**
* Gets the hash with information about each character and stack.
* @return a hash containing a key for each
* entity defined in Wylie, whose object is the
* DuffCode for that key
*/
public static Map getTibHash() {
return tibHash;
}
/**
* Gets the hash for characters that require special bindus.
* @return a hash whose keys are all vowel glyphs (DuffCodes)
* that require a special bindu, and whose objects
* are the vowel+bindu glyph (DuffCode) corresponding to each
* such vowel glyph
*/
public static Map getBinduMap() {
return binduMap;
}
/**
* Returns true iff the keyboard has a disambiguating key.
* @return true if the installed keyboard has a disambiguating key,
* false if not
* @see TibetanKeyboard */
public static boolean hasDisambiguatingKey() {
return hasDisambiguatingKey;
}
/**
* Gets the disambiguating key.
* @return the disambiguating key for the installed
* keyboard, or ' ' if there is no such key
* @see TibetanKeyboard
*/
public static char getDisambiguatingKey() {
return disambiguating_key;
}
/**
* Returns true iff the keyboard has a Sanksrit stacking key.
* @return true if a stacking key is required to type Sanskrit stacks,
* false if not
* @see TibetanKeyboard */
public static boolean hasSanskritStackingKey() {
return hasSanskritStackingKey;
}
/**
* Returns true iff the keyboard has a Tibetan stacking key.
* @return true if a stacking key is required to type Tibetan stacks,
* false if not
* @see TibetanKeyboard */
public static boolean hasTibetanStackingKey() {
return hasTibetanStackingKey;
}
/**
* Returns true iff stacking is medial.
* @return true if the stacking key is medial, false if not, or if
* there is no stacking key
* @see TibetanKeyboard */
public static boolean isStackingMedial() {
return isStackingMedial;
}
/**
* Gets the stacking key.
* @return the stacking key, or ' ' if there
* isn't one
* @see TibetanKeyboard
*/
public static char getStackingKey() {
return stacking_key;
}
/**
* Returns true iff achen is required before vowels.
* @return true if you have to type achen first before you can get a
* vowel with achen, false if you can just type the vowel by itself (as
* in Wylie)
* @see TibetanKeyboard */
public static boolean isAChenRequiredBeforeVowel() {
return isAChenRequiredBeforeVowel;
}
/**
* Returns true iff achung is treated as a consonant.
* @return true if a-chung is considered a consonant for the purposes
* of stacking, false if not (as in Wylie)
* @see TibetanKeyboard */
public static boolean isAChungConsonant() {
return isAChungConsonant;
}
/**
* Returns true iff there is a key for the invisible 'a' vowel in this
* keyboard.
* @return true if the installed keyboard has a dummy a vowel, false if
* not
* @see TibetanKeyboard */
public static boolean hasAVowel() {
return hasAVowel;
}
/**
* Gets the invisible 'a' vowel.
* @return the dummy 'a'-vowel for the installed
* keyboard, or "" if there is no such vowel
* @see TibetanKeyboard
*/
public static String getAVowel() {
return aVowel;
}
/**
* Returns true iff this glyph is a top (superscript) vowel.
* @param dc a DuffCode representing a TibetanMachineWeb glyph
* @return true if the glyph is a top-hanging (superscript) vowel (i,
* u, e, o, ai, or ao) and false if not */
public static boolean isTopVowel(DuffCode dc) {
String wylie
= getWylieForGlyph(dc,
TibTextUtils.weDoNotCareIfThereIsCorrespondingWylieOrNot);
if (top_vowels.contains(wylie))
return true;
return false;
}
/** Returns true if and only if ch, which is an ASCII character
that you can think of as an arbitrary index into one of the
Tibetan fonts, is a character that is appropriate for ending a
line of Tibetan. <code>'-'</code>, for example, represents
the tsheg (the little dot after a syllable) in (FIXME: Edward,
is this true?) all of the TMW fonts. Thus, this would return
true for <code>'-'</code>.
Note that ch is <b>not</b> the Wylie transliteration; it is an
arbitrary character (well, not quite, since ' ', '\t', '\n' et
cetera seem to have been wisely chosen to represent Tibetan
whitespace, but pretty arbitrary). If you open up MS Word,
select TibetanMachineWeb1, and type a hyphen,
i.e. <code>'-'</code>, you'll see a tsheg appear. If you open
Jskad and type a hyphen, you won't see a tsheg.
@param ch the ASCII character "index" into the TMW font
@return true iff this is a tsheg or whitespace or the like */
public static boolean isTMWFontCharBreakable(char ch) {
// DLC FIXME: treat whitespace differently than you do
// punctuation. And treat "/ka nga/", Tibetan verse,
// specially in the caller of this method.
if (false) {
//<?Input:Punctuation?>
int ord = (int)ch;
// FIXME: why did 94 appear twice in tibwn.ini's punctuation section?
if (32 == ord) return true;
if (45 == ord) return true;
if (107 == ord) return true;
if (103 == ord) return true;
if (104 == ord) return true;
if (105 == ord) return true;
if (43 == ord) return true;
if (40 == ord) return true;
if (41 == ord) return true;
if (38 == ord) return true;
if (39 == ord) return true;
if (93 == ord) return true;
if (94 == ord) return true;
if (92 == ord) return true;
if (91 == ord) return true;
} // DLC FIXME
return ('-' == ch /* FIXME: this is the tsheg (i.e., the Wylie is ' '), but we have no constant for it. */
|| ' ' == ch /* FIXME: this is space (i.e., the Wylie is '_'), but we have no constant for it. */
|| '\t' == ch /* FIXME: this is some sort of whitespace */
|| '\n' == ch /* FIXME: this is some sort of whitespace */
|| '/' == ch /* a shad */
);
// FIXME: am I missing anything? tabs etc.?
}
}
// FIXME MAKE AUTOMATED TEST: BDE vs. B+DE -- TMW->ACIP should
// give B+DE to be very friendly to machines.