/* // give B+DE to be very friendly to machines.The contents of this file are subject to the THDL Open Community License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License on the THDL web site (http://www.thdl.org/). Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific terms governing rights and limitations under the License. The Initial Developer of this software is the Tibetan and Himalayan Digital Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. All Rights Reserved. Contributor(s): ______________________________________. */ package org.thdl.tib.text; import java.awt.Font; import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.net.URL; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; import java.util.StringTokenizer; import javax.swing.text.SimpleAttributeSet; import javax.swing.text.StyleConstants; import org.thdl.tib.text.tshegbar.UnicodeUtils; import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; import org.thdl.util.Trie; /** * Interfaces between Extended Wylie and the TibetanMachineWeb fonts. * To do this this must first read the code table, which lives in * "tibwn.ini", and which must be found in the same directory as this * class. Note that WylieWord has its own copy of this file, so edit * both or neither. * *
In addition, this class optionally loads the TibetanMachineWeb
* fonts manually via {@link #readInTMWFontFiles()}. When we do that,
* it means that users don't have to install the fonts on their
* systems, so installation of Jskad becomes easier.
* @author Edward Garrett, Tibetan and Himalayan Digital Library
* @author David Chandler
*/
public class TibetanMachineWeb implements THDLWylieConstants {
/** This addresses bug 624133, "Input freezes after impossible
* character". The input sequences that are valid in Extended
* Wylie. For example, "Sh" will be in this container, but "S"
* will not be. */
private static Trie validInputSequences = new Trie();
/** needed because a Trie cannot have a null value associated with
* a key */
private final static String anyOldObjectWillDo
= "this placeholder is useful for debugging; we need a nonnull Object anyway";
private static TibetanKeyboard keyboard = null;
private static Set charSet = null;
private static Set tibSet = null;
private static Set sanskritStackSet = null;
private static Set numberSet = null;
private static Set vowelSet = null;
private static int maxEwtsVowelLength = -1;
private static Set puncSet = null;
private static Set topSet = null;
private static Set leftSet = null;
private static Set rightSet = null;
private static Set farRightSet = null;
private static Map tibHash = new HashMap();
private static Map binduMap = new HashMap();
private static String[][] toHashKey = new String[11][95]; //note: toHashKey[0][..] is not used
private static DuffCode[][] TMtoTMW = new DuffCode[5][255-32]; // ordinal 255 doesn't occur in TM
private static DuffCode[][] TMWtoTM = new DuffCode[10][127-32]; // ordinal 127 doesn't occur in TMW
private static String[][] TMWtoUnicode = new String[10][127-32]; // ordinal 127 doesn't occur in TMW
/** For mapping single codepoints U+0F00..U+0FFF to TMW. This
won't handle 0F00, 0F02, 0F03, or 0F0E, which are made by
using multiple glyphs from TMW, but it handles all the rest.
It handles U+0F90-U+0FBC rather poorly, in that you have to
use special formatting to get those right (FIXME: warn
whenever they're used). */
private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
/** For mapping codepoints U+F021..U+F0FF to TMW. */
private static DuffCode[][] NonUnicodeToTMW = new DuffCode[256][1];
private static String fileName = "tibwn.ini";
private static final String DELIMITER = "~";
/** vowels that appear over the glyph: */
private static Set top_vowels;
/** the font we use when we convert TMW->Unicode: */
private static SimpleAttributeSet defaultUnicodeFontAttributeSet = null;
/** a way of encoding the choice of TibetanMachineWeb font from
that family of 10 fonts: */
private static SimpleAttributeSet[] webFontAttributeSet = new SimpleAttributeSet[11];
/** a way of encoding the choice of TibetanMachine font from
that family of 5 fonts: */
private static SimpleAttributeSet[] normFontAttributeSet = new SimpleAttributeSet[6];
private static boolean hasDisambiguatingKey; //to disambiguate gy and g.y=
private static char disambiguating_key;
private static boolean hasSanskritStackingKey; //for stacking Sanskrit
private static boolean hasTibetanStackingKey; //for stacking Tibetan
private static boolean isStackingMedial; //ie g+y, not +gy
private static char stacking_key;
private static boolean isAChenRequiredBeforeVowel;
private static boolean isAChungConsonant;
private static boolean hasAVowel;
private static String aVowel;
// We use .intern() explicitly here so the code is easier to
// understand, but all string literals are interned.
public static final String[] tmFontNames = {
null,
"TibetanMachine".intern(),
"TibetanMachineSkt1".intern(),
"TibetanMachineSkt2".intern(),
"TibetanMachineSkt3".intern(),
"TibetanMachineSkt4".intern()
};
public static final String[] tmwFontNames = {
null,
"TibetanMachineWeb".intern(),
"TibetanMachineWeb1".intern(),
"TibetanMachineWeb2".intern(),
"TibetanMachineWeb3".intern(),
"TibetanMachineWeb4".intern(),
"TibetanMachineWeb5".intern(),
"TibetanMachineWeb6".intern(),
"TibetanMachineWeb7".intern(),
"TibetanMachineWeb8".intern(),
"TibetanMachineWeb9".intern()
};
/**
* represents where in an array of DuffCodes you
* find the TibetanMachine equivalence of a glyph
*/
public static final int TM = 0;
/**
* represents where in an array of DuffCodes you
* find the reduced character equivalent of a TMW glyph
*/
public static final int REDUCED_C = 1;
/**
* represents where in an array of DuffCodes you
* find the TibetanMachineWeb glyph
*/
public static final int TMW = 2;
/**
* represents where in an array of DuffCodes you
* find the gigu value for a given glyph
*/
public static final int VOWEL_i = 3;
/**
* represents where in an array of DuffCodes you
* find the zhebju value for a given glyph
*/
public static final int VOWEL_u = 4;
/**
* represents where in an array of DuffCodes you
* find the drengbu value for a given glyph
*/
public static final int VOWEL_e = 5;
/**
* represents where in an array of DuffCodes you
* find the naro value for a given glyph
*/
public static final int VOWEL_o = 6;
/**
* represents where in an array of DuffCodes you
* find the achung value for a given glyph
*/
public static final int VOWEL_A = 7;
/**
* represents where in an array of DuffCodes you
* find the achung + zhebju value for a given glyph
*/
public static final int VOWEL_U = 8;
/**
* represents where in an array of DuffCodes you
* find the Unicode equivalence of a given glyph
*/
public static final int UNICODE = 9;
/**
* represents where in an array of DuffCodes you
* find the half height equivalence of a given glyph
*/
public static final int HALF_C = 10;
// NOTE WELL: if you delete from tibetanConsonants,
// otherConsonants, numbers, vowels, or others, you'll change the
// way Jskad's Extended Wylie keyboard works, yes, but you'll also
// change TMW->Wylie.
/** comma-delimited list of supported Tibetan consonants: */
private static final String tibetanConsonants
= "k,kh,g,ng,c,ch,j,ny,t,th,d,n,p,ph,b,m,ts,tsh,dz,w,zh,z,',y,r,l,sh,s,h,a";
/** comma-delimited list of supported non-Tibetan consonants, such
* as Sanskrit consonants: */
private static final String otherConsonants // va and fa are treated pretty-much like Sanskrit. // TODO(DLC)[EWTS->Tibetan]: now are v and f in EWTS?
= "T,Th,D,N,Sh,v,f";
/** comma-delimited list of supported numbers (superscribed,
subscribed, normal, half-numerals): */
private static final String numbers
= "0,1,2,3,4,5,6,7,8,9";
/** comma-delimited list of supported punctuation and
miscellaneous characters: */
private static final String others
= "_, ,/,|,!,:,;,@,#,$,%,(,),H,M,&,@#,?,=,{,},*,~X,X"; // FIXME: not yet supporting all these...
/** comma-delimited list of supported vowels: */
private static final String vowels
= "a,i,u,e,o,I,U,ai,au,A,-i,-I";
/** comma-delimited list of head letters (superscribed letters) */
private static final String tops = "r,s,l";
/** comma-delimited list of prefixes */
private static final String lefts = "g,d,b,m,'";
/** comma-delimited list of suffixes */
private static final String rights = "g,ng,d,n,b,m,r,l,s,',T";
/** comma-delimited list of postsuffixes. nga was here in the
* past, according to Edward, to handle cases like ya'ng. pa'am
* wasn't considered, but had it been, ma probably would've gone
* here too. We now handle 'am, 'ang, etc. specially, so
* this set is now just the postsuffixes. */
private static final String farrights = "d,s";
static {
    // Build the lookup tables first: readData() reads the code table
    // and, unless the options say to rely on system fonts, loads the
    // bundled font files.
    readData();

    /* Install the initial keyboard.  Per the original note, this
     * starts out as the Extended Wylie keyboard; the preferences
     * mechanism later switches it to the preferred keyboard. */
    setKeyboard(keyboard);
}
/** Loads the ten TibetanMachineWeb font files that are bundled as
 *  resources alongside this class, so that the user need not install
 *  them on their system.  The trade-off is a bigger JAR and some
 *  extra time at startup.  Loading stops at the first file that
 *  cannot be read.
 *  @return true if every font file loaded, false otherwise */
private static boolean readInTMWFontFiles() {
    /* Note the leading slashes on these paths: */
    final String[] fontResources = {
        "/Fonts/TibetanMachineWeb/timwn.ttf",
        "/Fonts/TibetanMachineWeb/timwn1.ttf",
        "/Fonts/TibetanMachineWeb/timwn2.ttf",
        "/Fonts/TibetanMachineWeb/timwn3.ttf",
        "/Fonts/TibetanMachineWeb/timwn4.ttf",
        "/Fonts/TibetanMachineWeb/timwn5.ttf",
        "/Fonts/TibetanMachineWeb/timwn6.ttf",
        "/Fonts/TibetanMachineWeb/timwn7.ttf",
        "/Fonts/TibetanMachineWeb/timwn8.ttf",
        "/Fonts/TibetanMachineWeb/timwn9.ttf"
    };
    for (int i = 0; i < fontResources.length; i++) {
        if (!readInFontFile(fontResources[i])) {
            return false;
        }
    }
    return true;
}
/** Loads the five TibetanMachine font files that are bundled as
 *  resources alongside this class, so that the user need not install
 *  them on their system.  The trade-off is a bigger JAR and some
 *  extra time at startup.  Loading stops at the first file that
 *  cannot be read.
 *  @return true if every font file loaded, false otherwise */
private static boolean readInTMFontFiles() {
    /* Note the leading slashes on these paths: */
    final String[] fontResources = {
        "/Fonts/TibetanMachine/Timn.ttf",
        "/Fonts/TibetanMachine/Tims1.ttf",
        "/Fonts/TibetanMachine/Tims2.ttf",
        "/Fonts/TibetanMachine/Tims3.ttf",
        "/Fonts/TibetanMachine/Tims4.ttf"
    };
    for (int i = 0; i < fontResources.length; i++) {
        if (!readInFontFile(fontResources[i])) {
            return false;
        }
    }
    return true;
}
/** If the font file at the given path is a resource associated with
 *  this class, that font file is loaded.  The resource stream is
 *  always closed before returning, whether or not loading succeeded.
 *  @param path a path within the JAR containing this class file
 *  @return true upon successful loading, false if the resource does
 *  not exist or could not be turned into a font */
private static boolean readInFontFile(String path) {
    // Note that the TM and TMW fonts do not have hanging
    // baselines. They have Roman baselines. Tony Duff said this
    // is subtly necessary and that only an OpenType font can
    // support baselines properly.
    InputStream is = null;
    try {
        is = TibetanMachineWeb.class.getResourceAsStream(path);
        if (null == is) {
            return false;
        }
        // NOTE(review): the returned Font is discarded; presumably
        // the callers rely on the JRE making the font available by
        // family name once it has been created -- confirm on the
        // JREs you target.
        Font.createFont(Font.TRUETYPE_FONT, is);
        return true;
    } catch( Exception e ) {
        e.printStackTrace();
        ThdlDebug.noteIffyCode();
        return false;
    } finally {
        // Font.createFont does not close the stream it is given, so
        // we must do it ourselves to avoid leaking it.
        if (null != is) {
            try {
                is.close();
            } catch (IOException ignored) {
                // Best effort: a failure to close does not change
                // whether the font was loaded.
            }
        }
    }
}
/** Returns the next token in st with the first occurrence of
    __TILDE__ replaced with ~.  Needed because the DELIMITER is ~.
    Appends the escaped token to sb iff an escape sequence
    occurred.
    @param st the tokenizer to pull the next token from
    @param sb receives the unescaped-to-tilde token, but only when the
    token actually contained the escape sequence
    @return the token, with the first __TILDE__ (if any) turned into ~ */
private static String getEscapedToken(StringTokenizer st,
                                      StringBuffer sb) {
    final String escape = "__TILDE__";
    String token = st.nextToken();
    int start = token.indexOf(escape);
    if (start < 0) {
        return token;
    }
    StringBuffer x = new StringBuffer(token);
    // StringBuffer.replace takes an END INDEX, not a length, so the
    // end must be computed relative to where the escape begins.
    x.replace(start, start + escape.length(), "~");
    String escaped = x.toString();
    sb.append(escaped);
    return escaped;
}
/**
* This method reads the data file ("tibwn.ini"), constructs
* the character, punctuation, and vowel lists, as well as
* performing other acts of initialization.
*/
private static void readData() {
if (!ThdlOptions.getBooleanOption("thdl.rely.on.system.tmw.fonts")) {
readInTMWFontFiles();
}
if (!ThdlOptions.getBooleanOption("thdl.rely.on.system.tm.fonts")) {
readInTMFontFiles();
}
defaultUnicodeFontAttributeSet = new SimpleAttributeSet();
StyleConstants.setFontFamily(defaultUnicodeFontAttributeSet,
"Tibetan Machine Uni");
webFontAttributeSet[0] = null;
for (int i=1; i If you open up a file that MS Word has saved (not TibetDoc, I
don't think), it will appear, thanks to Java's bugs, to have weird
RTF where you see TibetanMachine.8211 etc. The highest possible
glyph value should be 255, but that's not what appears. The bug,
precisely, is that the RTF If you open up a file that TibetDoc has saved, it will appear,
thanks to Java's bugs, to have weird RTF where you see
TibetanMachine.8225 etc. The highest possible glyph value should
be 255, but that's not what appears. The bug, precisely, is that
the RTF noSuchACIP[0] will be set (to true) if and only if there is no
ACIP representation; in that case, an error message is returned
rather than valid ACIP. dc2 and/or dc3 should be null if there is
no context information available (i.e., if dc1 or dc2 is the last
DuffCode being converted from TMW to ACIP). Otherwise, dc2 should
be the DuffCode following dc1 and dc3 should be the DuffCode
following dc2. If the ACIP (or error message) returned captures
both dc1 and the (nonnull) dc2 and the (nonnull) dc3, then
howManyGlyphsUsed[0] will be set to 3. If the ACIP (or error
message) returned captures both dc1 and the nonnull dc2, then
howManyGlyphsUsed[0] will be set to 2. Otherwise it will be set
to 1.
This would be more straightforward if it were not the case that
a TMW->ACIP conversion requires context information in the case
of U+0F04 and U+0F05. Because it does, three DuffCodes, not one,
must be passed in whenever possible.
We opt to treat a lone U+0F05 or U+0F04 as an error in
TMW->ACIP conversions rather than return the pseudo-ACIP
Unicode character escape. After all, the conversion is
TMW->ACIP, not TMW->pseudo-ACIP.
@return error message or valid ACIP, never pseudo-ACIP like
Unicode character escapes
@param dc1 the leftmost TMW DuffCode if dc2 is nonnull,
or the sole TMW DuffCode
@param dc2 null if dc1 is the last (rightmost) TMW DuffCode in the
sequence, or the TMW DuffCode following dc1. If you pass in dc1
equal to the TMW DuffCode for U+0F04, and dc2 null, then "*" will
be returned, so don't leave this out unless dc1 is the rightmost
TMW DuffCode.
@param dc3 null if dc2 is null or is the last (rightmost) TMW
DuffCode in the sequence, or the TMW DuffCode following dc2
otherwise.
@param noSuchACIP an array whose first element will be set to true
if and only if an error message is returned instead of valid ACIP;
the first element is never set to false, so nominally caller will
initialize the first element to false
@param howManyGlyphsUsed an array whose first element will be set
to 3 if valid ACIP that describes dc1, dc2, and dc3 is returned, to
2 if valid ACIP that describes both dc1 and dc2 is returned, or to
1 otherwise */
public static String getACIPForGlyph(DuffCode dc1,
                                     DuffCode dc2,
                                     DuffCode dc3,
                                     boolean noSuchACIP[],
                                     int howManyGlyphsUsed[]) {
    // DLC FIXME: TMW.53 is probably going to come out all wrong (VA
    // vs. WA) from this function, but
    // ACIPTraits.getACIPForEWTS(String) seems to come through... will
    // it always?
    final String firstKey = getHashKeyForGlyph(dc1);

    // The EWTS keys "@", "@#", and "#" and the ACIP answers "*" and
    // "#" below are hard-coded values.
    if ("@".equals(firstKey)) {
        final String secondKey
            = (dc2 != null) ? getHashKeyForGlyph(dc2) : null;
        if ("#".equals(secondKey)) {
            final String thirdKey
                = (dc3 != null) ? getHashKeyForGlyph(dc3) : null;
            if ("#".equals(thirdKey)) {
                // "@", "#", "#" are consumed together.
                howManyGlyphsUsed[0] = 3;
                return "#";
            }
            // "@", "#" are consumed together.
            howManyGlyphsUsed[0] = 2;
            return "*";
        }
        // A lone "@" is handled by the ordinary lookup below.
    } else if ("@#".equals(firstKey)) {
        final String secondKey
            = (dc2 != null) ? getHashKeyForGlyph(dc2) : null;
        if ("#".equals(secondKey)) {
            howManyGlyphsUsed[0] = 2; // not 3
            return "#";
        }
        howManyGlyphsUsed[0] = 1; // not 2
        return "*";
    }

    // Ordinary case: exactly one glyph is described by the answer.
    howManyGlyphsUsed[0] = 1;
    final String acip = (firstKey != null) ? acipForGlyph(firstKey) : null;
    if (acip != null) {
        return acip;
    }

    // No ACIP exists; report that and return an error message instead.
    noSuchACIP[0] = true;
    final String reason;
    if (firstKey != null && firstKey.startsWith("R+")) {
        reason = " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)";
    } else {
        reason = "";
    }
    return getTMWToACIPErrorString(dc1, reason);
}
/** Tells whether s is a proper prefix of some legal input for the
 *  current keyboard.  This addresses bug 624133, "Input freezes
 *  after impossible character".  In the extended Wylie keyboard,
 *  hasInputPrefix("S") is true because "Sh" is legal input, while
 *  hasInputPrefix("Sh") is false: although "Sh" is itself legal
 *  input, ("Sh" + y) is not valid input for any non-empty String y.
 *  @param s the candidate prefix
 *  @return true iff s is a proper prefix of legal input */
public static boolean hasInputPrefix(String s) {
    if (currentKeyboardIsExtendedWylie()) {
        // Extended Wylie input sequences live in our own trie.
        return validInputSequences.hasPrefix(s);
    }
    // Any other keyboard answers for itself.
    return keyboard.hasInputPrefix(s);
}
/**
 * Tells whether the given glyph involves a Sanskrit stack, which is
 * decided by whether the glyph's hash key contains the Wylie
 * Sanskrit stacking key.
 * @param font the font of a TibetanMachineWeb glyph
 * @param code the ASCII value of a TibetanMachineWeb glyph minus 32
 * @return true if this glyph is a Sanskrit stack, false if not
 */
public static boolean isSanskritStack(int font, int code) {
    final String hashKey = toHashKey[font][code];
    return hashKey.indexOf(WYLIE_SANSKRIT_STACKING_KEY) != -1;
}
/**
 * Tells whether the given glyph involves a Sanskrit stack.  This is
 * a convenience overload that unpacks the DuffCode into a font number
 * and a character number offset by 32.
 * @param dc the DuffCode of a TibetanMachineWeb glyph
 * @return true if this glyph is a Sanskrit stack, false if not
 */
public static boolean isSanskritStack(DuffCode dc) {
    return isSanskritStack(dc.getFontNum(), dc.getCharNum() - 32);
}
/**
 * Tells whether the given glyph involves a Tibetan stack, which is
 * decided by whether the glyph's hash key contains a '-' anywhere
 * other than at its very start.
 * @param font the font of a TibetanMachineWeb glyph
 * @param code the ASCII value of a TibetanMachineWeb glyph minus 32
 * @return true if this glyph is a Tibetan stack, false if not
 */
public static boolean isStack(int font, int code) {
    final String hashKey = toHashKey[font][code];
    // A leading '-' does not count: we allow '-i' and '-I' in as vowels.
    return hashKey.indexOf('-') >= 1;
}
/**
 * Tells whether the given glyph involves a Tibetan stack.  This is a
 * convenience overload that unpacks the DuffCode into a font number
 * and a character number offset by 32.
 * @param dc the DuffCode of a TibetanMachineWeb glyph
 * @return true if this glyph is a Tibetan stack, false if not
 */
public static boolean isStack(DuffCode dc) {
    return isStack(dc.getFontNum(), dc.getCharNum() - 32);
}
/**
 * Returns the hash holding information about each character and
 * stack.  The map itself is returned, not a copy.
 * @return a hash containing a key for each entity defined in Wylie,
 * whose object is the DuffCode for that key
 */
public static Map getTibHash() {
    return tibHash;
}
/**
 * Returns the hash for characters that require special bindus.  The
 * map itself is returned, not a copy.
 * @return a hash whose keys are all vowel glyphs (DuffCodes) that
 * require a special bindu, and whose objects are the vowel+bindu
 * glyph (DuffCode) corresponding to each such vowel glyph
 */
public static Map getBinduMap() {
    return binduMap;
}
/**
 * Tells whether the installed keyboard has a disambiguating key.
 * @return true if the installed keyboard has a disambiguating key,
 * false if not
 * @see TibetanKeyboard */
public static boolean hasDisambiguatingKey() {
    return hasDisambiguatingKey;
}
/**
 * Returns the disambiguating key of the installed keyboard.
 * @return the disambiguating key for the installed keyboard, or ' '
 * if there is no such key
 * @see TibetanKeyboard
 */
public static char getDisambiguatingKey() {
    return disambiguating_key;
}
/**
 * Tells whether the installed keyboard has a Sanskrit stacking key.
 * @return true if a stacking key is required to type Sanskrit
 * stacks, false if not
 * @see TibetanKeyboard */
public static boolean hasSanskritStackingKey() {
    return hasSanskritStackingKey;
}
/**
 * Tells whether the installed keyboard has a Tibetan stacking key.
 * @return true if a stacking key is required to type Tibetan stacks,
 * false if not
 * @see TibetanKeyboard */
public static boolean hasTibetanStackingKey() {
    return hasTibetanStackingKey;
}
/**
 * Tells whether the stacking key is typed between the stacked
 * letters (as in g+y) rather than before them (as in +gy).
 * @return true if the stacking key is medial, false if not, or if
 * there is no stacking key
 * @see TibetanKeyboard */
public static boolean isStackingMedial() {
    return isStackingMedial;
}
/**
 * Returns the stacking key of the installed keyboard.
 * @return the stacking key, or ' ' if there isn't one
 * @see TibetanKeyboard
 */
public static char getStackingKey() {
    return stacking_key;
}
/**
 * Tells whether achen is required before vowels.
 * @return true if you have to type achen first before you can get a
 * vowel with achen, false if you can just type the vowel by itself
 * (as in Wylie)
 * @see TibetanKeyboard */
public static boolean isAChenRequiredBeforeVowel() {
    return isAChenRequiredBeforeVowel;
}
/**
 * Tells whether achung is treated as a consonant.
 * @return true if a-chung is considered a consonant for the purposes
 * of stacking, false if not (as in Wylie)
 * @see TibetanKeyboard */
public static boolean isAChungConsonant() {
    return isAChungConsonant;
}
/**
 * Tells whether the installed keyboard has a key for the invisible
 * 'a' vowel.
 * @return true if the installed keyboard has a dummy a vowel, false
 * if not
 * @see TibetanKeyboard */
public static boolean hasAVowel() {
    return hasAVowel;
}
/**
 * Returns the invisible 'a' vowel of the installed keyboard.
 * @return the dummy 'a'-vowel for the installed keyboard, or "" if
 * there is no such vowel
 * @see TibetanKeyboard
 */
public static String getAVowel() {
    return aVowel;
}
/**
 * Tells whether the given glyph is a top (superscript) vowel, i.e. a
 * vowel that appears over the glyph.  The glyph's Wylie is looked up
 * and tested for membership in the set of top vowels.
 * @param dc a DuffCode representing a TibetanMachineWeb glyph
 * @return true if the glyph is a top-hanging (superscript) vowel and
 * false if not */
public static boolean isTopVowel(DuffCode dc) {
    final String glyphWylie = getWylieForGlyph(
        dc, TibTextUtils.weDoNotCareIfThereIsCorrespondingWylieOrNot);
    return top_vowels.contains(glyphWylie);
}
/** Returns true if and only if ch, which is an ASCII character
that you can think of as an arbitrary index into one of the
Tibetan fonts, is a character that is appropriate for ending a
line of Tibetan. \endash
is not treated
like the RTF \u0150
, as it should be, but is instead
turned into something akin to \u8211
. This is Java's
fault, not MS Word's. This happens for \bullet, \emdash, \endash,
\lquote, \rquote, \ldblquote, and \rdblquote.
@return non-null if (font, code) identify an oddball we know.
*/
private static DuffCode getOtherUnusualTMtoTMW(int font, int code) {
    // Translate the oddball code into the ordinal it stands for.  Every
    // ordinal below assumes this is a Windows or OS/2 RTF file, not a
    // Mac RTF file.  Any code that is not listed (which includes every
    // code of 254 or less) is not an oddball we know.
    final int ordinal;
    switch (code) {
    case 8211: ordinal = 150; break; // \endash
    case 8212: ordinal = 151; break; // \emdash
    case 8216: ordinal = 145; break; // \lquote
    case 8217: ordinal = 146; break; // \rquote
    case 8220: ordinal = 147; break; // \ldblquote
    case 8221: ordinal = 148; break; // \rdblquote
    case 8226: ordinal = 149; break; // \bullet
    default:
        return null;
    }
    // The table is indexed by ordinal minus 32.
    return TMtoTMW[font][ordinal - 32];
}
/** A horrible kludge. A kludge is needed because javax.swing.rtf is
quite busted. As you'll see below though, this kludge does not
suffice.
\'9c
is not treated like the RTF
\u0156
, as it should be, but is instead turned into
something akin to \u0347
. This is Java's fault, not
TibetDoc's. I think it happens for glyphs that are like \'8X and
\'9X, not for \'7X, \'aX and the rest. Thus, there are 32 guys to
worry about, and 158, \'9e, and 142, \'8e, are not used by any TM
fonts, leaving us with 30 to worry about. Unfortunately, 145
(\'91), 147 (\'93), 148 (\'94), 150 (\'96), 151 (\'97), and 152
(\'98) simply DISAPPEAR from the input document.
@return non-null if (font, code) identify an oddball we know.
@deprecated This list is thought to be as many of the 30 as are
possible to get. But we cannot give you (char)145, etc. ever
because they are simply NOT THERE. So if you are using this
method, you are LOSING INFORMATION. Do not use this method. */
private static DuffCode getUnusualTMtoTMW(int font, int code) {
    // FIXME: don't use this! Do a search and replace through the RTF
    // file instead.

    // Translate the oddball code into the ordinal it stands for.  Any
    // code that is not listed (which includes every code of 254 or
    // less) is not an oddball we know.  The trailing notes give, per
    // font number, the glyph found at that ordinal.
    final int ordinal;
    switch (code) {
    case 346:  ordinal = 140; break; // 0=ga-wazur
    case 347:  ordinal = 156; break; // 0=reduced-height ha
    case 352:  ordinal = 138; break; // 1=dz-wazur, 0=k-wazur
    case 356:  ordinal = 141; break; // 0=ca-wazur
    case 357:  ordinal = 157; break; // 2=b-t
    case 353:  ordinal = 154; break; // 0=d-r-w
    case 377:  ordinal = 143; break; // 0=t-w
    case 378:  ordinal = 159; break; // 1=reverse-ta--reverse-ta
    case 381:  ordinal = 142; break; // unused, here for completeness
    case 382:  ordinal = 158; break; // unused, here for completeness
    case 402:  ordinal = 131; break; // 1=dz-ny 2=n-r 3=h-y
    case 710:  ordinal = 136; break; // 0=s-b-r
    case 1026: ordinal = 128; break; // 0=s-g-y
    case 1027: ordinal = 129; break; // 0=s-p-y
    case 1106: ordinal = 144; break; // 0=d-w
    case 8117: ordinal = 146; break; // 0=tsh-w
    case 8126: ordinal = 149; break; // 0=r-w
    case 8218: ordinal = 130; break; // 0=s-b-y 2=n-y
    case 8222: ordinal = 132; break; // 0=s-k-r
    case 8224: ordinal = 134; break; // 0=s-n-r
    case 8225: ordinal = 135; break; // 0=s-p-r
    case 8230: ordinal = 133; break; // 0=s-g-r
    case 8240: ordinal = 137; break; // 0=s-m-r 1=dz-r
    case 8249: ordinal = 139; break; // 0=kh-wazur 1=dz-h
    case 8250: ordinal = 155; break; // 0=ph-y-wazur
    case 8482: ordinal = 153; break; // 0=g-r-w
    default:
        return null;
    }
    // The table is indexed by ordinal minus 32.
    return TMtoTMW[font][ordinal - 32];
}
// The three control characters that mapTMWtoUnicode passes through
// for ordinals below 32:
private static final String Unicode_cr = "\r";
private static final String Unicode_lf = "\n";
private static final String Unicode_tab = "\t";
// Fixed answers handed out by mapUnicodeToTMW: tmwFor0FXX is the
// glyph sequence returned for the codepoint U+0FXX.  mapUnicodeToTMW
// returns these arrays themselves, not copies.
// NOTE(review): the DuffCode constructor arguments look like (font
// number, character) -- confirm against DuffCode.
private static final DuffCode[] tmwFor0F00
= new DuffCode[] { new DuffCode(1, (char)63), new DuffCode(8, (char)102) };
private static final DuffCode[] tmwFor0F02
= new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(8, (char)92) };
private static final DuffCode[] tmwFor0F03
= new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(1, (char)105) };
private static final DuffCode[] tmwFor0F0E
= new DuffCode[] { new DuffCode(1, (char)107), new DuffCode(1, (char)107) };
// for 0F40, use the full-height, not the reduced-height, form
private static final DuffCode[] tmwFor0F40
= new DuffCode[] { new DuffCode(1, (char)92) };
private static final DuffCode[] tmwFor0F42
= new DuffCode[] { new DuffCode(1, (char)93) };
private static final DuffCode[] tmwFor0F49
= new DuffCode[] { new DuffCode(1, (char)94) };
private static final DuffCode[] tmwFor0F4F
= new DuffCode[] { new DuffCode(1, (char)95) };
private static final DuffCode[] tmwFor0F51
= new DuffCode[] { new DuffCode(1, (char)96) };
private static final DuffCode[] tmwFor0F53
= new DuffCode[] { new DuffCode(1, (char)97) };
private static final DuffCode[] tmwFor0F5E
= new DuffCode[] { new DuffCode(1, (char)98) };
private static final DuffCode[] tmwFor0F62
= new DuffCode[] { new DuffCode(8, (char)66) }; // not the full-form, use \u0F6A for that...
private static final DuffCode[] tmwFor0F64
= new DuffCode[] { new DuffCode(1, (char)99) };
private static final DuffCode[] tmwFor0F67
= new DuffCode[] { new DuffCode(1, (char)100) };
private static final DuffCode[] tmwFor0F6A
= new DuffCode[] { new DuffCode(1, (char)58) };
private static final DuffCode[] tmwFor0F73
= new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(1, (char)109) };
private static final DuffCode[] tmwFor0F75
= new DuffCode[] { new DuffCode(10, (char)126) };
private static final DuffCode[] tmwFor0F76
= new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F77
= new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F78
= new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F79
= new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F7E
= new DuffCode[] { new DuffCode(8, (char)91) }; // the one that lines up better -- i.e., not (8, (char)90)
private static final DuffCode[] tmwFor0F81
= new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
/** Maps the Tibetan Unicode character ch to the one, two, three, or
    four DuffCodes that together represent it.  A fixed set of
    codepoints has hard-wired answers; every other codepoint is
    looked up in a table.  For certain codepoints, multiple TMW
    glyphs are appropriate, and we return an arbitrary one.
    @param ch the character to map
    @return the TMW glyphs for ch, or null if there is no mapping
    for ch */
public static DuffCode[] mapUnicodeToTMW(char ch) {
    // FIXME WARN WHENEVER AN ESCAPE IS USED FOR: f71, f72, f73, f74, f75, f76, f77, f78, f79, f7a, f7c, f81
    // For U+0F71, U+0F72, U+0F74, U+0F75, U+0F7A, and U+0F7C,
    // you'll get one of the possible TMW glyphs, maybe not the
    // one that is most beautiful.

    // Codepoints with hard-wired answers:
    switch (ch) {
    case '\u0F00': return tmwFor0F00;
    case '\u0F02': return tmwFor0F02;
    case '\u0F03': return tmwFor0F03;
    case '\u0F0E': return tmwFor0F0E;
    case '\u0F40': return tmwFor0F40;
    case '\u0F42': return tmwFor0F42;
    case '\u0F49': return tmwFor0F49;
    case '\u0F4F': return tmwFor0F4F;
    case '\u0F51': return tmwFor0F51;
    case '\u0F53': return tmwFor0F53;
    case '\u0F5E': return tmwFor0F5E;
    case '\u0F62': return tmwFor0F62;
    case '\u0F64': return tmwFor0F64;
    case '\u0F67': return tmwFor0F67;
    case '\u0F6A': return tmwFor0F6A;
    case '\u0F73': return tmwFor0F73;
    case '\u0F75': return tmwFor0F75;
    case '\u0F76': return tmwFor0F76;
    case '\u0F77': return tmwFor0F77;
    case '\u0F78': return tmwFor0F78;
    case '\u0F79': return tmwFor0F79;
    case '\u0F7E': return tmwFor0F7E;
    case '\u0F81': return tmwFor0F81;
    default:
        break;
    }

    // Everything else comes from a table; a row whose only slot is
    // empty means that there is no mapping.
    if (ch >= '\u0F00' && ch <= '\u0FFF') {
        final DuffCode[] row = UnicodeToTMW[ch - '\u0F00'];
        return (null != row[0]) ? row : null;
    }
    if (ch >= '\uF021' && ch <= '\uF0FF') {
        final DuffCode[] row = NonUnicodeToTMW[ch - '\uF000'];
        return (null != row[0]) ? row : null;
    }
    return null;
}
/** Returns the sequence of Unicode corresponding to the given
    TibetanMachineWeb font
    (0=TibetanMachineWeb,1=TibetanMachineWeb1,...) and
    character(32-126).
    Null is returned for an existing TibetanMachineWeb glyph if and
    only if that glyph has no corresponding Unicode mapping. Null is
    returned if the input isn't valid, which includes ordinal 127
    (it does not occur in TMW) and anything above it.
    Only a few control characters are supported: '\r' (carriage
    return), '\n' (line feed), and '\t' (tab).
    @param font the font index, 0 through 9
    @param ordinal the character's ordinal within that font
    @return the Unicode for the glyph, or null as described above
*/
public static String mapTMWtoUnicode(int font, int ordinal) {
    if (font < 0 || font > 9)
        return null;
    // Each row of the table has 127-32 entries, so the largest
    // ordinal it can answer for is 126; 127 must be rejected here
    // rather than be allowed to index one past the end of the row.
    if (ordinal >= 127)
        return null;
    if (ordinal < 32) {
        if (ordinal == (int)'\r')
            return Unicode_cr;
        else if (ordinal == (int)'\n')
            return Unicode_lf;
        else if (ordinal == (int)'\t')
            return Unicode_tab;
        else {
            // Any other control character (or a negative ordinal) is
            // unsupported: note it and report that there is no
            // mapping.
            ThdlDebug.noteIffyCode();
            return null;
        }
    }
    return TMWtoUnicode[font][ordinal-32];
}
/**
* Gets the TibetanMachine font number for this font name.
* @param name a font name
* @return between 1 and 5 if the font is one
* of the TibetanMachine fonts, otherwise 0 */
public static int getTMFontNumber(String name) {
String internedName = name.intern();
for (int i=1; i'-'
, for example, represents
the tsheg (the little dot after a syllable) in (FIXME: Edward,
is this true?) all of the TMW fonts. Thus, this would return
true for '-'
.
Note that ch is not the Wylie transliteration; it is an
arbitrary character (well, not quite, since ' ', '\t', '\n' et
cetera seem to have been wisely chosen to represent Tibetan
whitespace, but pretty arbitrary). If you open up MS Word,
select TibetanMachineWeb1, and type a hyphen,
i.e. '-'
, you'll see a tsheg appear. If you open
Jskad and type a hyphen, you won't see a tsheg.
@param ch the ASCII character "index" into the TMW font
@return true iff this is a tsheg or whitespace or the like */
public static boolean isTMWFontCharBreakable(char ch) {
    // DLC FIXME: treat whitespace differently than you do
    // punctuation. And treat "/ka nga/", Tibetan verse,
    // specially in the caller of this method.
    //
    // DLC FIXME: a wider, currently disabled, candidate list used the
    // ordinals 32, 45, 107, 103, 104, 105, 43, 40, 41, 38, 39, 93, 94,
    // 92, and 91.  (FIXME: why did 94 appear twice in tibwn.ini's
    // punctuation section?)
    switch (ch) {
    case '-':  // FIXME: this is the tsheg (i.e., the Wylie is ' '), but we have no constant for it.
    case ' ':  // FIXME: this is space (i.e., the Wylie is '_'), but we have no constant for it.
    case '\t': // FIXME: this is some sort of whitespace
    case '\n': // FIXME: this is some sort of whitespace
    case '/':  // a shad
        return true;
    default:
        // FIXME: am I missing anything? tabs etc.?
        return false;
    }
}
}
// FIXME MAKE AUTOMATED TEST: BDE vs. B+DE -- TMW->ACIP should
// give B+DE to be very friendly to machines.