ACIP->TMW and ACIP->Unicode now allow for Unicode escapes like K\u0F84. This means that the lack of support for ACIP's backslash, '\\', is mitigated because you can turn ACIP {K\} into ACIP {K\u0F84}.

Support for U+F021-U+F0FF, the Private Use Area (PUA) range that the latest EWTS uses, is not provided.

Also, we've traded some speed for memory -- DuffCode now uses bytes, not ints.
This commit is contained in:
dchandler 2003-11-29 22:57:12 +00:00
parent dfaae4be93
commit ffd041e32c
2 changed files with 649 additions and 476 deletions

View file

@ -26,23 +26,27 @@ import org.thdl.util.ThdlDebug;
* An immutable representation of a Tibetan glyph in the
* TibetanMachineWeb or TibetanMachine families of fonts.
*
* A DuffCode consists of a font number, a character, and a character
* number. A font identification and a character are sufficient to
* uniquely identify any TibetanMachineWeb or TibetanMachine glyph.
* <p>A DuffCode consists of a font number, a character, and a
* character number. A font identification and a character are
* sufficient to uniquely identify any TibetanMachineWeb or
* TibetanMachine glyph. Whether a DuffCode represents a TM or TMW
* glyph is in the eye of the beholder -- such information is not
* intrinsically represented.
*
* @author Edward Garrett, Tibetan and Himalayan Digital Library
* @version 1.0 */
* @author David Chandler */
public final class DuffCode {
/**
* the font number in which this glyph can be found,
* from 1 (TibetanMachineWeb) to 10 (TibetanMachineWeb9).
*/
private int fontNum;
* the font number in which this glyph can be found, from 1
* (TibetanMachineWeb/TibetanMachine) ... to 5
* (TibetanMachineWeb4/TibetanMachineSkt4) ... to 10
* (TibetanMachineWeb9/[Invalid for TM family]). */
private byte fontNum;
/**
* the character value of this glyph, as an integer (that is, ordinal)
*/
private int charNum;
private byte charNum;
/**
* Called by {@link TibetanMachineWeb} to generate
@ -53,9 +57,8 @@ public final class DuffCode {
* and the other is the ASCII code of the character.
*
* @param s the string to parse
* @param leftToRight should be true if the first number is the font number,
* false if the second number is the font number
*/
* @param leftToRight should be true if the first number is the font
* number, false if the second number is the font number */
public DuffCode(String s, boolean leftToRight) {
StringTokenizer st = new StringTokenizer(s,",");
@ -65,17 +68,19 @@ public final class DuffCode {
Integer num1 = new Integer(val1);
Integer num2 = new Integer(val2);
int n1val = num1.intValue();
int n2val = num2.intValue();
if (n1val > 255 || n1val < 0 || n2val > 255 || n2val < 0)
throw new NumberFormatException("FAILED ASSERTION: 0<=fontNum<=255 and 0<=charNum<=255");
if (leftToRight) {
setFontNum(num1.intValue());
charNum = num2.intValue();
setFontNum(n1val);
setCharNum((char)n2val);
} else {
setFontNum(n2val);
setCharNum((char)n1val);
}
else {
setFontNum(num2.intValue());
charNum = num1.intValue();
}
}
catch (NumberFormatException e) {
} catch (NumberFormatException e) {
ThdlDebug.noteIffyCode();
}
}
@ -89,30 +94,41 @@ public final class DuffCode {
*/
public DuffCode(int font, char ch) {
setFontNum(font);
charNum = (int)ch;
setCharNum(ch);
}
private void setFontNum(int font) {
if (!(font >= 1 && font <= 10))
throw new IllegalArgumentException("DuffCodes work with font numbers in the range [1, 5] or [1, 10]. This isn't in the range [1, 10]: " + font);
fontNum = font;
fontNum = (byte)font;
}
/**
* Gets the font number of this glyph.
* @return the identifying font number for this DuffCode
*/
public int getFontNum() {
public byte getFontNum() {
return fontNum;
}
/**
 * Stores the character ordinal of this glyph in the single-byte
 * <code>charNum</code> field.
 *
 * <p>Ordinals in [0, 127] are stored unchanged.  Ordinals in [128, 255]
 * do not fit in a non-negative byte, so they are stored as the negative
 * value <code>127 - ordinal</code> (that is, -1 down to -128);
 * {@link #getCharNum()} undoes this with <code>127 - charNum</code>.
 *
 * @param x the character whose ordinal is to be stored; must be in
 * the range [0, 255]
 * @throws IllegalArgumentException if the ordinal of <i>x</i> is
 * greater than 255, because such a value cannot be represented in one
 * byte and would otherwise be silently folded into a wrong ordinal */
private void setCharNum(char x) {
    // Widen to int rather than narrowing to short: a char is unsigned,
    // so the int is never negative and only the upper bound needs checking.
    int ordinal = x;
    if (ordinal > 255)
        throw new IllegalArgumentException("DuffCodes work with character ordinals in the range [0, 255]. This isn't in the range [0, 255]: " + ordinal);
    if (ordinal <= 127)
        charNum = (byte)ordinal;
    else
        charNum = (byte)(127 - ordinal); // [128, 255] -> [-1, -128]
}
/**
* Gets the character for this glyph, as an integer.
* @return the identifying character, converted to an
* integer, for this DuffCode
*/
public int getCharNum() {
return charNum;
public short getCharNum() {
if (charNum >= 0)
return (short)charNum; // [0, 127]
else
return (short)(127-(short)charNum); // [128, 255]
}
/**
@ -120,7 +136,7 @@ public final class DuffCode {
* @return the identifying character for this DuffCode
*/
public char getCharacter() {
return (char)charNum;
return (char)getCharNum();
}
/**
@ -129,7 +145,7 @@ public final class DuffCode {
*
* @return the hash code for this object */
public int hashCode() {
return fontNum*256 + charNum;
return ((int)fontNum)*256 + getCharNum();
}
/**
@ -157,7 +173,7 @@ public final class DuffCode {
if (err[0]) wylie = "undefined";
return "<duffcode wylie="
+ wylie + " font=" + fontNum
+ " charNum=" + charNum + " character="
+ " charNum=" + getCharNum() + " character="
+ new Character(getCharacter()).toString() + "/>";
}
/**
@ -172,7 +188,7 @@ public final class DuffCode {
+ (TMW
? TibetanMachineWeb.tmwFontNames
: TibetanMachineWeb.tmFontNames)[fontNum]
+ " charNum=" + charNum + " character="
+ " charNum=" + getCharNum() + " character="
+ new Character(getCharacter()).toString() + "/>";
}
}

View file

@ -31,7 +31,7 @@ import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlLazyException;
import org.thdl.util.Trie;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
/**
* Interfaces between Extended Wylie and the TibetanMachineWeb fonts.
@ -41,9 +41,11 @@ import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
* both or neither.
*
* <p>In addition, this class optionally loads the TibetanMachineWeb
* fonts manually via {@link #readInTMWFontFiles()}.
* fonts manually via {@link #readInTMWFontFiles()}. When we do that,
* it means that users don't have to install the fonts on their
* systems, so installation of Jskad becomes easier.
* @author Edward Garrett, Tibetan and Himalayan Digital Library
* @version 1.0
* @author David Chandler
*/
public class TibetanMachineWeb implements THDLWylieConstants {
/** This addresses bug 624133, "Input freezes after impossible
@ -74,6 +76,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
private static DuffCode[][] TMtoTMW = new DuffCode[5][255-32]; // ordinal 255 doesn't occur in TM
private static DuffCode[][] TMWtoTM = new DuffCode[10][127-32]; // ordinal 127 doesn't occur in TMW
private static String[][] TMWtoUnicode = new String[10][127-32]; // ordinal 127 doesn't occur in TMW
/** For mapping single codepoints U+0F00..U+0FFF to TMW. This
won't handle 0F00, 0F02, 0F03, or 0F0E, which are made by
using multiple glyphs from TMW, but it handles all the rest.
It handles U+0F90-U+0FBC rather poorly, in that you have to
use special formatting to get those right (FIXME: warn
whenever they're used). */
private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
private static String fileName = "tibwn.ini";
private static final String DELIMITER = "~";
/** vowels that appear over the glyph: */
@ -632,13 +642,51 @@ public class TibetanMachineWeb implements THDLWylieConstants {
}
TMWtoUnicode[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
= unicodeBuffer.toString(); // TMW->Unicode mapping
char ch;
if (unicodeBuffer.length() == 1
&& UnicodeUtils.isInTibetanRange(ch = unicodeBuffer.charAt(0))) {
if (null != UnicodeToTMW[ch - '\u0F00'][0]
&& '\u0F00' != ch
&& '\u0F02' != ch
&& '\u0F03' != ch
&& '\u0F0B' != ch // any will do...
&& '\u0F0E' != ch
&& '\u0F40' != ch
&& '\u0F42' != ch
&& '\u0F49' != ch
&& '\u0F4F' != ch
&& '\u0F51' != ch
&& '\u0F53' != ch
&& '\u0F5E' != ch
&& '\u0F62' != ch
&& '\u0F64' != ch
&& '\u0F67' != ch
&& '\u0F6A' != ch
&& '\u0F71' != ch // any will do...
&& '\u0F72' != ch // any will do...
&& '\u0F73' != ch
&& '\u0F74' != ch // any will do...
&& '\u0F75' != ch // any will do...
&& '\u0F76' != ch
&& '\u0F77' != ch
&& '\u0F78' != ch
&& '\u0F79' != ch
&& '\u0F7A' != ch // any will do...
&& '\u0F7C' != ch // any will do...
&& '\u0F7E' != ch
&& '\u0F81' != ch) {
throw new Error("tibwn.ini has more than one TMW fellow listed that has the Unicode " + val + ", but it's not on the list of specially handled glyphs");
}
UnicodeToTMW[ch - '\u0F00'][0]
= duffCodes[TMW]; // Unicode->TMW mapping
}
// For V&V:
// DLC FIXME: also check for ^[90-bc]. and ^.+[40-6a]
// StringBuffer wylie_minus_plusses_buf
// = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(unicodeBuffer.toString());
// = org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(unicodeBuffer.toString());
// String wylie_minus_plusses
// = ((wylie_minus_plusses_buf == null)
// ? null
@ -1616,6 +1664,115 @@ private static final String Unicode_lf = "\n";
private static final String Unicode_tab = "\t";
// Fixed TMW glyph sequences for the Unicode codepoints that
// mapUnicodeToTMW(char) special-cases instead of looking up in the
// UnicodeToTMW table.  Each table is named after its codepoint
// (tmwFor0F40 is for U+0F40) and each element is built with
// DuffCode(font number, character ordinal).  Tables with more than
// one element are for codepoints that are rendered with several TMW
// glyphs together.
private static final DuffCode[] tmwFor0F00
= new DuffCode[] { new DuffCode(1, (char)63), new DuffCode(8, (char)102) };
private static final DuffCode[] tmwFor0F02
= new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(8, (char)92) };
private static final DuffCode[] tmwFor0F03
= new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(1, (char)105) };
private static final DuffCode[] tmwFor0F0E
= new DuffCode[] { new DuffCode(1, (char)107), new DuffCode(1, (char)107) };
// for 0F40, use the full-height, not the reduced-height, form
private static final DuffCode[] tmwFor0F40
= new DuffCode[] { new DuffCode(1, (char)92) };
private static final DuffCode[] tmwFor0F42
= new DuffCode[] { new DuffCode(1, (char)93) };
private static final DuffCode[] tmwFor0F49
= new DuffCode[] { new DuffCode(1, (char)94) };
private static final DuffCode[] tmwFor0F4F
= new DuffCode[] { new DuffCode(1, (char)95) };
private static final DuffCode[] tmwFor0F51
= new DuffCode[] { new DuffCode(1, (char)96) };
private static final DuffCode[] tmwFor0F53
= new DuffCode[] { new DuffCode(1, (char)97) };
private static final DuffCode[] tmwFor0F5E
= new DuffCode[] { new DuffCode(1, (char)98) };
private static final DuffCode[] tmwFor0F62
= new DuffCode[] { new DuffCode(8, (char)66) }; // not the full-form, use \u0F6A for that...
private static final DuffCode[] tmwFor0F64
= new DuffCode[] { new DuffCode(1, (char)99) };
private static final DuffCode[] tmwFor0F67
= new DuffCode[] { new DuffCode(1, (char)100) };
private static final DuffCode[] tmwFor0F6A
= new DuffCode[] { new DuffCode(1, (char)58) };
private static final DuffCode[] tmwFor0F73
= new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(1, (char)109) };
private static final DuffCode[] tmwFor0F76
= new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F77
= new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F78
= new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F79
= new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
private static final DuffCode[] tmwFor0F7E
= new DuffCode[] { new DuffCode(8, (char)91) }; // the one that lines up better -- i.e., not (8, (char)90)
private static final DuffCode[] tmwFor0F81
= new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
/** Returns an array of one, two, three, or four DuffCodes that
    together represent the Tibetan Unicode character <i>ch</i>.
    Returns null if there is no mapping for <i>ch</i>, which
    includes every <i>ch</i> outside the range U+0F00..U+0FFF.  For
    certain codepoints, multiple TMW glyphs are appropriate, and
    we return an arbitrary one.

    <p>The returned array is not a copy: it is either one of the
    fixed tmwFor* tables or a row of the UnicodeToTMW table, so
    callers should not modify it.

    @param ch the Unicode character to map
    @return the TMW glyphs for <i>ch</i>, or null if none are known */
public static DuffCode[] mapUnicodeToTMW(char ch) {
    // FIXME WARN WHENEVER AN ESCAPE IS USED FOR: f71, f72, f73, f74, f75, f76, f77, f78, f79, f7a, f7c, f81

    // For U+0F71, U+0F72, U+0F74, U+0F75, U+0F7A, and U+0F7C,
    // you'll get one of the possible TMW glyphs, maybe not the
    // one that is most beautiful.

    // Codepoints with a hand-picked glyph sequence:
    switch (ch) {
    case '\u0F00': return tmwFor0F00;
    case '\u0F02': return tmwFor0F02;
    case '\u0F03': return tmwFor0F03;
    case '\u0F0E': return tmwFor0F0E;
    case '\u0F40': return tmwFor0F40;
    case '\u0F42': return tmwFor0F42;
    case '\u0F49': return tmwFor0F49;
    case '\u0F4F': return tmwFor0F4F;
    case '\u0F51': return tmwFor0F51;
    case '\u0F53': return tmwFor0F53;
    case '\u0F5E': return tmwFor0F5E;
    case '\u0F62': return tmwFor0F62;
    case '\u0F64': return tmwFor0F64;
    case '\u0F67': return tmwFor0F67;
    case '\u0F6A': return tmwFor0F6A;
    case '\u0F73': return tmwFor0F73;
    case '\u0F76': return tmwFor0F76;
    case '\u0F77': return tmwFor0F77;
    case '\u0F78': return tmwFor0F78;
    case '\u0F79': return tmwFor0F79;
    case '\u0F7E': return tmwFor0F7E;
    case '\u0F81': return tmwFor0F81;
    default:
        break;
    }

    // Everything else comes from the table.  The table only covers
    // U+0F00..U+0FFF, so anything outside that range has no mapping;
    // report that as null instead of indexing out of bounds.
    int index = ch - '\u0F00';
    if (index < 0 || index >= UnicodeToTMW.length) return null;
    DuffCode[] x = UnicodeToTMW[index];
    if (null == x[0]) return null;
    return x;
}
/** Returns the sequence of Unicode corresponding to the given
TibetanMachineWeb font
(0=TibetanMachineWeb,1=TibetanMachineWeb1,...) and