ACIP->TMW and ACIP->Unicode now allow for Unicode escapes like K\u0F84. This means that the lack of support for ACIP's backslash, '\\', is mitigated because you can turn ACIP {K\} into ACIP {K\u0F84}.

Support for U+F021-U+F0FF, the PUA that the latest EWTS uses, is not provided. Also, we've traded some speed for memory -- DuffCode now uses bytes, not ints.
2003-11-29 22:57:12 +00:00 · 2003-11-29 22:57:12 +00:00 · ffd041e32c
commit ffd041e32c
parent dfaae4be93
2 changed files with 649 additions and 476 deletions
--- a/source/org/thdl/tib/text/DuffCode.java
+++ b/source/org/thdl/tib/text/DuffCode.java
@ -26,23 +26,27 @@ import org.thdl.util.ThdlDebug;
 * An immutable representation of a Tibetan glyph in the
 * TibetanMachineWeb or TibetanMachine families of fonts.
 *
-* A DuffCode consists of a font number, a character, and a character
+* <p>A DuffCode consists of a font number, a character, and a
-* number. A font identification and a character are sufficient to
+* character number.  A font identification and a character are
-* uniquely identify any TibetanMachineWeb or TibetanMachine glyph.
+* sufficient to uniquely identify any TibetanMachineWeb or
 * TibetanMachine glyph.  Whether a DuffCode represents a TM or TMW
 * glyph is in the eye of the beholder -- such information is not
 * intrinsically represented.
 *
 * @author Edward Garrett, Tibetan and Himalayan Digital Library
-* @version 1.0 */
+* @author David Chandler */
 public final class DuffCode {
 /**
-* the font number in which this glyph can be found,
+* the font number in which this glyph can be found, from 1
-* from 1 (TibetanMachineWeb) to 10 (TibetanMachineWeb9).
+* (TibetanMachineWeb/TibetanMachine) ... to 5
-*/
+* (TibetanMachineWeb4/TibetanMachineSkt4) ... to 10
-	private int fontNum;
+* (TibetanMachineWeb9/[Invalid for TM family]).  */
    private byte fontNum;
 /**
 * the character value of this glyph, as an integer (that is, ordinal)
 */
-	private int charNum;
+    private byte charNum;
 /**
 * Called by {@link TibetanMachineWeb} to generate
@ -53,9 +57,8 @@ public final class DuffCode {
 * and the other is the ASCII code of the character.
 * 
 * @param s the string to parse
-* @param leftToRight should be true if the first number is the font number,
+* @param leftToRight should be true if the first number is the font
-* false if the second number is the font number
+* number, false if the second number is the font number */
 */
    public DuffCode(String s, boolean leftToRight) {
        StringTokenizer st = new StringTokenizer(s,",");
@ -65,17 +68,19 @@ public final class DuffCode {
            Integer num1 = new Integer(val1);
            Integer num2 = new Integer(val2);
            int n1val = num1.intValue();
            int n2val = num2.intValue();
            if (n1val > 255 || n1val < 0 || n2val > 255 || n2val < 0)
                throw new NumberFormatException("FAILED ASSERTION: 0<=fontNum<=255 and 0<=charNum<=255");
            if (leftToRight) {
-                setFontNum(num1.intValue());
+                setFontNum(n1val);
-				charNum = num2.intValue();
+                setCharNum((char)n2val);
            } else {
                setFontNum(n2val);
                setCharNum((char)n1val);
            }
-			else {
+        } catch (NumberFormatException e) {
                setFontNum(num2.intValue());
 				charNum = num1.intValue();
 			}						
 		}
 		catch (NumberFormatException e) {
            ThdlDebug.noteIffyCode();
        }
    }
@ -89,30 +94,41 @@ public final class DuffCode {
 */
    public DuffCode(int font, char ch) {
        setFontNum(font);
-		charNum = (int)ch;
+        setCharNum(ch);
    }
    private void setFontNum(int font) {
        if (!(font >= 1 && font <= 10))
            throw new IllegalArgumentException("DuffCodes work with font numbers in the range [1, 5] or [1, 10].  This isn't in the range [1, 10]: " + font);
-        fontNum = font;
+        fontNum = (byte)font;
    }
 /**
 * Gets the font number of this glyph.
 * @return the identifying font number for this DuffCode
 */
-	public int getFontNum() {
+    public byte getFontNum() {
        return fontNum;
    }
    /* Stores the character ordinal in the single-byte charNum field.
       Ordinals 0..127 are stored unchanged; any other value is stored
       as (127 - ordinal), which for 128..255 yields -1..-128. */
    private void setCharNum(char x) {
        final short ordinal = (short)x;
        final boolean storedAsIs = (ordinal >= 0 && ordinal <= 127);
        charNum = storedAsIs ? (byte)ordinal : (byte)(127-ordinal);
    }
 /**
 * Gets the character for this glyph, as an integer.
 * @return the identifying character, converted to an
 * integer, for this DuffCode
 */
-	public int getCharNum() {
+    public short getCharNum() {
-		return charNum;
+        if (charNum >= 0)
            return (short)charNum; // [0, 127]
        else
            return (short)(127-(short)charNum);  // [128, 255]
    }
 /**
@ -120,7 +136,7 @@ public final class DuffCode {
 * @return the identifying character for this DuffCode
 */
    public char getCharacter() {
-		return (char)charNum;
+        return (char)getCharNum();
    }
 /**
@ -129,7 +145,7 @@ public final class DuffCode {
 *
 * @return the hash code for this object */
    public int hashCode() {
-        return fontNum*256 + charNum;
+        return ((int)fontNum)*256 + getCharNum();
    }
 /**
@ -157,7 +173,7 @@ public final class DuffCode {
        if (err[0]) wylie = "undefined";
        return "<duffcode wylie="
            + wylie + " font=" + fontNum
-            + " charNum=" + charNum + " character="
+            + " charNum=" + getCharNum() + " character="
            + new Character(getCharacter()).toString() + "/>";
    }
 /**
@ -172,7 +188,7 @@ public final class DuffCode {
            + (TMW
               ? TibetanMachineWeb.tmwFontNames
               : TibetanMachineWeb.tmFontNames)[fontNum]
-            + " charNum=" + charNum + " character="
+            + " charNum=" + getCharNum() + " character="
            + new Character(getCharacter()).toString() + "/>";
    }
 }
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@ -31,7 +31,7 @@ import org.thdl.util.ThdlDebug;
 import org.thdl.util.ThdlLazyException;
 import org.thdl.util.Trie;
 import org.thdl.util.ThdlOptions;
-import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
 /**
 * Interfaces between Extended Wylie and the TibetanMachineWeb fonts.
@ -41,9 +41,11 @@ import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
 * both or neither.
 *
 * <p>In addition, this class optionally loads the TibetanMachineWeb
-* fonts manually via {@link #readInTMWFontFiles()}.
+* fonts manually via {@link #readInTMWFontFiles()}.  When we do that,
 * it means that users don't have to install the fonts on their
 * systems, so installation of Jskad becomes easier.
 * @author Edward Garrett, Tibetan and Himalayan Digital Library
-* @version 1.0
+* @author David Chandler
 */
 public class TibetanMachineWeb implements THDLWylieConstants {
    /** This addresses bug 624133, "Input freezes after impossible
@ -74,6 +76,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
    private static DuffCode[][] TMtoTMW = new DuffCode[5][255-32]; // ordinal 255 doesn't occur in TM
    private static DuffCode[][] TMWtoTM = new DuffCode[10][127-32]; // ordinal 127 doesn't occur in TMW
    private static String[][] TMWtoUnicode = new String[10][127-32]; // ordinal 127 doesn't occur in TMW
    /** For mapping single codepoints U+0F00..U+0FFF to TMW.  This
        won't handle 0F00, 0F02, 0F03, or 0F0E, which are made by
        using multiple glyphs from TMW, but it handles all the rest.
        It handles U+0F90-U+0FBC rather poorly, in that you have to
        use special formatting to get those right (FIXME: warn
        whenever they're used). */
    private static DuffCode[][] UnicodeToTMW = new DuffCode[256][1];
    private static String fileName = "tibwn.ini";
    private static final String DELIMITER = "~";
    /** vowels that appear over the glyph: */
@ -632,13 +642,51 @@ public class TibetanMachineWeb implements THDLWylieConstants {
                                        }
                                        TMWtoUnicode[duffCodes[TMW].getFontNum()-1][duffCodes[TMW].getCharNum()-32]
                                            = unicodeBuffer.toString(); // TMW->Unicode mapping
                                        char ch;
                                        if (unicodeBuffer.length() == 1
                                            && UnicodeUtils.isInTibetanRange(ch = unicodeBuffer.charAt(0))) {
                                            if (null != UnicodeToTMW[ch - '\u0F00'][0]
                                                && '\u0F00' != ch
                                                && '\u0F02' != ch
                                                && '\u0F03' != ch
                                                && '\u0F0B' != ch // any will do...
                                                && '\u0F0E' != ch
                                                && '\u0F40' != ch
                                                && '\u0F42' != ch
                                                && '\u0F49' != ch
                                                && '\u0F4F' != ch
                                                && '\u0F51' != ch
                                                && '\u0F53' != ch
                                                && '\u0F5E' != ch
                                                && '\u0F62' != ch
                                                && '\u0F64' != ch
                                                && '\u0F67' != ch
                                                && '\u0F6A' != ch
                                                && '\u0F71' != ch // any will do...
                                                && '\u0F72' != ch // any will do...
                                                && '\u0F73' != ch
                                                && '\u0F74' != ch // any will do...
                                                && '\u0F75' != ch // any will do...
                                                && '\u0F76' != ch
                                                && '\u0F77' != ch
                                                && '\u0F78' != ch
                                                && '\u0F79' != ch
                                                && '\u0F7A' != ch // any will do...
                                                && '\u0F7C' != ch // any will do...
                                                && '\u0F7E' != ch
                                                && '\u0F81' != ch) {
                                                throw new Error("tibwn.ini has more than one TMW fellow listed that has the Unicode " + val + ", but it's not on the list of specially handled glyphs");
                                            }
                                            UnicodeToTMW[ch - '\u0F00'][0]
                                                = duffCodes[TMW]; // Unicode->TMW mapping
                                        }
                                        // For V&V:
 // DLC FIXME: also check for ^[90-bc]. and ^.+[40-6a]
 //                                          StringBuffer wylie_minus_plusses_buf
-//                                              = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(unicodeBuffer.toString());
+//                                              = org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(unicodeBuffer.toString());
 //                                          String wylie_minus_plusses
 //                                              = ((wylie_minus_plusses_buf == null)
 //                                                 ? null
@ -1616,6 +1664,115 @@ private static final String Unicode_lf = "\n";
 private static final String Unicode_tab = "\t";
    // Fixed TMW glyph sequences for the Tibetan Unicode codepoints
    // that mapUnicodeToTMW() answers directly instead of consulting
    // the UnicodeToTMW table.  Each constant is named after its
    // codepoint (tmwFor0F40 is for U+0F40), and each DuffCode is built
    // from a font number and a character ordinal within that font.
    // Arrays with several elements are codepoints rendered by using
    // multiple TMW glyphs together; single-element arrays pin one
    // particular glyph for the codepoint (presumably because tibwn.ini
    // lists more than one candidate -- confirm against tibwn.ini).
    private static final DuffCode[] tmwFor0F00
        = new DuffCode[] { new DuffCode(1, (char)63), new DuffCode(8, (char)102) };
    private static final DuffCode[] tmwFor0F02
        = new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(8, (char)92) };
    private static final DuffCode[] tmwFor0F03
        = new DuffCode[] { new DuffCode(1, (char)56), new DuffCode(1, (char)118), new DuffCode(8, (char)95), new DuffCode(1, (char)105) };
    private static final DuffCode[] tmwFor0F0E
        = new DuffCode[] { new DuffCode(1, (char)107), new DuffCode(1, (char)107) };
    // for 0F40, use the full-height, not the reduced-height, form
    private static final DuffCode[] tmwFor0F40
        = new DuffCode[] { new DuffCode(1, (char)92) };
    private static final DuffCode[] tmwFor0F42
        = new DuffCode[] { new DuffCode(1, (char)93) };
    private static final DuffCode[] tmwFor0F49
        = new DuffCode[] { new DuffCode(1, (char)94) };
    private static final DuffCode[] tmwFor0F4F
        = new DuffCode[] { new DuffCode(1, (char)95) };
    private static final DuffCode[] tmwFor0F51
        = new DuffCode[] { new DuffCode(1, (char)96) };
    private static final DuffCode[] tmwFor0F53
        = new DuffCode[] { new DuffCode(1, (char)97) };
    private static final DuffCode[] tmwFor0F5E
        = new DuffCode[] { new DuffCode(1, (char)98) };
    private static final DuffCode[] tmwFor0F62
        = new DuffCode[] { new DuffCode(8, (char)66) }; // not the full-form, use \u0F6A for that...
    private static final DuffCode[] tmwFor0F64
        = new DuffCode[] { new DuffCode(1, (char)99) };
    private static final DuffCode[] tmwFor0F67
        = new DuffCode[] { new DuffCode(1, (char)100) };
    private static final DuffCode[] tmwFor0F6A
        = new DuffCode[] { new DuffCode(1, (char)58) };
    // The remaining constants cover vowel and sign codepoints
    // (U+0F73..U+0F81); most are sequences of two or three glyphs.
    private static final DuffCode[] tmwFor0F73
        = new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(1, (char)109) };
    private static final DuffCode[] tmwFor0F76
        = new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(8, (char)87) };
    private static final DuffCode[] tmwFor0F77
        = new DuffCode[] { new DuffCode(8, (char)71), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
    private static final DuffCode[] tmwFor0F78
        = new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(8, (char)87) };
    private static final DuffCode[] tmwFor0F79
        = new DuffCode[] { new DuffCode(10, (char)105), new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
    private static final DuffCode[] tmwFor0F7E
        = new DuffCode[] { new DuffCode(8, (char)91) }; // the one that lines up better -- i.e., not (8, (char)90)
    private static final DuffCode[] tmwFor0F81
        = new DuffCode[] { new DuffCode(4, (char)106), new DuffCode(8, (char)87) };
    /** Returns an array of one, two, three, or four DuffCodes that
        together represent the Tibetan Unicode character <i>ch</i>.
        Returns null if there is no mapping for <i>ch</i>; that
        includes every <i>ch</i> outside the range covered by the
        UnicodeToTMW table (U+0F00 through U+0FFF), for which this
        method used to throw ArrayIndexOutOfBoundsException.  For
        certain codepoints, multiple TMW glyphs are appropriate, and
        we return an arbitrary one.
        @param ch the Unicode character to map
        @return the TMW glyphs that together render <i>ch</i>, or null
        if <i>ch</i> has no mapping */
    public static DuffCode[] mapUnicodeToTMW(char ch) {
        // FIXME WARN WHENEVER AN ESCAPE IS USED FOR: f71, f72, f73, f74, f75, f76, f77, f78, f79, f7a, f7c, f81
        // For U+0F71, U+0F72, U+0F74, U+0F75, U+0F7A, and U+0F7C,
        // you'll get one of the possible TMW glyphs, maybe not the
        // one that is most beautiful.

        // Codepoints with a hand-picked glyph sequence are answered
        // from the tmwFor0Fxx constants rather than from the table.
        switch (ch) {
        case '\u0F00': return tmwFor0F00;
        case '\u0F02': return tmwFor0F02;
        case '\u0F03': return tmwFor0F03;
        case '\u0F0E': return tmwFor0F0E;
        case '\u0F40': return tmwFor0F40;
        case '\u0F42': return tmwFor0F42;
        case '\u0F49': return tmwFor0F49;
        case '\u0F4F': return tmwFor0F4F;
        case '\u0F51': return tmwFor0F51;
        case '\u0F53': return tmwFor0F53;
        case '\u0F5E': return tmwFor0F5E;
        case '\u0F62': return tmwFor0F62;
        case '\u0F64': return tmwFor0F64;
        case '\u0F67': return tmwFor0F67;
        case '\u0F6A': return tmwFor0F6A;
        case '\u0F73': return tmwFor0F73;
        case '\u0F76': return tmwFor0F76;
        case '\u0F77': return tmwFor0F77;
        case '\u0F78': return tmwFor0F78;
        case '\u0F79': return tmwFor0F79;
        case '\u0F7E': return tmwFor0F7E;
        case '\u0F81': return tmwFor0F81;
        default:
            break;
        }

        // Everything else comes from the table, which is indexed by
        // the offset from U+0F00.  Guard the index so that characters
        // outside the table yield null, as documented, instead of an
        // ArrayIndexOutOfBoundsException.
        int index = ch - '\u0F00';
        if (index < 0 || index >= UnicodeToTMW.length) return null;
        DuffCode[] x = UnicodeToTMW[index];
        if (null == x[0]) return null;
        return x;
    }
 /** Returns the sequence of Unicode corresponding to the given
    TibetanMachineWeb font
    (0=TibetanMachineWeb,1=TibetanMachineWeb1,...) and