TMW->TM, TM->TMW, and TMW->Unicode conversions are all (at least 2) orders of magnitude faster.
2003-06-22 22:10:58 +00:00 · 2003-06-22 22:10:58 +00:00 · 1f4343bed0
commit 1f4343bed0
parent afe73c2228
2 changed files with 311 additions and 90 deletions
--- a/source/org/thdl/tib/text/TibetanDocument.java
+++ b/source/org/thdl/tib/text/TibetanDocument.java
@ -124,7 +124,7 @@ public class TibetanDocument extends DefaultStyledDocument {
 			rtf.write(out, this, 0, getLength());
 		}
 		catch (BadLocationException ble) {
-            ThdlDebug.noteIffyCode();
+            throw new Error("Cannot write RTF output; [0, " + getLength() + ") constitutes a bad position.");
 		}
 	}
@ -167,30 +167,49 @@ public class TibetanDocument extends DefaultStyledDocument {
        fontSize. */
    private void replaceDuff(int fontSize, int pos,
                             DuffData glyph, boolean asTMW) {
        // A single glyph is simply the one-element range [pos, pos + 1),
        // so hand the work to the range-based replaceDuffs.
        int rangeEnd = pos + 1;
        String glyphText = glyph.text;
        int glyphFont = glyph.font;
        replaceDuffs(fontSize, pos, rangeEnd, glyphText, glyphFont, asTMW);
    }
 	/** Replacing can be more efficient than inserting and then
        removing. This replaces the glyphs at position [startOffset,
        endOffset) with data, which is interpreted as TMW glyphs if
        asTMW is true and as TM glyphs otherwise.  The font size for
        the new glyph is fontSize; the particular TM or TMW font is
        specified by newFontIndex, which is one-based, not
        zero-based. */
    private void replaceDuffs(int fontSize, int startOffset,
                              int endOffset, String data,
                              int newFontIndex, boolean asTMW) {
        // Pick the TMW or TM attribute set for the requested font;
        // asTMW selects which family newFontIndex refers to.
 		MutableAttributeSet mas
            = ((asTMW)
-               ? TibetanMachineWeb.getAttributeSet(glyph.font)
+               ? TibetanMachineWeb.getAttributeSet(newFontIndex)
-               : TibetanMachineWeb.getAttributeSetTM(glyph.font));
+               : TibetanMachineWeb.getAttributeSetTM(newFontIndex));
        // NOTE(review): if TibetanMachineWeb hands back a shared
        // attribute set, this call changes its font size for every
        // later user as well -- confirm.
        StyleConstants.setFontSize(mas, fontSize);
 		try {
-            replace(pos, 1, glyph.text, mas);
+            replace(startOffset, endOffset - startOffset, data, mas);
        } catch (BadLocationException ble) {
            // A bad range is only noted here, not rethrown, so the
            // caller carries on as if the replacement had happened.
            ThdlDebug.noteIffyCode();
 		}
    }
 	/** Replacing can be more efficient than inserting and then
-        removing. This replaces the glyph at position pos with
+        removing. This replaces the glyphs at position [startOffset,
-        unicode.  The font size for the new unicode is fontSize. */
+        endOffset) with unicode.  The font size for the new unicode is
-    private void replaceDuffWithUnicode(int fontSize, int pos,
+        fontSize.  Which particular Unicode font is used depends on
-                                        String unicode) {
+        TibetanMachineWeb.getUnicodeAttributeSet().
        @see TibetanMachineWeb#getUnicodeAttributeSet()
    */
    private void replaceDuffsWithUnicode(int fontSize, int startOffset,
                                         int endOffset, String unicode) {
        // NOTE(review): if getUnicodeAttributeSet() returns a shared
        // instance, the setFontSize call below mutates it for every
        // later caller as well -- confirm.
 		MutableAttributeSet mas
            = TibetanMachineWeb.getUnicodeAttributeSet();
        StyleConstants.setFontSize(mas, fontSize);
 		try {
-            replace(pos, 1, unicode, mas);
+            replace(startOffset, endOffset - startOffset, unicode, mas);
        } catch (BadLocationException ble) {
            // Unlike replaceDuffs, a bad range here is fatal: the
            // offending offsets are reported in the Error message.
-            ThdlDebug.noteIffyCode();
+            throw new Error("TMW->Unicode failed because the following constitute a bad position: startOffset " + startOffset + ", endOffset " + endOffset);
 		}
    }
@ -539,7 +558,7 @@ public class TibetanDocument extends DefaultStyledDocument {
        int count = 0;
        for (font = 0; font < 5; font++) {
            for (ord = 32; ord < 255; ord++) {
-                if (TibetanMachineWeb.mapTMtoTMW(font, ord) != null) {
+                if (TibetanMachineWeb.mapTMtoTMW(font, ord, 0) != null) {
                    equivalent[0].setData((char)ord, font + 1);
                    try {
                        insertDuff(tibetanFontSize, count++, equivalent, false);
@ -672,22 +691,91 @@ public class TibetanDocument extends DefaultStyledDocument {
        @see convertToTM(int,int) */
    private boolean convertHelper(int begin, int end, boolean toTM,
                                  boolean toUnicode, StringBuffer errors) {
        // Formatting is preserved by converting one paragraph at a
        // time.  The end of the region is tracked with a Position, not
        // a raw offset, because converting paragraph K shifts the
        // offsets of paragraph K+1.
        if (end < 0) {
            end = getLength();
        }
        Element[] paragraphs = getParagraphs(begin, end);
        Position regionEnd;
        try {
            regionEnd = createPosition(end);
        } catch (BadLocationException e) {
            throw new Error("BAD LOCATION DURING CONVERSION");
        }
        ConversionErrorHelper ceh = new ConversionErrorHelper();
        int expectedCount = paragraphs.length;
        int index = 0;
        while (index < expectedCount) {
            Element paragraph = paragraphs[index];
            int paragraphEnd = paragraph.getEndOffset();
            // The error wrap-up may run only once, with the last paragraph.
            if (index + 1 == paragraphs.length)
                ceh.doErrorWrapup = true;
            // Never convert past the end of the requested region.
            convertHelperHelper(paragraph.getStartOffset(),
                                Math.min(regionEnd.getOffset(), paragraphEnd),
                                toTM, toUnicode, errors, ceh);
            // Paragraph index has just been rewritten, so the paragraph
            // list must be recomputed.  (PERFORMANCE FIXME: this is
            // O(N*N); calculating just one paragraph at a time would
            // make it O(N).)
            paragraphs = getParagraphs(begin, regionEnd.getOffset());
            if (paragraphs.length != expectedCount)
                throw new Error("Conversion failed: the number of paragraphs changed, indicating that formatting was lost.");
            index++;
        }
        return ceh.errorReturn;
    }
    /** See the sole caller, convertHelper. */
    private void convertHelperHelper(int begin, int end, boolean toTM,
                                     boolean toUnicode, StringBuffer errors,
                                     ConversionErrorHelper ceh) {
        final boolean debug = false;
        if (debug)
            System.err.println("cHH: [" + begin + ", " + end + ")");
        // DLC FIXME: here's an idea, a compressor -- use the '-' (ord
        // 45) or ' ' (ord 32) glyph from the same font as the
        // preceding glyph, never others.  This reduces the size of a
        // TMW RTF file by a factor of 3 sometimes.  To do it, use
        // this routine, but give it the ability to go from TMW->TMW
        // and TM->TM.
        // toTM is ignored when toUnicode is true:
        ThdlDebug.verify(!toUnicode || !toTM);
        boolean toStdout = ThdlOptions.getBooleanOption("thdl.debug");
        boolean errorReturn = false;
        if (end < 0)
            end = getLength();
        if (begin >= end)
-            return errorReturn; // nothing to do, so no errors in the doing.
+            return; // nothing to do
-        int i = begin;
+
-        HashMap problemGlyphsTable = new HashMap();
+        // For speed, do as few replaces as possible.  To preserve
        // formatting, we'll try to replace one paragraph at a time.
        // But we *must* replace when we hit a different font (TMW3 as
        // opposed to TMW2, e.g.), so we'll likely replace many times
        // per paragraph.  One very important optimization is that we
        // don't have to treat TMW3.45 or TMW3.32 as a different font
        // than TMW.33 -- that's because each of the ten TMW fonts has
        // the same glyph at position 32 (space) and the same glyph at
        // position 45 (tsheg).  Note that we're building up a big
        // ArrayList; we're trading space for time.
        try {
            int replacementStartIndex = begin;
            StringBuffer replacementQueue = new StringBuffer();
            int replacementFontIndex = 0;
            int replacementFontSize = -1;
            int i = begin;
            HashMap problemGlyphsTable = new HashMap();
            Position endPos = createPosition(end);
            DuffData[] equivalent = new DuffData[1];
            equivalent[0] = new DuffData();
-            int errorGlyphLocation = 0;
+            boolean mustReplace = false;
            int mustReplaceUntil = -1;
            while (i < endPos.getOffset()) {
                AttributeSet attr = getCharacterElement(i).getAttributes();
                String fontName = StyleConstants.getFontFamily(attr);
@ -697,6 +785,13 @@ public class TibetanDocument extends DefaultStyledDocument {
                       : TibetanMachineWeb.getTMFontNumber(fontName));
                if (0 != fontNum) {
                    // SPEED_FIXME: determining font size might be slow, allow an override.
                    int fontSize = tibetanFontSize;
                    try {
                        fontSize = ((Integer)getCharacterElement(i).getAttributes().getAttribute(StyleConstants.FontSize)).intValue();
                    } catch (Exception e) { /* leave it as tibetanFontSize */ }
                    DuffCode dc = null;
                    String unicode = null;
                    if (toUnicode) {
@ -705,63 +800,75 @@ public class TibetanDocument extends DefaultStyledDocument {
                    } else {
                        if (toTM) {
                            dc = TibetanMachineWeb.mapTMWtoTM(fontNum - 1,
-                                                              getText(i,1).charAt(0));
+                                                              getText(i,1).charAt(0),
                                                              replacementFontIndex);
                        } else {
                            dc = TibetanMachineWeb.mapTMtoTMW(fontNum - 1,
-                                                              getText(i,1).charAt(0));
+                                                              getText(i,1).charAt(0),
                                                              replacementFontIndex);
                        }
                    }
-                    if (null != dc || null != unicode) {
+                    if (replacementQueue.length() > 0
-                        // SPEED_FIXME: determining font size might be slow
+                        && (mustReplace
-                        int fontSize = tibetanFontSize;
+                            || ((!toUnicode
-                        try {
+                                 && null != dc
-                            fontSize = ((Integer)getCharacterElement(i).getAttributes().getAttribute(StyleConstants.FontSize)).intValue();
+                                 && dc.getFontNum() != replacementFontIndex)
-                        } catch (Exception e) {
+                                || fontSize != replacementFontSize))) {
-                            // leave it as tibetanFontSize
+                        // We must replace now, because the attribute
-                        }
+                        // set has changed.
-                        if (!toUnicode) {
+                        // We have two choices: replace or
-                            equivalent[0].setData(dc.getCharacter(),
+                        // insert-and-remove.  We replace, because
-                                                  dc.getFontNum());
+                        // that preserves formatting.
                        }
-                        // We have two choices: remove-then-insert
+                        // this if-else statement is duplicated below; beware!
-                        // second vs. insert-then-remove and also
+                        int endIndex = mustReplace ? mustReplaceUntil : i;
-                        // insert-before vs. insert-after.  It turns
+                        if (toUnicode) {
-                        // out that insert-after preserves formatting
+                            replaceDuffsWithUnicode(replacementFontSize,
-                        // whereas insert-before doesn't.  And we do
+                                                    replacementStartIndex,
-                        // insert-then-remove because we're guessing
+                                                    endIndex,
-                        // that helps with formatting too.
+                                                    replacementQueue.toString());
                        if (replaceInsteadOfInserting()) {
                            if (toUnicode) {
                                replaceDuffWithUnicode(fontSize, i, unicode);
                                i += unicode.length() - 1; // we do i++ below
                            } else {
                                replaceDuff(fontSize, i, equivalent[0], !toTM);
                            }
                        } else {
-                            if (toUnicode)
+                            replaceDuffs(replacementFontSize,
-                                throw new Error("Please execute 'Clear Preferences' and retry -- you've caused us to go into insert-and-remove mode, as opposed to replace mode, and replacing, the default preference, is the only way you can get TMW->Unicode conversion at present.");
+                                         replacementStartIndex,
-                            if (insertBefore()) {
+                                         endIndex,
-                                insertDuff(fontSize, i, equivalent, !toTM);
+                                         replacementQueue.toString(),
-                                remove(i+1, 1);
+                                         replacementFontIndex,
-                            } else {
+                                         !toTM);
-                                insertDuff(fontSize, i+1, equivalent, !toTM);
+                        }
-                                remove(i, 1);
+
                        // i += numnewchars - numoldchars;
                        if (debug)
                            System.err.println("Incrementing i by " + (replacementQueue.length()
                              - (endIndex - replacementStartIndex)) + "; replaced a patch with font size " + replacementFontSize + ", fontindex " + replacementFontIndex);
                        i += (replacementQueue.length()
                              - (endIndex - replacementStartIndex));
                        replacementQueue.delete(0, replacementQueue.length());
                        mustReplace = false;
                    }
                    if (null != dc || null != unicode) {
                        if (0 == replacementQueue.length()) {
                            replacementFontSize = fontSize;
                            replacementStartIndex = i;
                            if (!toUnicode) {
                                replacementFontIndex = dc.getFontNum();
                            }
                        }
                        if (toUnicode) {
                            replacementQueue.append(unicode);
                        } else {
                            replacementQueue.append(dc.getCharacter());
                        }
                    } else {
-                        // DLC FIXME: insert into document a string
+                        // For now, on error, we insert the alphabet
-                        // saying "<<[[there's no TM equivalent for
+                        // in a big font in TMW to try and get some
-                        // this, details are ...]]>>" (For now, I'm
+                        // attention.  We also put the oddballs at the
-                        // inserting the alphabet in a big font in TMW
+                        // start of the document.  But then we delete
-                        // to try and get some attention.  And I've
+                        // the alphabet usually.
                        // *documented* this on the website.  I'm also
                        // putting the oddballs at the start of the
                        // document, but I haven't documented that
                        // (FIXME).)
-                        errorReturn = true;
+                        ceh.errorReturn = true;
                        CharacterInAGivenFont cgf
                            = new CharacterInAGivenFont(getText(i,1), fontName);
                        if (!problemGlyphsTable.containsKey(cgf)) {
@ -784,7 +891,7 @@ public class TibetanDocument extends DefaultStyledDocument {
                                // Now also put this problem glyph at
                                // the beginning of the document:
                                equivalent[0].setData(getText(i,1), fontNum);
-                                insertDuff(72, errorGlyphLocation++,
+                                insertDuff(72, ceh.errorGlyphLocation++,
                                           equivalent, toUnicode || toTM);
                                ++i;
                            }
@ -798,20 +905,83 @@ public class TibetanDocument extends DefaultStyledDocument {
                            i += trickyTMW.length();
                        }
                    }
                } else {
                    if (debug) System.err.println("non-tm/tmw found at offset " + i + "; font=" + fontName + " ord " + (int)getText(i,1).charAt(0));
                    if (replacementQueue.length() > 0) {
                        if (!mustReplace) {
                            mustReplaceUntil = i;
                            mustReplace = true;
                        }
                    }
                }
                i++;
            }
            if (replacementQueue.length() > 0) {
                // this if-else statement is duplicated above; beware!
                int endIndex = mustReplace ? mustReplaceUntil : i;
                if (toUnicode) {
                    replaceDuffsWithUnicode(replacementFontSize,
                                            replacementStartIndex,
                                            endIndex,
                                            replacementQueue.toString());
                } else {
                    replaceDuffs(replacementFontSize,
                                 replacementStartIndex,
                                 endIndex,
                                 replacementQueue.toString(),
                                 replacementFontIndex,
                                 !toTM);
                }
            }
            if (!ThdlOptions.getBooleanOption("thdl.leave.bad.tm.tmw.conversions.in.place")) {
                // Remove all characters other than the oddballs:
-                if (errorGlyphLocation > 0) {
+                if (ceh.doErrorWrapup && ceh.errorGlyphLocation > 0) {
-                    remove(errorGlyphLocation, getLength()-errorGlyphLocation-1);
+                    remove(ceh.errorGlyphLocation, getLength()-ceh.errorGlyphLocation-1);
                }
            }
        } catch (BadLocationException ble) {
            ble.printStackTrace();
            ThdlDebug.noteIffyCode();
        }
-        return errorReturn;
+    }
    /** Collects, in document order, the paragraph elements holding
     *  the glyphs whose offsets lie in the range [start, end); a
     *  negative end stands for the document's length.  The walk hops
     *  from each paragraph's end offset to the next paragraph, and it
     *  also looks up the paragraph at offset end itself when the walk
     *  lands exactly there.  Roman, TM, Arial Unicode MS, and TMW
     *  text may all be mixed inside one paragraph; the paragraph is
     *  still the right unit to work with, since the next finer grain
     *  is roughly one Element per glyph. */
    private Element[] getParagraphs(int start, int end) {
        int last = (end < 0) ? getLength() : end;
        ArrayList found = new ArrayList();
        for (int pos = start; pos <= last; ) {
            Element paragraph = getParagraphElement(pos);
            found.add(paragraph);
            int paragraphEnd = paragraph.getEndOffset();
            // Always move forward, even when a paragraph reports its
            // end at the very offset we asked about.
            pos = (paragraphEnd == pos) ? paragraphEnd + 1 : paragraphEnd;
        }
        return (Element[])found.toArray(new Element[0]);
    }
 }
 /** Mutable bookkeeping that TibetanDocument.convertHelper(..) shares
  *  across its per-paragraph conversion passes. */
 class ConversionErrorHelper {
    /** set to true once a glyph could not be converted; this is what
     *  convertHelper finally returns */
    boolean errorReturn = false;
    /** one more than the location of the last error glyph, or zero if no
     *  error glyphs yet exist */
    int errorGlyphLocation = 0;
    /** set to true by convertHelper for the last paragraph; the
     *  end-of-conversion cleanup runs only when this is true */
    boolean doErrorWrapup = false;

    ConversionErrorHelper() {
        // Everything starts out cleared; see the field initializers.
    }
 }
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@ -212,6 +212,18 @@ public class TibetanMachineWeb implements THDLWylieConstants {
        readInFontFile("/Fonts/TibetanMachineWeb/timwn9.ttf");
    }
    /** Loads the five TibetanMachine (TM) font files, which are
     *  assumed to be resources associated with this class.
     *  @throws Error if that assumption does not hold */
    private static void readInTMFontFiles() {
        /* Each path is rooted, hence the leading slash. */
        String[] tmFontPaths = {
            "/Fonts/TibetanMachine/Timn.ttf",
            "/Fonts/TibetanMachine/Tims1.ttf",
            "/Fonts/TibetanMachine/Tims2.ttf",
            "/Fonts/TibetanMachine/Tims3.ttf",
            "/Fonts/TibetanMachine/Tims4.ttf"
        };
        for (int i = 0; i < tmFontPaths.length; i++) {
            readInFontFile(tmFontPaths[i]);
        }
    }
    /** Assumes that the TMW font file at the given path is a resource
     *  associated with this class and loads that font file.
     *  @param path a path within the JAR containing this class file
@ -263,6 +275,14 @@ public class TibetanMachineWeb implements THDLWylieConstants {
            readInFontFiles();
        }
        // DLC FIXME: include TM fonts with Jskad but not with other
        // packages.  Right now you can get them manually by editing
        // build.xml and your options.txt or my_thdl_preferences.txt
        // file.
        if (ThdlOptions.getBooleanOption("thdl.do.not.rely.on.system.tm.fonts")) {
            readInTMFontFiles();
        }
        unicodeFontAttributeSet = new SimpleAttributeSet();
        StyleConstants.setFontFamily(unicodeFontAttributeSet,
                                     ThdlOptions.getStringOption("thdl.tmw.to.unicode.font",
@ -972,27 +992,43 @@ private static final DuffCode TMW_tab = new DuffCode(1, '\t');
    valid.
    Only a few control characters are supported: '\r' (carriage
-    return), '\n' (line feed), and '\t' (tab).  */
+    return), '\n' (line feed), and '\t' (tab).
-public static DuffCode mapTMtoTMW(int font, int ordinal) {
+
    If suggestedFont is not zero, then any ordinals that are the same
    in all fonts ('\n', '-', ' ', '\r', and '\t') will be converted to
    the font named tmwFontNames[suggestedFont].
 */
 public static DuffCode mapTMtoTMW(int font, int ordinal, int suggestedFont) {
    if (font < 0 || font > 4)
        return null;
    if (ordinal > 255)
        return getUnusualTMtoTMW(font, ordinal);
    if (ordinal < 32) {
-        if (ordinal == (int)'\r')
+        if (ordinal == (int)'\r') {
-            return TMW_cr;
+            if (0 == suggestedFont)
-        else if (ordinal == (int)'\n')
+                return TMW_cr;
-            return TMW_lf;
+            else
-        else if (ordinal == (int)'\t')
+                return new DuffCode(suggestedFont, (char)ordinal);
-            return TMW_tab;
+        } else if (ordinal == (int)'\n') {
-        else {
+            if (0 == suggestedFont)
                return TMW_lf;
            else
                return new DuffCode(suggestedFont, (char)ordinal);
        } else if (ordinal == (int)'\t') {
            if (0 == suggestedFont)
                return TMW_tab;
            else
                return new DuffCode(suggestedFont, (char)ordinal);
        } else {
            // for robustness, just return font 1, char ordinal.
            ThdlDebug.noteIffyCode();
            return null;
        }
    }
-    DuffCode ans = TMtoTMW[font][ordinal-32];
+    if (0 != suggestedFont && 32 == ordinal || 45 == ordinal) {
-	return ans;
+        return new DuffCode(suggestedFont, (char)ordinal);
    }
 	return TMtoTMW[font][ordinal-32];
 }
 private static final DuffCode TM_cr = new DuffCode(1, '\r');
@ -1011,25 +1047,40 @@ private static final DuffCode TM_tab = new DuffCode(1, '\t');
    Only a few control characters are supported: '\r' (carriage
    return), '\n' (line feed), and '\t' (tab).
- */
+    If suggestedFont is not zero, then any ordinals that are the same
-public static DuffCode mapTMWtoTM(int font, int ordinal) {
+    in all fonts ('\n', '-', ' ', '\r', and '\t') will be converted to
    the font named tmwFontNames[suggestedFont].
 */
 public static DuffCode mapTMWtoTM(int font, int ordinal, int suggestedFont) {
    if (font < 0 || font > 9)
        return null;
    if (ordinal > 127)
        return null;
    if (ordinal < 32) {
-        if (ordinal == (int)'\r')
+        if (ordinal == (int)'\r') {
-            return TM_cr;
+            if (0 == suggestedFont)
-        else if (ordinal == (int)'\n')
+                return TM_cr;
-            return TM_lf;
+            else
-        else if (ordinal == (int)'\t')
+                return new DuffCode(suggestedFont, (char)ordinal);
-            return TM_tab;
+        } else if (ordinal == (int)'\n') {
-        else {
+            if (0 == suggestedFont)
                return TM_lf;
            else
                return new DuffCode(suggestedFont, (char)ordinal);
        } else if (ordinal == (int)'\t') {
            if (0 == suggestedFont)
                return TM_tab;
            else
                return new DuffCode(suggestedFont, (char)ordinal);
        } else {
            // for robustness, just return font 1, char ordinal.
            ThdlDebug.noteIffyCode();
            return null;
        }
    }
    if (0 != suggestedFont && 32 == ordinal || 45 == ordinal) {
        return new DuffCode(suggestedFont, (char)ordinal);
    }
    DuffCode ans = TMWtoTM[font][ordinal-32];
 	return ans;
 }
@ -1041,7 +1092,7 @@ public static void main(String[] args) {
    count = 0;
    for (font = 0; font < 5; font++) {
        for (ord = 32; ord < 255; ord++) {
-            if (mapTMtoTMW(font, ord) != null) {
+            if (mapTMtoTMW(font, ord, 0) != null) {
                count++;
            }
        }
@ -1051,7 +1102,7 @@ public static void main(String[] args) {
    count = 0;
    for (font = 0; font < 10; font++) {
        for (ord = 32; ord < 127; ord++) {
-            if (mapTMWtoTM(font, ord) != null) {
+            if (mapTMWtoTM(font, ord, 0) != null) {
                count++;
            }
        }
@ -1062,7 +1113,7 @@ public static void main(String[] args) {
    for (font = 0; font < 10; font++) {
        for (ord = 32; ord < 127; ord++) {
            DuffCode dc;
-            if ((dc = mapTMWtoTM(font, ord)) != null) {
+            if ((dc = mapTMWtoTM(font, ord, 0)) != null) {
                System.out.println(dc.getCharNum() + " "
                                   + (dc.getFontNum()-1) + " "
                                   + font + " "
@ -1075,7 +1126,7 @@ public static void main(String[] args) {
    for (font = 0; font < 5; font++) {
        for (ord = 32; ord < 255; ord++) {
            DuffCode dc;
-            if ((dc = mapTMtoTMW(font, ord)) != null) {
+            if ((dc = mapTMtoTMW(font, ord, 0)) != null) {
                System.out.println(ord + " " + font + " "
                                   + (dc.getFontNum()-1) + " "
                                   + dc.getCharNum());