From 1f4343bed01a8924f948fb7d0eb711e418557874 Mon Sep 17 00:00:00 2001 From: dchandler Date: Sun, 22 Jun 2003 22:10:58 +0000 Subject: [PATCH] TMW->TM, TM->TMW, and TMW->Unicode conversions are all (at least 2) orders of magnitude faster. --- source/org/thdl/tib/text/TibetanDocument.java | 302 ++++++++++++++---- .../org/thdl/tib/text/TibetanMachineWeb.java | 99 ++++-- 2 files changed, 311 insertions(+), 90 deletions(-) diff --git a/source/org/thdl/tib/text/TibetanDocument.java b/source/org/thdl/tib/text/TibetanDocument.java index 3eee28a..b1ef4ff 100644 --- a/source/org/thdl/tib/text/TibetanDocument.java +++ b/source/org/thdl/tib/text/TibetanDocument.java @@ -124,7 +124,7 @@ public class TibetanDocument extends DefaultStyledDocument { rtf.write(out, this, 0, getLength()); } catch (BadLocationException ble) { - ThdlDebug.noteIffyCode(); + throw new Error("Cannot write RTF output; [0, " + getLength() + ") constitutes a bad position."); } } @@ -167,30 +167,49 @@ public class TibetanDocument extends DefaultStyledDocument { fontSize. */ private void replaceDuff(int fontSize, int pos, DuffData glyph, boolean asTMW) { + replaceDuffs(fontSize, pos, pos + 1, glyph.text, + glyph.font, asTMW); + } + + /** Replacing can be more efficient than inserting and then + removing. This replaces the glyphs at position [startOffset, + endOffset) with data, which is interpreted as TMW glyphs if + asTMW is true and as TM glyphs otherwise. The font size for + the new glyph is fontSize; the particular TM or TMW font is + specified by newFontIndex, which is one-based, not + zero-based. */ + private void replaceDuffs(int fontSize, int startOffset, + int endOffset, String data, + int newFontIndex, boolean asTMW) { MutableAttributeSet mas = ((asTMW) - ? TibetanMachineWeb.getAttributeSet(glyph.font) - : TibetanMachineWeb.getAttributeSetTM(glyph.font)); + ? 
TibetanMachineWeb.getAttributeSet(newFontIndex) + : TibetanMachineWeb.getAttributeSetTM(newFontIndex)); StyleConstants.setFontSize(mas, fontSize); try { - replace(pos, 1, glyph.text, mas); + replace(startOffset, endOffset - startOffset, data, mas); } catch (BadLocationException ble) { ThdlDebug.noteIffyCode(); } } /** Replacing can be more efficient than inserting and then - removing. This replaces the glyph at position pos with - unicode. The font size for the new unicode is fontSize. */ - private void replaceDuffWithUnicode(int fontSize, int pos, - String unicode) { + removing. This replaces the glyphs at position [startOffset, + endOffset) with unicode. The font size for the new unicode is + fontSize. Which particular Unicode font is used depends on + TibetanMachineWeb.getUnicodeAttributeSet(). + + @see TibetanMachineWeb#getUnicodeAttributeSet() + */ + private void replaceDuffsWithUnicode(int fontSize, int startOffset, + int endOffset, String unicode) { MutableAttributeSet mas = TibetanMachineWeb.getUnicodeAttributeSet(); StyleConstants.setFontSize(mas, fontSize); try { - replace(pos, 1, unicode, mas); + replace(startOffset, endOffset - startOffset, unicode, mas); } catch (BadLocationException ble) { - ThdlDebug.noteIffyCode(); + throw new Error("TMW->Unicode failed because the following constitute a bad position: startOffset " + startOffset + ", endOffset " + endOffset); } } @@ -539,7 +558,7 @@ public class TibetanDocument extends DefaultStyledDocument { int count = 0; for (font = 0; font < 5; font++) { for (ord = 32; ord < 255; ord++) { - if (TibetanMachineWeb.mapTMtoTMW(font, ord) != null) { + if (TibetanMachineWeb.mapTMtoTMW(font, ord, 0) != null) { equivalent[0].setData((char)ord, font + 1); try { insertDuff(tibetanFontSize, count++, equivalent, false); @@ -672,22 +691,91 @@ public class TibetanDocument extends DefaultStyledDocument { @see convertToTM(int,int) */ private boolean convertHelper(int begin, int end, boolean toTM, boolean toUnicode, StringBuffer 
errors) { + // To preserve formatting, we go paragraph by paragraph. + + // Use positions, not offsets, because our work on paragraph K + // will affect the offsets of paragraph K+1. + + Position finalEndPos; + if (end < 0) { + end = getLength(); + } + Element[] paragraphs = getParagraphs(begin, end); + try { + finalEndPos = createPosition(end); + } catch (BadLocationException e) { + throw new Error("BAD LOCATION DURING CONVERSION"); + } + + ConversionErrorHelper ceh = new ConversionErrorHelper(); + int pl = paragraphs.length; + for (int i = 0; i < pl; i++) { + int p_end = paragraphs[i].getEndOffset(); + if (i + 1 == paragraphs.length) + ceh.doErrorWrapup = true; + convertHelperHelper(paragraphs[i].getStartOffset(), + ((finalEndPos.getOffset() < p_end) + ? finalEndPos.getOffset() + : p_end), + toTM, toUnicode, errors, ceh); + + // Now that we've changed paragraph i, recalculate + // paragraphs. (PERFORMANCE FIXME: this is O(N*N), and we + // could make it O(N) by calculating just one paragraph at + // a time.) + paragraphs = getParagraphs(begin, finalEndPos.getOffset()); + if (paragraphs.length != pl) + throw new Error("Conversion failed: the number of paragraphs changed, indicating that formatting was lost."); + } + return ceh.errorReturn; + } + + /** See the sole caller, convertHelper. */ + private void convertHelperHelper(int begin, int end, boolean toTM, + boolean toUnicode, StringBuffer errors, + ConversionErrorHelper ceh) { + final boolean debug = false; + if (debug) + System.err.println("cHH: [" + begin + ", " + end + ")"); + // DLC FIXME: here's an idea, a compressor -- use the '-' (ord + // 45) or ' ' (ord 32) glyph from the same font as the + // preceding glyph, never others. This reduces the size of a + // TMW RTF file by a factor of 3 sometimes. To do it, use + // this routine, but give it the ability to go from TMW->TMW + // and TM->TM. 
+ // toTM is ignored when toUnicode is true: ThdlDebug.verify(!toUnicode || !toTM); boolean toStdout = ThdlOptions.getBooleanOption("thdl.debug"); - boolean errorReturn = false; if (end < 0) end = getLength(); if (begin >= end) - return errorReturn; // nothing to do, so no errors in the doing. - int i = begin; - HashMap problemGlyphsTable = new HashMap(); + return; // nothing to do + + // For speed, do as few replaces as possible. To preserve + // formatting, we'll try to replace one paragraph at a time. + // But we *must* replace when we hit a different font (TMW3 as + // opposed to TMW2, e.g.), so we'll likely replace many times + // per paragraph. One very important optimization is that we + // don't have to treat TMW3.45 or TMW3.32 as a different font + // than TMW.33 -- that's because each of the ten TMW fonts has + // the same glyph at position 32 (space) and the same glyph at + // position 45 (tsheg). Note that we're building up a big + // ArrayList; we're trading space for time. try { + int replacementStartIndex = begin; + StringBuffer replacementQueue = new StringBuffer(); + int replacementFontIndex = 0; + int replacementFontSize = -1; + + int i = begin; + HashMap problemGlyphsTable = new HashMap(); Position endPos = createPosition(end); DuffData[] equivalent = new DuffData[1]; equivalent[0] = new DuffData(); - int errorGlyphLocation = 0; + boolean mustReplace = false; + int mustReplaceUntil = -1; while (i < endPos.getOffset()) { AttributeSet attr = getCharacterElement(i).getAttributes(); String fontName = StyleConstants.getFontFamily(attr); @@ -697,6 +785,13 @@ public class TibetanDocument extends DefaultStyledDocument { : TibetanMachineWeb.getTMFontNumber(fontName)); if (0 != fontNum) { + + // SPEED_FIXME: determining font size might be slow, allow an override. 
+ int fontSize = tibetanFontSize; + try { + fontSize = ((Integer)getCharacterElement(i).getAttributes().getAttribute(StyleConstants.FontSize)).intValue(); + } catch (Exception e) { /* leave it as tibetanFontSize */ } + DuffCode dc = null; String unicode = null; if (toUnicode) { @@ -705,63 +800,75 @@ public class TibetanDocument extends DefaultStyledDocument { } else { if (toTM) { dc = TibetanMachineWeb.mapTMWtoTM(fontNum - 1, - getText(i,1).charAt(0)); + getText(i,1).charAt(0), + replacementFontIndex); } else { dc = TibetanMachineWeb.mapTMtoTMW(fontNum - 1, - getText(i,1).charAt(0)); + getText(i,1).charAt(0), + replacementFontIndex); } } - if (null != dc || null != unicode) { - // SPEED_FIXME: determining font size might be slow - int fontSize = tibetanFontSize; - try { - fontSize = ((Integer)getCharacterElement(i).getAttributes().getAttribute(StyleConstants.FontSize)).intValue(); - } catch (Exception e) { - // leave it as tibetanFontSize - } + if (replacementQueue.length() > 0 + && (mustReplace + || ((!toUnicode + && null != dc + && dc.getFontNum() != replacementFontIndex) + || fontSize != replacementFontSize))) { + // We must replace now, because the attribute + // set has changed. - if (!toUnicode) { - equivalent[0].setData(dc.getCharacter(), - dc.getFontNum()); - } + // We have two choices: replace or + // insert-and-remove. We replace, because + // that preserves formatting. - // We have two choices: remove-then-insert - // second vs. insert-then-remove and also - // insert-before vs. insert-after. It turns - // out that insert-after preserves formatting - // whereas insert-before doesn't. And we do - // insert-then-remove because we're guessing - // that helps with formatting too. - if (replaceInsteadOfInserting()) { - if (toUnicode) { - replaceDuffWithUnicode(fontSize, i, unicode); - i += unicode.length() - 1; // we do i++ below - } else { - replaceDuff(fontSize, i, equivalent[0], !toTM); - } + // this if-else statement is duplicated below; beware! 
+ int endIndex = mustReplace ? mustReplaceUntil : i; + if (toUnicode) { + replaceDuffsWithUnicode(replacementFontSize, + replacementStartIndex, + endIndex, + replacementQueue.toString()); } else { - if (toUnicode) - throw new Error("Please execute 'Clear Preferences' and retry -- you've caused us to go into insert-and-remove mode, as opposed to replace mode, and replacing, the default preference, is the only way you can get TMW->Unicode conversion at present."); - if (insertBefore()) { - insertDuff(fontSize, i, equivalent, !toTM); - remove(i+1, 1); - } else { - insertDuff(fontSize, i+1, equivalent, !toTM); - remove(i, 1); + replaceDuffs(replacementFontSize, + replacementStartIndex, + endIndex, + replacementQueue.toString(), + replacementFontIndex, + !toTM); + } + + // i += numnewchars - numoldchars; + if (debug) + System.err.println("Incrementing i by " + (replacementQueue.length() + - (endIndex - replacementStartIndex)) + "; replaced a patch with font size " + replacementFontSize + ", fontindex " + replacementFontIndex); + i += (replacementQueue.length() + - (endIndex - replacementStartIndex)); + + replacementQueue.delete(0, replacementQueue.length()); + mustReplace = false; + } + + if (null != dc || null != unicode) { + if (0 == replacementQueue.length()) { + replacementFontSize = fontSize; + replacementStartIndex = i; + if (!toUnicode) { + replacementFontIndex = dc.getFontNum(); } } + if (toUnicode) { + replacementQueue.append(unicode); + } else { + replacementQueue.append(dc.getCharacter()); + } } else { - // DLC FIXME: insert into document a string - // saying "<<[[there's no TM equivalent for - // this, details are ...]]>>" (For now, I'm - // inserting the alphabet in a big font in TMW - // to try and get some attention. And I've - // *documented* this on the website. I'm also - // putting the oddballs at the start of the - // document, but I haven't documented that - // (FIXME).) 
+ // For now, on error, we insert the alphabet + // in a big font in TMW to try and get some + // attention. We also put the oddballs at the + // start of the document. But then we delete + // the alphabet usually. - errorReturn = true; + ceh.errorReturn = true; CharacterInAGivenFont cgf = new CharacterInAGivenFont(getText(i,1), fontName); if (!problemGlyphsTable.containsKey(cgf)) { @@ -784,7 +891,7 @@ public class TibetanDocument extends DefaultStyledDocument { // Now also put this problem glyph at // the beginning of the document: equivalent[0].setData(getText(i,1), fontNum); - insertDuff(72, errorGlyphLocation++, + insertDuff(72, ceh.errorGlyphLocation++, equivalent, toUnicode || toTM); ++i; } @@ -798,20 +905,83 @@ public class TibetanDocument extends DefaultStyledDocument { i += trickyTMW.length(); } } + } else { + if (debug) System.err.println("non-tm/tmw found at offset " + i + "; font=" + fontName + " ord " + (int)getText(i,1).charAt(0)); + if (replacementQueue.length() > 0) { + if (!mustReplace) { + mustReplaceUntil = i; + mustReplace = true; + } + } } i++; } + if (replacementQueue.length() > 0) { + // this if-else statement is duplicated above; beware! + int endIndex = mustReplace ? 
mustReplaceUntil : i; + if (toUnicode) { + replaceDuffsWithUnicode(replacementFontSize, + replacementStartIndex, + endIndex, + replacementQueue.toString()); + } else { + replaceDuffs(replacementFontSize, + replacementStartIndex, + endIndex, + replacementQueue.toString(), + replacementFontIndex, + !toTM); + } + } if (!ThdlOptions.getBooleanOption("thdl.leave.bad.tm.tmw.conversions.in.place")) { // Remove all characters other than the oddballs: - if (errorGlyphLocation > 0) { - remove(errorGlyphLocation, getLength()-errorGlyphLocation-1); + if (ceh.doErrorWrapup && ceh.errorGlyphLocation > 0) { + remove(ceh.errorGlyphLocation, getLength()-ceh.errorGlyphLocation-1); } } } catch (BadLocationException ble) { ble.printStackTrace(); ThdlDebug.noteIffyCode(); } - return errorReturn; + } + + /** Returns all the paragraph elements in this document that + * contain glyphs with offsets in the range [start, end) where + * end < 0 is treated as the document's length. Note that roman, + * TM, Arial Unicode MS, and TMW text can all be intermingled + * within a paragraph. It's the correct level of abstraction to + * use, however, because the next finer grain is roughly one + * Element per glyph. */ + private Element[] getParagraphs(int start, int end) { + if (end < 0) + end = getLength(); + Element arrayType[] = new Element[0]; + ArrayList v = new ArrayList(); + int pos = start; + while (pos <= end) { + Element pe = getParagraphElement(pos); + v.add(pe); + if (pe.getEndOffset() == pos) + pos = pe.getEndOffset() + 1; + else + pos = pe.getEndOffset(); + } + return (Element[])v.toArray(arrayType); + } + +} + +/** A helper class used by TibetanDocument.convertHelper(..). 
*/ +class ConversionErrorHelper { + boolean errorReturn; + /** one more than the location of the last error glyph, or zero if no + * error glyphs yet exist */ + int errorGlyphLocation; + boolean doErrorWrapup; + ConversionErrorHelper() { + errorReturn = false; + errorGlyphLocation = 0; + doErrorWrapup = false; } } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index 7a7d0ab..4374684 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -212,6 +212,18 @@ public class TibetanMachineWeb implements THDLWylieConstants { readInFontFile("/Fonts/TibetanMachineWeb/timwn9.ttf"); } + /** Assumes that the TM font files are resources associated with + * this class and loads those font files. + * @throws Error if that assumption does not hold */ + private static void readInTMFontFiles() { + /* Note the leading slashes on these paths: */ + readInFontFile("/Fonts/TibetanMachine/Timn.ttf"); + readInFontFile("/Fonts/TibetanMachine/Tims1.ttf"); + readInFontFile("/Fonts/TibetanMachine/Tims2.ttf"); + readInFontFile("/Fonts/TibetanMachine/Tims3.ttf"); + readInFontFile("/Fonts/TibetanMachine/Tims4.ttf"); + } + /** Assumes that the TMW font file at the given path is a resource * associated with this class and loads that font file. * @param path a path within the JAR containing this class file @@ -263,6 +275,14 @@ public class TibetanMachineWeb implements THDLWylieConstants { readInFontFiles(); } + // DLC FIXME: include TM fonts with Jskad but not with other + // packages. Right now you can get them manually by editing + // build.xml and your options.txt or my_thdl_preferences.txt + // file. 
+ if (ThdlOptions.getBooleanOption("thdl.do.not.rely.on.system.tm.fonts")) { + readInTMFontFiles(); + } + unicodeFontAttributeSet = new SimpleAttributeSet(); StyleConstants.setFontFamily(unicodeFontAttributeSet, ThdlOptions.getStringOption("thdl.tmw.to.unicode.font", @@ -972,27 +992,43 @@ private static final DuffCode TMW_tab = new DuffCode(1, '\t'); valid. Only a few control characters are supported: '\r' (carriage - return), '\n' (line feed), and '\t' (tab). */ -public static DuffCode mapTMtoTMW(int font, int ordinal) { + return), '\n' (line feed), and '\t' (tab). + + If suggestedFont is not zero, then any ordinals that are the same + in all fonts ('\n', '-', ' ', '\r', and '\t') will be converted to + the font named tmwFontNames[suggestedFont]. +*/ +public static DuffCode mapTMtoTMW(int font, int ordinal, int suggestedFont) { if (font < 0 || font > 4) return null; if (ordinal > 255) return getUnusualTMtoTMW(font, ordinal); if (ordinal < 32) { - if (ordinal == (int)'\r') - return TMW_cr; - else if (ordinal == (int)'\n') - return TMW_lf; - else if (ordinal == (int)'\t') - return TMW_tab; - else { + if (ordinal == (int)'\r') { + if (0 == suggestedFont) + return TMW_cr; + else + return new DuffCode(suggestedFont, (char)ordinal); + } else if (ordinal == (int)'\n') { + if (0 == suggestedFont) + return TMW_lf; + else + return new DuffCode(suggestedFont, (char)ordinal); + } else if (ordinal == (int)'\t') { + if (0 == suggestedFont) + return TMW_tab; + else + return new DuffCode(suggestedFont, (char)ordinal); + } else { // for robustness, just return font 1, char ordinal.
ThdlDebug.noteIffyCode(); return null; } } - DuffCode ans = TMtoTMW[font][ordinal-32]; - return ans; + if (0 != suggestedFont && (32 == ordinal || 45 == ordinal)) { /* parenthesized on purpose: && binds tighter than ||, so without the parentheses ordinal 45 would be returned in font suggestedFont even when suggestedFont is 0 (no suggestion) */ + return new DuffCode(suggestedFont, (char)ordinal); + } + return TMtoTMW[font][ordinal-32]; } private static final DuffCode TM_cr = new DuffCode(1, '\r'); @@ -1011,25 +1047,40 @@ private static final DuffCode TM_tab = new DuffCode(1, '\t'); Only a few control characters are supported: '\r' (carriage return), '\n' (line feed), and '\t' (tab). - */ -public static DuffCode mapTMWtoTM(int font, int ordinal) { + If suggestedFont is not zero, then any ordinals that are the same + in all fonts ('\n', '-', ' ', '\r', and '\t') will be converted to + the font named tmwFontNames[suggestedFont]. +*/ +public static DuffCode mapTMWtoTM(int font, int ordinal, int suggestedFont) { if (font < 0 || font > 9) return null; if (ordinal > 127) return null; if (ordinal < 32) { - if (ordinal == (int)'\r') - return TM_cr; - else if (ordinal == (int)'\n') - return TM_lf; - else if (ordinal == (int)'\t') - return TM_tab; - else { + if (ordinal == (int)'\r') { + if (0 == suggestedFont) + return TM_cr; + else + return new DuffCode(suggestedFont, (char)ordinal); + } else if (ordinal == (int)'\n') { + if (0 == suggestedFont) + return TM_lf; + else + return new DuffCode(suggestedFont, (char)ordinal); + } else if (ordinal == (int)'\t') { + if (0 == suggestedFont) + return TM_tab; + else + return new DuffCode(suggestedFont, (char)ordinal); + } else { // for robustness, just return font 1, char ordinal.
ThdlDebug.noteIffyCode(); return null; } } + if (0 != suggestedFont && (32 == ordinal || 45 == ordinal)) { /* parenthesized on purpose: && binds tighter than ||, so without the parentheses ordinal 45 would be returned in font suggestedFont even when suggestedFont is 0 (no suggestion) */ + return new DuffCode(suggestedFont, (char)ordinal); + } DuffCode ans = TMWtoTM[font][ordinal-32]; return ans; } @@ -1041,7 +1092,7 @@ public static void main(String[] args) { count = 0; for (font = 0; font < 5; font++) { for (ord = 32; ord < 255; ord++) { - if (mapTMtoTMW(font, ord) != null) { + if (mapTMtoTMW(font, ord, 0) != null) { count++; } } @@ -1051,7 +1102,7 @@ public static void main(String[] args) { count = 0; for (font = 0; font < 10; font++) { for (ord = 32; ord < 127; ord++) { - if (mapTMWtoTM(font, ord) != null) { + if (mapTMWtoTM(font, ord, 0) != null) { count++; } } @@ -1062,7 +1113,7 @@ public static void main(String[] args) { for (font = 0; font < 10; font++) { for (ord = 32; ord < 127; ord++) { DuffCode dc; - if ((dc = mapTMWtoTM(font, ord)) != null) { + if ((dc = mapTMWtoTM(font, ord, 0)) != null) { System.out.println(dc.getCharNum() + " " + (dc.getFontNum()-1) + " " + font + " " @@ -1075,7 +1126,7 @@ public static void main(String[] args) { for (font = 0; font < 5; font++) { for (ord = 32; ord < 255; ord++) { DuffCode dc; - if ((dc = mapTMtoTMW(font, ord)) != null) { + if ((dc = mapTMtoTMW(font, ord, 0)) != null) { System.out.println(ord + " " + font + " " + (dc.getFontNum()-1) + " " + dc.getCharNum());