From 0f724989b50488993567d689607df16778362da3 Mon Sep 17 00:00:00 2001 From: dchandler Date: Sun, 1 Jun 2003 23:05:32 +0000 Subject: [PATCH] The Wylie 'M' used to map to TMW7.91, when it should map to TMW7.90. I've fixed that. I've also added a couple of Unicode mappings to give a flavor for how multi-codepoint mappings will be represented. TM->TMW conversion takes about 1 second per thousand glyphs on my PIII-550. --- source/org/thdl/tib/input/Jskad.java | 29 ++++- .../thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java | 12 +- source/org/thdl/tib/text/TibetanDocument.java | 115 +++++++++++------ .../org/thdl/tib/text/TibetanMachineWeb.java | 122 +++++++++++------- 4 files changed, 184 insertions(+), 94 deletions(-) diff --git a/source/org/thdl/tib/input/Jskad.java b/source/org/thdl/tib/input/Jskad.java index 4d6cd01..bbf6d79 100644 --- a/source/org/thdl/tib/input/Jskad.java +++ b/source/org/thdl/tib/input/Jskad.java @@ -327,14 +327,39 @@ public class Jskad extends JPanel implements DocumentListener { JMenuItem toTMItem = new JMenuItem("Convert TMW to TM"); // DLC FIXME: do it just in the selection? toTMItem.addActionListener(new ThdlActionListener() { public void theRealActionPerformed(ActionEvent e) { - ((TibetanDocument)dp.getDocument()).convertToTM(0, -1); // entire document + StringBuffer errors = new StringBuffer(); + boolean errorReturn + = ((TibetanDocument)dp.getDocument()).convertToTM(0, -1, errors); // entire document + if (errorReturn) { + JOptionPane.showMessageDialog(Jskad.this, + "At least one error occurred while converting Tibetan Machine Web\nto Tibetan Machine. 
Your document is mostly converted,\nexcept for the glyphs found after the 72-point Tibetan Machine Web\n30-letter alphabet.\nThe following glyphs were problems:\n" + + errors.toString(), + "TMW to TM Errors", + JOptionPane.PLAIN_MESSAGE); + } else { + JOptionPane.showMessageDialog(Jskad.this, + "Converting Tibetan Machine Web to Tibetan Machine met with perfect success.", + "Success", JOptionPane.PLAIN_MESSAGE); + } } }); JMenuItem toTMWItem = new JMenuItem("Convert TM to TMW"); // DLC FIXME: do it just in the selection? toTMWItem.addActionListener(new ThdlActionListener() { public void theRealActionPerformed(ActionEvent e) { - ((TibetanDocument)dp.getDocument()).convertToTMW(0, -1); // entire document + StringBuffer errors = new StringBuffer(); + boolean errorReturn + = ((TibetanDocument)dp.getDocument()).convertToTMW(0, -1, errors); // entire document + if (errorReturn) { + JOptionPane.showMessageDialog(Jskad.this, + "At least one error occurred while converting Tibetan Machine\nto Tibetan Machine Web. Your document is mostly converted,\nexcept for the glyphs found after the 72-point Tibetan Machine Web\n30-letter alphabet.\nThe following glyphs were problems:\n" + + errors.toString(), + "TM to TMW Errors", JOptionPane.PLAIN_MESSAGE); + } else { + JOptionPane.showMessageDialog(Jskad.this, + "Converting Tibetan Machine to Tibetan Machine Web met with perfect success.", + "Success", JOptionPane.PLAIN_MESSAGE); + } } }); toolsMenu.addSeparator(); diff --git a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java index 78453a1..bcb637a 100644 --- a/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java +++ b/source/org/thdl/tib/input/TMW_RTF_TO_THDL_WYLIE.java @@ -89,7 +89,8 @@ public class TMW_RTF_TO_THDL_WYLIE { out.println(" file. 
Writes the THDL Extended Wylie transliteration of that file [in"); out.println(" --to-wylie mode] or the TibetanMachine equivalent of that file [in"); out.println(" --to-tibetan-machine mode] to standard output after dealing with the curly"); - out.println(" brace problem. Exit code is zero on success, nonzero otherwise."); + out.println(" brace problem. Exit code is zero on success, 42 if some TibetanMachine glyphs"); + out.println(" couldn't be understood (though output is still given), nonzero otherwise."); out.println(""); out.println(" You may find it helpful to use `--find-some-non-tmw' mode before doing a"); out.println(" conversion so that you have confidence in the conversion's correctness."); @@ -126,7 +127,8 @@ public class TMW_RTF_TO_THDL_WYLIE { } else { // conversion {to Wylie or TM} mode // Fix curly braces in the entire document: ((TibetanDocument)dp.getDocument()).replaceTahomaCurlyBracesAndBackslashes(0, -1); - + + int exitCode = 0; if (convertToWylieMode) { ThdlDebug.verify(!convertToTMMode); // Convert to THDL Wylie: @@ -134,14 +136,14 @@ public class TMW_RTF_TO_THDL_WYLIE { } else { ThdlDebug.verify(convertToTMMode); // Convert to TibetanMachine: - ((TibetanDocument)dp.getDocument()).convertToTM(0, dp.getDocument().getLength()); + if (((TibetanDocument)dp.getDocument()).convertToTM(0, dp.getDocument().getLength(), null)) + exitCode = 42; /* convertToTM returns true when at least one glyph failed to convert */ } // Write to standard output the result: ((TibetanDocument)dp.getDocument()).writeRTFOutputStream(out); - // Exit normally: - return 0; + return exitCode; } } catch (ThdlLazyException e) { out.println("TMW_RTF_TO_THDL_WYLIE has a BUG:"); diff --git a/source/org/thdl/tib/text/TibetanDocument.java b/source/org/thdl/tib/text/TibetanDocument.java index 9792637..6f94f7e 100644 --- a/source/org/thdl/tib/text/TibetanDocument.java +++ b/source/org/thdl/tib/text/TibetanDocument.java @@ -25,6 +25,7 @@ import javax.swing.text.rtf.RTFEditorKit; import java.io.*; import org.thdl.util.ThdlDebug; +import 
org.thdl.util.ThdlOptions; /** Represents a character meant to be rendered in a certain font. * @author David Chandler @@ -379,6 +380,7 @@ public class TibetanDocument extends DefaultStyledDocument { break; } if (null != toReplaceWith) { + // SPEED_FIXME: determining font size might be slow int fontSize = tibetanFontSize; try { fontSize = ((Integer)getCharacterElement(i).getAttributes().getAttribute(StyleConstants.FontSize)).intValue(); @@ -403,9 +405,14 @@ public class TibetanDocument extends DefaultStyledDocument { the document. Be sure to set the size for Tibetan as you like it before using this (well, it usually gets it right on its own, but just in case). SPEED_FIXME: might be faster to run - over the elements, if they are one per font. */ - public void convertToTM(int begin, int end) { - convertTMW_TM(begin, end, true); + over the elements, if they are one per font. + @return false on 100% success, true if any exceptional case + was encountered + @param errors if non-null, then notes about all exceptional + cases will be appended to this StringBuffer + */ + public boolean convertToTM(int begin, int end, StringBuffer errors) { + return convertTMW_TM(begin, end, true, errors); } /** Converts all TibetanMachine glyphs in the document to @@ -414,24 +421,38 @@ public class TibetanDocument extends DefaultStyledDocument { the end of the document. Be sure to set the size for Tibetan as you like it before using this (well, it usually gets it right on its own, but just in case). SPEED_FIXME: might be - faster to run over the elements, if they are one per font. */ - public void convertToTMW(int begin, int end) { - convertTMW_TM(begin, end, false); + faster to run over the elements, if they are one per font. 
+ @return false on 100% success, true if any exceptional case + was encountered + @param errors if non-null, then notes about all exceptional + cases will be appended to this StringBuffer + */ + public boolean convertToTMW(int begin, int end, StringBuffer errors) { + return convertTMW_TM(begin, end, false, errors); } /** Helper function. + @param errors if non-null, then notes about all exceptional + cases will be appended to this StringBuffer + @return false on 100% success, true if any exceptional case + was encountered @see convertToTMW(int,int) @see convertToTM(int,int) */ - private void convertTMW_TM(int begin, int end, boolean toTM) { + private boolean convertTMW_TM(int begin, int end, boolean toTM, + StringBuffer errors) { + boolean toStdout = ThdlOptions.getBooleanOption("thdl.debug"); + boolean errorReturn = false; if (end < 0) end = getLength(); if (begin >= end) - return; + return errorReturn; // nothing to do, so no errors in the doing. int i = begin; + HashMap problemGlyphsTable = new HashMap(); try { + Position endPos = createPosition(end); DuffData[] equivalent = new DuffData[1]; equivalent[0] = new DuffData(); - while (i < end) { + while (i < endPos.getOffset()) { AttributeSet attr = getCharacterElement(i).getAttributes(); String fontName = StyleConstants.getFontFamily(attr); int fontNum @@ -441,50 +462,65 @@ public class TibetanDocument extends DefaultStyledDocument { if (0 != fontNum) { DuffCode dc = null; - try { - if (toTM) { - dc = TibetanMachineWeb.mapTMWtoTM(fontNum - 1, - getText(i,1).charAt(0)); - } else { - dc = TibetanMachineWeb.mapTMtoTMW(fontNum - 1, - getText(i,1).charAt(0)); - } - if (null != dc) { - equivalent[0].setData(dc.getCharacter(), - dc.getFontNum()); - } - } catch (ArrayIndexOutOfBoundsException e) { - // we handle this below... - System.out.println("FIXME: " - + (toTM ? 
"TMW->TM" : "TM->TMW") - + " conversion is in trouble"); - System.out.println("font is " + (fontNum - 1) - + ", char is " - + (int)getText(i,1).charAt(0) - + "; pos is " + i); - ThdlDebug.noteIffyCode(); + if (toTM) { + dc = TibetanMachineWeb.mapTMWtoTM(fontNum - 1, + getText(i,1).charAt(0)); + } else { + dc = TibetanMachineWeb.mapTMtoTMW(fontNum - 1, + getText(i,1).charAt(0)); } if (null != dc) { + equivalent[0].setData(dc.getCharacter(), + dc.getFontNum()); + // SPEED_FIXME: determining font size might be slow int fontSize = tibetanFontSize; try { fontSize = ((Integer)getCharacterElement(i).getAttributes().getAttribute(StyleConstants.FontSize)).intValue(); } catch (Exception e) { // leave it as tibetanFontSize } - insertDuff(fontSize, i, equivalent, !toTM); - remove(i+1, 1); + // We have two choices: remove-then-insert + // second vs. insert-then-remove and also + // insert-before vs. insert-after. It turns + // out that insert-after preserves formatting + // whereas insert-before doesn't. And we do + // insert-then-remove because we're guessing + // that helps with formatting too. + insertDuff(fontSize, i+1, equivalent, !toTM); + remove(i, 1); } else { // DLC FIXME: insert into document a string - // saying "there's no TM equivalent for this." - // (For now, I'm inserting the alphabet and - // all the numbers in a big font in TMW to try - // and get some attention. And I've + // saying "<<[[there's no TM equivalent for + // this, details are ...]]>>" (For now, I'm + // inserting the alphabet in a big font in TMW + // to try and get some attention. And I've // *documented* this on the website.) + + errorReturn = true; + CharacterInAGivenFont cgf + = new CharacterInAGivenFont(getText(i,1), fontName); + if (!problemGlyphsTable.containsKey(cgf)) { + problemGlyphsTable.put(cgf, "yes this character appears once"); + if (null != errors) { + String err + = (toTM ? 
"TMW->TM" : "TM->TMW") + + " conversion failed for a glyph:\nFont is " + + fontName + ", glyph number is " + + (int)getText(i,1).charAt(0) + + "; first position found (from zero) is " + + i + "\n"; + errors.append(err); + if (toStdout) { + System.out.print(err); + } + } + } + String trickyTMW - = "!-\"-#-,-%-&-'-(-)-*-+-,-.-/-0-1-2-3-4-5-6-7-8-9-:-;-<-=->-?-0-1-2-3-4-5-6-7-8-9-"; + = "!-\"-#-$-%-&-'-(-)-*-+-,-.-/-0-1-2-3-4-5-6-7-8-9-:-;-<-=->-?-"; equivalent[0] = new DuffData(trickyTMW, 1); insertDuff(72, i, equivalent, true); - i += trickyTMW.length() + 1; + i += trickyTMW.length(); } } i++; @@ -493,5 +529,6 @@ public class TibetanDocument extends DefaultStyledDocument { ble.printStackTrace(); ThdlDebug.noteIffyCode(); } + return errorReturn; } } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index cb154e8..915c8f5 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -893,17 +893,16 @@ private static final DuffCode TMW_tab = new DuffCode(1, '\t'); Null is never returned for an existing TibetanMachine glyph, because every TibetanMachine glyph has a corresponding - TibetanMachineWeb glyph. But if (font, ord) doesn't correspond to - an existing TibetanMachine glyph, null is returned. In general, - though, this method may raise a runtime exception if you pass in a - (font, ord) that doesn't correspond to an existing TibetanMachine - glyph. + TibetanMachineWeb glyph. Null is returned if the input isn't + valid. Only a few control characters are supported: '\r' (carriage - return), '\n' (line feed), and '\t' (tab). - */ -public static DuffCode mapTMtoTMW(int font, int ordinal) - throws ArrayIndexOutOfBoundsException { + return), '\n' (line feed), and '\t' (tab). 
*/ +public static DuffCode mapTMtoTMW(int font, int ordinal) { + if (font < 0 || font > 4) + return null; + if (ordinal > 255) + return getUnusualTMtoTMW(font, ordinal); if (ordinal < 32) { if (ordinal == (int)'\r') return TMW_cr; @@ -918,8 +917,6 @@ public static DuffCode mapTMtoTMW(int font, int ordinal) } } DuffCode ans = TMtoTMW[font][ordinal-32]; - // comment this out to test via main(..): - ThdlDebug.verify(null != ans); return ans; } @@ -934,18 +931,17 @@ private static final DuffCode TM_tab = new DuffCode(1, '\t'); Null is returned for an existing TibetanMachineWeb glyph only if that glyph is TibetanMachineWeb7.91, because every other TibetanMachineWeb glyph has a corresponding TibetanMachine glyph. - But if (font, ord) isn't (7, 91) and doesn't correspond to an - existing TibetanMachineWeb glyph, null is returned. In general, - though, this method may raise a runtime exception if you pass in a - (font, ord) that doesn't correspond to an existing - TibetanMachineWeb glyph. + Null is returned if the input isn't valid. Only a few control characters are supported: '\r' (carriage return), '\n' (line feed), and '\t' (tab). */ -public static DuffCode mapTMWtoTM(int font, int ordinal) - throws ArrayIndexOutOfBoundsException { +public static DuffCode mapTMWtoTM(int font, int ordinal) { + if (font < 0 || font > 9) + return null; + if (ordinal > 127) + return null; if (ordinal < 32) { if (ordinal == (int)'\r') return TM_cr; @@ -960,8 +956,6 @@ public static DuffCode mapTMWtoTM(int font, int ordinal) } } DuffCode ans = TMWtoTM[font][ordinal-32]; - // comment this out to test via main(..): - ThdlDebug.verify(null != ans || (font == 7 && ordinal == 91)); return ans; } @@ -1015,49 +1009,81 @@ public static void main(String[] args) { } } -private static DuffCode getTMtoTMW(int font, int code) { - if (false) { // DLC FIXME: why was this here? 
- if (code > 255-32) { - switch (code) { - case 8218-32: //sby - code = 130-32; - break; +/** Tibet Doc makes weird RTF where you see TibetanMachine.8225 etc. + The highest possible glyph value should be 255, but that's not + what appears. This returns non-null if (font, code) identify an + oddball we know. This list may well be incomplete, but we handle + such oddballs in a first-class fashion. */ +private static DuffCode getUnusualTMtoTMW(int font, int code) { + if (code > 255) { + if (font == 0) { + switch (code) { + case 347: // reduced-height ha + return TMtoTMW[font][156 - 32]; - case 8230-32: //sgr - code = 133-32; - break; + case 353: // d-r-w + return TMtoTMW[font][154 - 32]; - case 8225-32: //spr - code = 135-32; - break; + case 377: // t-w + return TMtoTMW[font][143 - 32]; - case 8117-32: //tshw - code = 146-32; - break; + case 710: // s-b-r + return TMtoTMW[font][136 - 32]; - case 8126-32: //rw - code = 149-32; - break; + case 1026: // s-g-y + return TMtoTMW[font][128 - 32]; - case 8482-32: //grw - code = 153-32; - break; + case 1027: // s-p-y + return TMtoTMW[font][129 - 32]; + + case 1106: // d-w + return TMtoTMW[font][144 - 32]; + + case 8117: // tsh-w + return TMtoTMW[font][146 - 32]; + + case 8126: // r-w + return TMtoTMW[font][149 - 32]; + + case 8218: // s-b-y + return TMtoTMW[font][130 - 32]; + + case 8225: // s-p-r + return TMtoTMW[font][135 - 32]; + + case 8230: // s-g-r + return TMtoTMW[font][133 - 32]; + + case 8240: // s-m-r + return TMtoTMW[font][137 - 32]; + + case 8482: // g-r-w + return TMtoTMW[font][153 - 32]; default: return null; - } - } - } + } + } else if (font == 3) { + switch (code) { + case 402: // h+y + return TMtoTMW[font][131 - 32]; - return TMtoTMW[font][code]; + default: + return null; + } + } else { + return null; + } + } else { + return null; + } } /** * Gets the TibetanMachine font number for this font name. 
* @param name a font name * @return between 1 and 5 if the font is one -* of the TibetanMachine fonts, otherwise 0 -*/ +* of the TibetanMachine fonts, otherwise 0 */ public static int getTMFontNumber(String name) { String internedName = name.intern(); for (int i=1; i