e2a9720d9b
org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE. It converts RTF files consisting of TMW characters to the corresponding THDL Extended Wylie. It supports --find-some-non-tmw mode, which allows you to ensure that no unusual characters will spoil the conversion. The converter has built-in intelligence that allows it to handle Tahoma '{', '}', and '\\' characters properly. The converter works on mixed Roman/TMW also, but --find-some-non-tmw and --find-all-non-tmw modes are not as useful. Invoke org.thdl.tib.input.TMW_RTF_TO_THDL_WYLIE, which resides in Jskad's jar, with no command-line options to see usage information.
372 lines
12 KiB
Java
372 lines
12 KiB
Java
/*
|
|
The contents of this file are subject to the THDL Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the THDL web site
|
|
(http://www.thdl.org/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
|
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
|
All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
|
|
package org.thdl.tib.text;
|
|
|
|
import java.util.*;
|
|
import javax.swing.*;
|
|
import javax.swing.text.*;
|
|
import javax.swing.text.rtf.RTFEditorKit;
|
|
import java.io.*;
|
|
|
|
import org.thdl.util.ThdlDebug;
|
|
|
|
/** Represents a character meant to be rendered in a certain font.
|
|
* @author David Chandler
|
|
*/
|
|
class CharacterInAGivenFont {
|
|
private char character;
|
|
private String fontName;
|
|
public CharacterInAGivenFont(char ch, String font) {
|
|
character = ch;
|
|
fontName = font;
|
|
}
|
|
public CharacterInAGivenFont(String s, String font) {
|
|
if (s.length() != 1)
|
|
throw new Error("character in a given font was given a string "
|
|
+ s + " in a given font");
|
|
character = s.charAt(0);
|
|
fontName = font;
|
|
}
|
|
public boolean equals(Object x) {
|
|
return ((x instanceof CharacterInAGivenFont)
|
|
&& ((CharacterInAGivenFont)x).character == character
|
|
&& ((CharacterInAGivenFont)x).fontName.equals(fontName));
|
|
}
|
|
public int hashCode() {
|
|
return (int)character + fontName.hashCode();
|
|
}
|
|
public String toString() {
|
|
String characterRepresentation
|
|
= "'" + new Character(character).toString() + "'";
|
|
if ('\n' == character)
|
|
characterRepresentation = "newline";
|
|
if ('\r' == character)
|
|
characterRepresentation = "carriage return";
|
|
return characterRepresentation + " in the font "
|
|
+ ((null == fontName)
|
|
? "_ERROR_FINDING_FONT_"
|
|
: fontName);
|
|
}
|
|
}
|
|
|
|
/**
|
|
* A TibetanDocument is a styled document that knows about Tibetan and
|
|
* will respect line breaks and the like. It allows you to insert
|
|
* Tibetan also.
|
|
* @author Edward Garrett, Tibetan and Himalayan Digital Library
|
|
* @version 1.0 */
|
|
public class TibetanDocument extends DefaultStyledDocument {
|
|
private int tibetanFontSize = 36;
|
|
|
|
/** Creates a new TibetanDocument with default styles. */
|
|
public TibetanDocument() { super(); }
|
|
|
|
/** Do not use this contructor. */
|
|
private TibetanDocument(AbstractDocument.Content c, StyleContext styles ) {
|
|
super(c, styles);
|
|
}
|
|
|
|
/**
|
|
* Creates a TibetanDocument.
|
|
* @param styles a StyleContext, which is simply passed on
|
|
* to DefaultStyledDocument's constructor
|
|
*/
|
|
public TibetanDocument(StyleContext styles) {
|
|
super(styles);
|
|
}
|
|
|
|
/**
|
|
* Sets the point size used by default for Tibetan text.
|
|
* @param size the point size for Tibetan text
|
|
*/
|
|
public void setTibetanFontSize(int size) {
|
|
tibetanFontSize = size;
|
|
}
|
|
|
|
/**
|
|
* Gets the point size for Tibetan text.
|
|
* @return the point size used for Tibetan text
|
|
*/
|
|
public int getTibetanFontSize() {
|
|
return tibetanFontSize;
|
|
}
|
|
|
|
/**
|
|
* Writes the document to an OutputStream as Rich Text Format (.rtf).
|
|
* @param out the OutputStream to write to
|
|
*/
|
|
public void writeRTFOutputStream(OutputStream out) throws IOException {
|
|
RTFEditorKit rtf = new RTFEditorKit();
|
|
|
|
try {
|
|
rtf.write(out, this, 0, getLength());
|
|
}
|
|
catch (BadLocationException ble) {
|
|
ThdlDebug.noteIffyCode();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Inserts Tibetan text into the document. The font size is applied automatically,
|
|
* according to the current Tibetan font size.
|
|
* @param offset the position at which you want to insert text
|
|
* @param s the string you want to insert
|
|
* @param attr the attributes to apply, normally a particular TibetanMachineWeb font
|
|
* @see #setTibetanFontSize(int size)
|
|
*/
|
|
public void appendDuff(int offset, String s, MutableAttributeSet attr) {
|
|
try {
|
|
StyleConstants.setFontSize(attr, tibetanFontSize);
|
|
insertString(offset, s, attr);
|
|
}
|
|
catch (BadLocationException ble) {
|
|
ThdlDebug.noteIffyCode();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Inserts a stretch of TibetanMachineWeb data into the document.
|
|
* @param glyphs the array of Tibetan data you want to insert
|
|
* @param pos the position at which you want to insert text
|
|
*/
|
|
public int insertDuff(int pos, DuffData[] glyphs) {
|
|
if (glyphs == null)
|
|
return pos;
|
|
|
|
MutableAttributeSet mas;
|
|
for (int i=0; i<glyphs.length; i++) {
|
|
mas = TibetanMachineWeb.getAttributeSet(glyphs[i].font);
|
|
appendDuff(pos, glyphs[i].text, mas);
|
|
pos += glyphs[i].text.length();
|
|
}
|
|
return pos;
|
|
}
|
|
|
|
/**
|
|
* Converts the entire document into Extended Wylie.
|
|
* If the document consists of both Tibetan and
|
|
* non-Tibetan fonts, however, the conversion stops
|
|
* at the first non-Tibetan font.
|
|
* @return the string of Wylie corresponding to this document
|
|
*/
|
|
public String getWylie() {
|
|
return getWylie(0, getLength());
|
|
}
|
|
|
|
/**
|
|
* Converts a portion of the document into Extended Wylie.
|
|
* If the document consists of both Tibetan and
|
|
* non-Tibetan fonts, however, the conversion stops
|
|
* at the first non-Tibetan font.
|
|
* @param begin the beginning of the region to convert
|
|
* @param end the end of the region to convert
|
|
* @return the string of Wylie corresponding to this document
|
|
*/
|
|
public String getWylie(int begin, int end) {
|
|
AttributeSet attr;
|
|
String fontName;
|
|
int fontNum;
|
|
DuffCode dc;
|
|
char ch;
|
|
|
|
if (begin >= end)
|
|
return "";
|
|
|
|
java.util.List dcs = new ArrayList();
|
|
int i = begin;
|
|
StringBuffer wylieBuffer = new StringBuffer();
|
|
|
|
try {
|
|
while (i < end) {
|
|
attr = getCharacterElement(i).getAttributes();
|
|
fontName = StyleConstants.getFontFamily(attr);
|
|
|
|
ch = getText(i,1).charAt(0);
|
|
|
|
//current character is formatting
|
|
if (ch == '\n' || ch == '\t') {
|
|
if (dcs.size() > 0) {
|
|
DuffCode[] dc_array = new DuffCode[0];
|
|
dc_array = (DuffCode[])dcs.toArray(dc_array);
|
|
wylieBuffer.append(TibTextUtils.getWylie(dc_array));
|
|
dcs.clear();
|
|
}
|
|
wylieBuffer.append(ch);
|
|
}
|
|
|
|
//current character isn't TMW
|
|
else if ((0 == (fontNum = TibetanMachineWeb.getTMWFontNumber(fontName)))) {
|
|
if (dcs.size() > 0) {
|
|
DuffCode[] dc_array = new DuffCode[0];
|
|
dc_array = (DuffCode[])dcs.toArray(dc_array);
|
|
wylieBuffer.append(TibTextUtils.getWylie(dc_array));
|
|
dcs.clear();
|
|
}
|
|
}
|
|
|
|
//current character is convertable
|
|
else {
|
|
dc = new DuffCode(fontNum, ch);
|
|
dcs.add(dc);
|
|
}
|
|
i++;
|
|
}
|
|
if (dcs.size() > 0) {
|
|
DuffCode[] dc_array = new DuffCode[0];
|
|
dc_array = (DuffCode[])dcs.toArray(dc_array);
|
|
wylieBuffer.append(TibTextUtils.getWylie(dc_array));
|
|
}
|
|
return wylieBuffer.toString();
|
|
}
|
|
catch (BadLocationException ble) {
|
|
ble.printStackTrace();
|
|
ThdlDebug.noteIffyCode();
|
|
}
|
|
|
|
return "";
|
|
}
|
|
|
|
/** Prints to standard output a list of all the indices of
|
|
characters that are not in a TMW font within the range [start,
|
|
end). Using a negative number for end means that this will
|
|
run to the end of the document. SPEED_FIXME: might be faster
|
|
to run over the elements, if they are one per font.
|
|
@return 1 if at least one non-TMW character was found in
|
|
the specified range, zero if none were, -1 on error. */
|
|
public int findAllNonTMWCharacters(int begin, int end) {
|
|
if (end < 0)
|
|
end = getLength();
|
|
if (begin >= end)
|
|
return 0;
|
|
int i = begin;
|
|
int returnValue = 0;
|
|
try {
|
|
while (i < end) {
|
|
AttributeSet attr = getCharacterElement(i).getAttributes();
|
|
String fontName = StyleConstants.getFontFamily(attr);
|
|
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
|
|
returnValue = 1;
|
|
CharacterInAGivenFont cgf
|
|
= new CharacterInAGivenFont(getText(i, 1), fontName);
|
|
System.out.println("non-TMW character "
|
|
+ cgf + " at location " + i);
|
|
}
|
|
i++;
|
|
}
|
|
} catch (BadLocationException ble) {
|
|
ble.printStackTrace();
|
|
ThdlDebug.noteIffyCode();
|
|
returnValue = -1;
|
|
}
|
|
return returnValue;
|
|
}
|
|
|
|
/** Finds the first occurrence of a non-TMW character in a given
|
|
font and prints it to System.out. If you have a Tahoma
|
|
newline and an Arial newline, the first occurrence of each
|
|
will be reported.
|
|
|
|
<p>Works within the range [start, end). Using a negative
|
|
number for end means that this will run to the end of the
|
|
document. SPEED_FIXME: might be faster to run over the
|
|
elements, if they are one per font.
|
|
@return 1 if at least one non-TMW character was found in
|
|
the specified range, zero if none were, -1 on error. */
|
|
public int findSomeNonTMWCharacters(int begin, int end) {
|
|
if (end < 0)
|
|
end = getLength();
|
|
if (begin >= end)
|
|
return 0;
|
|
int i = begin;
|
|
int returnValue = 0;
|
|
try {
|
|
HashMap cgfTable = new HashMap();
|
|
while (i < end) {
|
|
AttributeSet attr = getCharacterElement(i).getAttributes();
|
|
String fontName = StyleConstants.getFontFamily(attr);
|
|
if ((0 == TibetanMachineWeb.getTMWFontNumber(fontName))) {
|
|
returnValue = 1;
|
|
CharacterInAGivenFont cgf
|
|
= new CharacterInAGivenFont(getText(i, 1), fontName);
|
|
if (!cgfTable.containsKey(cgf)) {
|
|
cgfTable.put(cgf, "yes this character appears once");
|
|
System.out.println("non-TMW character "
|
|
+ cgf + " appears first at location " + i);
|
|
}
|
|
}
|
|
i++;
|
|
}
|
|
} catch (BadLocationException ble) {
|
|
ble.printStackTrace();
|
|
ThdlDebug.noteIffyCode();
|
|
returnValue = -1;
|
|
}
|
|
return returnValue;
|
|
}
|
|
|
|
private static final DuffData[] leftCurlyBraceTMW
|
|
= new DuffData[] { new DuffData("{", 1) };
|
|
private static final DuffData[] rightCurlyBraceTMW
|
|
= new DuffData[] { new DuffData("}", 1) };
|
|
private static final DuffData[] backslashTMW
|
|
= new DuffData[] { new DuffData("\\", 2) };
|
|
/** This is a band-aid used to help Jskad fix RTF files that are
|
|
mostly TMW but have some Tahoma characters that should be TMW.
|
|
Replaces '{', '}', and '\\' characters with the correct
|
|
TibetanMachineWeb. Works within the range [start, end).
|
|
Using a negative number for end means that this will run to
|
|
the end of the document. Be sure to set the size for Tibetan
|
|
as you like it before using this. SPEED_FIXME: might be
|
|
faster to run over the elements, if they are one per font. */
|
|
public void replaceTahomaCurlyBracesAndBackslashes(int begin, int end) {
|
|
if (end < 0)
|
|
end = getLength();
|
|
if (begin >= end)
|
|
return;
|
|
int i = begin;
|
|
try {
|
|
while (i < end) {
|
|
AttributeSet attr = getCharacterElement(i).getAttributes();
|
|
String fontName = StyleConstants.getFontFamily(attr);
|
|
if (fontName.equals("Tahoma")) {
|
|
DuffData[] toReplaceWith = null;
|
|
switch (getText(i, 1).charAt(0)) {
|
|
case '{':
|
|
toReplaceWith = leftCurlyBraceTMW;
|
|
break;
|
|
case '}':
|
|
toReplaceWith = rightCurlyBraceTMW;
|
|
break;
|
|
case '\\':
|
|
toReplaceWith = backslashTMW;
|
|
break;
|
|
}
|
|
if (null != toReplaceWith) {
|
|
insertDuff(i, toReplaceWith);
|
|
remove(i+1, 1);
|
|
}
|
|
}
|
|
i++;
|
|
}
|
|
} catch (BadLocationException ble) {
|
|
ble.printStackTrace();
|
|
ThdlDebug.noteIffyCode();
|
|
}
|
|
}
|
|
}
|