The *->Unicode conversions were outputting Unicode that was not
well-formed.  They still do, but they do it less often.

Chris Fynn wrote this a while back:

   By normal Tibetan & Dzongkha spelling, writing, and input rules
   Tibetan script stacks should be entered and written: 1 headline
   consonant (0F40->0F6A), any subjoined consonant(s) (0F90-> 0F9C),
   achung (0F71), shabkyu (0F74), any above headline vowel(s) (0F72
   0F7A 0F7B 0F7C 0F7D and 0F80); any ngaro (0F7E, 0F82 and 0F83).

Now efforts are made to ensure that the converters conform to the
above rules.
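
To make the ordering rule concrete, here is a small illustrative Java
snippet (not part of the commit itself; it mirrors one of the new test
cases added to UnicodeUtilsTest).  Achung (0F71) must come before
shabkyu (0F74) when both attach to the same stack:

   // Illustration only.  U+0F67 is TIBETAN LETTER HA.
   String malformed  = "\u0f67\u0f74\u0f71";  // ha, shabkyu, achung: wrong order
   String wellFormed = "\u0f67\u0f71\u0f74";  // ha, achung, shabkyu: correct order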
dchandler 2004-12-13 02:32:46 +00:00
parent 3115f22484
commit aa5d86a6e3
3 changed files with 138 additions and 10 deletions

TibetanDocument.java

@@ -28,6 +28,7 @@ import java.awt.Color;
 import org.thdl.util.ThdlDebug;
 import org.thdl.util.ThdlOptions;
 import org.thdl.util.ThdlLazyException;
+import org.thdl.tib.text.tshegbar.UnicodeUtils;
 /** Represents a character meant to be rendered in a certain font.
  * @author David Chandler
@@ -295,16 +296,6 @@ public class TibetanDocument extends DefaultStyledDocument {
 throw new Error("TMW->Unicode failed because the following constitute a bad position: startOffset " + startOffset + ", endOffset " + endOffset);
 }
 }
-// DLC NOW do I stick to these rules in TMW->Unicode mappings?
-// Chris Fynn wrote:
-//
-// By normal Tibetan & Dzongkha spelling, writing, and input rules
-// Tibetan script stacks should be entered and written: 1 headline
-// consonant (0F40->0F6A), any subjoined consonant(s) (0F90->
-// 0F9C), achung (0F71), shabkyu (0F74), any above headline
-// vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and 0F80) ; any ngaro (0F7E,
-// 0F82 and 0F83)
 private int insertDuff(int fontSize, int pos, DuffData[] glyphs, boolean asTMW) {
 return insertDuff(fontSize, pos, glyphs, asTMW, Color.black);
@@ -978,6 +969,7 @@ public class TibetanDocument extends DefaultStyledDocument {
 // this if-else statement is duplicated below; beware!
 int endIndex = mustReplace ? mustReplaceUntil : i;
 if (toUnicode) {
+UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(replacementQueue);
 replaceDuffsWithUnicode(replacementFontSize,
 replacementStartIndex,
 endIndex,
@@ -1013,6 +1005,8 @@ public class TibetanDocument extends DefaultStyledDocument {
 }
 if (toUnicode) {
 replacementQueue.append(unicode);
+if (debug)
+System.out.println("unicode rq.append: " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToString(unicode));
 } else {
 replacementQueue.append(dc.getCharacter());
 }
@@ -1089,11 +1083,14 @@ public class TibetanDocument extends DefaultStyledDocument {
 // this if-else statement is duplicated above; beware!
 int endIndex = mustReplace ? mustReplaceUntil : i;
 if (toUnicode) {
+UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(replacementQueue);
 replaceDuffsWithUnicode(replacementFontSize,
 replacementStartIndex,
 endIndex,
 replacementQueue.toString(),
 unicodeFont);
+if (debug)
+System.out.println("unicode rq: " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToString(replacementQueue.toString()));
 } else {
 replaceDuffs(replacementFontSize,
 replacementStartIndex,

UnicodeUtils.java

@@ -403,5 +403,108 @@ public class UnicodeUtils implements UnicodeConstants {
 || (cp >= '\u0FCD' && cp <= '\u0FCE')
 || (cp >= '\u0FD0' && cp <= '\u0FFF'));
 }
+/** This array has a number of pairs. The first element in a pair
+ * is the one that should come first if the two characters are
+ * direct neighbors in a sequence. (Note that this is not the
+ * most compact form for this information: we've done a cross
+ * product already instead of letting the code do the cross
+ * product.)
+ */
+private static char unicode_pairs[][]
+= { { '\u0f71', '\u0f74' },
+{ '\u0f71', '\u0f72' },
+{ '\u0f71', '\u0f7a' },
+{ '\u0f71', '\u0f7b' },
+{ '\u0f71', '\u0f7c' },
+{ '\u0f71', '\u0f7d' },
+{ '\u0f71', '\u0f80' },
+{ '\u0f71', '\u0f7e' },
+{ '\u0f71', '\u0f82' },
+{ '\u0f71', '\u0f83' },
+{ '\u0f74', '\u0f72' },
+{ '\u0f74', '\u0f7a' },
+{ '\u0f74', '\u0f7b' },
+{ '\u0f74', '\u0f7c' },
+{ '\u0f74', '\u0f7d' },
+{ '\u0f74', '\u0f80' },
+{ '\u0f74', '\u0f7e' },
+{ '\u0f74', '\u0f82' },
+{ '\u0f74', '\u0f83' },
+{ '\u0f72', '\u0f7e' },
+{ '\u0f72', '\u0f82' },
+{ '\u0f72', '\u0f83' },
+{ '\u0f7a', '\u0f7e' },
+{ '\u0f7a', '\u0f82' },
+{ '\u0f7a', '\u0f83' },
+{ '\u0f7b', '\u0f7e' },
+{ '\u0f7b', '\u0f82' },
+{ '\u0f7b', '\u0f83' },
+{ '\u0f7c', '\u0f7e' },
+{ '\u0f7c', '\u0f82' },
+{ '\u0f7c', '\u0f83' },
+{ '\u0f7d', '\u0f7e' },
+{ '\u0f7d', '\u0f82' },
+{ '\u0f7d', '\u0f83' },
+{ '\u0f80', '\u0f7e' },
+{ '\u0f80', '\u0f82' },
+{ '\u0f80', '\u0f83' },
+};
+/** Mutates sb if sb contains an error like having U+0f72 directly
+ * before U+0f71. Let's say more:
+ *
+ * <p>Chris Fynn wrote:</p>
+ *
+ * <blockquote>By normal Tibetan & Dzongkha spelling, writing,
+ * and input rules Tibetan script stacks should be entered and
+ * written: 1 headline consonant (0F40-&gt;0F6A), any subjoined
+ * consonant(s) (0F90-&gt; 0F9C), achung (0F71), shabkyu (0F74),
+ * any above headline vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and
+ * 0F80); any ngaro (0F7E, 0F82 and 0F83)</blockquote>
+ *
+ * <p>FIXME DLC: We still miss some Unicode well-formedness
+ * problems here, but the problems that this function does catch
+ * may not be solved during e.g. a TMW-to-Unicode conversion
+ * because we don't call this function for the entire output,
+ * just pieces of it. Depending on how you break up those pieces
+ * we could miss problems that this function can fix. TODO(DLC):
+ * A separate tool that passes over a unicode file and outputs
+ * the same file modulo Unicode booboos would be better. </p>
+ *
+ * @param sb the buffer to be mutated
+ * @return true if sb was mutated */
+public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
+boolean mutated = false;
+int len = sb.length();
+boolean mutated_this_time_through;
+// the do-while loop helps us be correct for \u0f7a\u0f72\u0f71.
+// PERFORMANCE FIXME: try using a map instead of iterating
+// over all of unicode_pairs and see if it isn't faster.
+do {
+mutated_this_time_through = false;
+for (int i = 0; i < len - 1; i++)
+for (int j = 0; j < unicode_pairs.length; j++)
+if (unicode_pairs[j][1] == sb.charAt(i)
+&& unicode_pairs[j][0] == sb.charAt(i + 1)) {
+sb.setCharAt(i, unicode_pairs[j][0]);
+sb.setCharAt(i + 1, unicode_pairs[j][1]);
+mutated = true;
+mutated_this_time_through = true;
+}
+} while (mutated_this_time_through);
+return mutated;
+}
 }

UnicodeUtilsTest.java

@@ -379,4 +379,32 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
 assertTrue(UnicodeUtils.isInTibetanRange('\u0FF0'));
 assertTrue(UnicodeUtils.isInTibetanRange('\u0FFF'));
 }
+/**
+ * Tests the {@link UnicodeUtils#fixSomeOrderingErrorsInTibetanUnicode(StringBuffer)}
+ * method. */
+public void testFixSomeOrderingErrorsInTibetanUnicode() {
+// Test that "\u0f67\u0f72\u0f71" becomes "\u0f67\u0f71\u0f72", e.g:
+String tt[][] = {
+{ "\u0f67\u0f72\u0f71", "\u0f67\u0f71\u0f72" },
+{ "\u0f7a\u0f72\u0f71", "\u0f71\u0f7a\u0f72" },
+{ "\u0f67\u0f7e\u0f71", "\u0f67\u0f71\u0f7e" },
+{ "\u0f67\u0f74\u0f71", "\u0f67\u0f71\u0f74" },
+{ "\u0f67\u0f7e\u0f72", "\u0f67\u0f72\u0f7e" },
+{ "\u0f67\u0f7e\u0f74", "\u0f67\u0f74\u0f7e" },
+};
+for (int i = 0; i < tt.length; i++) {
+StringBuffer sb = new StringBuffer(tt[i][0]);
+assertTrue(true == UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(sb));
+assertTrue(sb.toString().equals(tt[i][1]));
+}
+// Test that "\u0f67\u0f71\u0f72" stays the same, e.g.:
+String uu[] = { "\u0f67\u0f71\u0f72" };
+for (int i = 0; i < uu.length; i++) {
+StringBuffer sb = new StringBuffer(uu[i]);
+assertTrue(false == UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(sb));
+assertTrue(sb.toString().equals(uu[i]));
+}
+}
 }
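
A minimal usage sketch of the new method follows.  FixOrderingDemo is a
hypothetical class written only for illustration (it is not part of this
commit); it assumes org.thdl.tib.text.tshegbar.UnicodeUtils is on the
classpath and mirrors the first of the new test cases.

   import org.thdl.tib.text.tshegbar.UnicodeUtils;

   /** Hypothetical demo class, not part of this commit. */
   public class FixOrderingDemo {
       public static void main(String[] args) {
           // ha (0F67), vowel i (0F72), achung (0F71): achung should precede the vowel.
           StringBuffer sb = new StringBuffer("\u0f67\u0f72\u0f71");
           boolean changed = UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(sb);
           // changed is true; sb now holds ha, achung, vowel i.
           System.out.println(changed + ": "
                              + UnicodeUtils.unicodeStringToString(sb.toString()));
       }
   }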