diff --git a/source/org/thdl/tib/text/TibetanDocument.java b/source/org/thdl/tib/text/TibetanDocument.java index c8d7612..5ca08af 100644 --- a/source/org/thdl/tib/text/TibetanDocument.java +++ b/source/org/thdl/tib/text/TibetanDocument.java @@ -28,6 +28,7 @@ import java.awt.Color; import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; import org.thdl.util.ThdlLazyException; +import org.thdl.tib.text.tshegbar.UnicodeUtils; /** Represents a character meant to be rendered in a certain font. * @author David Chandler @@ -295,16 +296,6 @@ public class TibetanDocument extends DefaultStyledDocument { throw new Error("TMW->Unicode failed because the following constitute a bad position: startOffset " + startOffset + ", endOffset " + endOffset); } } - // DLC NOW do I stick to these rules in TMW->Unicode mappings? -// Chris Fynn wrote: -// -// By normal Tibetan & Dzongkha spelling, writing, and input rules -// Tibetan script stacks should be entered and written: 1 headline -// consonant (0F40->0F6A), any subjoined consonant(s) (0F90-> -// 0F9C), achung (0F71), shabkyu (0F74), any above headline -// vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and 0F80) ; any ngaro (0F7E, -// 0F82 and 0F83) - private int insertDuff(int fontSize, int pos, DuffData[] glyphs, boolean asTMW) { return insertDuff(fontSize, pos, glyphs, asTMW, Color.black); @@ -978,6 +969,7 @@ public class TibetanDocument extends DefaultStyledDocument { // this if-else statement is duplicated below; beware! int endIndex = mustReplace ? 
mustReplaceUntil : i; if (toUnicode) { + UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(replacementQueue); replaceDuffsWithUnicode(replacementFontSize, replacementStartIndex, endIndex, @@ -1013,6 +1005,8 @@ public class TibetanDocument extends DefaultStyledDocument { } if (toUnicode) { replacementQueue.append(unicode); + if (debug) + System.out.println("unicode rq.append: " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToString(unicode)); } else { replacementQueue.append(dc.getCharacter()); } @@ -1089,11 +1083,14 @@ public class TibetanDocument extends DefaultStyledDocument { // this if-else statement is duplicated above; beware! int endIndex = mustReplace ? mustReplaceUntil : i; if (toUnicode) { + UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(replacementQueue); replaceDuffsWithUnicode(replacementFontSize, replacementStartIndex, endIndex, replacementQueue.toString(), unicodeFont); + if (debug) + System.out.println("unicode rq: " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToString(replacementQueue.toString())); } else { replaceDuffs(replacementFontSize, replacementStartIndex, diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index 5757f07..b8c32d6 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -403,5 +403,108 @@ public class UnicodeUtils implements UnicodeConstants { || (cp >= '\u0FCD' && cp <= '\u0FCE') || (cp >= '\u0FD0' && cp <= '\u0FFF')); } + + /** This array has a number of pairs. The first element in a pair + * is the one that should come first if the two characters are + * direct neighbors in a sequence. (Note that this is not the + * most compact form for this information: we've done a cross + * product already instead of letting the code do the cross + * product.) 
+     */
+    private static char unicode_pairs[][]
+        = { { '\u0f71', '\u0f74' }, // achung (0F71) before shabkyu (0F74)
+            // achung (0F71) before each above-headline vowel:
+            { '\u0f71', '\u0f72' },
+            { '\u0f71', '\u0f7a' },
+            { '\u0f71', '\u0f7b' },
+            { '\u0f71', '\u0f7c' },
+            { '\u0f71', '\u0f7d' },
+            { '\u0f71', '\u0f80' },
+            // achung (0F71) before each ngaro:
+            { '\u0f71', '\u0f7e' },
+            { '\u0f71', '\u0f82' },
+            { '\u0f71', '\u0f83' },
+            // shabkyu (0F74) before each above-headline vowel:
+            { '\u0f74', '\u0f72' },
+            { '\u0f74', '\u0f7a' },
+            { '\u0f74', '\u0f7b' },
+            { '\u0f74', '\u0f7c' },
+            { '\u0f74', '\u0f7d' },
+            { '\u0f74', '\u0f80' },
+            // shabkyu (0F74) before each ngaro:
+            { '\u0f74', '\u0f7e' },
+            { '\u0f74', '\u0f82' },
+            { '\u0f74', '\u0f83' },
+            // above-headline vowel 0F72 before each ngaro:
+            { '\u0f72', '\u0f7e' },
+            { '\u0f72', '\u0f82' },
+            { '\u0f72', '\u0f83' },
+            // above-headline vowel 0F7A before each ngaro:
+            { '\u0f7a', '\u0f7e' },
+            { '\u0f7a', '\u0f82' },
+            { '\u0f7a', '\u0f83' },
+            // above-headline vowel 0F7B before each ngaro:
+            { '\u0f7b', '\u0f7e' },
+            { '\u0f7b', '\u0f82' },
+            { '\u0f7b', '\u0f83' },
+            // above-headline vowel 0F7C before each ngaro:
+            { '\u0f7c', '\u0f7e' },
+            { '\u0f7c', '\u0f82' },
+            { '\u0f7c', '\u0f83' },
+            // above-headline vowel 0F7D before each ngaro:
+            { '\u0f7d', '\u0f7e' },
+            { '\u0f7d', '\u0f82' },
+            { '\u0f7d', '\u0f83' },
+            // above-headline vowel 0F80 before each ngaro:
+            { '\u0f80', '\u0f7e' },
+            { '\u0f80', '\u0f82' },
+            { '\u0f80', '\u0f83' },
+        };
+
+    /** Mutates sb if sb contains an error like having U+0f72 directly
+     * before U+0f71.  Let's say more:
+     *
+     * <p>Chris Fynn wrote:</p>
+     *
+     * <blockquote>By normal Tibetan &amp; Dzongkha spelling, writing,
+     * and input rules Tibetan script stacks should be entered and
+     * written: 1 headline consonant (0F40-&gt;0F6A), any subjoined
+     * consonant(s) (0F90-&gt; 0F9C), achung (0F71), shabkyu (0F74),
+     * any above headline vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and
+     * 0F80); any ngaro (0F7E, 0F82 and 0F83)</blockquote>
+     *
+     * <p>FIXME DLC: We still miss some Unicode well-formedness
+     * problems here, but the problems that this function does catch
+     * may not be solved during e.g. a TMW-to-Unicode conversion
+     * because we don't call this function for the entire output,
+     * just pieces of it.  Depending on how you break up those pieces
+     * we could miss problems that this function can fix.  TODO(DLC):
+     * A separate tool that passes over a unicode file and outputs
+     * the same file modulo Unicode booboos would be better.</p>
+     *
+     * @param sb the buffer to be mutated in place; its length never changes
+     * @return true if sb was mutated, i.e. if at least one pair was swapped */
+    public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
+        boolean mutated = false;
+        boolean swappedThisPass;
+        // Repeat full passes until one makes no swap; U+0F7A U+0F72 U+0F71
+        // needs two.  PERFORMANCE FIXME: try a map instead of this scan.
+        do {
+            swappedThisPass = false;
+            for (int i = 0; i + 1 < sb.length(); i++) {
+                for (int j = 0; j < unicode_pairs.length; j++) {
+                    if (unicode_pairs[j][1] == sb.charAt(i)
+                        && unicode_pairs[j][0] == sb.charAt(i + 1)) {
+                        sb.setCharAt(i, unicode_pairs[j][0]);
+                        sb.setCharAt(i + 1, unicode_pairs[j][1]);
+                        mutated = true;
+                        swappedThisPass = true;
+                        break; // in order now; no other pair can match at i
+                    }
+                }
+            }
+        } while (swappedThisPass);
+        return mutated;
+    }
 }
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java
index ae987f6..e6c9938 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java
@@ -379,4 +379,32 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
         assertTrue(UnicodeUtils.isInTibetanRange('\u0FF0'));
         assertTrue(UnicodeUtils.isInTibetanRange('\u0FFF'));
     }
+
+    /**
+     * Tests the {@link UnicodeUtils#fixSomeOrderingErrorsInTibetanUnicode(StringBuffer)}
+     * method. */
+    public void testFixSomeOrderingErrorsInTibetanUnicode() {
+        // Each row is { misordered input, expected output after the fix }:
+        String[][] tt = {
+            { "\u0f67\u0f72\u0f71", "\u0f67\u0f71\u0f72" },
+            { "\u0f7a\u0f72\u0f71", "\u0f71\u0f7a\u0f72" },
+            { "\u0f67\u0f7e\u0f71", "\u0f67\u0f71\u0f7e" },
+            { "\u0f67\u0f74\u0f71", "\u0f67\u0f71\u0f74" },
+            { "\u0f67\u0f7e\u0f72", "\u0f67\u0f72\u0f7e" },
+            { "\u0f67\u0f7e\u0f74", "\u0f67\u0f74\u0f7e" },
+        };
+        for (int i = 0; i < tt.length; i++) {
+            StringBuffer sb = new StringBuffer(tt[i][0]);
+            assertTrue("tt[" + i + "] should be mutated", UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(sb));
+            assertEquals("tt[" + i + "]", tt[i][1], sb.toString());
+        }
+
+        // Empty or already well-ordered input must come back untouched:
+        String[] uu = { "", "\u0f67\u0f71\u0f72" };
+        for (int i = 0; i < uu.length; i++) {
+            StringBuffer sb = new StringBuffer(uu[i]);
+            assertTrue("uu[" + i + "] should not be mutated", !UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(sb));
+            assertEquals("uu[" + i + "]", uu[i], sb.toString());
+        }
+    }
 }