The *->Unicode conversions were outputting Unicode that was not well-formed.

They still do, but they do it less often. Chris Fynn wrote this a while back: "By normal Tibetan & Dzongkha spelling, writing, and input rules Tibetan script stacks should be entered and written: 1 headline consonant (0F40->0F6A), any subjoined consonant(s) (0F90->0F9C), achung (0F71), shabkyu (0F74), any above headline vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and 0F80); any ngaro (0F7E, 0F82 and 0F83)." Now efforts are made to ensure that the converters conform to the above rules.
2004-12-13 02:32:46 +00:00 · 2004-12-13 02:32:46 +00:00 · aa5d86a6e3
commit aa5d86a6e3
parent 3115f22484
3 changed files with 138 additions and 10 deletions
--- a/source/org/thdl/tib/text/TibetanDocument.java
+++ b/source/org/thdl/tib/text/TibetanDocument.java
@ -28,6 +28,7 @@ import java.awt.Color;
 import org.thdl.util.ThdlDebug;
 import org.thdl.util.ThdlOptions;
 import org.thdl.util.ThdlLazyException;
 import org.thdl.tib.text.tshegbar.UnicodeUtils;
 /** Represents a character meant to be rendered in a certain font.
 *  @author David Chandler
@ -295,16 +296,6 @@ public class TibetanDocument extends DefaultStyledDocument {
            throw new Error("TMW->Unicode failed because the following constitute a bad position: startOffset " + startOffset + ", endOffset " + endOffset);
 		}
    }
    // DLC NOW do I stick to these rules in TMW->Unicode mappings?
 //  Chris Fynn wrote:
 //
 //  By normal Tibetan & Dzongkha spelling, writing, and input rules
 //  Tibetan script stacks should be entered and written: 1 headline
 //  consonant (0F40->0F6A), any  subjoined consonant(s) (0F90->
 //  0F9C),  achung (0F71), shabkyu (0F74), any above headline
 //  vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and 0F80) ; any ngaro (0F7E,
 //  0F82 and 0F83)
 	private int insertDuff(int fontSize, int pos, DuffData[] glyphs, boolean asTMW) {
        return insertDuff(fontSize, pos, glyphs, asTMW, Color.black);
@ -978,6 +969,7 @@ public class TibetanDocument extends DefaultStyledDocument {
                        // this if-else statement is duplicated below; beware!
                        int endIndex = mustReplace ? mustReplaceUntil : i;
                        if (toUnicode) {
                            UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(replacementQueue);
                            replaceDuffsWithUnicode(replacementFontSize,
                                                    replacementStartIndex,
                                                    endIndex,
@ -1013,6 +1005,8 @@ public class TibetanDocument extends DefaultStyledDocument {
                        }
                        if (toUnicode) {
                            replacementQueue.append(unicode);
                            if (debug)
                                System.out.println("unicode rq.append: " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToString(unicode));
                        } else {
                            replacementQueue.append(dc.getCharacter());
                        }
@ -1089,11 +1083,14 @@ public class TibetanDocument extends DefaultStyledDocument {
                // this if-else statement is duplicated above; beware!
                int endIndex = mustReplace ? mustReplaceUntil : i;
                if (toUnicode) {
                    UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(replacementQueue);
                    replaceDuffsWithUnicode(replacementFontSize,
                                            replacementStartIndex,
                                            endIndex,
                                            replacementQueue.toString(),
                                            unicodeFont);
                    if (debug)
                        System.out.println("unicode rq: " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToString(replacementQueue.toString()));
                } else {
                    replaceDuffs(replacementFontSize,
                                 replacementStartIndex,
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@ -403,5 +403,108 @@ public class UnicodeUtils implements UnicodeConstants {
                || (cp >= '\u0FCD' && cp <= '\u0FCE')
                || (cp >= '\u0FD0' && cp <= '\u0FFF'));
    }
    /** This array has a number of pairs. The first element in a pair
     *  is the one that should come first if the two characters are
     *  direct neighbors in a sequence.  (Note that this is not the
     *  most compact form for this information: we've done a cross
     *  product already instead of letting the code do the cross
     *  product.)
     */
    private static char unicode_pairs[][]
        = { { '\u0f71', '\u0f74' },
            { '\u0f71', '\u0f72' },
            { '\u0f71', '\u0f7a' },
            { '\u0f71', '\u0f7b' },
            { '\u0f71', '\u0f7c' },
            { '\u0f71', '\u0f7d' },
            { '\u0f71', '\u0f80' },
            { '\u0f71', '\u0f7e' },
            { '\u0f71', '\u0f82' },
            { '\u0f71', '\u0f83' },
            { '\u0f74', '\u0f72' },
            { '\u0f74', '\u0f7a' },
            { '\u0f74', '\u0f7b' },
            { '\u0f74', '\u0f7c' },
            { '\u0f74', '\u0f7d' },
            { '\u0f74', '\u0f80' },
            { '\u0f74', '\u0f7e' },
            { '\u0f74', '\u0f82' },
            { '\u0f74', '\u0f83' },
            { '\u0f72', '\u0f7e' },
            { '\u0f72', '\u0f82' },
            { '\u0f72', '\u0f83' },
            { '\u0f7a', '\u0f7e' },
            { '\u0f7a', '\u0f82' },
            { '\u0f7a', '\u0f83' },
            { '\u0f7b', '\u0f7e' },
            { '\u0f7b', '\u0f82' },
            { '\u0f7b', '\u0f83' },
            { '\u0f7c', '\u0f7e' },
            { '\u0f7c', '\u0f82' },
            { '\u0f7c', '\u0f83' },
            { '\u0f7d', '\u0f7e' },
            { '\u0f7d', '\u0f82' },
            { '\u0f7d', '\u0f83' },
            { '\u0f80', '\u0f7e' },
            { '\u0f80', '\u0f82' },
            { '\u0f80', '\u0f83' },
        };
    /** Mutates sb if sb contains an error like having U+0f72 directly
     *  before U+0f71.  Let's say more:
     *
     *  <p>Chris Fynn wrote:</p>
     *
     *  <blockquote>By normal Tibetan & Dzongkha spelling, writing,
     *  and input rules Tibetan script stacks should be entered and
     *  written: 1 headline consonant (0F40-&gt;0F6A), any subjoined
     *  consonant(s) (0F90-&gt; 0F9C), achung (0F71), shabkyu (0F74),
     *  any above headline vowel(s) (0F72 0F7A 0F7B 0F7C 0F7D and
     *  0F80); any ngaro (0F7E, 0F82 and 0F83)</blockquote>
     *
     *  <p>FIXME DLC: We still miss some Unicode well-formedness
     *  problems here, but the problems that this function does catch
     *  may not be solved during e.g. a TMW-to-Unicode conversion
     *  because we don't call this function for the entire output,
     *  just pieces of it.  Depending on how you break up those pieces
     *  we could miss problems that this function can fix.  TODO(DLC):
     *  A separate tool that passes over a unicode file and outputs
     *  the same file modulo Unicode booboos would be better.  </p>
     *
     *  @param sb the buffer to be mutated
     *  @return true if sb was mutated */
    public static boolean fixSomeOrderingErrorsInTibetanUnicode(StringBuffer sb) {
        boolean mutated = false;
        int len = sb.length();
        boolean mutated_this_time_through;
        // the do-while loop helps us be correct for \u0f7a\u0f72\u0f71.
        // PERFORMANCE FIXME: try using a map instead of iterating
        // over all of unicode_pairs and see if it isn't faster.
        do {
            mutated_this_time_through = false;
            for (int i = 0; i < len - 1; i++)
                for (int j = 0; j < unicode_pairs.length; j++)
                    if (unicode_pairs[j][1] == sb.charAt(i)
                        && unicode_pairs[j][0] == sb.charAt(i + 1)) {
                        sb.setCharAt(i, unicode_pairs[j][0]);
                        sb.setCharAt(i + 1, unicode_pairs[j][1]);
                        mutated = true;
                        mutated_this_time_through = true;
                    }
        } while (mutated_this_time_through);
        return mutated;
    }
 }
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java
@ -379,4 +379,32 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
        assertTrue(UnicodeUtils.isInTibetanRange('\u0FF0'));
        assertTrue(UnicodeUtils.isInTibetanRange('\u0FFF'));
    }
    /**
     * Tests the {@link UnicodeUtils#fixSomeOrderingErrorsInTibetanUnicode(StringBuffer)}
     * method.  Checks both the contents of the buffer afterwards and
     * the boolean result, for inputs that must be reordered and for
     * inputs that must be left alone. */
    public void testFixSomeOrderingErrorsInTibetanUnicode() {
        // Each row is { misordered input, expected output }.  Test
        // that "\u0f67\u0f72\u0f71" becomes "\u0f67\u0f71\u0f72", e.g:
        String tt[][] = {
            { "\u0f67\u0f72\u0f71", "\u0f67\u0f71\u0f72" },
            // the achung has to move two places here:
            { "\u0f7a\u0f72\u0f71", "\u0f71\u0f7a\u0f72" },
            { "\u0f67\u0f7e\u0f71", "\u0f67\u0f71\u0f7e" },
            { "\u0f67\u0f74\u0f71", "\u0f67\u0f71\u0f74" },
            { "\u0f67\u0f7e\u0f72", "\u0f67\u0f72\u0f7e" },
            { "\u0f67\u0f7e\u0f74", "\u0f67\u0f74\u0f7e" },
            // one mark from each group, fully reversed:
            { "\u0f67\u0f83\u0f7a\u0f74\u0f71", "\u0f67\u0f71\u0f74\u0f7a\u0f83" },
        };
        for (int i = 0; i < tt.length; i++) {
            StringBuffer sb = new StringBuffer(tt[i][0]);
            boolean mutated = UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(sb);
            assertTrue("tt[" + i + "]: expected the buffer to be reported as mutated",
                       mutated);
            assertTrue("tt[" + i + "]: expected \"" + tt[i][1]
                       + "\" but got \"" + sb + "\"",
                       tt[i][1].equals(sb.toString()));
        }

        // Inputs that are already acceptable must stay the same, e.g.
        // "\u0f67\u0f71\u0f72".  Also covered: the empty buffer, a
        // lone consonant, and marks separated by a consonant (which
        // are not direct neighbors and so must not be swapped).
        String uu[] = {
            "\u0f67\u0f71\u0f72",
            "",
            "\u0f67",
            "\u0f72\u0f67\u0f71",
        };
        for (int i = 0; i < uu.length; i++) {
            StringBuffer sb = new StringBuffer(uu[i]);
            boolean mutated = UnicodeUtils.fixSomeOrderingErrorsInTibetanUnicode(sb);
            assertTrue("uu[" + i + "]: expected the buffer to be reported as unchanged",
                       !mutated);
            assertTrue("uu[" + i + "]: expected \"" + uu[i]
                       + "\" but got \"" + sb + "\"",
                       uu[i].equals(sb.toString()));
        }
    }
 }