diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index ce782a4..d791e19 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -346,7 +346,7 @@ public class LegalTshegBar }); /** Returns a two-codepoint string consisting of the Unicode - * representation of what Extended Wylie calls + * representation of what THDL Extended Wylie calls * 'i. */ public static String getConnectiveCaseSuffix() { return connectiveCaseSuffix; @@ -382,8 +382,8 @@ public class LegalTshegBar /** Returns an array of Unicode strings, all the legal suffix - particles. In Extended Wylie, these are: + particles. In THDL Extended Wylie, these are:

This is not very efficient.

*/ public static String[] getPossibleSuffixParticles() { @@ -823,9 +823,9 @@ public class LegalTshegBar isTransliteratedSanskrit(), boolean isTransliteratedChinese() (design: contains fa or va, maybe?). */ - /** Returns a StringBuffer that holds the extended wylie + /** Returns a StringBuffer that holds the THDL extended wylie * representation of this syllable. */ - public StringBuffer getExtendedWylie() { + public StringBuffer getThdlWylie() { StringBuffer sb = new StringBuffer(); char rootLetter = getRootLetter(); if (hasPrefix()) { @@ -837,7 +837,7 @@ public class LegalTshegBar boolean disambiguatorNeeded = false; char prefix = getPrefix(); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix)); if (!hasHeadLetter()) { if (EWC_ya == rootLetter) { if (isConsonantThatTakesYaBtags(prefix)) @@ -857,67 +857,67 @@ public class LegalTshegBar sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); } if (hasHeadLetter()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter)); if (hasSubjoinedLetter()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())); if (hasWaZurSubjoinedToRootLetter()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur)); - // a-chung is treated, in Extended Wylie, like a vowel. I.e., - // you don't have 'pAa', you have 'pA'. + // a-chung is treated, in THDL Extended Wylie, like a vowel. + // I.e., you don't have 'pAa', you have 'pA'. 
if (hasAChungOnRootLetter()) { if (hasExplicitVowel()) { if (EWV_i == getVowel()) { - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73')); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73')); } else if (EWV_u == getVowel()) { - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75')); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75')); } else if (EWV_e == getVowel() || EWV_o == getVowel()) { // The exception to the rule for a-chung and vowels... // DLC FIXME: are these allowed in legal Tibetan? // EWTS would have special cases for them if so, // I'd wager... - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); } else { ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); } } else { - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); } } else { if (hasExplicitVowel()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); else sb.append("a"); } if (hasSuffix()) { String suf = getSuffix(); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0))); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0))); if (suf.length() > 1) { // DLC assert, don't verify, that the length is two. // This could change if I learn of more suffix // particles. 
ThdlDebug.verify(2 == suf.length()); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1))); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1))); } } if (hasPostsuffix()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())); return sb; } - // DLC: toXML for the dense XML /** Returns a element that contains only - * the Extended Wylie transliteration for the whole syllable and a note that the . */ + * the THDL Extended Wylie transliteration for the whole syllable + * and a note about the transliteration. */ public String toConciseXML() { // DLC version-control the EWTS document. 0.5 is used below: return (""); + + "transliteration=\"" + getThdlWylie() + "\"" + "/>"); } /** Returns a element that contains the @@ -929,18 +929,18 @@ public class LegalTshegBar + "transliterationType=\"THDL Extended Wylie 0.5\" " + (hasPrefix() ? ("prefix=\"" - + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPrefix()) + "\" ") + + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ") : "") + (hasHeadLetter() ? ("headLetter=\"" - + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter()) + + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()) + "\" ") : "") + ("rootLetter=\"" - + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getRootLetter()) + "\" ") + + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ") + (hasSubjoinedLetter() ? ("subjoinedLetter=\"" - + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter()) + + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()) + "\" ") : "") + (hasWaZurSubjoinedToRootLetter() @@ -953,17 +953,17 @@ public class LegalTshegBar // DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ? 
+ ("vowel=\"" + (hasExplicitVowel() - ? UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()) + ? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()) : "a") + "\" ") + (hasSuffix() ? ("suffix=\"" - + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeString(getSuffix()) + + UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix()) + "\" ") : "") + (hasPostsuffix() ? ("postsuffix=\"" - + UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix()) + + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()) + "\" ") : "") + "/>"); diff --git a/source/org/thdl/tib/text/tshegbar/TshegBar.java b/source/org/thdl/tib/text/tshegbar/TshegBar.java index 4eefed6..764144d 100644 --- a/source/org/thdl/tib/text/tshegbar/TshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/TshegBar.java @@ -58,7 +58,7 @@ package org.thdl.tib.text.tshegbar; *

This class allows for invalid tsheg bars, like those * containing more than one prefix, more than two suffixes, an * invalid postsuffix (secondary suffix), more than one consonant - * stack (excluding the special case of what we call in Extended + * stack (excluding the special case of what we call in THDL Extended * Wylie "'i", which is technically a consonant stack but is used in * Tibetan like a suffix).

. * diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java index 611abcd..8496989 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java @@ -40,6 +40,12 @@ public interface UnicodeConstants { static final byte NORM_NFD = 3; /** Refers to Normalization Form KD: */ static final byte NORM_NFKD = 4; + /** Refers to Normalization Form THDL, which is NFD except for + U+0F77 and U+0F79, which are + normalized according to NFKD. This is the One True + Normalization Form, as it leaves no precomposed codepoints and + does not normalize U+0F0C. */ + static final byte NORM_NFTHDL = 5; /** for those times when you need a char to represent a diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index f527438..150d57f 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -94,35 +94,34 @@ public class UnicodeUtils implements UnicodeConstants { } /** Puts the Tibetan codepoints in tibetanUnicode, a sequence of - Unicode codepoints, into Normalization Form KD (NFKD) as - specified by Unicode 3.2. The Tibetan passages of the - returned string are in NFKD, but codepoints outside of the - range U+0F00-U+0FFF are not - necessarily put into NFKD. This form uses a maximum of + Unicode codepoints, into either Normalization Form KD (NFKD), + D (NFD), or THDL (NFTHDL), depending on the value of normForm. + NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed + for {@link org.thdl.tib.text.tshegbar.GraphemeCluster} because + NFKD normalizes U+0F0C. NFTHDL uses a maximum of codepoints, and it never uses codepoints whose use has been - {@link #isDiscouraged(char) discouraged}. It would be David - Chandler's very favorite form if not for the fact that - U+0F0C normalizes to U+0F0B in NFKD. 
- NFD is thus David Chandler's favorite, though it does not - decompose U+0F77 and U+0F79 (for - some reason, hopefully a well-thought-out one). + {@link #isDiscouraged(char) discouraged}. -

Recall that NFKD, as it applies to Tibetan codepoints, is - closed under string concatenation and under substringing. - Note again that if the input contains codepoints for which - {@link #isInTibetanRange(char)} is not true, then they will - not be modified.

+

The Tibetan passages of the returned string are in the + chosen normalized form, but codepoints outside of the {@link + #isInTibetanRange(char) range} + U+0F00-U+0FFF are not necessarily + put into normalized form.

+ +

Recall that normalized forms are not necessarily closed + under string concatenation, but are closed under + substringing.

Note well that only well-formed input guarantees well-formed output.

@param tibetanUnicode the codepoints to be decomposed - @param normForm NORM_NFKD or NORM_NFD */ + @param normForm NORM_NFKD, NORM_NFTHDL, or NORM_NFD */ public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode, byte normForm) { - if (normForm != NORM_NFD && normForm != NORM_NFKD) - throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work"); + if (normForm != NORM_NFD && normForm != NORM_NFKD && normForm != NORM_NFTHDL) + throw new IllegalArgumentException("normForm must be NORM_NFD, NORM_NFTHDL, or NORM_NFKD for decomposition to work"); int offset = 0; while (offset < tibetanUnicode.length()) { String s @@ -157,15 +156,19 @@ public class UnicodeUtils implements UnicodeConstants { and returns null for codepoints that are already normalized or are not in the Tibetan range of Unicode. @param tibetanUnicodeCP the codepoint to normalize - @param normalizationForm NORM_NFKD or NORM_NFD if you expect - something nontrivial to happen + @param normalizationForm NORM_NFTHDL, NORM_NFKD, or NORM_NFD + if you expect something nontrivial to happen @return null if tibetanUnicodeCP is already in the chosen normalized form, or a string of two or three codepoints otherwise */ - public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) { + public static String toNormalizedForm(char tibetanUnicodeCP, + byte normalizationForm) + { if (normalizationForm == NORM_NFKD - || normalizationForm == NORM_NFD) { - // Where not specified, the NFKD form is also the NFD form. + || normalizationForm == NORM_NFD + || normalizationForm == NORM_NFTHDL) { + // Where not specified, the NFKD and NFTHDL forms are + // identical to the NFD form. switch (tibetanUnicodeCP) { case '\u0F0C': return ((normalizationForm == NORM_NFKD) ? 
"\u0F0B" : null); @@ -178,14 +181,25 @@ public class UnicodeUtils implements UnicodeConstants { case '\u0F73': return "\u0F71\u0F72"; case '\u0F75': return "\u0F71\u0F74"; case '\u0F76': return "\u0FB2\u0F80"; - // I do not understand why NFD does not decompose this codepoint: - case '\u0F77': return ((normalizationForm == NORM_NFKD) - ? "\u0FB2\u0F71\u0F80" : null); + case '\u0F77': { + // I do not understand why NFD does not decompose this + // codepoint, hence NORM_NFTHDL does: + if (normalizationForm == NORM_NFKD + || normalizationForm == NORM_NFTHDL) + return "\u0FB2\u0F71\u0F80"; + else + return null; + } case '\u0F78': return "\u0FB3\u0F80"; - // I do not understand why NFD does not decompose this codepoint: - case '\u0F79': return ((normalizationForm == NORM_NFKD) - ? "\u0FB3\u0F71\u0F80" : null); - + case '\u0F79': { + // I do not understand why NFD does not decompose this + // codepoint, hence NORM_NFTHDL does: + if (normalizationForm == NORM_NFKD + || normalizationForm == NORM_NFTHDL) + return "\u0FB3\u0F71\u0F80"; + else + return null; + } case '\u0F81': return "\u0F71\u0F80"; case '\u0F93': return "\u0F92\u0FB7"; case '\u0F9D': return "\u0F9C\u0FB7";