diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
index ce782a4..d791e19 100644
--- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
+++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java
@@ -346,7 +346,7 @@ public class LegalTshegBar
});
/** Returns a two-codepoint string consisting of the Unicode
- * representation of what Extended Wylie calls
+ * representation of what THDL Extended Wylie calls
* 'i
. */
public static String getConnectiveCaseSuffix() {
return connectiveCaseSuffix;
@@ -382,8 +382,8 @@ public class LegalTshegBar
/** Returns an array of Unicode strings, all the legal suffix
- particles. In Extended Wylie, these are:
This is not very efficient.
*/ public static String[] getPossibleSuffixParticles() { @@ -823,9 +823,9 @@ public class LegalTshegBar isTransliteratedSanskrit(), boolean isTransliteratedChinese() (design: contains fa or va, maybe?). */ - /** Returns a StringBuffer that holds the extended wylie + /** Returns a StringBuffer that holds the THDL extended wylie * representation of this syllable. */ - public StringBuffer getExtendedWylie() { + public StringBuffer getThdlWylie() { StringBuffer sb = new StringBuffer(); char rootLetter = getRootLetter(); if (hasPrefix()) { @@ -837,7 +837,7 @@ public class LegalTshegBar boolean disambiguatorNeeded = false; char prefix = getPrefix(); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix)); if (!hasHeadLetter()) { if (EWC_ya == rootLetter) { if (isConsonantThatTakesYaBtags(prefix)) @@ -857,67 +857,67 @@ public class LegalTshegBar sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); } if (hasHeadLetter()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter)); if (hasSubjoinedLetter()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())); if (hasWaZurSubjoinedToRootLetter()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur)); - // a-chung is treated, in Extended Wylie, like a vowel. I.e., - // you don't have 'pAa', you have 'pA'. + // a-chung is treated, in THDL Extended Wylie, like a vowel. + // I.e., you don't have 'pAa', you have 'pA'. 
if (hasAChungOnRootLetter()) { if (hasExplicitVowel()) { if (EWV_i == getVowel()) { - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73')); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73')); } else if (EWV_u == getVowel()) { - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75')); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75')); } else if (EWV_e == getVowel() || EWV_o == getVowel()) { // The exception to the rule for a-chung and vowels... // DLC FIXME: are these allowed in legal Tibetan? // EWTS would have special cases for them if so, // I'd wager... - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); } else { ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); } } else { - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); } } else { if (hasExplicitVowel()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); else sb.append("a"); } if (hasSuffix()) { String suf = getSuffix(); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0))); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0))); if (suf.length() > 1) { // DLC assert, don't verify, that the length is two. // This could change if I learn of more suffix // particles. 
ThdlDebug.verify(2 == suf.length()); - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1))); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1))); } } if (hasPostsuffix()) - sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())); + sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())); return sb; } - // DLC: toXML for the dense XML /** Returns aThis class allows for invalid tsheg bars, like those * containing more than one prefix, more than two suffixes, an * invalid postsuffix (secondary suffix), more than one consonant - * stack (excluding the special case of what we call in Extended + * stack (excluding the special case of what we call in THDL Extended * Wylie "'i", which is technically a consonant stack but is used in * Tibetan like a suffix).
. * diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java index 611abcd..8496989 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java @@ -40,6 +40,12 @@ public interface UnicodeConstants { static final byte NORM_NFD = 3; /** Refers to Normalization Form KD: */ static final byte NORM_NFKD = 4; + /** Refers to Normalization Form THDL, which is NFD except for +U+0F77
and U+0F79
, which are
+ normalized according to NFKD. This is the One True
+ Normalization Form, as it leaves no precomposed codepoints and
+ does not normalize U+0F0C
. */
+ static final byte NORM_NFTHDL = 5;
/** for those times when you need a char to represent a
diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
index f527438..150d57f 100644
--- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
+++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java
@@ -94,35 +94,34 @@ public class UnicodeUtils implements UnicodeConstants {
}
/** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
- Unicode codepoints, into Normalization Form KD (NFKD) as
- specified by Unicode 3.2. The Tibetan passages of the
- returned string are in NFKD, but codepoints outside of the
- range U+0F00
-U+0FFF
are not
- necessarily put into NFKD. This form uses a maximum of
+ Unicode codepoints, into either Normalization Form KD (NFKD),
+ D (NFD), or THDL (NFTHDL), depending on the value of normForm.
+ NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
+ for {@link org.thdl.tib.text.tshegbar.GraphemeCluster} because
+ NFKD normalizes U+0F0C
. NFTHDL uses the maximum number of
codepoints, and it never uses codepoints whose use has been
- {@link #isDiscouraged(char) discouraged}. It would be David
- Chandler's very favorite form if not for the fact that
- U+0F0C
normalizes to U+0F0B
in NFKD.
- NFD is thus David Chandler's favorite, though it does not
- decompose U+0F77
and U+0F79
(for
- some reason, hopefully a well-thought-out one).
+ {@link #isDiscouraged(char) discouraged}.
- Recall that NFKD, as it applies to Tibetan codepoints, is - closed under string concatenation and under substringing. - Note again that if the input contains codepoints for which - {@link #isInTibetanRange(char)} is not true, then they will - not be modified.
+The Tibetan passages of the returned string are in the
+ chosen normalized form, but codepoints outside of the {@link
+ #isInTibetanRange(char) range}
+ U+0F00
-U+0FFF
are not necessarily
+ put into normalized form.
Recall that normalized forms are not necessarily closed + under string concatenation, but are closed under + substringing.
Note well that only well-formed input guarantees well-formed output.
@param tibetanUnicode the codepoints to be decomposed - @param normForm NORM_NFKD or NORM_NFD */ + @param normForm NORM_NFKD, NORM_NFTHDL, or NORM_NFD */ public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode, byte normForm) { - if (normForm != NORM_NFD && normForm != NORM_NFKD) - throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work"); + if (normForm != NORM_NFD && normForm != NORM_NFKD && normForm != NORM_NFTHDL) + throw new IllegalArgumentException("normForm must be NORM_NFD, NORM_NFTHDL, or NORM_NFKD for decomposition to work"); int offset = 0; while (offset < tibetanUnicode.length()) { String s @@ -157,15 +156,19 @@ public class UnicodeUtils implements UnicodeConstants { and returns null for codepoints that are already normalized or are not in the Tibetan range of Unicode. @param tibetanUnicodeCP the codepoint to normalize - @param normalizationForm NORM_NFKD or NORM_NFD if you expect - something nontrivial to happen + @param normalizationForm NORM_NFTHDL, NORM_NFKD, or NORM_NFD + if you expect something nontrivial to happen @return null if tibetanUnicodeCP is already in the chosen normalized form, or a string of two or three codepoints otherwise */ - public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) { + public static String toNormalizedForm(char tibetanUnicodeCP, + byte normalizationForm) + { if (normalizationForm == NORM_NFKD - || normalizationForm == NORM_NFD) { - // Where not specified, the NFKD form is also the NFD form. + || normalizationForm == NORM_NFD + || normalizationForm == NORM_NFTHDL) { + // Where not specified, the NFKD and NFTHDL forms are + // identical to the NFD form. switch (tibetanUnicodeCP) { case '\u0F0C': return ((normalizationForm == NORM_NFKD) ? 
"\u0F0B" : null); @@ -178,14 +181,25 @@ public class UnicodeUtils implements UnicodeConstants { case '\u0F73': return "\u0F71\u0F72"; case '\u0F75': return "\u0F71\u0F74"; case '\u0F76': return "\u0FB2\u0F80"; - // I do not understand why NFD does not decompose this codepoint: - case '\u0F77': return ((normalizationForm == NORM_NFKD) - ? "\u0FB2\u0F71\u0F80" : null); + case '\u0F77': { + // I do not understand why NFD does not decompose this + // codepoint, hence NORM_NFTHDL does: + if (normalizationForm == NORM_NFKD + || normalizationForm == NORM_NFTHDL) + return "\u0FB2\u0F71\u0F80"; + else + return null; + } case '\u0F78': return "\u0FB3\u0F80"; - // I do not understand why NFD does not decompose this codepoint: - case '\u0F79': return ((normalizationForm == NORM_NFKD) - ? "\u0FB3\u0F71\u0F80" : null); - + case '\u0F79': { + // I do not understand why NFD does not decompose this + // codepoint, hence NORM_NFTHDL does: + if (normalizationForm == NORM_NFKD + || normalizationForm == NORM_NFTHDL) + return "\u0FB3\u0F71\u0F80"; + else + return null; + } case '\u0F81': return "\u0F71\u0F80"; case '\u0F93': return "\u0F92\u0FB7"; case '\u0F9D': return "\u0F9C\u0FB7";