Extended Wylie is now referred to as THDL Extended Wylie or THDL Wylie,
because a Japanese scholar also has an "Extended Wylie".

NFKD and NFD have a new brother, NFTHDL.  I wish there weren't a need,
but as my yet-to-be-put-into-CVS break-unicode-into-grapheme-clusters code
demonstrates, the-need-is-there.  forgive-me for the hyphens, it's late.
This commit is contained in:
dchandler 2002-12-15 06:57:32 +00:00
parent a42347b224
commit 8e8a23c6a6
4 changed files with 83 additions and 63 deletions

View file

@ -346,7 +346,7 @@ public class LegalTshegBar
}); });
/** Returns a two-codepoint string consisting of the Unicode /** Returns a two-codepoint string consisting of the Unicode
* representation of what Extended Wylie calls * representation of what THDL Extended Wylie calls
* <code>'i</code>. */ * <code>'i</code>. */
public static String getConnectiveCaseSuffix() { public static String getConnectiveCaseSuffix() {
return connectiveCaseSuffix; return connectiveCaseSuffix;
@ -382,8 +382,8 @@ public class LegalTshegBar
/** Returns an array of Unicode strings, all the legal suffix /** Returns an array of Unicode strings, all the legal suffix
particles. In Extended Wylie, these are: <ul> <li>'i</li> particles. In THDL Extended Wylie, these are: <ul>
<li>'o</li> <li>'u</li> <li>'am</li> </ul> <li>'i</li> <li>'o</li> <li>'u</li> <li>'am</li> </ul>
<p>This is not very efficient.</p> */ <p>This is not very efficient.</p> */
public static String[] getPossibleSuffixParticles() { public static String[] getPossibleSuffixParticles() {
@ -823,9 +823,9 @@ public class LegalTshegBar
isTransliteratedSanskrit(), boolean isTransliteratedChinese() isTransliteratedSanskrit(), boolean isTransliteratedChinese()
(design: contains fa or va, maybe?). */ (design: contains fa or va, maybe?). */
/** Returns a StringBuffer that holds the extended wylie /** Returns a StringBuffer that holds the THDL extended wylie
* representation of this syllable. */ * representation of this syllable. */
public StringBuffer getExtendedWylie() { public StringBuffer getThdlWylie() {
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
char rootLetter = getRootLetter(); char rootLetter = getRootLetter();
if (hasPrefix()) { if (hasPrefix()) {
@ -837,7 +837,7 @@ public class LegalTshegBar
boolean disambiguatorNeeded = false; boolean disambiguatorNeeded = false;
char prefix = getPrefix(); char prefix = getPrefix();
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix)); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
if (!hasHeadLetter()) { if (!hasHeadLetter()) {
if (EWC_ya == rootLetter) { if (EWC_ya == rootLetter) {
if (isConsonantThatTakesYaBtags(prefix)) if (isConsonantThatTakesYaBtags(prefix))
@ -857,67 +857,67 @@ public class LegalTshegBar
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
} }
if (hasHeadLetter()) if (hasHeadLetter())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter)); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
if (hasSubjoinedLetter()) if (hasSubjoinedLetter())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
if (hasWaZurSubjoinedToRootLetter()) if (hasWaZurSubjoinedToRootLetter())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur)); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
// a-chung is treated, in Extended Wylie, like a vowel. I.e., // a-chung is treated, in THDL Extended Wylie, like a vowel.
// you don't have 'pAa', you have 'pA'. // I.e., you don't have 'pAa', you have 'pA'.
if (hasAChungOnRootLetter()) { if (hasAChungOnRootLetter()) {
if (hasExplicitVowel()) { if (hasExplicitVowel()) {
if (EWV_i == getVowel()) { if (EWV_i == getVowel()) {
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73')); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
} else if (EWV_u == getVowel()) { } else if (EWV_u == getVowel()) {
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75')); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
} else if (EWV_e == getVowel() || EWV_o == getVowel()) { } else if (EWV_e == getVowel() || EWV_o == getVowel()) {
// The exception to the rule for a-chung and vowels... // The exception to the rule for a-chung and vowels...
// DLC FIXME: are these allowed in legal Tibetan? // DLC FIXME: are these allowed in legal Tibetan?
// EWTS would have special cases for them if so, // EWTS would have special cases for them if so,
// I'd wager... // I'd wager...
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
} else { } else {
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
} }
} else { } else {
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
} }
} else { } else {
if (hasExplicitVowel()) if (hasExplicitVowel())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
else else
sb.append("a"); sb.append("a");
} }
if (hasSuffix()) { if (hasSuffix()) {
String suf = getSuffix(); String suf = getSuffix();
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0))); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
if (suf.length() > 1) { if (suf.length() > 1) {
// DLC assert, don't verify, that the length is two. // DLC assert, don't verify, that the length is two.
// This could change if I learn of more suffix // This could change if I learn of more suffix
// particles. // particles.
ThdlDebug.verify(2 == suf.length()); ThdlDebug.verify(2 == suf.length());
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1))); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
} }
} }
if (hasPostsuffix()) if (hasPostsuffix())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())); sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
return sb; return sb;
} }
// DLC: toXML for the dense XML
/** Returns a <legalTibetanSyllable> element that contains only /** Returns a <legalTibetanSyllable> element that contains only
* the Extended Wylie transliteration for the whole syllable and a note that the . */ * the THDL Extended Wylie transliteration for the whole syllable
* and a note about the transliteration. */
public String toConciseXML() { public String toConciseXML() {
// DLC version-control the EWTS document. 0.5 is used below: // DLC version-control the EWTS document. 0.5 is used below:
return ("<legalTibetanSyllable " return ("<legalTibetanSyllable "
+ "transliterationType=\"THDL Extended Wylie 0.5\" " + "transliterationType=\"THDL Extended Wylie 0.5\" "
+ "transliteration=\"" + getExtendedWylie() + "\"" + "/>"); + "transliteration=\"" + getThdlWylie() + "\"" + "/>");
} }
/** Returns a <legalTibetanSyllable> element that contains the /** Returns a <legalTibetanSyllable> element that contains the
@ -929,18 +929,18 @@ public class LegalTshegBar
+ "transliterationType=\"THDL Extended Wylie 0.5\" " + "transliterationType=\"THDL Extended Wylie 0.5\" "
+ (hasPrefix() + (hasPrefix()
? ("prefix=\"" ? ("prefix=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPrefix()) + "\" ") + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
: "") : "")
+ (hasHeadLetter() + (hasHeadLetter()
? ("headLetter=\"" ? ("headLetter=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter()) + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
+ "\" ") + "\" ")
: "") : "")
+ ("rootLetter=\"" + ("rootLetter=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getRootLetter()) + "\" ") + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
+ (hasSubjoinedLetter() + (hasSubjoinedLetter()
? ("subjoinedLetter=\"" ? ("subjoinedLetter=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter()) + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
+ "\" ") + "\" ")
: "") : "")
+ (hasWaZurSubjoinedToRootLetter() + (hasWaZurSubjoinedToRootLetter()
@ -953,17 +953,17 @@ public class LegalTshegBar
// DLC NOW: what about the root letter a, i.e. &#92;u0F68 ? do we want the EWTS to be 'aa' ? // DLC NOW: what about the root letter a, i.e. &#92;u0F68 ? do we want the EWTS to be 'aa' ?
+ ("vowel=\"" + ("vowel=\""
+ (hasExplicitVowel() + (hasExplicitVowel()
? UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()) ? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
: "a") : "a")
+ "\" ") + "\" ")
+ (hasSuffix() + (hasSuffix()
? ("suffix=\"" ? ("suffix=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeString(getSuffix()) + UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
+ "\" ") + "\" ")
: "") : "")
+ (hasPostsuffix() + (hasPostsuffix()
? ("postsuffix=\"" ? ("postsuffix=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix()) + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
+ "\" ") + "\" ")
: "") : "")
+ "/>"); + "/>");

View file

@ -58,7 +58,7 @@ package org.thdl.tib.text.tshegbar;
* <p> This class allows for invalid tsheg bars, like those * <p> This class allows for invalid tsheg bars, like those
* containing more than one prefix, more than two suffixes, an * containing more than one prefix, more than two suffixes, an
* invalid postsuffix (secondary suffix), more than one consonant * invalid postsuffix (secondary suffix), more than one consonant
* stack (excluding the special case of what we call in Extended * stack (excluding the special case of what we call in THDL Extended
* Wylie "'i", which is technically a consonant stack but is used in * Wylie "'i", which is technically a consonant stack but is used in
* Tibetan like a suffix).</p> * Tibetan like a suffix).</p>
* *

View file

@ -40,6 +40,12 @@ public interface UnicodeConstants {
static final byte NORM_NFD = 3; static final byte NORM_NFD = 3;
/** Refers to Normalization Form KD: */ /** Refers to Normalization Form KD: */
static final byte NORM_NFKD = 4; static final byte NORM_NFKD = 4;
/** Refers to Normalization Form THDL, which is NFD except for
<code>U+0F77</code> and <code>U+0F79</code>, which are
normalized according to NFKD. This is the One True
Normalization Form, as it leaves no precomposed codepoints and
does not normalize <code>U+0F0C</code>. */
static final byte NORM_NFTHDL = 5;
/** for those times when you need a char to represent a /** for those times when you need a char to represent a

View file

@ -94,35 +94,34 @@ public class UnicodeUtils implements UnicodeConstants {
} }
/** Puts the Tibetan codepoints in tibetanUnicode, a sequence of /** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
Unicode codepoints, into Normalization Form KD (NFKD) as Unicode codepoints, into either Normalization Form KD (NFKD),
specified by Unicode 3.2. The Tibetan passages of the D (NFD), or THDL (NFTHDL), depending on the value of normForm.
returned string are in NFKD, but codepoints outside of the NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
range <code>U+0F00</code>-<code>U+0FFF</code> are not for {@link org.thdl.tib.text.tshegbar.GraphemeCluster} because
necessarily put into NFKD. This form uses a maximum of NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
codepoints, and it never uses codepoints whose use has been codepoints, and it never uses codepoints whose use has been
{@link #isDiscouraged(char) discouraged}. It would be David {@link #isDiscouraged(char) discouraged}.
Chandler's very favorite form if not for the fact that
<code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
NFD is thus David Chandler's favorite, though it does not
decompose <code>U+0F77</code> and <code>U+0F79</code> (for
some reason, hopefully a well-thought-out one).
<p>Recall that NFKD, as it applies to Tibetan codepoints, is <p>The Tibetan passages of the returned string are in the
closed under string concatenation and under substringing. chosen normalized form, but codepoints outside of the {@link
Note again that if the input contains codepoints for which #isInTibetanRange(char) range}
{@link #isInTibetanRange(char)} is not true, then they will <code>U+0F00</code>-<code>U+0FFF</code> are not necessarily
not be modified.</p> put into normalized form.</p>
<p>Recall that normalized forms are not necessarily closed
under string concatenation, but are closed under
substringing.</p>
<p>Note well that only well-formed input guarantees <p>Note well that only well-formed input guarantees
well-formed output.</p> well-formed output.</p>
@param tibetanUnicode the codepoints to be decomposed @param tibetanUnicode the codepoints to be decomposed
@param normForm NORM_NFKD or NORM_NFD */ @param normForm NORM_NFKD, NORM_NFTHDL, or NORM_NFD */
public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode, public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
byte normForm) byte normForm)
{ {
if (normForm != NORM_NFD && normForm != NORM_NFKD) if (normForm != NORM_NFD && normForm != NORM_NFKD && normForm != NORM_NFTHDL)
throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work"); throw new IllegalArgumentException("normForm must be NORM_NFD, NORM_NFTHDL, or NORM_NFKD for decomposition to work");
int offset = 0; int offset = 0;
while (offset < tibetanUnicode.length()) { while (offset < tibetanUnicode.length()) {
String s String s
@ -157,15 +156,19 @@ public class UnicodeUtils implements UnicodeConstants {
and returns null for codepoints that are already normalized or and returns null for codepoints that are already normalized or
are not in the Tibetan range of Unicode. are not in the Tibetan range of Unicode.
@param tibetanUnicodeCP the codepoint to normalize @param tibetanUnicodeCP the codepoint to normalize
@param normalizationForm NORM_NFKD or NORM_NFD if you expect @param normalizationForm NORM_NFTHDL, NORM_NFKD, or NORM_NFD
something nontrivial to happen if you expect something nontrivial to happen
@return null if tibetanUnicodeCP is already in the chosen @return null if tibetanUnicodeCP is already in the chosen
normalized form, or a string of two or three codepoints normalized form, or a string of two or three codepoints
otherwise */ otherwise */
public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) { public static String toNormalizedForm(char tibetanUnicodeCP,
byte normalizationForm)
{
if (normalizationForm == NORM_NFKD if (normalizationForm == NORM_NFKD
|| normalizationForm == NORM_NFD) { || normalizationForm == NORM_NFD
// Where not specified, the NFKD form is also the NFD form. || normalizationForm == NORM_NFTHDL) {
// Where not specified, the NFKD and NFTHDL forms are
// identical to the NFD form.
switch (tibetanUnicodeCP) { switch (tibetanUnicodeCP) {
case '\u0F0C': return ((normalizationForm == NORM_NFKD) case '\u0F0C': return ((normalizationForm == NORM_NFKD)
? "\u0F0B" : null); ? "\u0F0B" : null);
@ -178,14 +181,25 @@ public class UnicodeUtils implements UnicodeConstants {
case '\u0F73': return "\u0F71\u0F72"; case '\u0F73': return "\u0F71\u0F72";
case '\u0F75': return "\u0F71\u0F74"; case '\u0F75': return "\u0F71\u0F74";
case '\u0F76': return "\u0FB2\u0F80"; case '\u0F76': return "\u0FB2\u0F80";
// I do not understand why NFD does not decompose this codepoint: case '\u0F77': {
case '\u0F77': return ((normalizationForm == NORM_NFKD) // I do not understand why NFD does not decompose this
? "\u0FB2\u0F71\u0F80" : null); // codepoint, hence NORM_NFTHDL does:
if (normalizationForm == NORM_NFKD
|| normalizationForm == NORM_NFTHDL)
return "\u0FB2\u0F71\u0F80";
else
return null;
}
case '\u0F78': return "\u0FB3\u0F80"; case '\u0F78': return "\u0FB3\u0F80";
// I do not understand why NFD does not decompose this codepoint: case '\u0F79': {
case '\u0F79': return ((normalizationForm == NORM_NFKD) // I do not understand why NFD does not decompose this
? "\u0FB3\u0F71\u0F80" : null); // codepoint, hence NORM_NFTHDL does:
if (normalizationForm == NORM_NFKD
|| normalizationForm == NORM_NFTHDL)
return "\u0FB3\u0F71\u0F80";
else
return null;
}
case '\u0F81': return "\u0F71\u0F80"; case '\u0F81': return "\u0F71\u0F80";
case '\u0F93': return "\u0F92\u0FB7"; case '\u0F93': return "\u0F92\u0FB7";
case '\u0F9D': return "\u0F9C\u0FB7"; case '\u0F9D': return "\u0F9C\u0FB7";