Extended Wylie is now referred to as THDL Extended Wylie or THDL Wylie

because a Japanese scholar also has an "Extended Wylie".

NFKD and NFD have a new brother, NFTHDL.  I wish there weren't a need,
but as my yet-to-be-put-into-CVS break-unicode-into-grapheme-clusters code
demonstrates, the-need-is-there.  Forgive-me for the hyphens, it's late.
This commit is contained in:
dchandler 2002-12-15 06:57:32 +00:00
parent a42347b224
commit 8e8a23c6a6
4 changed files with 83 additions and 63 deletions

View file

@ -346,7 +346,7 @@ public class LegalTshegBar
});
/** Returns a two-codepoint string consisting of the Unicode
* representation of what Extended Wylie calls
* representation of what THDL Extended Wylie calls
* <code>'i</code>. */
public static String getConnectiveCaseSuffix() {
return connectiveCaseSuffix;
@ -382,8 +382,8 @@ public class LegalTshegBar
/** Returns an array of Unicode strings, all the legal suffix
particles. In Extended Wylie, these are: <ul> <li>'i</li>
<li>'o</li> <li>'u</li> <li>'am</li> </ul>
particles. In THDL Extended Wylie, these are: <ul>
<li>'i</li> <li>'o</li> <li>'u</li> <li>'am</li> </ul>
<p>This is not very efficient.</p> */
public static String[] getPossibleSuffixParticles() {
@ -823,9 +823,9 @@ public class LegalTshegBar
isTransliteratedSanskrit(), boolean isTransliteratedChinese()
(design: contains fa or va, maybe?). */
/** Returns a StringBuffer that holds the extended wylie
/** Returns a StringBuffer that holds the THDL extended wylie
* representation of this syllable. */
public StringBuffer getExtendedWylie() {
public StringBuffer getThdlWylie() {
StringBuffer sb = new StringBuffer();
char rootLetter = getRootLetter();
if (hasPrefix()) {
@ -837,7 +837,7 @@ public class LegalTshegBar
boolean disambiguatorNeeded = false;
char prefix = getPrefix();
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
if (!hasHeadLetter()) {
if (EWC_ya == rootLetter) {
if (isConsonantThatTakesYaBtags(prefix))
@ -857,67 +857,67 @@ public class LegalTshegBar
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
}
if (hasHeadLetter())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter()));
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
if (hasSubjoinedLetter())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter()));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
if (hasWaZurSubjoinedToRootLetter())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
// a-chung is treated, in Extended Wylie, like a vowel. I.e.,
// you don't have 'pAa', you have 'pA'.
// a-chung is treated, in THDL Extended Wylie, like a vowel.
// I.e., you don't have 'pAa', you have 'pA'.
if (hasAChungOnRootLetter()) {
if (hasExplicitVowel()) {
if (EWV_i == getVowel()) {
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73'));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
} else if (EWV_u == getVowel()) {
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75'));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
// The exception to the rule for a-chung and vowels...
// DLC FIXME: are these allowed in legal Tibetan?
// EWTS would have special cases for them if so,
// I'd wager...
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
} else {
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
}
} else {
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
}
} else {
if (hasExplicitVowel())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
else
sb.append("a");
}
if (hasSuffix()) {
String suf = getSuffix();
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0)));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
if (suf.length() > 1) {
// DLC assert, don't verify, that the length is two.
// This could change if I learn of more suffix
// particles.
ThdlDebug.verify(2 == suf.length());
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1)));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
}
}
if (hasPostsuffix())
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix()));
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
return sb;
}
// DLC: toXML for the dense XML
/** Returns a <legalTibetanSyllable> element that contains only
* the Extended Wylie transliteration for the whole syllable and a note that the . */
* the THDL Extended Wylie transliteration for the whole syllable
* and a note about the transliteration. */
public String toConciseXML() {
// DLC version-control the EWTS document. 0.5 is used below:
return ("<legalTibetanSyllable "
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
+ "transliteration=\"" + getExtendedWylie() + "\"" + "/>");
+ "transliteration=\"" + getThdlWylie() + "\"" + "/>");
}
/** Returns a <legalTibetanSyllable> element that contains the
@ -929,18 +929,18 @@ public class LegalTshegBar
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
+ (hasPrefix()
? ("prefix=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPrefix()) + "\" ")
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
: "")
+ (hasHeadLetter()
? ("headLetter=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
+ "\" ")
: "")
+ ("rootLetter=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getRootLetter()) + "\" ")
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
+ (hasSubjoinedLetter()
? ("subjoinedLetter=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
+ "\" ")
: "")
+ (hasWaZurSubjoinedToRootLetter()
@ -953,17 +953,17 @@ public class LegalTshegBar
// DLC NOW: what about the root letter a, i.e. &#92;u0F68 ? do we want the EWTS to be 'aa' ?
+ ("vowel=\""
+ (hasExplicitVowel()
? UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())
? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
: "a")
+ "\" ")
+ (hasSuffix()
? ("suffix=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeString(getSuffix())
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
+ "\" ")
: "")
+ (hasPostsuffix()
? ("postsuffix=\""
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
+ "\" ")
: "")
+ "/>");

View file

@ -58,7 +58,7 @@ package org.thdl.tib.text.tshegbar;
* <p> This class allows for invalid tsheg bars, like those
* containing more than one prefix, more than two suffixes, an
* invalid postsuffix (secondary suffix), more than one consonant
* stack (excluding the special case of what we call in Extended
* stack (excluding the special case of what we call in THDL Extended
* Wylie "'i", which is technically a consonant stack but is used in
* Tibetan like a suffix).</p>
*

View file

@ -40,6 +40,12 @@ public interface UnicodeConstants {
static final byte NORM_NFD = 3;
/** Refers to Normalization Form KD: */
static final byte NORM_NFKD = 4;
/** Refers to Normalization Form THDL, which is NFD except for
<code>U+0F77</code> and <code>U+0F79</code>, which are
normalized according to NFKD. This is the One True
Normalization Form, as it leaves no precomposed codepoints and
does not normalize <code>U+0F0C</code>. */
static final byte NORM_NFTHDL = 5;
/** for those times when you need a char to represent a

View file

@ -94,35 +94,34 @@ public class UnicodeUtils implements UnicodeConstants {
}
/** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
Unicode codepoints, into Normalization Form KD (NFKD) as
specified by Unicode 3.2. The Tibetan passages of the
returned string are in NFKD, but codepoints outside of the
range <code>U+0F00</code>-<code>U+0FFF</code> are not
necessarily put into NFKD. This form uses a maximum of
Unicode codepoints, into either Normalization Form KD (NFKD),
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
for {@link org.thdl.tib.text.tshegbar.GraphemeCluster} because
NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
codepoints, and it never uses codepoints whose use has been
{@link #isDiscouraged(char) discouraged}. It would be David
Chandler's very favorite form if not for the fact that
<code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
NFD is thus David Chandler's favorite, though it does not
decompose <code>U+0F77</code> and <code>U+0F79</code> (for
some reason, hopefully a well-thought-out one).
{@link #isDiscouraged(char) discouraged}.
<p>Recall that NFKD, as it applies to Tibetan codepoints, is
closed under string concatenation and under substringing.
Note again that if the input contains codepoints for which
{@link #isInTibetanRange(char)} is not true, then they will
not be modified.</p>
<p>The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
#isInTibetanRange(char) range}
<code>U+0F00</code>-<code>U+0FFF</code> are not necessarily
put into normalized form.</p>
<p>Recall that normalized forms are not necessarily closed
under string concatenation, but are closed under
substringing.</p>
<p>Note well that only well-formed input guarantees
well-formed output.</p>
@param tibetanUnicode the codepoints to be decomposed
@param normForm NORM_NFKD or NORM_NFD */
@param normForm NORM_NFKD, NORM_NFTHDL, or NORM_NFD */
public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
byte normForm)
{
if (normForm != NORM_NFD && normForm != NORM_NFKD)
throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work");
if (normForm != NORM_NFD && normForm != NORM_NFKD && normForm != NORM_NFTHDL)
throw new IllegalArgumentException("normForm must be NORM_NFD, NORM_NFTHDL, or NORM_NFKD for decomposition to work");
int offset = 0;
while (offset < tibetanUnicode.length()) {
String s
@ -157,15 +156,19 @@ public class UnicodeUtils implements UnicodeConstants {
and returns null for codepoints that are already normalized or
are not in the Tibetan range of Unicode.
@param tibetanUnicodeCP the codepoint to normalize
@param normalizationForm NORM_NFKD or NORM_NFD if you expect
something nontrivial to happen
@param normalizationForm NORM_NFTHDL, NORM_NFKD, or NORM_NFD
if you expect something nontrivial to happen
@return null if tibetanUnicodeCP is already in the chosen
normalized form, or a string of two or three codepoints
otherwise */
public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) {
public static String toNormalizedForm(char tibetanUnicodeCP,
byte normalizationForm)
{
if (normalizationForm == NORM_NFKD
|| normalizationForm == NORM_NFD) {
// Where not specified, the NFKD form is also the NFD form.
|| normalizationForm == NORM_NFD
|| normalizationForm == NORM_NFTHDL) {
// Where not specified, the NFKD and NFTHDL forms are
// identical to the NFD form.
switch (tibetanUnicodeCP) {
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
? "\u0F0B" : null);
@ -178,14 +181,25 @@ public class UnicodeUtils implements UnicodeConstants {
case '\u0F73': return "\u0F71\u0F72";
case '\u0F75': return "\u0F71\u0F74";
case '\u0F76': return "\u0FB2\u0F80";
// I do not understand why NFD does not decompose this codepoint:
case '\u0F77': return ((normalizationForm == NORM_NFKD)
? "\u0FB2\u0F71\u0F80" : null);
case '\u0F77': {
// I do not understand why NFD does not decompose this
// codepoint, hence NORM_NFTHDL does:
if (normalizationForm == NORM_NFKD
|| normalizationForm == NORM_NFTHDL)
return "\u0FB2\u0F71\u0F80";
else
return null;
}
case '\u0F78': return "\u0FB3\u0F80";
// I do not understand why NFD does not decompose this codepoint:
case '\u0F79': return ((normalizationForm == NORM_NFKD)
? "\u0FB3\u0F71\u0F80" : null);
case '\u0F79': {
// I do not understand why NFD does not decompose this
// codepoint, hence NORM_NFTHDL does:
if (normalizationForm == NORM_NFKD
|| normalizationForm == NORM_NFTHDL)
return "\u0FB3\u0F71\u0F80";
else
return null;
}
case '\u0F81': return "\u0F71\u0F80";
case '\u0F93': return "\u0F92\u0FB7";
case '\u0F9D': return "\u0F9C\u0FB7";