Extended Wylie is now referred to as THDL Extended Wylie or THDL Wylie
because a Japanese scholar also has an "Extended Wylie". NFKD and NFD have a new brother, NFTHDL. I wish there weren't a need, but as my yet-to-be-put-into-CVS break-unicode-into-grapheme-clusters code demonstrates, the-need-is-there. Forgive-me for the hyphens; it's late.
This commit is contained in:
parent
a42347b224
commit
8e8a23c6a6
4 changed files with 83 additions and 63 deletions
|
@ -346,7 +346,7 @@ public class LegalTshegBar
|
|||
});
|
||||
|
||||
/** Returns a two-codepoint string consisting of the Unicode
|
||||
* representation of what Extended Wylie calls
|
||||
* representation of what THDL Extended Wylie calls
|
||||
* <code>'i</code>. */
|
||||
public static String getConnectiveCaseSuffix() {
|
||||
return connectiveCaseSuffix;
|
||||
|
@ -382,8 +382,8 @@ public class LegalTshegBar
|
|||
|
||||
|
||||
/** Returns an array of Unicode strings, all the legal suffix
|
||||
particles. In Extended Wylie, these are: <ul> <li>'i</li>
|
||||
<li>'o</li> <li>'u</li> <li>'am</li> </ul>
|
||||
particles. In THDL Extended Wylie, these are: <ul>
|
||||
<li>'i</li> <li>'o</li> <li>'u</li> <li>'am</li> </ul>
|
||||
|
||||
<p>This is not very efficient.</p> */
|
||||
public static String[] getPossibleSuffixParticles() {
|
||||
|
@ -823,9 +823,9 @@ public class LegalTshegBar
|
|||
isTransliteratedSanskrit(), boolean isTransliteratedChinese()
|
||||
(design: contains fa or va, maybe?). */
|
||||
|
||||
/** Returns a StringBuffer that holds the extended wylie
|
||||
/** Returns a StringBuffer that holds the THDL Extended Wylie
|
||||
* representation of this syllable. */
|
||||
public StringBuffer getExtendedWylie() {
|
||||
public StringBuffer getThdlWylie() {
|
||||
StringBuffer sb = new StringBuffer();
|
||||
char rootLetter = getRootLetter();
|
||||
if (hasPrefix()) {
|
||||
|
@ -837,7 +837,7 @@ public class LegalTshegBar
|
|||
|
||||
boolean disambiguatorNeeded = false;
|
||||
char prefix = getPrefix();
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
|
||||
if (!hasHeadLetter()) {
|
||||
if (EWC_ya == rootLetter) {
|
||||
if (isConsonantThatTakesYaBtags(prefix))
|
||||
|
@ -857,67 +857,67 @@ public class LegalTshegBar
|
|||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
}
|
||||
if (hasHeadLetter())
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter()));
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
|
||||
if (hasSubjoinedLetter())
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter()));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
|
||||
if (hasWaZurSubjoinedToRootLetter())
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
|
||||
|
||||
// a-chung is treated, in Extended Wylie, like a vowel. I.e.,
|
||||
// you don't have 'pAa', you have 'pA'.
|
||||
// a-chung is treated, in THDL Extended Wylie, like a vowel.
|
||||
// I.e., you don't have 'pAa', you have 'pA'.
|
||||
if (hasAChungOnRootLetter()) {
|
||||
if (hasExplicitVowel()) {
|
||||
if (EWV_i == getVowel()) {
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73'));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
|
||||
} else if (EWV_u == getVowel()) {
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75'));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
|
||||
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
|
||||
// The exception to the rule for a-chung and vowels...
|
||||
|
||||
// DLC FIXME: are these allowed in legal Tibetan?
|
||||
// EWTS would have special cases for them if so,
|
||||
// I'd wager...
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
||||
} else {
|
||||
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
|
||||
}
|
||||
} else {
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
||||
}
|
||||
} else {
|
||||
if (hasExplicitVowel())
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel()));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
||||
else
|
||||
sb.append("a");
|
||||
}
|
||||
|
||||
if (hasSuffix()) {
|
||||
String suf = getSuffix();
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0)));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
|
||||
if (suf.length() > 1) {
|
||||
// DLC assert, don't verify, that the length is two.
|
||||
// This could change if I learn of more suffix
|
||||
// particles.
|
||||
ThdlDebug.verify(2 == suf.length());
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1)));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
|
||||
}
|
||||
}
|
||||
if (hasPostsuffix())
|
||||
sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix()));
|
||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
|
||||
return sb;
|
||||
}
|
||||
|
||||
|
||||
// DLC: toXML for the dense XML
|
||||
/** Returns a <legalTibetanSyllable> element that contains only
|
||||
* the Extended Wylie transliteration for the whole syllable and a note that the . */
|
||||
* the THDL Extended Wylie transliteration for the whole syllable
|
||||
* and a note about the transliteration. */
|
||||
public String toConciseXML() {
|
||||
// DLC version-control the EWTS document. 0.5 is used below:
|
||||
return ("<legalTibetanSyllable "
|
||||
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
|
||||
+ "transliteration=\"" + getExtendedWylie() + "\"" + "/>");
|
||||
+ "transliteration=\"" + getThdlWylie() + "\"" + "/>");
|
||||
}
|
||||
|
||||
/** Returns a <legalTibetanSyllable> element that contains the
|
||||
|
@ -929,18 +929,18 @@ public class LegalTshegBar
|
|||
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
|
||||
+ (hasPrefix()
|
||||
? ("prefix=\""
|
||||
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPrefix()) + "\" ")
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
|
||||
: "")
|
||||
+ (hasHeadLetter()
|
||||
? ("headLetter=\""
|
||||
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ ("rootLetter=\""
|
||||
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getRootLetter()) + "\" ")
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
|
||||
+ (hasSubjoinedLetter()
|
||||
? ("subjoinedLetter=\""
|
||||
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ (hasWaZurSubjoinedToRootLetter()
|
||||
|
@ -953,17 +953,17 @@ public class LegalTshegBar
|
|||
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
||||
+ ("vowel=\""
|
||||
+ (hasExplicitVowel()
|
||||
? UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())
|
||||
? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
|
||||
: "a")
|
||||
+ "\" ")
|
||||
+ (hasSuffix()
|
||||
? ("suffix=\""
|
||||
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeString(getSuffix())
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ (hasPostsuffix()
|
||||
? ("postsuffix=\""
|
||||
+ UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())
|
||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
|
||||
+ "\" ")
|
||||
: "")
|
||||
+ "/>");
|
||||
|
|
|
@ -58,7 +58,7 @@ package org.thdl.tib.text.tshegbar;
|
|||
* <p> This class allows for invalid tsheg bars, like those
|
||||
* containing more than one prefix, more than two suffixes, an
|
||||
* invalid postsuffix (secondary suffix), more than one consonant
|
||||
* stack (excluding the special case of what we call in Extended
|
||||
* stack (excluding the special case of what we call in THDL Extended
|
||||
* Wylie "'i", which is technically a consonant stack but is used in
|
||||
* Tibetan like a suffix).</p>
|
||||
*
|
||||
|
|
|
@ -40,6 +40,12 @@ public interface UnicodeConstants {
|
|||
static final byte NORM_NFD = 3;
|
||||
/** Refers to Normalization Form KD: */
|
||||
static final byte NORM_NFKD = 4;
|
||||
/** Refers to Normalization Form THDL, which is NFD except for
|
||||
<code>U+0F77</code> and <code>U+0F79</code>, which are
|
||||
normalized according to NFKD. This is the One True
|
||||
Normalization Form, as it leaves no precomposed codepoints and
|
||||
does not normalize <code>U+0F0C</code>. */
|
||||
static final byte NORM_NFTHDL = 5;
|
||||
|
||||
|
||||
/** for those times when you need a char to represent a
|
||||
|
|
|
@ -94,35 +94,34 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
}
|
||||
|
||||
/** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
|
||||
Unicode codepoints, into Normalization Form KD (NFKD) as
|
||||
specified by Unicode 3.2. The Tibetan passages of the
|
||||
returned string are in NFKD, but codepoints outside of the
|
||||
range <code>U+0F00</code>-<code>U+0FFF</code> are not
|
||||
necessarily put into NFKD. This form uses a maximum of
|
||||
Unicode codepoints, into either Normalization Form KD (NFKD),
|
||||
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
|
||||
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
|
||||
for {@link org.thdl.tib.text.tshegbar#GraphemeCluster} because
|
||||
NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
|
||||
codepoints, and it never uses codepoints whose use has been
|
||||
{@link #isDiscouraged(char) discouraged}. It would be David
|
||||
Chandler's very favorite form if not for the fact that
|
||||
<code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
|
||||
NFD is thus David Chandler's favorite, though it does not
|
||||
decompose <code>U+0F77</code> and <code>U+0F79</code> (for
|
||||
some reason, hopefully a well-thought-out one).
|
||||
{@link #isDiscouraged(char) discouraged}.
|
||||
|
||||
<p>Recall that NFKD, as it applies to Tibetan codepoints, is
|
||||
closed under string concatenation and under substringing.
|
||||
Note again that if the input contains codepoints for which
|
||||
{@link #isInTibetanRange(char)} is not true, then they will
|
||||
not be modified.</p>
|
||||
<p>The Tibetan passages of the returned string are in the
|
||||
chosen normalized form, but codepoints outside of the {@link
|
||||
#isInTibetanRange(char) range}
|
||||
<code>U+0F00</code>-<code>U+0FFF</code> are not necessarily
|
||||
put into normalized form.</p>
|
||||
|
||||
<p>Recall that normalized forms are not necessarily closed
|
||||
under string concatenation, but are closed under
|
||||
substringing.</p>
|
||||
|
||||
<p>Note well that only well-formed input guarantees
|
||||
well-formed output.</p>
|
||||
|
||||
@param tibetanUnicode the codepoints to be decomposed
|
||||
@param normForm NORM_NFKD or NORM_NFD */
|
||||
@param normForm NORM_NFKD, NORM_NFTHDL, or NORM_NFD */
|
||||
public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
|
||||
byte normForm)
|
||||
{
|
||||
if (normForm != NORM_NFD && normForm != NORM_NFKD)
|
||||
throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work");
|
||||
if (normForm != NORM_NFD && normForm != NORM_NFKD && normForm != NORM_NFTHDL)
|
||||
throw new IllegalArgumentException("normForm must be NORM_NFD, NORM_NFTHDL, or NORM_NFKD for decomposition to work");
|
||||
int offset = 0;
|
||||
while (offset < tibetanUnicode.length()) {
|
||||
String s
|
||||
|
@ -157,15 +156,19 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
and returns null for codepoints that are already normalized or
|
||||
are not in the Tibetan range of Unicode.
|
||||
@param tibetanUnicodeCP the codepoint to normalize
|
||||
@param normalizationForm NORM_NFKD or NORM_NFD if you expect
|
||||
something nontrivial to happen
|
||||
@param normalizationForm NORM_NFTHDL, NORM_NFKD, or NORM_NFD
|
||||
if you expect something nontrivial to happen
|
||||
@return null if tibetanUnicodeCP is already in the chosen
|
||||
normalized form, or a string of two or three codepoints
|
||||
otherwise */
|
||||
public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) {
|
||||
public static String toNormalizedForm(char tibetanUnicodeCP,
|
||||
byte normalizationForm)
|
||||
{
|
||||
if (normalizationForm == NORM_NFKD
|
||||
|| normalizationForm == NORM_NFD) {
|
||||
// Where not specified, the NFKD form is also the NFD form.
|
||||
|| normalizationForm == NORM_NFD
|
||||
|| normalizationForm == NORM_NFTHDL) {
|
||||
// Where not specified, the NFKD and NFTHDL forms are
|
||||
// identical to the NFD form.
|
||||
switch (tibetanUnicodeCP) {
|
||||
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
|
||||
? "\u0F0B" : null);
|
||||
|
@ -178,14 +181,25 @@ public class UnicodeUtils implements UnicodeConstants {
|
|||
case '\u0F73': return "\u0F71\u0F72";
|
||||
case '\u0F75': return "\u0F71\u0F74";
|
||||
case '\u0F76': return "\u0FB2\u0F80";
|
||||
// I do not understand why NFD does not decompose this codepoint:
|
||||
case '\u0F77': return ((normalizationForm == NORM_NFKD)
|
||||
? "\u0FB2\u0F71\u0F80" : null);
|
||||
case '\u0F77': {
|
||||
// I do not understand why NFD does not decompose this
|
||||
// codepoint, hence NORM_NFTHDL does:
|
||||
if (normalizationForm == NORM_NFKD
|
||||
|| normalizationForm == NORM_NFTHDL)
|
||||
return "\u0FB2\u0F71\u0F80";
|
||||
else
|
||||
return null;
|
||||
}
|
||||
case '\u0F78': return "\u0FB3\u0F80";
|
||||
// I do not understand why NFD does not decompose this codepoint:
|
||||
case '\u0F79': return ((normalizationForm == NORM_NFKD)
|
||||
? "\u0FB3\u0F71\u0F80" : null);
|
||||
|
||||
case '\u0F79': {
|
||||
// I do not understand why NFD does not decompose this
|
||||
// codepoint, hence NORM_NFTHDL does:
|
||||
if (normalizationForm == NORM_NFKD
|
||||
|| normalizationForm == NORM_NFTHDL)
|
||||
return "\u0FB3\u0F71\u0F80";
|
||||
else
|
||||
return null;
|
||||
}
|
||||
case '\u0F81': return "\u0F71\u0F80";
|
||||
case '\u0F93': return "\u0F92\u0FB7";
|
||||
case '\u0F9D': return "\u0F9C\u0FB7";
|
||||
|
|
Loading…
Reference in a new issue