Renamed UnicodeCharToExtendedWylie to
UnicodeCodepointToThdlWylie.java. Added a new class, UnicodeGraphemeCluster, that can tell you the components of a grapheme cluster from top to bottom. It does not yet have good error checking; it is not yet finished. Next is to parse clean Unicode into GraphemeClusters. After that comes scanning dirty Unicode into best-guess GraphemeClusters, and scanning dirty Unicode to get nice error messages.
This commit is contained in:
parent
8e8a23c6a6
commit
7ea185fa01
4 changed files with 481 additions and 69 deletions
|
@ -748,7 +748,7 @@ public class LegalTshegBar
|
||||||
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
|
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
|
||||||
return internalThrowThing(throwIfIllegal,
|
return internalThrowThing(throwIfIllegal,
|
||||||
"Illegal suffix -- not one of the ten legal suffixes: "
|
"Illegal suffix -- not one of the ten legal suffixes: "
|
||||||
+ UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
|
+ UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -837,7 +837,7 @@ public class LegalTshegBar
|
||||||
|
|
||||||
boolean disambiguatorNeeded = false;
|
boolean disambiguatorNeeded = false;
|
||||||
char prefix = getPrefix();
|
char prefix = getPrefix();
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
|
||||||
if (!hasHeadLetter()) {
|
if (!hasHeadLetter()) {
|
||||||
if (EWC_ya == rootLetter) {
|
if (EWC_ya == rootLetter) {
|
||||||
if (isConsonantThatTakesYaBtags(prefix))
|
if (isConsonantThatTakesYaBtags(prefix))
|
||||||
|
@ -857,55 +857,55 @@ public class LegalTshegBar
|
||||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||||
}
|
}
|
||||||
if (hasHeadLetter())
|
if (hasHeadLetter())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(rootLetter));
|
||||||
if (hasSubjoinedLetter())
|
if (hasSubjoinedLetter())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter()));
|
||||||
if (hasWaZurSubjoinedToRootLetter())
|
if (hasWaZurSubjoinedToRootLetter())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EWSUB_wa_zur));
|
||||||
|
|
||||||
// a-chung is treated, in THDL Extended Wylie, like a vowel.
|
// a-chung is treated, in THDL Extended Wylie, like a vowel.
|
||||||
// I.e., you don't have 'pAa', you have 'pA'.
|
// I.e., you don't have 'pAa', you have 'pA'.
|
||||||
if (hasAChungOnRootLetter()) {
|
if (hasAChungOnRootLetter()) {
|
||||||
if (hasExplicitVowel()) {
|
if (hasExplicitVowel()) {
|
||||||
if (EWV_i == getVowel()) {
|
if (EWV_i == getVowel()) {
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F73'));
|
||||||
} else if (EWV_u == getVowel()) {
|
} else if (EWV_u == getVowel()) {
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F75'));
|
||||||
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
|
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
|
||||||
// The exception to the rule for a-chung and vowels...
|
// The exception to the rule for a-chung and vowels...
|
||||||
|
|
||||||
// DLC FIXME: are these allowed in legal Tibetan?
|
// DLC FIXME: are these allowed in legal Tibetan?
|
||||||
// EWTS would have special cases for them if so,
|
// EWTS would have special cases for them if so,
|
||||||
// I'd wager...
|
// I'd wager...
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
|
||||||
} else {
|
} else {
|
||||||
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
|
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if (hasExplicitVowel())
|
if (hasExplicitVowel())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
|
||||||
else
|
else
|
||||||
sb.append("a");
|
sb.append("a");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (hasSuffix()) {
|
if (hasSuffix()) {
|
||||||
String suf = getSuffix();
|
String suf = getSuffix();
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
|
||||||
if (suf.length() > 1) {
|
if (suf.length() > 1) {
|
||||||
// DLC assert, don't verify, that the length is two.
|
// DLC assert, don't verify, that the length is two.
|
||||||
// This could change if I learn of more suffix
|
// This could change if I learn of more suffix
|
||||||
// particles.
|
// particles.
|
||||||
ThdlDebug.verify(2 == suf.length());
|
ThdlDebug.verify(2 == suf.length());
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (hasPostsuffix())
|
if (hasPostsuffix())
|
||||||
sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
|
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
|
||||||
return sb;
|
return sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -929,18 +929,18 @@ public class LegalTshegBar
|
||||||
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
|
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
|
||||||
+ (hasPrefix()
|
+ (hasPrefix()
|
||||||
? ("prefix=\""
|
? ("prefix=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPrefix()) + "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ (hasHeadLetter()
|
+ (hasHeadLetter()
|
||||||
? ("headLetter=\""
|
? ("headLetter=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ ("rootLetter=\""
|
+ ("rootLetter=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getRootLetter()) + "\" ")
|
||||||
+ (hasSubjoinedLetter()
|
+ (hasSubjoinedLetter()
|
||||||
? ("subjoinedLetter=\""
|
? ("subjoinedLetter=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ (hasWaZurSubjoinedToRootLetter()
|
+ (hasWaZurSubjoinedToRootLetter()
|
||||||
|
@ -953,17 +953,17 @@ public class LegalTshegBar
|
||||||
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
||||||
+ ("vowel=\""
|
+ ("vowel=\""
|
||||||
+ (hasExplicitVowel()
|
+ (hasExplicitVowel()
|
||||||
? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
|
? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
|
||||||
: "a")
|
: "a")
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
+ (hasSuffix()
|
+ (hasSuffix()
|
||||||
? ("suffix=\""
|
? ("suffix=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ (hasPostsuffix()
|
+ (hasPostsuffix()
|
||||||
? ("postsuffix=\""
|
? ("postsuffix=\""
|
||||||
+ UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
|
+ UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())
|
||||||
+ "\" ")
|
+ "\" ")
|
||||||
: "")
|
: "")
|
||||||
+ "/>");
|
+ "/>");
|
||||||
|
|
|
@ -21,7 +21,7 @@ package org.thdl.tib.text.tshegbar;
|
||||||
import org.thdl.tib.text.TibetanMachineWeb;
|
import org.thdl.tib.text.TibetanMachineWeb;
|
||||||
|
|
||||||
/** This noninstantiable class allows for converting from Unicode
|
/** This noninstantiable class allows for converting from Unicode
|
||||||
* codepoints to Extended Wylie. It cannot be used for long
|
* codepoints to THDL Extended Wylie. It cannot be used for long
|
||||||
* stretches of text, though, as it is unaware of context, which is
|
* stretches of text, though, as it is unaware of context, which is
|
||||||
* essential to understanding a non-trivial string of Tibetan
|
* essential to understanding a non-trivial string of Tibetan
|
||||||
* Unicode.
|
* Unicode.
|
||||||
|
@ -29,21 +29,22 @@ import org.thdl.tib.text.TibetanMachineWeb;
|
||||||
* <p>See the document by Nathaniel Garson and David Germano entitled
|
* <p>See the document by Nathaniel Garson and David Germano entitled
|
||||||
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
|
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
|
||||||
* a couple of issues with the November 18, 2001 revision of that
|
* a couple of issues with the November 18, 2001 revision of that
|
||||||
* document; these issues are in the Bugs tracker at our SourceForge site.</p>
|
* document; these issues are in the Bugs tracker at our SourceForge
|
||||||
|
* site.</p>
|
||||||
*
|
*
|
||||||
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
|
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
|
||||||
*
|
*
|
||||||
* @author David Chandler */
|
* @author David Chandler */
|
||||||
public class UnicodeCharToExtendedWylie {
|
public class UnicodeCodepointToThdlWylie {
|
||||||
|
|
||||||
/** Returns the extended Wylie for the very simple sequence x.
|
/** Returns the THDL extended Wylie for the very simple sequence
|
||||||
* Returns null iff some (Unicode) char in s has no extended
|
* x. Returns null iff some (Unicode) char in s has no THDL
|
||||||
* Wylie representation. This is unaware of context, so use it
|
* extended Wylie representation. This is unaware of context, so
|
||||||
* sparingly. */
|
* use it sparingly. */
|
||||||
public static StringBuffer getExtendedWylieForUnicodeString(String x) {
|
public static StringBuffer getThdlWylieForUnicodeString(String x) {
|
||||||
StringBuffer sb = new StringBuffer();
|
StringBuffer sb = new StringBuffer();
|
||||||
for (int i = 0; i < x.length(); i++) {
|
for (int i = 0; i < x.length(); i++) {
|
||||||
String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
|
String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i));
|
||||||
if (null == ew)
|
if (null == ew)
|
||||||
return null;
|
return null;
|
||||||
sb.append(ew);
|
sb.append(ew);
|
||||||
|
@ -51,12 +52,14 @@ public class UnicodeCharToExtendedWylie {
|
||||||
return sb;
|
return sb;
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns the extended Wylie for x, or null if there is none.
|
/** Returns the THDL extended Wylie for x, or null if there is
|
||||||
* Understand that multiple Unicode code points (chars) map to
|
* none. Understand that multiple Unicode code points (chars)
|
||||||
* the same Extended Wylie representation. Understand also that
|
* map to the same THDL Extended Wylie representation.
|
||||||
* the scrap of Extended Wylie returned is only valid in certain
|
* Understand also that the scrap of THDL Extended Wylie returned
|
||||||
* contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */
|
* is only valid in certain contexts. For example, not all
|
||||||
public static String getExtendedWylieForUnicodeChar(char x) {
|
* consonants take ra-btags. DLC NOW what about
|
||||||
|
* canonicalization? */
|
||||||
|
public static String getThdlWylieForUnicodeCodepoint(char x) {
|
||||||
switch (x) {
|
switch (x) {
|
||||||
|
|
||||||
case '\u0F00': return "oM";
|
case '\u0F00': return "oM";
|
||||||
|
@ -130,9 +133,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F40': return "k";
|
case '\u0F40': return "k";
|
||||||
case '\u0F41': return "kh";
|
case '\u0F41': return "kh";
|
||||||
case '\u0F42': return "g";
|
case '\u0F42': return "g";
|
||||||
case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
|
case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F44': return "ng";
|
case '\u0F44': return "ng";
|
||||||
case '\u0F45': return "c";
|
case '\u0F45': return "c";
|
||||||
case '\u0F46': return "ch";
|
case '\u0F46': return "ch";
|
||||||
|
@ -142,31 +145,31 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F4A': return "T";
|
case '\u0F4A': return "T";
|
||||||
case '\u0F4B': return "Th";
|
case '\u0F4B': return "Th";
|
||||||
case '\u0F4C': return "D";
|
case '\u0F4C': return "D";
|
||||||
case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
|
case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F4E': return "N";
|
case '\u0F4E': return "N";
|
||||||
case '\u0F4F': return "t";
|
case '\u0F4F': return "t";
|
||||||
|
|
||||||
case '\u0F50': return "th";
|
case '\u0F50': return "th";
|
||||||
case '\u0F51': return "d";
|
case '\u0F51': return "d";
|
||||||
case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
|
case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F53': return "n";
|
case '\u0F53': return "n";
|
||||||
case '\u0F54': return "p";
|
case '\u0F54': return "p";
|
||||||
case '\u0F55': return "ph";
|
case '\u0F55': return "ph";
|
||||||
case '\u0F56': return "b";
|
case '\u0F56': return "b";
|
||||||
case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
|
case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F58': return "m";
|
case '\u0F58': return "m";
|
||||||
case '\u0F59': return "ts";
|
case '\u0F59': return "ts";
|
||||||
case '\u0F5A': return "tsh";
|
case '\u0F5A': return "tsh";
|
||||||
case '\u0F5B': return "dz";
|
case '\u0F5B': return "dz";
|
||||||
case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
|
case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F5D': return "w";
|
case '\u0F5D': return "w";
|
||||||
case '\u0F5E': return "zh";
|
case '\u0F5E': return "zh";
|
||||||
case '\u0F5F': return "z";
|
case '\u0F5F': return "z";
|
||||||
|
@ -180,9 +183,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F66': return "s";
|
case '\u0F66': return "s";
|
||||||
case '\u0F67': return "h";
|
case '\u0F67': return "h";
|
||||||
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
|
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
|
||||||
case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
|
case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB5'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
||||||
case '\u0F6A': return "r";
|
case '\u0F6A': return "r";
|
||||||
case '\u0F6B': return null;
|
case '\u0F6B': return null;
|
||||||
case '\u0F6C': return null;
|
case '\u0F6C': return null;
|
||||||
|
@ -227,9 +230,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F90': return "k";
|
case '\u0F90': return "k";
|
||||||
case '\u0F91': return "kh";
|
case '\u0F91': return "kh";
|
||||||
case '\u0F92': return "g";
|
case '\u0F92': return "g";
|
||||||
case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
|
case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F94': return "ng";
|
case '\u0F94': return "ng";
|
||||||
case '\u0F95': return "c";
|
case '\u0F95': return "c";
|
||||||
case '\u0F96': return "ch";
|
case '\u0F96': return "ch";
|
||||||
|
@ -239,31 +242,31 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0F9A': return "T";
|
case '\u0F9A': return "T";
|
||||||
case '\u0F9B': return "Th";
|
case '\u0F9B': return "Th";
|
||||||
case '\u0F9C': return "D";
|
case '\u0F9C': return "D";
|
||||||
case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92')
|
case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F92')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0F9E': return "N";
|
case '\u0F9E': return "N";
|
||||||
case '\u0F9F': return "t";
|
case '\u0F9F': return "t";
|
||||||
|
|
||||||
case '\u0FA0': return "th";
|
case '\u0FA0': return "th";
|
||||||
case '\u0FA1': return "d";
|
case '\u0FA1': return "d";
|
||||||
case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
|
case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0FA3': return "n";
|
case '\u0FA3': return "n";
|
||||||
case '\u0FA4': return "p";
|
case '\u0FA4': return "p";
|
||||||
case '\u0FA5': return "ph";
|
case '\u0FA5': return "ph";
|
||||||
case '\u0FA6': return "b";
|
case '\u0FA6': return "b";
|
||||||
case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
|
case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0FA8': return "m";
|
case '\u0FA8': return "m";
|
||||||
case '\u0FA9': return "ts";
|
case '\u0FA9': return "ts";
|
||||||
case '\u0FAA': return "tsh";
|
case '\u0FAA': return "tsh";
|
||||||
case '\u0FAB': return "dz";
|
case '\u0FAB': return "dz";
|
||||||
case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
|
case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB7'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB7'));
|
||||||
case '\u0FAD': return "w";
|
case '\u0FAD': return "w";
|
||||||
case '\u0FAE': return "zh";
|
case '\u0FAE': return "zh";
|
||||||
case '\u0FAF': return "z";
|
case '\u0FAF': return "z";
|
||||||
|
@ -277,9 +280,9 @@ public class UnicodeCharToExtendedWylie {
|
||||||
case '\u0FB6': return "s";
|
case '\u0FB6': return "s";
|
||||||
case '\u0FB7': return "h";
|
case '\u0FB7': return "h";
|
||||||
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
|
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
|
||||||
case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
|
case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90')
|
||||||
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
|
||||||
+ getExtendedWylieForUnicodeChar('\u0FB5'));
|
+ getThdlWylieForUnicodeCodepoint('\u0FB5'));
|
||||||
case '\u0FBA': return "w";
|
case '\u0FBA': return "w";
|
||||||
case '\u0FBB': return "y";
|
case '\u0FBB': return "y";
|
||||||
case '\u0FBC': return "r";
|
case '\u0FBC': return "r";
|
||||||
|
@ -309,7 +312,7 @@ public class UnicodeCharToExtendedWylie {
|
||||||
|
|
||||||
// This codepoint is in the range 0FD0-0FFF or is not in
|
// This codepoint is in the range 0FD0-0FFF or is not in
|
||||||
// the Tibetan range at all. In either case, there is no
|
// the Tibetan range at all. In either case, there is no
|
||||||
// corresponding Extended Wylie.
|
// corresponding THDL Extended Wylie.
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
} // end switch
|
} // end switch
|
377
source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
Normal file
377
source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java
Normal file
|
@ -0,0 +1,377 @@
|
||||||
|
/*
|
||||||
|
The contents of this file are subject to the THDL Open Community License
|
||||||
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||||
|
with the License. You may obtain a copy of the License on the THDL web site
|
||||||
|
(http://www.thdl.org/).
|
||||||
|
|
||||||
|
Software distributed under the License is distributed on an "AS IS" basis,
|
||||||
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||||
|
License for the specific terms governing rights and limitations under the
|
||||||
|
License.
|
||||||
|
|
||||||
|
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||||
|
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
||||||
|
All Rights Reserved.
|
||||||
|
|
||||||
|
Contributor(s): ______________________________________.
|
||||||
|
*/
|
||||||
|
|
||||||
|
package org.thdl.tib.text.tshegbar;
|
||||||
|
|
||||||
|
import java.util.Vector;
|
||||||
|
|
||||||
|
import org.thdl.util.ThdlDebug;
|
||||||
|
|
||||||
|
/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such
|
||||||
|
* as whitespace or control characters or a Latin "character"), or a
|
||||||
|
* vertically stacked set of Tibetan consonants, vowels, marks, and
|
||||||
|
* signs. The Unicode string
|
||||||
|
* <code>"\u0F40\u0F0B\u0F41\u0F0B"</code> specifies
|
||||||
|
* four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
|
||||||
|
* you might notice), while the Unicode string
|
||||||
|
* <code>"\u0F66\u0FA5\u0F39\u0F90\u0FB5\u0F71\u0F80\u0F7F"</code>
|
||||||
|
* is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
|
||||||
|
* a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
|
||||||
|
* underneath all of that. I assume the latter grapheme cluster is
|
||||||
|
* nonsense, but it is considered one grapheme cluster because all
|
||||||
|
* but the first char are combining chars. See Unicode Technical
|
||||||
|
* Report 29.
|
||||||
|
*
|
||||||
|
* <p>As the above example demonstrates, not all
|
||||||
|
* UnicodeGraphemeClusters are syntactically legal in the Tibetan
|
||||||
|
* language. Not all of them are syntactically legal in Sanskrit
|
||||||
|
* transcribed in the Tibetan alphabet, either.</p>
|
||||||
|
*
|
||||||
|
* <p>The Unicode 3.2 standard (see especially Technical Report 29)
|
||||||
|
* refers to "grapheme clusters." A UnicodeGraphemeCluster is
|
||||||
|
* precisely a grapheme cluster as described by that standard. We
|
||||||
|
* interpret the standard as saying that <code>U+0F3E</code> and
|
||||||
|
* <code>U+0F3F</code> are each grapheme clusters unto themselves,
|
||||||
|
* even though they are combining codepoints.</p>
|
||||||
|
*
|
||||||
|
* @author David Chandler */
|
||||||
|
public class UnicodeGraphemeCluster
|
||||||
|
implements UnicodeReadyThunk, UnicodeConstants
|
||||||
|
{
|
||||||
|
/** @see #getCPHeight(char) */
|
||||||
|
private static final int MIN_HEIGHT = -6;
|
||||||
|
/** @see #getCPHeight(char) */
|
||||||
|
private static final int MAX_HEIGHT = 3;
|
||||||
|
|
||||||
|
/** The Unicode codepoints that compose this grapheme cluster.
|
||||||
|
This is legal, i.e. if there is a Tibetan vowel, it is the
|
||||||
|
last codepoint. It is in Normalization Form THDL (NFTHDL). */
|
||||||
|
private String unicodeString;
|
||||||
|
|
||||||
|
/** Do not use this constructor. */
|
||||||
|
private UnicodeGraphemeCluster() { super(); }
|
||||||
|
|
||||||
|
/** Creates a new GraphemeCluster given a legal sequence of
|
||||||
|
Unicode codepoints corresponding to a single grapheme
|
||||||
|
cluster.
|
||||||
|
@exception IllegalArgumentException if unicodeString is not a
|
||||||
|
syntactically correct Unicode 3.2 sequence (if it begins with
|
||||||
|
a combining codepoint or has a Tibetan vowel before another
|
||||||
|
combining character, for example, or if it is more than one
|
||||||
|
grapheme cluster. Note that syntactical correctness for
|
||||||
|
non-Tibetan codepoints is not likely to be known by this
|
||||||
|
routine. */
|
||||||
|
public UnicodeGraphemeCluster(String unicodeString)
|
||||||
|
throws IllegalArgumentException
|
||||||
|
{
|
||||||
|
// check legality:
|
||||||
|
// DLC NOW FIXME
|
||||||
|
|
||||||
|
// convert to NFTHDL:
|
||||||
|
this.unicodeString
|
||||||
|
= UnicodeUtils.toMostlyDecomposedUnicode(unicodeString, NORM_NFTHDL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a string of codepoints in NFTHDL form. */
|
||||||
|
public String getUnicodeRepresentation() {
|
||||||
|
return unicodeString;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true. */
|
||||||
|
public boolean hasUnicodeRepresentation() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true iff this stack could occur in syntactically
|
||||||
|
* correct, run-of-the-mill Tibetan (as opposed to Tibetanized
|
||||||
|
* Sanksrit, Chinese, et cetera). sga is a legal Tibetan stack,
|
||||||
|
* but g+g is not, for example. */
|
||||||
|
public boolean isLegalTibetan() {
|
||||||
|
// DLC FIXME: for those odd head marks etc., return true even
|
||||||
|
// though hasUnicodeRepresentation() will return false.
|
||||||
|
|
||||||
|
// Note that ra-btags and wa-zur both be present in legal
|
||||||
|
// Tibetan.
|
||||||
|
|
||||||
|
throw new Error("DLC FIXME: not yet implemented.");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a <unicodeGraphemeCluster> element that contains the
|
||||||
|
* THDL Extended Wylie transliteration for this cluster. */
|
||||||
|
public String toConciseXML() {
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a <unicodeGraphemeCluster> element that contains this
|
||||||
|
* cluster broken down into its constituent decomposed
|
||||||
|
* codepoints. */
|
||||||
|
public String toVerboseXML() {
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns the THDL Extended Wylie transliteration of this
|
||||||
|
grapheme cluster, or null if there is none (which happens for
|
||||||
|
a few Tibetan codepoints, if you'll recall). If needsVowel is
|
||||||
|
true, then an "a" will be appended when there is no EW_achung
|
||||||
|
or explicit simple vowel. If there is an explicit vowel or
|
||||||
|
EW_achung, it will always be present. Note that needsVowel is
|
||||||
|
provided because btags is the preferred THDL Extended Wylie
|
||||||
|
for the four contiguous grapheme clusters
|
||||||
|
<code>"\u0F56\u0F4F\u0F42\u0F66"</code>, and
|
||||||
|
needsVowel must be set to false for all but the grapheme
|
||||||
|
cluster corresponding to <code>\u0F4F</code> if you wish
|
||||||
|
to get the preferred THDL Extended Wylie. */
|
||||||
|
public String getThdlWylie(boolean needsVowel) {
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Given some (possibly unnormalized) Unicode 3.2 string unicode,
|
||||||
|
appends grapheme clusters to the vector of GraphemeClusters
|
||||||
|
grcls if grcls is nonnulla. Performs good error checking if
|
||||||
|
validate is true. If an error is found, grcls may have been
|
||||||
|
modified if nonnull. Setting grcls to null and setting
|
||||||
|
validate to true is sometimes useful for testing the validity
|
||||||
|
of a Unicode string.
|
||||||
|
@return the number of grapheme clusters that were or would
|
||||||
|
have been added to grcls
|
||||||
|
@exception BadTibetanUnicodeException if the unicode is not
|
||||||
|
syntactically legal
|
||||||
|
@exception IllegalArgumentException if correctErrors and
|
||||||
|
validate are both true
|
||||||
|
@exception NullPointerException if unicode is null */
|
||||||
|
public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
|
||||||
|
String unicode,
|
||||||
|
boolean validate,
|
||||||
|
boolean correctErrors)
|
||||||
|
throws // DLC SOON: BadTibetanUnicodeException,
|
||||||
|
IllegalArgumentException, NullPointerException
|
||||||
|
{
|
||||||
|
if (validate && correctErrors) {
|
||||||
|
throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
|
||||||
|
}
|
||||||
|
throw new Error("DLC NOW unimplemented");
|
||||||
|
/*
|
||||||
|
if (start == i) {
|
||||||
|
// special tests at the beginning of input.
|
||||||
|
if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
|
||||||
|
throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (height == last_height) {
|
||||||
|
if ('\u0F39' == cp) {
|
||||||
|
if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
|
||||||
|
throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
// DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Test to see if this last character has ended this
|
||||||
|
// grapheme cluster:
|
||||||
|
if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
|
||||||
|
// DLC: cp ENDS A GRAPHEME CLUSTER!!!
|
||||||
|
}
|
||||||
|
*/
|
||||||
|
}
|
||||||
|
|
||||||
|
/** FIXMEDOC */
|
||||||
|
public String getTopToBottomCodepoints() {
|
||||||
|
return getTopToBottomCodepoints(new StringBuffer(unicodeString),
|
||||||
|
0, unicodeString.length()).toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns a new StringBuffer consisting of the codepoints in
|
||||||
|
NFTHDLString at indices [start, end) sorted in top-to-bottom
|
||||||
|
order, or null on some occasions when NFTHDLString is already
|
||||||
|
sorted. A top-to-bottom ordering is a useful form for
|
||||||
|
applications wishing to render the grapheme cluster. Note
|
||||||
|
that this method is only useful if NFTHDLString is part of or
|
||||||
|
an entire grapheme cluster. Does no error checking on
|
||||||
|
NFTHDLString.
|
||||||
|
@param NFTHDLString a buffer with characters at indices i,
|
||||||
|
where start <= i < end, being the Unicode codepoints for a
|
||||||
|
single grapheme cluster or part of a grapheme cluster
|
||||||
|
@param start NFTHDLString.charAt(start) is the first codepoint
|
||||||
|
dealt with
|
||||||
|
@param end NFTHDLString.charAt(end) is the first codepoint NOT
|
||||||
|
dealt with
|
||||||
|
@return null only if (but not necessarily if) NFTHDLString is
|
||||||
|
already sorted top-to-bottom, or the sorted form of
|
||||||
|
NFTHDLString */
|
||||||
|
private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
|
||||||
|
int start, int end)
|
||||||
|
{
|
||||||
|
if (end <= start) /* 0-length string. */
|
||||||
|
return null;
|
||||||
|
if (start + 1 == end) /* 1-length string. */
|
||||||
|
return null;
|
||||||
|
// else we have a string of length >= 2.
|
||||||
|
|
||||||
|
// We'll use the world's fastest sorting algorithm. Linear
|
||||||
|
// time, baby. Here are the ten or so mailboxes for our
|
||||||
|
// postman's sort:
|
||||||
|
StringBuffer chunksAtCommonHeights[]
|
||||||
|
= new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
|
||||||
|
|
||||||
|
for (int i = start; i < end; i++) {
|
||||||
|
char cp = NFTHDLString.charAt(i);
|
||||||
|
int height = getCPHeight(cp);
|
||||||
|
|
||||||
|
// initialize mailbox if necessary.
|
||||||
|
if (null == chunksAtCommonHeights[height - MIN_HEIGHT]) {
|
||||||
|
chunksAtCommonHeights[height - MIN_HEIGHT]
|
||||||
|
= new StringBuffer(2);
|
||||||
|
}
|
||||||
|
|
||||||
|
// put this cp into the correct mailbox.
|
||||||
|
chunksAtCommonHeights[height - MIN_HEIGHT].append(cp);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Now concatenate together the mailboxes:
|
||||||
|
StringBuffer sb = new StringBuffer(end - start);
|
||||||
|
for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--) {
|
||||||
|
if (null != chunksAtCommonHeights[h - MIN_HEIGHT]) {
|
||||||
|
sb.append(chunksAtCommonHeights[h - MIN_HEIGHT]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return sb;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
    /** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
        This relative height is 0 for a base consonant, digit,
        punctuation, mark, or sign.  It is -1 for a subjoined
        consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
        EWV_gigu, and so on according to the height these codepoints
        appear relative to one another when on the same stack.  If two
        codepoints have equal height, they should not exist in the
        same grapheme cluster unless one is <code>U+0F39</code>, which
        is an integral part of a consonant when tacked on to, e.g.,
        EWC_PHA.

        <p>If x is not a Unicode 3.2 codepoint in the Tibetan range,
        or if x is not in NFTHDL form, 0 is returned.  The height code
        of <code>U+0F76</code> is not valid, and it is not an accident
        that <code>U+0F76</code> is not in NFTHDL form.</p> */
    private static int getCPHeight(char x) {
        // DLC make this an assertion:
        // NOTE(review): toNormalizedForm presumably returns null when x
        // needs no further decomposition, so this verifies x is already
        // in NFTHDL form -- confirm against UnicodeUtils.
        ThdlDebug.verify(null == UnicodeUtils.toNormalizedForm(x, NORM_NFTHDL));

        if (x >= '\u0F90' && x <= '\u0FAC'
            || x >= '\u0FAE' && x <= '\u0FBC') {
            // subjoined consonant.  Note that wa-zur is an exception.
            // (U+0FAD, wa-zur, is deliberately excluded from these two
            // ranges and handled in the switch below.)
            return -1;
        } else if (x >= '\u0F00' && x <= '\u0F17'
                   || x >= '\u0F1A' && x <= '\u0F34'
                   || x >= '\u0F3A' && x <= '\u0F3D'
                   || x >= '\u0F40' && x <= '\u0F6A' // consonants
                   || x >= '\u0F88' && x <= '\u0F8B'
                   || x >= '\u0FBE' && x <= '\u0FCF') {
            // neutral height:
            return 0;
        } else { // Oddballs.
            switch (x) {
            //
            // non-combining:
            //
            case '\u0F36':
            case '\u0F38':
            case '\u0F85':
                return 0;

            //
            // combining, but left-to-right combining:
            //
            case '\u0F3E':
            case '\u0F3F':
            case '\u0F7F':
                return 0;

            //
            // combining by coming below:
            //
            case '\u0FAD':
                return -2; // wa-zur
            case '\u0F71':
                return -3; // a-chung
            case '\u0F74':
            case '\u0F84':
                return -4; // DLC CHECKME
            case '\u0F18': // combines with digits
            case '\u0F19': // combines with digits
                return -5;
            case '\u0F35':
            case '\u0F37':
            case '\u0FC6': {
                // Sanity-check that the bottom of the height scale is
                // what the mailbox array in the caller expects.
                ThdlDebug.verify(-6 == MIN_HEIGHT);
                return -6; // min height
            }

            //
            // combining by coming above:
            //
            case '\u0F72':
            case '\u0F7A':
            case '\u0F7B':
            case '\u0F7C':
            case '\u0F7D':
            case '\u0F80':
                return 1;
            case '\u0F7E':
            case '\u0F82':
            case '\u0F83':
                return 2; // these three come above 0F7C, right? (DLC CHECKME)
            case '\u0F86':
            case '\u0F87': {
                // Sanity-check the top of the height scale likewise.
                ThdlDebug.verify(3 == MAX_HEIGHT);
                return 3; // max height
            }

            //
            // exceptional case:
            //
            // some would say +1, but then "\u0F40\u0FA5\u0F39" will
            // not have a5 combine with 39.  Unicode could well have
            // put in a single codepoint for "\u0FA5\u0F39" IMO.
            case '\u0F39': return 0;

            default: {
                if (x >= '\u0F00' && x <= '\u0FFF') {
                    // This wasn't explicitly handled?  Hmmm...  This
                    // won't ever happen for NFTHDL-formed input.
                    ThdlDebug.noteIffyCode();
                }

                // This codepoint is not in the Tibetan range.
                return 0;
            }
            } // end switch
        }
    }
|
||||||
|
    /** DLC SOON */
    public boolean isTibetan() {
        // DLC FIXME: unimplemented stub -- always throws.
        throw new Error("DLC FIXME: not yet implemented.");
    }
|
||||||
|
}
|
||||||
|
|
|
@ -97,10 +97,12 @@ public class UnicodeUtils implements UnicodeConstants {
|
||||||
Unicode codepoints, into either Normalization Form KD (NFKD),
|
Unicode codepoints, into either Normalization Form KD (NFKD),
|
||||||
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
|
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
|
||||||
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
|
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
|
||||||
for {@link org.thdl.tib.text.tshegbar#GraphemeCluster} because
|
for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster}
|
||||||
NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
|
because NFKD normalizes <code>U+0F0C</code> and neither NFD
|
||||||
codepoints, and it never uses codepoints whose use has been
|
nor NFKD breaks down <code>U+0F00</code> into its constituent
|
||||||
{@link #isDiscouraged(char) discouraged}.
|
codepoints. NFTHDL uses a maximum of codepoints, and it never
|
||||||
|
uses codepoints whose use has been {@link #isDiscouraged(char)
|
||||||
|
discouraged}.
|
||||||
|
|
||||||
<p>The Tibetan passages of the returned string are in the
|
<p>The Tibetan passages of the returned string are in the
|
||||||
chosen normalized form, but codepoints outside of the {@link
|
chosen normalized form, but codepoints outside of the {@link
|
||||||
|
@ -170,6 +172,8 @@ public class UnicodeUtils implements UnicodeConstants {
|
||||||
// Where not specified, the NFKD and NFTHDL forms are
|
// Where not specified, the NFKD and NFTHDL forms are
|
||||||
// identical to the NFD form.
|
// identical to the NFD form.
|
||||||
switch (tibetanUnicodeCP) {
|
switch (tibetanUnicodeCP) {
|
||||||
|
case '\u0F00': return ((normalizationForm == NORM_NFTHDL)
|
||||||
|
? "\u0F68\u0F7C\u0F7E" : null);
|
||||||
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
|
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
|
||||||
? "\u0F0B" : null);
|
? "\u0F0B" : null);
|
||||||
case '\u0F43': return "\u0F42\u0FB7";
|
case '\u0F43': return "\u0F42\u0FB7";
|
||||||
|
@ -282,9 +286,37 @@ public class UnicodeUtils implements UnicodeConstants {
|
||||||
}
|
}
|
||||||
|
|
||||||
/** Returns a human-readable, ASCII form of the Unicode codepoint
|
/** Returns a human-readable, ASCII form of the Unicode codepoint
|
||||||
ch. */
|
cp. */
|
||||||
public static String unicodeCPToString(char ch) {
|
public static String unicodeCodepointToString(char cp) {
|
||||||
return "U+" + Integer.toHexString((int)ch);
|
if (cp < '\u0010')
|
||||||
|
return "\\u000" + Integer.toHexString((int)cp);
|
||||||
|
else if (cp < '\u0100')
|
||||||
|
return "\\u00" + Integer.toHexString((int)cp);
|
||||||
|
else if (cp < '\u1000')
|
||||||
|
return "\\u0" + Integer.toHexString((int)cp);
|
||||||
|
else
|
||||||
|
return "\\u" + Integer.toHexString((int)cp);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String unicodeStringToString(String s) {
|
||||||
|
StringBuffer sb = new StringBuffer(s.length() * 6);
|
||||||
|
for (int i = 0; i < s.length(); i++) {
|
||||||
|
sb.append(unicodeCodepointToString(s.charAt(i)));
|
||||||
|
}
|
||||||
|
return sb.toString();
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Returns true iff cp is a Unicode 3.2 Tibetan consonant,
|
||||||
|
subjoined or not. This counts precomposed consonant stacks
|
||||||
|
like <code>U+0FA7</code> as consonants. If you don't wish to
|
||||||
|
treat such as consonants, then put the input into NORM_NFD,
|
||||||
|
NORM_NFKD, or NORM_NFTHDL first. If it changes under such a
|
||||||
|
normalization, it is a precomposed consonant. */
|
||||||
|
public static boolean isTibetanConsonant(char cp) {
|
||||||
|
return (((cp >= '\u0F40' && cp <= '\u0F6A')
|
||||||
|
|| (cp >= '\u0F90' && cp <= '\u0FBC'))
|
||||||
|
&& '\u0F48' != cp
|
||||||
|
&& '\u0F98' != cp);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue