diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index d791e19..35950ac 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -748,7 +748,7 @@ public class LegalTshegBar if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { return internalThrowThing(throwIfIllegal, "Illegal suffix -- not one of the ten legal suffixes: " - + UnicodeUtils.unicodeCPToString(suffix.charAt(0))); + + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0))); } } } @@ -837,7 +837,7 @@ public class LegalTshegBar boolean disambiguatorNeeded = false; char prefix = getPrefix(); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix)); if (!hasHeadLetter()) { if (EWC_ya == rootLetter) { if (isConsonantThatTakesYaBtags(prefix)) @@ -857,55 +857,55 @@ public class LegalTshegBar sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); } if (hasHeadLetter()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(rootLetter)); if (hasSubjoinedLetter()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter())); if (hasWaZurSubjoinedToRootLetter()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EWSUB_wa_zur)); // a-chung is treated, in THDL Extended Wylie, like a vowel. // I.e., you don't have 'pAa', you have 'pA'. 
if (hasAChungOnRootLetter()) { if (hasExplicitVowel()) { if (EWV_i == getVowel()) { - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73')); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F73')); } else if (EWV_u == getVowel()) { - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75')); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F75')); } else if (EWV_e == getVowel() || EWV_o == getVowel()) { // The exception to the rule for a-chung and vowels... // DLC FIXME: are these allowed in legal Tibetan? // EWTS would have special cases for them if so, // I'd wager... - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())); } else { ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); } } else { - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); } } else { if (hasExplicitVowel()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())); else sb.append("a"); } if (hasSuffix()) { String suf = getSuffix(); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0))); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0))); if (suf.length() > 1) { // DLC assert, don't verify, that the length is two. // This could change if I learn of more suffix // particles. 
ThdlDebug.verify(2 == suf.length()); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1))); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1))); } } if (hasPostsuffix()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())); return sb; } @@ -929,18 +929,18 @@ public class LegalTshegBar + "transliterationType=\"THDL Extended Wylie 0.5\" " + (hasPrefix() ? ("prefix=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ") + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPrefix()) + "\" ") : "") + (hasHeadLetter() ? ("headLetter=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()) + "\" ") : "") + ("rootLetter=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ") + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getRootLetter()) + "\" ") + (hasSubjoinedLetter() ? ("subjoinedLetter=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter()) + "\" ") : "") + (hasWaZurSubjoinedToRootLetter() @@ -953,17 +953,17 @@ public class LegalTshegBar // DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ? + ("vowel=\"" + (hasExplicitVowel() - ? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()) + ? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()) : "a") + "\" ") + (hasSuffix() ? ("suffix=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(getSuffix()) + "\" ") : "") + (hasPostsuffix() ? 
("postsuffix=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()) + "\" ") : "") + "/>"); diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java similarity index 79% rename from source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java rename to source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java index 955ca59..7cc5b13 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java @@ -21,7 +21,7 @@ package org.thdl.tib.text.tshegbar; import org.thdl.tib.text.TibetanMachineWeb; /** This noninstantiable class allows for converting from Unicode - * codepoints to Extended Wylie. It cannot be used for long + * codepoints to THDL Extended Wylie. It cannot be used for long * stretches of text, though, as it is unaware of context, which is * essential to understanding a non-trivial string of Tibetan * Unicode. @@ -29,21 +29,22 @@ import org.thdl.tib.text.TibetanMachineWeb; *
See the document by Nathaniel Garson and David Germano entitled * Extended Wylie Transliteration Scheme. Note that there are * a couple of issues with the November 18, 2001 revision of that - * document; these issues are in the Bugs tracker at our SourceForge site.
+ * document; these issues are in the Bugs tracker at our SourceForge + * site. * * @see SourceForge site * * @author David Chandler */ -public class UnicodeCharToExtendedWylie { +public class UnicodeCodepointToThdlWylie { - /** Returns the extended Wylie for the very simple sequence x. - * Returns null iff some (Unicode) char in s has no extended - * Wylie representation. This is unaware of context, so use it - * sparingly. */ - public static StringBuffer getExtendedWylieForUnicodeString(String x) { + /** Returns the THDL extended Wylie for the very simple sequence + * x. Returns null iff some (Unicode) char in s has no THDL + * extended Wylie representation. This is unaware of context, so + * use it sparingly. */ + public static StringBuffer getThdlWylieForUnicodeString(String x) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < x.length(); i++) { - String ew = getExtendedWylieForUnicodeChar(x.charAt(i)); + String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i)); if (null == ew) return null; sb.append(ew); @@ -51,12 +52,14 @@ public class UnicodeCharToExtendedWylie { return sb; } - /** Returns the extended Wylie for x, or null if there is none. - * Understand that multiple Unicode code points (chars) map to - * the same Extended Wylie representation. Understand also that - * the scrap of Extended Wylie returned is only valid in certain - * contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */ - public static String getExtendedWylieForUnicodeChar(char x) { + /** Returns the THDL extended Wylie for x, or null if there is + * none. Understand that multiple Unicode code points (chars) + * map to the same THDL Extended Wylie representation. + * Understand also that the scrap of THDL Extended Wylie returned + * is only valid in certain contexts. For example, not all + * consonants take ra-btags. DLC NOW what about + * canonicalization? 
*/ + public static String getThdlWylieForUnicodeCodepoint(char x) { switch (x) { case '\u0F00': return "oM"; @@ -130,9 +133,9 @@ public class UnicodeCharToExtendedWylie { case '\u0F40': return "k"; case '\u0F41': return "kh"; case '\u0F42': return "g"; - case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42') + case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F44': return "ng"; case '\u0F45': return "c"; case '\u0F46': return "ch"; @@ -142,31 +145,31 @@ public class UnicodeCharToExtendedWylie { case '\u0F4A': return "T"; case '\u0F4B': return "Th"; case '\u0F4C': return "D"; - case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C') + case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F4E': return "N"; case '\u0F4F': return "t"; case '\u0F50': return "th"; case '\u0F51': return "d"; - case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51') + case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F53': return "n"; case '\u0F54': return "p"; case '\u0F55': return "ph"; case '\u0F56': return "b"; - case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56') + case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
- + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F58': return "m"; case '\u0F59': return "ts"; case '\u0F5A': return "tsh"; case '\u0F5B': return "dz"; - case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B') + case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F5D': return "w"; case '\u0F5E': return "zh"; case '\u0F5F': return "z"; @@ -180,9 +183,9 @@ public class UnicodeCharToExtendedWylie { case '\u0F66': return "s"; case '\u0F67': return "h"; case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck... - case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40') + case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB5')); + + getThdlWylieForUnicodeCodepoint('\u0FB5')); case '\u0F6A': return "r"; case '\u0F6B': return null; case '\u0F6C': return null; @@ -227,9 +230,9 @@ public class UnicodeCharToExtendedWylie { case '\u0F90': return "k"; case '\u0F91': return "kh"; case '\u0F92': return "g"; - case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92') + case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
- + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F94': return "ng"; case '\u0F95': return "c"; case '\u0F96': return "ch"; @@ -239,31 +242,31 @@ public class UnicodeCharToExtendedWylie { case '\u0F9A': return "T"; case '\u0F9B': return "Th"; case '\u0F9C': return "D"; - case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92') + case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F92') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F9E': return "N"; case '\u0F9F': return "t"; case '\u0FA0': return "th"; case '\u0FA1': return "d"; - case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1') + case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0FA3': return "n"; case '\u0FA4': return "p"; case '\u0FA5': return "ph"; case '\u0FA6': return "b"; - case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6') + case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0FA8': return "m"; case '\u0FA9': return "ts"; case '\u0FAA': return "tsh"; case '\u0FAB': return "dz"; - case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB') + case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
- + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0FAD': return "w"; case '\u0FAE': return "zh"; case '\u0FAF': return "z"; @@ -277,9 +280,9 @@ public class UnicodeCharToExtendedWylie { case '\u0FB6': return "s"; case '\u0FB7': return "h"; case '\u0FB8': return "a"; // DLC see note on \u0F68 ... - case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90') + case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB5')); + + getThdlWylieForUnicodeCodepoint('\u0FB5')); case '\u0FBA': return "w"; case '\u0FBB': return "y"; case '\u0FBC': return "r"; @@ -309,7 +312,7 @@ public class UnicodeCharToExtendedWylie { // This codepoint is in the range 0FD0-0FFF or is not in // the Tibetan range at all. In either case, there is no - // corresponding Extended Wylie. + // corresponding THDL Extended Wylie. return null; } } // end switch diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java b/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java new file mode 100644 index 0000000..62fe9c4 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java @@ -0,0 +1,377 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. 
+ +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +import java.util.Vector; + +import org.thdl.util.ThdlDebug; + +/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such + * as whitespace or control characters or a Latin "character"), or a + * vertically stacked set of Tibetan consonants, vowels, marks, and + * signs. The Unicode string + *"\u0F40\u0F0B\u0F41\u0F0B"
specifies
+ * four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
+ * you might notice), while the Unicode string
+ * "\u0F66\u0FA5\u0F39\u0F90\u0FB5\u0F71\u0F80\u0F7F"
+ * is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
+ * a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
+ * underneath all of that. I assume the latter grapheme cluster is
+ * nonsense, but it is considered one grapheme cluster because all
+ * but the first char are combining chars. See Unicode Technical
+ * Report 29.
+ *
+ * As the above example demonstrates, not all + * UnicodeGraphemeClusters are syntactically legal in the Tibetan + * language. Not all of them are syntactically legal in Sanskrit + * transcribed in the Tibetan alphabet, either.
+ * + *The Unicode 3.2 standard (see especially Technical Report 29)
+ * refers to "grapheme clusters." A UnicodeGraphemeCluster is
+ * precisely a grapheme cluster as described by that standard. We
+ * interpret the standard as saying that U+0F3E
and
+ * U+0F3F
are each grapheme clusters unto themselves,
+ * even though they are combining codepoints.
"\u0F56\u0F4F\u0F42\u0F66"
, and
+ needsVowel must be set to false for all but the grapheme
+ cluster corresponding to \u0F4F
if you wish
+ to get the preferred THDL Extended Wylie. */
+ public String getThdlWylie(boolean needsVowel) {
+ // Not implemented yet: every call fails with an Error carrying the
+ // "DLC NOW" marker, and needsVowel is never consulted.
+ // NOTE(review): UnsupportedOperationException is the conventional
+ // signal for an unimplemented method -- confirm that nothing relies
+ // on an Error being thrown before changing it.
+ throw new Error("DLC NOW unimplemented");
+ }
+
+    /** Splits the (possibly unnormalized) Unicode 3.2 string unicode
+        into grapheme clusters, appending each one to grcls when
+        grcls is nonnull.  Not yet implemented: once the argument
+        check below has passed, every call fails with an Error.
+
+        <p>The intended contract, once implemented: thorough error
+        checking is performed if validate is true.  If an error is
+        found, grcls (if nonnull) may already have been modified.
+        Passing a null grcls together with a true validate is
+        sometimes useful for testing the validity of a Unicode string.
+        @param grcls the Vector to append grapheme clusters to, or null
+        @param unicode the text to break into grapheme clusters
+        @param validate true to request error checking
+        @param correctErrors must not be true when validate is true
+        @return the number of grapheme clusters that were or would
+        have been added to grcls
+        @exception BadTibetanUnicodeException if the unicode is not
+        syntactically legal (planned; not yet in the throws clause)
+        @exception IllegalArgumentException if correctErrors and
+        validate are both true
+        @exception NullPointerException if unicode is null (declared,
+        but no null check is performed yet) */
+    public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
+                                                       String unicode,
+                                                       boolean validate,
+                                                       boolean correctErrors)
+        throws // DLC SOON: BadTibetanUnicodeException,
+               IllegalArgumentException, NullPointerException
+    {
+        // Validating and correcting are mutually exclusive modes.
+        if (correctErrors && validate) {
+            throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
+        }
+
+        throw new Error("DLC NOW unimplemented");
+
+        /* Sketch of the planned per-codepoint checks (never compiled):
+
+           if (start == i) {
+               // special tests at the beginning of input.
+               if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
+                   throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
+               }
+           }
+           if (height == last_height) {
+               if ('\u0F39' == cp) {
+                   if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
+                       throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
+                   }
+               } else {
+                   // DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
+               }
+           }
+
+           // Test to see if this last character has ended this
+           // grapheme cluster:
+           if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
+               // DLC: cp ENDS A GRAPHEME CLUSTER!!!
+           }
+        */
+    }
+
+    /** Returns the codepoints of this grapheme cluster's unicodeString
+        ordered from the top of the stack to the bottom, i.e. from the
+        greatest height (see getCPHeight) to the least.
+        @return the reordered codepoints, or the text of unicodeString
+        itself when it holds fewer than two codepoints */
+    public String getTopToBottomCodepoints() {
+        StringBuffer sorted
+            = getTopToBottomCodepoints(new StringBuffer(unicodeString),
+                                       0, unicodeString.length());
+        // The static overload answers null for ranges of length 0 or 1
+        // (already sorted), so calling toString() on its result
+        // unconditionally threw a NullPointerException for every
+        // single-codepoint grapheme cluster.
+        if (null == sorted) {
+            // unicodeString is presumably a String (its declaration is
+            // not visible here); toString() is then a no-op.
+            return unicodeString.toString();
+        }
+        return sorted.toString();
+    }
+
+    /** Sorts the codepoints of NFTHDLString at indices [start, end)
+        from the greatest height down to the least, where height is
+        whatever getCPHeight reports.  Codepoints of equal height keep
+        their original relative order.  A top-to-bottom ordering is a
+        useful form for applications wishing to render the grapheme
+        cluster.  The result is only meaningful when the range is all
+        or part of a single grapheme cluster; no error checking is
+        done on NFTHDLString.
+        @param NFTHDLString buffer holding the codepoints; it is read
+        but never modified
+        @param start index of the first codepoint dealt with
+        @param end index of the first codepoint NOT dealt with
+        @return null when the range holds fewer than two codepoints
+        (such a range is trivially sorted), otherwise a new
+        StringBuffer holding the sorted codepoints */
+    private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
+                                                         int start, int end)
+    {
+        // Zero or one codepoint: nothing to reorder.
+        if (end <= start || end == start + 1)
+            return null;
+
+        // Bucket sort in linear time: one bucket per possible height.
+        // Bucket k collects, in encounter order, the codepoints whose
+        // height is MIN_HEIGHT + k.
+        StringBuffer[] buckets
+            = new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
+
+        for (int idx = start; idx < end; idx++) {
+            char codepoint = NFTHDLString.charAt(idx);
+            int slot = getCPHeight(codepoint) - MIN_HEIGHT;
+
+            // Buckets are created lazily; most heights go unused.
+            StringBuffer bucket = buckets[slot];
+            if (null == bucket) {
+                bucket = new StringBuffer(2);
+                buckets[slot] = bucket;
+            }
+            bucket.append(codepoint);
+        }
+
+        // Empty the buckets from the tallest height to the lowest.
+        StringBuffer sorted = new StringBuffer(end - start);
+        for (int slot = buckets.length - 1; slot >= 0; slot--) {
+            if (null != buckets[slot])
+                sorted.append(buckets[slot]);
+        }
+        return sorted;
+    }
+
+
+ /** Returns the height for the Tibetan Unicode codepoint x.
+ This relative height is 0 for a base consonant, digit,
+ punctuation, mark, or sign. It is -1 for a subjoined
+ consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
+ EWV_gigu, and so on according to the height these codepoints
+ appear relative to one another when on the same stack. If two
+ codepoints have equal height, they should not exist in the
+ same grapheme cluster unless one is <code>U+0F39</code>, which
+ is an integral part of a consonant when tacked on to, e.g.,
+ EWC_PHA.
+
+ If x is not a Unicode 3.2 codepoint in the Tibetan range,
+ or if x is not in NFTHDL form, 0 is returned. The height code
+ of <code>U+0F76</code> is not valid, and it is not an accident
+ that <code>U+0F76</code> is not in NFTHDL form.
U+0F0C
. NFTHDL uses a maximum of
- codepoints, and it never uses codepoints whose use has been
- {@link #isDiscouraged(char) discouraged}.
+ for {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster}
+ because NFKD normalizes <code>U+0F0C</code> and neither NFD
+ nor NFKD breaks down <code>U+0F00</code> into its constituent
+ codepoints. NFTHDL uses a maximum of codepoints, and it never
+ uses codepoints whose use has been {@link #isDiscouraged(char)
+ discouraged}.
The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
@@ -170,6 +172,8 @@ public class UnicodeUtils implements UnicodeConstants {
// Where not specified, the NFKD and NFTHDL forms are
// identical to the NFD form.
switch (tibetanUnicodeCP) {
+ case '\u0F00': return ((normalizationForm == NORM_NFTHDL)
+ ? "\u0F68\u0F7C\u0F7E" : null);
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
? "\u0F0B" : null);
case '\u0F43': return "\u0F42\u0FB7";
@@ -282,9 +286,37 @@ public class UnicodeUtils implements UnicodeConstants {
}
/** Returns a human-readable, ASCII form of the Unicode codepoint
- ch. */
- public static String unicodeCPToString(char ch) {
- return "U+" + Integer.toHexString((int)ch);
+ cp. */
+    public static String unicodeCodepointToString(char cp) {
+        // Lowercase hex digits of cp, without leading zeros (1 to 4
+        // digits, since a char never exceeds 0xffff).
+        String hex = Integer.toHexString((int)cp);
+
+        StringBuffer escaped = new StringBuffer(6);
+        escaped.append("\\u");
+        // Left-pad with zeros to the four digits of a Java-style
+        // Unicode escape.
+        for (int width = hex.length(); width < 4; width++) {
+            escaped.append('0');
+        }
+        escaped.append(hex);
+        return escaped.toString();
+    }
+
+    /** Returns a human-readable, ASCII form of s: the results of
+        unicodeCodepointToString for each char of s, concatenated in
+        order.  The empty string maps to the empty string.
+        @exception NullPointerException if s is null */
+    public static String unicodeStringToString(String s) {
+        final int len = s.length();
+        // Each char expands to six characters.
+        StringBuffer escaped = new StringBuffer(len * 6);
+        int pos = 0;
+        while (pos < len) {
+            escaped.append(unicodeCodepointToString(s.charAt(pos)));
+            pos++;
+        }
+        return escaped.toString();
+    }
+
+    /** Tells whether cp is a Unicode 3.2 Tibetan consonant, be it a
+        base consonant or a subjoined one.  Precomposed consonant
+        stacks such as <code>U+0FA7</code> count as consonants.  To
+        exclude them, first put the input into NORM_NFD, NORM_NFKD, or
+        NORM_NFTHDL; a codepoint that changes under such a
+        normalization is a precomposed consonant.
+        @param cp the codepoint to classify
+        @return true iff cp lies in U+0F40..U+0F6A or U+0F90..U+0FBC
+        and is neither U+0F48 nor U+0F98 */
+    public static boolean isTibetanConsonant(char cp) {
+        // The two holes inside the consonant ranges (unassigned
+        // codepoints in the Tibetan block) are never consonants.
+        if ('\u0F48' == cp || '\u0F98' == cp)
+            return false;
+
+        boolean inBaseRange = ('\u0F40' <= cp && cp <= '\u0F6A');
+        boolean inSubjoinedRange = ('\u0F90' <= cp && cp <= '\u0FBC');
+        return inBaseRange || inSubjoinedRange;
     }
}