diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index d791e19..35950ac 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -748,7 +748,7 @@ public class LegalTshegBar if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { return internalThrowThing(throwIfIllegal, "Illegal suffix -- not one of the ten legal suffixes: " - + UnicodeUtils.unicodeCPToString(suffix.charAt(0))); + + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0))); } } } @@ -837,7 +837,7 @@ public class LegalTshegBar boolean disambiguatorNeeded = false; char prefix = getPrefix(); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix)); if (!hasHeadLetter()) { if (EWC_ya == rootLetter) { if (isConsonantThatTakesYaBtags(prefix)) @@ -857,55 +857,55 @@ public class LegalTshegBar sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); } if (hasHeadLetter()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(rootLetter)); if (hasSubjoinedLetter()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter())); if (hasWaZurSubjoinedToRootLetter()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EWSUB_wa_zur)); // a-chung is treated, in THDL Extended Wylie, like a vowel. // I.e., you don't have 'pAa', you have 'pA'. 
if (hasAChungOnRootLetter()) { if (hasExplicitVowel()) { if (EWV_i == getVowel()) { - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73')); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F73')); } else if (EWV_u == getVowel()) { - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75')); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F75')); } else if (EWV_e == getVowel() || EWV_o == getVowel()) { // The exception to the rule for a-chung and vowels... // DLC FIXME: are these allowed in legal Tibetan? // EWTS would have special cases for them if so, // I'd wager... - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())); } else { ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); } } else { - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung)); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); } } else { if (hasExplicitVowel()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())); else sb.append("a"); } if (hasSuffix()) { String suf = getSuffix(); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0))); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0))); if (suf.length() > 1) { // DLC assert, don't verify, that the length is two. // This could change if I learn of more suffix // particles. 
ThdlDebug.verify(2 == suf.length()); - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1))); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1))); } } if (hasPostsuffix()) - sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())); + sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())); return sb; } @@ -929,18 +929,18 @@ public class LegalTshegBar + "transliterationType=\"THDL Extended Wylie 0.5\" " + (hasPrefix() ? ("prefix=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ") + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPrefix()) + "\" ") : "") + (hasHeadLetter() ? ("headLetter=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()) + "\" ") : "") + ("rootLetter=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ") + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getRootLetter()) + "\" ") + (hasSubjoinedLetter() ? ("subjoinedLetter=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter()) + "\" ") : "") + (hasWaZurSubjoinedToRootLetter() @@ -953,17 +953,17 @@ public class LegalTshegBar // DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ? + ("vowel=\"" + (hasExplicitVowel() - ? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()) + ? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()) : "a") + "\" ") + (hasSuffix() ? ("suffix=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(getSuffix()) + "\" ") : "") + (hasPostsuffix() ? 
("postsuffix=\"" - + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()) + + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()) + "\" ") : "") + "/>"); diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java similarity index 79% rename from source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java rename to source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java index 955ca59..7cc5b13 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java @@ -21,7 +21,7 @@ package org.thdl.tib.text.tshegbar; import org.thdl.tib.text.TibetanMachineWeb; /** This noninstantiable class allows for converting from Unicode - * codepoints to Extended Wylie. It cannot be used for long + * codepoints to THDL Extended Wylie. It cannot be used for long * stretches of text, though, as it is unaware of context, which is * essential to understanding a non-trivial string of Tibetan * Unicode. @@ -29,21 +29,22 @@ import org.thdl.tib.text.TibetanMachineWeb; *

See the document by Nathaniel Garson and David Germano entitled * Extended Wylie Transliteration Scheme. Note that there are * a couple of issues with the November 18, 2001 revision of that - * document; these issues are in the Bugs tracker at our SourceForge site.

+ * document; these issues are in the Bugs tracker at our SourceForge + * site.

* * @see SourceForge site * * @author David Chandler */ -public class UnicodeCharToExtendedWylie { +public class UnicodeCodepointToThdlWylie { - /** Returns the extended Wylie for the very simple sequence x. - * Returns null iff some (Unicode) char in s has no extended - * Wylie representation. This is unaware of context, so use it - * sparingly. */ - public static StringBuffer getExtendedWylieForUnicodeString(String x) { + /** Returns the THDL extended Wylie for the very simple sequence + * x. Returns null iff some (Unicode) char in s has no THDL + * extended Wylie representation. This is unaware of context, so + * use it sparingly. */ + public static StringBuffer getThdlWylieForUnicodeString(String x) { StringBuffer sb = new StringBuffer(); for (int i = 0; i < x.length(); i++) { - String ew = getExtendedWylieForUnicodeChar(x.charAt(i)); + String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i)); if (null == ew) return null; sb.append(ew); @@ -51,12 +52,14 @@ public class UnicodeCharToExtendedWylie { return sb; } - /** Returns the extended Wylie for x, or null if there is none. - * Understand that multiple Unicode code points (chars) map to - * the same Extended Wylie representation. Understand also that - * the scrap of Extended Wylie returned is only valid in certain - * contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */ - public static String getExtendedWylieForUnicodeChar(char x) { + /** Returns the THDL extended Wylie for x, or null if there is + * none. Understand that multiple Unicode code points (chars) + * map to the same THDL Extended Wylie representation. + * Understand also that the scrap of THDL Extended Wylie returned + * is only valid in certain contexts. For example, not all + * consonants take ra-btags. DLC NOW what about + * canonicalization? 
*/ + public static String getThdlWylieForUnicodeCodepoint(char x) { switch (x) { case '\u0F00': return "oM"; @@ -130,9 +133,9 @@ public class UnicodeCharToExtendedWylie { case '\u0F40': return "k"; case '\u0F41': return "kh"; case '\u0F42': return "g"; - case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42') + case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F44': return "ng"; case '\u0F45': return "c"; case '\u0F46': return "ch"; @@ -142,31 +145,31 @@ public class UnicodeCharToExtendedWylie { case '\u0F4A': return "T"; case '\u0F4B': return "Th"; case '\u0F4C': return "D"; - case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C') + case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F4E': return "N"; case '\u0F4F': return "t"; case '\u0F50': return "th"; case '\u0F51': return "d"; - case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51') + case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F53': return "n"; case '\u0F54': return "p"; case '\u0F55': return "ph"; case '\u0F56': return "b"; - case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56') + case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
- + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F58': return "m"; case '\u0F59': return "ts"; case '\u0F5A': return "tsh"; case '\u0F5B': return "dz"; - case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B') + case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F5D': return "w"; case '\u0F5E': return "zh"; case '\u0F5F': return "z"; @@ -180,9 +183,9 @@ public class UnicodeCharToExtendedWylie { case '\u0F66': return "s"; case '\u0F67': return "h"; case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck... - case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40') + case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB5')); + + getThdlWylieForUnicodeCodepoint('\u0FB5')); case '\u0F6A': return "r"; case '\u0F6B': return null; case '\u0F6C': return null; @@ -227,9 +230,9 @@ public class UnicodeCharToExtendedWylie { case '\u0F90': return "k"; case '\u0F91': return "kh"; case '\u0F92': return "g"; - case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92') + case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
- + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F94': return "ng"; case '\u0F95': return "c"; case '\u0F96': return "ch"; @@ -239,31 +242,31 @@ public class UnicodeCharToExtendedWylie { case '\u0F9A': return "T"; case '\u0F9B': return "Th"; case '\u0F9C': return "D"; - case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92') + case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F9C') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0F9E': return "N"; case '\u0F9F': return "t"; case '\u0FA0': return "th"; case '\u0FA1': return "d"; - case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1') + case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0FA3': return "n"; case '\u0FA4': return "p"; case '\u0FA5': return "ph"; case '\u0FA6': return "b"; - case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6') + case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0FA8': return "m"; case '\u0FA9': return "ts"; case '\u0FAA': return "tsh"; case '\u0FAB': return "dz"; - case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB') + case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
- + getExtendedWylieForUnicodeChar('\u0FB7')); + + getThdlWylieForUnicodeCodepoint('\u0FB7')); case '\u0FAD': return "w"; case '\u0FAE': return "zh"; case '\u0FAF': return "z"; @@ -277,9 +280,9 @@ public class UnicodeCharToExtendedWylie { case '\u0FB6': return "s"; case '\u0FB7': return "h"; case '\u0FB8': return "a"; // DLC see note on \u0F68 ... - case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90') + case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? - + getExtendedWylieForUnicodeChar('\u0FB5')); + + getThdlWylieForUnicodeCodepoint('\u0FB5')); case '\u0FBA': return "w"; case '\u0FBB': return "y"; case '\u0FBC': return "r"; @@ -309,7 +312,7 @@ public class UnicodeCharToExtendedWylie { // This codepoint is in the range 0FD0-0FFF or is not in // the Tibetan range at all. In either case, there is no - // corresponding Extended Wylie. + // corresponding THDL Extended Wylie. return null; } } // end switch diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java b/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java new file mode 100644 index 0000000..62fe9c4 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/UnicodeGraphemeCluster.java @@ -0,0 +1,377 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. 
+ +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +import java.util.Vector; + +import org.thdl.util.ThdlDebug; + +/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such + * as whitespace or control characters or a Latin "character"), or a + * vertically stacked set of Tibetan consonants, vowels, marks, and + * signs. The Unicode string + * "\u0F40\u0F0B\u0F41\u0F0B" specifies + * four UnicodeGraphemeClusters (the name of the Tibetan alphabet, + * you might notice), while the Unicode string + * "\u0F66\u0FA5\u0F39\u0F90\u0FB5\u0F71\u0F80\u0F7F" + * is one Tibetan stack, sa over fa over ka over Sha with an a-chung, + * a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark + * underneath all of that. I assume the latter grapheme cluster is + * nonsense, but it is considered one grapheme cluster because all + * but the first char are combining chars. See Unicode Technical + * Report 29. + * + *

As the above example demonstrates, not all + * UnicodeGraphemeClusters are syntactically legal in the Tibetan + * language. Not all of them are syntactically legal in Sanskrit + * transcribed in the Tibetan alphabet, either.

+ * + *

The Unicode 3.2 standard (see especially Technical Report 29) + * refers to "grapheme clusters." A UnicodeGraphemeCluster is + * precisely a grapheme cluster as described by that standard. We + * interpret the standard as saying that U+0F3E and + * U+0F3F are each grapheme clusters unto themselves, + * even though they are combining codepoints.

+ * + * @author David Chandler */ +public class UnicodeGraphemeCluster + implements UnicodeReadyThunk, UnicodeConstants +{ + /** @see #getCPHeight(char) */ + private static final int MIN_HEIGHT = -6; + /** @see #getCPHeight(char) */ + private static final int MAX_HEIGHT = 3; + + /** The Unicode codepoints that compose this grapheme cluster. + This is legal, i.e. if there is a Tibetan vowel, it is the + last codepoint. It is in Normalization Form THDL (NFTHDL). */ + private String unicodeString; + + /** Do not use this constructor. */ + private UnicodeGraphemeCluster() { super(); } + + /** Creates a new UnicodeGraphemeCluster given a legal sequence of + Unicode codepoints corresponding to a single grapheme + cluster. + @exception IllegalArgumentException if unicodeString is not a + syntactically correct Unicode 3.2 sequence (if it begins with + a combining codepoint or has a Tibetan vowel before another + combining character, for example, or if it is more than one + grapheme cluster). Note that syntactical correctness for + non-Tibetan codepoints is not likely to be known by this + routine. */ + public UnicodeGraphemeCluster(String unicodeString) + throws IllegalArgumentException + { + // check legality: + // DLC NOW FIXME: no legality check is implemented here yet + + // convert to NFTHDL: + this.unicodeString + = UnicodeUtils.toMostlyDecomposedUnicode(unicodeString, NORM_NFTHDL); + } + + /** Returns a string of codepoints in NFTHDL form. */ + public String getUnicodeRepresentation() { + return unicodeString; + } + + /** Returns true. */ + public boolean hasUnicodeRepresentation() { + return true; + } + + /** Returns true iff this stack could occur in syntactically + * correct, run-of-the-mill Tibetan (as opposed to Tibetanized + * Sanskrit, Chinese, et cetera). sga is a legal Tibetan stack, + * but g+g is not, for example. */ + public boolean isLegalTibetan() { + // DLC FIXME: for those odd head marks etc., return true even + // though hasUnicodeRepresentation() will return false. 
+ + // Note that ra-btags and wa-zur may both be present in legal + // Tibetan. + + throw new Error("DLC FIXME: not yet implemented."); + } + + /** Returns an XML element that contains the + * THDL Extended Wylie transliteration for this cluster. */ + public String toConciseXML() { + throw new Error("DLC NOW unimplemented"); + } + + /** Returns an XML element that contains this + * cluster broken down into its constituent decomposed + * codepoints. */ + public String toVerboseXML() { + throw new Error("DLC NOW unimplemented"); + } + + /** Returns the THDL Extended Wylie transliteration of this + grapheme cluster, or null if there is none (which happens for + a few Tibetan codepoints, if you'll recall). If needsVowel is + true, then an "a" will be appended when there is no EW_achung + or explicit simple vowel. If there is an explicit vowel or + EW_achung, it will always be present. Note that needsVowel is + provided because btags is the preferred THDL Extended Wylie + for the four contiguous grapheme clusters + "\u0F56\u0F4F\u0F42\u0F66", and + needsVowel must be set to false for all but the grapheme + cluster corresponding to \u0F4F if you wish + to get the preferred THDL Extended Wylie. */ + public String getThdlWylie(boolean needsVowel) { + throw new Error("DLC NOW unimplemented"); + } + + /** Given some (possibly unnormalized) Unicode 3.2 string unicode, + appends grapheme clusters to the vector of GraphemeClusters + grcls if grcls is nonnull. Performs good error checking if + validate is true. If an error is found, grcls may have been + modified if nonnull. Setting grcls to null and setting + validate to true is sometimes useful for testing the validity + of a Unicode string. 
+ @return the number of grapheme clusters that were or would + have been added to grcls + @exception BadTibetanUnicodeException if the unicode is not + syntactically legal + @exception IllegalArgumentException if correctErrors and + validate are both true + @exception NullPointerException if unicode is null */ + public static int breakUnicodeIntoGraphemeClusters(Vector grcls, + String unicode, + boolean validate, + boolean correctErrors) + throws // DLC SOON: BadTibetanUnicodeException, + IllegalArgumentException, NullPointerException + { + if (validate && correctErrors) { + throw new IllegalArgumentException("validate and correctErrors cannot both be true."); + } + throw new Error("DLC NOW unimplemented"); + /* + if (start == i) { + // special tests at the beginning of input. + if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) { + throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster."); + } + } + if (height == last_height) { + if ('\u0F39' == cp) { + if (!UnicodeUtils.isTibetanConsonant(last_cp)) { + throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant"); + } + } else { + // DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!! + } + } + + // Test to see if this last character has ended this + // grapheme cluster: + if (UnicodeUtils.isTibetanTerminatingVowel(cp)) { + // DLC: cp ENDS A GRAPHEME CLUSTER!!! + } + */ + } + + /** Returns this cluster's codepoints in top-to-bottom order; a cluster of fewer than two codepoints is returned as is. */ + public String getTopToBottomCodepoints() { + StringBuffer sorted = getTopToBottomCodepoints(new StringBuffer(unicodeString), 0, unicodeString.length()); + return (null == sorted) ? unicodeString : sorted.toString(); // the helper returns null when there is nothing to sort + } + + /** Returns a new StringBuffer consisting of the codepoints in + NFTHDLString at indices [start, end) sorted in top-to-bottom + order, or null on some occasions when NFTHDLString is already + sorted. A top-to-bottom ordering is a useful form for + applications wishing to render the grapheme cluster. 
Note + that this method is only useful if NFTHDLString is part of or + an entire grapheme cluster. Does no error checking on + NFTHDLString. + @param NFTHDLString a buffer with characters at indices i, + where start <= i < end, being the Unicode codepoints for a + single grapheme cluster or part of a grapheme cluster + @param start NFTHDLString.charAt(start) is the first codepoint + dealt with + @param end NFTHDLString.charAt(end) is the first codepoint NOT + dealt with + @return null only if (but not necessarily if) NFTHDLString is + already sorted top-to-bottom, or the sorted form of + NFTHDLString */ + private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */ + int start, int end) + { + if (end <= start) /* 0-length string. */ + return null; + if (start + 1 == end) /* 1-length string. */ + return null; + // else we have a string of length >= 2. + + // We'll use the world's fastest sorting algorithm. Linear + // time, baby. Here are the ten or so mailboxes for our + // postman's sort: + StringBuffer chunksAtCommonHeights[] + = new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT]; + + for (int i = start; i < end; i++) { + char cp = NFTHDLString.charAt(i); + int height = getCPHeight(cp); + + // initialize mailbox if necessary. + if (null == chunksAtCommonHeights[height - MIN_HEIGHT]) { + chunksAtCommonHeights[height - MIN_HEIGHT] + = new StringBuffer(2); + } + + // put this cp into the correct mailbox. + chunksAtCommonHeights[height - MIN_HEIGHT].append(cp); + } + + // Now concatenate together the mailboxes: + StringBuffer sb = new StringBuffer(end - start); + for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--) { + if (null != chunksAtCommonHeights[h - MIN_HEIGHT]) { + sb.append(chunksAtCommonHeights[h - MIN_HEIGHT]); + } + } + return sb; + } + + + /** Returns the height for the Tibetan Unicode codepoint x. + This relative height is 0 for a base consonant, digit, + punctuation, mark, or sign. 
It is -1 for a subjoined + consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for + EWV_gigu, and so on according to the height these codepoints + appear relative to one another when on the same stack. If two + codepoints have equal height, they should not exist in the + same grapheme cluster unless one is U+0F39, which + is an integral part of a consonant when tacked on to, e.g., + EWC_PHA. + +

If x is not a Unicode 3.2 codepoint in the Tibetan range, + or if x is not in NFTHDL form, 0 is returned. The height code + of U+0F76 is not valid, and it is not an accident + that U+0F76 is not in NFTHDL form.

*/ + private static int getCPHeight(char x) { + // DLC make this an assertion: + ThdlDebug.verify(null == UnicodeUtils.toNormalizedForm(x, NORM_NFTHDL)); + + if (x >= '\u0F90' && x <= '\u0FAC' + || x >= '\u0FAE' && x <= '\u0FBC') { + // subjoined consonant. Note that wa-zur is an exception. + return -1; + } else if (x >= '\u0F00' && x <= '\u0F17' + || x >= '\u0F1A' && x <= '\u0F34' + || x >= '\u0F3A' && x <= '\u0F3D' + || x >= '\u0F40' && x <= '\u0F6A' // consonants + || x >= '\u0F88' && x <= '\u0F8B' + || x >= '\u0FBE' && x <= '\u0FCF' && x != '\u0FC6') { // U+0FC6 combines below; it is handled in the switch + // neutral height: + return 0; + } else { // Oddballs. + switch (x) { + // + // non-combining: + // + case '\u0F36': + case '\u0F38': + case '\u0F85': + return 0; + + + // + // combining, but left-to-right combining: + // + case '\u0F3E': + case '\u0F3F': + case '\u0F7F': + return 0; + + + // + // combining by coming below: + // + case '\u0FAD': + return -2; // wa-zur + case '\u0F71': + return -3; // a-chung + case '\u0F74': + case '\u0F84': + return -4; // DLC CHECKME + case '\u0F18': // combines with digits + case '\u0F19': // combines with digits + return -5; + case '\u0F35': + case '\u0F37': + case '\u0FC6': { + ThdlDebug.verify(-6 == MIN_HEIGHT); + return -6; // min height + } + + + // + // combining by coming above: + // + case '\u0F72': + case '\u0F7A': + case '\u0F7B': + case '\u0F7C': + case '\u0F7D': + case '\u0F80': + return 1; + case '\u0F7E': + case '\u0F82': + case '\u0F83': + return 2; // these three come above 0F7C, right? (DLC CHECKME) + case '\u0F86': + case '\u0F87': { + ThdlDebug.verify(3 == MAX_HEIGHT); + return 3; // max height + } + + + // + // exceptional case: + // + // some would say +1, but then "\u0F40\u0FA5\u0F39" will + // not have a5 combine with 39. Unicode could well have + // put in a single codepoint for "\u0FA5\u0F39" IMO. + case '\u0F39': return 0; + + + default: { + if (x >= '\u0F00' && x <= '\u0FFF') { + // This wasn't explicitly handled? Hmmm... 
This + // won't ever happen for NFTHDL-formed input. + ThdlDebug.noteIffyCode(); + } + + // Non-Tibetan or unhandled Tibetan codepoint: neutral height. + return 0; + } + } // end switch + } + } + /** DLC SOON: not yet implemented; always throws an Error. */ + public boolean isTibetan() { + throw new Error("DLC FIXME: not yet implemented."); + } +} + diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java index 150d57f..3df96fb 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -97,10 +97,12 @@ public class UnicodeUtils implements UnicodeConstants { Unicode codepoints, into either Normalization Form KD (NFKD), D (NFD), or THDL (NFTHDL), depending on the value of normForm. NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed - for {@link org.thdl.tib.text.tshegbar#GraphemeCluster} because - NFKD normalizes U+0F0C. NFTHDL uses a maximum of - codepoints, and it never uses codepoints whose use has been - {@link #isDiscouraged(char) discouraged}. + for {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} + because NFKD normalizes U+0F0C and neither NFD + nor NFKD breaks down U+0F00 into its constituent + codepoints. NFTHDL uses a maximum of codepoints, and it never + uses codepoints whose use has been {@link #isDiscouraged(char) + discouraged}.

The Tibetan passages of the returned string are in the chosen normalized form, but codepoints outside of the {@link @@ -170,6 +172,8 @@ public class UnicodeUtils implements UnicodeConstants { // Where not specified, the NFKD and NFTHDL forms are // identical to the NFD form. switch (tibetanUnicodeCP) { + case '\u0F00': return ((normalizationForm == NORM_NFTHDL) + ? "\u0F68\u0F7C\u0F7E" : null); case '\u0F0C': return ((normalizationForm == NORM_NFKD) ? "\u0F0B" : null); case '\u0F43': return "\u0F42\u0FB7"; @@ -282,9 +286,37 @@ public class UnicodeUtils implements UnicodeConstants { } /** Returns a human-readable, ASCII form of the Unicode codepoint - ch. */ - public static String unicodeCPToString(char ch) { - return "U+" + Integer.toHexString((int)ch); + cp. */ + public static String unicodeCodepointToString(char cp) { + if (cp < '\u0010') + return "\\u000" + Integer.toHexString((int)cp); + else if (cp < '\u0100') + return "\\u00" + Integer.toHexString((int)cp); + else if (cp < '\u1000') + return "\\u0" + Integer.toHexString((int)cp); + else + return "\\u" + Integer.toHexString((int)cp); + } + + public static String unicodeStringToString(String s) { + StringBuffer sb = new StringBuffer(s.length() * 6); + for (int i = 0; i < s.length(); i++) { + sb.append(unicodeCodepointToString(s.charAt(i))); + } + return sb.toString(); + } + + /** Returns true iff cp is a Unicode 3.2 Tibetan consonant, + subjoined or not. This counts precomposed consonant stacks + like U+0FA7 as consonants. If you don't wish to + treat such as consonants, then put the input into NORM_NFD, + NORM_NFKD, or NORM_NFTHDL first. If it changes under such a + normalization, it is a precomposed consonant. */ + public static boolean isTibetanConsonant(char cp) { + return (((cp >= '\u0F40' && cp <= '\u0F6A') + || (cp >= '\u0F90' && cp <= '\u0FBC')) + && '\u0F48' != cp + && '\u0F98' != cp); } }