Renamed UnicodeCharToExtendedWylie.java to UnicodeCodepointToThdlWylie.java.

Added a new class, UnicodeGraphemeCluster, that can tell you
the components of a grapheme cluster from top to bottom.  It does not
yet have good error checking; it is not yet finished.

Next is to parse clean Unicode into GraphemeClusters.  After that come
scanning dirty Unicode into best-guess GraphemeClusters and scanning
dirty Unicode to get nice error messages.
Author: dchandler
Date:   2002-12-17 13:51:18 +00:00
Commit: 7ea185fa01 (parent 8e8a23c6a6)

4 changed files with 481 additions and 69 deletions
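
For orientation, here is a minimal usage sketch of the renamed converter.  The
class name, method names, and package come from the diff below; the wrapper
class, its name, and the expected output in the comments are illustrative
assumptions, not part of this commit.

    import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;

    /** Hypothetical caller, not part of this commit. */
    public class ThdlWylieSketch {
        public static void main(String[] args) {
            // One codepoint: U+0F40 (TIBETAN LETTER KA) maps to "k" in the switch below.
            String ka = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F40');
            System.out.println(ka); // k

            // A short string; the result is null iff some codepoint has no
            // THDL Extended Wylie representation.
            StringBuffer w = UnicodeCodepointToThdlWylie
                .getThdlWylieForUnicodeString("\u0F40\u0F41\u0F42");
            System.out.println(w); // kkhg
        }
    }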

LegalTshegBar.java

@@ -748,7 +748,7 @@ public class LegalTshegBar
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
return internalThrowThing(throwIfIllegal,
"Illegal suffix -- not one of the ten legal suffixes: "
- + UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
+ + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
}
}
}
@@ -837,7 +837,7 @@ public class LegalTshegBar
boolean disambiguatorNeeded = false;
char prefix = getPrefix();
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(prefix));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
if (!hasHeadLetter()) {
if (EWC_ya == rootLetter) {
if (isConsonantThatTakesYaBtags(prefix))
@@ -857,55 +857,55 @@ public class LegalTshegBar
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
}
if (hasHeadLetter())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter()));
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(rootLetter));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(rootLetter));
if (hasSubjoinedLetter())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter()));
if (hasWaZurSubjoinedToRootLetter())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EWSUB_wa_zur));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EWSUB_wa_zur));
// a-chung is treated, in THDL Extended Wylie, like a vowel.
// I.e., you don't have 'pAa', you have 'pA'.
if (hasAChungOnRootLetter()) {
if (hasExplicitVowel()) {
if (EWV_i == getVowel()) {
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F73'));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F73'));
} else if (EWV_u == getVowel()) {
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar('\u0F75'));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint('\u0F75'));
} else if (EWV_e == getVowel() || EWV_o == getVowel()) {
// The exception to the rule for a-chung and vowels...
// DLC FIXME: are these allowed in legal Tibetan?
// EWTS would have special cases for them if so,
// I'd wager...
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
} else {
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
}
} else {
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(EW_achung));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
}
} else {
if (hasExplicitVowel())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
else
sb.append("a");
}
if (hasSuffix()) {
String suf = getSuffix();
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(0)));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
if (suf.length() > 1) {
// DLC assert, don't verify, that the length is two.
// This could change if I learn of more suffix
// particles.
ThdlDebug.verify(2 == suf.length());
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(suf.charAt(1)));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
}
}
if (hasPostsuffix())
- sb.append(UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix()));
+ sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
return sb;
}
@@ -929,18 +929,18 @@ public class LegalTshegBar
+ "transliterationType=\"THDL Extended Wylie 0.5\" "
+ (hasPrefix()
? ("prefix=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPrefix()) + "\" ")
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPrefix()) + "\" ")
: "")
+ (hasHeadLetter()
? ("headLetter=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getHeadLetter())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())
+ "\" ")
: "")
+ ("rootLetter=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getRootLetter()) + "\" ")
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getRootLetter()) + "\" ")
+ (hasSubjoinedLetter()
? ("subjoinedLetter=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getSubjoinedLetter())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getSubjoinedLetter())
+ "\" ")
: "")
+ (hasWaZurSubjoinedToRootLetter()
@@ -953,17 +953,17 @@ public class LegalTshegBar
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
+ ("vowel=\""
+ (hasExplicitVowel()
- ? UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getVowel())
+ ? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
: "a")
+ "\" ")
+ (hasSuffix()
? ("suffix=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeString(getSuffix())
+ "\" ")
: "")
+ (hasPostsuffix()
? ("postsuffix=\""
- + UnicodeCharToThdlWylie.getThdlWylieForUnicodeChar(getPostsuffix())
+ + UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())
+ "\" ")
: "")
+ "/>");

UnicodeCharToExtendedWylie.java -> UnicodeCodepointToThdlWylie.java (renamed)

@@ -21,7 +21,7 @@ package org.thdl.tib.text.tshegbar;
import org.thdl.tib.text.TibetanMachineWeb;
/** This noninstantiable class allows for converting from Unicode
- * codepoints to Extended Wylie. It cannot be used for long
+ * codepoints to THDL Extended Wylie. It cannot be used for long
* stretches of text, though, as it is unaware of context, which is
* essential to understanding a non-trivial string of Tibetan
* Unicode.
@@ -29,21 +29,22 @@ import org.thdl.tib.text.TibetanMachineWeb;
* <p>See the document by Nathaniel Garson and David Germano entitled
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are
* a couple of issues with the November 18, 2001 revision of that
- * document; these issues are in the Bugs tracker at our SourceForge site.</p>
+ * document; these issues are in the Bugs tracker at our SourceForge
+ * site.</p>
*
* @see <a href="http://sourceforge.net/projects/thdltools">SourceForge site</a>
*
* @author David Chandler */
- public class UnicodeCharToExtendedWylie {
+ public class UnicodeCodepointToThdlWylie {
- /** Returns the extended Wylie for the very simple sequence x.
- * Returns null iff some (Unicode) char in s has no extended
- * Wylie representation. This is unaware of context, so use it
- * sparingly. */
- public static StringBuffer getExtendedWylieForUnicodeString(String x) {
+ /** Returns the THDL extended Wylie for the very simple sequence
+ * x. Returns null iff some (Unicode) char in s has no THDL
+ * extended Wylie representation. This is unaware of context, so
+ * use it sparingly. */
+ public static StringBuffer getThdlWylieForUnicodeString(String x) {
StringBuffer sb = new StringBuffer();
for (int i = 0; i < x.length(); i++) {
- String ew = getExtendedWylieForUnicodeChar(x.charAt(i));
+ String ew = getThdlWylieForUnicodeCodepoint(x.charAt(i));
if (null == ew)
return null;
sb.append(ew);
@@ -51,12 +52,14 @@ public class UnicodeCharToExtendedWylie {
return sb;
}
- /** Returns the extended Wylie for x, or null if there is none.
- * Understand that multiple Unicode code points (chars) map to
- * the same Extended Wylie representation. Understand also that
- * the scrap of Extended Wylie returned is only valid in certain
- * contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */
- public static String getExtendedWylieForUnicodeChar(char x) {
+ /** Returns the THDL extended Wylie for x, or null if there is
+ * none. Understand that multiple Unicode code points (chars)
+ * map to the same THDL Extended Wylie representation.
+ * Understand also that the scrap of THDL Extended Wylie returned
+ * is only valid in certain contexts. For example, not all
+ * consonants take ra-btags. DLC NOW what about
+ * canonicalization? */
+ public static String getThdlWylieForUnicodeCodepoint(char x) {
switch (x) {
case '\u0F00': return "oM";
@@ -130,9 +133,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0F40': return "k";
case '\u0F41': return "kh";
case '\u0F42': return "g";
- case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42')
+ case '\u0F43': return (getThdlWylieForUnicodeCodepoint('\u0F42')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F44': return "ng";
case '\u0F45': return "c";
case '\u0F46': return "ch";
@@ -142,31 +145,31 @@ public class UnicodeCharToExtendedWylie {
case '\u0F4A': return "T";
case '\u0F4B': return "Th";
case '\u0F4C': return "D";
- case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C')
+ case '\u0F4D': return (getThdlWylieForUnicodeCodepoint('\u0F4C')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F4E': return "N";
case '\u0F4F': return "t";
case '\u0F50': return "th";
case '\u0F51': return "d";
- case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51')
+ case '\u0F52': return (getThdlWylieForUnicodeCodepoint('\u0F51')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F53': return "n";
case '\u0F54': return "p";
case '\u0F55': return "ph";
case '\u0F56': return "b";
- case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56')
+ case '\u0F57': return (getThdlWylieForUnicodeCodepoint('\u0F56')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F58': return "m";
case '\u0F59': return "ts";
case '\u0F5A': return "tsh";
case '\u0F5B': return "dz";
- case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B')
+ case '\u0F5C': return (getThdlWylieForUnicodeCodepoint('\u0F5B')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F5D': return "w";
case '\u0F5E': return "zh";
case '\u0F5F': return "z";
@@ -180,9 +183,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0F66': return "s";
case '\u0F67': return "h";
case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck...
- case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40')
+ case '\u0F69': return (getThdlWylieForUnicodeCodepoint('\u0F40')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB5'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB5'));
case '\u0F6A': return "r";
case '\u0F6B': return null;
case '\u0F6C': return null;
@@ -227,9 +230,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0F90': return "k";
case '\u0F91': return "kh";
case '\u0F92': return "g";
- case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92')
+ case '\u0F93': return (getThdlWylieForUnicodeCodepoint('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F94': return "ng";
case '\u0F95': return "c";
case '\u0F96': return "ch";
@@ -239,31 +242,31 @@ public class UnicodeCharToExtendedWylie {
case '\u0F9A': return "T";
case '\u0F9B': return "Th";
case '\u0F9C': return "D";
- case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F92')
+ case '\u0F9D': return (getThdlWylieForUnicodeCodepoint('\u0F92')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0F9E': return "N";
case '\u0F9F': return "t";
case '\u0FA0': return "th";
case '\u0FA1': return "d";
- case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1')
+ case '\u0FA2': return (getThdlWylieForUnicodeCodepoint('\u0FA1')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0FA3': return "n";
case '\u0FA4': return "p";
case '\u0FA5': return "ph";
case '\u0FA6': return "b";
- case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6')
+ case '\u0FA7': return (getThdlWylieForUnicodeCodepoint('\u0FA6')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0FA8': return "m";
case '\u0FA9': return "ts";
case '\u0FAA': return "tsh";
case '\u0FAB': return "dz";
- case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB')
+ case '\u0FAC': return (getThdlWylieForUnicodeCodepoint('\u0FAB')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB7'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB7'));
case '\u0FAD': return "w";
case '\u0FAE': return "zh";
case '\u0FAF': return "z";
@@ -277,9 +280,9 @@ public class UnicodeCharToExtendedWylie {
case '\u0FB6': return "s";
case '\u0FB7': return "h";
case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
- case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90')
+ case '\u0FB9': return (getThdlWylieForUnicodeCodepoint('\u0F90')
+ TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
- + getExtendedWylieForUnicodeChar('\u0FB5'));
+ + getThdlWylieForUnicodeCodepoint('\u0FB5'));
case '\u0FBA': return "w";
case '\u0FBB': return "y";
case '\u0FBC': return "r";
@@ -309,7 +312,7 @@ public class UnicodeCharToExtendedWylie {
// This codepoint is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no
- // corresponding Extended Wylie.
+ // corresponding THDL Extended Wylie.
return null;
}
} // end switch
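
The converter's null contract above mirrors the error reporting in
LegalTshegBar, which prints the offending codepoint.  A hypothetical caller
might combine it with the unicodeCodepointToString helper added to
UnicodeUtils in the last file of this commit.  The class below is a sketch;
the package of UnicodeUtils is an assumption based on the unqualified
references to it elsewhere in the diff.

    import org.thdl.tib.text.tshegbar.UnicodeCodepointToThdlWylie;
    import org.thdl.tib.text.tshegbar.UnicodeUtils; // package assumed

    /** Hypothetical helper, not part of this commit. */
    public class LossyWylieConverter {
        /** Converts s codepoint by codepoint, flagging anything that has no
            THDL Extended Wylie representation instead of returning null. */
        public static String convert(String s) {
            StringBuffer sb = new StringBuffer();
            for (int i = 0; i < s.length(); i++) {
                char cp = s.charAt(i);
                String ew = UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(cp);
                if (null == ew) {
                    // e.g. U+0F6B returns null in the switch above.
                    sb.append("[no Wylie for " + UnicodeUtils.unicodeCodepointToString(cp) + "]");
                } else {
                    sb.append(ew);
                }
            }
            return sb.toString();
        }
    }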

UnicodeGraphemeCluster.java (new file)

@@ -0,0 +1,377 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
import java.util.Vector;
import org.thdl.util.ThdlDebug;
/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such
* as whitespace or control characters or a Latin "character"), or a
* vertically stacked set of Tibetan consonants, vowels, marks, and
* signs. The Unicode string
* <code>"&#92;u0F40&#92;u0F0B&#92;u0F41&#92;u0F0B"</code> specifies
* four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
* you might notice), while the Unicode string
* <code>"&#92;u0F66&#92;u0FA5&#92;u0F39&#92;u0F90&#92;u0FB5&#92;u0F71&#92;u0F80&#92;u0F7F"</code>
* is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
* a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
* underneath all of that. I assume the latter grapheme cluster is
* nonsense, but it is considered one grapheme cluster because all
* but the first char are combining chars. See Unicode Technical
* Report 29.
*
* <p>As the above example demonstrates, not all
* UnicodeGraphemeClusters are syntactically legal in the Tibetan
* language. Not all of them are syntactically legal in Sanskrit
* transcribed in the Tibetan alphabet, either.</p>
*
* <p>The Unicode 3.2 standard (see especially Technical Report 29)
* refers to "grapheme clusters." A UnicodeGraphemeCluster is
* precisely a grapheme cluster as described by that standard. We
* interpret the standard as saying that <code>U+0F3E</code> and
* <code>U+0F3F</code> are each grapheme clusters unto themselves,
* even though they are combining codepoints.</p>
*
* @author David Chandler */
public class UnicodeGraphemeCluster
implements UnicodeReadyThunk, UnicodeConstants
{
/** @see #getCPHeight(char) */
private static final int MIN_HEIGHT = -6;
/** @see #getCPHeight(char) */
private static final int MAX_HEIGHT = 3;
/** The Unicode codepoints that compose this grapheme cluster.
This is legal, i.e. if there is a Tibetan vowel, it is the
last codepoint. It is in Normalization Form THDL (NFTHDL). */
private String unicodeString;
/** Do not use this constructor. */
private UnicodeGraphemeCluster() { super(); }
/** Creates a new GraphemeCluster given a legal sequence of
Unicode codepoints corresponding to a single grapheme
cluster.
@exception IllegalArgumentException if unicodeString is not a
syntactically correct Unicode 3.2 sequence (if it begins with
a combining codepoint or has a Tibetan vowel before another
combining character, for example, or if it is more than one
grapheme cluster). Note that syntactical correctness for
non-Tibetan codepoints is not likely to be known by this
routine. */
public UnicodeGraphemeCluster(String unicodeString)
throws IllegalArgumentException
{
// check legality:
// DLC NOW FIXME
// convert to NFTHDL:
this.unicodeString
= UnicodeUtils.toMostlyDecomposedUnicode(unicodeString, NORM_NFTHDL);
}
/** Returns a string of codepoints in NFTHDL form. */
public String getUnicodeRepresentation() {
return unicodeString;
}
/** Returns true. */
public boolean hasUnicodeRepresentation() {
return true;
}
/** Returns true iff this stack could occur in syntactically
* correct, run-of-the-mill Tibetan (as opposed to Tibetanized
* Sanskrit, Chinese, et cetera). sga is a legal Tibetan stack,
* but g+g is not, for example. */
public boolean isLegalTibetan() {
// DLC FIXME: for those odd head marks etc., return true even
// though hasUnicodeRepresentation() will return false.
// Note that ra-btags and wa-zur can both be present in legal
// Tibetan.
throw new Error("DLC FIXME: not yet implemented.");
}
/** Returns a <unicodeGraphemeCluster> element that contains the
* THDL Extended Wylie transliteration for this cluster. */
public String toConciseXML() {
throw new Error("DLC NOW unimplemented");
}
/** Returns a <unicodeGraphemeCluster> element that contains this
* cluster broken down into its constituent decomposed
* codepoints. */
public String toVerboseXML() {
throw new Error("DLC NOW unimplemented");
}
/** Returns the THDL Extended Wylie transliteration of this
grapheme cluster, or null if there is none (which happens for
a few Tibetan codepoints, if you'll recall). If needsVowel is
true, then an "a" will be appended when there is no EW_achung
or explicit simple vowel. If there is an explicit vowel or
EW_achung, it will always be present. Note that needsVowel is
provided because btags is the preferred THDL Extended Wylie
for the four contiguous grapheme clusters
<code>"&#92;u0F56&#92;u0F4F&#92;u0F42&#92;u0F66"</code>, and
needsVowel must be set to false for all but the grapheme
cluster corresponding to <code>&#92;u0F4F</code> if you wish
to get the preferred THDL Extended Wylie. */
public String getThdlWylie(boolean needsVowel) {
throw new Error("DLC NOW unimplemented");
}
/** Given some (possibly unnormalized) Unicode 3.2 string unicode,
appends grapheme clusters to the vector of GraphemeClusters
grcls if grcls is nonnull. Performs good error checking if
validate is true. If an error is found, grcls may have been
modified if nonnull. Setting grcls to null and setting
validate to true is sometimes useful for testing the validity
of a Unicode string.
@return the number of grapheme clusters that were or would
have been added to grcls
@exception BadTibetanUnicodeException if the unicode is not
syntactically legal
@exception IllegalArgumentException if correctErrors and
validate are both true
@exception NullPointerException if unicode is null */
public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
String unicode,
boolean validate,
boolean correctErrors)
throws // DLC SOON: BadTibetanUnicodeException,
IllegalArgumentException, NullPointerException
{
if (validate && correctErrors) {
throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
}
throw new Error("DLC NOW unimplemented");
/*
if (start == i) {
// special tests at the beginning of input.
if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
}
}
if (height == last_height) {
if ('\u0F39' == cp) {
if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
}
} else {
// DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
}
}
// Test to see if this last character has ended this
// grapheme cluster:
if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
// DLC: cp ENDS A GRAPHEME CLUSTER!!!
}
*/
}
/** FIXMEDOC */
public String getTopToBottomCodepoints() {
return getTopToBottomCodepoints(new StringBuffer(unicodeString),
0, unicodeString.length()).toString();
}
/** Returns a new StringBuffer consisting of the codepoints in
NFTHDLString at indices [start, end) sorted in top-to-bottom
order, or null on some occasions when NFTHDLString is already
sorted. A top-to-bottom ordering is a useful form for
applications wishing to render the grapheme cluster. Note
that this method is only useful if NFTHDLString is part of or
an entire grapheme cluster. Does no error checking on
NFTHDLString.
@param NFTHDLString a buffer with characters at indices i,
where start <= i < end, being the Unicode codepoints for a
single grapheme cluster or part of a grapheme cluster
@param start NFTHDLString.charAt(start) is the first codepoint
dealt with
@param end NFTHDLString.charAt(end) is the first codepoint NOT
dealt with
@return null only if (but not necessarily if) NFTHDLString is
already sorted top-to-bottom, or the sorted form of
NFTHDLString */
private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
int start, int end)
{
if (end <= start) /* 0-length string. */
return null;
if (start + 1 == end) /* 1-length string. */
return null;
// else we have a string of length >= 2.
// We'll use the world's fastest sorting algorithm. Linear
// time, baby. Here are the ten or so mailboxes for our
// postman's sort:
StringBuffer chunksAtCommonHeights[]
= new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
for (int i = start; i < end; i++) {
char cp = NFTHDLString.charAt(i);
int height = getCPHeight(cp);
// initialize mailbox if necessary.
if (null == chunksAtCommonHeights[height - MIN_HEIGHT]) {
chunksAtCommonHeights[height - MIN_HEIGHT]
= new StringBuffer(2);
}
// put this cp into the correct mailbox.
chunksAtCommonHeights[height - MIN_HEIGHT].append(cp);
}
// Now concatenate together the mailboxes:
StringBuffer sb = new StringBuffer(end - start);
for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--) {
if (null != chunksAtCommonHeights[h - MIN_HEIGHT]) {
sb.append(chunksAtCommonHeights[h - MIN_HEIGHT]);
}
}
return sb;
}
/** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
This relative height is 0 for a base consonant, digit,
punctuation, mark, or sign. It is -1 for a subjoined
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
EWV_gigu, and so on according to the height these codepoints
appear relative to one another when on the same stack. If two
codepoints have equal height, they should not exist in the
same grapheme cluster unless one is <code>U+0F39</code>, which
is an integral part of a consonant when tacked on to, e.g.,
EWC_PHA.
<p>If x is not a Unicode 3.2 codepoint in the Tibetan range,
or if x is not in NFTHDL form, 0 is returned. The height code
of <code>U+0F76</code> is not valid, and it is not an accident
that <code>U+0F76</code> is not in NFTHDL form.</p> */
private static int getCPHeight(char x) {
// DLC make this an assertion:
ThdlDebug.verify(null == UnicodeUtils.toNormalizedForm(x, NORM_NFTHDL));
if (x >= '\u0F90' && x <= '\u0FAC'
|| x >= '\u0FAE' && x <= '\u0FBC') {
// subjoined consonant. Note that wa-zur is an exception.
return -1;
} else if (x >= '\u0F00' && x <= '\u0F17'
|| x >= '\u0F1A' && x <= '\u0F34'
|| x >= '\u0F3A' && x <= '\u0F3D'
|| x >= '\u0F40' && x <= '\u0F6A' // consonants
|| x >= '\u0F88' && x <= '\u0F8B'
|| x >= '\u0FBE' && x <= '\u0FCF') {
// neutral height:
return 0;
} else { // Oddballs.
switch (x) {
//
// non-combining:
//
case '\u0F36':
case '\u0F38':
case '\u0F85':
return 0;
//
// combining, but left-to-right combining:
//
case '\u0F3E':
case '\u0F3F':
case '\u0F7F':
return 0;
//
// combining by coming below:
//
case '\u0FAD':
return -2; // wa-zur
case '\u0F71':
return -3; // a-chung
case '\u0F74':
case '\u0F84':
return -4; // DLC CHECKME
case '\u0F18': // combines with digits
case '\u0F19': // combines with digits
return -5;
case '\u0F35':
case '\u0F37':
case '\u0FC6': {
ThdlDebug.verify(-6 == MIN_HEIGHT);
return -6; // min height
}
//
// combining by coming above:
//
case '\u0F72':
case '\u0F7A':
case '\u0F7B':
case '\u0F7C':
case '\u0F7D':
case '\u0F80':
return 1;
case '\u0F7E':
case '\u0F82':
case '\u0F83':
return 2; // these three come above 0F7C, right? (DLC CHECKME)
case '\u0F86':
case '\u0F87': {
ThdlDebug.verify(3 == MAX_HEIGHT);
return 3; // max height
}
//
// exceptional case:
//
// some would say +1, but then "\u0F40\u0FA5\u0F39" will
// not have a5 combine with 39. Unicode could well have
// put in a single codepoint for "\u0FA5\u0F39" IMO.
case '\u0F39': return 0;
default: {
if (x >= '\u0F00' && x <= '\u0FFF') {
// This wasn't explicitly handled? Hmmm... This
// won't ever happen for NFTHDL-formed input.
ThdlDebug.noteIffyCode();
}
// This codepoint is not in the Tibetan range.
return 0;
}
} // end switch
}
}
/** DLC SOON */
public boolean isTibetan() {
throw new Error("DLC FIXME: not yet implemented.");
}
}
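
getTopToBottomCodepoints above is a linear-time bucket sort (the "postman's
sort" in the comments) keyed on the relative height that getCPHeight assigns
to each codepoint.  The stand-alone sketch below illustrates the same idea
with a deliberately simplified height table; the class name, the toy height
function, and the example input are assumptions for illustration only.

    /** Simplified illustration of the height-bucket sort above; not part of this commit. */
    public class HeightSortSketch {
        private static final int MIN_HEIGHT = -6;
        private static final int MAX_HEIGHT = 3;

        /** Toy subset of getCPHeight: base consonants 0, subjoined roughly -1,
            a-chung -3, gi-gu +1; everything else 0. */
        private static int height(char cp) {
            if (cp >= '\u0F40' && cp <= '\u0F6A') return 0;  // base consonants
            if (cp >= '\u0F90' && cp <= '\u0FBC') return -1; // subjoined consonants (simplified)
            if (cp == '\u0F71') return -3;                   // a-chung
            if (cp == '\u0F72') return 1;                    // gi-gu
            return 0;
        }

        /** Buckets cps by height, then emits the buckets from top to bottom. */
        public static String topToBottom(String cps) {
            StringBuffer[] buckets = new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
            for (int i = 0; i < cps.length(); i++) {
                char cp = cps.charAt(i);
                int h = height(cp);
                if (null == buckets[h - MIN_HEIGHT])
                    buckets[h - MIN_HEIGHT] = new StringBuffer(2);
                buckets[h - MIN_HEIGHT].append(cp);
            }
            StringBuffer out = new StringBuffer(cps.length());
            for (int h = MAX_HEIGHT; h >= MIN_HEIGHT; h--)
                if (null != buckets[h - MIN_HEIGHT])
                    out.append(buckets[h - MIN_HEIGHT]);
            return out.toString();
        }

        public static void main(String[] args) {
            // ka + subjoined ya + gi-gu in storage order; top-to-bottom order
            // puts the vowel first: U+0F72, then U+0F40, then U+0FB1.
            System.out.println(topToBottom("\u0F40\u0FB1\u0F72"));
        }
    }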

UnicodeUtils.java

@@ -97,10 +97,12 @@ public class UnicodeUtils implements UnicodeConstants {
Unicode codepoints, into either Normalization Form KD (NFKD),
D (NFD), or THDL (NFTHDL), depending on the value of normForm.
NFD and NFKD are specified by Unicode 3.2; NFTHDL is needed
- for {@link org.thdl.tib.text.tshegbar#GraphemeCluster} because
- NFKD normalizes <code>U+0F0C</code>. NFTHDL uses a maximum of
- codepoints, and it never uses codepoints whose use has been
- {@link #isDiscouraged(char) discouraged}.
+ for {@link org.thdl.tib.text.tshegbar#UnicodeGraphemeCluster}
+ because NFKD normalizes <code>U+0F0C</code> and neither NFD
+ nor NFKD breaks down <code>U+0F00</code> into its constituent
+ codepoints. NFTHDL uses a maximum of codepoints, and it never
+ uses codepoints whose use has been {@link #isDiscouraged(char)
+ discouraged}.
<p>The Tibetan passages of the returned string are in the
chosen normalized form, but codepoints outside of the {@link
@@ -170,6 +172,8 @@ public class UnicodeUtils implements UnicodeConstants {
// Where not specified, the NFKD and NFTHDL forms are
// identical to the NFD form.
switch (tibetanUnicodeCP) {
+ case '\u0F00': return ((normalizationForm == NORM_NFTHDL)
+ ? "\u0F68\u0F7C\u0F7E" : null);
case '\u0F0C': return ((normalizationForm == NORM_NFKD)
? "\u0F0B" : null);
case '\u0F43': return "\u0F42\u0FB7";
@@ -282,9 +286,37 @@ public class UnicodeUtils implements UnicodeConstants {
}
/** Returns a human-readable, ASCII form of the Unicode codepoint
- ch. */
- public static String unicodeCPToString(char ch) {
- return "U+" + Integer.toHexString((int)ch);
+ cp. */
+ public static String unicodeCodepointToString(char cp) {
+ if (cp < '\u0010')
+ return "\\u000" + Integer.toHexString((int)cp);
+ else if (cp < '\u0100')
+ return "\\u00" + Integer.toHexString((int)cp);
+ else if (cp < '\u1000')
+ return "\\u0" + Integer.toHexString((int)cp);
+ else
+ return "\\u" + Integer.toHexString((int)cp);
}
+ public static String unicodeStringToString(String s) {
+ StringBuffer sb = new StringBuffer(s.length() * 6);
+ for (int i = 0; i < s.length(); i++) {
+ sb.append(unicodeCodepointToString(s.charAt(i)));
+ }
+ return sb.toString();
+ }
+ /** Returns true iff cp is a Unicode 3.2 Tibetan consonant,
+ subjoined or not. This counts precomposed consonant stacks
+ like <code>U+0FA7</code> as consonants. If you don't wish to
+ treat such as consonants, then put the input into NORM_NFD,
+ NORM_NFKD, or NORM_NFTHDL first. If it changes under such a
+ normalization, it is a precomposed consonant. */
+ public static boolean isTibetanConsonant(char cp) {
+ return (((cp >= '\u0F40' && cp <= '\u0F6A')
+ || (cp >= '\u0F90' && cp <= '\u0FBC'))
+ && '\u0F48' != cp
+ && '\u0F98' != cp);
+ }
}
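
As a quick sanity check of the new UnicodeUtils helpers, a hypothetical driver
might look like the following.  The package of UnicodeUtils is not shown in
the hunk above and is assumed here; the expected results in the comments
follow directly from the code in this diff.

    import org.thdl.tib.text.tshegbar.UnicodeUtils; // package assumed

    /** Hypothetical driver, not part of this commit. */
    public class UnicodeUtilsSketch {
        public static void main(String[] args) {
            // Prints the six-character ASCII escape for U+0F40: a backslash, a 'u',
            // and four lowercase hex digits (Integer.toHexString is lowercase).
            System.out.println(UnicodeUtils.unicodeCodepointToString('\u0F40'));

            // Codepoints below U+1000 are zero-padded to four hex digits, so a
            // tab (U+0009) also comes out six characters long.
            System.out.println(UnicodeUtils.unicodeCodepointToString('\t'));

            // One escape per char of the input string.
            System.out.println(UnicodeUtils.unicodeStringToString("\u0F40\u0F0B"));

            System.out.println(UnicodeUtils.isTibetanConsonant('\u0F40')); // true: ka
            System.out.println(UnicodeUtils.isTibetanConsonant('\u0F48')); // false: explicitly excluded
            System.out.println(UnicodeUtils.isTibetanConsonant('\u0F72')); // false: a vowel sign
        }
    }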