/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.tshegbar;
import java.util.Vector;
import org.thdl.util.ThdlDebug;
/** A UnicodeGraphemeCluster is either a non-Tibetan codepoint (such
* as whitespace or control characters or a Latin "character"), or a
* vertically stacked set of Tibetan consonants, vowels, marks, and
* signs. The Unicode string
* "\u0F40\u0F0B\u0F41\u0F0B" specifies
* four UnicodeGraphemeClusters (the name of the Tibetan alphabet,
* you might notice), while the Unicode string
* "\u0F66\u0FA5\u0F39\u0F90\u0FB5\u0F71\u0F80\u0F7F"
* is one Tibetan stack, sa over fa over ka over Sha with an a-chung,
* a reversed gi-gu, and a visarga, plus a ngas-bzung-sgor-rtags mark
* underneath all of that. I assume the latter grapheme cluster is
* nonsense, but it is considered one grapheme cluster because all
* but the first char are combining chars. See Unicode Technical
* Report 29.
*
*
* As the above example demonstrates, not all
* UnicodeGraphemeClusters are syntactically legal in the Tibetan
* language. Not all of them are syntactically legal in Sanskrit
* transcribed in the Tibetan alphabet, either.
*
* The Unicode 3.2 standard (see especially Technical Report 29)
* refers to "grapheme clusters." A UnicodeGraphemeCluster is
* precisely a grapheme cluster as described by that standard. We
* interpret the standard as saying that U+0F3E and U+0F3F are each
* grapheme clusters unto themselves, even though they are combining
* codepoints.
"\u0F56\u0F4F\u0F42\u0F66"
, and
needsVowel must be set to false for all but the grapheme
cluster corresponding to \u0F4F
if you wish
to get the preferred THDL Extended Wylie. */
/**
 * Placeholder for the THDL Extended Wylie transliteration of this
 * grapheme cluster. NOT YET IMPLEMENTED: every call throws an
 * {@link Error} and nothing is ever returned.
 *
 * @param needsVowel currently ignored; presumably it will tell the
 *        eventual implementation whether this cluster must carry the
 *        vowel -- TODO confirm once implemented
 * @return never returns normally at present
 * @throws Error always, with the message "DLC NOW unimplemented"
 */
public String getThdlWylie(boolean needsVowel) {
    final String notImplemented = "DLC NOW unimplemented";
    throw new Error(notImplemented);
}
/**
 * Breaks a (possibly unnormalized) Unicode 3.2 string into grapheme
 * clusters. NOT YET IMPLEMENTED: once the arguments have been
 * checked, this method unconditionally throws an {@link Error}.
 *
 * <p>Intended contract: appends grapheme clusters to the vector of
 * GraphemeClusters grcls if grcls is nonnull. Performs good error
 * checking if validate is true. If an error is found, grcls may
 * have been modified if nonnull. Setting grcls to null and setting
 * validate to true is sometimes useful for testing the validity of
 * a Unicode string. The implementation is also meant to throw
 * BadTibetanUnicodeException when the unicode is not syntactically
 * legal; that exception is not yet part of the throws clause.
 *
 * @param grcls the vector to append to, or null to append nothing
 * @param unicode the string to break up; must not be null
 * @param validate true to request error checking
 * @param correctErrors may not be true when validate is true
 * @return the number of grapheme clusters that were or would have
 *         been added to grcls (never returned at present)
 * @throws IllegalArgumentException if correctErrors and validate
 *         are both true; this check is made first
 * @throws NullPointerException if unicode is null
 * @throws Error for every call whose arguments pass the checks
 *         above, until this method is implemented
 */
public static int breakUnicodeIntoGraphemeClusters(Vector grcls,
                                                   String unicode,
                                                   boolean validate,
                                                   boolean correctErrors)
    throws // DLC SOON: BadTibetanUnicodeException,
           IllegalArgumentException, NullPointerException
{
    if (validate && correctErrors) {
        throw new IllegalArgumentException("validate and correctErrors cannot both be true.");
    }
    // Honor the documented contract: a null input is reported as a
    // NullPointerException rather than falling through to the
    // generic "unimplemented" Error below.
    if (null == unicode) {
        throw new NullPointerException("unicode cannot be null.");
    }
    throw new Error("DLC NOW unimplemented");

    /* Sketch of the per-codepoint checks for the eventual
       implementation (kept as a design note, not compiled):

            if (start == i) {
                // special tests at the beginning of input.
                if (0 != height || UnicodeUtils.combinesLeftToRight(cp)) {
                    throw new BadTibetanUnicodeException("A combining codepoint was found at the start of input or after a mark that ends a grapheme cluster.");
                }
            }
            if (height == last_height) {
                if ('\u0F39' == cp) {
                    if (!UnicodeUtils.isTibetanConsonant(last_cp)) {
                        throw new BadTibetanUnicodeException("U+0F39 can only occur after a (possibly subjoined) Tibetan consonant");
                    }
                } else {
                    // DLC: cp BEGINS A NEW GRAPHEME CLUSTER!!!
                }
            }
            // Test to see if this last character has ended this
            // grapheme cluster:
            if (UnicodeUtils.isTibetanTerminatingVowel(cp)) {
                // DLC: cp ENDS A GRAPHEME CLUSTER!!!
            }
    */
}
/**
 * Returns the codepoints of unicodeString arranged in top-to-bottom
 * order, as computed by the private three-argument overload over
 * the whole string.
 *
 * <p>That overload returns null when the range it is given holds
 * fewer than two codepoints. Such a string is already in
 * top-to-bottom order, so in that case the contents of
 * unicodeString are returned unchanged instead of dereferencing the
 * null result.
 *
 * @return the top-to-bottom ordering of unicodeString; never null
 */
public String getTopToBottomCodepoints() {
    StringBuffer unsorted = new StringBuffer(unicodeString);
    StringBuffer sorted
        = getTopToBottomCodepoints(unsorted, 0, unicodeString.length());
    // A null result means "nothing to reorder"; the helper does not
    // modify its input, so the copy made above is the answer.
    return (null == sorted ? unsorted : sorted).toString();
}
/**
 * Builds a new StringBuffer holding the codepoints found in
 * NFTHDLString at indices [start, end), rearranged so that
 * codepoints with a greater height (as reported by getCPHeight)
 * come first. Codepoints of equal height keep the relative order
 * they had in the input. A top-to-bottom ordering is a useful form
 * for applications wishing to render the grapheme cluster. This is
 * only meaningful when the range is all or part of one grapheme
 * cluster. No error checking is done on the input, and the input
 * buffer is not modified.
 *
 * @param NFTHDLString a buffer with characters at indices i, where
 *        start <= i < end, being the Unicode codepoints for a
 *        single grapheme cluster or part of a grapheme cluster
 * @param start NFTHDLString.charAt(start) is the first codepoint
 *        dealt with
 * @param end NFTHDLString.charAt(end) is the first codepoint NOT
 *        dealt with
 * @return null when the range holds fewer than two codepoints
 *         (such a range is trivially sorted already); otherwise a
 *         new buffer with the range sorted top-to-bottom
 */
private static StringBuffer getTopToBottomCodepoints(StringBuffer NFTHDLString, /* DLC FIXME: for efficiency, use a ThdlCharIterator. */
                                                     int start, int end)
{
    // Empty and single-codepoint ranges need no sorting.
    if (end <= start || start + 1 == end) {
        return null;
    }

    // Postman's (bucket) sort: one lazily-created bucket per
    // possible height, filled in a single pass over the input.
    StringBuffer[] buckets = new StringBuffer[(MAX_HEIGHT + 1) - MIN_HEIGHT];
    int pos = start;
    while (pos < end) {
        char codepoint = NFTHDLString.charAt(pos++);
        int slot = getCPHeight(codepoint) - MIN_HEIGHT;
        StringBuffer bucket = buckets[slot];
        if (bucket == null) {
            bucket = new StringBuffer(2);
            buckets[slot] = bucket;
        }
        bucket.append(codepoint);
    }

    // Empty the buckets from the greatest height down to the least.
    StringBuffer sorted = new StringBuffer(end - start);
    for (int slot = buckets.length - 1; slot >= 0; slot--) {
        StringBuffer bucket = buckets[slot];
        if (bucket != null) {
            sorted.append(bucket);
        }
    }
    return sorted;
}
/** Returns the height for the Tibetan Unicode codepoint x.
This relative height is 0 for a base consonant, digit,
punctuation, mark, or sign. It is -1 for a subjoined
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
EWV_gigu, and so on according to the height these codepoints
appear relative to one another when on the same stack. If two
codepoints have equal height, they should not exist in the
same grapheme cluster unless one is U+0F39, which
is an integral part of a consonant when tacked on to, e.g.,
EWC_PHA.
If x is not a Unicode 3.2 codepoint in the Tibetan range,
or if x is not in NFTHDL form, 0 is returned. The height code
of U+0F76 is not valid, and it is not an accident that U+0F76
is not in NFTHDL form.