Now uses terminology from the Unicode standard. No more talk of
characters, for example.

Normalization forms NFKD and NFD are supported for the Tibetan Unicode
range.  I don't like either, actually.  I've tested NFKD, but I've not yet
committed the tests.
This commit is contained in:
dchandler 2002-12-15 03:35:24 +00:00
parent 3199ff7926
commit a42347b224
7 changed files with 210 additions and 136 deletions

View file

@ -103,7 +103,7 @@ And also there are cases where they combine. For ex you can have
* consonants and vowels. In some situations, you should use {@link * consonants and vowels. In some situations, you should use {@link
* #EWSUB_wa_zur} to represent the consonant wa, while in others you * #EWSUB_wa_zur} to represent the consonant wa, while in others you
* should use {@link #EWC_wa}, even though you mean to subscribe a * should use {@link #EWC_wa}, even though you mean to subscribe a
* fixed-form wa. Basically, stick to the characters for which * fixed-form wa. Basically, stick to the codepoints for which
* enumerations exist in {@link * enumerations exist in {@link
* org.thdl.tib.text.tshegbar.UnicodeConstants} and use your common * org.thdl.tib.text.tshegbar.UnicodeConstants} and use your common
* sense.</p> * sense.</p>
@ -131,7 +131,7 @@ public class LegalTshegBar
private boolean hasWaZur; private boolean hasWaZur;
/** true iff EW_wa_zur is under the root syllable. */ /** true iff EW_wa_zur is under the root syllable. */
private boolean hasAChung; private boolean hasAChung;
/** If this is a string, it is of a single character or is equal /** If this is a string, it is of a single codepoint or is equal
* to {@link #getConnectiveCaseSuffix()} */ * to {@link #getConnectiveCaseSuffix()} */
private String suffix; private String suffix;
/** EW_da, EW_sa, or EW_ABSENT */ /** EW_da, EW_sa, or EW_ABSENT */
@ -237,7 +237,7 @@ public class LegalTshegBar
/** Returns null if there is no suffix, or a string containing the /** Returns null if there is no suffix, or a string containing the
* one consonant or a string <code>"&#92;u0F60&#92;u0F72"</code> * one consonant or a string <code>"&#92;u0F60&#92;u0F72"</code>
* containing two characters in the special case that the suffix * containing two codepoints in the special case that the suffix
* is that connective case marker {@link * is that connective case marker {@link
* #getConnectiveCaseSuffix()}. */ * #getConnectiveCaseSuffix()}. */
public String getSuffix() { public String getSuffix() {
@ -317,7 +317,7 @@ public class LegalTshegBar
} }
/** Returns a string of two characters, da and sa. */ /** Returns a string of two codepoints, da and sa. */
public static String getPossiblePostsuffixes() { public static String getPossiblePostsuffixes() {
return new String(new char[] { EWC_da, EWC_sa }); return new String(new char[] { EWC_da, EWC_sa });
} }
@ -328,7 +328,7 @@ public class LegalTshegBar
EWC_ra, EWC_la, EWC_sa EWC_ra, EWC_la, EWC_sa
}); });
/** Returns a string of ten characters, each of which can be a /** Returns a string of ten codepoints, each of which can be a
* suffix in Tibetan. */ * suffix in Tibetan. */
public static String getPossibleSuffixes() { public static String getPossibleSuffixes() {
return possibleSuffixes; return possibleSuffixes;
@ -345,7 +345,7 @@ public class LegalTshegBar
EWC_achen, EWV_i EWC_achen, EWV_i
}); });
/** Returns a two-character string consisting of the Unicode /** Returns a two-codepoint string consisting of the Unicode
* representation of what Extended Wylie calls * representation of what Extended Wylie calls
* <code>'i</code>. */ * <code>'i</code>. */
public static String getConnectiveCaseSuffix() { public static String getConnectiveCaseSuffix() {
@ -594,9 +594,9 @@ public class LegalTshegBar
* @param rootLetter the mandatory root consonant * @param rootLetter the mandatory root consonant
* @param subjoinedLetter the optional, subscribed consonant * @param subjoinedLetter the optional, subscribed consonant
* @param suffix the optional suffix, which is null, a String * @param suffix the optional suffix, which is null, a String
* consisting of a single consonant (i.e. a single character) * consisting of a single consonant (i.e. a single,
* except in the special case that this is {@link * nondecomposable codepoint) except in the special case that
* #getConnectiveCaseSuffix()} * this is {@link #getConnectiveCaseSuffix()}
* @param postsuffix the optional postsuffix, which should be * @param postsuffix the optional postsuffix, which should be
* EWC_sa or EWC_da * EWC_sa or EWC_da
* @param vowel the optional vowel */ * @param vowel the optional vowel */
@ -748,7 +748,7 @@ public class LegalTshegBar
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
"Illegal suffix -- not one of the ten legal suffixes: " "Illegal suffix -- not one of the ten legal suffixes: "
+ UnicodeUtils.unicodeCharToString(suffix.charAt(0))); + UnicodeUtils.unicodeCPToString(suffix.charAt(0)));
} }
} }
} }
@ -971,10 +971,11 @@ public class LegalTshegBar
/** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk} /** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk}
method to return {@link UnicodeUtils#toCanonicalForm(String) method to return {@link
canonically-formed Unicode}. UnicodeUtils#toMostlyDecomposedUnicode(String, byte)
NFKD-normalized Unicode}.
@exception UnsupportedOperationException is never thrown */ @exception UnsupportedOperationException is never thrown */
public String getEquivalentUnicode() { public String getUnicodeRepresentation() {
StringBuffer sb = new StringBuffer(); StringBuffer sb = new StringBuffer();
if (hasPrefix()) { if (hasPrefix()) {
ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix())); ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
@ -1017,7 +1018,7 @@ public class LegalTshegBar
/** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk} /** Overrides {@link org.thdl.tib.text.tshegbar.UnicodeReadyThunk}
method to return true. */ method to return true. */
public boolean hasEquivalentUnicode() { public boolean hasUnicodeRepresentation() {
return true; return true;
} }

View file

@ -23,26 +23,37 @@ package org.thdl.tib.text.tshegbar;
* *
* <p> First, some terminology.</p> * <p> First, some terminology.</p>
* *
* <ul> <li>When we talk about a <i>glyph</i>, we mean a picture * <ul> <li>When we talk about a <i>grapheme cluster</i> (or
* found in a font. A single glyph may have one or more * <i>grcl</i>), we mean what the Unicode standard calls a "grapheme
* representations by sequences of Unicode characters, or it may not * cluster". Most glyphs (i.e., pictures) found in a font are
* be representable because it is only part of one Unicode character * codepoint <code>&#92;u0F74</code> is not a grapheme cluster. In
* or pictures a nonstandard character.</li> <li>When we talk about a * codepoint <code>&#92;u0F74</code> is not a grapheme cluster. In
* <i>stack</i>, we mean either a number (or half-number), a mark or * addition, in English, many fonts have a single glyph (a
* sign, a bit of punctuation, or a consonant stack.</li> <li>A * "ligature") for the combination of two grapheme clusters,
* <i>consonant stack</i> is one or more consonants stacked * e.g. "fi". A single grapheme cluster may have one or more
* vertically, plus an optional vocalic modification such as an * representations by sequences of Unicode codepoints, or it may not
* anusvara (DLC what do we call a bindu?) or visarga, plus zero or * be representable because it is only part of one Unicode codepoint
* more signs like <code>&#92;u0F35</code>, plus an optional a-chung * or pictures a nonstandard character.</li> <li>We will attempt to
* (<code>&#92;u0F71</code>), plus an optional simple vowel.</li> <li>By * avoid using the word "character", as it sometimes refers to a
* <i>simple vowel</i>, we mean any of <code>&#92;u0F72</code>, * codepoint and sometimes refers to a glyph in a font and yet other
* <code>&#92;u0F74</code>, <code>&#92;u0F7A</code>, <code>&#92;u0F7B</code>, * times refers to a grapheme cluster.</li> <li>We'll try to avoid
* using the word "stack" because it sometimes refers to a sequence
* of stacked Tibetan consonants and sometimes refers to an entire
* grapheme cluster.</li> <li>A <i>Tibetan stack</i> is one or
* more consonants stacked vertically, plus an optional vocalic
* modification such as an anusvara (DLC what do we call a bindu?) or
* visarga, plus zero or more signs like <code>&#92;u0F35</code>,
* plus an optional a-chung (<code>&#92;u0F71</code>), plus an
* optional simple vowel.</li> <li>By <i>simple vowel</i>, we mean
* any of <code>&#92;u0F72</code>, <code>&#92;u0F74</code>,
* <code>&#92;u0F7A</code>, <code>&#92;u0F7B</code>,
* <code>&#92;u0F7C</code>, <code>&#92;u0F7D</code>, or * <code>&#92;u0F7C</code>, <code>&#92;u0F7D</code>, or
* <code>&#92;u0F80</code>.</li> </ul> * <code>&#92;u0F80</code>.</li> </ul>
* *
* (Note: The string <code>"&#92;u0F68&#92;u0F7E&#92;u0F7C"</code> seems to equal * <p>(Note: The string <code>"&#92;u0F68&#92;u0F7E&#92;u0F7C"</code>
* <code>"&#92;u0F00"</code>, though the Unicode standard does not * seems to equal <code>"&#92;u0F00"</code>, though the Unicode
* indicate that it is so. This code treats it that way.)</p> * standard does not indicate that it is so. This code treats it
* that way.)</p>
* *
* <p> This class allows for invalid tsheg bars, like those * <p> This class allows for invalid tsheg bars, like those
* containing more than one prefix, more than two suffixes, an * containing more than one prefix, more than two suffixes, an
@ -55,10 +66,10 @@ package org.thdl.tib.text.tshegbar;
* and for invalid tsheg bars. Note that correctness is at the tsheg * and for invalid tsheg bars. Note that correctness is at the tsheg
* bar level only; it may be grammatically incorrect to concatenate * bar level only; it may be grammatically incorrect to concatenate
* two valid tsheg bars. Some subclasses can be represented in * two valid tsheg bars. Some subclasses can be represented in
* Unicode, but others contain nonstandard glyphs and cannot be.</p> * Unicode, but others contain nonstandard glyphs/characters and
* cannot be.</p>
* *
* @author David Chandler * @author David Chandler */
*/
public abstract class TshegBar implements UnicodeReadyThunk { public abstract class TshegBar implements UnicodeReadyThunk {
/** Returns true, as we consider a transliteration in the Tibetan /** Returns true, as we consider a transliteration in the Tibetan
* alphabet of a non-Tibetan language, say Chinese, as being * alphabet of a non-Tibetan language, say Chinese, as being

View file

@ -21,10 +21,10 @@ package org.thdl.tib.text.tshegbar;
import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.tib.text.TibetanMachineWeb;
/** This noninstantiable class allows for converting from Unicode /** This noninstantiable class allows for converting from Unicode
* characters (i.e., code points) to Extended Wylie. It cannot be * codepoints to Extended Wylie. It cannot be used for long
* used for long stretches of text, though, as it is unaware of * stretches of text, though, as it is unaware of context, which is
* context, which is essential to understanding a non-trivial string * essential to understanding a non-trivial string of Tibetan
* of Tibetan Unicode. * Unicode.
* *
* <p>See the document by Nathaniel Garson and David Germano entitled * <p>See the document by Nathaniel Garson and David Germano entitled
* <i>Extended Wylie Transliteration Scheme</i>. Note that there are * <i>Extended Wylie Transliteration Scheme</i>. Note that there are
@ -307,7 +307,7 @@ public class UnicodeCharToExtendedWylie {
default: { default: {
// DLC handle space (EW's "_") // DLC handle space (EW's "_")
// This character is in the range 0FD0-0FFF or is not in // This codepoint is in the range 0FD0-0FFF or is not in
// the Tibetan range at all. In either case, there is no // the Tibetan range at all. In either case, there is no
// corresponding Extended Wylie. // corresponding Extended Wylie.
return null; return null;

View file

@ -19,7 +19,7 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.tshegbar; package org.thdl.tib.text.tshegbar;
/** Provides handy Extended Wylie-inspired names for Unicode /** Provides handy Extended Wylie-inspired names for Unicode
* characters commonly used to represent Tibetan. The consonant that * codepoints commonly used to represent Tibetan. The consonant that
* the Extended Wylie text "ka" refers to is named EWC_ka as in "The * the Extended Wylie text "ka" refers to is named EWC_ka as in "The
* Extended Wylie Consonant ka", the vowel represented in Extended * Extended Wylie Consonant ka", the vowel represented in Extended
* Wylie by "i" is EWV_i, and so on. There is at least one exception * Wylie by "i" is EWV_i, and so on. There is at least one exception
@ -30,10 +30,26 @@ package org.thdl.tib.text.tshegbar;
* @author David Chandler */ * @author David Chandler */
public interface UnicodeConstants { public interface UnicodeConstants {
/** for those times when you need a char to represent a non-existent character */ /** Refers to unnormalized Unicode: */
static final byte NORM_UNNORMALIZED = 0;
/** Refers to Normalization Form C: */
static final byte NORM_NFC = 1;
/** Refers to Normalization Form KC: */
static final byte NORM_NFKC = 2;
/** Refers to Normalization Form D: */
static final byte NORM_NFD = 3;
/** Refers to Normalization Form KD: */
static final byte NORM_NFKD = 4;
/** for those times when you need a char to represent a
non-existent codepoint */
static final char EW_ABSENT = '\u0000'; static final char EW_ABSENT = '\u0000';
//
// the thirty consonants, in alphabetical order: // the thirty consonants, in alphabetical order:
//
/** first letter of the alphabet: */ /** first letter of the alphabet: */
static final char EWC_ka = '\u0F40'; static final char EWC_ka = '\u0F40';
@ -70,11 +86,13 @@ public interface UnicodeConstants {
static final char EWC_ha = '\u0F67'; static final char EWC_ha = '\u0F67';
static final char EWC_a = '\u0F68'; static final char EWC_a = '\u0F68';
/** In the word for father, "pA lags", there is an a-chung (i.e., /** In the word for father, "pA lags", there is an a-chung (i.e.,
<code>\u0F71</code>). This is the constant for that little <code>\u0F71</code>). This is the constant for that little
guy. */ guy. */
static final char EW_achung = '\u0F71'; static final char EW_achung = '\u0F71';
/* Four of the five vowels, some say, or, others say, "the four /* Four of the five vowels, some say, or, others say, "the four
vowels": */ vowels": */
/** "gi gu", the 'i' sound in the English word keep: */ /** "gi gu", the 'i' sound in the English word keep: */

View file

@ -18,14 +18,14 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.tshegbar; package org.thdl.tib.text.tshegbar;
/** A UnicodeReadyThunk represents a string of characters. While /** A UnicodeReadyThunk represents a string of codepoints. While
* there are ways to turn a string of Unicode characters into a list * there are ways to turn a string of Unicode codepoints into a list
* of UnicodeReadyThunks (DLC reference it), you cannot * of UnicodeReadyThunks (DLC reference it), you cannot
* necessarily recover the exact sequence of Unicode characters from * necessarily recover the exact sequence of Unicode codepoints from
* a UnicodeReadyThunk. For characters that are not Tibetan * a UnicodeReadyThunk. For codepoints that are not Tibetan
* Unicode and are not one of a handful of other known characters, * Unicode and are not one of a handful of other known codepoints,
* only the most primitive operations are available. Generally in * only the most primitive operations are available. Generally in
* this case you can recover the exact string of Unicode characters, * this case you can recover the exact string of Unicode codepoints,
* but don't bank on it. * but don't bank on it.
* *
* @author David Chandler * @author David Chandler
@ -33,23 +33,25 @@ package org.thdl.tib.text.tshegbar;
public interface UnicodeReadyThunk { public interface UnicodeReadyThunk {
/** Returns true iff this thunk is entirely Tibetan (regardless of /** Returns true iff this thunk is entirely Tibetan (regardless of
whether or not all characters come from the Tibetan range of whether or not all codepoints come from the Tibetan range of
Unicode 3, i.e. <code>0x0F00</code>-<code>0x0FFF</code>). */ Unicode 3, i.e. <code>U+0F00</code>-<code>U+0FFF</code>, and
regardless of whether or not this thunk is syntactically legal
Tibetan). */
public boolean isTibetan(); public boolean isTibetan();
/** Returns a sequence of Unicode characters that is equivalent to /** Returns a sequence of Unicode codepoints that is equivalent to
* this thunk if possible. It is only possible if {@link * this thunk if possible. It is only possible if {@link
* #hasEquivalentUnicode()} is true. Unicode has more than one * #hasUnicodeRepresentation()} is true. Unicode has more than one
* way to refer to the same language element, so this is just one * way to refer to the same language element, so this is just one
* method. When more than one Unicode sequence exists, and when * method. When more than one Unicode sequence exists, and when
* the thunk {@link #isTibetan() is Tibetan}, this method returns * the thunk {@link #isTibetan() is Tibetan}, this method returns
* sequences that the Unicode 3.2 standard does not discourage. * sequences that the Unicode 3.2 standard does not discourage.
* @exception UnsupportedOperationException if {@link * @exception UnsupportedOperationException if {@link
* #hasEquivalentUnicode()} is false * #hasUnicodeRepresentation()} is false
* @return a String of Unicode characters */ * @return a String of Unicode codepoints */
public String getEquivalentUnicode() throws UnsupportedOperationException; public String getUnicodeRepresentation() throws UnsupportedOperationException;
/** Returns true iff there exists a sequence of Unicode characters /** Returns true iff there exists a sequence of Unicode codepoints
* that correctly represents this thunk. This will not be the * that correctly represents this thunk. This will not be the
* case if the thunk contains Tibetan characters for which the * case if the thunk contains Tibetan characters for which the
* Unicode standard does not provide. See the Extended Wylie * Unicode standard does not provide. See the Extended Wylie
@ -58,6 +60,6 @@ public interface UnicodeReadyThunk {
* standard section 9.13. The presence of head marks or multiple * standard section 9.13. The presence of head marks or multiple
* vowels in the thunk would cause this to return false, for * vowels in the thunk would cause this to return false, for
* example. */ * example. */
public boolean hasEquivalentUnicode(); public boolean hasUnicodeRepresentation();
} }

View file

@ -19,15 +19,15 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.tshegbar; package org.thdl.tib.text.tshegbar;
/** <p>This non-instantiable class contains utility routines for /** <p>This non-instantiable class contains utility routines for
* dealing with Tibetan Unicode characters and strings of such * dealing with Tibetan Unicode codepoints and strings of such
* characters.</p> * codepoints.</p>
* *
* @author David Chandler */ * @author David Chandler */
public class UnicodeUtils { public class UnicodeUtils implements UnicodeConstants {
/** Do not use this, as this class is not instantiable. */ /** Do not use this, as this class is not instantiable. */
private UnicodeUtils() { super(); } private UnicodeUtils() { super(); }
/** Returns true iff x is a Unicode character that represents a /** Returns true iff x is a Unicode codepoint that represents a
consonant or two-consonant stack that has a Unicode code consonant or two-consonant stack that has a Unicode code
point. Returns true only for the usual suspects (like point. Returns true only for the usual suspects (like
<code>&#92;u0F40</code>) and for Sanskrit consonants (like <code>&#92;u0F40</code>) and for Sanskrit consonants (like
@ -40,7 +40,7 @@ public class UnicodeUtils {
&& (x >= '\u0F40' && x <= '\u0F6A')); && (x >= '\u0F40' && x <= '\u0F6A'));
} }
/** Returns true iff x is a Unicode character that represents a /** Returns true iff x is a Unicode codepoint that represents a
subjoined consonant or subjoined two-consonant stack that has subjoined consonant or subjoined two-consonant stack that has
a Unicode code point. Returns true only for the usual a Unicode code point. Returns true only for the usual
suspects (like <code>&#92;u0F90</code>) and for Sanskrit suspects (like <code>&#92;u0F90</code>) and for Sanskrit
@ -61,7 +61,7 @@ public class UnicodeUtils {
'&#92;u0F6A'. The new consonants (for transcribing Chinese, I '&#92;u0F6A'. The new consonants (for transcribing Chinese, I
believe) "&#92;u0F55&#92;u0F39" (which EWTS calls "fa"), believe) "&#92;u0F55&#92;u0F39" (which EWTS calls "fa"),
"&#92;u0F56&#92;u0F39" ("va"), and "&#92;u0F5F&#92;u0F39" ("Dza") are "&#92;u0F56&#92;u0F39" ("va"), and "&#92;u0F5F&#92;u0F39" ("Dza") are
two-character sequences, but you should be aware of them two-codepoint sequences, but you should be aware of them
also. */ also. */
public static boolean isPreferredFormOfConsonant(char x) { public static boolean isPreferredFormOfConsonant(char x) {
return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */) return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */)
@ -73,16 +73,16 @@ public class UnicodeUtils {
&& (x != '\u0F5C')); && (x != '\u0F5C'));
} }
/** Returns true iff unicodeChar is a character from the Unicode /** Returns true iff unicodeCP is a codepoint from the Unicode
range U+0F00-U+0FFF. range U+0F00-U+0FFF.
@see #isEntirelyTibetanUnicode(String) */ @see #isEntirelyTibetanUnicode(String) */
public static boolean isInTibetanRange(char unicodeChar) { public static boolean isInTibetanRange(char unicodeCP) {
return (unicodeChar >= '\u0F00' && unicodeChar <= '\u0FFF'); return (unicodeCP >= '\u0F00' && unicodeCP <= '\u0FFF');
} }
/** Returns true iff unicodeString consists only of characters /** Returns true iff unicodeString consists only of codepoints
from the Unicode range U+0F00-U+0FFF. (Note that these from the Unicode range U+0F00-U+0FFF. (Note that these
characters are typically not enough to represent a Tibetan codepoints are typically not enough to represent a Tibetan
text, you may need ZWSP (zero-width space) and various text, you may need ZWSP (zero-width space) and various
whitespace from other ranges.) */ whitespace from other ranges.) */
public static boolean isEntirelyTibetanUnicode(String unicodeString) { public static boolean isEntirelyTibetanUnicode(String unicodeString) {
@ -93,21 +93,40 @@ public class UnicodeUtils {
return true; return true;
} }
/** Modifies tibetanUnicode so that it is equivalent, according to /** Puts the Tibetan codepoints in tibetanUnicode, a sequence of
the Unicode 3.2 standard, to the input buffer. The Tibetan Unicode codepoints, into Normalization Form KD (NFKD) as
passages of the returned string are in THDL-canonical form, specified by Unicode 3.2. The Tibetan passages of the
however. This form uses a maximum of characters, in general, returned string are in NFKD, but codepoints outside of the
and never uses characters whose use has been {@link range <code>U+0F00</code>-<code>U+0FFF</code> are not
#isDiscouraged(char) discouraged}. If the input contains necessarily put into NFKD. This form uses a maximum of
characters for which {@link #isInTibetanRange(char)} is not codepoints, and it never uses codepoints whose use has been
true, then they will not be modified. {@link #isDiscouraged(char) discouraged}. It would be David
Chandler's very favorite form if not for the fact that
<code>U+0F0C</code> normalizes to <code>U+0F0B</code> in NFKD.
NFD is thus David Chandler's favorite, though it does not
decompose <code>U+0F77</code> and <code>U+0F79</code> (for
some reason, hopefully a well-thought-out one).
<p>Recall that NFKD, as it applies to Tibetan codepoints, is
closed under string concatenation and under substringing.
Note again that if the input contains codepoints for which
{@link #isInTibetanRange(char)} is not true, then they will
not be modified.</p>
<p>Note well that only well-formed input guarantees <p>Note well that only well-formed input guarantees
well-formed output.</p> */ well-formed output.</p>
public static void toCanonicalForm(StringBuffer tibetanUnicode) {
@param tibetanUnicode the codepoints to be decomposed
@param normForm NORM_NFKD or NORM_NFD */
public static void toMostlyDecomposedUnicode(StringBuffer tibetanUnicode,
byte normForm)
{
if (normForm != NORM_NFD && normForm != NORM_NFKD)
throw new IllegalArgumentException("normForm must be NORM_NFD or NORM_NFKD for decomposition to work");
int offset = 0; int offset = 0;
while (offset < tibetanUnicode.length()) { while (offset < tibetanUnicode.length()) {
String s = toCanonicalForm(tibetanUnicode.charAt(offset)); String s
= toNormalizedForm(tibetanUnicode.charAt(offset), normForm);
if (null == s) { if (null == s) {
++offset; ++offset;
} else { } else {
@ -118,67 +137,88 @@ public class UnicodeUtils {
} }
} }
/** Like {@link #toCanonicalForm(StringBuffer)}, but does not /** Like {@link #toMostlyDecomposedUnicode(StringBuffer, byte)},
modify its input. Instead, it returns the canonically-formed but does not modify its input. Instead, it returns the NFKD-
version of tibetanUnicode. */ or NFD-normalized version of tibetanUnicode. */
public static String toCanonicalForm(String tibetanUnicode) { public static String toMostlyDecomposedUnicode(String tibetanUnicode,
byte normForm)
{
StringBuffer sb = new StringBuffer(tibetanUnicode); StringBuffer sb = new StringBuffer(tibetanUnicode);
toCanonicalForm(sb); toMostlyDecomposedUnicode(sb, normForm);
return sb.toString(); return sb.toString();
} }
/** There are 19 characters in the Tibetan range of Unicode 3.2 /** There are 19 codepoints in the Tibetan range of Unicode 3.2
which can be decomposed into longer strings of characters in which can be decomposed into longer strings of codepoints in
the Tibetan range of Unicode. These 19 are said not to be in the Tibetan range of Unicode. Often one wants to manipulate
THDL-canonical form. This routine returns the canonical form decomposed codepoint strings. Also, HTML and XML are W3C
for such characters, and returns null for characters that are standards that require certain normalization forms. This
already canonical or are not in the Tibetan range of Unicode. routine returns a chosen normalized form for such codepoints,
@param tibetanUnicodeChar the character to canonicalize and returns null for codepoints that are already normalized or
@return null if tibetanUnicodeChar is canonical, or a string are not in the Tibetan range of Unicode.
of two or three characters otherwise */ @param tibetanUnicodeCP the codepoint to normalize
public static String toCanonicalForm(char tibetanUnicodeChar) { @param normalizationForm NORM_NFKD or NORM_NFD if you expect
switch (tibetanUnicodeChar) { something nontrivial to happen
case '\u0F43': return new String(new char[] { '\u0F42', '\u0FB7' }); @return null if tibetanUnicodeCP is already in the chosen
case '\u0F4D': return new String(new char[] { '\u0F4C', '\u0FB7' }); normalized form, or a string of two or three codepoints
case '\u0F52': return new String(new char[] { '\u0F51', '\u0FB7' }); otherwise */
case '\u0F57': return new String(new char[] { '\u0F56', '\u0FB7' }); public static String toNormalizedForm(char tibetanUnicodeCP, byte normalizationForm) {
case '\u0F5C': return new String(new char[] { '\u0F5B', '\u0FB7' }); if (normalizationForm == NORM_NFKD
case '\u0F69': return new String(new char[] { '\u0F40', '\u0FB5' }); || normalizationForm == NORM_NFD) {
case '\u0F73': return new String(new char[] { '\u0F71', '\u0F72' }); // Where not specified, the NFKD form is also the NFD form.
case '\u0F75': return new String(new char[] { '\u0F71', '\u0F74' }); switch (tibetanUnicodeCP) {
case '\u0F76': return new String(new char[] { '\u0FB2', '\u0F80' }); case '\u0F0C': return ((normalizationForm == NORM_NFKD)
case '\u0F77': return new String(new char[] { '\u0FB2', '\u0F71', '\u0F80' }); ? "\u0F0B" : null);
case '\u0F78': return new String(new char[] { '\u0FB3', '\u0F80' }); case '\u0F43': return "\u0F42\u0FB7";
case '\u0F79': return new String(new char[] { '\u0FB3', '\u0F71', '\u0F80' }); case '\u0F4D': return "\u0F4C\u0FB7";
case '\u0F81': return new String(new char[] { '\u0F71', '\u0F80' }); case '\u0F52': return "\u0F51\u0FB7";
case '\u0F93': return new String(new char[] { '\u0F92', '\u0FB7' }); case '\u0F57': return "\u0F56\u0FB7";
case '\u0F9D': return new String(new char[] { '\u0F9C', '\u0FB7' }); case '\u0F5C': return "\u0F5B\u0FB7";
case '\u0FA2': return new String(new char[] { '\u0FA1', '\u0FB7' }); case '\u0F69': return "\u0F40\u0FB5";
case '\u0FA7': return new String(new char[] { '\u0FA6', '\u0FB7' }); case '\u0F73': return "\u0F71\u0F72";
case '\u0FAC': return new String(new char[] { '\u0FAB', '\u0FB7' }); case '\u0F75': return "\u0F71\u0F74";
case '\u0FB9': return new String(new char[] { '\u0F90', '\u0FB5' }); case '\u0F76': return "\u0FB2\u0F80";
// I do not understand why NFD does not decompose this codepoint:
case '\u0F77': return ((normalizationForm == NORM_NFKD)
? "\u0FB2\u0F71\u0F80" : null);
case '\u0F78': return "\u0FB3\u0F80";
// I do not understand why NFD does not decompose this codepoint:
case '\u0F79': return ((normalizationForm == NORM_NFKD)
? "\u0FB3\u0F71\u0F80" : null);
case '\u0F81': return "\u0F71\u0F80";
case '\u0F93': return "\u0F92\u0FB7";
case '\u0F9D': return "\u0F9C\u0FB7";
case '\u0FA2': return "\u0FA1\u0FB7";
case '\u0FA7': return "\u0FA6\u0FB7";
case '\u0FAC': return "\u0FAB\u0FB7";
case '\u0FB9': return "\u0F90\u0FB5";
default: default:
return null; return null;
} }
} }
return null;
}
/** Returns true iff tibetanUnicodeChar {@link /** Returns true iff tibetanUnicodeCP {@link
#isInTibetanRange(char)} and if the Unicode 3.2 standard #isInTibetanRange(char) is a Tibetan codepoint} and if the
discourages the use of tibetanUnicodeChar. */ Unicode 3.2 standard discourages the use of
public static boolean isDiscouraged(char tibetanUnicodeChar) { tibetanUnicodeCP. */
return ('\u0F73' == tibetanUnicodeChar public static boolean isDiscouraged(char tibetanUnicodeCP) {
|| '\u0F75' == tibetanUnicodeChar return ('\u0F73' == tibetanUnicodeCP
|| '\u0F77' == tibetanUnicodeChar || '\u0F75' == tibetanUnicodeCP
|| '\u0F81' == tibetanUnicodeChar); || '\u0F77' == tibetanUnicodeCP
|| '\u0F79' == tibetanUnicodeCP
|| '\u0F81' == tibetanUnicodeCP);
/* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */ /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */
} }
/** Returns true iff ch corresponds to the Tibetan letter ra. /** Returns true iff ch corresponds to the Tibetan letter ra.
Several Unicode characters correspond to the Tibetan letter ra Several Unicode codepoints correspond to the Tibetan letter ra
(in its subscribed form or otherwise). Oftentimes, (in its subscribed form or otherwise). Oftentimes,
<code>&#92;u0F62</code> is thought of as the nominal <code>&#92;u0F62</code> is thought of as the nominal
representation. Returns false for some characters that representation. Returns false for some codepoints that
contain ra but are not merely ra, such as <code>&#92;u0F77</code> */ contain ra but are not merely ra, such as <code>&#92;u0F77</code> */
public static boolean isRa(char ch) { public static boolean isRa(char ch) {
return ('\u0F62' == ch return ('\u0F62' == ch
@ -188,7 +228,7 @@ public class UnicodeUtils {
} }
/** Returns true iff ch corresponds to the Tibetan letter wa. /** Returns true iff ch corresponds to the Tibetan letter wa.
Several Unicode characters correspond to the Tibetan letter Several Unicode codepoints correspond to the Tibetan letter
wa. Oftentimes, <code>&#92;u0F5D</code> is thought of as the wa. Oftentimes, <code>&#92;u0F5D</code> is thought of as the
nominal representation. */ nominal representation. */
public static boolean isWa(char ch) { public static boolean isWa(char ch) {
@ -198,7 +238,7 @@ public class UnicodeUtils {
} }
/** Returns true iff ch corresponds to the Tibetan letter ya. /** Returns true iff ch corresponds to the Tibetan letter ya.
Several Unicode characters correspond to the Tibetan letter Several Unicode codepoints correspond to the Tibetan letter
ya. Oftentimes, <code>&#92;u0F61</code> is thought of as the ya. Oftentimes, <code>&#92;u0F61</code> is thought of as the
nominal representation. */ nominal representation. */
public static boolean isYa(char ch) { public static boolean isYa(char ch) {
@ -207,14 +247,14 @@ public class UnicodeUtils {
|| '\u0FBB' == ch); || '\u0FBB' == ch);
} }
/** Returns true iff there exists at least one character ch in /** Returns true iff there exists at least one codepoint cp in
unicodeString such that ch {@link #isRa(char) is ra} or contains unicodeString such that cp {@link #isRa(char) is ra} or contains
ra (like <code>&#92;u0F77</code>). This method is not implemented ra (like <code>&#92;u0F77</code>). This method is not implemented
as fast as it could be. It calls on the canonicalization code as fast as it could be. It calls on the canonicalization code
in order to maximize reuse and minimize the possibility of in order to maximize reuse and minimize the possibility of
coder error. */ coder error. */
public static boolean containsRa(String unicodeString) { public static boolean containsRa(String unicodeString) {
String canonForm = toCanonicalForm(unicodeString); String canonForm = toMostlyDecomposedUnicode(unicodeString, NORM_NFKD);
for (int i = 0; i < canonForm.length(); i++) { for (int i = 0; i < canonForm.length(); i++) {
if (isRa(canonForm.charAt(i))) if (isRa(canonForm.charAt(i)))
return true; return true;
@ -223,11 +263,13 @@ public class UnicodeUtils {
} }
/** Inefficient shortcut. /** Inefficient shortcut.
@see #containsRa(String) */ @see #containsRa(String) */
public static boolean containsRa(char unicodeChar) { public static boolean containsRa(char unicodeCP) {
return containsRa(new String(new char[] { unicodeChar })); return containsRa(new String(new char[] { unicodeCP }));
} }
public static String unicodeCharToString(char ch) { /** Returns a human-readable, ASCII form of the Unicode codepoint
ch. */
public static String unicodeCPToString(char ch) {
return "U+" + Integer.toHexString((int)ch); return "U+" + Integer.toHexString((int)ch);
} }
} }

View file

@ -21,9 +21,9 @@
syllable. syllable.
<p> <p>
This package allows for turning a string of Unicode characters into This package allows for turning a string of Unicode codepoints into
our <i>TTBIR</i>, our Tibetan Tsheg Bar Internal Representation. our <i>TTBIR</i>, our Tibetan Tsheg Bar Internal Representation.
Said Unicode document may contain non-Tibetan characters also. Said Unicode document may contain non-Tibetan codepoints also.
</p> </p>
</body> </body>