From f4a16f8e9d22a1d00e54bb63d0865a1d8377b0d2 Mon Sep 17 00:00:00 2001 From: dchandler Date: Mon, 9 Dec 2002 01:02:23 +0000 Subject: [PATCH] This commit is for my benefit only; these classes are not ready for prime time, and the build system is not yet aware of them. I'm adding some classes for representing legal tsheg-bars (syllables, for the most part) in Unicode. These classes were designed bottom-up (OK, OK -- they weren't designed designed, but I had to write down everything I knew about Tibetan syntax somewhere). The classes are aware of extended wylie. I doubt the Javadocs work yet, and I'm still testing (and am not committing my testing code with these as it is not yet ready). Next on my list--fix these up to reflect my new awareness of suffix particles (like le'u'i'o) and add classes to support syntactically incorrect Unicode sequences. Then add a UnicodeReader, and we've got the back end of a Tibetan Unicode shaping system (like half of MS's Uniscribe or Apple's Worldscript or FreeType Layout or Omega's OTPs). A top-down design would not have included LegalTshegBar. But now that my itch has been scratched, potential uses are lingering about. For example, it would be nice to scan some input and break it into LegalTshegBars, punctuation/marks/signs, and illegal stacks. Then we could alert the client of the illegality, its precise form, and its precise location. The real system for turning a Unicode stream into an internal representation suitable for conversion to EWTS/ACIP/XHTML/what-have-you need not be aware of Tibetan syntax. But to make the very best conversion from Unicode to, e.g., EWTS, it is necessary to know that gaskad is better represented as gskad, but that jaskad is not the same as jskad. 
--- .../thdl/tib/text/tshegbar/LegalTshegBar.java | 1027 +++++++++++++++++ .../org/thdl/tib/text/tshegbar/TshegBar.java | 68 ++ .../tshegbar/UnicodeCharToExtendedWylie.java | 317 +++++ .../tib/text/tshegbar/UnicodeConstants.java | 98 ++ .../tib/text/tshegbar/UnicodeReadyThunk.java | 63 + .../thdl/tib/text/tshegbar/UnicodeUtils.java | 234 ++++ .../org/thdl/tib/text/tshegbar/package.html | 30 + 7 files changed, 1837 insertions(+) create mode 100644 source/org/thdl/tib/text/tshegbar/LegalTshegBar.java create mode 100644 source/org/thdl/tib/text/tshegbar/TshegBar.java create mode 100644 source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java create mode 100644 source/org/thdl/tib/text/tshegbar/UnicodeConstants.java create mode 100644 source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java create mode 100644 source/org/thdl/tib/text/tshegbar/UnicodeUtils.java create mode 100644 source/org/thdl/tib/text/tshegbar/package.html diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java new file mode 100644 index 0000000..e72a510 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -0,0 +1,1027 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. 
+*/ + +package org.thdl.tib.text.tshegbar; + +import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.util.ThdlDebug; + +/**

A LegalTshegBar is a simple Tibetan syllable or a syllable with + * syntactically legal {@link #getPossibleSuffixParticles() suffix + * particles}. A legal tsheg-bar is not a transliteration of Chinese + * or some other language. It obeys the following properties:

+ * + * + * + *

Note that this class uses only a subset of Unicode to represent + * consonants and vowels. In some situations, you should use {@link + * #EWSUB_wa_zur} to represent the consonant wa, while in others + * you should use {@link #EWC_wa}, even though you mean to subscribe + * a fixed-form wa. Basically, stick to the characters for which + * enumerations exist in {@link UnicodeConstants} and use your + * common sense.

+ * + *

For a pretty good, concise summary of the rules this class + * knows about, see Joe B. Wilson's Translating Buddhism from + * Tibetan from <a href="http://snowlionpubs.com/">Snow Lion + * Publications</a>, Appendix 1, e.g. p. 548.

+ * + * @author David Chandler */ +public class LegalTshegBar + extends TshegBar + implements UnicodeConstants +{ + /** the prefixed consonant or EW_ABSENT */ + private char prefix; + /** the consonant superscribed over the {@link #rootLetter} or + * EW_ABSENT */ + private char headLetter; + /** the root consonant, never EW_ABSENT */ + private char rootLetter; + /** subscribed letter, or EW_ABSENT */ + private char subjoinedLetter; + /** true iff EWSUB_wa_zur is under the root syllable. */ + private boolean hasWaZur; + /** true iff EW_achung is under the root syllable. */ + private boolean hasAChung; + /** null if there is no suffix; else a single character or equal + * to {@link #getConnectiveCaseSuffix()} */ + private String suffix; + /** EWC_da, EWC_sa, or EW_ABSENT */ + private char postsuffix; + /** EWV_i, EWV_u, EWV_e, EWV_o, or EW_ABSENT */ + private char vowel; + + /** Do not use this constructor. */ + private LegalTshegBar() { super(); } + + // DLC FIXME: do we want to accept EWC_ra or EWSUB_ra_btags for + // the root letter, even if there is no head letter? Etc. + /** Constructs a valid Tibetan syllable or throws an exception. + * Use EW_ABSENT (or null in the case of suffix) for + * those parts of the syllable that are absent. The root letter + * must not be absent. To learn about the arguments, and to be + * sure that your input won't cause an exception to be thrown, + * see {@link + * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)}. + * + * @exception IllegalArgumentException if the rootLetter is not + * one of the thirty consonants (and represented nominally, at + * that), or if one of the other arguments is not valid, or if + * postsuffix is present but suffix is absent, etc. 
*/ + public LegalTshegBar(char prefix, char headLetter, char rootLetter, + char subjoinedLetter, + boolean hasWaZur, + boolean hasAChung, + String suffix, char postsuffix, char vowel) + throws IllegalArgumentException + { + super(); + + throwIfNotLegalTshegBar(prefix, headLetter, rootLetter, + subjoinedLetter, hasWaZur, hasAChung, + suffix, postsuffix, vowel); + + this.prefix = prefix; + this.headLetter = headLetter; + this.rootLetter = rootLetter; + this.subjoinedLetter = subjoinedLetter; + + this.hasWaZur = hasWaZur; + this.hasAChung = hasAChung; + + // copying is unnecessary since Java strings are read-only, but + // translating this code to C++ is easier this way. A null suffix + // is legal (it means "no suffix"), so it is kept as null, not copied. + this.suffix = (null == suffix) ? null : new String(suffix); + + this.postsuffix = postsuffix; + this.vowel = vowel; + } + + /** Like {@link + * #LegalTshegBar(char,char,char,char,boolean,boolean,String,char,char)} + * but for the common case where the suffix is simply a + * consonant. */ + public LegalTshegBar(char prefix, char headLetter, char rootLetter, + char subjoinedLetter, + boolean hasWaZur, // DLC FIXME handle this + boolean hasAChung, // DLC FIXME handle this + char suffix, char postsuffix, char vowel) + throws IllegalArgumentException + { + this(prefix, headLetter, rootLetter, subjoinedLetter, + hasWaZur, hasAChung, new String(new char[] { suffix }), + postsuffix, vowel); + } + + + /** Returns the prefixed consonant, or EW_ABSENT if there is no + * prefix. */ + public char getPrefix() { + return prefix; + } + + /** Returns true iff this syllable contains a prefixed + * consonant. */ + public boolean hasPrefix() { + return (EW_ABSENT != prefix); + } + + /** Returns the non-EWSUB_wa_zur consonant subscribed to the root + * consonant, or EW_ABSENT if none is. 
If you want to know if there is a wa-zur, use {@link #hasWaZurSubjoinedToRootLetter()}*/ + public char getSubjoinedLetter() { + return subjoinedLetter; + } + + /** Returns true iff the root letter possesses a subscribed + * consonant ya-btags, ra-btags, la-btags, or wa-zur. */ + public boolean hasSubjoinedLetter() { + return (EW_ABSENT != subjoinedLetter); + } + + public boolean hasWaZurSubjoinedToRootLetter() { + return hasWaZur; + } + + public boolean hasAChungOnRootLetter() { + return hasAChung; + } + + /** Returns null if there is no suffix, or a string containing the + * one consonant or a string "\u0F60\u0F72" + * containing two characters in the special case that the suffix + * is that connective case marker {@link + * #getConnectiveCaseSuffix()}. */ + public String getSuffix() { + return suffix; + } + + /** Returns true iff there is a suffixed consonant or a suffixed + * 'i (DLC FIXME). */ + public boolean hasSuffix() { + return (null != suffix); + } + + /** Returns true iff there is a single, suffixed consonant. This + means that suffixes like 'am, 'i, + 'u, and 'o are not present, but this + does not rule out the presence of a postsuffix. */ + public boolean hasSimpleSuffix() { + return ((null != suffix) && (1 == suffix.length())); + } + + /** If this syllable {@link #hasSimpleSuffix() has a simple + suffix}, this returns it. + @exception Exception if {@link #hasSimpleSuffix()} is not true */ + public char getSimpleSuffix() throws Exception { + if (!hasSimpleSuffix()) + throw new Exception("there isn't a simple suffix"); + return getSuffix().charAt(0); + } + + /** Returns the secondary suffix, which is either + * EWC_da or EWC_sa, or EW_ABSENT if + * there is no postsuffix. */ + public char getPostsuffix() { + return postsuffix; + } + + /** Returns true iff there is a secondary suffix EWC_da or + * EWC_sa. */ + public boolean hasPostsuffix() { + return (EW_ABSENT != postsuffix); + } + + /** Returns true iff this syllable has a 'i + * suffix. 
*/ + public boolean hasConnectiveCaseMarkerSuffix() { + return getSuffix().equals(getConnectiveCaseSuffix()); + } + + /** Returns the root consonant. */ + public char getRootLetter() { + return rootLetter; + } + + /** Returns the head letter of the root stack if it has one, or + * EW_ABSENT otherwise. */ + public char getHeadLetter() { + return headLetter; + } + + /** Returns true iff this syllable has a head letter. */ + public boolean hasHeadLetter() { + return (EW_ABSENT != headLetter); + } + + /** Returns the vowel, or EW_ABSENT if there is no {@link + * hasExplicitVowel() explicit vowel} (the syllable has the + * built-in "ah" sound in this case). */ + public char getVowel() { + // DLC assert this is one of { EWV_i, EWV_u, EWV_e, EWV_o } + return vowel; + } + + /** Returns false iff the implicit, built-in "ah" sound is the + only vowel for the root stack. */ + public boolean hasExplicitVowel() { + return (EW_ABSENT != vowel); + } + + + /** Returns a string of two characters, da and sa. */ + public static String getPossiblePostsuffixes() { + return new String(new char[] { EWC_da, EWC_sa }); + } + + private final static String possibleSuffixes + = new String(new char[] { + EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achen, + EWC_ra, EWC_la, EWC_sa + }); + + /** Returns a string of ten characters, each of which can be a + * suffix in Tibetan. */ + public static String getPossibleSuffixes() { + return possibleSuffixes; + + // DLC unit test that each EWC is a nominal form of a consonant + + // you could use either \u0F62 or \u0F6A, but we won't confuse + // this ra for a ra-mgo, so we use \u0F62, EWC_ra, not + // EWSUB_ra_btags. + } + + private final static String connectiveCaseSuffix + = new String(new char[] { + EWC_achen, EWV_i + }); + + /** Returns a two-character string consisting of the Unicode + * representation of what Extended Wylie calls + * 'i. 
*/ + public static String getConnectiveCaseSuffix() { + return connectiveCaseSuffix; + } + + private final static String thirtyConsonants + = new String(new char[] { + EWC_ka, EWC_kha, EWC_ga, EWC_nga, + EWC_ca, EWC_cha, EWC_ja, EWC_nya, + EWC_ta, EWC_tha, EWC_da, EWC_na, + EWC_pa, EWC_pha, EWC_ba, EWC_ma, + EWC_tsa, EWC_tsha, EWC_dza, EWC_wa, + EWC_zha, EWC_za, EWC_achen, EWC_ya, + EWC_ra, EWC_la, EWC_sha, EWC_sa, + EWC_ha, EWC_a + }); + + /** Returns a String containing the nominal Unicode + * representations of the thirty consonants. The consonants are + * in the usual order you find them in the 8 row by 4 column + * table that students of the language memorize. + * @see org.thdl.tib.text.tshegbar.UnicodeConstants */ + public static String getTheThirtyConsonants() { + ThdlDebug.verify(thirtyConsonants.length() == 30); // DLC put this into a JUnit test to avoid the slow-down. + return thirtyConsonants; + } + + /** Returns true iff x is the preferred, nominal Unicode + * representation of one of the thirty consonants. */ + public static boolean isNominalRepresentationOfConsonant(char x) { + return (-1 != getTheThirtyConsonants().indexOf(x)); + } + + + /** Returns an array of Unicode strings, all the legal suffix + particles. In Extended Wylie, these are: + +

This is not very efficient.

*/ + public static String[] getPossibleSuffixParticles() { + return new String[] { + new String(new char[] { EWC_achen, EWV_i }), + new String(new char[] { EWC_achen, EWV_o }), + new String(new char[] { EWC_achen, EWV_u }), + new String(new char[] { EWC_achen, EWC_ma }), + }; + } + + + /** Returns a String containing the nominal Unicode + * representations of the five prefixes. The prefixes are in + * dictionary order. + * @see org.thdl.tib.text.tshegbar#UnicodeConstants */ + public static String getTheFivePrefixes() { + final String s = new String(new char[] { + EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achen + }); + ThdlDebug.verify(s.length() == 5); // DLC put this into a JUnit test to avoid the slow-down. + return s; + } + + /** Returns true iff x is the preferred, nominal Unicode + * representation of one of the five prefixes. */ + public static boolean isNominalRepresentationOfPrefix(char x) { + return (-1 != getTheFivePrefixes().indexOf(x)); + } + + /** Returns a String containing the nominal Unicode + * representations of the ten suffixes. The suffixes are in + * dictionary order. + * @see #getConnectiveCaseSuffix() + * @see org.thdl.tib.text.tshegbar#UnicodeConstants */ + public static String getTheTenSuffixes() { + final String s = new String(new char[] { + EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, + EWC_ma, EWC_achen, EWC_ra, EWC_la, EWC_sa + }); + ThdlDebug.verify(s.length() == 10); // DLC put this into a JUnit test to avoid the slow-down. + return s; + } + + /** Returns true iff x is the preferred, nominal Unicode + * representation of one of the ten suffixes. + * @see #getConnectiveCaseSuffix() + */ + public static boolean isNominalRepresentationOfSimpleSuffix(char x) { + return (-1 != getTheTenSuffixes().indexOf(x)); + } + + + /** Returns true iff the given (rootLetter, subjoinedLetter) + combination can accept an additional wa-zur. Only g-r-w, + d-r-w, and ph-y-w fall into this category according to + tibwn.ini. (DLC FIXME: are these all legal? 
are any others?) + + @param rootLetter the root consonant (in {@link + UnicodeUtils#isPreferredFormOfConsonant() preferred form} in + you expect true to be returned) + @param subjoinedLetter the letter subscribed to rootLetter, + which should not {@link UnicodeUtils#isWa(char) be wa} if you + expect true to be returned + @return true iff (rootLetter, subjoinedLetter, wa-zur) is a + legal stack. */ + public static boolean takesWaZur(char rootLetter, + char subjoinedLetter) { + + // DLC NOW use this test + + if (EW_ABSENT == subjoinedLetter) { + return isConsonantThatTakesWaZur(rootLetter); + } + if (EWSUB_ra_btags == subjoinedLetter) { + if (EWC_ga == rootLetter + || EWC_da == rootLetter) + return true; + } else if (EWSUB_ya_btags == subjoinedLetter) { + if (EWC_pha == rootLetter) + return true; + } + return false; + } + + /** Returns true iff rootLetter is a consonant to which wa-zur can + * be subjoined (perhaps in addition to another subjoined + * ra-btags or ya-btags. */ + public static boolean isConsonantThatTakesWaZur(char rootLetter) { + return !(EWC_ka != rootLetter + && EWC_kha != rootLetter + && EWC_ga != rootLetter + && EWC_nya != rootLetter + && EWC_da != rootLetter + && EWC_tsa != rootLetter + && EWC_tsha != rootLetter + && EWC_zha != rootLetter + && EWC_za != rootLetter + && EWC_ra != rootLetter + && EWC_la != rootLetter + && EWC_sha != rootLetter + && EWC_pha != rootLetter /* ph-y-w is legal. */ + && EWC_ha != rootLetter); + } + + /** Returns true iff rootLetter is a consonant to which ya-btags + * can be subjoined. */ + public static boolean isConsonantThatTakesYaBtags(char rootLetter) { + return !(EWC_ka != rootLetter + && EWC_kha != rootLetter + && EWC_ga != rootLetter + && EWC_pa != rootLetter + && EWC_pha != rootLetter + && EWC_ba != rootLetter + && EWC_ma != rootLetter + && EWC_ha != rootLetter); + } + + /** Returns true iff rootLetter is a consonant to which la-btags + * can be subjoined. 
*/ + public static boolean isConsonantThatTakesLaBtags(char rootLetter) { + return !(EWC_ka != rootLetter + && EWC_ga != rootLetter + && EWC_ba != rootLetter + && EWC_ra != rootLetter + && EWC_sa != rootLetter + + // this combination is pronounced as a + // prenasaling, low-tone da in my opinion: + && EWC_za != rootLetter); + } + + + /** Returns true iff rootLetter is a consonant to which ra-btags + * can be subjoined. */ + public static boolean isConsonantThatTakesRaBtags(char rootLetter) { + return !(EWC_ka != rootLetter + && EWC_kha != rootLetter + && EWC_ga != rootLetter + && EWC_ta != rootLetter + && EWC_tha != rootLetter + && EWC_da != rootLetter + && EWC_na != rootLetter + && EWC_pa != rootLetter + && EWC_pha != rootLetter + && EWC_ba != rootLetter + && EWC_ma != rootLetter + && EWC_sa != rootLetter + && EWC_ha != rootLetter); + } + + /** Returns true iff rootLetter is a consonant that takes a ra-mgo + * (pronounced rango because ma is a prenasaling prefix) + * head letter */ + public static boolean isConsonantThatTakesRaMgo(char rootLetter) { + return !(EWC_ka != rootLetter + && EWC_ga != rootLetter + && EWC_nga != rootLetter + && EWC_ja != rootLetter + && EWC_nya != rootLetter + && EWC_ta != rootLetter + && EWC_da != rootLetter + && EWC_na != rootLetter + && EWC_ba != rootLetter + && EWC_ma != rootLetter + && EWC_tsa != rootLetter + && EWC_dza != rootLetter); + } + + /** Returns true iff rootLetter is a consonant that takes a la-mgo + * (pronounced lango because ma is a prenasaling prefix) + * head letter */ + public static boolean isConsonantThatTakesLaMgo(char rootLetter) { + return !(EWC_ka != rootLetter + && EWC_ga != rootLetter + && EWC_nga != rootLetter + && EWC_ca != rootLetter + && EWC_ja != rootLetter + && EWC_ta != rootLetter + && EWC_da != rootLetter + && EWC_pa != rootLetter + && EWC_ba != rootLetter + && EWC_ha != rootLetter); // pronunciation exception, btw + } + + /** Returns true iff rootLetter is a consonant that takes a sa-mgo + * 
(pronounced sango because ma is a prenasaling prefix) + * head letter */ + public static boolean isConsonantThatTakesSaMgo(char rootLetter) { + return !(EWC_ka != rootLetter + && EWC_ga != rootLetter + && EWC_nga != rootLetter + && EWC_nya != rootLetter + && EWC_ta != rootLetter + && EWC_da != rootLetter + && EWC_na != rootLetter + && EWC_pa != rootLetter + && EWC_ba != rootLetter + && EWC_ma != rootLetter + && EWC_tsa != rootLetter); + } + + /** Returns true iff the given arguments form a legal Tibetan + * syllable. + * + * @param prefix the optional, prefixed consonant + * @param headLetter the optional superscribed consonant + * @param rootLetter the mandatory root consonant + * @param subjoinedLetter the optional, subscribed consonant + * @param suffix the optional suffix, which is null, a String + * consisting of a single consonant (i.e. a single character) + * except in the special case that this is {@link + * #getConnectiveCaseSuffix()} + * @param postsuffix the optional postsuffix, which should be + * EWC_sa or EWC_da + * @param vowel the optional vowel */ + public static boolean formsLegalTshegBar(char prefix, + char headLetter, + char rootLetter, + char subjoinedLetter, + boolean hasWaZur, // DLC FIXME handle this + boolean hasAChung, // DLC FIXME handle this + String suffix, + char postsuffix, + char vowel) + { + try { + return internalLegalityTest(prefix, headLetter, rootLetter, + subjoinedLetter, hasWaZur, hasAChung, + suffix, postsuffix, vowel, false); + } catch (IllegalArgumentException e) { + throw new Error("This simply cannot happen, but it did."); + } + } + + /** Like {@link + * #formsLegalTshegBar(char,char,char,char,boolean,boolean,String,char,char) + * but for the common case where the suffix is simply a consonant. 
*/ + public static boolean formsLegalTshegBar(char prefix, + char headLetter, + char rootLetter, + char subjoinedLetter, + boolean hasWaZur, // DLC FIXME handle this + boolean hasAChung, // DLC FIXME handle this + char suffix, + char postsuffix, + char vowel) + { + return formsLegalTshegBar(prefix, headLetter, rootLetter, + subjoinedLetter, hasWaZur, hasAChung, + new String(new char[] { suffix }), + postsuffix, vowel); + } + + + /** If you get through this gauntlet without having an exception + * thrown, then this combination makes a legal Tibetan syllable. + * @exception IllegalArgumentException if the syllable does not + * follow the rules of a Tibetan syllable. To learn about the + * arguments, see {@link + * #formsLegalTshegBar(char,char,char,char,String,char,char)}. */ + private static void throwIfNotLegalTshegBar(char prefix, + char headLetter, + char rootLetter, + char subjoinedLetter, + boolean hasWaZur, // DLC FIXME handle this + boolean hasAChung, // DLC FIXME handle this + String suffix, + char postsuffix, + char vowel) + throws IllegalArgumentException + { + internalLegalityTest(prefix, headLetter, rootLetter, + subjoinedLetter, hasWaZur, hasAChung, + suffix, postsuffix, vowel, true); + } + + /** Voodoo. Stand back. */ + private static boolean internalThrowThing(boolean doThrow, String msg) + { + if (doThrow) + throw new IllegalArgumentException(msg); + return false; + } + + /** If you get through this gauntlet without having an exception + * thrown, then this combination makes a legal Tibetan syllable. + * To learn about the arguments, see {@link + * #formsLegalTshegBar(char,char,char,char,String,char,char)}. 
+ * @return true if this syllable is legal, false if this syllable + * is illegal and throwIfIllegal is false, does not return if + * this syllable is illegal and throwIfIllegal is true + * @exception IllegalArgumentException if the syllable does not + * follow the rules of a Tibetan syllable and throwIfIllegal is + * true */ + private static boolean internalLegalityTest(char prefix, + char headLetter, + char rootLetter, + char subjoinedLetter, + boolean hasWaZur, // DLC FIXME handle this + boolean hasAChung, // DLC FIXME handle this + String suffix, + char postsuffix, + char vowel, + boolean throwIfIllegal) + throws IllegalArgumentException + { + if (!isNominalRepresentationOfConsonant(rootLetter)) + return internalThrowThing(throwIfIllegal, + "The root letter must be one of the standard thirty Tibetan consonants, and must be represented nominally, not, for example, by FIXED-FORM RA (\\u0F6A)"); + + if (EW_ABSENT != prefix) { + // Ensure that this prefix is one of the five prefixes, + // and that it can go with this root letter: + if (!isNominalRepresentationOfPrefix(prefix)) + return internalThrowThing(throwIfIllegal, + "The prefix is not absent, so it must be one of the five possible prefixes."); + // DLC test that it can go with the root letter. 
+ } + + if (EW_ABSENT != subjoinedLetter) { + if (EWSUB_ya_btags == subjoinedLetter) { + if (!isConsonantThatTakesYaBtags(rootLetter)) { + return internalThrowThing(throwIfIllegal, + "Cannot subscribe ya-btags to that root letter."); + } + } else if (EWSUB_ra_btags == subjoinedLetter) { + if (!isConsonantThatTakesRaBtags(rootLetter)) { + return internalThrowThing(throwIfIllegal, + "Cannot subscribe ra-btags to that root letter."); + } + } else if (EWSUB_la_btags == subjoinedLetter) { + if (!isConsonantThatTakesLaBtags(rootLetter)) { + return internalThrowThing(throwIfIllegal, + "Cannot subscribe la-btags to that root letter."); + } + } else if (EWSUB_wa_zur == subjoinedLetter) { + throw new Error("DLC FIXME: can this happen? wa-zur comes in via the boolean argument hasWaZur, not via subjoinedLetter."); + } else { + // check for a common mistake: + if ('\u0FBA' == subjoinedLetter + || '\u0FBB' == subjoinedLetter + || '\u0FBC' == subjoinedLetter) + { + return internalThrowThing(throwIfIllegal, + "The subjoined letter given is subjoinable, but you gave the fixed-form variant, which is not used in Tibetan syllables but is sometimes used in Tibetan transliteration of Sanskrit, Chinese, or some non-Tibetan language."); + } + return internalThrowThing(throwIfIllegal, + "The subjoined letter given is not one of the four consonants that may be subscribed."); + } + } // subjoinedLetter tests + + // Suffix tests: + // DLC NOW -- allow 'o, 'u, 'am, etc. 
+ if (null != suffix) { + if (!getConnectiveCaseSuffix().equals(suffix)) { + if (suffix.length() != 1) { + return internalThrowThing(throwIfIllegal, + "Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am."); + } + if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { + return internalThrowThing(throwIfIllegal, + "Illegal suffix -- not one of the ten legal suffixes: " + + UnicodeUtils.unicodeCharToString(suffix.charAt(0))); + } + } + } + if (EW_ABSENT != postsuffix) { + if (null == suffix) + return internalThrowThing(throwIfIllegal, + "You cannot have a postsuffix unless you also have a suffix."); + } + + if (EW_ABSENT != headLetter) { + if (EWC_ra == headLetter) { + if (!isConsonantThatTakesRaMgo(rootLetter)) { + return internalThrowThing(throwIfIllegal, + "The head letter ra cannot be used with that root letter."); + } + } else if (EWC_la == headLetter) { + if (!isConsonantThatTakesLaMgo(rootLetter)) { + return internalThrowThing(throwIfIllegal, + "The head letter la cannot be used with that root letter."); + } + } else if (EWC_sa == headLetter) { + if (!isConsonantThatTakesSaMgo(rootLetter)) { + // handle a common error specially: + if (EWC_la == rootLetter) + return internalThrowThing(throwIfIllegal, + "sa cannot be a head letter atop the root letter la. You probably meant to have sa the root letter and la the subjoined letter."); + + return internalThrowThing(throwIfIllegal, + "The head letter sa cannot be used with that root letter."); + } + } else { + // '\u0F6A' is not a valid head letter, even for + // "rnya". Use EWC_ra instead. + return internalThrowThing(throwIfIllegal, + "The head letter given is not valid."); + } + } // headLetter tests + + // Now see if the vowel is valid: + if (EW_ABSENT /* built-in "ah" sound */ != vowel) { + if (EWV_i != vowel + && EWV_u != vowel + && EWV_e != vowel + && EWV_o != vowel) + { + if (EWC_achen == vowel) + return internalThrowThing(throwIfIllegal, + "The vowel given is not valid. 
Use EW_ABSENT for the EWC_achen sound."); + if ('\u0F71' == vowel) + return internalThrowThing(throwIfIllegal, + "a-chung cannot be used in a simple Tibetan syllable."); + return internalThrowThing(throwIfIllegal, + "The vowel given is not valid."); + } + } + + // Phew. We got here, so this combination of inputs is valid. + return true; + } + + + /* + DLC add a method giving the correct connective case thingy or + throwing error if the 'i suffix already appears. + + DLC put in a method that gets pronunciation using Unicode + diacritical marks. And another using just US Roman. Note that + pronunciation is contextual, so have these methods return all + valid pronunciations, such as both "pa" and "wa" for EWC_ba. + + DLC would be nice in the appropriate class: boolean + isTransliteratedSanskrit(), boolean isTransliteratedChinese() + (design: contains fa or va, maybe?). */ + + /** Returns a StringBuffer that holds the extended wylie + * representation of this syllable. */ + public StringBuffer getExtendedWylie() { + StringBuffer sb = new StringBuffer(); + char rootLetter = getRootLetter(); + if (hasPrefix()) { + // if there is a prefix but no head letter and (prefix, + // rootLetter) is ambiguous, i.e. if it could be mistaken + // for a legal (rootLetter, subjoinedLetter) combination, + // then put out prefix,disambiguator. else just put out + // prefix. 
+ + boolean disambiguatorNeeded = false; + char prefix = getPrefix(); + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(prefix)); + if (!hasHeadLetter()) { + if (EWC_ya == rootLetter) { + if (isConsonantThatTakesYaBtags(prefix)) + disambiguatorNeeded = true; + } else if (EWC_ra == rootLetter) { + if (isConsonantThatTakesRaBtags(prefix)) + disambiguatorNeeded = true; + } else if (EWC_la == rootLetter) { + if (isConsonantThatTakesLaBtags(prefix)) + disambiguatorNeeded = true; + } else if (EWC_wa == rootLetter) { + if (isConsonantThatTakesWaZur(prefix)) + disambiguatorNeeded = true; + } + } + if (disambiguatorNeeded) + sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + } + if (hasHeadLetter()) + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getHeadLetter())); + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(rootLetter)); + if (hasSubjoinedLetter()) + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getSubjoinedLetter())); + if (hasWaZurSubjoinedToRootLetter()) + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EWSUB_wa_zur)); + + // a-chung is treated, in Extended Wylie, like a vowel. I.e., + // you don't have 'pAa', you have 'pA'. + if (hasAChungOnRootLetter()) { + if (hasExplicitVowel()) { + if (EWV_i == getVowel()) { + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F73')); + } else if (EWV_u == getVowel()) { + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar('\u0F75')); + } else if (EWV_e == getVowel() || EWV_o == getVowel()) { + // The exception to the rule for a-chung and vowels... + + // DLC FIXME: are these allowed in legal Tibetan? + // EWTS would have special cases for them if so, + // I'd wager... 
+ sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); + } else { + ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); + } + } else { + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(EW_achung)); + } + } else { + if (hasExplicitVowel()) + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getVowel())); + else + sb.append("a"); + } + + if (hasSuffix()) { + String suf = getSuffix(); + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(0))); + if (suf.length() > 1) { + // DLC assert, don't verify, that the length is two. + // This could change if I learn of more suffix + // particles. + ThdlDebug.verify(2 == suf.length()); + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(suf.charAt(1))); + } + } + if (hasPostsuffix()) + sb.append(UnicodeCharToExtendedWylie.getExtendedWylieForUnicodeChar(getPostsuffix())); + return sb; + } + + + // DLC: toXML for the dense XML + /** Returns a element that contains only + * the Extended Wylie transliteration for the whole syllable and a note that the . */ + public String toConciseXML() { + // DLC version-control the EWTS document. 0.5 is used below: + return (""); + } + + /** Returns a element that contains the + * syllable broken-down into its constituent vowel and + * consonants. */ + public String toVerboseXML() { + // DLC version-control the EWTS document. 0.5 is used below: + return (""); + } + + + /** Overrides {@link org.thdl.tib.text.tshegbar#UnicodeReadyThunk + method to return {@link UnicodeUtils#toCanonicalForm(String) + canonically-formed Unicode}. 
+ @exception UnsupportedOperationException is never thrown */ + public String getEquivalentUnicode() { + StringBuffer sb = new StringBuffer(); + if (hasPrefix()) { + ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix())); + sb.append(getPrefix()); + } + if (hasHeadLetter()) { + // DLC FIXME this crap won't be true... + ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix())); + ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getRootLetter())); + sb.append(getHeadLetter()); + } else { + ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getRootLetter())); + } + sb.append(getRootLetter()); + if (hasSubjoinedLetter()) { + ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getSubjoinedLetter())); + sb.append(getSubjoinedLetter()); + } + if (hasWaZurSubjoinedToRootLetter()) { + ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(EWSUB_wa_zur)); + sb.append(EWSUB_wa_zur); + } + if (hasAChungOnRootLetter()) { + ThdlDebug.verify('\u0F71' == EW_achung); + sb.append(EW_achung); + } + if (hasExplicitVowel()) { + sb.append(getVowel()); + } + if (hasSuffix()) { + ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getSuffix().charAt(0))); + sb.append(getSuffix()); + } + if (hasPostsuffix()) { + ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPostsuffix())); + sb.append(getPostsuffix()); + } + return sb.toString(); + } + + /** Overrides {@link org.thdl.tib.text.tshegbar#UnicodeReadyThunk + method to return true. */ + public boolean hasEquivalentUnicode() { + return true; + } + + + /** Returns a descriptive XML element. 
*/ + public String toString() { + return toConciseXML(); + } +} diff --git a/source/org/thdl/tib/text/tshegbar/TshegBar.java b/source/org/thdl/tib/text/tshegbar/TshegBar.java new file mode 100644 index 0000000..5a560f6 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/TshegBar.java @@ -0,0 +1,68 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +/** A TshegBar (pronounced tsek bar) is roughly a Tibetan + * syllable. In truth, it is the stuff between two tseks. + * + *

First, some terminology.

+ * + *
  • When we talk about a glyph, we mean a picture + * found in a font. A single glyph may have one or more + * representations by sequences of Unicode characters, or it may not + * be representable because it is only part of one Unicode character + * or pictures a nonstandard character.
  • When we talk about a + * stack, we mean either a number (or half-number), a mark or + * sign, a bit of punctuation, or a consonant stack.
  • A + * consonant stack is one or more consonants stacked + * vertically, plus an optional vocalic modification such as an + * anusvara (DLC what do we call a bindu?) or visarga, plus zero or + * more signs like \u0F35, plus an optional a-chung + * (\u0F71), plus an optional simple vowel.
  • By + * simple vowel, we mean any of \u0F72, + * \u0F74, \u0F7A, \u0F7B, + * \u0F7C, \u0F7D, or + * \u0F80.
+ * + * (Note: The string "\u0F68\u0F7E\u0F7C" seems to equal + * "\u0F00", though the Unicode standard does not + * indicate that it is so. This code treats it that way.)

+ * + *

This class allows for invalid tsheg bars, like those + * containing more than one prefix, more than two suffixes, an + * invalid postsuffix (secondary suffix), more than one consonant + * stack (excluding the special case of what we call in Extended + * Wylie "'i", which is technically a consonant stack but is used in + * Tibetan like a suffix).

. + * + *

Subclasses exist for valid, grammatically correct tsheg bars, + * and for invalid tsheg bars. Note that correctness is at the tsheg + * bar level only; it may be grammatically incorrect to concatenate + * two valid tsheg bars. Some subclasses can be represented in + * Unicode, but others contain nonstandard glyphs and cannot be.

+ * + * @author David Chandler + */ +public abstract class TshegBar implements UnicodeReadyThunk { + /** Returns true, as we consider a transliteration in the Tibetan + * alphabet of a non-Tibetan language, say Chinese, as being + * Tibetan. + * @return true */ + public boolean isTibetan() { return true; } +} diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java new file mode 100644 index 0000000..bac731c --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/UnicodeCharToExtendedWylie.java @@ -0,0 +1,317 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +import org.thdl.tib.text.TibetanMachineWeb; + +/** This noninstantiable class allows for converting from Unicode + * characters (i.e., code points) to Extended Wylie. It cannot be + * used for long stretches of text, though, as it is unaware of + * context, which is essential to understanding a non-trivial string + * of Tibetan Unicode. + * + *

See the document by Nathaniel Garson and David Germano entitled + * Extended Wylie Transliteration Scheme. Note that there are + * a couple of issues with the November 18, 2001 revision of that + * document; these issues are in the Bugs tracker at {@see + * http://sourceforge.net/projects/thdltools}.

+ * + * @author David Chandler */ +public class UnicodeCharToExtendedWylie { + + /** Returns the extended Wylie for the very simple sequence x. + * Returns null iff some (Unicode) char in s has no extended + * Wylie representation. This is unaware of context, so use it + * sparingly. */ + public static StringBuffer getExtendedWylieForUnicodeString(String x) { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < x.length(); i++) { + String ew = getExtendedWylieForUnicodeChar(x.charAt(i)); + if (null == ew) + return null; + sb.append(ew); + } + return sb; + } + + /** Returns the extended Wylie for x, or null if there is none. + * Understand that multiple Unicode code points (chars) map to + * the same Extended Wylie representation. Understand also that + * the scrap of Extended Wylie returned is only valid in certain + * contexts. For example, not all consonants take ra-btags. DLC NOW what about canonicalization? */ + public static String getExtendedWylieForUnicodeChar(char x) { + switch (x) { + + case '\u0F00': return "oM"; + case '\u0F01': return null; + case '\u0F02': return null; + case '\u0F03': return null; + case '\u0F04': return "@"; + case '\u0F05': return "#"; + case '\u0F06': return "$"; + case '\u0F07': return "%"; + case '\u0F08': return "!"; + case '\u0F09': return null; + case '\u0F0A': return null; + case '\u0F0B': return " "; + case '\u0F0C': return "*"; // DLC NOW: Jskad does not support this! 
+ case '\u0F0D': return "/"; + case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D" + case '\u0F0F': return ";"; + + case '\u0F10': return "["; + case '\u0F11': return "|"; + case '\u0F12': return "]"; + case '\u0F13': return "`"; + case '\u0F14': return ":"; + case '\u0F15': return null; + case '\u0F16': return null; + case '\u0F17': return null; + case '\u0F18': return null; + case '\u0F19': return null; + case '\u0F1A': return null; + case '\u0F1B': return null; + case '\u0F1C': return null; + case '\u0F1D': return null; + case '\u0F1E': return null; + case '\u0F1F': return null; + + case '\u0F20': return "0"; + case '\u0F21': return "1"; + case '\u0F22': return "2"; + case '\u0F23': return "3"; + case '\u0F24': return "4"; + case '\u0F25': return "5"; + case '\u0F26': return "6"; + case '\u0F27': return "7"; + case '\u0F28': return "8"; + case '\u0F29': return "9"; + case '\u0F2A': return null; + case '\u0F2B': return null; + case '\u0F2C': return null; + case '\u0F2D': return null; + case '\u0F2E': return null; + case '\u0F2F': return null; + + case '\u0F30': return null; + case '\u0F31': return null; + case '\u0F32': return null; + case '\u0F33': return null; + case '\u0F34': return "="; + case '\u0F35': return null; + case '\u0F36': return null; + case '\u0F37': return null; + case '\u0F38': return null; + case '\u0F39': return null; + case '\u0F3A': return "<"; + case '\u0F3B': return ">"; + case '\u0F3C': return "("; + case '\u0F3D': return ")"; + case '\u0F3E': return "{"; + case '\u0F3F': return "}"; + + case '\u0F40': return "k"; + case '\u0F41': return "kh"; + case '\u0F42': return "g"; + case '\u0F43': return (getExtendedWylieForUnicodeChar('\u0F42') + + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
+ + getExtendedWylieForUnicodeChar('\u0FB7')); + case '\u0F44': return "ng"; + case '\u0F45': return "c"; + case '\u0F46': return "ch"; + case '\u0F47': return "j"; + case '\u0F48': return null; + case '\u0F49': return "ny"; + case '\u0F4A': return "T"; + case '\u0F4B': return "Th"; + case '\u0F4C': return "D"; + case '\u0F4D': return (getExtendedWylieForUnicodeChar('\u0F4C') + + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? + + getExtendedWylieForUnicodeChar('\u0FB7')); + case '\u0F4E': return "N"; + case '\u0F4F': return "t"; + + case '\u0F50': return "th"; + case '\u0F51': return "d"; + case '\u0F52': return (getExtendedWylieForUnicodeChar('\u0F51') + + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? + + getExtendedWylieForUnicodeChar('\u0FB7')); + case '\u0F53': return "n"; + case '\u0F54': return "p"; + case '\u0F55': return "ph"; + case '\u0F56': return "b"; + case '\u0F57': return (getExtendedWylieForUnicodeChar('\u0F56') + + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? + + getExtendedWylieForUnicodeChar('\u0FB7')); + case '\u0F58': return "m"; + case '\u0F59': return "ts"; + case '\u0F5A': return "tsh"; + case '\u0F5B': return "dz"; + case '\u0F5C': return (getExtendedWylieForUnicodeChar('\u0F5B') + + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? + + getExtendedWylieForUnicodeChar('\u0FB7')); + case '\u0F5D': return "w"; + case '\u0F5E': return "zh"; + case '\u0F5F': return "z"; + + case '\u0F60': return "'"; + case '\u0F61': return "y"; + case '\u0F62': return "r"; + case '\u0F63': return "l"; + case '\u0F64': return "sh"; + case '\u0F65': return "Sh"; + case '\u0F66': return "s"; + case '\u0F67': return "h"; + case '\u0F68': return "a"; // DLC: maybe the empty string is OK here because typing just 'i' into Jskad causes root letter \u0F68 to appear... yuck... 
+ case '\u0F69': return (getExtendedWylieForUnicodeChar('\u0F40') + + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? + + getExtendedWylieForUnicodeChar('\u0FB5')); + case '\u0F6A': return "r"; + case '\u0F6B': return null; + case '\u0F6C': return null; + case '\u0F6D': return null; + case '\u0F6E': return null; + case '\u0F6F': return null; + + case '\u0F70': return null; + case '\u0F71': return "A"; + case '\u0F72': return "i"; + case '\u0F73': return "I"; + case '\u0F74': return "u"; + case '\u0F75': return "U"; + case '\u0F76': return "r-i"; // DLC Ri or r-i? I put in a bug report. + case '\u0F77': return "r-I"; // DLC or RI? + case '\u0F78': return "l-i"; + case '\u0F79': return "l-I"; + case '\u0F7A': return "e"; + case '\u0F7B': return "ai"; + case '\u0F7C': return "o"; + case '\u0F7D': return "au"; + case '\u0F7E': return "M"; + case '\u0F7F': return "H"; + + case '\u0F80': return "-i"; + case '\u0F81': return "-I"; + case '\u0F82': return "~^";// DLC unsupported in Jskad + case '\u0F83': return "~"; // DLC unsupported in Jskad + case '\u0F84': return "?"; + case '\u0F85': return "&"; + case '\u0F86': return null; + case '\u0F87': return null; + case '\u0F88': return null; + case '\u0F89': return null; + case '\u0F8A': return null; + case '\u0F8B': return null; + case '\u0F8C': return null; + case '\u0F8D': return null; + case '\u0F8E': return null; + case '\u0F8F': return null; + + case '\u0F90': return "k"; + case '\u0F91': return "kh"; + case '\u0F92': return "g"; + case '\u0F93': return (getExtendedWylieForUnicodeChar('\u0F92') + + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right? 
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F94': return "ng";
+        case '\u0F95': return "c";
+        case '\u0F96': return "ch";
+        case '\u0F97': return "j";
+        case '\u0F98': return null;
+        case '\u0F99': return "ny";
+        case '\u0F9A': return "T";
+        case '\u0F9B': return "Th";
+        case '\u0F9C': return "D";
+        // U+0F9D decomposes to U+0F9C + U+0FB7 (the same decomposition
+        // UnicodeUtils.toCanonicalForm(char) uses), mirroring the
+        // non-subjoined U+0F4D case above.  Building it from U+0F92
+        // yielded "g"-based output where "D"-based output is required.
+        case '\u0F9D': return (getExtendedWylieForUnicodeChar('\u0F9C') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0F9E': return "N";
+        case '\u0F9F': return "t";
+
+        case '\u0FA0': return "th";
+        case '\u0FA1': return "d";
+        case '\u0FA2': return (getExtendedWylieForUnicodeChar('\u0FA1') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0FA3': return "n";
+        case '\u0FA4': return "p";
+        case '\u0FA5': return "ph";
+        case '\u0FA6': return "b";
+        case '\u0FA7': return (getExtendedWylieForUnicodeChar('\u0FA6') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0FA8': return "m";
+        case '\u0FA9': return "ts";
+        case '\u0FAA': return "tsh";
+        case '\u0FAB': return "dz";
+        case '\u0FAC': return (getExtendedWylieForUnicodeChar('\u0FAB') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+                               + getExtendedWylieForUnicodeChar('\u0FB7'));
+        case '\u0FAD': return "w";
+        case '\u0FAE': return "zh";
+        case '\u0FAF': return "z";
+
+        case '\u0FB0': return "'";
+        case '\u0FB1': return "y";
+        case '\u0FB2': return "r";
+        case '\u0FB3': return "l";
+        case '\u0FB4': return "sh";
+        case '\u0FB5': return "Sh";
+        case '\u0FB6': return "s";
+        case '\u0FB7': return "h";
+        case '\u0FB8': return "a"; // DLC see note on \u0F68 ...
+        case '\u0FB9': return (getExtendedWylieForUnicodeChar('\u0F90') + TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY // DLC FIXME: is this right?
+ + getExtendedWylieForUnicodeChar('\u0FB5')); + case '\u0FBA': return "w"; + case '\u0FBB': return "y"; + case '\u0FBC': return "r"; + case '\u0FBD': return null; + case '\u0FBE': return null; + case '\u0FBF': return null; + + case '\u0FC0': return null; + case '\u0FC1': return null; + case '\u0FC2': return null; + case '\u0FC3': return null; + case '\u0FC4': return null; + case '\u0FC5': return null; + case '\u0FC6': return null; + case '\u0FC7': return null; + case '\u0FC8': return null; + case '\u0FC9': return null; + case '\u0FCA': return null; + case '\u0FCB': return null; + case '\u0FCC': return null; + case '\u0FCD': return null; + case '\u0FCE': return null; + case '\u0FCF': return ""; // DLC i added this to the 'EWTS document misspeaks' bug report... null I think... + + default: { + // DLC handle space (EW's "_") + + // This character is in the range 0FD0-0FFF or is not in + // the Tibetan range at all. In either case, there is no + // corresponding Extended Wylie. + return null; + } + } // end switch + } +} + diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java new file mode 100644 index 0000000..7c8a315 --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/UnicodeConstants.java @@ -0,0 +1,98 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. 
+ +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +/** Provides handy Extended Wylie-inspired names for Unicode + * characters commonly used to represent Tibetan. The consonant that + * the Extended Wylie text "ka" refers to is named EWC_ka as in "The + * Extended Wylie Consonant ka", the vowel represented in Extended + * Wylie by "i" is EWV_i, and so on. There is at least one exception + * to the naming scheme, but exceptions are well-commented. + * + * @see org.thdl.tib.text.tshegbar#ValidTshegBar + * + * @author David Chandler */ +public interface UnicodeConstants { + + /** for those times when you need a char to represent a non-existent character */ + static final char EW_ABSENT = '\u0000'; + + // the thirty consonants, in alphabetical order: + + /** first letter of the alphabet: */ + static final char EWC_ka = '\u0F40'; + + static final char EWC_kha = '\u0F41'; + static final char EWC_ga = '\u0F42'; + static final char EWC_nga = '\u0F44'; + static final char EWC_ca = '\u0F45'; + static final char EWC_cha = '\u0F46'; + static final char EWC_ja = '\u0F47'; + static final char EWC_nya = '\u0F49'; + static final char EWC_ta = '\u0F4F'; + static final char EWC_tha = '\u0F50'; + static final char EWC_da = '\u0F51'; + static final char EWC_na = '\u0F53'; + static final char EWC_pa = '\u0F54'; + static final char EWC_pha = '\u0F55'; + static final char EWC_ba = '\u0F56'; + static final char EWC_ma = '\u0F58'; + static final char EWC_tsa = '\u0F59'; + static final char EWC_tsha = '\u0F5A'; + static final char EWC_dza = '\u0F5B'; + static final char EWC_wa = '\u0F5D'; + static final char EWC_zha = '\u0F5E'; + static final char EWC_za = '\u0F5F'; + /** Note the irregular name. The Extended Wylie representation is + 'a. */ + static final char EWC_achen = '\u0F60'; /* DLC NOW is this achen or achung? achen is EWC_a, right? comment it. replace EWC_achen everywhere if you change it. 
*/ + static final char EWC_ya = '\u0F61'; + static final char EWC_ra = '\u0F62'; + static final char EWC_la = '\u0F63'; + static final char EWC_sha = '\u0F64'; + static final char EWC_sa = '\u0F66'; + static final char EWC_ha = '\u0F67'; + static final char EWC_a = '\u0F68'; + + /** In the word for father, "pA lags", there is an a-chung (i.e., + \u0F71). This is the constant for that little + guy. */ + static final char EW_achung = '\u0F71'; + + /* Four of the five vowels, some say, or, others say, "the four + vowels": */ + /** "gi gu" (DLC?), the 'i' sound in the English word keep: */ + static final char EWV_i = '\u0F72'; + /** "zhabs kyu", the 'u' sound in the English word tune: */ + static final char EWV_u = '\u0F74'; + /** "'greng bu" (also known as "'greng po", and pronounced dang-bo), the 'a' sound in the English word gate: */ + static final char EWV_e = '\u0F7A'; + /** "na ro" (DLC?), the 'o' sound in the English word bone: */ + static final char EWV_o = '\u0F7C'; + + + /** subscribed form of EWC_wa, a.k.a. wa-btags */ + static final char EWSUB_wa_zur = '\u0FAD'; + /** subscribed form of EWC_ya */ + static final char EWSUB_ya_btags = '\u0FB1'; + /** subscribed form of EWC_ra */ + static final char EWSUB_ra_btags = '\u0FB2'; + /** subscribed form of EWC_la */ + static final char EWSUB_la_btags = '\u0FB3'; +} diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java b/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java new file mode 100644 index 0000000..e85a42d --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/UnicodeReadyThunk.java @@ -0,0 +1,63 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. 
See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +/** A UnicodeReadyThunk represents a string of characters. While + * there are ways to turn a string of Unicode characters into a list + * of UnicodeReadyThunks (DLC reference it), you cannot + * necessarily recover the exact sequence of Unicode characters from + * a UnicodeReadyThunk. For characters that are not Tibetan + * Unicode and are not one of a handful of other known characters, + * only the most primitive operations are available. Generally in + * this case you can recover the exact string of Unicode characters, + * but don't bank on it. + * + * @author David Chandler + */ +public interface UnicodeReadyThunk { + + /** Returns true iff this thunk is entirely Tibetan (regardless of + whether or not all characters come from the Tibetan range of + Unicode 3, i.e. 0x0F00-0x0FFF). */ + public boolean isTibetan(); + + /** Returns a sequence of Unicode characters that is equivalent to + * this thunk if possible. It is only possible if {@link + * #hasEquivalentUnicode()} is true. Unicode has more than one + * way to refer to the same language element, so this is just one + * method. When more than one Unicode sequence exists, and when + * the thunk {@link #isTibetan() is Tibetan}, this method returns + * sequences that the Unicode 3.2 standard does not discourage. + * @exception UnsupportedOperationException if {@link + * #hasEquivalentUnicode()} is false + * @return a String of Unicode characters */ + public String getEquivalentUnicode() throws UnsupportedOperationException; + + /** Returns true iff there exists a sequence of Unicode characters + * that correctly represents this thunk. 
This will not be the + * case if the thunk contains Tibetan characters for which the + * Unicode standard does not provide. See the Extended Wylie + * Transliteration System (EWTS) document (DLC ref, DLC mention + * Dza,fa,va doc bug) for more info, and see the Unicode 3 + * standard section 9.13. The presence of head marks or multiple + * vowels in the thunk would cause this to return false, for + * example. */ + public boolean hasEquivalentUnicode(); +} + diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java new file mode 100644 index 0000000..413cb4a --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtils.java @@ -0,0 +1,234 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.tshegbar; + +/**

This non-instantiable class contains utility routines for + * dealing with Tibetan Unicode characters and strings of such + * characters.

+ * + * @author David Chandler */ +public class UnicodeUtils { + /** Do not use this, as this class is not instantiable. */ + private UnicodeUtils() { super(); } + + /** Returns true iff x is a Unicode character that represents a + consonant or two-consonant stack that has a Unicode code + point. Returns true only for the usual suspects (like + \u0F40) and for Sanskrit consonants (like + \u0F4C) and the simple two-consonant stacks in + Unicode (like \u0F43). Returns false for, among + other things, subjoined consonants like + \u0F90. */ + public static boolean isNonSubjoinedConsonant(char x) { + return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */) + && (x >= '\u0F40' && x <= '\u0F6A')); + } + + /** Returns true iff x is a Unicode character that represents a + subjoined consonant or subjoined two-consonant stack that has + a Unicode code point. Returns true only for the usual + suspects (like \u0F90) and for Sanskrit + consonants (like \u0F9C) and the simple + two-consonant stacks in Unicode (like \u0FAC). + Returns false for, among other things, non-subjoined + consonants like \u0F40. */ + public static boolean isSubjoinedConsonant(char x) { + return ((x != '\u0F98' /* reserved in Unicode 3.2, but not in use */) + && (x >= '\u0F90' && x <= '\u0FBC')); + } + + /** Returns true iff x is the preferred representation of a + Tibetan or Sanskrit consonant and cannot be broken down any + further. Returns false for, among other things, subjoined + consonants like \u0F90, two-component consonants + like \u0F43, and fixed-form consonants like + '\u0F6A'. The new consonants (for transcribing Chinese, I + believe) "\u0F55\u0F39" (which EWTS calls "fa"), + "\u0F56\u0F39" ("va"), and "\u0F5F\u0F39" ("Dza") are + two-character sequences, but you should be aware of them + also. 
*/ + public static boolean isPreferredFormOfConsonant(char x) { + return ((x != '\u0F48' /* reserved in Unicode 3.2, but not in use */) + && (x >= '\u0F40' && x <= '\u0F68') + && (x != '\u0F43') + && (x != '\u0F4D') + && (x != '\u0F52') + && (x != '\u0F57') + && (x != '\u0F5C')); + } + + /** Returns true iff unicodeChar is a character from the Unicode + range U+0F00-U+0FFF. + @see #isEntirelyTibetanUnicode(String) */ + public static boolean isInTibetanRange(char unicodeChar) { + return (unicodeChar >= '\u0F00' && unicodeChar <= '\u0FFF'); + } + + /** Returns true iff unicodeString consists only of characters + from the Unicode range U+0F00-U+0FFF. (Note that these + characters are typically not enough to represent a Tibetan + text, you may need ZWSP (zero-width space) and various + whitespace from other ranges.) */ + public static boolean isEntirelyTibetanUnicode(String unicodeString) { + for (int i = 0; i < unicodeString.length(); i++) { + if (!isInTibetanRange(unicodeString.charAt(i))) + return false; + } + return true; + } + + /** Modifies tibetanUnicode so that it is equivalent, according to + the Unicode 3.2 standard, to the input buffer. The Tibetan + passages of the returned string are in THDL-canonical form, + however. This form uses a maximum of characters, in general, + and never uses characters whose use has been {@link + #isDiscouraged(char) discouraged}. If the input contains + characters for which {@link #isInTibetanRange(char)} is not + true, then they will not be modified. + +

Note well that only well-formed input guarantees + well-formed output.

*/ + public static void toCanonicalForm(StringBuffer tibetanUnicode) { + int offset = 0; + while (offset < tibetanUnicode.length()) { + String s = toCanonicalForm(tibetanUnicode.charAt(offset)); + if (null == s) { + ++offset; + } else { + // modify tibetanUnicode and update offset. + tibetanUnicode.deleteCharAt(offset); + tibetanUnicode.insert(offset, s); + } + } + } + + /** Like {@link #toCanonicalForm(StringBuffer)}, but does not + modify its input. Instead, it returns the canonically-formed + version of tibetanUnicode. */ + public static String toCanonicalForm(String tibetanUnicode) { + StringBuffer sb = new StringBuffer(tibetanUnicode); + toCanonicalForm(sb); + return sb.toString(); + } + + /** There are 19 characters in the Tibetan range of Unicode 3.2 + which can be decomposed into longer strings of characters in + the Tibetan range of Unicode. These 19 are said not to be in + THDL-canonical form. This routine returns the canonical form + for such characters, and returns null for characters that are + already canonical or are not in the Tibetan range of Unicode. 
+ @param tibetanUnicodeChar the character to canonicalize + @return null if tibetanUnicodeChar is canonical, or a string + of two or three characters otherwise */ + public static String toCanonicalForm(char tibetanUnicodeChar) { + switch (tibetanUnicodeChar) { + case '\u0F43': return new String(new char[] { '\u0F42', '\u0FB7' }); + case '\u0F4D': return new String(new char[] { '\u0F4C', '\u0FB7' }); + case '\u0F52': return new String(new char[] { '\u0F51', '\u0FB7' }); + case '\u0F57': return new String(new char[] { '\u0F56', '\u0FB7' }); + case '\u0F5C': return new String(new char[] { '\u0F5B', '\u0FB7' }); + case '\u0F69': return new String(new char[] { '\u0F40', '\u0FB5' }); + case '\u0F73': return new String(new char[] { '\u0F71', '\u0F72' }); + case '\u0F75': return new String(new char[] { '\u0F71', '\u0F74' }); + case '\u0F76': return new String(new char[] { '\u0FB2', '\u0F80' }); + case '\u0F77': return new String(new char[] { '\u0FB2', '\u0F71', '\u0F80' }); + case '\u0F78': return new String(new char[] { '\u0FB3', '\u0F80' }); + case '\u0F79': return new String(new char[] { '\u0FB3', '\u0F71', '\u0F80' }); + case '\u0F81': return new String(new char[] { '\u0F71', '\u0F80' }); + case '\u0F93': return new String(new char[] { '\u0F92', '\u0FB7' }); + case '\u0F9D': return new String(new char[] { '\u0F9C', '\u0FB7' }); + case '\u0FA2': return new String(new char[] { '\u0FA1', '\u0FB7' }); + case '\u0FA7': return new String(new char[] { '\u0FA6', '\u0FB7' }); + case '\u0FAC': return new String(new char[] { '\u0FAB', '\u0FB7' }); + case '\u0FB9': return new String(new char[] { '\u0F90', '\u0FB5' }); + + default: + return null; + } + } + + /** Returns true iff tibetanUnicodeChar {@link + #isInTibetanRange(char)} and if the Unicode 3.2 standard + discourages the use of tibetanUnicodeChar. 
*/ + public static boolean isDiscouraged(char tibetanUnicodeChar) { + return ('\u0F73' == tibetanUnicodeChar + || '\u0F75' == tibetanUnicodeChar + || '\u0F77' == tibetanUnicodeChar + || '\u0F81' == tibetanUnicodeChar); + /* DLC FIXME -- I was using 3.0 p.437-440, check 3.2. */ + } + + /** Returns true iff ch corresponds to the Tibetan letter ra. + Several Unicode characters correspond to the Tibetan letter ra + (in its subscribed form or otherwise). Oftentimes, + \u0F62 is thought of as the nominal + representation. Returns false for some characters that + contain ra but are not merely ra, such as \u0F77 */ + public static boolean isRa(char ch) { + return ('\u0F62' == ch + || '\u0F6A' == ch + || '\u0FB2' == ch + || '\u0FBC' == ch); + } + + /** Returns true iff ch corresponds to the Tibetan letter wa. + Several Unicode characters correspond to the Tibetan letter + wa. Oftentimes, \u0F5D is thought of as the + nominal representation. */ + public static boolean isWa(char ch) { + return ('\u0F5D' == ch + || '\u0FAD' == ch + || '\u0FBA' == ch); + } + + /** Returns true iff ch corresponds to the Tibetan letter ya. + Several Unicode characters correspond to the Tibetan letter + ya. Oftentimes, \u0F61 is thought of as the + nominal representation. */ + public static boolean isYa(char ch) { + return ('\u0F61' == ch + || '\u0FB1' == ch + || '\u0FBB' == ch); + } + + /** Returns true iff there exists at least one character ch in + unicodeString such that ch {@link #isRa() is ra} or contains + ra (like \u0F77). This method is not implemented + as fast as it could be. It calls on the canonicalization code + in order to maximize reuse and minimize the possibility of + coder error. */ + public static boolean containsRa(String unicodeString) { + String canonForm = toCanonicalForm(unicodeString); + for (int i = 0; i < canonForm.length(); i++) { + if (isRa(canonForm.charAt(i))) + return true; + } + return false; + } + /** Inefficient shortcut. 
+ @see #containsRa(String) */ + public static boolean containsRa(char unicodeChar) { + return containsRa(new String(new char[] { unicodeChar })); + } + + public static String unicodeCharToString(char ch) { + return "U+" + Integer.toHexString((int)ch); + } +} + diff --git a/source/org/thdl/tib/text/tshegbar/package.html b/source/org/thdl/tib/text/tshegbar/package.html new file mode 100644 index 0000000..4de8dfa --- /dev/null +++ b/source/org/thdl/tib/text/tshegbar/package.html @@ -0,0 +1,30 @@ + + + + + + + + Provides for manipulating Tibetan text at the tsek bar level. + Roughly speaking, a "tsheg bar" (pronounced tsek bar) is a + syllable. + +

+ This package allows for turning a string of Unicode characters into + our TTBIR, our Tibetan Tsheg Bar Internal Representation. + Said Unicode document may contain non-Tibetan characters also. +

+ + +