From 33b3080068f5fb20f28b2d40bfea1948c663cc57 Mon Sep 17 00:00:00 2001 From: dchandler Date: Mon, 31 Mar 2003 00:33:50 +0000 Subject: [PATCH] Fixed a bunch of bugs; supports le'u'i'o, sgom pa'am, etc. Better tests. As part of that, I had to break TibetanMachineWeb into TibetanMachineWeb+THDLWylieConstants, because I don't want the class-wide initialization code from TibetanMachineWeb causing errors in LegalTshegBarTest. --- .../org/thdl/tib/text/THDLWylieConstants.java | 117 +++++++++ source/org/thdl/tib/text/TibTextUtils.java | 123 ++++----- .../org/thdl/tib/text/TibetanMachineWeb.java | 84 +------ .../thdl/tib/text/tshegbar/LegalTshegBar.java | 236 ++++++++++++------ .../tib/text/tshegbar/LegalTshegBarTest.java | 120 +++++++++ .../tib/text/tshegbar/UnicodeConstants.java | 5 +- .../text/tshegbar/UnicodeGraphemeCluster.java | 13 +- 7 files changed, 468 insertions(+), 230 deletions(-) create mode 100644 source/org/thdl/tib/text/THDLWylieConstants.java diff --git a/source/org/thdl/tib/text/THDLWylieConstants.java b/source/org/thdl/tib/text/THDLWylieConstants.java new file mode 100644 index 0000000..29a6a52 --- /dev/null +++ b/source/org/thdl/tib/text/THDLWylieConstants.java @@ -0,0 +1,117 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. 
+*/ + +package org.thdl.tib.text; + +/** This is where basic, static knowledge of THDL's Extended Wylie is housed. + * @see org.thdl.tib.text.TibetanMachineWeb */ +public interface THDLWylieConstants { +/** +* the Wylie for bindu/anusvara +*/ + public static final char BINDU = 'M'; +/** +* the Wylie for tsheg +*/ + public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts +/** +* the Wylie for whitespace +*/ + public static final char SPACE = '_'; //this character occurs in all ten TMW fonts +/** +* the Sanskrit stacking separator used in Extended Wylie +*/ + public static final char WYLIE_SANSKRIT_STACKING_KEY = '+'; +/** +* the Wylie disambiguating key, as a char +*/ + public static final char WYLIE_DISAMBIGUATING_KEY = '.'; +/** +* the Wylie for the invisible 'a' vowel +*/ + public static final String WYLIE_aVOWEL = "a"; +/** +* the Wylie for achung, as a char +*/ + public static final char ACHUNG_character = '\''; +/** +* the Wylie for achung, as a String +*/ + public static final String ACHUNG + = new String(new char[] { ACHUNG_character }); +/** +* the Wylie for the 28th of the 30 consonants, sa: +*/ + public static final String SA = "s"; +/** +* the Wylie for the 16th of the 30 consonants, ma: +*/ + public static final String MA = "m"; +/** +* the Wylie for the 4th of the 30 consonants, nga: +*/ + public static final String NGA = "ng"; +/** +* the Wylie for achen +*/ + public static final String ACHEN = "a"; +/** +* the Wylie for gigu +*/ + public static final String i_VOWEL = "i"; +/** +* the Wylie for zhebju +*/ + public static final String u_VOWEL = "u"; +/** +* the Wylie for drengbu +*/ + public static final String e_VOWEL = "e"; +/** +* the Wylie for naro +*/ + public static final String o_VOWEL = "o"; +/** +* the Wylie for double drengbu +*/ + public static final String ai_VOWEL = "ai"; +/** +* the Wylie for double naro +*/ + public static final String au_VOWEL = "au"; +/** +* the Wylie for the subscript achung vowel +*/ + public static final String 
A_VOWEL = "A"; +/** +* the Wylie for log yig gigu +*/ + public static final String reverse_i_VOWEL = "-i"; +/** +* the Wylie for the vowel achung + gigu +*/ + public static final String I_VOWEL = "I"; +/** +* the Wylie for the vowel achung + zhebju +*/ + public static final String U_VOWEL = "U"; +/** +* the Wylie for the vowel achung + log yig gigu +*/ + public static final String reverse_I_VOWEL = "-I"; +} diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java index e52b4c1..f3dfd0c 100644 --- a/source/org/thdl/tib/text/TibTextUtils.java +++ b/source/org/thdl/tib/text/TibTextUtils.java @@ -28,7 +28,8 @@ import org.thdl.util.ThdlDebug; /** * Provides methods for converting back and forth between Extended -* Wylie and TibetanMachineWeb. This class is not instantiable. +* Wylie and Tibetan represented in TibetanMachineWeb glyphs. This +* class is not instantiable. * *

* The class provides a variety of static methods for converting @@ -37,7 +38,7 @@ import org.thdl.util.ThdlDebug; * be exported as Rich Text Format. * * @author Edward Garrett, Tibetan and Himalayan Digital Library */ -public class TibTextUtils { +public class TibTextUtils implements THDLWylieConstants { /** Do not use this contructor. */ private TibTextUtils() { super(); } @@ -255,11 +256,11 @@ public class TibTextUtils { if (k < 32) //return null if character is just formatting return String.valueOf(c); - if (c == TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY) - return String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + if (c == WYLIE_DISAMBIGUATING_KEY) + return String.valueOf(WYLIE_DISAMBIGUATING_KEY); - if (c == TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY) - return String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY); + if (c == WYLIE_SANSKRIT_STACKING_KEY) + return String.valueOf(WYLIE_SANSKRIT_STACKING_KEY); for (i=offset+1; i= 0 - && TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(TibetanMachineWeb.ACHUNG)) { + && TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(ACHUNG)) { if (null == tailEndWylie) tailEndWylie = new StringBuffer(); // prepend: tailEndWylie.insert(0, - TibetanMachineWeb.ACHUNG - + aVowelToUseAfter(TibetanMachineWeb.ACHUNG) + ACHUNG + + aVowelToUseAfter(ACHUNG) + TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1))); effectiveSize -= 2; } @@ -893,8 +894,8 @@ public class TibTextUtils { for (int i = 0; i < size; i++) { wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i)); if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie) - || (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN))) - sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + || (i != 0 && wylie.equals(ACHEN))) + sb.append(WYLIE_DISAMBIGUATING_KEY); sb.append(wylie + aVowelToUseAfter(wylie)); lastWylie = wylie; @@ -907,8 +908,8 @@ public class 
TibTextUtils { while (i+2 < size) { wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i)); if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie) - || (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN))) - sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + || (i != 0 && wylie.equals(ACHEN))) + sb.append(WYLIE_DISAMBIGUATING_KEY); sb.append(wylie); lastWylie = wylie; @@ -933,7 +934,11 @@ public class TibTextUtils { if (TibetanMachineWeb.isWylieLeft(wylie0)) { /* is it ambiguous? */ if (TibetanMachineWeb.isWylieRight(wylie1) - && TibetanMachineWeb.SA.equals(wylie2)) { + && SA.equals(wylie2) /* isWylieFarRight would + * work, but the list of + * 9 words doesn't have + * any ending with d -- + * all end with s. */) { /* Yes, this is ambiguous. How do we handle it? See this from Andres: I'm posting this upon David Chandler's request. According to Lobsang @@ -1001,14 +1006,14 @@ public class TibTextUtils { // } // } // if (disambiguatorNeeded) - // sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + // sb.append(WYLIE_DISAMBIGUATING_KEY); } else { /* no ambiguity. the "a" vowel comes after * wylie1. 
*/ if (TibetanMachineWeb.isAmbiguousWylie(wylie0, wylie1)) - sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + sb.append(WYLIE_DISAMBIGUATING_KEY); sb.append(wylie1 + aVowelToUseAfter(wylie1) + wylie2); @@ -1069,8 +1074,8 @@ public class TibTextUtils { if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, currWylie) || (!lastWylie.equals("") - && currWylie.equals(TibetanMachineWeb.ACHEN))) - sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + && currWylie.equals(ACHEN))) + sb.append(WYLIE_DISAMBIGUATING_KEY); sb.append(currWylie); @@ -1125,7 +1130,7 @@ public class TibTextUtils { wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]); boolean containsBindu = false; - if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) { + if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == BINDU) { char[] cArray = wylie.toCharArray(); wylie = new String(cArray, 0, wylie.length()-1); containsBindu = true; @@ -1157,18 +1162,18 @@ public class TibTextUtils { } else if (TibetanMachineWeb.isWylieVowel(wylie)) { if (isLastVowel) { int len = wylieBuffer.length(); - int A_len = TibetanMachineWeb.A_VOWEL.length(); + int A_len = A_VOWEL.length(); - if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) { + if (wylieBuffer.substring(len-A_len).equals(A_VOWEL)) { try { - if (wylie.equals(TibetanMachineWeb.i_VOWEL)) { + if (wylie.equals(i_VOWEL)) { wylieBuffer.delete(len-A_len, len); - wylieBuffer.append(TibetanMachineWeb.I_VOWEL); + wylieBuffer.append(I_VOWEL); isLastVowel = false; break process_block; - } else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) { + } else if (wylie.equals(reverse_i_VOWEL)) { wylieBuffer.delete(len-A_len, len); - wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL); + wylieBuffer.append(reverse_I_VOWEL); isLastVowel = false; break process_block; } @@ -1189,7 +1194,7 @@ public class TibTextUtils { DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1); String top_wylie = 
TibetanMachineWeb.getWylieForGlyph(top_dc); - if (top_wylie.equals(TibetanMachineWeb.ACHEN)) { + if (top_wylie.equals(ACHEN)) { glyphList.remove(glyphCount-1); if (glyphCount-1 == 0) { @@ -1200,7 +1205,7 @@ public class TibTextUtils { } } - if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) { + if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(ACHUNG)) { String thisPart = withoutA(glyphList); wylieBuffer.append(thisPart); //append consonants in glyphList } else { @@ -1212,12 +1217,12 @@ public class TibTextUtils { wylieBuffer.append(thisPart); } - wylieBuffer.append(TibetanMachineWeb.ACHUNG); + wylieBuffer.append(ACHUNG); } } if (insertDisAmbig) - wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); + wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY); wylieBuffer.append(wylie); //append vowel @@ -1234,7 +1239,7 @@ public class TibTextUtils { if (containsBindu) { isLastVowel = false; wylieBuffer.append(withoutA(glyphList)); - wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu + wylieBuffer.append(BINDU); //append the bindu glyphList.clear(); } } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index d68f9ed..a5a66ad 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the License. The Initial Developer of this software is the Tibetan and Himalayan Digital -Library (THDL). Portions created by the THDL are Copyright 2001 THDL. +Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL. All Rights Reserved. Contributor(s): ______________________________________. @@ -44,7 +44,7 @@ import org.thdl.util.ThdlOptions; * @version 1.0 */ // FIXME: for speed, make either this class, its methods, or both, final? 
-public class TibetanMachineWeb { +public class TibetanMachineWeb implements THDLWylieConstants { /** This addresses bug 624133, "Input freezes after impossible * character". The input sequences that are valid in Extended * Wylie. For example, "Sh" will be in this container, but "S" @@ -109,86 +109,6 @@ public class TibetanMachineWeb { "TibetanMachineWeb9".intern() }; /** -* the Wylie for bindu/anusvara -*/ - public static final char BINDU = 'M'; -/** -* the Wylie for tsheg -*/ - public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts -/** -* the Wylie for whitespace -*/ - public static final char SPACE = '_'; //this character occurs in all ten TMW fonts -/** -* the Sanskrit stacking separator used in Extended Wylie -*/ - public static final char WYLIE_SANSKRIT_STACKING_KEY = '+'; -/** -* the Wylie disambiguating key, as a char -*/ - public static final char WYLIE_DISAMBIGUATING_KEY = '.'; -/** -* the Wylie for the invisible 'a' vowel -*/ - public static final String WYLIE_aVOWEL = "a"; -/** -* the Wylie for achung -*/ - public static final String ACHUNG = "'"; -/** -* the Wylie for the 28th of the 30 consonants, sa: -*/ - public static final String SA = "s"; -/** -* the Wylie for achen -*/ - public static final String ACHEN = "a"; -/** -* the Wylie for gigu -*/ - public static final String i_VOWEL = "i"; -/** -* the Wylie for zhebju -*/ - public static final String u_VOWEL = "u"; -/** -* the Wylie for drengbu -*/ - public static final String e_VOWEL = "e"; -/** -* the Wylie for naro -*/ - public static final String o_VOWEL = "o"; -/** -* the Wylie for double drengbu -*/ - public static final String ai_VOWEL = "ai"; -/** -* the Wylie for double naro -*/ - public static final String au_VOWEL = "au"; -/** -* the Wylie for the subscript achung vowel -*/ - public static final String A_VOWEL = "A"; -/** -* the Wylie for log yig gigu -*/ - public static final String reverse_i_VOWEL = "-i"; -/** -* the Wylie for the vowel achung + gigu -*/ - 
public static final String I_VOWEL = "I"; -/** -* the Wylie for the vowel achung + zhebju -*/ - public static final String U_VOWEL = "U"; -/** -* the Wylie for the vowel achung + log yig gigu -*/ - public static final String reverse_I_VOWEL = "-I"; -/** * represents where in an array of DuffCodes you * find the TibetanMachine equivalence of a glyph */ diff --git a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java index 13a0e72..e119b7d 100644 --- a/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java +++ b/source/org/thdl/tib/text/tshegbar/LegalTshegBar.java @@ -18,7 +18,7 @@ Contributor(s): ______________________________________. package org.thdl.tib.text.tshegbar; -import org.thdl.tib.text.TibetanMachineWeb; +import org.thdl.tib.text.THDLWylieConstants; import org.thdl.util.ThdlDebug; /**

A LegalTshegBar is a simple Tibetan syllable or a syllable with @@ -29,7 +29,7 @@ import org.thdl.util.ThdlDebug; *