Fixed a bunch of bugs; supports le'u'i'o, sgom pa'am, etc.

Better tests.  As part of that, I had to break TibetanMachineWeb into
TibetanMachineWeb+THDLWylieConstants, because I don't want the
class-wide initialization code from TibetanMachineWeb causing errors
in LegalTshegBarTest.
This commit is contained in:
dchandler 2003-03-31 00:33:50 +00:00
parent 1987f7d80a
commit 33b3080068
7 changed files with 468 additions and 230 deletions

View file

@ -0,0 +1,117 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
/**
 * Home of the basic, static knowledge of THDL's Extended Wylie: the
 * individual characters and strings that the transliteration uses for
 * punctuation, a few consonants, and the vowels.  Everything here is
 * a plain value, so using this interface does not require any of the
 * class-wide initialization performed by TibetanMachineWeb.
 *
 * <p>Fields of an interface are implicitly <code>public static
 * final</code>, so the modifiers are not spelled out below.</p>
 *
 * @see org.thdl.tib.text.TibetanMachineWeb
 */
public interface THDLWylieConstants {

    // ---- punctuation and structural keys ----

    /** The Wylie for bindu/anusvara. */
    char BINDU = 'M';

    /** The Wylie for tsheg.  This character occurs in all ten TMW fonts. */
    char TSHEG = ' ';

    /** The Wylie for whitespace.  This character occurs in all ten TMW fonts. */
    char SPACE = '_';

    /** The Sanskrit stacking separator used in Extended Wylie. */
    char WYLIE_SANSKRIT_STACKING_KEY = '+';

    /** The Wylie disambiguating key, as a char. */
    char WYLIE_DISAMBIGUATING_KEY = '.';

    /** The Wylie for the invisible 'a' vowel. */
    String WYLIE_aVOWEL = "a";

    // ---- consonants ----

    /** The Wylie for achung, as a char. */
    char ACHUNG_character = '\'';

    /** The Wylie for achung, as a one-character String built from
     *  {@link #ACHUNG_character}. */
    String ACHUNG = String.valueOf(ACHUNG_character);

    /** The Wylie for the 28th of the 30 consonants, sa. */
    String SA = "s";

    /** The Wylie for the 16th of the 30 consonants, ma. */
    String MA = "m";

    /** The Wylie for the 4th of the 30 consonants, nga. */
    String NGA = "ng";

    /** The Wylie for achen. */
    String ACHEN = "a";

    // ---- vowels ----

    /** The Wylie for gigu. */
    String i_VOWEL = "i";

    /** The Wylie for zhebju. */
    String u_VOWEL = "u";

    /** The Wylie for drengbu. */
    String e_VOWEL = "e";

    /** The Wylie for naro. */
    String o_VOWEL = "o";

    /** The Wylie for double drengbu. */
    String ai_VOWEL = "ai";

    /** The Wylie for double naro. */
    String au_VOWEL = "au";

    /** The Wylie for the subscript achung vowel. */
    String A_VOWEL = "A";

    /** The Wylie for log yig gigu. */
    String reverse_i_VOWEL = "-i";

    /** The Wylie for the vowel achung + gigu. */
    String I_VOWEL = "I";

    /** The Wylie for the vowel achung + zhebju. */
    String U_VOWEL = "U";

    /** The Wylie for the vowel achung + log yig gigu. */
    String reverse_I_VOWEL = "-I";
}

View file

@ -28,7 +28,8 @@ import org.thdl.util.ThdlDebug;
/**
* Provides methods for converting back and forth between Extended
* Wylie and TibetanMachineWeb. This class is not instantiable.
* Wylie and Tibetan represented in TibetanMachineWeb glyphs. This
* class is not instantiable.
*
* <p>
* The class provides a variety of static methods for converting
@ -37,7 +38,7 @@ import org.thdl.util.ThdlDebug;
* be exported as Rich Text Format.
*
* @author Edward Garrett, Tibetan and Himalayan Digital Library */
public class TibTextUtils {
public class TibTextUtils implements THDLWylieConstants {
/** Do not use this constructor. */
private TibTextUtils() { super(); }
@ -255,11 +256,11 @@ public class TibTextUtils {
if (k < 32) //return null if character is just formatting
return String.valueOf(c);
if (c == TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY)
return String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
if (c == WYLIE_DISAMBIGUATING_KEY)
return String.valueOf(WYLIE_DISAMBIGUATING_KEY);
if (c == TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY)
return String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY);
if (c == WYLIE_SANSKRIT_STACKING_KEY)
return String.valueOf(WYLIE_SANSKRIT_STACKING_KEY);
for (i=offset+1; i<wylie.length()+1; i++) {
s = wylie.substring(offset, i);
@ -332,7 +333,7 @@ public class TibTextUtils {
chars.clear();
if (next.equals(String.valueOf(TibetanMachineWeb.BINDU))) {
if (next.equals(String.valueOf(BINDU))) {
if (glyphs.isEmpty())
dc = null;
else
@ -369,7 +370,7 @@ public class TibTextUtils {
break vowel_block;
}
}
DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.ACHEN);
DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(ACHEN);
dc = dc_array[TibetanMachineWeb.TMW];
glyphs.addAll(getVowel(dc, next));
}
@ -398,7 +399,7 @@ public class TibTextUtils {
}
}
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY))) {
else if (next.equals(String.valueOf(WYLIE_DISAMBIGUATING_KEY))) {
if (!chars.isEmpty())
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
@ -406,7 +407,7 @@ public class TibTextUtils {
isSanskrit = false;
}
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY))) {
else if (next.equals(String.valueOf(WYLIE_SANSKRIT_STACKING_KEY))) {
if (!isSanskrit) { //begin sanskrit stack
switch (chars.size()) {
case 0:
@ -475,13 +476,13 @@ public class TibTextUtils {
List bindus = new ArrayList();
if (null == dc) {
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(TibetanMachineWeb.BINDU)));
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
return bindus;
}
if (!TibetanMachineWeb.getBinduMap().containsKey(dc)) {
bindus.add(dc);
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(TibetanMachineWeb.BINDU)));
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
return bindus;
}
@ -524,7 +525,7 @@ public class TibTextUtils {
//this vowel doesn't correspond to a glyph -
//so you just return the original context
if ( vowel.equals(TibetanMachineWeb.WYLIE_aVOWEL) ||
if ( vowel.equals(WYLIE_aVOWEL) ||
TibetanMachineWeb.isTopVowel(context_2)) {
if (context_1 != null)
vowels.add(context_1);
@ -537,34 +538,34 @@ public class TibTextUtils {
//these vowels have one invariant form - therefore,
//dc_context is just returned along with that form
if (vowel.equals(TibetanMachineWeb.ai_VOWEL)) {
if (vowel.equals(ai_VOWEL)) {
if (context_1 != null)
vowels.add(context_1);
vowels.add(context_2);
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.ai_VOWEL);
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(ai_VOWEL);
vowels.add(dc_v[TibetanMachineWeb.TMW]);
return vowels;
}
if (vowel.equals(TibetanMachineWeb.au_VOWEL)) {
if (vowel.equals(au_VOWEL)) {
if (context_1 != null)
vowels.add(context_1);
vowels.add(context_2);
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.au_VOWEL);
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(au_VOWEL);
vowels.add(dc_v[TibetanMachineWeb.TMW]);
return vowels;
}
if (vowel.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
if (vowel.equals(reverse_i_VOWEL)) {
if (context_1 != null)
vowels.add(context_1);
vowels.add(context_2);
if (!TibetanMachineWeb.isTopVowel(context_2)) {
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.reverse_i_VOWEL);
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(reverse_i_VOWEL);
vowels.add(dc_v[TibetanMachineWeb.TMW]);
}
@ -578,7 +579,7 @@ public class TibTextUtils {
//returned along with the vowel appropriate to
//that context
if (vowel.equals(TibetanMachineWeb.i_VOWEL)) {
if (vowel.equals(i_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_i);
if (null == dc_v && null != context_1) {
@ -597,7 +598,7 @@ public class TibTextUtils {
return vowels;
}
if (vowel.equals(TibetanMachineWeb.e_VOWEL)) {
if (vowel.equals(e_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_e);
if (null == dc_v && null != context_1) {
@ -616,7 +617,7 @@ public class TibTextUtils {
return vowels;
}
if (vowel.equals(TibetanMachineWeb.o_VOWEL)) {
if (vowel.equals(o_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_o);
if (null == dc_v && null != context_1) {
@ -641,7 +642,7 @@ public class TibTextUtils {
//both u and A cannot be affixed to ordinary k or g, but
//rather the shortened versions of k and g - therefore,
if (vowel.equals(TibetanMachineWeb.u_VOWEL)) {
if (vowel.equals(u_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_u);
@ -660,7 +661,7 @@ public class TibTextUtils {
return vowels;
}
if (vowel.equals(TibetanMachineWeb.A_VOWEL)) {
if (vowel.equals(A_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
@ -680,7 +681,7 @@ public class TibTextUtils {
return vowels;
}
if (vowel.equals(TibetanMachineWeb.U_VOWEL)) {
if (vowel.equals(U_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_U);
@ -704,7 +705,7 @@ public class TibTextUtils {
//require a change from the previous character,
//and consist of two glyphs themselves
if (vowel.equals(TibetanMachineWeb.I_VOWEL)) {
if (vowel.equals(I_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
@ -726,11 +727,11 @@ public class TibTextUtils {
return vowels;
}
if (vowel.equals(TibetanMachineWeb.reverse_I_VOWEL)) {
if (vowel.equals(reverse_I_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
DuffCode[] tv_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.reverse_i_VOWEL);
DuffCode[] tv_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(reverse_i_VOWEL);
DuffCode dc_v_sup = tv_array[TibetanMachineWeb.TMW];
if (null != context_1)
@ -766,10 +767,10 @@ public class TibTextUtils {
/** Returns "a", unless wylie is already "a". */
private static String aVowelToUseAfter(String wylie) {
if (wylie.equals(TibetanMachineWeb.ACHEN))
if (wylie.equals(ACHEN))
return "";
else
return TibetanMachineWeb.WYLIE_aVOWEL;
return WYLIE_aVOWEL;
}
private static String unambiguousPostAVowelWylie(String wylie1,
@ -781,7 +782,7 @@ public class TibTextUtils {
if (TibetanMachineWeb.isWylieTop(wylie1)
&& wylie2.equals(/* FIXME: hard-coded */ "d"))
disambiguator
= new String(new char[] { TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY });
= new String(new char[] { WYLIE_DISAMBIGUATING_KEY });
return wylie1 + disambiguator + wylie2;
}
@ -831,13 +832,13 @@ public class TibTextUtils {
}
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(WYLIE_DISAMBIGUATING_KEY);
if (!wylie.equals(TibetanMachineWeb.ACHEN)) {
if (!wylie.equals(ACHEN)) {
sb.append(wylie);
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
sb.append(WYLIE_aVOWEL);
} else {
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie);
}
}
@ -861,12 +862,12 @@ public class TibTextUtils {
StringBuffer tailEndWylie = null;
int effectiveSize = size - 2;
while (effectiveSize >= 0
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(TibetanMachineWeb.ACHUNG)) {
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(ACHUNG)) {
if (null == tailEndWylie) tailEndWylie = new StringBuffer();
// prepend:
tailEndWylie.insert(0,
TibetanMachineWeb.ACHUNG
+ aVowelToUseAfter(TibetanMachineWeb.ACHUNG)
ACHUNG
+ aVowelToUseAfter(ACHUNG)
+ TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1)));
effectiveSize -= 2;
}
@ -893,8 +894,8 @@ public class TibTextUtils {
for (int i = 0; i < size; i++) {
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|| (i != 0 && wylie.equals(ACHEN)))
sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie + aVowelToUseAfter(wylie));
lastWylie = wylie;
@ -907,8 +908,8 @@ public class TibTextUtils {
while (i+2 < size) {
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|| (i != 0 && wylie.equals(ACHEN)))
sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie);
lastWylie = wylie;
@ -933,7 +934,11 @@ public class TibTextUtils {
if (TibetanMachineWeb.isWylieLeft(wylie0)) {
/* is it ambiguous? */
if (TibetanMachineWeb.isWylieRight(wylie1)
&& TibetanMachineWeb.SA.equals(wylie2)) {
&& SA.equals(wylie2) /* isWylieFarRight would
* work, but the list of
* 9 words doesn't have
* any ending with d --
* all end with s. */) {
/* Yes, this is ambiguous. How do we handle it? See this from Andres:
I'm posting this upon David Chandler's request. According to Lobsang
@ -1001,14 +1006,14 @@ public class TibTextUtils {
// }
// }
// if (disambiguatorNeeded)
// sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
// sb.append(WYLIE_DISAMBIGUATING_KEY);
} else {
/* no ambiguity. the "a" vowel comes after
* wylie1. */
if (TibetanMachineWeb.isAmbiguousWylie(wylie0, wylie1))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie1
+ aVowelToUseAfter(wylie1)
+ wylie2);
@ -1069,8 +1074,8 @@ public class TibTextUtils {
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, currWylie)
|| (!lastWylie.equals("")
&& currWylie.equals(TibetanMachineWeb.ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
&& currWylie.equals(ACHEN)))
sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(currWylie);
@ -1125,7 +1130,7 @@ public class TibTextUtils {
wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
boolean containsBindu = false;
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) {
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == BINDU) {
char[] cArray = wylie.toCharArray();
wylie = new String(cArray, 0, wylie.length()-1);
containsBindu = true;
@ -1157,18 +1162,18 @@ public class TibTextUtils {
} else if (TibetanMachineWeb.isWylieVowel(wylie)) {
if (isLastVowel) {
int len = wylieBuffer.length();
int A_len = TibetanMachineWeb.A_VOWEL.length();
int A_len = A_VOWEL.length();
if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) {
if (wylieBuffer.substring(len-A_len).equals(A_VOWEL)) {
try {
if (wylie.equals(TibetanMachineWeb.i_VOWEL)) {
if (wylie.equals(i_VOWEL)) {
wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.I_VOWEL);
wylieBuffer.append(I_VOWEL);
isLastVowel = false;
break process_block;
} else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
} else if (wylie.equals(reverse_i_VOWEL)) {
wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL);
wylieBuffer.append(reverse_I_VOWEL);
isLastVowel = false;
break process_block;
}
@ -1189,7 +1194,7 @@ public class TibTextUtils {
DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
if (top_wylie.equals(TibetanMachineWeb.ACHEN)) {
if (top_wylie.equals(ACHEN)) {
glyphList.remove(glyphCount-1);
if (glyphCount-1 == 0) {
@ -1200,7 +1205,7 @@ public class TibTextUtils {
}
}
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) {
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(ACHUNG)) {
String thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart); //append consonants in glyphList
} else {
@ -1212,12 +1217,12 @@ public class TibTextUtils {
wylieBuffer.append(thisPart);
}
wylieBuffer.append(TibetanMachineWeb.ACHUNG);
wylieBuffer.append(ACHUNG);
}
}
if (insertDisAmbig)
wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY);
wylieBuffer.append(wylie); //append vowel
@ -1234,7 +1239,7 @@ public class TibTextUtils {
if (containsBindu) {
isLastVowel = false;
wylieBuffer.append(withoutA(glyphList));
wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu
wylieBuffer.append(BINDU); //append the bindu
glyphList.clear();
}
}

View file

@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
@ -44,7 +44,7 @@ import org.thdl.util.ThdlOptions;
* @version 1.0
*/
// FIXME: for speed, make either this class, its methods, or both, final?
public class TibetanMachineWeb {
public class TibetanMachineWeb implements THDLWylieConstants {
/** This addresses bug 624133, "Input freezes after impossible
* character". The input sequences that are valid in Extended
* Wylie. For example, "Sh" will be in this container, but "S"
@ -109,86 +109,6 @@ public class TibetanMachineWeb {
"TibetanMachineWeb9".intern()
};
/**
* the Wylie for bindu/anusvara
*/
public static final char BINDU = 'M';
/**
* the Wylie for tsheg
*/
public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts
/**
* the Wylie for whitespace
*/
public static final char SPACE = '_'; //this character occurs in all ten TMW fonts
/**
* the Sanskrit stacking separator used in Extended Wylie
*/
public static final char WYLIE_SANSKRIT_STACKING_KEY = '+';
/**
* the Wylie disambiguating key, as a char
*/
public static final char WYLIE_DISAMBIGUATING_KEY = '.';
/**
* the Wylie for the invisible 'a' vowel
*/
public static final String WYLIE_aVOWEL = "a";
/**
* the Wylie for achung
*/
public static final String ACHUNG = "'";
/**
* the Wylie for the 28th of the 30 consonants, sa:
*/
public static final String SA = "s";
/**
* the Wylie for achen
*/
public static final String ACHEN = "a";
/**
* the Wylie for gigu
*/
public static final String i_VOWEL = "i";
/**
* the Wylie for zhebju
*/
public static final String u_VOWEL = "u";
/**
* the Wylie for drengbu
*/
public static final String e_VOWEL = "e";
/**
* the Wylie for naro
*/
public static final String o_VOWEL = "o";
/**
* the Wylie for double drengbu
*/
public static final String ai_VOWEL = "ai";
/**
* the Wylie for double naro
*/
public static final String au_VOWEL = "au";
/**
* the Wylie for the subscript achung vowel
*/
public static final String A_VOWEL = "A";
/**
* the Wylie for log yig gigu
*/
public static final String reverse_i_VOWEL = "-i";
/**
* the Wylie for the vowel achung + gigu
*/
public static final String I_VOWEL = "I";
/**
* the Wylie for the vowel achung + zhebju
*/
public static final String U_VOWEL = "U";
/**
* the Wylie for the vowel achung + log yig gigu
*/
public static final String reverse_I_VOWEL = "-I";
/**
* represents where in an array of DuffCodes you
* find the TibetanMachine equivalence of a glyph
*/

View file

@ -18,7 +18,7 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.tshegbar;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.util.ThdlDebug;
/** <p>A LegalTshegBar is a simple Tibetan syllable or a syllable with
@ -29,7 +29,7 @@ import org.thdl.util.ThdlDebug;
* <ul>
*
* <li>It contains at most one prefix, which must be one of {EWC_ga,
* EWC_da, EWC_ba, EWC_ma, EWC_achen} and must be prefixable to the
* EWC_da, EWC_ba, EWC_ma, EWC_achung} and must be prefixable to the
* root letter.</li>
*
* <li>It contains no vocalic modifications</li>
@ -39,12 +39,11 @@ import org.thdl.util.ThdlDebug;
*
* <li>It contains at most one vowel from the set {EWV_a, EWV_i,
* EWV_e, EWV_u}, and that vowel is on the root stack. The one
* exception is that a 'i suffix is permitted (this is a connective
* case marker).</li>
* exception is that 'i (i.e., the connective case marker), 'u, and
* 'o suffixes are permitted.</li>
*
* <li>It has at most one suffix, which is a single consonant or the
* special connective case marker 'i (i.e.,
* <code>"&#92;u0F60&#92;u0F72"</code>).</li>
* <li>It has at most one suffix, which is a single consonant or a
* string consisting of 'i, 'u, 'o, 'am, and 'ang.</li>
*
*
DLC FIXME: we must allow many suffixes. See Andres' e-mail below:
@ -69,10 +68,8 @@ And also there are cases where they combine. For ex you can have
*
*
* <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists
* a suffix (and a suffix that is not the special connective case
* marker 'i (i.e., <code>"&#92;u0F60&#92;u0F72"</code>) (DLC FIXME: 'o and
* 'am maybe? I asked in the "Embarrasing error in wylie conversion"
* bug report.).</li>
* a suffix (and a suffix that is not based on 'i, 'o, 'u, 'am, and
* 'ang).</li>
*
* <li>The root stack follows the rules of Tibetan syntax, meaning
* that the following holds:
@ -112,7 +109,7 @@ And also there are cases where they combine. For ex you can have
* e.g. p. 548.</p>
*
* @author David Chandler */
public class LegalTshegBar
public final class LegalTshegBar
extends TshegBar
implements UnicodeConstants
{
@ -129,8 +126,8 @@ public class LegalTshegBar
private boolean hasWaZur;
/** true iff EW_wa_zur is under the root syllable. */
private boolean hasAChung;
/** If this is a string, it is of a single codepoint or is equal
* to {@link #getConnectiveCaseSuffix()} */
/** If this is a string, it is of a single codepoint or is a
* string formed from 'i, 'o, 'u, 'am, and 'ang. */
private String suffix;
/** EW_da, EW_sa, or EW_ABSENT */
private char postsuffix;
@ -236,24 +233,24 @@ public class LegalTshegBar
}
/** Returns null if there is no suffix, or a string containing the
* one consonant or a string <code>"&#92;u0F60&#92;u0F72"</code>
* containing two codepoints in the special case that the suffix
* is that connective case marker {@link
* #getConnectiveCaseSuffix()}. */
* one consonant or a string like <code>"&#92;u0F60&#92;u0F72"</code>
* in the case that the suffix
* is 'i, 'u'i'o, 'am, 'ang, etc. */
public String getSuffix() {
return suffix;
}
/** Returns true iff there is a suffixed consonant or a suffixed
* <code>'i</code> (DLC FIXME). */
* string consisting of 'i, 'u, 'o, 'am, and 'ang. */
public boolean hasSuffix() {
return (null != suffix);
}
/** Returns true iff there is a single, suffixed consonant. This
means that suffixes like <code>'am</code>, <code>'i</code>,
<code>'u</code>, and <code>'o</code> are not present, but this
does not rule out the presence of a postsuffix. */
means that suffixes made from <code>'am</code>,
<code>'ang</code>, <code>'i</code>, <code>'u</code>, and
<code>'o</code> are not present, but this does not rule out
the presence of a postsuffix. */
public boolean hasSimpleSuffix() {
return ((null != suffix) && (1 == suffix.length()));
}
@ -280,12 +277,6 @@ public class LegalTshegBar
return (EW_ABSENT != postsuffix);
}
/** Returns true iff this syllable has a <code>'i</code>
* suffix. */
public boolean hasConnectiveCaseMarkerSuffix() {
return getSuffix().equals(getConnectiveCaseSuffix());
}
/** Returns the root consonant. */
public char getRootLetter() {
return rootLetter;
@ -324,7 +315,7 @@ public class LegalTshegBar
private final static String possibleSuffixes
= new String(new char[] {
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achen,
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achung,
EWC_ra, EWC_la, EWC_sa
});
@ -340,18 +331,6 @@ public class LegalTshegBar
// EWSUB_ra_btags.
}
private final static String connectiveCaseSuffix
= new String(new char[] {
EWC_achen, EWV_i
});
/** Returns a two-codepoint string consisting of the Unicode
* representation of what THDL Extended Wylie calls
* <code>'i</code>. */
public static String getConnectiveCaseSuffix() {
return connectiveCaseSuffix;
}
private final static String thirtyConsonants
= new String(new char[] {
EWC_ga, EWC_kha, EWC_ga, EWC_nga,
@ -359,7 +338,7 @@ public class LegalTshegBar
EWC_ta, EWC_tha, EWC_da, EWC_na,
EWC_pa, EWC_pha, EWC_ba, EWC_ma,
EWC_tsa, EWC_tsha, EWC_dza, EWC_wa,
EWC_zha, EWC_za, EWC_achen, EWC_ya,
EWC_zha, EWC_za, EWC_achung, EWC_ya,
EWC_ra, EWC_la, EWC_sha, EWC_sa,
EWC_ha, EWC_a
});
@ -388,10 +367,10 @@ public class LegalTshegBar
<p>This is not very efficient.</p> */
public static String[] getPossibleSuffixParticles() {
return new String[] {
new String(new char[] { EWC_achen, EWV_i }),
new String(new char[] { EWC_achen, EWV_o }),
new String(new char[] { EWC_achen, EWV_u }),
new String(new char[] { EWC_achen, EWC_ma }),
new String(new char[] { EWC_achung, EWV_i }),
new String(new char[] { EWC_achung, EWV_o }),
new String(new char[] { EWC_achung, EWV_u }),
new String(new char[] { EWC_achung, EWC_ma }),
};
}
@ -402,7 +381,7 @@ public class LegalTshegBar
* @see org.thdl.tib.text.tshegbar.UnicodeConstants */
public static String getTheFivePrefixes() {
final String s = new String(new char[] {
EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achen
EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achung
});
ThdlDebug.verify(s.length() == 5); // DLC put this into a JUnit test to avoid the slow-down.
return s;
@ -416,27 +395,104 @@ public class LegalTshegBar
/** Returns a String containing the nominal Unicode
* representations of the ten suffixes. The suffixes are in
* dictionary order.
* @see #getConnectiveCaseSuffix()
* dictionary order. This doesn't include oddballs like suffixes
* based on 'i, 'u, 'o, 'am, and 'ang.
* @see org.thdl.tib.text.tshegbar.UnicodeConstants */
public static String getTheTenSuffixes() {
final String s = new String(new char[] {
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba,
EWC_ma, EWC_achen, EWC_ra, EWC_la, EWC_sa
EWC_ma, EWC_achung, EWC_ra, EWC_la, EWC_sa
});
ThdlDebug.verify(s.length() == 10); // DLC put this into a JUnit test to avoid the slow-down.
return s;
}
/** Returns true iff x is the preferred, nominal Unicode
* representation of one of the ten suffixes.
* @see #getConnectiveCaseSuffix()
*/
public static boolean isNominalRepresentationOfSimpleSuffix(char x) {
return (-1 != getTheTenSuffixes().indexOf(x));
}
/** The legal suffix-like particles other than the ten simple
 *  suffixes.  Each row is a pair: element 0 is the Unicode form of
 *  the particle and element 1 is the THDL Extended Wylie for it.
 *  When adding a row, supply both halves so that a tsheg-bar using
 *  the new particle gets the extended Wylie you intend. */
private static final String[][] oddball_suffixes = {
    // 'i -- the connective case marker:
    {
        String.valueOf(new char[] { EWC_achung, EWV_i }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.i_VOWEL
    },
    // 'u:
    {
        String.valueOf(new char[] { EWC_achung, EWV_u }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.u_VOWEL
    },
    // 'o -- in at least one context, this shows end of sentence:
    {
        String.valueOf(new char[] { EWC_achung, EWV_o }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.o_VOWEL
    },
    // 'am -- as in sgom pa'am:
    {
        String.valueOf(new char[] { EWC_achung, EWC_ma }),
        THDLWylieConstants.ACHUNG
            + THDLWylieConstants.WYLIE_aVOWEL
            + THDLWylieConstants.MA
    },
    // 'ang -- meaning or, as opposed to and:
    {
        String.valueOf(new char[] { EWC_achung, EWC_nga }),
        THDLWylieConstants.ACHUNG
            + THDLWylieConstants.WYLIE_aVOWEL
            + THDLWylieConstants.NGA
    }
};
/** Tells whether suffix is built purely from the achung-based
 *  particles listed in oddball_suffixes -- 'i, 'o, 'u, 'am, 'ang --
 *  either one of them alone or several concatenated, as in 'u'i'o.
 *  @param suffix the Unicode text to examine
 *  @return true iff suffix is a non-empty concatenation of those
 *  particles; false otherwise, including for the empty string */
public static boolean isAchungBasedSuffix(String suffix) {
    String remaining = suffix;
    // The body runs at least once, so the empty string is tested
    // against the table, matches nothing, and is rejected.
    do {
        String matched = null;
        for (int row = 0; row < oddball_suffixes.length; row++) {
            String candidate = oddball_suffixes[row][0];
            if (remaining.startsWith(candidate)) {
                matched = candidate;
                break;
            }
        }
        if (null == matched)
            return false;
        // Strip the particle just recognized and look at what is left.
        remaining = remaining.substring(matched.length());
    } while (!remaining.equals(""));
    return true;
}
/** Returns the THDL Extended Wylie for a suffix made of one or more
 *  of the achung-based particles in oddball_suffixes, translating
 *  each particle in turn from the front of the string.
 *  @param suffix the Unicode form of the suffix; an empty string
 *  yields an empty result
 *  @return the concatenated extended Wylie of the particles
 *  @throws IllegalArgumentException if some part of suffix does not
 *  start with any of the known particles (without this check, such
 *  input would never be consumed and the loop would not terminate) */
private static String getTHDLWylieForOddballSuffix(String suffix) {
    StringBuffer wylie = new StringBuffer();
    String remaining = suffix;
    while (!remaining.equals("")) {
        boolean matched = false;
        for (int x = 0; x < oddball_suffixes.length; x++) {
            if (remaining.startsWith(oddball_suffixes[x][0])) {
                wylie.append(oddball_suffixes[x][1]);
                remaining = remaining.substring(oddball_suffixes[x][0].length());
                matched = true;
                break;
            }
        }
        // Nothing in the table matched, so nothing was consumed;
        // fail loudly rather than loop forever on the same text.
        if (!matched)
            throw new IllegalArgumentException("Not a suffix based on 'i, 'o, 'u, 'am, and 'ang: "
                                               + suffix);
    }
    return wylie.toString();
}
/** Returns true iff the given (rootLetter, subjoinedLetter)
combination can accept an additional wa-zur. Only g-r-w,
d-r-w, and ph-y-w fall into this category according to
@ -595,8 +651,8 @@ public class LegalTshegBar
* @param subjoinedLetter the optional, subscribed consonant
* @param suffix the optional suffix, which is null, a String
* consisting of a single consonant (i.e. a single,
* nondecomposable codepoint) except in the special case that
* this is {@link #getConnectiveCaseSuffix()}
* nondecomposable codepoint), or a string formed from 'i (i.e.,
* U+0F60 U+0F72), 'u, 'o, 'am, and 'ang.
* @param postsuffix the optional postsuffix, which should be
* EWC_sa or EWC_da
* @param errorBuffer if non-null, and if the return code is
@ -763,13 +819,12 @@ public class LegalTshegBar
} // subjoinedLetter tests
// Suffix tests:
// DLC NOW -- allow 'o, 'u, 'am, etc.
if (null != suffix) {
if (!getConnectiveCaseSuffix().equals(suffix)) {
if (!isAchungBasedSuffix(suffix)) {
if (suffix.length() != 1) {
return internalThrowThing(throwIfIllegal,
errorBuf,
"Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am.");
"Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am, 'ang.");
}
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
return internalThrowThing(throwIfIllegal,
@ -784,6 +839,10 @@ public class LegalTshegBar
return internalThrowThing(throwIfIllegal,
errorBuf,
"You cannot have a postsuffix unless you also have a suffix.");
if (isAchungBasedSuffix(suffix))
return internalThrowThing(throwIfIllegal,
errorBuf,
"You cannot have a postsuffix if you have a suffix based on 'i, 'o, 'u, 'am, and 'ang.");
}
if (EW_ABSENT != headLetter) {
@ -812,7 +871,9 @@ public class LegalTshegBar
"The head letter sa cannot be used with that root letter.");
}
} else {
// '&#92;u0F6A' is not a valid head letter, even for
// Illegal head letter.
//
// Note: U+0F6A is not a valid head letter, even for
// "rnya". Use EWC_ra instead.
return internalThrowThing(throwIfIllegal,
errorBuf,
@ -827,14 +888,14 @@ public class LegalTshegBar
&& EWV_e != vowel
&& EWV_o != vowel)
{
if (EWC_achen == vowel)
if (EWC_achung == vowel)
return internalThrowThing(throwIfIllegal,
errorBuf,
"The vowel given is not valid. Use EW_ABSENT for the EWC_achen sound.");
"The vowel given is not valid. Use EW_ABSENT for the EWC_achung sound.");
if ('\u0F71' == vowel)
return internalThrowThing(throwIfIllegal,
errorBuf,
"a-chung cannot be used in a simple Tibetan syllable."); // DLC FIXME: what about pA?
"a-chung can be used, but there is a flag for it; you don't call it the vowel.");
return internalThrowThing(throwIfIllegal,
errorBuf,
"The vowel given is not valid.");
@ -848,9 +909,6 @@ public class LegalTshegBar
/*
DLC add a method giving the correct connective case thingy or
throwing error if the 'i suffix already appears.
DLC put in a method that gets pronunciation using Unicode
diacritical marks. And another using just US Roman. Note that
pronunciation is contextual, so have these methods return all
@ -875,7 +933,7 @@ public class LegalTshegBar
boolean disambiguatorNeeded = false;
char prefix = getPrefix();
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
if (!hasHeadLetter()) {
if (!hasHeadLetter() && !hasSubjoinedLetter()) {
if (EWC_ya == rootLetter) {
if (isConsonantThatTakesYaBtags(prefix))
disambiguatorNeeded = true;
@ -891,7 +949,7 @@ public class LegalTshegBar
}
}
if (disambiguatorNeeded)
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY);
}
if (hasHeadLetter())
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
@ -914,14 +972,14 @@ public class LegalTshegBar
// DLC FIXME: are these allowed in legal Tibetan?
// EWTS would have special cases for them if so,
// I'd wager...
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
// I'd wager, so I bet they're not.
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel));
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
} else {
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
}
} else {
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel));
}
} else {
if (hasExplicitVowel())
@ -930,19 +988,34 @@ public class LegalTshegBar
sb.append("a");
}
String suf = null;
if (hasSuffix()) {
String suf = getSuffix();
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
suf = getSuffix();
if (suf.length() > 1) {
// DLC assert, don't verify, that the length is two.
// This could change if I learn of more suffix
// particles.
ThdlDebug.verify(2 == suf.length());
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
// pa'am, not pa'm or pa'ama!
sb.append(getTHDLWylieForOddballSuffix(suf));
} else {
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
}
}
if (hasPostsuffix())
if (hasPostsuffix()) {
// lar.d, la-ra-da, needs a disambiguator. EWC_sa doesn't
// take any head letters, but EWC_da does.
boolean disambiguatorNeeded = false;
if (getPostsuffix() == EWC_da) {
if (suf.length() == 1) {
char simpleSuffix = suf.charAt(0);
if (EWC_ra == simpleSuffix
|| EWC_la == simpleSuffix
|| EWC_sa == simpleSuffix) {
disambiguatorNeeded = true;
}
}
}
if (disambiguatorNeeded)
sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY);
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
}
return sb;
}
@ -987,7 +1060,7 @@ public class LegalTshegBar
? "hasAChungOnRootLetter=\"true\""
: "")
// DLC NOW: what about the root letter a, i.e. &#92;u0F68 ? do we want the EWTS to be 'aa' ?
// DLC NOW FIXME: what about the root letter a, i.e. &#92;u0F68 ? do we want the EWTS to be 'aa' ?
+ ("vowel=\""
+ (hasExplicitVowel()
? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
@ -1019,7 +1092,8 @@ public class LegalTshegBar
sb.append(getPrefix());
}
if (hasHeadLetter()) {
// DLC FIXME this crap won't be true...
// DLC NOW FIXME this crap won't be true... it's what we must
// convert to, though. Do it.
ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getRootLetter()));
sb.append(getHeadLetter());
@ -1036,8 +1110,8 @@ public class LegalTshegBar
sb.append(EWSUB_wa_zur);
}
if (hasAChungOnRootLetter()) {
ThdlDebug.verify('\u0F71' == EW_achung);
sb.append(EW_achung);
ThdlDebug.verify('\u0F71' == EW_achung_vowel);
sb.append(EW_achung_vowel);
}
if (hasExplicitVowel()) {
sb.append(getVowel());

View file

@ -38,8 +38,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
junit.textui.TestRunner.run(LegalTshegBarTest.class);
}
/** Checks that getThdlWylie() transliterates achung-based suffix
    particles correctly, i.e. that it handles "le'u'i'o", "sgom
    pa'am", "sgom pa'ang", etc.  Each particle is handed to the
    constructor as an (EWC_achung, X) pair inside the suffix
    string.  assertEquals is used rather than
    assertTrue(a.equals(b)) so that a failure reports the
    transliteration that was actually produced. */
public void testGetThdlWylieForLongSuffixLikeThings() {
    // Root letter la with an explicit e vowel, followed by 'u, 'i, 'o.
    assertEquals("le'u'i'o",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la,
                                   EW_ABSENT, false, false,
                                   new String(new char[] {
                                       EWC_achung, EWV_u,
                                       EWC_achung, EWV_i,
                                       EWC_achung, EWV_o
                                   }),
                                   EW_ABSENT, EWV_e).getThdlWylie().toString());

    // A long chain of particles on a root letter with no explicit vowel.
    assertEquals("la'u'i'o'am'ang'o'am",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la,
                                   EW_ABSENT, false, false,
                                   new String(new char[] {
                                       EWC_achung, EWV_u,
                                       EWC_achung, EWV_i,
                                       EWC_achung, EWV_o,
                                       EWC_achung, EWC_ma,
                                       EWC_achung, EWC_nga,
                                       EWC_achung, EWV_o,
                                       EWC_achung, EWC_ma
                                   }),
                                   EW_ABSENT, EW_ABSENT).getThdlWylie().toString());

    // (EWC_achung, EWC_ma) must come out as 'am.
    assertEquals("pa'am",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
                                   EW_ABSENT, false, false,
                                   new String(new char[] { EWC_achung, EWC_ma }),
                                   EW_ABSENT, EW_ABSENT).getThdlWylie().toString());

    // (EWC_achung, EWC_nga) must come out as 'ang.
    assertEquals("pa'ang",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
                                   EW_ABSENT, false, false,
                                   new String(new char[] { EWC_achung, EWC_nga }),
                                   EW_ABSENT, EW_ABSENT).getThdlWylie().toString());
}
/** Tests the getThdlWylie() method and one of the constructors. */
public void testGetThdlWylie() {
// do we disambiguate when needed?
{
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_ga, EWC_ya,
false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("gyo"));
assertTrue(new LegalTshegBar(EWC_ga, EW_ABSENT, EWC_ya, EW_ABSENT,
false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("g.yo"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT,
false, false, EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("b.lag"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT,
false, false, EWC_ga, EWC_sa, EW_ABSENT).getThdlWylie().toString().equals("b.lags"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("b.ragd"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EWC_la,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brlagd"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_ra, EWC_ga, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brgagd"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_ha, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("blhagd"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_da, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("bldagd"));
}
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWC_ra,
false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga,
@ -81,6 +137,10 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
EWC_la, false, false,
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("sla"));
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
EW_ABSENT, false, true,
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pA"));
{
boolean threw = false;
try {
@ -159,4 +219,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
}
assertTrue(x);
}
/** Tests {@link
 *  org.thdl.tib.text.tshegbar.LegalTshegBar#getTheTenSuffixes()}.
 *  The returned string must hold exactly ten characters, with
 *  EWC_ga first, EWC_ba at index 4, and EWC_sa last.  assertEquals
 *  is used so that a failure reports the offending value. */
public void testGetTheTenSuffixes() {
    String suffixes = LegalTshegBar.getTheTenSuffixes();
    assertEquals(10, suffixes.length());
    assertEquals(EWC_ga, suffixes.charAt(0));
    assertEquals(EWC_ba, suffixes.charAt(4));
    assertEquals(EWC_sa, suffixes.charAt(9));
}
/** Tests {@link
 *  org.thdl.tib.text.tshegbar.LegalTshegBar#isAchungBasedSuffix(String)}
 *  against single particles, chains of particles, and several
 *  strings that must be rejected. */
public void testIsAchungBasedSuffix() {
    // The five individual particles.
    final String ang = new String(new char[] { EWC_achung, EWC_nga });
    final String am = new String(new char[] { EWC_achung, EWC_ma });
    final String i = new String(new char[] { EWC_achung, EWV_i });
    final String o = new String(new char[] { EWC_achung, EWV_o });
    final String u = new String(new char[] { EWC_achung, EWV_u });

    // Chains of particles.
    final String uThenIThenO = new String(new char[] {
        EWC_achung, EWV_u,
        EWC_achung, EWV_i,
        EWC_achung, EWV_o
    });
    final String chainEndingInBareNga = new String(new char[] {
        EWC_achung, EWV_u,
        EWC_achung, EWV_i,
        EWC_achung, EWV_o, /* no EWC_achung, */ EWC_nga
    });
    final String longOddChain = new String(new char[] {
        EWC_achung, EWC_ma,
        EWC_achung, EWV_i,
        EWC_achung, EWV_i,
        EWC_achung, EWV_i,
        EWC_achung, EWV_o,
        EWC_achung, EWC_nga,
        EWC_achung, EWV_o
    });

    // Achung followed by something that is not one of the particles.
    final String achungLa = new String(new char[] { EWC_achung, EWC_la });
    final String achungE = new String(new char[] { EWC_achung, EWV_e });

    assertTrue(LegalTshegBar.isAchungBasedSuffix(ang));
    assertTrue(LegalTshegBar.isAchungBasedSuffix(am));
    assertTrue(LegalTshegBar.isAchungBasedSuffix(i));
    assertTrue(LegalTshegBar.isAchungBasedSuffix(o));
    assertTrue(LegalTshegBar.isAchungBasedSuffix(u));
    assertTrue(LegalTshegBar.isAchungBasedSuffix(uThenIThenO));

    // The trailing nga lacks its achung, so the whole string is rejected.
    assertTrue(!LegalTshegBar.isAchungBasedSuffix(chainEndingInBareNga));

    // syntactically illegal, I'd bet, but our algorithm allows it:
    assertTrue(LegalTshegBar.isAchungBasedSuffix(longOddChain));

    assertTrue(!LegalTshegBar.isAchungBasedSuffix(achungLa));
    assertTrue(!LegalTshegBar.isAchungBasedSuffix(achungE));
    assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
}
}

View file

@ -83,20 +83,21 @@ public interface UnicodeConstants {
static final char EWC_za = '\u0F5F';
/** Note the irregular name. The Extended Wylie representation is
<code>'a</code>. */
static final char EWC_achen = '\u0F60'; /* DLC NOW is this achen or achung? achen is EWC_a, right? comment it. replace EWC_achen everywhere if you change it. */
static final char EWC_achung = '\u0F60';
static final char EWC_ya = '\u0F61';
static final char EWC_ra = '\u0F62';
static final char EWC_la = '\u0F63';
static final char EWC_sha = '\u0F64';
static final char EWC_sa = '\u0F66';
static final char EWC_ha = '\u0F67';
/** achen, the 30th consonant (and, some say, the fifth vowel) DLC NOW FIXME: rename to EWC_achen */
static final char EWC_a = '\u0F68';
/** In the word for father, "pA lags", there is an a-chung (i.e.,
<code>\u0F71</code>). This is the constant for that little
guy. */
static final char EW_achung = '\u0F71';
static final char EW_achung_vowel = '\u0F71';
/* Four of the five vowels, some say, or, others say, "the four

View file

@ -127,11 +127,12 @@ public class UnicodeGraphemeCluster
/** Returns the THDL Extended Wylie transliteration of this
grapheme cluster, or null if there is none (which happens for
a few Tibetan codepoints, if you'll recall). If needsVowel is
true, then an "a" will be appended when there is no EW_achung
or explicit simple vowel. If there is an explicit vowel or
EW_achung, it will always be present. Note that needsVowel is
provided because btags is the preferred THDL Extended Wylie
for the four contiguous grapheme clusters
true, then an "a" will be appended when there is no
EW_achung_vowel or explicit simple vowel. If there is an
explicit vowel or EW_achung_vowel, it will always be present.
Note that needsVowel is provided because btags is the
preferred THDL Extended Wylie for the four contiguous grapheme
clusters
<code>"&#92;u0F56&#92;u0F4F&#92;u0F42&#92;u0F66"</code>, and
needsVowel must be set to false for all but the grapheme
cluster corresponding to <code>&#92;u0F4F</code> if you wish
@ -257,7 +258,7 @@ public class UnicodeGraphemeCluster
/** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
This relative height is 0 for a base consonant, digit,
punctuation, mark, or sign. It is -1 for a subjoined
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung_vowel, +1 for
EWV_gigu, and so on according to the height these codepoints
appear relative to one another when on the same stack. If two
codepoints have equal height, they should not exist in the