Fixed a bunch of bugs; supports le'u'i'o, sgom pa'am, etc.
Better tests. As part of that, I had to break TibetanMachineWeb into TibetanMachineWeb+THDLWylieConstants, because I don't want the class-wide initialization code from TibetanMachineWeb causing errors in LegalTshegBarTest.
This commit is contained in:
parent
1987f7d80a
commit
33b3080068
7 changed files with 468 additions and 230 deletions
117
source/org/thdl/tib/text/THDLWylieConstants.java
Normal file
117
source/org/thdl/tib/text/THDLWylieConstants.java
Normal file
|
@ -0,0 +1,117 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text;
|
||||
|
||||
/** This is where basic, static knowledge of THDL's Extended Wylie is housed.
|
||||
* @see org.thdl.tib.text.TibetanMachineWeb */
|
||||
public interface THDLWylieConstants {
|
||||
/**
|
||||
* the Wylie for bindu/anusvara
|
||||
*/
|
||||
public static final char BINDU = 'M';
|
||||
/**
|
||||
* the Wylie for tsheg
|
||||
*/
|
||||
public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts
|
||||
/**
|
||||
* the Wylie for whitespace
|
||||
*/
|
||||
public static final char SPACE = '_'; //this character occurs in all ten TMW fonts
|
||||
/**
|
||||
* the Sanskrit stacking separator used in Extended Wylie
|
||||
*/
|
||||
public static final char WYLIE_SANSKRIT_STACKING_KEY = '+';
|
||||
/**
|
||||
* the Wylie disambiguating key, as a char
|
||||
*/
|
||||
public static final char WYLIE_DISAMBIGUATING_KEY = '.';
|
||||
/**
|
||||
* the Wylie for the invisible 'a' vowel
|
||||
*/
|
||||
public static final String WYLIE_aVOWEL = "a";
|
||||
/**
|
||||
* the Wylie for achung, as a char
|
||||
*/
|
||||
public static final char ACHUNG_character = '\'';
|
||||
/**
|
||||
* the Wylie for achung, as a String
|
||||
*/
|
||||
public static final String ACHUNG
|
||||
= new String(new char[] { ACHUNG_character });
|
||||
/**
|
||||
* the Wylie for the 28th of the 30 consonants, sa:
|
||||
*/
|
||||
public static final String SA = "s";
|
||||
/**
|
||||
* the Wylie for the 16th of the 30 consonants, ma:
|
||||
*/
|
||||
public static final String MA = "m";
|
||||
/**
|
||||
* the Wylie for the 4th of the 30 consonants, nga:
|
||||
*/
|
||||
public static final String NGA = "ng";
|
||||
/**
|
||||
* the Wylie for achen
|
||||
*/
|
||||
public static final String ACHEN = "a";
|
||||
/**
|
||||
* the Wylie for gigu
|
||||
*/
|
||||
public static final String i_VOWEL = "i";
|
||||
/**
|
||||
* the Wylie for zhebju
|
||||
*/
|
||||
public static final String u_VOWEL = "u";
|
||||
/**
|
||||
* the Wylie for drengbu
|
||||
*/
|
||||
public static final String e_VOWEL = "e";
|
||||
/**
|
||||
* the Wylie for naro
|
||||
*/
|
||||
public static final String o_VOWEL = "o";
|
||||
/**
|
||||
* the Wylie for double drengbu
|
||||
*/
|
||||
public static final String ai_VOWEL = "ai";
|
||||
/**
|
||||
* the Wylie for double naro
|
||||
*/
|
||||
public static final String au_VOWEL = "au";
|
||||
/**
|
||||
* the Wylie for the subscript achung vowel
|
||||
*/
|
||||
public static final String A_VOWEL = "A";
|
||||
/**
|
||||
* the Wylie for log yig gigu
|
||||
*/
|
||||
public static final String reverse_i_VOWEL = "-i";
|
||||
/**
|
||||
* the Wylie for the vowel achung + gigu
|
||||
*/
|
||||
public static final String I_VOWEL = "I";
|
||||
/**
|
||||
* the Wylie for the vowel achung + zhebju
|
||||
*/
|
||||
public static final String U_VOWEL = "U";
|
||||
/**
|
||||
* the Wylie for the vowel achung + log yig gigu
|
||||
*/
|
||||
public static final String reverse_I_VOWEL = "-I";
|
||||
}
|
|
@ -28,7 +28,8 @@ import org.thdl.util.ThdlDebug;
|
|||
|
||||
/**
|
||||
* Provides methods for converting back and forth between Extended
|
||||
* Wylie and TibetanMachineWeb. This class is not instantiable.
|
||||
* Wylie and Tibetan represented in TibetanMachineWeb glyphs. This
|
||||
* class is not instantiable.
|
||||
*
|
||||
* <p>
|
||||
* The class provides a variety of static methods for converting
|
||||
|
@ -37,7 +38,7 @@ import org.thdl.util.ThdlDebug;
|
|||
* be exported as Rich Text Format.
|
||||
*
|
||||
* @author Edward Garrett, Tibetan and Himalayan Digital Library */
|
||||
public class TibTextUtils {
|
||||
public class TibTextUtils implements THDLWylieConstants {
|
||||
/** Do not use this constructor. */
|
||||
private TibTextUtils() { super(); }
|
||||
|
||||
|
@ -255,11 +256,11 @@ public class TibTextUtils {
|
|||
if (k < 32) //if the character is just formatting, return it unchanged as a String
|
||||
return String.valueOf(c);
|
||||
|
||||
if (c == TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY)
|
||||
return String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
if (c == WYLIE_DISAMBIGUATING_KEY)
|
||||
return String.valueOf(WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
if (c == TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY)
|
||||
return String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY);
|
||||
if (c == WYLIE_SANSKRIT_STACKING_KEY)
|
||||
return String.valueOf(WYLIE_SANSKRIT_STACKING_KEY);
|
||||
|
||||
for (i=offset+1; i<wylie.length()+1; i++) {
|
||||
s = wylie.substring(offset, i);
|
||||
|
@ -332,7 +333,7 @@ public class TibTextUtils {
|
|||
|
||||
chars.clear();
|
||||
|
||||
if (next.equals(String.valueOf(TibetanMachineWeb.BINDU))) {
|
||||
if (next.equals(String.valueOf(BINDU))) {
|
||||
if (glyphs.isEmpty())
|
||||
dc = null;
|
||||
else
|
||||
|
@ -369,7 +370,7 @@ public class TibTextUtils {
|
|||
break vowel_block;
|
||||
}
|
||||
}
|
||||
DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.ACHEN);
|
||||
DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(ACHEN);
|
||||
dc = dc_array[TibetanMachineWeb.TMW];
|
||||
glyphs.addAll(getVowel(dc, next));
|
||||
}
|
||||
|
@ -398,7 +399,7 @@ public class TibTextUtils {
|
|||
}
|
||||
}
|
||||
|
||||
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY))) {
|
||||
else if (next.equals(String.valueOf(WYLIE_DISAMBIGUATING_KEY))) {
|
||||
if (!chars.isEmpty())
|
||||
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
|
||||
|
||||
|
@ -406,7 +407,7 @@ public class TibTextUtils {
|
|||
isSanskrit = false;
|
||||
}
|
||||
|
||||
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY))) {
|
||||
else if (next.equals(String.valueOf(WYLIE_SANSKRIT_STACKING_KEY))) {
|
||||
if (!isSanskrit) { //begin sanskrit stack
|
||||
switch (chars.size()) {
|
||||
case 0:
|
||||
|
@ -475,13 +476,13 @@ public class TibTextUtils {
|
|||
List bindus = new ArrayList();
|
||||
|
||||
if (null == dc) {
|
||||
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(TibetanMachineWeb.BINDU)));
|
||||
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
|
||||
return bindus;
|
||||
}
|
||||
|
||||
if (!TibetanMachineWeb.getBinduMap().containsKey(dc)) {
|
||||
bindus.add(dc);
|
||||
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(TibetanMachineWeb.BINDU)));
|
||||
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
|
||||
return bindus;
|
||||
}
|
||||
|
||||
|
@ -524,7 +525,7 @@ public class TibTextUtils {
|
|||
//this vowel doesn't correspond to a glyph -
|
||||
//so you just return the original context
|
||||
|
||||
if ( vowel.equals(TibetanMachineWeb.WYLIE_aVOWEL) ||
|
||||
if ( vowel.equals(WYLIE_aVOWEL) ||
|
||||
TibetanMachineWeb.isTopVowel(context_2)) {
|
||||
if (context_1 != null)
|
||||
vowels.add(context_1);
|
||||
|
@ -537,34 +538,34 @@ public class TibTextUtils {
|
|||
//these vowels have one invariant form - therefore,
|
||||
//dc_context is just returned along with that form
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.ai_VOWEL)) {
|
||||
if (vowel.equals(ai_VOWEL)) {
|
||||
if (context_1 != null)
|
||||
vowels.add(context_1);
|
||||
|
||||
vowels.add(context_2);
|
||||
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.ai_VOWEL);
|
||||
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(ai_VOWEL);
|
||||
vowels.add(dc_v[TibetanMachineWeb.TMW]);
|
||||
return vowels;
|
||||
}
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.au_VOWEL)) {
|
||||
if (vowel.equals(au_VOWEL)) {
|
||||
if (context_1 != null)
|
||||
vowels.add(context_1);
|
||||
|
||||
vowels.add(context_2);
|
||||
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.au_VOWEL);
|
||||
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(au_VOWEL);
|
||||
vowels.add(dc_v[TibetanMachineWeb.TMW]);
|
||||
return vowels;
|
||||
}
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
|
||||
if (vowel.equals(reverse_i_VOWEL)) {
|
||||
if (context_1 != null)
|
||||
vowels.add(context_1);
|
||||
|
||||
vowels.add(context_2);
|
||||
|
||||
if (!TibetanMachineWeb.isTopVowel(context_2)) {
|
||||
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.reverse_i_VOWEL);
|
||||
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(reverse_i_VOWEL);
|
||||
vowels.add(dc_v[TibetanMachineWeb.TMW]);
|
||||
}
|
||||
|
||||
|
@ -578,7 +579,7 @@ public class TibTextUtils {
|
|||
//returned along with the vowel appropriate to
|
||||
//that context
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.i_VOWEL)) {
|
||||
if (vowel.equals(i_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_i);
|
||||
if (null == dc_v && null != context_1) {
|
||||
|
@ -597,7 +598,7 @@ public class TibTextUtils {
|
|||
return vowels;
|
||||
}
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.e_VOWEL)) {
|
||||
if (vowel.equals(e_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_e);
|
||||
if (null == dc_v && null != context_1) {
|
||||
|
@ -616,7 +617,7 @@ public class TibTextUtils {
|
|||
return vowels;
|
||||
}
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.o_VOWEL)) {
|
||||
if (vowel.equals(o_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_o);
|
||||
if (null == dc_v && null != context_1) {
|
||||
|
@ -641,7 +642,7 @@ public class TibTextUtils {
|
|||
//both u and A cannot be affixed to ordinary k or g, but
|
||||
//rather the shortened versions of k and g - therefore,
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.u_VOWEL)) {
|
||||
if (vowel.equals(u_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
|
||||
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_u);
|
||||
|
@ -660,7 +661,7 @@ public class TibTextUtils {
|
|||
return vowels;
|
||||
}
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.A_VOWEL)) {
|
||||
if (vowel.equals(A_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
|
||||
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
|
||||
|
@ -680,7 +681,7 @@ public class TibTextUtils {
|
|||
return vowels;
|
||||
}
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.U_VOWEL)) {
|
||||
if (vowel.equals(U_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
|
||||
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_U);
|
||||
|
@ -704,7 +705,7 @@ public class TibTextUtils {
|
|||
//require a change from the previous character,
|
||||
//and consist of two glyphs themselves
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.I_VOWEL)) {
|
||||
if (vowel.equals(I_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
|
||||
DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
|
||||
|
@ -726,11 +727,11 @@ public class TibTextUtils {
|
|||
return vowels;
|
||||
}
|
||||
|
||||
if (vowel.equals(TibetanMachineWeb.reverse_I_VOWEL)) {
|
||||
if (vowel.equals(reverse_I_VOWEL)) {
|
||||
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
|
||||
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
|
||||
DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
|
||||
DuffCode[] tv_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.reverse_i_VOWEL);
|
||||
DuffCode[] tv_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(reverse_i_VOWEL);
|
||||
DuffCode dc_v_sup = tv_array[TibetanMachineWeb.TMW];
|
||||
|
||||
if (null != context_1)
|
||||
|
@ -766,10 +767,10 @@ public class TibTextUtils {
|
|||
|
||||
/** Returns "a", unless wylie is already "a", in which case the empty string is returned. */
|
||||
private static String aVowelToUseAfter(String wylie) {
|
||||
if (wylie.equals(TibetanMachineWeb.ACHEN))
|
||||
if (wylie.equals(ACHEN))
|
||||
return "";
|
||||
else
|
||||
return TibetanMachineWeb.WYLIE_aVOWEL;
|
||||
return WYLIE_aVOWEL;
|
||||
}
|
||||
|
||||
private static String unambiguousPostAVowelWylie(String wylie1,
|
||||
|
@ -781,7 +782,7 @@ public class TibTextUtils {
|
|||
if (TibetanMachineWeb.isWylieTop(wylie1)
|
||||
&& wylie2.equals(/* FIXME: hard-coded */ "d"))
|
||||
disambiguator
|
||||
= new String(new char[] { TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY });
|
||||
= new String(new char[] { WYLIE_DISAMBIGUATING_KEY });
|
||||
return wylie1 + disambiguator + wylie2;
|
||||
}
|
||||
|
||||
|
@ -831,13 +832,13 @@ public class TibTextUtils {
|
|||
}
|
||||
|
||||
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
if (!wylie.equals(TibetanMachineWeb.ACHEN)) {
|
||||
if (!wylie.equals(ACHEN)) {
|
||||
sb.append(wylie);
|
||||
sb.append(TibetanMachineWeb.WYLIE_aVOWEL);
|
||||
sb.append(WYLIE_aVOWEL);
|
||||
} else {
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(wylie);
|
||||
}
|
||||
}
|
||||
|
@ -861,12 +862,12 @@ public class TibTextUtils {
|
|||
StringBuffer tailEndWylie = null;
|
||||
int effectiveSize = size - 2;
|
||||
while (effectiveSize >= 0
|
||||
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(TibetanMachineWeb.ACHUNG)) {
|
||||
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(ACHUNG)) {
|
||||
if (null == tailEndWylie) tailEndWylie = new StringBuffer();
|
||||
// prepend:
|
||||
tailEndWylie.insert(0,
|
||||
TibetanMachineWeb.ACHUNG
|
||||
+ aVowelToUseAfter(TibetanMachineWeb.ACHUNG)
|
||||
ACHUNG
|
||||
+ aVowelToUseAfter(ACHUNG)
|
||||
+ TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1)));
|
||||
effectiveSize -= 2;
|
||||
}
|
||||
|
@ -893,8 +894,8 @@ public class TibTextUtils {
|
|||
for (int i = 0; i < size; i++) {
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
|
||||
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
|
||||
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|| (i != 0 && wylie.equals(ACHEN)))
|
||||
sb.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
sb.append(wylie + aVowelToUseAfter(wylie));
|
||||
lastWylie = wylie;
|
||||
|
@ -907,8 +908,8 @@ public class TibTextUtils {
|
|||
while (i+2 < size) {
|
||||
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
|
||||
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
|
||||
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN)))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
|| (i != 0 && wylie.equals(ACHEN)))
|
||||
sb.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
sb.append(wylie);
|
||||
lastWylie = wylie;
|
||||
|
@ -933,7 +934,11 @@ public class TibTextUtils {
|
|||
if (TibetanMachineWeb.isWylieLeft(wylie0)) {
|
||||
/* is it ambiguous? */
|
||||
if (TibetanMachineWeb.isWylieRight(wylie1)
|
||||
&& TibetanMachineWeb.SA.equals(wylie2)) {
|
||||
&& SA.equals(wylie2) /* isWylieFarRight would
|
||||
* work, but the list of
|
||||
* 9 words doesn't have
|
||||
* any ending with d --
|
||||
* all end with s. */) {
|
||||
/* Yes, this is ambiguous. How do we handle it? See this from Andres:
|
||||
|
||||
I'm posting this upon David Chandler's request. According to Lobsang
|
||||
|
@ -1001,14 +1006,14 @@ public class TibTextUtils {
|
|||
// }
|
||||
// }
|
||||
// if (disambiguatorNeeded)
|
||||
// sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
// sb.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
|
||||
} else {
|
||||
/* no ambiguity. the "a" vowel comes after
|
||||
* wylie1. */
|
||||
if (TibetanMachineWeb.isAmbiguousWylie(wylie0, wylie1))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(wylie1
|
||||
+ aVowelToUseAfter(wylie1)
|
||||
+ wylie2);
|
||||
|
@ -1069,8 +1074,8 @@ public class TibTextUtils {
|
|||
|
||||
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, currWylie)
|
||||
|| (!lastWylie.equals("")
|
||||
&& currWylie.equals(TibetanMachineWeb.ACHEN)))
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
&& currWylie.equals(ACHEN)))
|
||||
sb.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
sb.append(currWylie);
|
||||
|
||||
|
@ -1125,7 +1130,7 @@ public class TibTextUtils {
|
|||
wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
|
||||
|
||||
boolean containsBindu = false;
|
||||
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) {
|
||||
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == BINDU) {
|
||||
char[] cArray = wylie.toCharArray();
|
||||
wylie = new String(cArray, 0, wylie.length()-1);
|
||||
containsBindu = true;
|
||||
|
@ -1157,18 +1162,18 @@ public class TibTextUtils {
|
|||
} else if (TibetanMachineWeb.isWylieVowel(wylie)) {
|
||||
if (isLastVowel) {
|
||||
int len = wylieBuffer.length();
|
||||
int A_len = TibetanMachineWeb.A_VOWEL.length();
|
||||
int A_len = A_VOWEL.length();
|
||||
|
||||
if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) {
|
||||
if (wylieBuffer.substring(len-A_len).equals(A_VOWEL)) {
|
||||
try {
|
||||
if (wylie.equals(TibetanMachineWeb.i_VOWEL)) {
|
||||
if (wylie.equals(i_VOWEL)) {
|
||||
wylieBuffer.delete(len-A_len, len);
|
||||
wylieBuffer.append(TibetanMachineWeb.I_VOWEL);
|
||||
wylieBuffer.append(I_VOWEL);
|
||||
isLastVowel = false;
|
||||
break process_block;
|
||||
} else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) {
|
||||
} else if (wylie.equals(reverse_i_VOWEL)) {
|
||||
wylieBuffer.delete(len-A_len, len);
|
||||
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL);
|
||||
wylieBuffer.append(reverse_I_VOWEL);
|
||||
isLastVowel = false;
|
||||
break process_block;
|
||||
}
|
||||
|
@ -1189,7 +1194,7 @@ public class TibTextUtils {
|
|||
DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
|
||||
String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
|
||||
|
||||
if (top_wylie.equals(TibetanMachineWeb.ACHEN)) {
|
||||
if (top_wylie.equals(ACHEN)) {
|
||||
glyphList.remove(glyphCount-1);
|
||||
|
||||
if (glyphCount-1 == 0) {
|
||||
|
@ -1200,7 +1205,7 @@ public class TibTextUtils {
|
|||
}
|
||||
}
|
||||
|
||||
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) {
|
||||
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(ACHUNG)) {
|
||||
String thisPart = withoutA(glyphList);
|
||||
wylieBuffer.append(thisPart); //append consonants in glyphList
|
||||
} else {
|
||||
|
@ -1212,12 +1217,12 @@ public class TibTextUtils {
|
|||
wylieBuffer.append(thisPart);
|
||||
}
|
||||
|
||||
wylieBuffer.append(TibetanMachineWeb.ACHUNG);
|
||||
wylieBuffer.append(ACHUNG);
|
||||
}
|
||||
}
|
||||
|
||||
if (insertDisAmbig)
|
||||
wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY);
|
||||
|
||||
wylieBuffer.append(wylie); //append vowel
|
||||
|
||||
|
@ -1234,7 +1239,7 @@ public class TibTextUtils {
|
|||
if (containsBindu) {
|
||||
isLastVowel = false;
|
||||
wylieBuffer.append(withoutA(glyphList));
|
||||
wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu
|
||||
wylieBuffer.append(BINDU); //append the bindu
|
||||
glyphList.clear();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the
|
|||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2001 THDL.
|
||||
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
|
@ -44,7 +44,7 @@ import org.thdl.util.ThdlOptions;
|
|||
* @version 1.0
|
||||
*/
|
||||
// FIXME: for speed, make either this class, its methods, or both, final?
|
||||
public class TibetanMachineWeb {
|
||||
public class TibetanMachineWeb implements THDLWylieConstants {
|
||||
/** This addresses bug 624133, "Input freezes after impossible
|
||||
* character". The input sequences that are valid in Extended
|
||||
* Wylie. For example, "Sh" will be in this container, but "S"
|
||||
|
@ -109,86 +109,6 @@ public class TibetanMachineWeb {
|
|||
"TibetanMachineWeb9".intern()
|
||||
};
|
||||
/**
|
||||
* the Wylie for bindu/anusvara
|
||||
*/
|
||||
public static final char BINDU = 'M';
|
||||
/**
|
||||
* the Wylie for tsheg
|
||||
*/
|
||||
public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts
|
||||
/**
|
||||
* the Wylie for whitespace
|
||||
*/
|
||||
public static final char SPACE = '_'; //this character occurs in all ten TMW fonts
|
||||
/**
|
||||
* the Sanskrit stacking separator used in Extended Wylie
|
||||
*/
|
||||
public static final char WYLIE_SANSKRIT_STACKING_KEY = '+';
|
||||
/**
|
||||
* the Wylie disambiguating key, as a char
|
||||
*/
|
||||
public static final char WYLIE_DISAMBIGUATING_KEY = '.';
|
||||
/**
|
||||
* the Wylie for the invisible 'a' vowel
|
||||
*/
|
||||
public static final String WYLIE_aVOWEL = "a";
|
||||
/**
|
||||
* the Wylie for achung
|
||||
*/
|
||||
public static final String ACHUNG = "'";
|
||||
/**
|
||||
* the Wylie for the 28th of the 30 consonants, sa:
|
||||
*/
|
||||
public static final String SA = "s";
|
||||
/**
|
||||
* the Wylie for achen
|
||||
*/
|
||||
public static final String ACHEN = "a";
|
||||
/**
|
||||
* the Wylie for gigu
|
||||
*/
|
||||
public static final String i_VOWEL = "i";
|
||||
/**
|
||||
* the Wylie for zhebju
|
||||
*/
|
||||
public static final String u_VOWEL = "u";
|
||||
/**
|
||||
* the Wylie for drengbu
|
||||
*/
|
||||
public static final String e_VOWEL = "e";
|
||||
/**
|
||||
* the Wylie for naro
|
||||
*/
|
||||
public static final String o_VOWEL = "o";
|
||||
/**
|
||||
* the Wylie for double drengbu
|
||||
*/
|
||||
public static final String ai_VOWEL = "ai";
|
||||
/**
|
||||
* the Wylie for double naro
|
||||
*/
|
||||
public static final String au_VOWEL = "au";
|
||||
/**
|
||||
* the Wylie for the subscript achung vowel
|
||||
*/
|
||||
public static final String A_VOWEL = "A";
|
||||
/**
|
||||
* the Wylie for log yig gigu
|
||||
*/
|
||||
public static final String reverse_i_VOWEL = "-i";
|
||||
/**
|
||||
* the Wylie for the vowel achung + gigu
|
||||
*/
|
||||
public static final String I_VOWEL = "I";
|
||||
/**
|
||||
* the Wylie for the vowel achung + zhebju
|
||||
*/
|
||||
public static final String U_VOWEL = "U";
|
||||
/**
|
||||
* the Wylie for the vowel achung + log yig gigu
|
||||
*/
|
||||
public static final String reverse_I_VOWEL = "-I";
|
||||
/**
|
||||
* represents where in an array of DuffCodes you
|
||||
* find the TibetanMachine equivalence of a glyph
|
||||
*/
|
||||
|
|
|
@ -18,7 +18,7 @@ Contributor(s): ______________________________________.
|
|||
|
||||
package org.thdl.tib.text.tshegbar;
|
||||
|
||||
import org.thdl.tib.text.TibetanMachineWeb;
|
||||
import org.thdl.tib.text.THDLWylieConstants;
|
||||
import org.thdl.util.ThdlDebug;
|
||||
|
||||
/** <p>A LegalTshegBar is a simple Tibetan syllable or a syllable with
|
||||
|
@ -29,7 +29,7 @@ import org.thdl.util.ThdlDebug;
|
|||
* <ul>
|
||||
*
|
||||
* <li>It contains at most one prefix, which must be one of {EWC_ga,
|
||||
* EWC_da, EWC_ba, EWC_ma, EWC_achen} and must be prefixable to the
|
||||
* EWC_da, EWC_ba, EWC_ma, EWC_achung} and must be prefixable to the
|
||||
* root letter.</li>
|
||||
*
|
||||
* <li>It contains no vocalic modifications</li>
|
||||
|
@ -39,12 +39,11 @@ import org.thdl.util.ThdlDebug;
|
|||
*
|
||||
* <li>It contains at most one vowel from the set {EWV_a, EWV_i,
|
||||
* EWV_e, EWV_u}, and that vowel is on the root stack. The one
|
||||
* exception is that a 'i suffix is permitted (this is a connective
|
||||
* case marker).</li>
|
||||
* exception is that 'i (i.e., the connective case marker), 'u, and
|
||||
* 'o suffixes are permitted.</li>
|
||||
*
|
||||
* <li>It has at most one suffix, which is a single consonant or the
|
||||
* special connective case marker 'i (i.e.,
|
||||
* <code>"\u0F60\u0F72"</code>).</li>
|
||||
* <li>It has at most one suffix, which is a single consonant or a
|
||||
* string consisting of 'i, 'u, 'o, 'am, and 'ang.</li>
|
||||
*
|
||||
*
|
||||
DLC FIXME: we must allow many suffixes. See Andres' e-mail below:
|
||||
|
@ -69,10 +68,8 @@ And also there are cases where they combine. For ex you can have
|
|||
*
|
||||
*
|
||||
* <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists
|
||||
* a suffix (and a suffix that is not the special connective case
|
||||
* marker 'i (i.e., <code>"\u0F60\u0F72"</code>) (DLC FIXME: 'o and
|
||||
* 'am maybe? I asked in the "Embarrasing error in wylie conversion"
|
||||
* bug report.).</li>
|
||||
* a suffix (and a suffix that is not based on 'i, 'o, 'u, 'am, and
|
||||
* 'ang).</li>
|
||||
*
|
||||
* <li>The root stack follows the rules of Tibetan syntax, meaning
|
||||
* that the following holds:
|
||||
|
@ -112,7 +109,7 @@ And also there are cases where they combine. For ex you can have
|
|||
* e.g. p. 548.</p>
|
||||
*
|
||||
* @author David Chandler */
|
||||
public class LegalTshegBar
|
||||
public final class LegalTshegBar
|
||||
extends TshegBar
|
||||
implements UnicodeConstants
|
||||
{
|
||||
|
@ -129,8 +126,8 @@ public class LegalTshegBar
|
|||
private boolean hasWaZur;
|
||||
/** true iff an achung is under the root syllable. */
|
||||
private boolean hasAChung;
|
||||
/** If this is a string, it is of a single codepoint or is equal
|
||||
* to {@link #getConnectiveCaseSuffix()} */
|
||||
/** If this is a string, it is of a single codepoint or is a
|
||||
* string formed from 'i, 'o, 'u, 'am, and 'ang. */
|
||||
private String suffix;
|
||||
/** EW_da, EW_sa, or EW_ABSENT */
|
||||
private char postsuffix;
|
||||
|
@ -236,24 +233,24 @@ public class LegalTshegBar
|
|||
}
|
||||
|
||||
/** Returns null if there is no suffix, or a string containing the
|
||||
* one consonant or a string <code>"\u0F60\u0F72"</code>
|
||||
* containing two codepoints in the special case that the suffix
|
||||
* is that connective case marker {@link
|
||||
* #getConnectiveCaseSuffix()}. */
|
||||
* one consonant or a string like <code>"\u0F60\u0F72"</code>
|
||||
* in the case that the suffix
|
||||
* is 'i, 'u'i'o, 'am, 'ang, etc. */
|
||||
public String getSuffix() {
|
||||
return suffix;
|
||||
}
|
||||
|
||||
/** Returns true iff there is a suffixed consonant or a suffixed
|
||||
* <code>'i</code> (DLC FIXME). */
|
||||
* string consisting of 'i, 'u, 'o, 'am, and 'ang. */
|
||||
public boolean hasSuffix() {
|
||||
return (null != suffix);
|
||||
}
|
||||
|
||||
/** Returns true iff there is a single, suffixed consonant. This
|
||||
means that suffixes like <code>'am</code>, <code>'i</code>,
|
||||
<code>'u</code>, and <code>'o</code> are not present, but this
|
||||
does not rule out the presence of a postsuffix. */
|
||||
means that suffixes made from <code>'am</code>,
|
||||
<code>'ang</code>, <code>'i</code>, <code>'u</code>, and
|
||||
<code>'o</code> are not present, but this does not rule out
|
||||
the presence of a postsuffix. */
|
||||
public boolean hasSimpleSuffix() {
|
||||
return ((null != suffix) && (1 == suffix.length()));
|
||||
}
|
||||
|
@ -280,12 +277,6 @@ public class LegalTshegBar
|
|||
return (EW_ABSENT != postsuffix);
|
||||
}
|
||||
|
||||
/** Returns true iff this syllable has a <code>'i</code>
|
||||
* suffix. */
|
||||
public boolean hasConnectiveCaseMarkerSuffix() {
|
||||
return getSuffix().equals(getConnectiveCaseSuffix());
|
||||
}
|
||||
|
||||
/** Returns the root consonant. */
|
||||
public char getRootLetter() {
|
||||
return rootLetter;
|
||||
|
@ -324,7 +315,7 @@ public class LegalTshegBar
|
|||
|
||||
private final static String possibleSuffixes
|
||||
= new String(new char[] {
|
||||
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achen,
|
||||
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achung,
|
||||
EWC_ra, EWC_la, EWC_sa
|
||||
});
|
||||
|
||||
|
@ -340,18 +331,6 @@ public class LegalTshegBar
|
|||
// EWSUB_ra_btags.
|
||||
}
|
||||
|
||||
private final static String connectiveCaseSuffix
|
||||
= new String(new char[] {
|
||||
EWC_achen, EWV_i
|
||||
});
|
||||
|
||||
/** Returns a two-codepoint string consisting of the Unicode
|
||||
* representation of what THDL Extended Wylie calls
|
||||
* <code>'i</code>. */
|
||||
public static String getConnectiveCaseSuffix() {
|
||||
return connectiveCaseSuffix;
|
||||
}
|
||||
|
||||
private final static String thirtyConsonants
|
||||
= new String(new char[] {
|
||||
EWC_ga, EWC_kha, EWC_ga, EWC_nga,
|
||||
|
@ -359,7 +338,7 @@ public class LegalTshegBar
|
|||
EWC_ta, EWC_tha, EWC_da, EWC_na,
|
||||
EWC_pa, EWC_pha, EWC_ba, EWC_ma,
|
||||
EWC_tsa, EWC_tsha, EWC_dza, EWC_wa,
|
||||
EWC_zha, EWC_za, EWC_achen, EWC_ya,
|
||||
EWC_zha, EWC_za, EWC_achung, EWC_ya,
|
||||
EWC_ra, EWC_la, EWC_sha, EWC_sa,
|
||||
EWC_ha, EWC_a
|
||||
});
|
||||
|
@ -388,10 +367,10 @@ public class LegalTshegBar
|
|||
<p>This is not very efficient.</p> */
|
||||
public static String[] getPossibleSuffixParticles() {
|
||||
return new String[] {
|
||||
new String(new char[] { EWC_achen, EWV_i }),
|
||||
new String(new char[] { EWC_achen, EWV_o }),
|
||||
new String(new char[] { EWC_achen, EWV_u }),
|
||||
new String(new char[] { EWC_achen, EWC_ma }),
|
||||
new String(new char[] { EWC_achung, EWV_i }),
|
||||
new String(new char[] { EWC_achung, EWV_o }),
|
||||
new String(new char[] { EWC_achung, EWV_u }),
|
||||
new String(new char[] { EWC_achung, EWC_ma }),
|
||||
};
|
||||
}
|
||||
|
||||
|
@ -402,7 +381,7 @@ public class LegalTshegBar
|
|||
* @see org.thdl.tib.text.tshegbar.UnicodeConstants */
|
||||
public static String getTheFivePrefixes() {
|
||||
final String s = new String(new char[] {
|
||||
EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achen
|
||||
EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achung
|
||||
});
|
||||
ThdlDebug.verify(s.length() == 5); // DLC put this into a JUnit test to avoid the slow-down.
|
||||
return s;
|
||||
|
@ -416,27 +395,104 @@ public class LegalTshegBar
|
|||
|
||||
/** Returns a String containing the nominal Unicode
|
||||
* representations of the ten suffixes. The suffixes are in
|
||||
* dictionary order.
|
||||
* @see #getConnectiveCaseSuffix()
|
||||
* dictionary order. This doesn't include oddballs like suffixes
|
||||
* based on 'i, 'u, 'o, 'am, and 'ang.
|
||||
* @see org.thdl.tib.text.tshegbar.UnicodeConstants */
|
||||
public static String getTheTenSuffixes() {
|
||||
final String s = new String(new char[] {
|
||||
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba,
|
||||
EWC_ma, EWC_achen, EWC_ra, EWC_la, EWC_sa
|
||||
EWC_ma, EWC_achung, EWC_ra, EWC_la, EWC_sa
|
||||
});
|
||||
ThdlDebug.verify(s.length() == 10); // DLC put this into a JUnit test to avoid the slow-down.
|
||||
return s;
|
||||
}
|
||||
|
||||
/** Returns true iff x is the preferred, nominal Unicode
|
||||
* representation of one of the ten suffixes.
|
||||
* @see #getConnectiveCaseSuffix()
|
||||
*/
|
||||
public static boolean isNominalRepresentationOfSimpleSuffix(char x) {
|
||||
return (-1 != getTheTenSuffixes().indexOf(x));
|
||||
}
|
||||
|
||||
|
||||
/** Legal suffix-like particles, excluding the ten suffixes. If
|
||||
* you add one, be sure that a tsheg-bar with it has the extended
|
||||
* wylie you wish by adding the correct extended Wylie with it. */
|
||||
private static final String[][] oddball_suffixes = new String[][] {
|
||||
{
|
||||
// connective case marker:
|
||||
new String( new char[] {
|
||||
EWC_achung, EWV_i
|
||||
}),
|
||||
THDLWylieConstants.ACHUNG + THDLWylieConstants.i_VOWEL
|
||||
},
|
||||
{
|
||||
new String( new char[] {
|
||||
EWC_achung, EWV_u
|
||||
}),
|
||||
THDLWylieConstants.ACHUNG + THDLWylieConstants.u_VOWEL
|
||||
},
|
||||
{
|
||||
// in at least one context, this shows end of sentence:
|
||||
new String( new char[] {
|
||||
EWC_achung, EWV_o
|
||||
}),
|
||||
THDLWylieConstants.ACHUNG + THDLWylieConstants.o_VOWEL
|
||||
},
|
||||
{
|
||||
// as in sgom pa'am:
|
||||
new String( new char[] {
|
||||
EWC_achung, EWC_ma
|
||||
}),
|
||||
THDLWylieConstants.ACHUNG + THDLWylieConstants.WYLIE_aVOWEL
|
||||
+ THDLWylieConstants.MA
|
||||
},
|
||||
{
|
||||
// meaning or, as opposed to and:
|
||||
new String( new char[] {
|
||||
EWC_achung, EWC_nga
|
||||
}),
|
||||
THDLWylieConstants.ACHUNG + THDLWylieConstants.WYLIE_aVOWEL
|
||||
+ THDLWylieConstants.NGA
|
||||
}
|
||||
};
|
||||
|
||||
/** Returns true iff suffix is 'i, 'o, 'u, 'am, 'ang, or a
|
||||
* concatenation like 'u'i'o. Returns false otherwise (including
|
||||
* the case that suffix is the empty string). */
|
||||
public static boolean isAchungBasedSuffix(String suffix) {
|
||||
int i = 0; // so that the empty string causes false to be returned.
|
||||
while (i == 0 || !suffix.equals("")) {
|
||||
boolean startsWithOneOfThem = false;
|
||||
for (int x = 0; x < oddball_suffixes.length; x++) {
|
||||
if (suffix.startsWith(oddball_suffixes[x][0])) {
|
||||
startsWithOneOfThem = true;
|
||||
suffix = suffix.substring(oddball_suffixes[x][0].length());
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!startsWithOneOfThem)
|
||||
return false;
|
||||
++i;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
private static String getTHDLWylieForOddballSuffix(String suffix) {
|
||||
// FIXME: assert that isAchungBasedSuffix
|
||||
StringBuffer wylie = new StringBuffer();
|
||||
while (!suffix.equals("")) {
|
||||
for (int x = 0; x < oddball_suffixes.length; x++) {
|
||||
if (suffix.startsWith(oddball_suffixes[x][0])) {
|
||||
wylie.append(oddball_suffixes[x][1]);
|
||||
suffix = suffix.substring(oddball_suffixes[x][0].length());
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return wylie.toString();
|
||||
}
|
||||
|
||||
|
||||
/** Returns true iff the given (rootLetter, subjoinedLetter)
|
||||
combination can accept an additional wa-zur. Only g-r-w,
|
||||
d-r-w, and ph-y-w fall into this category according to
|
||||
|
@ -595,8 +651,8 @@ public class LegalTshegBar
|
|||
* @param subjoinedLetter the optional, subscribed consonant
|
||||
* @param suffix the optional suffix, which is null, a String
|
||||
* consisting of a single consonant (i.e. a single,
|
||||
* nondecomposable codepoint) except in the special case that
|
||||
* this is {@link #getConnectiveCaseSuffix()}
|
||||
* nondecomposable codepoint), or a string of 'i, 'u, 'o, 'am,
|
||||
* and 'ang.
|
||||
* @param postsuffix the optional postsuffix, which should be
|
||||
* EWC_sa or EWC_da
|
||||
* @param errorBuffer if non-null, and if the return code is
|
||||
|
@ -763,13 +819,12 @@ public class LegalTshegBar
|
|||
} // subjoinedLetter tests
|
||||
|
||||
// Suffix tests:
|
||||
// DLC NOW -- allow 'o, 'u, 'am, etc.
|
||||
if (null != suffix) {
|
||||
if (!getConnectiveCaseSuffix().equals(suffix)) {
|
||||
if (!isAchungBasedSuffix(suffix)) {
|
||||
if (suffix.length() != 1) {
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
"Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am.");
|
||||
"Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am, 'ang.");
|
||||
}
|
||||
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
|
@ -784,6 +839,10 @@ public class LegalTshegBar
|
|||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
"You cannot have a postsuffix unless you also have a suffix.");
|
||||
if (isAchungBasedSuffix(suffix))
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
"You cannot have a postsuffix if you have a suffix based on 'i, 'o, 'u, 'am, and 'ang.");
|
||||
}
|
||||
|
||||
if (EW_ABSENT != headLetter) {
|
||||
|
@ -812,7 +871,9 @@ public class LegalTshegBar
|
|||
"The head letter sa cannot be used with that root letter.");
|
||||
}
|
||||
} else {
|
||||
// '\u0F6A' is not a valid head letter, even for
|
||||
// Illegal head letter.
|
||||
//
|
||||
// Note: U+0F6A is not a valid head letter, even for
|
||||
// "rnya". Use EWC_ra instead.
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
|
@ -827,14 +888,14 @@ public class LegalTshegBar
|
|||
&& EWV_e != vowel
|
||||
&& EWV_o != vowel)
|
||||
{
|
||||
if (EWC_achen == vowel)
|
||||
if (EWC_achung == vowel)
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
"The vowel given is not valid. Use EW_ABSENT for the EWC_achen sound.");
|
||||
"The vowel given is not valid. Use EW_ABSENT for the EWC_achung sound.");
|
||||
if ('\u0F71' == vowel)
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
"a-chung cannot be used in a simple Tibetan syllable."); // DLC FIXME: what about pA?
|
||||
"a-chung can be used, but there is a flag for it; you don't call it the vowel.");
|
||||
return internalThrowThing(throwIfIllegal,
|
||||
errorBuf,
|
||||
"The vowel given is not valid.");
|
||||
|
@ -848,9 +909,6 @@ public class LegalTshegBar
|
|||
|
||||
|
||||
/*
|
||||
DLC add a method giving the correct connective case thingy or
|
||||
throwing error if the 'i suffix already appears.
|
||||
|
||||
DLC put in a method that gets pronunciation using Unicode
|
||||
diacritical marks. And another using just US Roman. Note that
|
||||
pronunciation is contextual, so have these methods return all
|
||||
|
@ -875,7 +933,7 @@ public class LegalTshegBar
|
|||
boolean disambiguatorNeeded = false;
|
||||
char prefix = getPrefix();
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
|
||||
if (!hasHeadLetter()) {
|
||||
if (!hasHeadLetter() && !hasSubjoinedLetter()) {
|
||||
if (EWC_ya == rootLetter) {
|
||||
if (isConsonantThatTakesYaBtags(prefix))
|
||||
disambiguatorNeeded = true;
|
||||
|
@ -891,7 +949,7 @@ public class LegalTshegBar
|
|||
}
|
||||
}
|
||||
if (disambiguatorNeeded)
|
||||
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY);
|
||||
}
|
||||
if (hasHeadLetter())
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
|
||||
|
@ -914,14 +972,14 @@ public class LegalTshegBar
|
|||
|
||||
// DLC FIXME: are these allowed in legal Tibetan?
|
||||
// EWTS would have special cases for them if so,
|
||||
// I'd wager...
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||
// I'd wager, so I bet they're not.
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
|
||||
} else {
|
||||
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
|
||||
}
|
||||
} else {
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung));
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel));
|
||||
}
|
||||
} else {
|
||||
if (hasExplicitVowel())
|
||||
|
@ -930,19 +988,34 @@ public class LegalTshegBar
|
|||
sb.append("a");
|
||||
}
|
||||
|
||||
String suf = null;
|
||||
if (hasSuffix()) {
|
||||
String suf = getSuffix();
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
|
||||
suf = getSuffix();
|
||||
if (suf.length() > 1) {
|
||||
// DLC assert, don't verify, that the length is two.
|
||||
// This could change if I learn of more suffix
|
||||
// particles.
|
||||
ThdlDebug.verify(2 == suf.length());
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
|
||||
// pa'am, not pa'm or pa'ama!
|
||||
sb.append(getTHDLWylieForOddballSuffix(suf));
|
||||
} else {
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
|
||||
}
|
||||
}
|
||||
if (hasPostsuffix())
|
||||
if (hasPostsuffix()) {
|
||||
// lar.d, la-ra-da, needs a disambiguator. EWC_sa doesn't
|
||||
// take any head letters, but EWC_da does.
|
||||
boolean disambiguatorNeeded = false;
|
||||
if (getPostsuffix() == EWC_da) {
|
||||
if (suf.length() == 1) {
|
||||
char simpleSuffix = suf.charAt(0);
|
||||
if (EWC_ra == simpleSuffix
|
||||
|| EWC_la == simpleSuffix
|
||||
|| EWC_sa == simpleSuffix) {
|
||||
disambiguatorNeeded = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (disambiguatorNeeded)
|
||||
sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY);
|
||||
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
|
||||
}
|
||||
return sb;
|
||||
}
|
||||
|
||||
|
@ -987,7 +1060,7 @@ public class LegalTshegBar
|
|||
? "hasAChungOnRootLetter=\"true\""
|
||||
: "")
|
||||
|
||||
// DLC NOW: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
||||
// DLC NOW FIXME: what about the root letter a, i.e. \u0F68 ? do we want the EWTS to be 'aa' ?
|
||||
+ ("vowel=\""
|
||||
+ (hasExplicitVowel()
|
||||
? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
|
||||
|
@ -1019,7 +1092,8 @@ public class LegalTshegBar
|
|||
sb.append(getPrefix());
|
||||
}
|
||||
if (hasHeadLetter()) {
|
||||
// DLC FIXME this crap won't be true...
|
||||
// DLC NOW FIXME this crap won't be true... it's what we must
|
||||
// convert to, though. Do it.
|
||||
ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
|
||||
ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getRootLetter()));
|
||||
sb.append(getHeadLetter());
|
||||
|
@ -1036,8 +1110,8 @@ public class LegalTshegBar
|
|||
sb.append(EWSUB_wa_zur);
|
||||
}
|
||||
if (hasAChungOnRootLetter()) {
|
||||
ThdlDebug.verify('\u0F71' == EW_achung);
|
||||
sb.append(EW_achung);
|
||||
ThdlDebug.verify('\u0F71' == EW_achung_vowel);
|
||||
sb.append(EW_achung_vowel);
|
||||
}
|
||||
if (hasExplicitVowel()) {
|
||||
sb.append(getVowel());
|
||||
|
|
|
@ -38,8 +38,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
|
|||
junit.textui.TestRunner.run(LegalTshegBarTest.class);
|
||||
}
|
||||
|
||||
/** Tests the getThdlWylie() method to see if we
|
||||
handle "le'u'i'o", "sgom pa'am", "sgom pa'ang", etc.
|
||||
*/
|
||||
public void testGetThdlWylieForLongSuffixLikeThings() {
|
||||
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la,
|
||||
EW_ABSENT, false, false,
|
||||
new String(new char[] {
|
||||
EWC_achung, EWV_u,
|
||||
EWC_achung, EWV_i,
|
||||
EWC_achung, EWV_o
|
||||
}),
|
||||
EW_ABSENT, EWV_e).getThdlWylie().toString().equals("le'u'i'o"));
|
||||
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la,
|
||||
EW_ABSENT, false, false,
|
||||
new String(new char[] {
|
||||
EWC_achung, EWV_u,
|
||||
EWC_achung, EWV_i,
|
||||
EWC_achung, EWV_o,
|
||||
EWC_achung, EWC_ma,
|
||||
EWC_achung, EWC_nga,
|
||||
EWC_achung, EWV_o,
|
||||
EWC_achung, EWC_ma
|
||||
}),
|
||||
EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("la'u'i'o'am'ang'o'am"));
|
||||
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
|
||||
EW_ABSENT, false, false,
|
||||
new String(new char[] { EWC_achung, EWC_ma }),
|
||||
EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pa'am"));
|
||||
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
|
||||
EW_ABSENT, false, false,
|
||||
new String(new char[] { EWC_achung, EWC_nga }),
|
||||
EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pa'ang"));
|
||||
}
|
||||
|
||||
/** Tests the getThdlWylie() method and one of the constructors. */
|
||||
public void testGetThdlWylie() {
|
||||
// do we disambiguate when needed?
|
||||
{
|
||||
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_ga, EWC_ya,
|
||||
false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("gyo"));
|
||||
assertTrue(new LegalTshegBar(EWC_ga, EW_ABSENT, EWC_ya, EW_ABSENT,
|
||||
false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("g.yo"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT,
|
||||
false, false, EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("b.lag"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT,
|
||||
false, false, EWC_ga, EWC_sa, EW_ABSENT).getThdlWylie().toString().equals("b.lags"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EW_ABSENT,
|
||||
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("b.ragd"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EWC_la,
|
||||
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brlagd"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EWC_ra, EWC_ga, EW_ABSENT,
|
||||
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brgagd"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_ha, EW_ABSENT,
|
||||
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("blhagd"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_da, EW_ABSENT,
|
||||
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("bldagd"));
|
||||
}
|
||||
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWC_ra,
|
||||
false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols"));
|
||||
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga,
|
||||
|
@ -81,6 +137,10 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
|
|||
EWC_la, false, false,
|
||||
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("sla"));
|
||||
|
||||
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
|
||||
EW_ABSENT, false, true,
|
||||
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pA"));
|
||||
|
||||
{
|
||||
boolean threw = false;
|
||||
try {
|
||||
|
@ -159,4 +219,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
|
|||
}
|
||||
assertTrue(x);
|
||||
}
|
||||
|
||||
/** Tests {@link
|
||||
* org.thdl.tib.text.tshegbar.LegalTshegBar#getTheTenSuffixes()}. */
|
||||
public void testGetTheTenSuffixes() {
|
||||
String x = LegalTshegBar.getTheTenSuffixes();
|
||||
assertTrue(x.length() == 10);
|
||||
assertTrue(x.charAt(0) == EWC_ga);
|
||||
assertTrue(x.charAt(4) == EWC_ba);
|
||||
assertTrue(x.charAt(9) == EWC_sa);
|
||||
}
|
||||
|
||||
/** Tests {@link
|
||||
* org.thdl.tib.text.tshegbar.LegalTshegBar#isAchungBasedSuffix(String)}. */
|
||||
public void testIsAchungBasedSuffix() {
|
||||
assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWC_nga
|
||||
})));
|
||||
assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWC_ma
|
||||
})));
|
||||
assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWV_i
|
||||
})));
|
||||
assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWV_o
|
||||
})));
|
||||
assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWV_u
|
||||
})));
|
||||
assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWV_u,
|
||||
EWC_achung, EWV_i,
|
||||
EWC_achung, EWV_o
|
||||
})));
|
||||
assertTrue(!LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWV_u,
|
||||
EWC_achung, EWV_i,
|
||||
EWC_achung, EWV_o, /* no EWC_achung, */ EWC_nga
|
||||
})));
|
||||
|
||||
// syntactically illegal, I'd bet, but our algorithm allows it:
|
||||
assertTrue(LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWC_ma,
|
||||
EWC_achung, EWV_i,
|
||||
EWC_achung, EWV_i,
|
||||
EWC_achung, EWV_i,
|
||||
EWC_achung, EWV_o,
|
||||
EWC_achung, EWC_nga,
|
||||
EWC_achung, EWV_o
|
||||
})));
|
||||
|
||||
assertTrue(!LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWC_la
|
||||
})));
|
||||
assertTrue(!LegalTshegBar.isAchungBasedSuffix(new String(new char[] {
|
||||
EWC_achung, EWV_e
|
||||
})));
|
||||
|
||||
assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -83,20 +83,21 @@ public interface UnicodeConstants {
|
|||
static final char EWC_za = '\u0F5F';
|
||||
/** Note the irregular name. The Extended Wylie representation is
|
||||
<code>'a</code>. */
|
||||
static final char EWC_achen = '\u0F60'; /* DLC NOW is this achen or achung? achen is EWC_a, right? comment it. replace EWC_achen everywhere if you change it. */
|
||||
static final char EWC_achung = '\u0F60';
|
||||
static final char EWC_ya = '\u0F61';
|
||||
static final char EWC_ra = '\u0F62';
|
||||
static final char EWC_la = '\u0F63';
|
||||
static final char EWC_sha = '\u0F64';
|
||||
static final char EWC_sa = '\u0F66';
|
||||
static final char EWC_ha = '\u0F67';
|
||||
/** achen, the 30th consonant (and, some say, the fifth vowel) DLC NOW FIXME: rename to EWC_achen */
|
||||
static final char EWC_a = '\u0F68';
|
||||
|
||||
|
||||
/** In the word for father, "pA lags", there is an a-chung (i.e.,
|
||||
<code>\u0F71</code>). This is the constant for that little
|
||||
guy. */
|
||||
static final char EW_achung = '\u0F71';
|
||||
static final char EW_achung_vowel = '\u0F71';
|
||||
|
||||
|
||||
/* Four of the five vowels, some say, or, others say, "the four
|
||||
|
|
|
@ -127,11 +127,12 @@ public class UnicodeGraphemeCluster
|
|||
/** Returns the THDL Extended Wylie transliteration of this
|
||||
grapheme cluster, or null if there is none (which happens for
|
||||
a few Tibetan codepoints, if you'll recall). If needsVowel is
|
||||
true, then an "a" will be appended when there is no EW_achung
|
||||
or explicit simple vowel. If there is an explicit vowel or
|
||||
EW_achung, it will always be present. Note that needsVowel is
|
||||
provided because btags is the preferred THDL Extended Wylie
|
||||
for the four contiguous grapheme clusters
|
||||
true, then an "a" will be appended when there is no
|
||||
EW_achung_vowel or explicit simple vowel. If there is an
|
||||
explicit vowel or EW_achung_vowel, it will always be present.
|
||||
Note that needsVowel is provided because btags is the
|
||||
preferred THDL Extended Wylie for the four contiguous grapheme
|
||||
clusters
|
||||
<code>"\u0F56\u0F4F\u0F42\u0F66"</code>, and
|
||||
needsVowel must be set to false for all but the grapheme
|
||||
cluster corresponding to <code>\u0F4F</code> if you wish
|
||||
|
@ -257,7 +258,7 @@ public class UnicodeGraphemeCluster
|
|||
/** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
|
||||
This relative height is 0 for a base consonant, digit,
|
||||
punctuation, mark, or sign. It is -1 for a subjoined
|
||||
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for
|
||||
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung_vowel, +1 for
|
||||
EWV_gigu, and so on according to the height these codepoints
|
||||
appear relative to one another when on the same stack. If two
|
||||
codepoints have equal height, they should not exist in the
|
||||
|
|
Loading…
Reference in a new issue