Fixed a bunch of bugs; supports le'u'i'o, sgom pa'am, etc.

Better tests.  As part of that, I had to break TibetanMachineWeb into
TibetanMachineWeb+THDLWylieConstants, because I don't want the
class-wide initialization code from TibetanMachineWeb causing errors
in LegalTshegBarTest.
This commit is contained in:
dchandler 2003-03-31 00:33:50 +00:00
parent 1987f7d80a
commit 33b3080068
7 changed files with 468 additions and 230 deletions

View file

@ -0,0 +1,117 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
/**
 * Houses the basic, static knowledge of THDL's Extended Wylie: the
 * characters and strings that spell punctuation, a few consonants, and
 * the vowels.
 *
 * <p>These constants live in their own interface, rather than in
 * <code>TibetanMachineWeb</code>, so that code needing only the
 * constants does not trigger that class's class-wide initialization.
 * A class may implement this interface in order to refer to the
 * constants without qualification.</p>
 *
 * @see org.thdl.tib.text.TibetanMachineWeb
 */
public interface THDLWylieConstants {

    // ---- punctuation and control keys --------------------------------

    /**
     * the Wylie for bindu/anusvara
     */
    public static final char BINDU = 'M';

    /**
     * the Wylie for tsheg
     */
    public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts

    /**
     * the Wylie for whitespace
     */
    public static final char SPACE = '_'; //this character occurs in all ten TMW fonts

    /**
     * the Sanskrit stacking separator used in Extended Wylie
     */
    public static final char WYLIE_SANSKRIT_STACKING_KEY = '+';

    /**
     * the Wylie disambiguating key, as a char
     */
    public static final char WYLIE_DISAMBIGUATING_KEY = '.';

    /**
     * the Wylie for the invisible 'a' vowel
     */
    public static final String WYLIE_aVOWEL = "a";

    // ---- consonants ---------------------------------------------------

    /**
     * the Wylie for achung, as a char
     */
    public static final char ACHUNG_character = '\'';

    /**
     * the Wylie for achung, as a one-character String derived from
     * {@link #ACHUNG_character} so that the two can never disagree
     */
    public static final String ACHUNG = String.valueOf(ACHUNG_character);

    /**
     * the Wylie for the 28th of the 30 consonants, sa:
     */
    public static final String SA = "s";

    /**
     * the Wylie for the 16th of the 30 consonants, ma:
     */
    public static final String MA = "m";

    /**
     * the Wylie for the 4th of the 30 consonants, nga:
     */
    public static final String NGA = "ng";

    /**
     * the Wylie for achen
     */
    public static final String ACHEN = "a";

    // ---- vowels -------------------------------------------------------

    /**
     * the Wylie for gigu
     */
    public static final String i_VOWEL = "i";

    /**
     * the Wylie for zhebju
     */
    public static final String u_VOWEL = "u";

    /**
     * the Wylie for drengbu
     */
    public static final String e_VOWEL = "e";

    /**
     * the Wylie for naro
     */
    public static final String o_VOWEL = "o";

    /**
     * the Wylie for double drengbu
     */
    public static final String ai_VOWEL = "ai";

    /**
     * the Wylie for double naro
     */
    public static final String au_VOWEL = "au";

    /**
     * the Wylie for the subscript achung vowel
     */
    public static final String A_VOWEL = "A";

    /**
     * the Wylie for log yig gigu
     */
    public static final String reverse_i_VOWEL = "-i";

    /**
     * the Wylie for the vowel achung + gigu
     */
    public static final String I_VOWEL = "I";

    /**
     * the Wylie for the vowel achung + zhebju
     */
    public static final String U_VOWEL = "U";

    /**
     * the Wylie for the vowel achung + log yig gigu
     */
    public static final String reverse_I_VOWEL = "-I";
}

View file

@ -28,7 +28,8 @@ import org.thdl.util.ThdlDebug;
/** /**
* Provides methods for converting back and forth between Extended * Provides methods for converting back and forth between Extended
* Wylie and TibetanMachineWeb. This class is not instantiable. * Wylie and Tibetan represented in TibetanMachineWeb glyphs. This
* class is not instantiable.
* *
* <p> * <p>
* The class provides a variety of static methods for converting * The class provides a variety of static methods for converting
@ -37,7 +38,7 @@ import org.thdl.util.ThdlDebug;
* be exported as Rich Text Format. * be exported as Rich Text Format.
* *
* @author Edward Garrett, Tibetan and Himalayan Digital Library */ * @author Edward Garrett, Tibetan and Himalayan Digital Library */
public class TibTextUtils { public class TibTextUtils implements THDLWylieConstants {
/** Do not use this contructor. */ /** Do not use this contructor. */
private TibTextUtils() { super(); } private TibTextUtils() { super(); }
@ -255,11 +256,11 @@ public class TibTextUtils {
if (k < 32) //return null if character is just formatting if (k < 32) //return null if character is just formatting
return String.valueOf(c); return String.valueOf(c);
if (c == TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY) if (c == WYLIE_DISAMBIGUATING_KEY)
return String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); return String.valueOf(WYLIE_DISAMBIGUATING_KEY);
if (c == TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY) if (c == WYLIE_SANSKRIT_STACKING_KEY)
return String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY); return String.valueOf(WYLIE_SANSKRIT_STACKING_KEY);
for (i=offset+1; i<wylie.length()+1; i++) { for (i=offset+1; i<wylie.length()+1; i++) {
s = wylie.substring(offset, i); s = wylie.substring(offset, i);
@ -332,7 +333,7 @@ public class TibTextUtils {
chars.clear(); chars.clear();
if (next.equals(String.valueOf(TibetanMachineWeb.BINDU))) { if (next.equals(String.valueOf(BINDU))) {
if (glyphs.isEmpty()) if (glyphs.isEmpty())
dc = null; dc = null;
else else
@ -369,7 +370,7 @@ public class TibTextUtils {
break vowel_block; break vowel_block;
} }
} }
DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.ACHEN); DuffCode[] dc_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(ACHEN);
dc = dc_array[TibetanMachineWeb.TMW]; dc = dc_array[TibetanMachineWeb.TMW];
glyphs.addAll(getVowel(dc, next)); glyphs.addAll(getVowel(dc, next));
} }
@ -398,7 +399,7 @@ public class TibTextUtils {
} }
} }
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY))) { else if (next.equals(String.valueOf(WYLIE_DISAMBIGUATING_KEY))) {
if (!chars.isEmpty()) if (!chars.isEmpty())
glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit)); glyphs.addAll(getGlyphs(chars, true, !isSanskrit, isSanskrit));
@ -406,7 +407,7 @@ public class TibTextUtils {
isSanskrit = false; isSanskrit = false;
} }
else if (next.equals(String.valueOf(TibetanMachineWeb.WYLIE_SANSKRIT_STACKING_KEY))) { else if (next.equals(String.valueOf(WYLIE_SANSKRIT_STACKING_KEY))) {
if (!isSanskrit) { //begin sanskrit stack if (!isSanskrit) { //begin sanskrit stack
switch (chars.size()) { switch (chars.size()) {
case 0: case 0:
@ -475,13 +476,13 @@ public class TibTextUtils {
List bindus = new ArrayList(); List bindus = new ArrayList();
if (null == dc) { if (null == dc) {
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(TibetanMachineWeb.BINDU))); bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
return bindus; return bindus;
} }
if (!TibetanMachineWeb.getBinduMap().containsKey(dc)) { if (!TibetanMachineWeb.getBinduMap().containsKey(dc)) {
bindus.add(dc); bindus.add(dc);
bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(TibetanMachineWeb.BINDU))); bindus.add(TibetanMachineWeb.getGlyph(String.valueOf(BINDU)));
return bindus; return bindus;
} }
@ -524,7 +525,7 @@ public class TibTextUtils {
//this vowel doesn't correspond to a glyph - //this vowel doesn't correspond to a glyph -
//so you just return the original context //so you just return the original context
if ( vowel.equals(TibetanMachineWeb.WYLIE_aVOWEL) || if ( vowel.equals(WYLIE_aVOWEL) ||
TibetanMachineWeb.isTopVowel(context_2)) { TibetanMachineWeb.isTopVowel(context_2)) {
if (context_1 != null) if (context_1 != null)
vowels.add(context_1); vowels.add(context_1);
@ -537,34 +538,34 @@ public class TibTextUtils {
//these vowels have one invariant form - therefore, //these vowels have one invariant form - therefore,
//dc_context is just returned along with that form //dc_context is just returned along with that form
if (vowel.equals(TibetanMachineWeb.ai_VOWEL)) { if (vowel.equals(ai_VOWEL)) {
if (context_1 != null) if (context_1 != null)
vowels.add(context_1); vowels.add(context_1);
vowels.add(context_2); vowels.add(context_2);
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.ai_VOWEL); DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(ai_VOWEL);
vowels.add(dc_v[TibetanMachineWeb.TMW]); vowels.add(dc_v[TibetanMachineWeb.TMW]);
return vowels; return vowels;
} }
if (vowel.equals(TibetanMachineWeb.au_VOWEL)) { if (vowel.equals(au_VOWEL)) {
if (context_1 != null) if (context_1 != null)
vowels.add(context_1); vowels.add(context_1);
vowels.add(context_2); vowels.add(context_2);
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.au_VOWEL); DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(au_VOWEL);
vowels.add(dc_v[TibetanMachineWeb.TMW]); vowels.add(dc_v[TibetanMachineWeb.TMW]);
return vowels; return vowels;
} }
if (vowel.equals(TibetanMachineWeb.reverse_i_VOWEL)) { if (vowel.equals(reverse_i_VOWEL)) {
if (context_1 != null) if (context_1 != null)
vowels.add(context_1); vowels.add(context_1);
vowels.add(context_2); vowels.add(context_2);
if (!TibetanMachineWeb.isTopVowel(context_2)) { if (!TibetanMachineWeb.isTopVowel(context_2)) {
DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.reverse_i_VOWEL); DuffCode[] dc_v = (DuffCode[])TibetanMachineWeb.getTibHash().get(reverse_i_VOWEL);
vowels.add(dc_v[TibetanMachineWeb.TMW]); vowels.add(dc_v[TibetanMachineWeb.TMW]);
} }
@ -578,7 +579,7 @@ public class TibTextUtils {
//returned along with the vowel appropriate to //returned along with the vowel appropriate to
//that context //that context
if (vowel.equals(TibetanMachineWeb.i_VOWEL)) { if (vowel.equals(i_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_i); DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_i);
if (null == dc_v && null != context_1) { if (null == dc_v && null != context_1) {
@ -597,7 +598,7 @@ public class TibTextUtils {
return vowels; return vowels;
} }
if (vowel.equals(TibetanMachineWeb.e_VOWEL)) { if (vowel.equals(e_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_e); DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_e);
if (null == dc_v && null != context_1) { if (null == dc_v && null != context_1) {
@ -616,7 +617,7 @@ public class TibTextUtils {
return vowels; return vowels;
} }
if (vowel.equals(TibetanMachineWeb.o_VOWEL)) { if (vowel.equals(o_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_o); DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_o);
if (null == dc_v && null != context_1) { if (null == dc_v && null != context_1) {
@ -641,7 +642,7 @@ public class TibTextUtils {
//both u and A cannot be affixed to ordinary k or g, but //both u and A cannot be affixed to ordinary k or g, but
//rather the shortened versions of k and g - therefore, //rather the shortened versions of k and g - therefore,
if (vowel.equals(TibetanMachineWeb.u_VOWEL)) { if (vowel.equals(u_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context); DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_u); DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_u);
@ -660,7 +661,7 @@ public class TibTextUtils {
return vowels; return vowels;
} }
if (vowel.equals(TibetanMachineWeb.A_VOWEL)) { if (vowel.equals(A_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context); DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A); DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
@ -680,7 +681,7 @@ public class TibTextUtils {
return vowels; return vowels;
} }
if (vowel.equals(TibetanMachineWeb.U_VOWEL)) { if (vowel.equals(U_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context); DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_U); DuffCode dc_v = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_U);
@ -704,7 +705,7 @@ public class TibTextUtils {
//require a change from the previous character, //require a change from the previous character,
//and consist of two glyphs themselves //and consist of two glyphs themselves
if (vowel.equals(TibetanMachineWeb.I_VOWEL)) { if (vowel.equals(I_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context); DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A); DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
@ -726,11 +727,11 @@ public class TibTextUtils {
return vowels; return vowels;
} }
if (vowel.equals(TibetanMachineWeb.reverse_I_VOWEL)) { if (vowel.equals(reverse_I_VOWEL)) {
String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2); String hashKey_context = TibetanMachineWeb.getHashKeyForGlyph(context_2);
DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context); DuffCode halfHeight = TibetanMachineWeb.getHalfHeightGlyph(hashKey_context);
DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A); DuffCode dc_v_sub = TibetanMachineWeb.getVowel(hashKey_context, TibetanMachineWeb.VOWEL_A);
DuffCode[] tv_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(TibetanMachineWeb.reverse_i_VOWEL); DuffCode[] tv_array = (DuffCode[])TibetanMachineWeb.getTibHash().get(reverse_i_VOWEL);
DuffCode dc_v_sup = tv_array[TibetanMachineWeb.TMW]; DuffCode dc_v_sup = tv_array[TibetanMachineWeb.TMW];
if (null != context_1) if (null != context_1)
@ -766,10 +767,10 @@ public class TibTextUtils {
/** Returns "a", unless wylie is already "a". */ /** Returns "a", unless wylie is already "a". */
private static String aVowelToUseAfter(String wylie) { private static String aVowelToUseAfter(String wylie) {
if (wylie.equals(TibetanMachineWeb.ACHEN)) if (wylie.equals(ACHEN))
return ""; return "";
else else
return TibetanMachineWeb.WYLIE_aVOWEL; return WYLIE_aVOWEL;
} }
private static String unambiguousPostAVowelWylie(String wylie1, private static String unambiguousPostAVowelWylie(String wylie1,
@ -781,7 +782,7 @@ public class TibTextUtils {
if (TibetanMachineWeb.isWylieTop(wylie1) if (TibetanMachineWeb.isWylieTop(wylie1)
&& wylie2.equals(/* FIXME: hard-coded */ "d")) && wylie2.equals(/* FIXME: hard-coded */ "d"))
disambiguator disambiguator
= new String(new char[] { TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY }); = new String(new char[] { WYLIE_DISAMBIGUATING_KEY });
return wylie1 + disambiguator + wylie2; return wylie1 + disambiguator + wylie2;
} }
@ -831,13 +832,13 @@ public class TibTextUtils {
} }
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)) if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(WYLIE_DISAMBIGUATING_KEY);
if (!wylie.equals(TibetanMachineWeb.ACHEN)) { if (!wylie.equals(ACHEN)) {
sb.append(wylie); sb.append(wylie);
sb.append(TibetanMachineWeb.WYLIE_aVOWEL); sb.append(WYLIE_aVOWEL);
} else { } else {
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie); sb.append(wylie);
} }
} }
@ -861,12 +862,12 @@ public class TibTextUtils {
StringBuffer tailEndWylie = null; StringBuffer tailEndWylie = null;
int effectiveSize = size - 2; int effectiveSize = size - 2;
while (effectiveSize >= 0 while (effectiveSize >= 0
&& TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(TibetanMachineWeb.ACHUNG)) { && TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize)).equals(ACHUNG)) {
if (null == tailEndWylie) tailEndWylie = new StringBuffer(); if (null == tailEndWylie) tailEndWylie = new StringBuffer();
// prepend: // prepend:
tailEndWylie.insert(0, tailEndWylie.insert(0,
TibetanMachineWeb.ACHUNG ACHUNG
+ aVowelToUseAfter(TibetanMachineWeb.ACHUNG) + aVowelToUseAfter(ACHUNG)
+ TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1))); + TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(effectiveSize + 1)));
effectiveSize -= 2; effectiveSize -= 2;
} }
@ -893,8 +894,8 @@ public class TibTextUtils {
for (int i = 0; i < size; i++) { for (int i = 0; i < size; i++) {
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i)); wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie) if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN))) || (i != 0 && wylie.equals(ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie + aVowelToUseAfter(wylie)); sb.append(wylie + aVowelToUseAfter(wylie));
lastWylie = wylie; lastWylie = wylie;
@ -907,8 +908,8 @@ public class TibTextUtils {
while (i+2 < size) { while (i+2 < size) {
wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i)); wylie = TibetanMachineWeb.getWylieForGlyph((DuffCode)glyphList.get(i));
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie) if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, wylie)
|| (i != 0 && wylie.equals(TibetanMachineWeb.ACHEN))) || (i != 0 && wylie.equals(ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie); sb.append(wylie);
lastWylie = wylie; lastWylie = wylie;
@ -933,7 +934,11 @@ public class TibTextUtils {
if (TibetanMachineWeb.isWylieLeft(wylie0)) { if (TibetanMachineWeb.isWylieLeft(wylie0)) {
/* is it ambiguous? */ /* is it ambiguous? */
if (TibetanMachineWeb.isWylieRight(wylie1) if (TibetanMachineWeb.isWylieRight(wylie1)
&& TibetanMachineWeb.SA.equals(wylie2)) { && SA.equals(wylie2) /* isWylieFarRight would
* work, but the list of
* 9 words doesn't have
* any ending with d --
* all end with s. */) {
/* Yes, this is ambiguous. How do we handle it? See this from Andres: /* Yes, this is ambiguous. How do we handle it? See this from Andres:
I'm posting this upon David Chandler's request. According to Lobsang I'm posting this upon David Chandler's request. According to Lobsang
@ -1001,14 +1006,14 @@ public class TibTextUtils {
// } // }
// } // }
// if (disambiguatorNeeded) // if (disambiguatorNeeded)
// sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); // sb.append(WYLIE_DISAMBIGUATING_KEY);
} else { } else {
/* no ambiguity. the "a" vowel comes after /* no ambiguity. the "a" vowel comes after
* wylie1. */ * wylie1. */
if (TibetanMachineWeb.isAmbiguousWylie(wylie0, wylie1)) if (TibetanMachineWeb.isAmbiguousWylie(wylie0, wylie1))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(wylie1 sb.append(wylie1
+ aVowelToUseAfter(wylie1) + aVowelToUseAfter(wylie1)
+ wylie2); + wylie2);
@ -1069,8 +1074,8 @@ public class TibTextUtils {
if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, currWylie) if (TibetanMachineWeb.isAmbiguousWylie(lastWylie, currWylie)
|| (!lastWylie.equals("") || (!lastWylie.equals("")
&& currWylie.equals(TibetanMachineWeb.ACHEN))) && currWylie.equals(ACHEN)))
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(WYLIE_DISAMBIGUATING_KEY);
sb.append(currWylie); sb.append(currWylie);
@ -1125,7 +1130,7 @@ public class TibTextUtils {
wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]); wylie = TibetanMachineWeb.getWylieForGlyph(dcs[i]);
boolean containsBindu = false; boolean containsBindu = false;
if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == TibetanMachineWeb.BINDU) { if (wylie.length() > 1 && wylie.charAt(wylie.length()-1) == BINDU) {
char[] cArray = wylie.toCharArray(); char[] cArray = wylie.toCharArray();
wylie = new String(cArray, 0, wylie.length()-1); wylie = new String(cArray, 0, wylie.length()-1);
containsBindu = true; containsBindu = true;
@ -1157,18 +1162,18 @@ public class TibTextUtils {
} else if (TibetanMachineWeb.isWylieVowel(wylie)) { } else if (TibetanMachineWeb.isWylieVowel(wylie)) {
if (isLastVowel) { if (isLastVowel) {
int len = wylieBuffer.length(); int len = wylieBuffer.length();
int A_len = TibetanMachineWeb.A_VOWEL.length(); int A_len = A_VOWEL.length();
if (wylieBuffer.substring(len-A_len).equals(TibetanMachineWeb.A_VOWEL)) { if (wylieBuffer.substring(len-A_len).equals(A_VOWEL)) {
try { try {
if (wylie.equals(TibetanMachineWeb.i_VOWEL)) { if (wylie.equals(i_VOWEL)) {
wylieBuffer.delete(len-A_len, len); wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.I_VOWEL); wylieBuffer.append(I_VOWEL);
isLastVowel = false; isLastVowel = false;
break process_block; break process_block;
} else if (wylie.equals(TibetanMachineWeb.reverse_i_VOWEL)) { } else if (wylie.equals(reverse_i_VOWEL)) {
wylieBuffer.delete(len-A_len, len); wylieBuffer.delete(len-A_len, len);
wylieBuffer.append(TibetanMachineWeb.reverse_I_VOWEL); wylieBuffer.append(reverse_I_VOWEL);
isLastVowel = false; isLastVowel = false;
break process_block; break process_block;
} }
@ -1189,7 +1194,7 @@ public class TibTextUtils {
DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1); DuffCode top_dc = (DuffCode)glyphList.get(glyphCount-1);
String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc); String top_wylie = TibetanMachineWeb.getWylieForGlyph(top_dc);
if (top_wylie.equals(TibetanMachineWeb.ACHEN)) { if (top_wylie.equals(ACHEN)) {
glyphList.remove(glyphCount-1); glyphList.remove(glyphCount-1);
if (glyphCount-1 == 0) { if (glyphCount-1 == 0) {
@ -1200,7 +1205,7 @@ public class TibTextUtils {
} }
} }
if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(TibetanMachineWeb.ACHUNG)) { if (top_dc == null || !TibetanMachineWeb.getWylieForGlyph(top_dc).equals(ACHUNG)) {
String thisPart = withoutA(glyphList); String thisPart = withoutA(glyphList);
wylieBuffer.append(thisPart); //append consonants in glyphList wylieBuffer.append(thisPart); //append consonants in glyphList
} else { } else {
@ -1212,12 +1217,12 @@ public class TibTextUtils {
wylieBuffer.append(thisPart); wylieBuffer.append(thisPart);
} }
wylieBuffer.append(TibetanMachineWeb.ACHUNG); wylieBuffer.append(ACHUNG);
} }
} }
if (insertDisAmbig) if (insertDisAmbig)
wylieBuffer.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); wylieBuffer.append(WYLIE_DISAMBIGUATING_KEY);
wylieBuffer.append(wylie); //append vowel wylieBuffer.append(wylie); //append vowel
@ -1234,7 +1239,7 @@ public class TibTextUtils {
if (containsBindu) { if (containsBindu) {
isLastVowel = false; isLastVowel = false;
wylieBuffer.append(withoutA(glyphList)); wylieBuffer.append(withoutA(glyphList));
wylieBuffer.append(TibetanMachineWeb.BINDU); //append the bindu wylieBuffer.append(BINDU); //append the bindu
glyphList.clear(); glyphList.clear();
} }
} }

View file

@ -10,7 +10,7 @@ License for the specific terms governing rights and limitations under the
License. License.
The Initial Developer of this software is the Tibetan and Himalayan Digital The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2001 THDL. Library (THDL). Portions created by the THDL are Copyright 2001-2003 THDL.
All Rights Reserved. All Rights Reserved.
Contributor(s): ______________________________________. Contributor(s): ______________________________________.
@ -44,7 +44,7 @@ import org.thdl.util.ThdlOptions;
* @version 1.0 * @version 1.0
*/ */
// FIXME: for speed, make either this class, its methods, or both, final? // FIXME: for speed, make either this class, its methods, or both, final?
public class TibetanMachineWeb { public class TibetanMachineWeb implements THDLWylieConstants {
/** This addresses bug 624133, "Input freezes after impossible /** This addresses bug 624133, "Input freezes after impossible
* character". The input sequences that are valid in Extended * character". The input sequences that are valid in Extended
* Wylie. For example, "Sh" will be in this container, but "S" * Wylie. For example, "Sh" will be in this container, but "S"
@ -109,86 +109,6 @@ public class TibetanMachineWeb {
"TibetanMachineWeb9".intern() "TibetanMachineWeb9".intern()
}; };
/** /**
* the Wylie for bindu/anusvara
*/
public static final char BINDU = 'M';
/**
* the Wylie for tsheg
*/
public static final char TSHEG = ' '; //this character occurs in all ten TMW fonts
/**
* the Wylie for whitespace
*/
public static final char SPACE = '_'; //this character occurs in all ten TMW fonts
/**
* the Sanskrit stacking separator used in Extended Wylie
*/
public static final char WYLIE_SANSKRIT_STACKING_KEY = '+';
/**
* the Wylie disambiguating key, as a char
*/
public static final char WYLIE_DISAMBIGUATING_KEY = '.';
/**
* the Wylie for the invisible 'a' vowel
*/
public static final String WYLIE_aVOWEL = "a";
/**
* the Wylie for achung
*/
public static final String ACHUNG = "'";
/**
* the Wylie for the 28th of the 30 consonants, sa:
*/
public static final String SA = "s";
/**
* the Wylie for achen
*/
public static final String ACHEN = "a";
/**
* the Wylie for gigu
*/
public static final String i_VOWEL = "i";
/**
* the Wylie for zhebju
*/
public static final String u_VOWEL = "u";
/**
* the Wylie for drengbu
*/
public static final String e_VOWEL = "e";
/**
* the Wylie for naro
*/
public static final String o_VOWEL = "o";
/**
* the Wylie for double drengbu
*/
public static final String ai_VOWEL = "ai";
/**
* the Wylie for double naro
*/
public static final String au_VOWEL = "au";
/**
* the Wylie for the subscript achung vowel
*/
public static final String A_VOWEL = "A";
/**
* the Wylie for log yig gigu
*/
public static final String reverse_i_VOWEL = "-i";
/**
* the Wylie for the vowel achung + gigu
*/
public static final String I_VOWEL = "I";
/**
* the Wylie for the vowel achung + zhebju
*/
public static final String U_VOWEL = "U";
/**
* the Wylie for the vowel achung + log yig gigu
*/
public static final String reverse_I_VOWEL = "-I";
/**
* represents where in an array of DuffCodes you * represents where in an array of DuffCodes you
* find the TibetanMachine equivalence of a glyph * find the TibetanMachine equivalence of a glyph
*/ */

View file

@ -18,7 +18,7 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.tshegbar; package org.thdl.tib.text.tshegbar;
import org.thdl.tib.text.TibetanMachineWeb; import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlDebug;
/** <p>A LegalTshegBar is a simple Tibetan syllable or a syllable with /** <p>A LegalTshegBar is a simple Tibetan syllable or a syllable with
@ -29,7 +29,7 @@ import org.thdl.util.ThdlDebug;
* <ul> * <ul>
* *
* <li>It contains at most one prefix, which must be one of {EWC_ga, * <li>It contains at most one prefix, which must be one of {EWC_ga,
* EWC_da, EWC_ba, EWC_ma, EWC_achen} and must be prefixable to the * EWC_da, EWC_ba, EWC_ma, EWC_achung} and must be prefixable to the
* root letter.</li> * root letter.</li>
* *
* <li>It contains no vocalic modifications</li> * <li>It contains no vocalic modifications</li>
@ -39,12 +39,11 @@ import org.thdl.util.ThdlDebug;
* *
* <li>It contains at most one vowel from the set {EWV_a, EWV_i, * <li>It contains at most one vowel from the set {EWV_a, EWV_i,
* EWV_e, EWV_u}, and that vowel is on the root stack. The one * EWV_e, EWV_u}, and that vowel is on the root stack. The one
* exception is that a 'i suffix is permitted (this is a connective * exception is that 'i (i.e., the connective case marker), 'u, and
* case marker).</li> * 'o suffixes are permitted.</li>
* *
* <li>It has at most one suffix, which is a single consonant or the * <li>It has at most one suffix, which is a single consonant or a
* special connective case marker 'i (i.e., * string consisting of 'i, 'u, 'o, 'am, and 'ang.</li>
* <code>"&#92;u0F60&#92;u0F72"</code>).</li>
* *
* *
DLC FIXME: we must allow many suffixes. See Andres' e-mail below: DLC FIXME: we must allow many suffixes. See Andres' e-mail below:
@ -69,10 +68,8 @@ And also there are cases where they combine. For ex you can have
* *
* *
* <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists * <li>It may contain a EWC_sa or EWC_da postsuffix iff there exists
* a suffix (and a suffix that is not the special connective case * a suffix (and a suffix that is not based on 'i, 'o, 'u, 'am, and
* marker 'i (i.e., <code>"&#92;u0F60&#92;u0F72"</code>) (DLC FIXME: 'o and * 'ang).</li>
* 'am maybe? I asked in the "Embarrasing error in wylie conversion"
* bug report.).</li>
* *
* <li>The root stack follows the rules of Tibetan syntax, meaning * <li>The root stack follows the rules of Tibetan syntax, meaning
* that the following holds: * that the following holds:
@ -112,7 +109,7 @@ And also there are cases where they combine. For ex you can have
* e.g. p. 548.</p> * e.g. p. 548.</p>
* *
* @author David Chandler */ * @author David Chandler */
public class LegalTshegBar public final class LegalTshegBar
extends TshegBar extends TshegBar
implements UnicodeConstants implements UnicodeConstants
{ {
@ -129,8 +126,8 @@ public class LegalTshegBar
private boolean hasWaZur; private boolean hasWaZur;
/** true iff EW_wa_zur is under the root syllable. */ /** true iff EW_wa_zur is under the root syllable. */
private boolean hasAChung; private boolean hasAChung;
/** If this is a string, it is of a single codepoint or is equal /** If this is a string, it is of a single codepoint or is a
* to {@link #getConnectiveCaseSuffix()} */ * string formed from 'i, 'o, 'u, 'am, and 'ang. */
private String suffix; private String suffix;
/** EW_da, EW_sa, or EW_ABSENT */ /** EW_da, EW_sa, or EW_ABSENT */
private char postsuffix; private char postsuffix;
@ -236,24 +233,24 @@ public class LegalTshegBar
} }
/** Returns null if there is no suffix, or a string containing the /** Returns null if there is no suffix, or a string containing the
* one consonant or a string <code>"&#92;u0F60&#92;u0F72"</code> * one consonant or a string like <code>"&#92;u0F60&#92;u0F72"</code>
* containing two codepoints in the special case that the suffix * in the case that the suffix
* is that connective case marker {@link * is 'i, 'u'i'o, 'am, 'ang, etc. */
* #getConnectiveCaseSuffix()}. */
public String getSuffix() { public String getSuffix() {
return suffix; return suffix;
} }
/** Returns true iff there is a suffixed consonant or a suffixed /** Returns true iff there is a suffixed consonant or a suffixed
* <code>'i</code> (DLC FIXME). */ * string consisting of 'i, 'u, 'o, 'am, and 'ang. */
public boolean hasSuffix() { public boolean hasSuffix() {
return (null != suffix); return (null != suffix);
} }
/** Returns true iff there is a single, suffixed consonant. This /** Returns true iff there is a single, suffixed consonant. This
means that suffixes like <code>'am</code>, <code>'i</code>, means that suffixes made from <code>'am</code>,
<code>'u</code>, and <code>'o</code> are not present, but this <code>'ang</code> <code>'i</code>, <code>'u</code>, and
does not rule out the presence of a postsuffix. */ <code>'o</code> are not present, but this does not rule out
the presence of a postsuffix. */
public boolean hasSimpleSuffix() { public boolean hasSimpleSuffix() {
return ((null != suffix) && (1 == suffix.length())); return ((null != suffix) && (1 == suffix.length()));
} }
@ -280,12 +277,6 @@ public class LegalTshegBar
return (EW_ABSENT != postsuffix); return (EW_ABSENT != postsuffix);
} }
/** Returns true iff this syllable has a <code>'i</code>
* suffix. */
public boolean hasConnectiveCaseMarkerSuffix() {
return getSuffix().equals(getConnectiveCaseSuffix());
}
/** Returns the root consonant. */ /** Returns the root consonant. */
public char getRootLetter() { public char getRootLetter() {
return rootLetter; return rootLetter;
@ -324,7 +315,7 @@ public class LegalTshegBar
private final static String possibleSuffixes private final static String possibleSuffixes
= new String(new char[] { = new String(new char[] {
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achen, EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ma, EWC_achung,
EWC_ra, EWC_la, EWC_sa EWC_ra, EWC_la, EWC_sa
}); });
@ -340,18 +331,6 @@ public class LegalTshegBar
// EWSUB_ra_btags. // EWSUB_ra_btags.
} }
private final static String connectiveCaseSuffix
= new String(new char[] {
EWC_achen, EWV_i
});
/** Returns a two-codepoint string consisting of the Unicode
* representation of what THDL Extended Wylie calls
* <code>'i</code>. */
public static String getConnectiveCaseSuffix() {
return connectiveCaseSuffix;
}
private final static String thirtyConsonants private final static String thirtyConsonants
= new String(new char[] { = new String(new char[] {
EWC_ga, EWC_kha, EWC_ga, EWC_nga, EWC_ga, EWC_kha, EWC_ga, EWC_nga,
@ -359,7 +338,7 @@ public class LegalTshegBar
EWC_ta, EWC_tha, EWC_da, EWC_na, EWC_ta, EWC_tha, EWC_da, EWC_na,
EWC_pa, EWC_pha, EWC_ba, EWC_ma, EWC_pa, EWC_pha, EWC_ba, EWC_ma,
EWC_tsa, EWC_tsha, EWC_dza, EWC_wa, EWC_tsa, EWC_tsha, EWC_dza, EWC_wa,
EWC_zha, EWC_za, EWC_achen, EWC_ya, EWC_zha, EWC_za, EWC_achung, EWC_ya,
EWC_ra, EWC_la, EWC_sha, EWC_sa, EWC_ra, EWC_la, EWC_sha, EWC_sa,
EWC_ha, EWC_a EWC_ha, EWC_a
}); });
@ -388,10 +367,10 @@ public class LegalTshegBar
<p>This is not very efficient.</p> */ <p>This is not very efficient.</p> */
public static String[] getPossibleSuffixParticles() { public static String[] getPossibleSuffixParticles() {
return new String[] { return new String[] {
new String(new char[] { EWC_achen, EWV_i }), new String(new char[] { EWC_achung, EWV_i }),
new String(new char[] { EWC_achen, EWV_o }), new String(new char[] { EWC_achung, EWV_o }),
new String(new char[] { EWC_achen, EWV_u }), new String(new char[] { EWC_achung, EWV_u }),
new String(new char[] { EWC_achen, EWC_ma }), new String(new char[] { EWC_achung, EWC_ma }),
}; };
} }
@ -402,7 +381,7 @@ public class LegalTshegBar
* @see org.thdl.tib.text.tshegbar.UnicodeConstants */ * @see org.thdl.tib.text.tshegbar.UnicodeConstants */
public static String getTheFivePrefixes() { public static String getTheFivePrefixes() {
final String s = new String(new char[] { final String s = new String(new char[] {
EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achen EWC_ga, EWC_da, EWC_ba, EWC_ma, EWC_achung
}); });
ThdlDebug.verify(s.length() == 5); // DLC put this into a JUnit test to avoid the slow-down. ThdlDebug.verify(s.length() == 5); // DLC put this into a JUnit test to avoid the slow-down.
return s; return s;
@ -416,27 +395,104 @@ public class LegalTshegBar
/** Returns a String containing the nominal Unicode /** Returns a String containing the nominal Unicode
* representations of the ten suffixes. The suffixes are in * representations of the ten suffixes. The suffixes are in
* dictionary order. * dictionary order. This doesn't include oddballs like suffixes
* @see #getConnectiveCaseSuffix() * based on 'i, 'u, 'o, 'am, and 'ang.
* @see org.thdl.tib.text.tshegbar.UnicodeConstants */ * @see org.thdl.tib.text.tshegbar.UnicodeConstants */
public static String getTheTenSuffixes() { public static String getTheTenSuffixes() {
final String s = new String(new char[] { final String s = new String(new char[] {
EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba, EWC_ga, EWC_nga, EWC_da, EWC_na, EWC_ba,
EWC_ma, EWC_achen, EWC_ra, EWC_la, EWC_sa EWC_ma, EWC_achung, EWC_ra, EWC_la, EWC_sa
}); });
ThdlDebug.verify(s.length() == 10); // DLC put this into a JUnit test to avoid the slow-down.
return s; return s;
} }
/** Returns true iff x is the preferred, nominal Unicode /** Returns true iff x is the preferred, nominal Unicode
* representation of one of the ten suffixes. * representation of one of the ten suffixes.
* @see #getConnectiveCaseSuffix()
*/ */
public static boolean isNominalRepresentationOfSimpleSuffix(char x) { public static boolean isNominalRepresentationOfSimpleSuffix(char x) {
return (-1 != getTheTenSuffixes().indexOf(x)); return (-1 != getTheTenSuffixes().indexOf(x));
} }
/** Table of the legal suffix-like particles that are not among the
 *  ten simple suffixes.  Each row is a pair: element 0 is the
 *  particle's Unicode form and element 1 is the THDL Extended Wylie
 *  that is emitted for it.  When adding a row, supply both halves
 *  so that a tsheg-bar using the new particle gets the extended
 *  Wylie you intend. */
private static final String[][] oddball_suffixes = {
    // 'i -- the connective case marker:
    {
        String.valueOf(new char[] { EWC_achung, EWV_i }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.i_VOWEL
    },
    // 'u:
    {
        String.valueOf(new char[] { EWC_achung, EWV_u }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.u_VOWEL
    },
    // 'o -- in at least one context, this shows end of sentence:
    {
        String.valueOf(new char[] { EWC_achung, EWV_o }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.o_VOWEL
    },
    // 'am -- as in sgom pa'am:
    {
        String.valueOf(new char[] { EWC_achung, EWC_ma }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.WYLIE_aVOWEL
            + THDLWylieConstants.MA
    },
    // 'ang -- meaning or, as opposed to and:
    {
        String.valueOf(new char[] { EWC_achung, EWC_nga }),
        THDLWylieConstants.ACHUNG + THDLWylieConstants.WYLIE_aVOWEL
            + THDLWylieConstants.NGA
    }
};
/** Returns true iff suffix is 'i, 'o, 'u, 'am, 'ang, or a
 *  concatenation of those particles like 'u'i'o.  Returns false
 *  otherwise, including the cases that suffix is null or the empty
 *  string.
 *  @param suffix the candidate suffix in its Unicode form; may be
 *  null
 *  @return true iff suffix is made up entirely of one or more of
 *  the particles listed in oddball_suffixes */
public static boolean isAchungBasedSuffix(String suffix) {
    // Null and "" contain no particle at all, so they are not
    // achung-based suffixes.
    if (null == suffix || suffix.length() == 0)
        return false;

    // Consume one particle at a time from the front; scanning by
    // index avoids allocating a substring for every particle.
    int pos = 0;
    while (pos < suffix.length()) {
        int matchedLength = 0;
        for (int x = 0; x < oddball_suffixes.length; x++) {
            String particle = oddball_suffixes[x][0];
            if (suffix.startsWith(particle, pos)) {
                matchedLength = particle.length();
                break;
            }
        }
        if (matchedLength == 0)
            return false; // the remainder starts with no known particle
        pos += matchedLength;
    }
    return true;
}
/** Returns the THDL Extended Wylie for suffix, which must be a
 *  concatenation of the particles listed in oddball_suffixes (i.e.,
 *  a string for which isAchungBasedSuffix(String) holds).  The
 *  Wylie of each particle is appended in order, so that, e.g., the
 *  'am particle yields "'am" rather than "'m" or "'ama".
 *  @param suffix the Unicode form of the suffix; an empty string
 *  yields an empty result
 *  @return the concatenated THDL Extended Wylie
 *  @throws IllegalArgumentException if some part of suffix is not
 *  one of the known particles */
private static String getTHDLWylieForOddballSuffix(String suffix) {
    StringBuffer wylie = new StringBuffer();
    int pos = 0;
    while (pos < suffix.length()) {
        boolean matched = false;
        for (int x = 0; x < oddball_suffixes.length; x++) {
            String particle = oddball_suffixes[x][0];
            if (suffix.startsWith(particle, pos)) {
                wylie.append(oddball_suffixes[x][1]);
                pos += particle.length();
                matched = true;
                break;
            }
        }
        // Without this check, unrecognized input would leave pos
        // unchanged and this loop would never terminate.
        if (!matched)
            throw new IllegalArgumentException("Not a suffix built from 'i, 'u, 'o, 'am, and 'ang: "
                                               + suffix);
    }
    return wylie.toString();
}
/** Returns true iff the given (rootLetter, subjoinedLetter) /** Returns true iff the given (rootLetter, subjoinedLetter)
combination can accept an additional wa-zur. Only g-r-w, combination can accept an additional wa-zur. Only g-r-w,
d-r-w, and ph-y-w fall into this category according to d-r-w, and ph-y-w fall into this category according to
@ -595,8 +651,8 @@ public class LegalTshegBar
* @param subjoinedLetter the optional, subscribed consonant * @param subjoinedLetter the optional, subscribed consonant
* @param suffix the optional suffix, which is null, a String * @param suffix the optional suffix, which is null, a String
* consisting of a single consonant (i.e. a single, * consisting of a single consonant (i.e. a single,
* nondecomposable codepoint) except in the special case that * nondecomposable codepoint), or a string of 'i (U+0F60 U+0F72), 'u, 'o, 'am,
* this is {@link #getConnectiveCaseSuffix()} * and 'ang.
* @param postsuffix the optional postsuffix, which should be * @param postsuffix the optional postsuffix, which should be
* EWC_sa or EWC_da * EWC_sa or EWC_da
* @param errorBuffer if non-null, and if the return code is * @param errorBuffer if non-null, and if the return code is
@ -763,13 +819,12 @@ public class LegalTshegBar
} // subjoinedLetter tests } // subjoinedLetter tests
// Suffix tests: // Suffix tests:
// DLC NOW -- allow 'o, 'u, 'am, etc.
if (null != suffix) { if (null != suffix) {
if (!getConnectiveCaseSuffix().equals(suffix)) { if (!isAchungBasedSuffix(suffix)) {
if (suffix.length() != 1) { if (suffix.length() != 1) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf, errorBuf,
"Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am."); "Illegal suffix -- not one of the legal complex suffixes like 'u, 'o, 'i, 'am, 'ang.");
} }
if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) { if (!isNominalRepresentationOfSimpleSuffix(suffix.charAt(0))) {
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
@ -784,6 +839,10 @@ public class LegalTshegBar
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf, errorBuf,
"You cannot have a postsuffix unless you also have a suffix."); "You cannot have a postsuffix unless you also have a suffix.");
if (isAchungBasedSuffix(suffix))
return internalThrowThing(throwIfIllegal,
errorBuf,
"You cannot have a postsuffix if you have a suffix based on 'i, 'o, 'u, 'am, and 'ang.");
} }
if (EW_ABSENT != headLetter) { if (EW_ABSENT != headLetter) {
@ -812,7 +871,9 @@ public class LegalTshegBar
"The head letter sa cannot be used with that root letter."); "The head letter sa cannot be used with that root letter.");
} }
} else { } else {
// '&#92;u0F6A' is not a valid head letter, even for // Illegal head letter.
//
// Note: U+0F6A is not a valid head letter, even for
// "rnya". Use EWC_ra instead. // "rnya". Use EWC_ra instead.
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf, errorBuf,
@ -827,14 +888,14 @@ public class LegalTshegBar
&& EWV_e != vowel && EWV_e != vowel
&& EWV_o != vowel) && EWV_o != vowel)
{ {
if (EWC_achen == vowel) if (EWC_achung == vowel)
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf, errorBuf,
"The vowel given is not valid. Use EW_ABSENT for the EWC_achen sound."); "The vowel given is not valid. Use EW_ABSENT for the EWC_achung sound.");
if ('\u0F71' == vowel) if ('\u0F71' == vowel)
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf, errorBuf,
"a-chung cannot be used in a simple Tibetan syllable."); // DLC FIXME: what about pA? "a-chung can be used, but there is a flag for it; you don't call it the vowel.");
return internalThrowThing(throwIfIllegal, return internalThrowThing(throwIfIllegal,
errorBuf, errorBuf,
"The vowel given is not valid."); "The vowel given is not valid.");
@ -848,9 +909,6 @@ public class LegalTshegBar
/* /*
DLC add a method giving the correct connective case thingy or
throwing error if the 'i suffix already appears.
DLC put in a method that gets pronunciation using Unicode DLC put in a method that gets pronunciation using Unicode
diacritical marks. And another using just US Roman. Note that diacritical marks. And another using just US Roman. Note that
pronunciation is contextual, so have these methods return all pronunciation is contextual, so have these methods return all
@ -875,7 +933,7 @@ public class LegalTshegBar
boolean disambiguatorNeeded = false; boolean disambiguatorNeeded = false;
char prefix = getPrefix(); char prefix = getPrefix();
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix)); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(prefix));
if (!hasHeadLetter()) { if (!hasHeadLetter() && !hasSubjoinedLetter()) {
if (EWC_ya == rootLetter) { if (EWC_ya == rootLetter) {
if (isConsonantThatTakesYaBtags(prefix)) if (isConsonantThatTakesYaBtags(prefix))
disambiguatorNeeded = true; disambiguatorNeeded = true;
@ -891,7 +949,7 @@ public class LegalTshegBar
} }
} }
if (disambiguatorNeeded) if (disambiguatorNeeded)
sb.append(TibetanMachineWeb.WYLIE_DISAMBIGUATING_KEY); sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY);
} }
if (hasHeadLetter()) if (hasHeadLetter())
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter())); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getHeadLetter()));
@ -914,14 +972,14 @@ public class LegalTshegBar
// DLC FIXME: are these allowed in legal Tibetan? // DLC FIXME: are these allowed in legal Tibetan?
// EWTS would have special cases for them if so, // EWTS would have special cases for them if so,
// I'd wager... // I'd wager, so I bet they're not.
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel));
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()));
} else { } else {
ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?"); ThdlDebug.abort("only simple vowels occur in this class, how did this get past internalLegalityTest(..)?");
} }
} else { } else {
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung)); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(EW_achung_vowel));
} }
} else { } else {
if (hasExplicitVowel()) if (hasExplicitVowel())
@ -930,19 +988,34 @@ public class LegalTshegBar
sb.append("a"); sb.append("a");
} }
String suf = null;
if (hasSuffix()) { if (hasSuffix()) {
String suf = getSuffix(); suf = getSuffix();
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
if (suf.length() > 1) { if (suf.length() > 1) {
// DLC assert, don't verify, that the length is two. // pa'am, not pa'm or pa'ama!
// This could change if I learn of more suffix sb.append(getTHDLWylieForOddballSuffix(suf));
// particles. } else {
ThdlDebug.verify(2 == suf.length()); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(0)));
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(suf.charAt(1)));
} }
} }
if (hasPostsuffix()) if (hasPostsuffix()) {
// lar.d, la-ra-da, needs a disambiguator. EWC_sa doesn't
// take any head letters, but EWC_da does.
boolean disambiguatorNeeded = false;
if (getPostsuffix() == EWC_da) {
if (suf.length() == 1) {
char simpleSuffix = suf.charAt(0);
if (EWC_ra == simpleSuffix
|| EWC_la == simpleSuffix
|| EWC_sa == simpleSuffix) {
disambiguatorNeeded = true;
}
}
}
if (disambiguatorNeeded)
sb.append(THDLWylieConstants.WYLIE_DISAMBIGUATING_KEY);
sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix())); sb.append(UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getPostsuffix()));
}
return sb; return sb;
} }
@ -987,7 +1060,7 @@ public class LegalTshegBar
? "hasAChungOnRootLetter=\"true\"" ? "hasAChungOnRootLetter=\"true\""
: "") : "")
// DLC NOW: what about the root letter a, i.e. &#92;u0F68 ? do we want the EWTS to be 'aa' ? // DLC NOW FIXME: what about the root letter a, i.e. &#92;u0F68 ? do we want the EWTS to be 'aa' ?
+ ("vowel=\"" + ("vowel=\""
+ (hasExplicitVowel() + (hasExplicitVowel()
? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel()) ? UnicodeCodepointToThdlWylie.getThdlWylieForUnicodeCodepoint(getVowel())
@ -1019,7 +1092,8 @@ public class LegalTshegBar
sb.append(getPrefix()); sb.append(getPrefix());
} }
if (hasHeadLetter()) { if (hasHeadLetter()) {
// DLC FIXME this crap won't be true... // DLC NOW FIXME this crap won't be true... it's what we must
// convert to, though. Do it.
ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix())); ThdlDebug.verify(UnicodeUtils.isNonSubjoinedConsonant(getPrefix()));
ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getRootLetter())); ThdlDebug.verify(UnicodeUtils.isSubjoinedConsonant(getRootLetter()));
sb.append(getHeadLetter()); sb.append(getHeadLetter());
@ -1036,8 +1110,8 @@ public class LegalTshegBar
sb.append(EWSUB_wa_zur); sb.append(EWSUB_wa_zur);
} }
if (hasAChungOnRootLetter()) { if (hasAChungOnRootLetter()) {
ThdlDebug.verify('\u0F71' == EW_achung); ThdlDebug.verify('\u0F71' == EW_achung_vowel);
sb.append(EW_achung); sb.append(EW_achung_vowel);
} }
if (hasExplicitVowel()) { if (hasExplicitVowel()) {
sb.append(getVowel()); sb.append(getVowel());

View file

@ -38,8 +38,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
junit.textui.TestRunner.run(LegalTshegBarTest.class); junit.textui.TestRunner.run(LegalTshegBarTest.class);
} }
/** Tests the getThdlWylie() method to see if we handle long,
    achung-based suffix-like strings as in "le'u'i'o", "sgom
    pa'am", "sgom pa'ang", etc.
*/
public void testGetThdlWylieForLongSuffixLikeThings() {
    // assertEquals is used instead of assertTrue(a.equals(b)) so
    // that a failure reports the transliteration actually produced.
    assertEquals("le'u'i'o",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la,
                                   EW_ABSENT, false, false,
                                   new String(new char[] {
                                       EWC_achung, EWV_u,
                                       EWC_achung, EWV_i,
                                       EWC_achung, EWV_o
                                   }),
                                   EW_ABSENT, EWV_e).getThdlWylie().toString());

    // An arbitrarily long chain of particles, with no explicit vowel:
    assertEquals("la'u'i'o'am'ang'o'am",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_la,
                                   EW_ABSENT, false, false,
                                   new String(new char[] {
                                       EWC_achung, EWV_u,
                                       EWC_achung, EWV_i,
                                       EWC_achung, EWV_o,
                                       EWC_achung, EWC_ma,
                                       EWC_achung, EWC_nga,
                                       EWC_achung, EWV_o,
                                       EWC_achung, EWC_ma
                                   }),
                                   EW_ABSENT, EW_ABSENT).getThdlWylie().toString());

    // pa'am, not pa'm or pa'ama:
    assertEquals("pa'am",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
                                   EW_ABSENT, false, false,
                                   new String(new char[] { EWC_achung, EWC_ma }),
                                   EW_ABSENT, EW_ABSENT).getThdlWylie().toString());

    assertEquals("pa'ang",
                 new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
                                   EW_ABSENT, false, false,
                                   new String(new char[] { EWC_achung, EWC_nga }),
                                   EW_ABSENT, EW_ABSENT).getThdlWylie().toString());
}
/** Tests the getThdlWylie() method and one of the constructors. */ /** Tests the getThdlWylie() method and one of the constructors. */
public void testGetThdlWylie() { public void testGetThdlWylie() {
// do we disambiguate when needed?
{
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_ga, EWC_ya,
false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("gyo"));
assertTrue(new LegalTshegBar(EWC_ga, EW_ABSENT, EWC_ya, EW_ABSENT,
false, false, EW_ABSENT, EW_ABSENT, EWV_o).getThdlWylie().toString().equals("g.yo"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT,
false, false, EWC_ga, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("b.lag"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_la, EW_ABSENT,
false, false, EWC_ga, EWC_sa, EW_ABSENT).getThdlWylie().toString().equals("b.lags"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("b.ragd"));
assertTrue(new LegalTshegBar(EWC_ba, EW_ABSENT, EWC_ra, EWC_la,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brlagd"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_ra, EWC_ga, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("brgagd"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_ha, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("blhagd"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_la, EWC_da, EW_ABSENT,
false, false, EWC_ga, EWC_da, EW_ABSENT).getThdlWylie().toString().equals("bldagd"));
}
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWC_ra, assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, EWC_ra,
false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols")); false, true, EWC_la, EWC_sa, EWV_o).getThdlWylie().toString().equals("bsgrAols"));
assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga, assertTrue(new LegalTshegBar(EWC_ba, EWC_sa, EWC_ga,
@ -81,6 +137,10 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
EWC_la, false, false, EWC_la, false, false,
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("sla")); null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("sla"));
assertTrue(new LegalTshegBar(EW_ABSENT, EW_ABSENT, EWC_pa,
EW_ABSENT, false, true,
null, EW_ABSENT, EW_ABSENT).getThdlWylie().toString().equals("pA"));
{ {
boolean threw = false; boolean threw = false;
try { try {
@ -159,4 +219,64 @@ public class LegalTshegBarTest extends TestCase implements UnicodeConstants {
} }
assertTrue(x); assertTrue(x);
} }
/** Tests {@link
 *  org.thdl.tib.text.tshegbar.LegalTshegBar#getTheTenSuffixes()}:
 *  the result has ten characters and its first, fifth, and last
 *  characters are EWC_ga, EWC_ba, and EWC_sa respectively. */
public void testGetTheTenSuffixes() {
    String x = LegalTshegBar.getTheTenSuffixes();
    // assertEquals reports the offending value on failure, which
    // assertTrue(a == b) cannot do.
    assertEquals(10, x.length());
    assertEquals(EWC_ga, x.charAt(0));
    assertEquals(EWC_ba, x.charAt(4));
    assertEquals(EWC_sa, x.charAt(9));
}
/** Tests {@link
 *  org.thdl.tib.text.tshegbar.LegalTshegBar#isAchungBasedSuffix(String)}
 *  against single particles, concatenations of particles, and
 *  strings that must be rejected. */
public void testIsAchungBasedSuffix() {
    // Every particle is accepted when it stands alone:
    final char[][] singleParticles = {
        { EWC_achung, EWC_nga },
        { EWC_achung, EWC_ma },
        { EWC_achung, EWV_i },
        { EWC_achung, EWV_o },
        { EWC_achung, EWV_u }
    };
    for (int k = 0; k < singleParticles.length; k++) {
        String particle = new String(singleParticles[k]);
        assertTrue(LegalTshegBar.isAchungBasedSuffix(particle));
    }

    // A concatenation of particles is accepted:
    String threeParticles = new String(new char[] {
        EWC_achung, EWV_u,
        EWC_achung, EWV_i,
        EWC_achung, EWV_o
    });
    assertTrue(LegalTshegBar.isAchungBasedSuffix(threeParticles));

    // ...but not when trailed by a consonant lacking its EWC_achung:
    String trailingBareNga = new String(new char[] {
        EWC_achung, EWV_u,
        EWC_achung, EWV_i,
        EWC_achung, EWV_o, /* no EWC_achung, */ EWC_nga
    });
    assertTrue(!LegalTshegBar.isAchungBasedSuffix(trailingBareNga));

    // syntactically illegal, I'd bet, but our algorithm allows it:
    String longChain = new String(new char[] {
        EWC_achung, EWC_ma,
        EWC_achung, EWV_i,
        EWC_achung, EWV_i,
        EWC_achung, EWV_i,
        EWC_achung, EWV_o,
        EWC_achung, EWC_nga,
        EWC_achung, EWV_o
    });
    assertTrue(LegalTshegBar.isAchungBasedSuffix(longChain));

    // EWC_achung followed by something that forms no particle:
    String achungLa = new String(new char[] { EWC_achung, EWC_la });
    assertTrue(!LegalTshegBar.isAchungBasedSuffix(achungLa));
    String achungE = new String(new char[] { EWC_achung, EWV_e });
    assertTrue(!LegalTshegBar.isAchungBasedSuffix(achungE));

    // The empty string is not a suffix at all:
    assertTrue(!LegalTshegBar.isAchungBasedSuffix(""));
}
} }

View file

@ -83,20 +83,21 @@ public interface UnicodeConstants {
static final char EWC_za = '\u0F5F'; static final char EWC_za = '\u0F5F';
/** Note the irregular name. The Extended Wylie representation is /** Note the irregular name. The Extended Wylie representation is
<code>'a</code>. */ <code>'a</code>. */
static final char EWC_achen = '\u0F60'; /* DLC NOW is this achen or achung? achen is EWC_a, right? comment it. replace EWC_achen everywhere if you change it. */ static final char EWC_achung = '\u0F60';
static final char EWC_ya = '\u0F61'; static final char EWC_ya = '\u0F61';
static final char EWC_ra = '\u0F62'; static final char EWC_ra = '\u0F62';
static final char EWC_la = '\u0F63'; static final char EWC_la = '\u0F63';
static final char EWC_sha = '\u0F64'; static final char EWC_sha = '\u0F64';
static final char EWC_sa = '\u0F66'; static final char EWC_sa = '\u0F66';
static final char EWC_ha = '\u0F67'; static final char EWC_ha = '\u0F67';
/** achen, the 30th consonant (and, some say, the fifth vowel) DLC NOW FIXME: rename to EWC_achen */
static final char EWC_a = '\u0F68'; static final char EWC_a = '\u0F68';
/** In the word for father, "pA lags", there is an a-chung (i.e., /** In the word for father, "pA lags", there is an a-chung (i.e.,
<code>\u0F71</code>). This is the constant for that little <code>\u0F71</code>). This is the constant for that little
guy. */ guy. */
static final char EW_achung = '\u0F71'; static final char EW_achung_vowel = '\u0F71';
/* Four of the five vowels, some say, or, others say, "the four /* Four of the five vowels, some say, or, others say, "the four

View file

@ -127,11 +127,12 @@ public class UnicodeGraphemeCluster
/** Returns the THDL Extended Wylie transliteration of this /** Returns the THDL Extended Wylie transliteration of this
grapheme cluster, or null if there is none (which happens for grapheme cluster, or null if there is none (which happens for
a few Tibetan codepoints, if you'll recall). If needsVowel is a few Tibetan codepoints, if you'll recall). If needsVowel is
true, then an "a" will be appended when there is no EW_achung true, then an "a" will be appended when there is no
or explicit simple vowel. If there is an explicit vowel or EW_achung_vowel or explicit simple vowel. If there is an
EW_achung, it will always be present. Note that needsVowel is explicit vowel or EW_achung_vowel, it will always be present.
provided because btags is the preferred THDL Extended Wylie Note that needsVowel is provided because btags is the
for the four contiguous grapheme clusters preferred THDL Extended Wylie for the four contiguous grapheme
clusters
<code>"&#92;u0F56&#92;u0F4F&#92;u0F42&#92;u0F66"</code>, and <code>"&#92;u0F56&#92;u0F4F&#92;u0F42&#92;u0F66"</code>, and
needsVowel must be set to false for all but the grapheme needsVowel must be set to false for all but the grapheme
cluster corresponding to <code>&#92;u0F4F</code> if you wish cluster corresponding to <code>&#92;u0F4F</code> if you wish
@ -257,7 +258,7 @@ public class UnicodeGraphemeCluster
/** Returns the <i>height</i> for the Tibetan Unicode codepoint x. /** Returns the <i>height</i> for the Tibetan Unicode codepoint x.
This relative height is 0 for a base consonant, digit, This relative height is 0 for a base consonant, digit,
punctuation, mark, or sign. It is -1 for a subjoined punctuation, mark, or sign. It is -1 for a subjoined
consonant, -2 for EWSUB_wa_zur, -3 for EW_achung, +1 for consonant, -2 for EWSUB_wa_zur, -3 for EW_achung_vowel, +1 for
EWV_gigu, and so on according to the height these codepoints EWV_gigu, and so on according to the height these codepoints
appear relative to one another when on the same stack. If two appear relative to one another when on the same stack. If two
codepoints have equal height, they should not exist in the codepoints have equal height, they should not exist in the