Refactored this code so that Wylie->Tibetan and ACIP->Tibetan

conversions can make use of it.  Hooray for reuse.
This commit is contained in:
dchandler 2003-08-10 19:02:56 +00:00
parent bcf1c12b6a
commit 39e0435b6b
5 changed files with 195 additions and 47 deletions

View file

@ -0,0 +1,29 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
/** An indexable sequence of {@link TGCPair TGCPairs}.  This
 *  interface declares only accessors; it has no methods for
 *  changing the sequence.
 *  @author David Chandler */
public interface TGCList {
    /** Returns the grapheme cluster found at position <i>i</i> of
     *  this list.
     *  @param i the position of the desired grapheme cluster
     *  @return the grapheme cluster at that position */
    TGCPair get(int i);

    /** Returns how many grapheme clusters this list holds.
     *  @return the number of grapheme clusters */
    int size();
}

View file

@ -0,0 +1,50 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
/** Pairs the classification of a Tibetan grapheme cluster (the term
    is defined in {@link
    org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster}) with that
    cluster's context-insensitive THDL Extended Wylie representation.
    NOTE WELL: this is not a real grapheme cluster; the term is being
    misused here (FIXME).  It's actually whole or part of one.  It's
    part of one when this is a vowel or U+0F7F alone.
    @author David Chandler */
public class TGCPair {
    // The possible values of the classification field.
    public static final int OTHER = 1;
    // a standalone achen would fall into this category:
    public static final int CONSONANTAL_WITHOUT_VOWEL = 2;
    public static final int CONSONANTAL_WITH_VOWEL = 3;
    public static final int LONE_VOWEL = 4;
    public static final int SANSKRIT_WITHOUT_VOWEL = 5;
    public static final int SANSKRIT_WITH_VOWEL = 6;

    /** The Wylie text handed to the constructor. */
    public String wylie;

    /** The classification handed to the constructor; it is not
        validated against the constants declared above. */
    public int classification;

    /** Creates a pair that stores both arguments as given.
        @param wylie the Wylie representation
        @param classification the classification code */
    public TGCPair(String wylie, int classification) {
        this.classification = classification;
        this.wylie = wylie;
    }

    /** Returns an XML-like rendering of both fields.
        @return a string of the form
        <code>&lt;TGCPair wylie=W classification=C/&gt;</code> */
    public String toString() {
        StringBuffer sb = new StringBuffer("<TGCPair wylie=");
        sb.append(wylie);
        sb.append(" classification=");
        sb.append(classification);
        sb.append("/>");
        return sb.toString();
    }
}

View file

@ -0,0 +1,48 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
import java.util.ArrayList;
/** An {@link ArrayList}-backed {@link TGCList} of pseudo-grapheme
 *  clusters (vowels appear alone, FIXME: change the name), all in
 *  TibetanMachineWeb.
 *  @author David Chandler */
class TMWGCList implements TGCList {
    /** Backing store; {@link #add(TGCPair)} is the only method that
     *  puts elements into it. */
    private ArrayList pairs;

    /** Creates a TMWGCList that holds no pairs. */
    TMWGCList() {
        this.pairs = new ArrayList();
    }

    /** Creates a TMWGCList that holds no pairs but whose backing
     *  store starts out with room for <code>size</code> of them.
     *  @param size the initial capacity of the backing store */
    TMWGCList(int size) {
        this.pairs = new ArrayList(size);
    }

    /** Appends <code>tp</code> to the end of this list.
     *  @param tp the pair to append */
    void add(TGCPair tp) {
        pairs.add(tp);
    }

    /** Returns the pair stored at position <code>i</code>.
     *  @param i the position to read
     *  @return the pair at that position */
    public TGCPair get(int i) {
        Object element = pairs.get(i);
        return (TGCPair)element;
    }

    /** Returns the number of pairs added so far.
     *  @return the size of the backing store */
    public int size() {
        return pairs.size();
    }
}

View file

@ -830,17 +830,21 @@ public class TibTextUtils implements THDLWylieConstants {
consonant or consonant stack with optional adornment or a consonant or consonant stack with optional adornment or a
number (possibly super- or subscribed) or some other glyph number (possibly super- or subscribed) or some other glyph
alone. */ alone. */
private static ArrayList breakTshegBarIntoGraphemeClusters(java.util.List glyphList, private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
boolean noSuchWylie[]) { boolean noSuchWylie[]) {
// Definition: adornment means vowels and achungs and bindus. // Definition: adornment means vowels and achungs and bindus.
// DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
// David Chapman and I both need a comprehensive list of these
// guys.
int sz = glyphList.size(); int sz = glyphList.size();
ThdlDebug.verify(sz > 0); ThdlDebug.verify(sz > 0);
// A list of grapheme clusters (see UnicodeGraphemeCluster). // A list of grapheme clusters (see UnicodeGraphemeCluster).
// sz is an overestimate (speeds us up, wastes some memory). // sz is an overestimate (speeds us up, wastes some memory).
ArrayList gcs = new ArrayList(sz); TMWGCList gcs = new TMWGCList(sz);
StringBuffer buildingUpGc = new StringBuffer(); StringBuffer buildingUpGc = new StringBuffer();
@ -919,14 +923,22 @@ public class TibTextUtils implements THDLWylieConstants {
} }
private static String getClassificationOfTshegBar(ArrayList gcs, public static String getClassificationOfTshegBar(TGCList gcs,
StringBuffer warnings) { // DLC the warnings are Wylie-specific
StringBuffer warnings) {
String candidateType = null; String candidateType = null;
// Now that we have grapheme clusters, see if they match any // Now that we have grapheme clusters, see if they match any
// of the "legal tsheg bars": // of the "legal tsheg bars":
int sz = gcs.size(); int sz = gcs.size();
if (sz == 1) {
TGCPair tp = gcs.get(0);
int cls = tp.classification;
if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
|| TGCPair.SANSKRIT_WITH_VOWEL == cls)
return "single-sanskrit-gc";
}
for (int i = 0; i < sz; i++) { for (int i = 0; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i); TGCPair tp = gcs.get(i);
int cls = tp.classification; int cls = tp.classification;
String wylie = tp.wylie; String wylie = tp.wylie;
if (TGCPair.OTHER == cls) { if (TGCPair.OTHER == cls) {
@ -964,7 +976,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) { if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between ba's, // peek ahead to distinguish between ba's,
// ba'ala and ba'am: // ba'ala and ba'am:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root"; candidateType = "maybe-appendaged-prefix/root";
@ -982,7 +994,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) { if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between pa's, // peek ahead to distinguish between pa's,
// pa'ala and pa'am: // pa'ala and pa'am:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-root"; candidateType = "maybe-appendaged-root";
@ -1003,7 +1015,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) { if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between bpa's, // peek ahead to distinguish between bpa's,
// bpa'ala and bpa'am: // bpa'ala and bpa'am:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix-root"; candidateType = "maybe-appendaged-prefix-root";
@ -1025,7 +1037,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) { if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between // peek ahead to distinguish between
// gga'am and gaga'ala: // gga'am and gaga'ala:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie; String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) { if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root-root/suffix"; candidateType = "maybe-appendaged-prefix/root-root/suffix";
@ -1207,11 +1219,12 @@ public class TibTextUtils implements THDLWylieConstants {
boolean noSuchWylie[], boolean noSuchWylie[],
StringBuffer warnings, StringBuffer warnings,
StringBuffer wylieBuffer) { StringBuffer wylieBuffer) {
ArrayList gcs TGCList gcs
= breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie); = breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
String candidateType = getClassificationOfTshegBar(gcs, warnings); String candidateType = getClassificationOfTshegBar(gcs, warnings);
int sz = gcs.size(); int sz = gcs.size();
if (candidateType == "invalid") { if (candidateType == "invalid"
|| candidateType == "single-sanskrit-gc") {
// Forget beauty and succinctness -- just be sure to // Forget beauty and succinctness -- just be sure to
// generate Wylie that can be converted unambiguously into // generate Wylie that can be converted unambiguously into
// Tibetan. Use a disambiguator or vowel after each // Tibetan. Use a disambiguator or vowel after each
@ -1243,10 +1256,7 @@ public class TibTextUtils implements THDLWylieConstants {
// Appendaged vs. not appendaged? it affects nothing at // Appendaged vs. not appendaged? it affects nothing at
// this stage. // this stage.
if (candidateType.startsWith("appendaged-")) { candidateType = getCandidateTypeModuloAppendage(candidateType);
candidateType
= candidateType.substring("appendaged-".length()).intern();
}
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) { if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
/* Yes, this is ambiguous. How do we handle it? See /* Yes, this is ambiguous. How do we handle it? See
@ -1439,29 +1449,35 @@ public class TibTextUtils implements THDLWylieConstants {
else else
return null; return null;
} }
}
/** An ordered pair consisting of a Tibetan grapheme cluster's (see /** Returns "root" instead of "appendaged-root", for example. */
{@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a private static final String getCandidateTypeModuloAppendage(String candidateType) {
definition of the term}) classification and its if (candidateType.startsWith("appendaged-")) {
context-insensitive THDL Extended Wylie representation. */ candidateType
class TGCPair { = candidateType.substring("appendaged-".length()).intern();
static final int OTHER = 1; }
// a standalone achen would fall into this category: return candidateType;
static final int CONSONANTAL_WITHOUT_VOWEL = 2;
static final int CONSONANTAL_WITH_VOWEL = 3;
static final int LONE_VOWEL = 4;
static final int SANSKRIT_WITHOUT_VOWEL = 5;
static final int SANSKRIT_WITH_VOWEL = 6;
String wylie;
int classification;
TGCPair(String wylie, int classification) {
this.wylie = wylie;
this.classification = classification;
} }
public String toString() {
return "<TGCPair wylie=" + wylie + " classification=" /** Returns an array of size 2 that lists all the possible indices
+ classification + "/>"; * of the root stack given the chosen candidate type. A negative
* number appears if there are not that many possible positions
* for the root. (You'll get two negative numbers if there is no
* root stack.) */
public static final int[] getIndicesOfRootForCandidateType(String candidateType) {
// Appendaged vs. not appendaged? it affects nothing.
candidateType = getCandidateTypeModuloAppendage(candidateType);
int[] rv = new int[] { -1, -1 };
if (candidateType == "prefix/root"
|| candidateType.startsWith("root")) {
rv[0] = 0;
} else if (candidateType.startsWith("prefix/root-")) {
rv[0] = 0;
rv[1] = 1;
} else if (candidateType.startsWith("prefix-root")) {
rv[0] = 1;
}
return rv;
} }
} }

View file

@ -877,7 +877,8 @@ public static boolean isWylieChar(String s) {
/** /**
* Checks to see if the passed string is a consonant or unadorned * Checks to see if the passed string is a consonant or unadorned
* consonant stack in Extended Wylie. * consonant stack in Extended Wylie. The string shouldn't have any
* '+' or '.' characters in it if you want this to return true.
* @param s the string to be checked * @param s the string to be checked
* @return true if s is such in Extended Wylie transliteration, false * @return true if s is such in Extended Wylie transliteration, false
* if not */ * if not */
@ -1151,8 +1152,8 @@ public static String getWylieForVowel(String s) {
/** /**
* Gets the DuffCode required for a vowel, if * Gets the DuffCode required for a vowel, if
* affixed to the given hashKey. * affixed to the given hashKey.
* @param hashKey the key for the character the * @param hashKey the key for the character the vowel is to be affixed
* vowel is to be affixed to * to; see {@link #getGlyph(String)} to learn about hash keys.
* @param vowel the vowel you want the DuffCode for * @param vowel the vowel you want the DuffCode for
* @return the DuffCode for the vowel in the given * @return the DuffCode for the vowel in the given
* context, or null if there is no such vowel in * context, or null if there is no such vowel in
@ -1170,7 +1171,8 @@ public static DuffCode getVowel(String hashKey, int vowel) {
/** /**
* Checks to see if a glyph exists for this hash key. * Checks to see if a glyph exists for this hash key.
* @param hashKey the key to be checked * @param hashKey the key to be checked; see {@link #getGlyph(String)}
* to learn about hash keys.
* @return true if there is a glyph corresponding to * @return true if there is a glyph corresponding to
* hashKey, false if not * hashKey, false if not
*/ */
@ -1198,7 +1200,8 @@ public static DuffCode getGlyph(String hashKey) {
/** /**
* Gets the half height character for this hash key. * Gets the half height character for this hash key.
* @param hashKey the key you want a half height glyph for * @param hashKey the key you want a half height glyph for; see {@link
* #getGlyph(String)} to learn about hash keys.
* @return the TibetanMachineWeb DuffCode of hashKey's * @return the TibetanMachineWeb DuffCode of hashKey's
* reduced height glyph, or null if there is no such glyph * reduced height glyph, or null if there is no such glyph
* @see DuffCode * @see DuffCode
@ -1627,8 +1630,8 @@ public static int getTMWFontNumber(String name) {
* Gets the hash key associated with this glyph. * Gets the hash key associated with this glyph.
* @param font a TibetanMachineWeb font number * @param font a TibetanMachineWeb font number
* @param code an ASCII character code minus 32 * @param code an ASCII character code minus 32
* @return the hashKey corresponding to the character * @return the hashKey corresponding to the character at font, code;
* at font, code * see {@link #getGlyph(String)} to learn about hash keys.
*/ */
public static String getHashKeyForGlyph(int font, int code) { public static String getHashKeyForGlyph(int font, int code) {
code = code - 32; code = code - 32;
@ -1640,7 +1643,8 @@ public static String getHashKeyForGlyph(int font, int code) {
* none (probably because this glyph has no THDL Extended Wylie * none (probably because this glyph has no THDL Extended Wylie
* transcription). * transcription).
* @param dc a DuffCode denoting a TibetanMachineWeb glyph * @param dc a DuffCode denoting a TibetanMachineWeb glyph
* @return the hashKey corresponding to the character at dc */ * @return the hashKey corresponding to the character at dc; see {@link
* #getGlyph(String)} to learn about hash keys. */
public static String getHashKeyForGlyph(DuffCode dc) { public static String getHashKeyForGlyph(DuffCode dc) {
int font = dc.getFontNum(); int font = dc.getFontNum();
int code = dc.getCharNum()-32; int code = dc.getCharNum()-32;
@ -1654,7 +1658,8 @@ public static String getHashKeyForGlyph(DuffCode dc) {
* This method takes a hash key and converts it to its correct * This method takes a hash key and converts it to its correct
* Wylie value, and therefore is useful in conversions from * Wylie value, and therefore is useful in conversions from
* TibetanMachineWeb to Wylie. * TibetanMachineWeb to Wylie.
* @param hashKey the hash key for a glyph * @param hashKey the hash key for a glyph; see {@link
* #getGlyph(String)} to learn about hash keys.
* @return the Wylie value of that hash key * @return the Wylie value of that hash key
*/ */
public static String wylieForGlyph(String hashKey) { public static String wylieForGlyph(String hashKey) {