Refactored this code so that Wylie->Tibetan and ACIP->Tibetan

conversions can make use of it.  Hooray for reuse.
This commit is contained in:
dchandler 2003-08-10 19:02:56 +00:00
parent bcf1c12b6a
commit 39e0435b6b
5 changed files with 195 additions and 47 deletions

View file

@ -0,0 +1,29 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
/**
 * An indexable sequence of {@link TGCPair TGCPairs}.
 *
 * <p>This interface declares only an element count and positional
 * access; it declares no way to modify the sequence.
 *
 * @author David Chandler
 */
public interface TGCList {

    /**
     * Returns the <i>i</i>th grapheme cluster in this list.
     *
     * @param i the position of the wanted grapheme cluster
     * @return the pair found at position <code>i</code>
     */
    TGCPair get(int i);

    /**
     * Returns the number of grapheme clusters in this list.
     *
     * @return how many pairs this list holds
     */
    int size();
}

View file

@ -0,0 +1,50 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
/**
 * Couples the classification of a Tibetan grapheme cluster (see
 * {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
 * definition of the term) with that cluster's context-insensitive
 * THDL Extended Wylie representation.
 *
 * <p>NOTE WELL: this is not a real grapheme cluster; the term is
 * being misused here (FIXME).  An instance is actually the whole of a
 * grapheme cluster or a part of one.  It is a part of one when it is
 * a vowel or U+0F7F alone.
 *
 * @author David Chandler
 */
public class TGCPair {

    /** One of the values that {@link #classification} may take. */
    public static final int OTHER = 1;

    /**
     * One of the values that {@link #classification} may take.  A
     * standalone achen would fall into this category.
     */
    public static final int CONSONANTAL_WITHOUT_VOWEL = 2;

    /** One of the values that {@link #classification} may take. */
    public static final int CONSONANTAL_WITH_VOWEL = 3;

    /** One of the values that {@link #classification} may take. */
    public static final int LONE_VOWEL = 4;

    /** One of the values that {@link #classification} may take. */
    public static final int SANSKRIT_WITHOUT_VOWEL = 5;

    /** One of the values that {@link #classification} may take. */
    public static final int SANSKRIT_WITH_VOWEL = 6;

    /** The context-insensitive THDL Extended Wylie representation. */
    public String wylie;

    /** The classification; the constants of this class name the known values. */
    public int classification;

    /**
     * Creates a pair holding exactly the given values; neither is
     * validated.
     *
     * @param wylie the THDL Extended Wylie representation
     * @param classification the classification code
     */
    public TGCPair(String wylie, int classification) {
        this.classification = classification;
        this.wylie = wylie;
    }

    /**
     * Returns an XML-like, single-element rendering of this pair that
     * shows both fields.
     *
     * @return a string of the form
     * <code>&lt;TGCPair wylie=W classification=C/&gt;</code>
     */
    public String toString() {
        StringBuffer sb = new StringBuffer("<TGCPair wylie=");
        sb.append(wylie);
        sb.append(" classification=");
        sb.append(classification);
        sb.append("/>");
        return sb.toString();
    }
}

View file

@ -0,0 +1,48 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text;
import java.util.ArrayList;
/**
 * A {@link TGCList} kept in an <code>ArrayList</code>.  It holds
 * pseudo-grapheme clusters (vowels appear alone, FIXME: change the
 * name), all in TibetanMachineWeb.
 *
 * @author David Chandler
 */
class TMWGCList implements TGCList {

    /** Backing storage; {@link #add(TGCPair)} is the only way in. */
    private ArrayList pairs;

    /** Constructs an empty TMWGCList. */
    TMWGCList() {
        this.pairs = new ArrayList();
    }

    /**
     * Constructs an empty TMWGCList ready to hold size TGCPairs.
     *
     * @param size the initial capacity handed to the backing
     * <code>ArrayList</code>
     */
    TMWGCList(int size) {
        this.pairs = new ArrayList(size);
    }

    /** Returns the number of pairs added so far. */
    public int size() {
        return this.pairs.size();
    }

    /**
     * Returns the pair stored at position i.
     *
     * @param i the position to read
     * @return the element at that position, cast to TGCPair
     */
    public TGCPair get(int i) {
        Object element = this.pairs.get(i);
        return (TGCPair) element;
    }

    /**
     * Appends tp to the end of this list.
     *
     * @param tp the pair to append
     */
    void add(TGCPair tp) {
        this.pairs.add(tp);
    }
}

View file

@ -830,17 +830,21 @@ public class TibTextUtils implements THDLWylieConstants {
consonant or consonant stack with optional adornment or a
number (possibly super- or subscribed) or some other glyph
alone. */
private static ArrayList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
boolean noSuchWylie[]) {
private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
boolean noSuchWylie[]) {
// Definition: adornment means vowels and achungs and bindus.
// DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
// David Chapman and I both need a comprehensive list of these
// guys.
int sz = glyphList.size();
ThdlDebug.verify(sz > 0);
// A list of grapheme clusters (see UnicodeGraphemeCluster).
// sz is an overestimate (speeds us up, wastes some memory).
ArrayList gcs = new ArrayList(sz);
TMWGCList gcs = new TMWGCList(sz);
StringBuffer buildingUpGc = new StringBuffer();
@ -919,14 +923,22 @@ public class TibTextUtils implements THDLWylieConstants {
}
private static String getClassificationOfTshegBar(ArrayList gcs,
StringBuffer warnings) {
public static String getClassificationOfTshegBar(TGCList gcs,
// DLC the warnings are Wylie-specific
StringBuffer warnings) {
String candidateType = null;
// Now that we have grapheme clusters, see if they match any
// of the "legal tsheg bars":
int sz = gcs.size();
if (sz == 1) {
TGCPair tp = gcs.get(0);
int cls = tp.classification;
if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
|| TGCPair.SANSKRIT_WITH_VOWEL == cls)
return "single-sanskrit-gc";
}
for (int i = 0; i < sz; i++) {
TGCPair tp = (TGCPair)gcs.get(i);
TGCPair tp = gcs.get(i);
int cls = tp.classification;
String wylie = tp.wylie;
if (TGCPair.OTHER == cls) {
@ -964,7 +976,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between ba's,
// ba'ala and ba'am:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root";
@ -982,7 +994,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between pa's,
// pa'ala and pa'am:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-root";
@ -1003,7 +1015,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between bpa's,
// bpa'ala and bpa'am:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix-root";
@ -1025,7 +1037,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between
// gga'am and gaga'ala:
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root-root/suffix";
@ -1207,11 +1219,12 @@ public class TibTextUtils implements THDLWylieConstants {
boolean noSuchWylie[],
StringBuffer warnings,
StringBuffer wylieBuffer) {
ArrayList gcs
TGCList gcs
= breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
String candidateType = getClassificationOfTshegBar(gcs, warnings);
int sz = gcs.size();
if (candidateType == "invalid") {
if (candidateType == "invalid"
|| candidateType == "single-sanskrit-gc") {
// Forget beauty and succinctness -- just be sure to
// generate Wylie that can be converted unambiguously into
// Tibetan. Use a disambiguator or vowel after each
@ -1243,10 +1256,7 @@ public class TibTextUtils implements THDLWylieConstants {
// Appendaged vs. not appendaged? it affects nothing at
// this stage.
if (candidateType.startsWith("appendaged-")) {
candidateType
= candidateType.substring("appendaged-".length()).intern();
}
candidateType = getCandidateTypeModuloAppendage(candidateType);
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
/* Yes, this is ambiguous. How do we handle it? See
@ -1439,29 +1449,35 @@ public class TibTextUtils implements THDLWylieConstants {
else
return null;
}
}
/** An ordered pair consisting of a Tibetan grapheme cluster's (see
{@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
definition of the term}) classification and its
context-insensitive THDL Extended Wylie representation. */
class TGCPair {
static final int OTHER = 1;
// a standalone achen would fall into this category:
static final int CONSONANTAL_WITHOUT_VOWEL = 2;
static final int CONSONANTAL_WITH_VOWEL = 3;
static final int LONE_VOWEL = 4;
static final int SANSKRIT_WITHOUT_VOWEL = 5;
static final int SANSKRIT_WITH_VOWEL = 6;
String wylie;
int classification;
TGCPair(String wylie, int classification) {
this.wylie = wylie;
this.classification = classification;
/** Returns "root" instead of "appendaged-root", for example.  A
 *  candidate type that does not start with "appendaged-" is returned
 *  with its characters unchanged.
 *
 *  <p>The returned string is always interned, because code in this
 *  class compares candidate types against string literals with
 *  <code>==</code>.  Previously only the stripped result was
 *  interned, so a non-interned argument lacking the prefix came back
 *  non-interned.
 *
 *  @param candidateType a candidate type, possibly starting with
 *  "appendaged-"; must not be null
 *  @return the interned candidate type minus any leading
 *  "appendaged-" */
private static final String getCandidateTypeModuloAppendage(String candidateType) {
    final String appendagedPrefix = "appendaged-";
    String stripped = candidateType;
    if (candidateType.startsWith(appendagedPrefix)) {
        stripped = candidateType.substring(appendagedPrefix.length());
    }
    // Intern on every path so that == against a literal is reliable.
    return stripped.intern();
}
public String toString() {
return "<TGCPair wylie=" + wylie + " classification="
+ classification + "/>";
/** Returns an array of size 2 that lists all the possible indices
 *  of the root stack given the chosen candidate type.  A negative
 *  number appears if there are not that many possible positions
 *  for the root.  (You'll get two negative numbers if there is no
 *  root stack.)
 *
 *  <p>A leading "appendaged-" in the candidate type is ignored.
 *
 *  @param candidateType the candidate type of a tsheg bar; must not
 *  be null
 *  @return a newly allocated two-element array; each element is
 *  either a possible index of the root stack or -1 */
public static final int[] getIndicesOfRootForCandidateType(String candidateType) {
    // Appendaged vs. not appendaged? it affects nothing.
    candidateType = getCandidateTypeModuloAppendage(candidateType);

    int[] rv = new int[] { -1, -1 };
    // Compare with equals(), not ==: this method is public, so the
    // argument need not be an interned string.
    if ("prefix/root".equals(candidateType)
        || candidateType.startsWith("root")) {
        rv[0] = 0;
    } else if (candidateType.startsWith("prefix/root-")) {
        // The first stack may be a prefix or the root, so the root
        // is at either index 0 or index 1.
        rv[0] = 0;
        rv[1] = 1;
    } else if (candidateType.startsWith("prefix-root")) {
        rv[0] = 1;
    }
    return rv;
}
}

View file

@ -877,7 +877,8 @@ public static boolean isWylieChar(String s) {
/**
* Checks to see if the passed string is a consonant or unadorned
* consonant stack in Extended Wylie.
* consonant stack in Extended Wylie. The string shouldn't have any
* '+' or '.' characters in it if you want this to return true.
* @param s the string to be checked
* @return true if s is such in Extended Wylie transliteration, false
* if not */
@ -1151,8 +1152,8 @@ public static String getWylieForVowel(String s) {
/**
* Gets the DuffCode required for a vowel, if
* affixed to the given hashKey.
* @param hashKey the key for the character the
* vowel is to be affixed to
* @param hashKey the key for the character the vowel is to be affixed
* to; see {@link #getGlyph(String)} to learn about hash keys.
* @param vowel the vowel you want the DuffCode for
* @return the DuffCode for the vowel in the given
* context, or null if there is no such vowel in
@ -1170,7 +1171,8 @@ public static DuffCode getVowel(String hashKey, int vowel) {
/**
* Checks to see if a glyph exists for this hash key.
* @param hashKey the key to be checked
* @param hashKey the key to be checked; see {@link #getGlyph(String)}
* to learn about hash keys.
* @return true if there is a glyph corresponding to
* hashKey, false if not
*/
@ -1198,7 +1200,8 @@ public static DuffCode getGlyph(String hashKey) {
/**
* Gets the half height character for this hash key.
* @param hashKey the key you want a half height glyph for
* @param hashKey the key you want a half height glyph for; see {@link
* #getGlyph(String)} to learn about hash keys.
* @return the TibetanMachineWeb DuffCode of hashKey's
* reduced height glyph, or null if there is no such glyph
* @see DuffCode
@ -1627,8 +1630,8 @@ public static int getTMWFontNumber(String name) {
* Gets the hash key associated with this glyph.
* @param font a TibetanMachineWeb font number
* @param code an ASCII character code minus 32
* @return the hashKey corresponding to the character
* at font, code
* @return the hashKey corresponding to the character at font, code;
* see {@link #getGlyph(String)} to learn about hash keys.
*/
public static String getHashKeyForGlyph(int font, int code) {
code = code - 32;
@ -1640,7 +1643,8 @@ public static String getHashKeyForGlyph(int font, int code) {
* none (probably because this glyph has no THDL Extended Wylie
* transcription).
* @param dc a DuffCode denoting a TibetanMachineWeb glyph
* @return the hashKey corresponding to the character at dc */
* @return the hashKey corresponding to the character at dc; see {@link
* #getGlyph(String)} to learn about hash keys. */
public static String getHashKeyForGlyph(DuffCode dc) {
int font = dc.getFontNum();
int code = dc.getCharNum()-32;
@ -1654,7 +1658,8 @@ public static String getHashKeyForGlyph(DuffCode dc) {
* This method takes a hash key and converts it to its correct
* Wylie value, and therefore is useful in conversions from
* TibetanMachineWeb to Wylie.
* @param hashKey the hash key for a glyph
* @param hashKey the hash key for a glyph; see {@link
* #getGlyph(String)} to learn about hash keys.
* @return the Wylie value of that hash key
*/
public static String wylieForGlyph(String hashKey) {