Refactored this code so that Wylie->Tibetan and ACIP->Tibetan
conversions can make use of it. Hooray for reuse.
This commit is contained in:
parent
bcf1c12b6a
commit
39e0435b6b
5 changed files with 195 additions and 47 deletions
29
source/org/thdl/tib/text/TGCList.java
Normal file
29
source/org/thdl/tib/text/TGCList.java
Normal file
|
@ -0,0 +1,29 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text;
|
||||
|
||||
/** A list of {@link TGCPair TGCPairs}.
|
||||
* @author David Chandler */
|
||||
public interface TGCList {
|
||||
/** Returns the number of grapheme clusters in this list. */
|
||||
int size();
|
||||
|
||||
/** Returns the <i>i</i>th grapheme cluster in this list. */
|
||||
TGCPair get(int i);
|
||||
}
|
50
source/org/thdl/tib/text/TGCPair.java
Normal file
50
source/org/thdl/tib/text/TGCPair.java
Normal file
|
@ -0,0 +1,50 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text;
|
||||
|
||||
|
||||
/** Pairs the classification of a Tibetan grapheme cluster (the term
    is defined in {@link
    org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster}) with that
    cluster's context-insensitive THDL Extended Wylie
    representation.  NOTE WELL: "grapheme cluster" is being misused
    here (FIXME).  What this holds is actually all or part of one; it
    is only part of one when this is a vowel or U+0F7F alone.

    @author David Chandler */
public class TGCPair {
    public static final int OTHER = 1;
    // a standalone achen would fall into this category:
    public static final int CONSONANTAL_WITHOUT_VOWEL = 2;
    public static final int CONSONANTAL_WITH_VOWEL = 3;
    public static final int LONE_VOWEL = 4;
    public static final int SANSKRIT_WITHOUT_VOWEL = 5;
    public static final int SANSKRIT_WITH_VOWEL = 6;

    /** The Extended Wylie text of this pair. */
    public String wylie;

    /** The classification of this pair; expected to be one of the
        int constants declared above, but not validated here. */
    public int classification;

    /** Stores the given Wylie text and classification as-is. */
    public TGCPair(String wylie, int classification) {
        this.classification = classification;
        this.wylie = wylie;
    }

    /** Renders both fields in an XML-like form for debugging. */
    public String toString() {
        StringBuffer rendered = new StringBuffer("<TGCPair wylie=");
        rendered.append(this.wylie);
        rendered.append(" classification=");
        rendered.append(this.classification);
        rendered.append("/>");
        return rendered.toString();
    }
}
|
48
source/org/thdl/tib/text/TMWGCList.java
Normal file
48
source/org/thdl/tib/text/TMWGCList.java
Normal file
|
@ -0,0 +1,48 @@
|
|||
/*
|
||||
The contents of this file are subject to the THDL Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the THDL web site
|
||||
(http://www.thdl.org/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is the Tibetan and Himalayan Digital
|
||||
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
|
||||
All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.text;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
/** A list of pseudo-grapheme clusters (vowels appear alone, FIXME:
|
||||
* change the name) all in TibetanMachineWeb.
|
||||
* @author David Chandler */
|
||||
class TMWGCList implements TGCList {
|
||||
private ArrayList al;
|
||||
|
||||
/** Constructs an empty TMWGCList. */
|
||||
TMWGCList() {
|
||||
al = new ArrayList();
|
||||
}
|
||||
|
||||
/** Constructs an empty TMWGCList ready to hold size TGCPairs. */
|
||||
TMWGCList(int size) {
|
||||
al = new ArrayList(size);
|
||||
}
|
||||
|
||||
public int size() { return al.size(); }
|
||||
|
||||
public TGCPair get(int i) {
|
||||
return (TGCPair)al.get(i);
|
||||
}
|
||||
|
||||
void add(TGCPair tp) {
|
||||
al.add(tp);
|
||||
}
|
||||
}
|
|
@ -830,17 +830,21 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
consonant or consonant stack with optional adornment or a
|
||||
number (possibly super- or subscribed) or some other glyph
|
||||
alone. */
|
||||
private static ArrayList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
|
||||
private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
|
||||
boolean noSuchWylie[]) {
|
||||
|
||||
// Definition: adornment means vowels and achungs and bindus.
|
||||
|
||||
// DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
|
||||
// David Chapman and I both need a comprehensive list of these
|
||||
// guys.
|
||||
|
||||
int sz = glyphList.size();
|
||||
ThdlDebug.verify(sz > 0);
|
||||
|
||||
// A list of grapheme clusters (see UnicodeGraphemeCluster).
|
||||
// sz is an overestimate (speeds us up, wastes some memory).
|
||||
ArrayList gcs = new ArrayList(sz);
|
||||
TMWGCList gcs = new TMWGCList(sz);
|
||||
|
||||
StringBuffer buildingUpGc = new StringBuffer();
|
||||
|
||||
|
@ -919,14 +923,22 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
}
|
||||
|
||||
|
||||
private static String getClassificationOfTshegBar(ArrayList gcs,
|
||||
public static String getClassificationOfTshegBar(TGCList gcs,
|
||||
// DLC the warnings are Wylie-specific
|
||||
StringBuffer warnings) {
|
||||
String candidateType = null;
|
||||
// Now that we have grapheme clusters, see if they match any
|
||||
// of the "legal tsheg bars":
|
||||
int sz = gcs.size();
|
||||
if (sz == 1) {
|
||||
TGCPair tp = gcs.get(0);
|
||||
int cls = tp.classification;
|
||||
if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
|
||||
|| TGCPair.SANSKRIT_WITH_VOWEL == cls)
|
||||
return "single-sanskrit-gc";
|
||||
}
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TGCPair tp = (TGCPair)gcs.get(i);
|
||||
TGCPair tp = gcs.get(i);
|
||||
int cls = tp.classification;
|
||||
String wylie = tp.wylie;
|
||||
if (TGCPair.OTHER == cls) {
|
||||
|
@ -964,7 +976,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between ba's,
|
||||
// ba'ala and ba'am:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-prefix/root";
|
||||
|
@ -982,7 +994,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between pa's,
|
||||
// pa'ala and pa'am:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-root";
|
||||
|
@ -1003,7 +1015,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between bpa's,
|
||||
// bpa'ala and bpa'am:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-prefix-root";
|
||||
|
@ -1025,7 +1037,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between
|
||||
// gga'am and gaga'ala:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-prefix/root-root/suffix";
|
||||
|
@ -1207,11 +1219,12 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
boolean noSuchWylie[],
|
||||
StringBuffer warnings,
|
||||
StringBuffer wylieBuffer) {
|
||||
ArrayList gcs
|
||||
TGCList gcs
|
||||
= breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
|
||||
String candidateType = getClassificationOfTshegBar(gcs, warnings);
|
||||
int sz = gcs.size();
|
||||
if (candidateType == "invalid") {
|
||||
if (candidateType == "invalid"
|
||||
|| candidateType == "single-sanskrit-gc") {
|
||||
// Forget beauty and succinctness -- just be sure to
|
||||
// generate Wylie that can be converted unambiguously into
|
||||
// Tibetan. Use a disambiguator or vowel after each
|
||||
|
@ -1243,10 +1256,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
|
||||
// Appendaged vs. not appendaged? it affects nothing at
|
||||
// this stage.
|
||||
if (candidateType.startsWith("appendaged-")) {
|
||||
candidateType
|
||||
= candidateType.substring("appendaged-".length()).intern();
|
||||
}
|
||||
candidateType = getCandidateTypeModuloAppendage(candidateType);
|
||||
|
||||
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
|
||||
/* Yes, this is ambiguous. How do we handle it? See
|
||||
|
@ -1439,29 +1449,35 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
else
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Returns "root" instead of "appendaged-root", for example. */
|
||||
private static final String getCandidateTypeModuloAppendage(String candidateType) {
|
||||
if (candidateType.startsWith("appendaged-")) {
|
||||
candidateType
|
||||
= candidateType.substring("appendaged-".length()).intern();
|
||||
}
|
||||
return candidateType;
|
||||
}
|
||||
|
||||
/** An ordered pair consisting of a Tibetan grapheme cluster's (see
|
||||
{@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
|
||||
definition of the term) classification and its
|
||||
context-insensitive THDL Extended Wylie representation. */
|
||||
class TGCPair {
|
||||
static final int OTHER = 1;
|
||||
// a standalone achen would fall into this category:
|
||||
static final int CONSONANTAL_WITHOUT_VOWEL = 2;
|
||||
static final int CONSONANTAL_WITH_VOWEL = 3;
|
||||
static final int LONE_VOWEL = 4;
|
||||
static final int SANSKRIT_WITHOUT_VOWEL = 5;
|
||||
static final int SANSKRIT_WITH_VOWEL = 6;
|
||||
/** Returns an array of size 2 that lists all the possible indices
|
||||
* of the root stack given the chosen candidate type. A negative
|
||||
* number appears if there are not that many possible positions
|
||||
* for the root. (You'll get two negative numbers if there is no
|
||||
* root stack.) */
|
||||
public static final int[] getIndicesOfRootForCandidateType(String candidateType) {
|
||||
// Appendaged vs. not appendaged? it affects nothing.
|
||||
candidateType = getCandidateTypeModuloAppendage(candidateType);
|
||||
|
||||
String wylie;
|
||||
int classification;
|
||||
TGCPair(String wylie, int classification) {
|
||||
this.wylie = wylie;
|
||||
this.classification = classification;
|
||||
int[] rv = new int[] { -1, -1 };
|
||||
if (candidateType == "prefix/root"
|
||||
|| candidateType.startsWith("root")) {
|
||||
rv[0] = 0;
|
||||
} else if (candidateType.startsWith("prefix/root-")) {
|
||||
rv[0] = 0;
|
||||
rv[1] = 1;
|
||||
} else if (candidateType.startsWith("prefix-root")) {
|
||||
rv[0] = 1;
|
||||
}
|
||||
public String toString() {
|
||||
return "<TGCPair wylie=" + wylie + " classification="
|
||||
+ classification + "/>";
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -877,7 +877,8 @@ public static boolean isWylieChar(String s) {
|
|||
|
||||
/**
|
||||
* Checks to see if the passed string is a consonant or unadorned
|
||||
* consonant stack in Extended Wylie.
|
||||
* consonant stack in Extended Wylie. The string shouldn't have any
|
||||
* '+' or '.' characters in it if you want this to return true.
|
||||
* @param s the string to be checked
|
||||
* @return true if s is such in Extended Wylie transliteration, false
|
||||
* if not */
|
||||
|
@ -1151,8 +1152,8 @@ public static String getWylieForVowel(String s) {
|
|||
/**
|
||||
* Gets the DuffCode required for a vowel, if
|
||||
* affixed to the given hashKey.
|
||||
* @param hashKey the key for the character the
|
||||
* vowel is to be affixed to
|
||||
* @param hashKey the key for the character the vowel is to be affixed
|
||||
* to; see {@link #getGlyph(String)} to learn about hash keys.
|
||||
* @param vowel the vowel you want the DuffCode for
|
||||
* @return the DuffCode for the vowel in the given
|
||||
* context, or null if there is no such vowel in
|
||||
|
@ -1170,7 +1171,8 @@ public static DuffCode getVowel(String hashKey, int vowel) {
|
|||
|
||||
/**
|
||||
* Checks to see if a glyph exists for this hash key.
|
||||
* @param hashKey the key to be checked
|
||||
* @param hashKey the key to be checked; see {@link #getGlyph(String)}
|
||||
* to learn about hash keys.
|
||||
* @return true if there is a glyph corresponding to
|
||||
* hashKey, false if not
|
||||
*/
|
||||
|
@ -1198,7 +1200,8 @@ public static DuffCode getGlyph(String hashKey) {
|
|||
|
||||
/**
|
||||
* Gets the half height character for this hash key.
|
||||
* @param hashKey the key you want a half height glyph for
|
||||
* @param hashKey the key you want a half height glyph for; see {@link
|
||||
* #getGlyph(String)} to learn about hash keys.
|
||||
* @return the TibetanMachineWeb DuffCode of hashKey's
|
||||
* reduced height glyph, or null if there is no such glyph
|
||||
* @see DuffCode
|
||||
|
@ -1627,8 +1630,8 @@ public static int getTMWFontNumber(String name) {
|
|||
* Gets the hash key associated with this glyph.
|
||||
* @param font a TibetanMachineWeb font number
|
||||
* @param code an ASCII character code minus 32
|
||||
* @return the hashKey corresponding to the character
|
||||
* at font, code
|
||||
* @return the hashKey corresponding to the character at font, code;
|
||||
* see {@link #getGlyph(String)} to learn about hash keys.
|
||||
*/
|
||||
public static String getHashKeyForGlyph(int font, int code) {
|
||||
code = code - 32;
|
||||
|
@ -1640,7 +1643,8 @@ public static String getHashKeyForGlyph(int font, int code) {
|
|||
* none (probably because this glyph has no THDL Extended Wylie
|
||||
* transcription).
|
||||
* @param dc a DuffCode denoting a TibetanMachineWeb glyph
|
||||
* @return the hashKey corresponding to the character at dc */
|
||||
* @return the hashKey corresponding to the character at dc; see {@link
|
||||
* #getGlyph(String)} to learn about hash keys. */
|
||||
public static String getHashKeyForGlyph(DuffCode dc) {
|
||||
int font = dc.getFontNum();
|
||||
int code = dc.getCharNum()-32;
|
||||
|
@ -1654,7 +1658,8 @@ public static String getHashKeyForGlyph(DuffCode dc) {
|
|||
* This method takes a hash key and converts it to its correct
|
||||
* Wylie value, and therefore is useful in conversions from
|
||||
* TibetanMachineWeb to Wylie.
|
||||
* @param hashKey the hash key for a glyph
|
||||
* @param hashKey the hash key for a glyph; see {@link
|
||||
* #getGlyph(String)} to learn about hash keys.
|
||||
* @return the Wylie value of that hash key
|
||||
*/
|
||||
public static String wylieForGlyph(String hashKey) {
|
||||
|
|
Loading…
Reference in a new issue