Refactored this code so that Wylie->Tibetan and ACIP->Tibetan
conversions can make use of it. Hooray for reuse.
This commit is contained in:
parent
bcf1c12b6a
commit
39e0435b6b
5 changed files with 195 additions and 47 deletions
|
@ -830,17 +830,21 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
consonant or consonant stack with optional adornment or a
|
||||
number (possibly super- or subscribed) or some other glyph
|
||||
alone. */
|
||||
private static ArrayList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
|
||||
boolean noSuchWylie[]) {
|
||||
private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
|
||||
boolean noSuchWylie[]) {
|
||||
|
||||
// Definition: adornment means vowels and achungs and bindus.
|
||||
|
||||
// DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
|
||||
// David Chapman and I both need a comprehensive list of these
|
||||
// guys.
|
||||
|
||||
int sz = glyphList.size();
|
||||
ThdlDebug.verify(sz > 0);
|
||||
|
||||
// A list of grapheme clusters (see UnicodeGraphemeCluster).
|
||||
// sz is an overestimate (speeds us up, wastes some memory).
|
||||
ArrayList gcs = new ArrayList(sz);
|
||||
TMWGCList gcs = new TMWGCList(sz);
|
||||
|
||||
StringBuffer buildingUpGc = new StringBuffer();
|
||||
|
||||
|
@ -919,14 +923,22 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
}
|
||||
|
||||
|
||||
private static String getClassificationOfTshegBar(ArrayList gcs,
|
||||
StringBuffer warnings) {
|
||||
public static String getClassificationOfTshegBar(TGCList gcs,
|
||||
// DLC the warnings are Wylie-specific
|
||||
StringBuffer warnings) {
|
||||
String candidateType = null;
|
||||
// Now that we have grapheme clusters, see if they match any
|
||||
// of the "legal tsheg bars":
|
||||
int sz = gcs.size();
|
||||
if (sz == 1) {
|
||||
TGCPair tp = gcs.get(0);
|
||||
int cls = tp.classification;
|
||||
if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
|
||||
|| TGCPair.SANSKRIT_WITH_VOWEL == cls)
|
||||
return "single-sanskrit-gc";
|
||||
}
|
||||
for (int i = 0; i < sz; i++) {
|
||||
TGCPair tp = (TGCPair)gcs.get(i);
|
||||
TGCPair tp = gcs.get(i);
|
||||
int cls = tp.classification;
|
||||
String wylie = tp.wylie;
|
||||
if (TGCPair.OTHER == cls) {
|
||||
|
@ -964,7 +976,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between ba's,
|
||||
// ba'ala and ba'am:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-prefix/root";
|
||||
|
@ -982,7 +994,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between pa's,
|
||||
// pa'ala and pa'am:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-root";
|
||||
|
@ -1003,7 +1015,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between bpa's,
|
||||
// bpa'ala and bpa'am:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-prefix-root";
|
||||
|
@ -1025,7 +1037,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
if (ACHUNG.equals(wylie)) {
|
||||
// peek ahead to distinguish between
|
||||
// gga'am and gaga'ala:
|
||||
TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
|
||||
TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
|
||||
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
|
||||
if (isAppendageNonVowelWylie(nextwylie)) {
|
||||
candidateType = "maybe-appendaged-prefix/root-root/suffix";
|
||||
|
@ -1207,11 +1219,12 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
boolean noSuchWylie[],
|
||||
StringBuffer warnings,
|
||||
StringBuffer wylieBuffer) {
|
||||
ArrayList gcs
|
||||
TGCList gcs
|
||||
= breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
|
||||
String candidateType = getClassificationOfTshegBar(gcs, warnings);
|
||||
int sz = gcs.size();
|
||||
if (candidateType == "invalid") {
|
||||
if (candidateType == "invalid"
|
||||
|| candidateType == "single-sanskrit-gc") {
|
||||
// Forget beauty and succinctness -- just be sure to
|
||||
// generate Wylie that can be converted unambiguously into
|
||||
// Tibetan. Use a disambiguator or vowel after each
|
||||
|
@ -1243,10 +1256,7 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
|
||||
// Appendaged vs. not appendaged? it affects nothing at
|
||||
// this stage.
|
||||
if (candidateType.startsWith("appendaged-")) {
|
||||
candidateType
|
||||
= candidateType.substring("appendaged-".length()).intern();
|
||||
}
|
||||
candidateType = getCandidateTypeModuloAppendage(candidateType);
|
||||
|
||||
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
|
||||
/* Yes, this is ambiguous. How do we handle it? See
|
||||
|
@ -1439,29 +1449,35 @@ public class TibTextUtils implements THDLWylieConstants {
|
|||
else
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/** An ordered pair consisting of a Tibetan grapheme cluster's (see
|
||||
{@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
|
||||
definition of the term) classification and its
|
||||
context-insensitive THDL Extended Wylie representation. */
|
||||
class TGCPair {
|
||||
static final int OTHER = 1;
|
||||
// a standalone achen would fall into this category:
|
||||
static final int CONSONANTAL_WITHOUT_VOWEL = 2;
|
||||
static final int CONSONANTAL_WITH_VOWEL = 3;
|
||||
static final int LONE_VOWEL = 4;
|
||||
static final int SANSKRIT_WITHOUT_VOWEL = 5;
|
||||
static final int SANSKRIT_WITH_VOWEL = 6;
|
||||
|
||||
String wylie;
|
||||
int classification;
|
||||
TGCPair(String wylie, int classification) {
|
||||
this.wylie = wylie;
|
||||
this.classification = classification;
|
||||
/** Returns "root" instead of "appendaged-root", for example. */
|
||||
private static final String getCandidateTypeModuloAppendage(String candidateType) {
|
||||
if (candidateType.startsWith("appendaged-")) {
|
||||
candidateType
|
||||
= candidateType.substring("appendaged-".length()).intern();
|
||||
}
|
||||
return candidateType;
|
||||
}
|
||||
public String toString() {
|
||||
return "<TGCPair wylie=" + wylie + " classification="
|
||||
+ classification + "/>";
|
||||
|
||||
/** Returns an array of size 2 that lists all the possible indices
|
||||
* of the root stack given the chosen candidate type. A negative
|
||||
* number appears if there are not that many possible positions
|
||||
* for the root. (You'll get two negative numbers if there is no
|
||||
* root stack.) */
|
||||
public static final int[] getIndicesOfRootForCandidateType(String candidateType) {
|
||||
// Appendaged vs. not appendaged? it affects nothing.
|
||||
candidateType = getCandidateTypeModuloAppendage(candidateType);
|
||||
|
||||
int[] rv = new int[] { -1, -1 };
|
||||
if (candidateType == "prefix/root"
|
||||
|| candidateType.startsWith("root")) {
|
||||
rv[0] = 0;
|
||||
} else if (candidateType.startsWith("prefix/root-")) {
|
||||
rv[0] = 0;
|
||||
rv[1] = 1;
|
||||
} else if (candidateType.startsWith("prefix-root")) {
|
||||
rv[0] = 1;
|
||||
}
|
||||
return rv;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue