diff --git a/source/org/thdl/tib/text/TGCList.java b/source/org/thdl/tib/text/TGCList.java
new file mode 100644
index 0000000..7e057c3
--- /dev/null
+++ b/source/org/thdl/tib/text/TGCList.java
@@ -0,0 +1,29 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text;
+
+/** A read-only, index-addressable list of {@link TGCPair TGCPairs}.
+ * @author David Chandler */
+public interface TGCList {
+ /** Returns the number of grapheme clusters in this list. */
+ int size();
+
+ /** Returns the grapheme cluster at index i, where 0 <= i < size(). */
+ TGCPair get(int i);
+}
diff --git a/source/org/thdl/tib/text/TGCPair.java b/source/org/thdl/tib/text/TGCPair.java
new file mode 100644
index 0000000..d681cbd
--- /dev/null
+++ b/source/org/thdl/tib/text/TGCPair.java
@@ -0,0 +1,50 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text;
+
+
+/** An ordered pair consisting of a Tibetan grapheme cluster's (see
+    {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
+    definition of the term) classification and its
+    context-insensitive THDL Extended Wylie representation.  NOTE
+    WELL: this is not a real grapheme cluster; I'm misusing the term
+    (FIXME).  It's actually whole or part of one.  It's part of one
+    when this is a vowel or U+0F7F alone.
+
+    @author David Chandler */
+public class TGCPair {
+    public static final int OTHER = 1;
+    // a standalone achen would fall into this category:
+    public static final int CONSONANTAL_WITHOUT_VOWEL = 2;
+    public static final int CONSONANTAL_WITH_VOWEL = 3;
+    public static final int LONE_VOWEL = 4;
+    public static final int SANSKRIT_WITHOUT_VOWEL = 5;
+    public static final int SANSKRIT_WITH_VOWEL = 6;
+
+    public String wylie; // context-insensitive Wylie text of this pair
+    public int classification; // compared against the int constants above
+    public TGCPair(String wylie, int classification) {
+        this.wylie = wylie;
+        this.classification = classification;
+    }
+    public String toString() { // debugging aid: shows both fields
+        return "<TGCPair wylie=" + wylie + " classification="
+            + classification + "/>";
+    }
+}
diff --git a/source/org/thdl/tib/text/TMWGCList.java b/source/org/thdl/tib/text/TMWGCList.java
new file mode 100644
index 0000000..553a0d1
--- /dev/null
+++ b/source/org/thdl/tib/text/TMWGCList.java
@@ -0,0 +1,48 @@
+/*
+The contents of this file are subject to the THDL Open Community License
+Version 1.0 (the "License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License on the THDL web site
+(http://www.thdl.org/).
+
+Software distributed under the License is distributed on an "AS IS" basis,
+WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+License for the specific terms governing rights and limitations under the
+License.
+
+The Initial Developer of this software is the Tibetan and Himalayan Digital
+Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
+All Rights Reserved.
+
+Contributor(s): ______________________________________.
+*/
+
+package org.thdl.tib.text;
+
+import java.util.ArrayList;
+
+/** An ArrayList-backed {@link TGCList} of pseudo-grapheme clusters
+ *  (vowels appear alone; FIXME: change the name), all in TibetanMachineWeb.
+ *  @author David Chandler */
+class TMWGCList implements TGCList {
+    private ArrayList pairs;
+
+    /** Creates a list holding no pairs, with the default capacity. */
+    TMWGCList() {
+        this.pairs = new ArrayList();
+    }
+
+    /** Creates a list holding no pairs, presized for size TGCPairs. */
+    TMWGCList(int size) {
+        this.pairs = new ArrayList(size);
+    }
+
+    public int size() {
+        return this.pairs.size();
+    }
+
+    public TGCPair get(int i) { return (TGCPair) this.pairs.get(i); }
+
+    void add(TGCPair tp) {
+        this.pairs.add(tp);
+    }
+}
diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java
index 1ad6491..f42695a 100644
--- a/source/org/thdl/tib/text/TibTextUtils.java
+++ b/source/org/thdl/tib/text/TibTextUtils.java
@@ -830,17 +830,21 @@ public class TibTextUtils implements THDLWylieConstants {
consonant or consonant stack with optional adornment or a
number (possibly super- or subscribed) or some other glyph
alone. */
- private static ArrayList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
- boolean noSuchWylie[]) {
+ private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList,
+ boolean noSuchWylie[]) {
// Definition: adornment means vowels and achungs and bindus.
+ // DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster!
+ // David Chapman and I both need a comprehensive list of these
+ // guys.
+
int sz = glyphList.size();
ThdlDebug.verify(sz > 0);
// A list of grapheme clusters (see UnicodeGraphemeCluster).
// sz is an overestimate (speeds us up, wastes some memory).
- ArrayList gcs = new ArrayList(sz);
+ TMWGCList gcs = new TMWGCList(sz);
StringBuffer buildingUpGc = new StringBuffer();
@@ -919,14 +923,22 @@ public class TibTextUtils implements THDLWylieConstants {
}
- private static String getClassificationOfTshegBar(ArrayList gcs,
- StringBuffer warnings) {
+ public static String getClassificationOfTshegBar(TGCList gcs,
+ // DLC the warnings are Wylie-specific
+ StringBuffer warnings) {
String candidateType = null;
// Now that we have grapheme clusters, see if they match any
// of the "legal tsheg bars":
int sz = gcs.size();
+ if (sz == 1) {
+ TGCPair tp = gcs.get(0);
+ int cls = tp.classification;
+ if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls
+ || TGCPair.SANSKRIT_WITH_VOWEL == cls)
+ return "single-sanskrit-gc";
+ }
for (int i = 0; i < sz; i++) {
- TGCPair tp = (TGCPair)gcs.get(i);
+ TGCPair tp = gcs.get(i);
int cls = tp.classification;
String wylie = tp.wylie;
if (TGCPair.OTHER == cls) {
@@ -964,7 +976,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between ba's,
// ba'ala and ba'am:
- TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
+ TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root";
@@ -982,7 +994,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between pa's,
// pa'ala and pa'am:
- TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
+ TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-root";
@@ -1003,7 +1015,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between bpa's,
// bpa'ala and bpa'am:
- TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
+ TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix-root";
@@ -1025,7 +1037,7 @@ public class TibTextUtils implements THDLWylieConstants {
if (ACHUNG.equals(wylie)) {
// peek ahead to distinguish between
// gga'am and gaga'ala:
- TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null;
+ TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null;
String nextwylie = (nexttp == null) ? "" : nexttp.wylie;
if (isAppendageNonVowelWylie(nextwylie)) {
candidateType = "maybe-appendaged-prefix/root-root/suffix";
@@ -1207,11 +1219,12 @@ public class TibTextUtils implements THDLWylieConstants {
boolean noSuchWylie[],
StringBuffer warnings,
StringBuffer wylieBuffer) {
- ArrayList gcs
+ TGCList gcs
= breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie);
String candidateType = getClassificationOfTshegBar(gcs, warnings);
int sz = gcs.size();
- if (candidateType == "invalid") {
+ if (candidateType == "invalid"
+ || candidateType == "single-sanskrit-gc") {
// Forget beauty and succintness -- just be sure to
// generate Wylie that can be converted unambiguously into
// Tibetan. Use a disambiguator or vowel after each
@@ -1243,10 +1256,7 @@ public class TibTextUtils implements THDLWylieConstants {
// Appendaged vs. not appendaged? it affects nothing at
// this stage.
- if (candidateType.startsWith("appendaged-")) {
- candidateType
- = candidateType.substring("appendaged-".length()).intern();
- }
+ candidateType = getCandidateTypeModuloAppendage(candidateType);
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
/* Yes, this is ambiguous. How do we handle it? See
@@ -1439,29 +1449,35 @@ public class TibTextUtils implements THDLWylieConstants {
else
return null;
}
-}
-/** An ordered pair consisting of a Tibetan grapheme cluster's (see
- {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
- definition of the term}) classification and its
- context-insensitive THDL Extended Wylie representation. */
-class TGCPair {
- static final int OTHER = 1;
- // a standalone achen would fall into this category:
- static final int CONSONANTAL_WITHOUT_VOWEL = 2;
- static final int CONSONANTAL_WITH_VOWEL = 3;
- static final int LONE_VOWEL = 4;
- static final int SANSKRIT_WITHOUT_VOWEL = 5;
- static final int SANSKRIT_WITH_VOWEL = 6;
-
- String wylie;
- int classification;
- TGCPair(String wylie, int classification) {
- this.wylie = wylie;
- this.classification = classification;
+    /** Strips a leading "appendaged-", e.g. "appendaged-root" yields "root". */
+    private static final String getCandidateTypeModuloAppendage(String candidateType) {
+        final String marker = "appendaged-";
+        if (!candidateType.startsWith(marker)) {
+            return candidateType; // returned as-is, not interned
+        }
+        return candidateType.substring(marker.length()).intern();
     }
- public String toString() {
- return "";
+
+    /** Returns an array of size 2 that lists all the possible indices
+     *  of the root stack given the chosen candidate type.  A negative
+     *  number appears if there are not that many possible positions
+     *  for the root.  (You'll get two negative numbers if there is no
+     *  root stack.)  Compared by value; need not be interned. */
+    public static final int[] getIndicesOfRootForCandidateType(String candidateType) {
+        // Appendaged vs. not appendaged? it affects nothing.
+        candidateType = getCandidateTypeModuloAppendage(candidateType);
+
+        int[] rv = new int[] { -1, -1 };
+        if ("prefix/root".equals(candidateType)
+            || candidateType.startsWith("root")) {
+            rv[0] = 0;
+        } else if (candidateType.startsWith("prefix/root-")) {
+            rv[0] = 0;
+            rv[1] = 1;
+        } else if (candidateType.startsWith("prefix-root")) {
+            rv[0] = 1;
+        }
+        return rv;
     }
}
diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java
index e2fe2c4..1f87db1 100644
--- a/source/org/thdl/tib/text/TibetanMachineWeb.java
+++ b/source/org/thdl/tib/text/TibetanMachineWeb.java
@@ -877,7 +877,8 @@ public static boolean isWylieChar(String s) {
/**
* Checks to see if the passed string is a consonant or unadorned
-* consonant stack in Extended Wylie.
+* consonant stack in Extended Wylie. The string shouldn't have any
+* '+' or '.' characters in it if you want this to return true.
* @param s the string to be checked
* @return true if s is such in Extended Wylie transliteration, false
* if not */
@@ -1151,8 +1152,8 @@ public static String getWylieForVowel(String s) {
/**
* Gets the DuffCode required for a vowel, if
* affixed to the given hashKey.
-* @param hashKey the key for the character the
-* vowel is to be affixed to
+* @param hashKey the key for the character the vowel is to be affixed
+* to; see {@link #getGlyph(String)} to learn about hash keys.
* @param vowel the vowel you want the DuffCode for
* @return the DuffCode for the vowel in the given
* context, or null if there is no such vowel in
@@ -1170,7 +1171,8 @@ public static DuffCode getVowel(String hashKey, int vowel) {
/**
* Checks to see if a glyph exists for this hash key.
-* @param hashKey the key to be checked
+* @param hashKey the key to be checked; see {@link #getGlyph(String)}
+* to learn about hash keys.
* @return true if there is a glyph corresponding to
* hashKey, false if not
*/
@@ -1198,7 +1200,8 @@ public static DuffCode getGlyph(String hashKey) {
/**
* Gets the half height character for this hash key.
-* @param hashKey the key you want a half height glyph for
+* @param hashKey the key you want a half height glyph for; see {@link
+* #getGlyph(String)} to learn about hash keys.
* @return the TibetanMachineWeb DuffCode of hashKey's
* reduced height glyph, or null if there is no such glyph
* @see DuffCode
@@ -1627,8 +1630,8 @@ public static int getTMWFontNumber(String name) {
* Gets the hash key associated with this glyph.
* @param font a TibetanMachineWeb font number
* @param code an ASCII character code minus 32
-* @return the hashKey corresponding to the character
-* at font, code
+* @return the hashKey corresponding to the character at font, code;
+* see {@link #getGlyph(String)} to learn about hash keys.
*/
public static String getHashKeyForGlyph(int font, int code) {
code = code - 32;
@@ -1640,7 +1643,8 @@ public static String getHashKeyForGlyph(int font, int code) {
* none (probably because this glyph has no THDL Extended Wylie
* transcription).
* @param dc a DuffCode denoting a TibetanMachineWeb glyph
-* @return the hashKey corresponding to the character at dc */
+* @return the hashKey corresponding to the character at dc; see {@link
+* #getGlyph(String)} to learn about hash keys. */
public static String getHashKeyForGlyph(DuffCode dc) {
int font = dc.getFontNum();
int code = dc.getCharNum()-32;
@@ -1654,7 +1658,8 @@ public static String getHashKeyForGlyph(DuffCode dc) {
* This method takes a hash key and converts it its correct
* Wylie value, and therefore is useful in conversions from
* TibetanMachineWeb to Wylie.
-* @param hashKey the hash key for a glyph
+* @param hashKey the hash key for a glyph; see {@link
+* #getGlyph(String)} to learn about hash keys.
* @return the Wylie value of that hash key
*/
public static String wylieForGlyph(String hashKey) {