diff --git a/source/org/thdl/tib/text/TGCList.java b/source/org/thdl/tib/text/TGCList.java new file mode 100644 index 0000000..7e057c3 --- /dev/null +++ b/source/org/thdl/tib/text/TGCList.java @@ -0,0 +1,29 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text; + +/** A list of {@link TGCPair TGCPairs}. + * @author David Chandler */ +public interface TGCList { + /** Returns the number of grapheme clusters in this list. */ + int size(); + + /** Returns the ith grapheme cluster in this list. */ + TGCPair get(int i); +} diff --git a/source/org/thdl/tib/text/TGCPair.java b/source/org/thdl/tib/text/TGCPair.java new file mode 100644 index 0000000..d681cbd --- /dev/null +++ b/source/org/thdl/tib/text/TGCPair.java @@ -0,0 +1,50 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. 
/**
 * An ordered pair consisting of a Tibetan grapheme cluster's (see
 * {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a
 * definition of the term) classification and its context-insensitive
 * THDL Extended Wylie representation.  NOTE WELL: this is not a real
 * grapheme cluster; I'm misusing the term (FIXME).  It's actually
 * whole or part of one.  It's part of one when this is a vowel or
 * U+0F7F alone.
 *
 * @author David Chandler
 */
public class TGCPair {
    // Possible values of the classification field:
    public static final int OTHER = 1;
    // a standalone achen would fall into this category:
    public static final int CONSONANTAL_WITHOUT_VOWEL = 2;
    public static final int CONSONANTAL_WITH_VOWEL = 3;
    public static final int LONE_VOWEL = 4;
    public static final int SANSKRIT_WITHOUT_VOWEL = 5;
    public static final int SANSKRIT_WITH_VOWEL = 6;

    /** The Extended Wylie text handed to the constructor. */
    public String wylie;

    /** The classification code handed to the constructor. */
    public int classification;

    /**
     * Creates a pair holding the given values; neither is validated.
     *
     * @param wylie the Extended Wylie for this cluster
     * @param classification the classification code, e.g. {@link #OTHER}
     */
    public TGCPair(String wylie, int classification) {
        this.wylie = wylie;
        this.classification = classification;
    }

    /**
     * Returns a debugging representation that shows both fields, so
     * that printing a TGCPair is actually informative.
     *
     * @return a string of the form
     *         {@code <TGCPair wylie=... classification=.../>}
     */
    public String toString() {
        return "<TGCPair wylie=" + wylie
            + " classification=" + classification + "/>";
    }
}
See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text; + +import java.util.ArrayList; + +/** A list of pseudo-grapheme clusters (vowels appear alone, FIXME: + * change the name) all in TibetanMachineWeb. + * @author David Chandler */ +class TMWGCList implements TGCList { + private ArrayList al; + + /** Constructs an empty TMWGCList. */ + TMWGCList() { + al = new ArrayList(); + } + + /** Constructs an empty TMWGCList ready to hold size TGCPairs. */ + TMWGCList(int size) { + al = new ArrayList(size); + } + + public int size() { return al.size(); } + + public TGCPair get(int i) { + return (TGCPair)al.get(i); + } + + void add(TGCPair tp) { + al.add(tp); + } +} diff --git a/source/org/thdl/tib/text/TibTextUtils.java b/source/org/thdl/tib/text/TibTextUtils.java index 1ad6491..f42695a 100644 --- a/source/org/thdl/tib/text/TibTextUtils.java +++ b/source/org/thdl/tib/text/TibTextUtils.java @@ -830,17 +830,21 @@ public class TibTextUtils implements THDLWylieConstants { consonant or consonant stack with optional adornment or a number (possibly super- or subscribed) or some other glyph alone. */ - private static ArrayList breakTshegBarIntoGraphemeClusters(java.util.List glyphList, - boolean noSuchWylie[]) { + private static TGCList breakTshegBarIntoGraphemeClusters(java.util.List glyphList, + boolean noSuchWylie[]) { // Definition: adornment means vowels and achungs and bindus. + // DLC FIXME: {H}, U+0F7F, is part of a grapheme cluster! + // David Chapman and I both need a comprehensive list of these + // guys. + int sz = glyphList.size(); ThdlDebug.verify(sz > 0); // A list of grapheme clusters (see UnicodeGraphemeCluster). 
// sz is an overestimate (speeds us up, wastes some memory). - ArrayList gcs = new ArrayList(sz); + TMWGCList gcs = new TMWGCList(sz); StringBuffer buildingUpGc = new StringBuffer(); @@ -919,14 +923,22 @@ public class TibTextUtils implements THDLWylieConstants { } - private static String getClassificationOfTshegBar(ArrayList gcs, - StringBuffer warnings) { + public static String getClassificationOfTshegBar(TGCList gcs, + // DLC the warnings are Wylie-specific + StringBuffer warnings) { String candidateType = null; // Now that we have grapheme clusters, see if they match any // of the "legal tsheg bars": int sz = gcs.size(); + if (sz == 1) { + TGCPair tp = gcs.get(0); + int cls = tp.classification; + if (TGCPair.SANSKRIT_WITHOUT_VOWEL == cls + || TGCPair.SANSKRIT_WITH_VOWEL == cls) + return "single-sanskrit-gc"; + } for (int i = 0; i < sz; i++) { - TGCPair tp = (TGCPair)gcs.get(i); + TGCPair tp = gcs.get(i); int cls = tp.classification; String wylie = tp.wylie; if (TGCPair.OTHER == cls) { @@ -964,7 +976,7 @@ public class TibTextUtils implements THDLWylieConstants { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between ba's, // ba'ala and ba'am: - TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? "" : nexttp.wylie; if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix/root"; @@ -982,7 +994,7 @@ public class TibTextUtils implements THDLWylieConstants { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between pa's, // pa'ala and pa'am: - TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? 
"" : nexttp.wylie; if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-root"; @@ -1003,7 +1015,7 @@ public class TibTextUtils implements THDLWylieConstants { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between bpa's, // bpa'ala and bpa'am: - TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? "" : nexttp.wylie; if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix-root"; @@ -1025,7 +1037,7 @@ public class TibTextUtils implements THDLWylieConstants { if (ACHUNG.equals(wylie)) { // peek ahead to distinguish between // gga'am and gaga'ala: - TGCPair nexttp = (i+1 < sz) ? (TGCPair)gcs.get(i+1) : null; + TGCPair nexttp = (i+1 < sz) ? gcs.get(i+1) : null; String nextwylie = (nexttp == null) ? "" : nexttp.wylie; if (isAppendageNonVowelWylie(nextwylie)) { candidateType = "maybe-appendaged-prefix/root-root/suffix"; @@ -1207,11 +1219,12 @@ public class TibTextUtils implements THDLWylieConstants { boolean noSuchWylie[], StringBuffer warnings, StringBuffer wylieBuffer) { - ArrayList gcs + TGCList gcs = breakTshegBarIntoGraphemeClusters(glyphList, noSuchWylie); String candidateType = getClassificationOfTshegBar(gcs, warnings); int sz = gcs.size(); - if (candidateType == "invalid") { + if (candidateType == "invalid" + || candidateType == "single-sanskrit-gc") { // Forget beauty and succintness -- just be sure to // generate Wylie that can be converted unambiguously into // Tibetan. Use a disambiguator or vowel after each @@ -1243,10 +1256,7 @@ public class TibTextUtils implements THDLWylieConstants { // Appendaged vs. not appendaged? it affects nothing at // this stage. 
- if (candidateType.startsWith("appendaged-")) { - candidateType - = candidateType.substring("appendaged-".length()).intern(); - } + candidateType = getCandidateTypeModuloAppendage(candidateType); if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) { /* Yes, this is ambiguous. How do we handle it? See @@ -1439,29 +1449,35 @@ public class TibTextUtils implements THDLWylieConstants { else return null; } -} -/** An ordered pair consisting of a Tibetan grapheme cluster's (see - {@link org.thdl.tib.text.tshegbar.UnicodeGraphemeCluster} for a - definition of the term}) classification and its - context-insensitive THDL Extended Wylie representation. */ -class TGCPair { - static final int OTHER = 1; - // a standalone achen would fall into this category: - static final int CONSONANTAL_WITHOUT_VOWEL = 2; - static final int CONSONANTAL_WITH_VOWEL = 3; - static final int LONE_VOWEL = 4; - static final int SANSKRIT_WITHOUT_VOWEL = 5; - static final int SANSKRIT_WITH_VOWEL = 6; - - String wylie; - int classification; - TGCPair(String wylie, int classification) { - this.wylie = wylie; - this.classification = classification; + /** Returns "root" instead of "appendaged-root", for example. */ + private static final String getCandidateTypeModuloAppendage(String candidateType) { + if (candidateType.startsWith("appendaged-")) { + candidateType + = candidateType.substring("appendaged-".length()).intern(); + } + return candidateType; } - public String toString() { - return ""; + + /** Returns an array of size 2 that lists all the possible indices + * of the root stack given the chosen candidate type. A negative + * number appears if there are not that many possible positions + * for the root. (You'll get two negative numbers if there is no + * root stack.) */ + public static final int[] getIndicesOfRootForCandidateType(String candidateType) { + // Appendaged vs. not appendaged? it affects nothing. 
+ candidateType = getCandidateTypeModuloAppendage(candidateType); + + int[] rv = new int[] { -1, -1 }; + if (candidateType == "prefix/root" + || candidateType.startsWith("root")) { + rv[0] = 0; + } else if (candidateType.startsWith("prefix/root-")) { + rv[0] = 0; + rv[1] = 1; + } else if (candidateType.startsWith("prefix-root")) { + rv[0] = 1; + } + return rv; } } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index e2fe2c4..1f87db1 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -877,7 +877,8 @@ public static boolean isWylieChar(String s) { /** * Checks to see if the passed string is a consonant or unadorned -* consonant stack in Extended Wylie. +* consonant stack in Extended Wylie. The string shouldn't have any +* '+' or '.' characters in it if you wnat this to return true. * @param s the string to be checked * @return true if s is such in Extended Wylie transliteration, false * if not */ @@ -1151,8 +1152,8 @@ public static String getWylieForVowel(String s) { /** * Gets the DuffCode required for a vowel, if * affixed to the given hashKey. -* @param hashKey the key for the character the -* vowel is to be affixed to +* @param hashKey the key for the character the vowel is to be affixed +* to; see {@link #getGlyph(String)} to learn about hash keys. * @param vowel the vowel you want the DuffCode for * @return the DuffCode for the vowel in the given * context, or null if there is no such vowel in @@ -1170,7 +1171,8 @@ public static DuffCode getVowel(String hashKey, int vowel) { /** * Checks to see if a glyph exists for this hash key. -* @param hashKey the key to be checked +* @param hashKey the key to be checked; see {@link #getGlyph(String)} +* to learn about hash keys. 
* @return true if there is a glyph corresponding to * hashKey, false if not */ @@ -1198,7 +1200,8 @@ public static DuffCode getGlyph(String hashKey) { /** * Gets the half height character for this hash key. -* @param hashKey the key you want a half height glyph for +* @param hashKey the key you want a half height glyph for; see {@link +* #getGlyph(String)} to learn about hash keys. * @return the TibetanMachineWeb DuffCode of hashKey's * reduced height glyph, or null if there is no such glyph * @see DuffCode @@ -1627,8 +1630,8 @@ public static int getTMWFontNumber(String name) { * Gets the hash key associated with this glyph. * @param font a TibetanMachineWeb font number * @param code an ASCII character code minus 32 -* @return the hashKey corresponding to the character -* at font, code +* @return the hashKey corresponding to the character at font, code; +* see {@link #getGlyph(String)} to learn about hash keys. */ public static String getHashKeyForGlyph(int font, int code) { code = code - 32; @@ -1640,7 +1643,8 @@ public static String getHashKeyForGlyph(int font, int code) { * none (probably because this glyph has no THDL Extended Wylie * transcription). * @param dc a DuffCode denoting a TibetanMachineWeb glyph -* @return the hashKey corresponding to the character at dc */ +* @return the hashKey corresponding to the character at dc; see {@link +* #getGlyph(String)} to learn about hash keys. */ public static String getHashKeyForGlyph(DuffCode dc) { int font = dc.getFontNum(); int code = dc.getCharNum()-32; @@ -1654,7 +1658,8 @@ public static String getHashKeyForGlyph(DuffCode dc) { * This method takes a hash key and converts it its correct * Wylie value, and therefore is useful in conversions from * TibetanMachineWeb to Wylie. -* @param hashKey the hash key for a glyph +* @param hashKey the hash key for a glyph; see {@link +* #getGlyph(String)} to learn about hash keys. * @return the Wylie value of that hash key */ public static String wylieForGlyph(String hashKey) {