Two things:

One, TMW->EWTS gives dbas and dngas instead of dabs and dangs
because Chris Fynn's e-mail from today has dbas and dngas.

Second, Down with ACIPRules.  Long live ACIPTraits.  EWTS->Tibetan
conversion is closer still.
This commit is contained in:
dchandler 2005-02-22 04:36:54 +00:00
parent 82c6047cc2
commit c16f633ecf
18 changed files with 950 additions and 818 deletions

View file

@ -969,6 +969,22 @@ public class DuffPaneTest extends DuffPaneTestBase {
ensureKeysGiveCorrectWylie("'gas"); ensureKeysGiveCorrectWylie("'gas");
/* Chris Fynn's e-mail on Feb 21 2005 leads to these test
cases: */
{
ensureKeysGiveCorrectWylie("dgas");
ensureKeysGiveCorrectWylie("'gas");
ensureKeysGiveCorrectWylie("dngas");
ensureKeysGiveCorrectWylie("gnad");
ensureKeysGiveCorrectWylie("mnad");
ensureKeysGiveCorrectWylie("bags");
ensureKeysGiveCorrectWylie("dbas");
ensureKeysGiveCorrectWylie("'bas");
ensureKeysGiveCorrectWylie("mags");
ensureKeysGiveCorrectWylie("mangs");
ensureKeysGiveCorrectWylie("dmas");
}
ensureKeysGiveCorrectWylie("gangs"); ensureKeysGiveCorrectWylie("gangs");
ensureKeysGiveCorrectWylie("gnags"); ensureKeysGiveCorrectWylie("gnags");

View file

@ -27,7 +27,7 @@ import org.thdl.util.*;
import org.thdl.tib.text.*; import org.thdl.tib.text.*;
import org.thdl.tib.text.ttt.TConverter; import org.thdl.tib.text.ttt.TConverter;
import org.thdl.tib.text.ttt.ACIPTshegBarScanner; import org.thdl.tib.text.ttt.ACIPTraits;
import java.util.ArrayList; import java.util.ArrayList;
/** TibetanConverter is a command-line utility for converting to and /** TibetanConverter is a command-line utility for converting to and
@ -297,7 +297,7 @@ public class TibetanConverter implements FontConverterConstants {
if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) { if (ACIP_TO_UNI_TEXT == ct || ACIP_TO_TMW == ct) {
try { try {
ArrayList al ArrayList al
= ACIPTshegBarScanner.instance().scanStream(in, null, = ACIPTraits.instance().scanner().scanStream(in, null,
ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have", ThdlOptions.getIntegerOption("thdl.most.errors.a.tibetan.acip.document.can.have",
1000 - 1), 1000 - 1),
shortMessages, shortMessages,
@ -307,7 +307,8 @@ public class TibetanConverter implements FontConverterConstants {
boolean embeddedWarnings = (warningLevel != "None"); boolean embeddedWarnings = (warningLevel != "None");
boolean hasWarnings[] = new boolean[] { false }; boolean hasWarnings[] = new boolean[] { false };
if (ACIP_TO_UNI_TEXT == ct) { if (ACIP_TO_UNI_TEXT == ct) {
if (!TConverter.convertToUnicodeText(al, out, null, if (!TConverter.convertToUnicodeText(ACIPTraits.instance(),
al, out, null,
null, hasWarnings, null, hasWarnings,
embeddedWarnings, embeddedWarnings,
warningLevel, warningLevel,
@ -315,7 +316,8 @@ public class TibetanConverter implements FontConverterConstants {
return 46; return 46;
} else { } else {
if (ct != ACIP_TO_TMW) throw new Error("badness"); if (ct != ACIP_TO_TMW) throw new Error("badness");
if (!TConverter.convertToTMW(al, out, null, null, if (!TConverter.convertToTMW(ACIPTraits.instance(),
al, out, null, null,
hasWarnings, hasWarnings,
embeddedWarnings, embeddedWarnings,
warningLevel, shortMessages, warningLevel, shortMessages,

View file

@ -137,7 +137,7 @@ public class TGCPair implements THDLWylieConstants {
consonantACIP = "V"; consonantACIP = "V";
else else
consonantACIP consonantACIP
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(consonantWylie); = org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(consonantWylie);
if (null == consonantACIP) { if (null == consonantACIP) {
if (null != consonantWylie && consonantWylie.startsWith("R+")) if (null != consonantWylie && consonantWylie.startsWith("R+"))
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)"); return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + consonantWylie, " because the ACIP R+... could imply the short superscribed form, but this most likely intends the full form (i.e., Unicode character U+0F6A)");
@ -160,7 +160,7 @@ public class TGCPair implements THDLWylieConstants {
} }
if (vowelWylie != null) { if (vowelWylie != null) {
String vowelACIP String vowelACIP
= org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(vowelWylie); = org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(vowelWylie);
if (null == vowelACIP) { if (null == vowelACIP) {
return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, ""); return TibetanMachineWeb.getTMWToACIPErrorString("glyph with THDL Extended Wylie " + vowelWylie, "");
} else { } else {

View file

@ -25,7 +25,7 @@ import javax.swing.text.rtf.RTFEditorKit;
import java.io.*; import java.io.*;
import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlDebug;
import org.thdl.tib.text.ttt.ACIPTshegBarScanner; import org.thdl.tib.text.ttt.ACIPTraits;
import org.thdl.tib.text.ttt.TConverter; import org.thdl.tib.text.ttt.TConverter;
import org.thdl.tib.text.tshegbar.LegalTshegBar; import org.thdl.tib.text.tshegbar.LegalTshegBar;
import org.thdl.tib.text.tshegbar.UnicodeConstants; import org.thdl.tib.text.tshegbar.UnicodeConstants;
@ -333,7 +333,7 @@ public class TibTextUtils implements THDLWylieConstants {
{ {
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
String warningLevel = withWarnings ? "All" : "None"; String warningLevel = withWarnings ? "All" : "None";
ArrayList al = ACIPTshegBarScanner.instance().scan(acip, errors, 500, ArrayList al = ACIPTraits.instance().scanner().scan(acip, errors, 500,
false, warningLevel); false, warningLevel);
if (null == al || errors.length() > 0) { if (null == al || errors.length() > 0) {
if (errors.length() > 0) if (errors.length() > 0)
@ -348,8 +348,8 @@ public class TibTextUtils implements THDLWylieConstants {
} }
try { try {
int tloc[] = new int[] { loc }; int tloc[] = new int[] { loc };
TConverter.convertToTMW(al, tdoc, null, null, null, TConverter.convertToTMW(ACIPTraits.instance(), al, tdoc, null, null,
putWarningsInOutput, warningLevel, null, putWarningsInOutput, warningLevel,
false, colors, tloc); false, colors, tloc);
return tloc[0] - loc; return tloc[0] - loc;
} catch (IOException e) { } catch (IOException e) {
@ -1430,6 +1430,53 @@ public class TibTextUtils implements THDLWylieConstants {
candidateType = getCandidateTypeModuloAppendage(candidateType); candidateType = getCandidateTypeModuloAppendage(candidateType);
if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) { if ("prefix/root-root/suffix-suffix/postsuffix" == candidateType) {
/* Update: Chris Fynn wrote this in response to an
e-mail from David Chapman on Feb 21, 2005:
<quote Chris Fynn feb 21 2005>
When working out the rules for Tibetan and Dzongkha
collation in Bhutan we came up with the following sequences
that could be ambiguous:
0F51 0F42 0F66
0F60 0F42 0F66
0F51 0F44 0F66
0F42 0F53 0F51
0F58 0F53 0F51
0F56 0F42 0F66
0F51 0F56 0F66
0F60 0F56 0F66
0F58 0F42 0F66
0F58 0F44 0F66
0F51 0F58 0F66
After much consultation with experts in Bhutan it was
decided these should always be read as follows:
0F51 0F42 0F66 dgas
0F60 0F42 0F66 'gas
0F51 0F44 0F66 dngas *
0F42 0F53 0F51 gnad
0F58 0F53 0F51 mnad *
0F56 0F42 0F66 bags
0F51 0F56 0F66 dbas
0F60 0F56 0F66 'bas *
0F58 0F42 0F66 mags
0F58 0F44 0F66 mangs
0F51 0F58 0F66 dmas
In most cases it was found that only one of the two possible
readings actually existed as words. 0F51 0F44 0F66 , 0F58
0F53 0F51, and 0F60 0F56 0F66 were not found as syllables in
any known words, but the experts felt that *if* they
occurred in Tibetan or Dzongkha text then dngas, mnad, and
'bas would be the most likely reading.
</quote>
Because of this e-mail, dbas and dngas were added to the list of
exceptions. */
/* Yes, this is ambiguous. How do we handle it? See /* Yes, this is ambiguous. How do we handle it? See
* this from Andres (but note that only 4 of the 14 in * this from Andres (but note that only 4 of the 14 in
* the second list are ambiguous because ra na sa and * the second list are ambiguous because ra na sa and
@ -1480,7 +1527,9 @@ public class TibTextUtils implements THDLWylieConstants {
|| wylie2.equals("n") || wylie2.equals("n")
|| wylie2.equals("s"))) || wylie2.equals("s")))
|| (wylie1.equals("d") && (wylie2.equals("g") || (wylie1.equals("d") && (wylie2.equals("g")
|| wylie2.equals("m"))) || wylie2.equals("m")
|| wylie2.equals("b")
|| wylie2.equals("ng")))
|| (wylie1.equals("b") && wylie2.equals("d")) || (wylie1.equals("b") && wylie2.equals("d"))
|| (wylie1.equals("m") && wylie2.equals("d")) || (wylie1.equals("m") && wylie2.equals("d"))
|| (wylie1.equals("'") && (wylie2.equals("g") || (wylie1.equals("'") && (wylie2.equals("g")

View file

@ -1988,7 +1988,7 @@ private static String acipForGlyph(String hashKey) {
// ~X is a special case because the EWTS is 2 characters in // ~X is a special case because the EWTS is 2 characters in
// length // length
|| "~X".equals(hashKey)) // hard-coded EWTS value || "~X".equals(hashKey)) // hard-coded EWTS value
return org.thdl.tib.text.ttt.ACIPRules.getACIPForEWTS(hashKey); return org.thdl.tib.text.ttt.ACIPTraits.instance().getACIPForEWTS(hashKey);
else else
// else we are not be able to use it because it's not smart // else we are not be able to use it because it's not smart
// about stacks (e.g., W+W) // about stacks (e.g., W+W)
@ -2116,7 +2116,7 @@ public static String getACIPForGlyph(DuffCode dc1,
// DLC FIXME: TMW.53 is probably going to come out all wrong (VA // DLC FIXME: TMW.53 is probably going to come out all wrong (VA
// vs. WA) from this function, but // vs. WA) from this function, but
// ACIPRules.getACIPForEWTS(String) seems to come through... will // ACIPTraits.getACIPForEWTS(String) seems to come through... will
// it always? // it always?
String hashKey = getHashKeyForGlyph(dc1); String hashKey = getHashKeyForGlyph(dc1);

View file

@ -9,9 +9,9 @@
// - blank lines should be ignored // - blank lines should be ignored
// - <?x?> marks a command // - <?x?> marks a command
// //
// If you change the Wylie here, it can break the ACIP->TMW and // If you change the EWTS transliteration here, it can break the
// ACIP->Unicode conversion. So keep ACIPRules in sync with this, and be // ACIP->TMW and ACIP->Unicode conversion. So keep ACIPTraits in sync
// sure to run 'ant clean check' after your change. // with this, and be sure to run 'ant clean check' after your change.
// //
// Note that some glyphs have EWTS \uF021-\uF0FF inclusive. These do // Note that some glyphs have EWTS \uF021-\uF0FF inclusive. These do
// not have anything in the Unicode column, though, because this is // not have anything in the Unicode column, though, because this is
@ -37,7 +37,7 @@
// by the way. // by the way.
// //
// If EWTS changes, then ACIP->TMW and ACIP->Unicode will break -- // If EWTS changes, then ACIP->TMW and ACIP->Unicode will break --
// modify ACIPRules and test test test. // modify ACIPTraits and test test test.
<?Input:Punctuation?> <?Input:Punctuation?>
//_~32,1~0,32 //_~32,1~0,32
@ -645,7 +645,7 @@ r+m+m~51,4~~7,59~1,110~8,121~1,123~1,125~8,107~8,114~f62,fa8,fa8
// Note that TPairList.java's unicodeExceptionsMap must be updated if // Note that TPairList.java's unicodeExceptionsMap must be updated if
// we change who uses U+0F6A. // we change who uses U+0F6A.
R+Y~52,4~~7,60~1,110~8,120~1,123~1,125~8,106~8,113~f6a,fbb R+Y~52,4~~7,60~1,110~8,120~1,123~1,125~8,106~8,113~f6a,fbb
// R+W is mentioned in ACIPRules.java: // R+W is mentioned in ACIPTraits.java:
R+W~196,4~~7,61~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fba R+W~196,4~~7,61~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fba
R+sh~53,4~~7,62~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fb4 R+sh~53,4~~7,62~1,109~8,120~1,123~1,125~8,106~8,113~f6a,fb4
R+sh+y~54,4~~7,63~1,109~8,122~1,123~1,125~8,108~8,115~f6a,fb4,fb1 R+sh+y~54,4~~7,63~1,109~8,122~1,123~1,125~8,108~8,115~f6a,fb4,fb1
@ -667,7 +667,7 @@ l+h+w~197,4~~7,78~1,109~8,121~1,123~1,125~8,106~8,113~f63,fb7,fad
w+y~69,4~~7,79~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb1 w+y~69,4~~7,79~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb1
w+r~70,4~~7,80~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb2 w+r~70,4~~7,80~1,109~8,121~1,123~1,125~8,107~8,114~f5d,fb2
w+n~195,4~~7,81~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fa3 w+n~195,4~~7,81~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fa3
// w+W is mentioned in ACIPRules.java: // w+W is mentioned in ACIPTraits.java:
w+W~194,4~~7,82~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fba w+W~194,4~~7,82~1,109~8,120~1,123~1,125~8,106~8,113~f5d,fba
sh+ts~71,4~~7,83~1,109~8,120~1,123~1,125~8,106~8,113~f64,fa9 sh+ts~71,4~~7,83~1,109~8,120~1,123~1,125~8,106~8,113~f64,fa9
sh+ts+y~72,4~~7,84~1,109~8,122~1,123~1,125~8,108~8,115~f64,fa9,fb1 sh+ts+y~72,4~~7,84~1,109~8,122~1,123~1,125~8,108~8,115~f64,fa9,fb1

View file

@ -1,658 +0,0 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.List;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.TibTextUtils;
// TODO(DLC)[EWTS->Tibetan]: this and ACIPTraits -- unify?
/** Canonizes some facts regarding the ACIP transcription system.
* @author David Chandler */
public class ACIPRules {
/** {Ksh}, the longest consonant, has 3 characters, so this is
* three. */
public static int MAX_CONSONANT_LENGTH = 3;
/** {'EEm:}, the longest wowel, has 5 characters, so this is
* five. */
public static int MAX_WOWEL_LENGTH = 5;
/** For O(1) {@link #isWowel(String)} calls. */
private static HashSet acipVowels = null;
/** Table of the base ACIP vowels, one row per vowel.  Column 0 is
 *  the ACIP vowel, column 1 is its EWTS, and column 2 is the EWTS
 *  for the ACIP formed by putting an apostrophe in front of column
 *  0 (e.g., ACIP {'E} is EWTS "Ae"). */
private static String[][] baseVowels = new String[][] {
// Row layout: { ACIP, EWTS, EWTS for ACIP {'\'' + baseVowels[][0]} }.
// NOTE(review): an earlier version of this comment also listed two
// "vowel numbers" columns (see TibetanMachineWeb's VOWEL_A,
// VOWEL_o, etc.), but no row below carries them -- every row has
// exactly three entries.
{ "A", "a", "A" },
{ "I", "i", "I" },
{ "U", "u", "U" },
{ "E", "e", "Ae" },
{ "O", "o", "Ao" },
{ "EE", "ai", "Aai" },
{ "OO", "au", "Aau" },
{ "i", "-i", "A-i" }
};
/** Tells whether s is an ACIP wowel.  Context matters: A is both a
 *  consonant and a vowel in ACIP, so only ask this question where a
 *  wowel is what you expect to find.  The lookup set is built from
 *  {@link #baseVowels} on the first call. */
public static boolean isWowel(String s) {
    if (null != acipVowels)
        return acipVowels.contains(s);

    // Every base vowel may appear bare or with a leading
    // apostrophe, and with any of the trailing marks below.  'O,
    // 'E, 'OO and 'EE are included on the author's own authority
    // (GANG'O appears in practice, and it is consistent with 'I, 'A
    // and 'U).  DLC FIXME: ask.
    //
    // Keep this code in sync with getUnicodeFor.
    // Keep this code in sync with getWylieForACIPVowel.
    final String[] leading = { "", "'" };
    final String[] trailing = { "", "m", ":", "m:" };

    HashSet wowels = new HashSet(baseVowels.length * 8);
    acipVowels = wowels;
    for (int v = 0; v < baseVowels.length; v++) {
        String base = baseVowels[v][0];
        for (int t = 0; t < trailing.length; t++) {
            for (int l = 0; l < leading.length; l++) {
                wowels.add(leading[l] + base + trailing[t]);
            }
        }
    }
    // {Pm} is treated just like {PAm}; {P:} is treated just like
    // {PA:}; {Pm:} is treated just like {PAm:}.  That is not done
    // here, though; see
    // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
    return wowels.contains(s);
}
/** For O(1) {@link #isConsonant(String)} calls. */
private static HashSet consonants = null;
/** Tells whether acip is a bare ACIP consonant, i.e. a consonant
 *  with no vowel attached.  For example, "K" yields true, but "KA"
 *  and "X" yield false.  The lookup set is built on the first
 *  call. */
public static boolean isConsonant(String acip) {
    if (null != consonants)
        return consonants.contains(acip);

    final String[] known = {
        "V",
        "K", "KH", "G", "NG",
        "C", "CH", "J", "NY",
        "T", "TH", "D", "N",
        "P", "PH", "B", "M",
        "TZ", "TS", "DZ", "W",
        "ZH", "Z", "Y", "R",
        "L", "SH", "S", "H",
        "t", "th", "d", "n", "sh",
        "dH", "DH", "BH",
        "DZH", // longest, MAX_CONSONANT_LENGTH characters
        "Ksh", // longest, MAX_CONSONANT_LENGTH characters
        "GH",
        "'",
        "A"
    };

    HashSet built = new HashSet();
    consonants = built;
    for (int i = 0; i < known.length; i++) {
        built.add(known[i]);
    }
    return built.contains(acip);
}
/** A map from wylie to ACIP. Note that the Wylie "w" maps to
both "V" and "W". */
private static HashMap wylieToACIP = null;
/** Returns the ACIP transliteration corresponding to the THDL
    Extended Wylie <em>atom</em> EWTS, or null if EWTS is null or is
    not recognized.

    <p>If EWTS is not itself a key of {@link #wylieToACIP}, it is
    split on "-" and "+" and each piece is translated separately; the
    separators are copied through unchanged.  A piece that has no
    translation makes the whole result null.

    @param EWTS the EWTS atom or stack to translate; may be null
    @return the ACIP, or null if there is none */
public static String getACIPForEWTS(String EWTS) {
    // A null atom cannot be recognized.  Say so instead of dying
    // with a NullPointerException in startsWith below.
    if (null == EWTS) return null;

    // These three calls are made only for their side effect: they
    // fill in the lazily built tables, and in doing so register
    // every reverse mapping in wylieToACIP via putMapping.
    getWylieForACIPConsonant(null);
    getWylieForACIPOther(null);
    getWylieForACIPVowel(null);

    String ans = (String)wylieToACIP.get(EWTS);
    boolean useCapitalW = false;
    if (EWTS.startsWith("w"))
        useCapitalW = true; // We want W+NA, not V+NA; we want WA, not VA.
    if (null == ans) {
        StringBuffer finalAns = new StringBuffer(EWTS.length());
        // "true" means the delimiters themselves come back as tokens.
        StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
        while (sTok.hasMoreTokens()) {
            String part, tok = sTok.nextToken();
            if (tok.equals("-") || tok.equals("+"))
                part = tok;
            else {
                if ("w".equals(tok)) {
                    // There are only two stacks in TMW that have
                    // U+0FBA: R+Wa and w+Wa.  TMW->ACIP fails for
                    // these unless we handle it here.  (FIXME:
                    // add an automated test for this).
                    if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
                        part = "W";
                    } else {
                        part = "V";
                    }
                } else {
                    part = (String)wylieToACIP.get(tok);
                }
            }
            if (null == part) return null;
            finalAns.append(part);
        }
        if (useCapitalW)
            finalAns.setCharAt(0, 'W');
        return finalAns.toString();
    }
    if (useCapitalW)
        return "W" + ans.substring(1);
    else
        return ans;
}
/** Records the pair in both directions: ACIP to EWTS in toWylie,
    and EWTS to ACIP in {@link #wylieToACIP}.  The first call also
    creates {@link #wylieToACIP} and seeds it with a few EWTS-to-ACIP
    entries that have no ACIP-to-EWTS counterpart. */
private static void putMapping(HashMap toWylie, String ACIP, String EWTS) {
    toWylie.put(ACIP, EWTS);

    HashMap reverse = wylieToACIP;
    if (reverse == null) {
        reverse = new HashMap(75);
        wylieToACIP = reverse;

        // We don't want to put "/" in toWylie:
        reverse.put("(", "/");
        reverse.put(")", "/");
        reverse.put("?", "\\");
        reverse.put("_", " "); // oddball.
        reverse.put("o'i", "O'I"); // oddball for TMW9.61.
    }
    reverse.put(EWTS, ACIP);
}
/** Tells whether s is an ACIP consonant, i.e. whether
 *  {@link #getWylieForACIPConsonant(String)} knows an EWTS for it. */
static final boolean isACIPConsonant(String s) {
    String wylie = getWylieForACIPConsonant(s);
    return wylie != null;
}
private static HashMap acipConsonant2wylie = null;
/** Looks up the EWTS for a bare ACIP consonant (one without the
 *  "A" vowel).  The answer is null when acip is not a known
 *  consonant.
 *
 *  <p>The answer is "W" for ACIP "W", "r" for ACIP "R", and "y" for
 *  ACIP "Y", even though sometimes the EWTS for those is "w", "R",
 *  or "Y".  Handle that in the caller.
 *
 *  <p>The table is built on the first call; building it also
 *  registers the reverse mappings via putMapping. */
static final String getWylieForACIPConsonant(String acip) {
    if (null != acipConsonant2wylie)
        return (String)acipConsonant2wylie.get(acip);

    // Each row is { ACIP, EWTS }.
    final String[][] pairs = {
        // oddball:
        { "V", "w" },
        // more oddballs:
        { "DH", "d+h" },
        { "BH", "b+h" },
        { "dH", "D+h" },
        { "DZH", "dz+h" },
        { "Ksh", "k+Sh" },
        { "GH", "g+h" },
        { "K", "k" },
        { "KH", "kh" },
        { "G", "g" },
        { "NG", "ng" },
        { "C", "c" },
        { "CH", "ch" },
        { "J", "j" },
        { "NY", "ny" },
        { "T", "t" },
        { "TH", "th" },
        { "D", "d" },
        { "N", "n" },
        { "P", "p" },
        { "PH", "ph" },
        { "B", "b" },
        { "M", "m" },
        { "TZ", "ts" },
        { "TS", "tsh" },
        { "DZ", "dz" },
        /* NOTE WELL: sometimes "w", sometimes "W".  Handle this in
           the caller.  Reasoning for "W" instead of "w": r-w and
           r+w are both known hash keys.  We sort 'em out this way.
           (They are the only things like this according to bug
           report #800166.) */
        { "W", "W" },
        { "ZH", "zh" },
        { "Z", "z" },
        { "'", "'" },
        { "Y", "y" },
        { "R", "r" },
        { "L", "l" },
        { "SH", "sh" },
        { "S", "s" },
        { "H", "h" },
        { "A", "a" },
        { "t", "T" },
        { "th", "Th" },
        { "d", "D" },
        { "n", "N" },
        { "sh", "Sh" }
    };

    HashMap table = new HashMap(37);
    acipConsonant2wylie = table;
    for (int i = 0; i < pairs.length; i++) {
        putMapping(table, pairs[i][0], pairs[i][1]);
    }
    return (String)table.get(acip);
}
private static HashMap acipVowel2wylie = null;
/** Looks up the EWTS for an ACIP "vowel".  The answer is null when
 *  acip is not a known vowel.  The table is built from
 *  {@link #baseVowels} on the first call; building it also registers
 *  the reverse mappings via putMapping. */
static final String getWylieForACIPVowel(String acip) {
    if (null != acipVowel2wylie)
        return (String)acipVowel2wylie.get(acip);

    // Trailing marks, ACIP spelling alongside the EWTS spelling.
    final String[] acipTail = { "", "m", ":", "m:" };
    final String[] ewtsTail = { "", "M", "H", "MH" };

    HashMap table = new HashMap(baseVowels.length * 4);
    acipVowel2wylie = table;
    for (int v = 0; v < baseVowels.length; v++) {
        String acipBase = baseVowels[v][0];
        String ewtsPlain = baseVowels[v][1];
        String ewtsWithApostrophe = baseVowels[v][2];
        for (int t = 0; t < acipTail.length; t++) {
            putMapping(table,
                       acipBase + acipTail[t],
                       ewtsPlain + ewtsTail[t]);
            putMapping(table,
                       "'" + acipBase + acipTail[t],
                       ewtsWithApostrophe + ewtsTail[t]);
        }
    }
    // {Pm} is treated just like {PAm}; {P:} is treated just like
    // {PA:}; {Pm:} is treated just like {PAm:}.  But that happens
    // thanks to
    // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
    return (String)table.get(acip);
}
private static HashMap acipOther2wylie = null;
/** Looks up the EWTS for an ACIP punctuation character, mark, or
 *  digit.  The answer is null when acip is not one of those.  The
 *  table is built on the first call; building it also registers the
 *  reverse mappings via putMapping (except for ".", see below). */
static final String getWylieForACIPOther(String acip) {
    if (null != acipOther2wylie)
        return (String)acipOther2wylie.get(acip);

    HashMap table = new HashMap(20);
    acipOther2wylie = table;

    // don't use putMapping for this.  We don't want TMW->ACIP to
    // produce "." for a U+0F0C because ACIP doesn't say that "."
    // means U+0F0C.  It just seems to in practice for ACIP Release
    // IV texts.
    table.put(".", "*");

    // Each row is { ACIP, EWTS }.  There is no glyph in TMW with
    // the EWTS @##, so there is no row for ACIP "#".
    final String[][] marks = {
        { "m", "M" },
        { ":", "H" },
        { ",", "/" },
        { " ", " " },
        { ";", "|" },
        { "`", "!" },
        { "*", "@#" },
        { "%", "~X" },
        { "o", "X" },
        { "&", "&" },
        { "^", "\\u0F38" }
    };
    for (int i = 0; i < marks.length; i++) {
        putMapping(table, marks[i][0], marks[i][1]);
    }

    // The digits 0 through 9 are the same in ACIP and in EWTS.
    for (int d = 0; d <= 9; d++) {
        String digit = String.valueOf(d);
        putMapping(table, digit, digit);
    }
    return (String)table.get(acip);
}
private static HashMap superACIP2unicode = null;
private static HashMap subACIP2unicode = null;
/** If acip is an ACIP consonant or vowel or punctuation mark, then
 *  this returns the Unicode for it.  The Unicode for the subscribed
 *  form of the glyph is returned if subscribed is true and a
 *  subscribed form is registered; otherwise the ordinary form is
 *  returned.  Returns null if acip is unknown.
 *
 *  <p>Both tables are built on the first call.  At that time the
 *  boolean option
 *  "thdl.acip.to.unicode.conversions.use.0F52.et.cetera" is read
 *  once; it selects between a single code point (e.g., U+0F52) and
 *  a two-code-point sequence (e.g., U+0F51 U+0FB7) for DH, BH, dH,
 *  DZH, Ksh, and GH.
 *
 *  @param acip the ACIP consonant, wowel, digit, or punctuation
 *  @param subscribed true to prefer the subscribed form
 *  @return the Unicode, or null if acip is unknown */
static String getUnicodeFor(String acip, boolean subscribed) {
    if (superACIP2unicode == null) {
        final boolean compactUnicode
            = ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
        superACIP2unicode = new HashMap(144);
        subACIP2unicode = new HashMap(42);

        // oddball: V has a subscribed form only.
        subACIP2unicode.put("V", "\u0FAD");

        // Consonants whose encoding depends on compactUnicode:
        superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
        subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
        superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
        subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
        superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
        subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
        superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
        subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
        superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
        subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
        superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
        subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));

        // Plain consonants, ordinary form then subscribed form:
        superACIP2unicode.put("K", "\u0F40");
        subACIP2unicode.put("K", "\u0F90");
        superACIP2unicode.put("KH", "\u0F41");
        subACIP2unicode.put("KH", "\u0F91");
        superACIP2unicode.put("G", "\u0F42");
        subACIP2unicode.put("G", "\u0F92");
        superACIP2unicode.put("NG", "\u0F44");
        subACIP2unicode.put("NG", "\u0F94");
        superACIP2unicode.put("C", "\u0F45");
        subACIP2unicode.put("C", "\u0F95");
        superACIP2unicode.put("CH", "\u0F46");
        subACIP2unicode.put("CH", "\u0F96");
        superACIP2unicode.put("J", "\u0F47");
        subACIP2unicode.put("J", "\u0F97");
        superACIP2unicode.put("NY", "\u0F49");
        subACIP2unicode.put("NY", "\u0F99");
        superACIP2unicode.put("T", "\u0F4F");
        subACIP2unicode.put("T", "\u0F9F");
        superACIP2unicode.put("TH", "\u0F50");
        subACIP2unicode.put("TH", "\u0FA0");
        superACIP2unicode.put("D", "\u0F51");
        subACIP2unicode.put("D", "\u0FA1");
        superACIP2unicode.put("N", "\u0F53");
        subACIP2unicode.put("N", "\u0FA3");
        superACIP2unicode.put("P", "\u0F54");
        subACIP2unicode.put("P", "\u0FA4");
        superACIP2unicode.put("PH", "\u0F55");
        subACIP2unicode.put("PH", "\u0FA5");
        superACIP2unicode.put("B", "\u0F56");
        subACIP2unicode.put("B", "\u0FA6");
        superACIP2unicode.put("M", "\u0F58");
        subACIP2unicode.put("M", "\u0FA8");
        superACIP2unicode.put("TZ", "\u0F59");
        subACIP2unicode.put("TZ", "\u0FA9");
        superACIP2unicode.put("TS", "\u0F5A");
        subACIP2unicode.put("TS", "\u0FAA");
        superACIP2unicode.put("DZ", "\u0F5B");
        subACIP2unicode.put("DZ", "\u0FAB");
        superACIP2unicode.put("W", "\u0F5D");
        subACIP2unicode.put("W", "\u0FBA"); // oddball
        superACIP2unicode.put("ZH", "\u0F5E");
        subACIP2unicode.put("ZH", "\u0FAE");
        superACIP2unicode.put("Z", "\u0F5F");
        subACIP2unicode.put("Z", "\u0FAF");
        superACIP2unicode.put("'", "\u0F60");
        subACIP2unicode.put("'", "\u0FB0");
        superACIP2unicode.put("Y", "\u0F61");
        subACIP2unicode.put("Y", "\u0FB1");
        superACIP2unicode.put("R", "\u0F62");
        subACIP2unicode.put("R", "\u0FB2");
        superACIP2unicode.put("L", "\u0F63");
        subACIP2unicode.put("L", "\u0FB3");
        superACIP2unicode.put("SH", "\u0F64");
        subACIP2unicode.put("SH", "\u0FB4");
        superACIP2unicode.put("S", "\u0F66");
        subACIP2unicode.put("S", "\u0FB6");
        superACIP2unicode.put("H", "\u0F67");
        subACIP2unicode.put("H", "\u0FB7");
        superACIP2unicode.put("A", "\u0F68");
        subACIP2unicode.put("A", "\u0FB8");
        superACIP2unicode.put("t", "\u0F4A");
        subACIP2unicode.put("t", "\u0F9A");
        superACIP2unicode.put("th", "\u0F4B");
        subACIP2unicode.put("th", "\u0F9B");
        superACIP2unicode.put("d", "\u0F4C");
        subACIP2unicode.put("d", "\u0F9C");
        superACIP2unicode.put("n", "\u0F4E");
        subACIP2unicode.put("n", "\u0F9E");
        superACIP2unicode.put("sh", "\u0F65");
        subACIP2unicode.put("sh", "\u0FB5");

        // Wowels.  Keep these in sync with isWowel and
        // getWylieForACIPVowel.
        superACIP2unicode.put("I", "\u0F72");
        superACIP2unicode.put("E", "\u0F7A");
        superACIP2unicode.put("O", "\u0F7C");
        superACIP2unicode.put("U", "\u0F74");
        superACIP2unicode.put("OO", "\u0F7D");
        superACIP2unicode.put("EE", "\u0F7B");
        superACIP2unicode.put("i", "\u0F80");
        superACIP2unicode.put("'A", "\u0F71");
        superACIP2unicode.put("'I", "\u0F71\u0F72");
        superACIP2unicode.put("'E", "\u0F71\u0F7A");
        superACIP2unicode.put("'O", "\u0F71\u0F7C");
        superACIP2unicode.put("'U", "\u0F71\u0F74");
        superACIP2unicode.put("'OO", "\u0F71\u0F7D");
        superACIP2unicode.put("'EE", "\u0F71\u0F7B");
        superACIP2unicode.put("'i", "\u0F71\u0F80");
        superACIP2unicode.put("Im", "\u0F72\u0F7E");
        superACIP2unicode.put("Em", "\u0F7A\u0F7E");
        superACIP2unicode.put("Om", "\u0F7C\u0F7E");
        superACIP2unicode.put("Um", "\u0F74\u0F7E");
        superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
        superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
        superACIP2unicode.put("im", "\u0F80\u0F7E");
        superACIP2unicode.put("'Am", "\u0F71\u0F7E");
        superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
        superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
        superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
        superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
        superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
        superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
        superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
        superACIP2unicode.put("I:", "\u0F72\u0F7F");
        superACIP2unicode.put("E:", "\u0F7A\u0F7F");
        superACIP2unicode.put("O:", "\u0F7C\u0F7F");
        superACIP2unicode.put("U:", "\u0F74\u0F7F");
        superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
        superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
        superACIP2unicode.put("i:", "\u0F80\u0F7F");
        superACIP2unicode.put("'A:", "\u0F71\u0F7F");
        superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
        superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
        superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
        superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
        superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
        superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
        superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
        superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
        superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
        superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
        superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
        superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
        superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
        superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
        superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
        superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
        superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
        superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
        superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
        superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
        superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
        superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
        // :m does not appear, though you'd think it's as valid as m:.
        superACIP2unicode.put("m", "\u0F7E");
        superACIP2unicode.put(":", "\u0F7F");
        superACIP2unicode.put("m:", "\u0F7E\u0F7F");
        superACIP2unicode.put("Am", "\u0F7E");
        superACIP2unicode.put("A:", "\u0F7F");
        superACIP2unicode.put("Am:", "\u0F7E\u0F7F");

        // digits
        superACIP2unicode.put("0", "\u0F20");
        superACIP2unicode.put("1", "\u0F21");
        superACIP2unicode.put("2", "\u0F22");
        superACIP2unicode.put("3", "\u0F23");
        superACIP2unicode.put("4", "\u0F24");
        superACIP2unicode.put("5", "\u0F25");
        superACIP2unicode.put("6", "\u0F26");
        superACIP2unicode.put("7", "\u0F27");
        superACIP2unicode.put("8", "\u0F28");
        superACIP2unicode.put("9", "\u0F29");

        // punctuation
        superACIP2unicode.put("&", "\u0F85");
        superACIP2unicode.put(",", "\u0F0D");
        superACIP2unicode.put(" ", "\u0F0B");
        superACIP2unicode.put(".", "\u0F0C");
        // (This entry used to be registered twice in a row; once is
        // enough, since a second put of the same pair changes
        // nothing.)
        superACIP2unicode.put("`", "\u0F08");
        superACIP2unicode.put("*", "\u0F04\u0F05");
        superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
        superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn.
        superACIP2unicode.put("o", "\u0F37");
        superACIP2unicode.put(";", "\u0F11");
        // Whitespace passes through unchanged:
        superACIP2unicode.put("\r", "\r");
        superACIP2unicode.put("\t", "\t");
        superACIP2unicode.put("\r\n", "\r\n");
        superACIP2unicode.put("\n", "\n");
        superACIP2unicode.put("\\", "\u0F84");
        superACIP2unicode.put("^", "\u0F38");
        // DLC FIXME: "^ GONG" is "^GONG", right?
        // DLC FIXME: what's the Unicode for x?  RC said there is none in plain-text Unicode for x.  But what about in RTF Unicode?
    }
    if (subscribed) {
        String u = (String)subACIP2unicode.get(acip);
        if (null != u) return u;
        // No subscribed form is registered; fall back to the
        // ordinary form below.
    }
    return (String)superACIP2unicode.get(acip);
}
/** Gets the duffcodes for the ACIP vowel string vowel and appends
 * them to duff. The glyph preceding is handed to
 * TibTextUtils.getVowel as context. Does nothing if vowel is null.
 * @param duff the list to which the glyphs are appended
 * @param preceding the glyph passed as context to TibTextUtils.getVowel
 * @param vowel an ACIP vowel for which getWylieForACIPVowel returns
 * non-null, or null
 * @throws IllegalArgumentException if vowel is non-null and
 * getWylieForACIPVowel(vowel) is null */
static void getDuffForACIPVowel(ArrayList duff, DuffCode preceding, String vowel) {
if (null == vowel) return;
if (null == getWylieForACIPVowel(vowel)) // FIXME: expensive assertion! Use assert.
throw new IllegalArgumentException("Vowel " + vowel + " isn't in the small set of vowels we handle correctly.");
// Order matters here.
// context_added is a one-element array shared by every getVowel call below.
boolean context_added[] = new boolean[] { false };
if (vowel.startsWith("A")) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
} else if (vowel.indexOf("'U") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
} else if (vowel.indexOf("'I") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
} else {
// Any other vowel may contribute several pieces: the A_VOWEL for
// a leading apostrophe, then at most one of EE/E, at most one of
// OO/O, and then I, U, and i independently.
if (vowel.indexOf('\'') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
}
if (vowel.indexOf("EE") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
} else if (vowel.indexOf('E') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
}
if (vowel.indexOf("OO") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
} else if (vowel.indexOf('O') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
}
if (vowel.indexOf('I') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
}
if (vowel.indexOf('U') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
}
if (vowel.indexOf('i') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
}
}
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
// An 'm' in the vowel: the last glyph in duff is taken off and handed
// to getBindu.
// NOTE(review): this assumes duff is non-empty here -- confirm callers
// guarantee that.
if (vowel.indexOf('m') >= 0) {
DuffCode last = (DuffCode)duff.get(duff.size() - 1);
duff.remove(duff.size() - 1); // getBindu will add it back...
TibTextUtils.getBindu(duff, last);
}
// A ':' in the vowel appends the glyph looked up under "H".
if (vowel.indexOf(':') >= 0)
duff.add(TibetanMachineWeb.getGlyph("H"));
}
/** Tells whether l is the ACIP representation of a letter that can
    be a suffix.  Every postsuffix is also a suffix.  Pass the bare
    letter without an "A" -- "S", not "SA".  A null l yields
    false. */
public static boolean isACIPSuffix(String l) {
    final String[] suffixes = { "S", "G", "D", "M", "'", "B", "NG", "N", "L", "R" };
    for (int i = 0; i < suffixes.length; i++) {
        if (suffixes[i].equals(l))
            return true;
    }
    return false;
}
/** Tells whether l is the ACIP representation of a letter that can
    be a prefix.  Pass the bare letter without an "A" -- "D", not
    "DA".  A null l yields false. */
public static boolean isACIPPrefix(String l) {
    final String[] prefixes = { "'", "M", "B", "D", "G" };
    for (int i = 0; i < prefixes.length; i++) {
        if (prefixes[i].equals(l))
            return true;
    }
    return false;
}
/** Tells whether l is the ACIP representation of a letter that can
    be a postsuffix.  Pass the bare letter without an "A" -- "D",
    not "DA".  A null l yields false. */
public static boolean isACIPPostsuffix(String l) {
    if ("S".equals(l))
        return true;
    return "D".equals(l);
}
}

View file

@ -18,11 +18,25 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import java.util.HashSet;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.StringTokenizer;
import java.util.List;
import org.thdl.util.ThdlOptions;
import org.thdl.tib.text.DuffCode;
import org.thdl.tib.text.THDLWylieConstants;
import org.thdl.tib.text.TibetanMachineWeb;
import org.thdl.tib.text.TibTextUtils;
/** A singleton class that should contain (but due to laziness and /** A singleton class that should contain (but due to laziness and
* ignorance probably does not contain) all the traits that make ACIP * ignorance probably does not contain) all the traits that make ACIP
* transliteration different from other (say, EWTS) * transliteration scheme different from other (say, EWTS)
* transliterations. */ * transliteration schemes. This is not safe to use in concurrent
final class ACIPTraits implements TTraits { * programs but it would be easy to make it so. */
public final class ACIPTraits implements TTraits {
/** sole instance of this class */ /** sole instance of this class */
private static ACIPTraits singleton = null; private static ACIPTraits singleton = null;
@ -30,7 +44,7 @@ final class ACIPTraits implements TTraits {
private ACIPTraits() { } private ACIPTraits() { }
/** Returns the singleton instance of this class. */ /** Returns the singleton instance of this class. */
public static ACIPTraits instance() { public static /* synchronized */ ACIPTraits instance() {
if (null == singleton) { if (null == singleton) {
singleton = new ACIPTraits(); singleton = new ACIPTraits();
} }
@ -43,15 +57,536 @@ final class ACIPTraits implements TTraits {
/** Returns '-'. */ /** Returns '-'. */
public char disambiguatorChar() { return '-'; } public char disambiguatorChar() { return '-'; }
public int maxConsonantLength() { return ACIPRules.MAX_CONSONANT_LENGTH; } public int maxConsonantLength() { return MAX_CONSONANT_LENGTH; }
public int maxWowelLength() { return ACIPRules.MAX_WOWEL_LENGTH; } public int maxWowelLength() { return MAX_WOWEL_LENGTH; }
public boolean isConsonant(String s) { return ACIPRules.isConsonant(s); }
public boolean isWowel(String s) { return ACIPRules.isWowel(s); }
public boolean hasSimpleError(TPair p) { public boolean hasSimpleError(TPair p) {
return ("A".equals(p.getLeft()) && null == p.getRight()); return ("A".equals(p.getLeft()) && null == p.getRight());
} }
/** Returns "A". */
public String aVowel() {
    return "A";
}
/** Returns true if and only if l is "S" or "D", the ACIP
    postsuffixes.  A null l yields false. */
public boolean isPostsuffix(String l) {
    if ("S".equals(l))
        return true;
    return "D".equals(l);
}
/** Returns true if and only if l is one of the ACIP suffixes S, G,
    D, M, ', B, NG, N, L, or R.  A null l yields false. */
public boolean isSuffix(String l) {
    final String[] suffixes = { "S", "G", "D", "M", "'", "B", "NG", "N", "L", "R" };
    int i = 0;
    while (i < suffixes.length) {
        if (suffixes[i].equals(l))
            return true;
        ++i;
    }
    return false;
}
/** Returns true if and only if l is one of the ACIP prefixes ', M,
    B, D, or G.  A null l yields false. */
public boolean isPrefix(String l) {
    if ("'".equals(l)) return true;
    if ("M".equals(l)) return true;
    if ("B".equals(l)) return true;
    if ("D".equals(l)) return true;
    return "G".equals(l);
}
/** Lazily built table from ACIP atoms (consonants, wowels, digits,
    punctuation, whitespace) to Unicode.  Built by the first call
    to getUnicodeFor. */
private HashMap superACIP2unicode = null;
/** Lazily built table from ACIP consonants to the Unicode used when
    the caller asks for the subscribed form.  Built together with
    superACIP2unicode. */
private HashMap subACIP2unicode = null;
/** Returns the Unicode string for the ACIP atom acip, or null if
    acip is in neither table.  If subscribed is true,
    subACIP2unicode is consulted first and superACIP2unicode is the
    fallback; otherwise only superACIP2unicode is consulted.  Both
    tables are built on the first call.
    @param acip the ACIP atom to look up
    @param subscribed true to prefer the entry in subACIP2unicode
    @return the mapped Unicode string, or null if there is none */
public /* synchronized */ String getUnicodeFor(String acip, boolean subscribed) {
if (superACIP2unicode == null) {
// When this option is true, DH, BH, dH, DZH, Ksh, and GH each map
// to a single code point (e.g. U+0F52); otherwise each maps to a
// two-code-point sequence (e.g. U+0F51 U+0FB7).
final boolean compactUnicode
= ThdlOptions.getBooleanOption("thdl.acip.to.unicode.conversions.use.0F52.et.cetera");
superACIP2unicode = new HashMap(144);
subACIP2unicode = new HashMap(42);
// oddball:
subACIP2unicode.put("V", "\u0FAD");
superACIP2unicode.put("DH", (compactUnicode ? "\u0F52" : "\u0F51\u0FB7"));
subACIP2unicode.put("DH", (compactUnicode ? "\u0FA2" : "\u0FA1\u0FB7"));
superACIP2unicode.put("BH", (compactUnicode ? "\u0F57" : "\u0F56\u0FB7"));
subACIP2unicode.put("BH", (compactUnicode ? "\u0FA7" : "\u0FA6\u0FB7"));
superACIP2unicode.put("dH", (compactUnicode ? "\u0F4D" : "\u0F4C\u0FB7"));
subACIP2unicode.put("dH", (compactUnicode ? "\u0F9D" : "\u0F9C\u0FB7"));
superACIP2unicode.put("DZH", (compactUnicode ? "\u0F5C" : "\u0F5B\u0FB7"));
subACIP2unicode.put("DZH", (compactUnicode ? "\u0FAC" : "\u0FAB\u0FB7"));
superACIP2unicode.put("Ksh", (compactUnicode ? "\u0F69" : "\u0F40\u0FB5"));
subACIP2unicode.put("Ksh", (compactUnicode ? "\u0FB9" : "\u0F90\u0FB5"));
superACIP2unicode.put("GH", (compactUnicode ? "\u0F43" : "\u0F42\u0FB7"));
subACIP2unicode.put("GH", (compactUnicode ? "\u0F93" : "\u0F92\u0FB7"));
// consonants: one entry in each table
superACIP2unicode.put("K", "\u0F40");
subACIP2unicode.put("K", "\u0F90");
superACIP2unicode.put("KH", "\u0F41");
subACIP2unicode.put("KH", "\u0F91");
superACIP2unicode.put("G", "\u0F42");
subACIP2unicode.put("G", "\u0F92");
superACIP2unicode.put("NG", "\u0F44");
subACIP2unicode.put("NG", "\u0F94");
superACIP2unicode.put("C", "\u0F45");
subACIP2unicode.put("C", "\u0F95");
superACIP2unicode.put("CH", "\u0F46");
subACIP2unicode.put("CH", "\u0F96");
superACIP2unicode.put("J", "\u0F47");
subACIP2unicode.put("J", "\u0F97");
superACIP2unicode.put("NY", "\u0F49");
subACIP2unicode.put("NY", "\u0F99");
superACIP2unicode.put("T", "\u0F4F");
subACIP2unicode.put("T", "\u0F9F");
superACIP2unicode.put("TH", "\u0F50");
subACIP2unicode.put("TH", "\u0FA0");
superACIP2unicode.put("D", "\u0F51");
subACIP2unicode.put("D", "\u0FA1");
superACIP2unicode.put("N", "\u0F53");
subACIP2unicode.put("N", "\u0FA3");
superACIP2unicode.put("P", "\u0F54");
subACIP2unicode.put("P", "\u0FA4");
superACIP2unicode.put("PH", "\u0F55");
subACIP2unicode.put("PH", "\u0FA5");
superACIP2unicode.put("B", "\u0F56");
subACIP2unicode.put("B", "\u0FA6");
superACIP2unicode.put("M", "\u0F58");
subACIP2unicode.put("M", "\u0FA8");
superACIP2unicode.put("TZ", "\u0F59");
subACIP2unicode.put("TZ", "\u0FA9");
superACIP2unicode.put("TS", "\u0F5A");
subACIP2unicode.put("TS", "\u0FAA");
superACIP2unicode.put("DZ", "\u0F5B");
subACIP2unicode.put("DZ", "\u0FAB");
superACIP2unicode.put("W", "\u0F5D");
subACIP2unicode.put("W", "\u0FBA"); // oddball
superACIP2unicode.put("ZH", "\u0F5E");
subACIP2unicode.put("ZH", "\u0FAE");
superACIP2unicode.put("Z", "\u0F5F");
subACIP2unicode.put("Z", "\u0FAF");
superACIP2unicode.put("'", "\u0F60");
subACIP2unicode.put("'", "\u0FB0");
superACIP2unicode.put("Y", "\u0F61");
subACIP2unicode.put("Y", "\u0FB1");
superACIP2unicode.put("R", "\u0F62");
subACIP2unicode.put("R", "\u0FB2");
superACIP2unicode.put("L", "\u0F63");
subACIP2unicode.put("L", "\u0FB3");
superACIP2unicode.put("SH", "\u0F64");
subACIP2unicode.put("SH", "\u0FB4");
superACIP2unicode.put("S", "\u0F66");
subACIP2unicode.put("S", "\u0FB6");
superACIP2unicode.put("H", "\u0F67");
subACIP2unicode.put("H", "\u0FB7");
superACIP2unicode.put("A", "\u0F68");
subACIP2unicode.put("A", "\u0FB8");
superACIP2unicode.put("t", "\u0F4A");
subACIP2unicode.put("t", "\u0F9A");
superACIP2unicode.put("th", "\u0F4B");
subACIP2unicode.put("th", "\u0F9B");
superACIP2unicode.put("d", "\u0F4C");
subACIP2unicode.put("d", "\u0F9C");
superACIP2unicode.put("n", "\u0F4E");
subACIP2unicode.put("n", "\u0F9E");
superACIP2unicode.put("sh", "\u0F65");
subACIP2unicode.put("sh", "\u0FB5");
// wowels: only in superACIP2unicode, so the subscribed flag does
// not affect them
superACIP2unicode.put("I", "\u0F72");
superACIP2unicode.put("E", "\u0F7A");
superACIP2unicode.put("O", "\u0F7C");
superACIP2unicode.put("U", "\u0F74");
superACIP2unicode.put("OO", "\u0F7D");
superACIP2unicode.put("EE", "\u0F7B");
superACIP2unicode.put("i", "\u0F80");
superACIP2unicode.put("'A", "\u0F71");
superACIP2unicode.put("'I", "\u0F71\u0F72");
superACIP2unicode.put("'E", "\u0F71\u0F7A");
superACIP2unicode.put("'O", "\u0F71\u0F7C");
superACIP2unicode.put("'U", "\u0F71\u0F74");
superACIP2unicode.put("'OO", "\u0F71\u0F7D");
superACIP2unicode.put("'EE", "\u0F71\u0F7B");
superACIP2unicode.put("'i", "\u0F71\u0F80");
superACIP2unicode.put("Im", "\u0F72\u0F7E");
superACIP2unicode.put("Em", "\u0F7A\u0F7E");
superACIP2unicode.put("Om", "\u0F7C\u0F7E");
superACIP2unicode.put("Um", "\u0F74\u0F7E");
superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
superACIP2unicode.put("im", "\u0F80\u0F7E");
superACIP2unicode.put("'Am", "\u0F71\u0F7E");
superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");
superACIP2unicode.put("I:", "\u0F72\u0F7F");
superACIP2unicode.put("E:", "\u0F7A\u0F7F");
superACIP2unicode.put("O:", "\u0F7C\u0F7F");
superACIP2unicode.put("U:", "\u0F74\u0F7F");
superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
superACIP2unicode.put("i:", "\u0F80\u0F7F");
superACIP2unicode.put("'A:", "\u0F71\u0F7F");
superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");
superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
// :m does not appear, though you'd think it's as valid as m:.
superACIP2unicode.put("m", "\u0F7E");
superACIP2unicode.put(":", "\u0F7F");
superACIP2unicode.put("m:", "\u0F7E\u0F7F");
// "Am", "A:", and "Am:" map to the same Unicode as "m", ":", and "m:".
superACIP2unicode.put("Am", "\u0F7E");
superACIP2unicode.put("A:", "\u0F7F");
superACIP2unicode.put("Am:", "\u0F7E\u0F7F");
// digits
superACIP2unicode.put("0", "\u0F20");
superACIP2unicode.put("1", "\u0F21");
superACIP2unicode.put("2", "\u0F22");
superACIP2unicode.put("3", "\u0F23");
superACIP2unicode.put("4", "\u0F24");
superACIP2unicode.put("5", "\u0F25");
superACIP2unicode.put("6", "\u0F26");
superACIP2unicode.put("7", "\u0F27");
superACIP2unicode.put("8", "\u0F28");
superACIP2unicode.put("9", "\u0F29");
// punctuation
superACIP2unicode.put("&", "\u0F85");
superACIP2unicode.put(",", "\u0F0D");
superACIP2unicode.put(" ", "\u0F0B");
superACIP2unicode.put(".", "\u0F0C");
superACIP2unicode.put("`", "\u0F08");
// NOTE(review): the next line repeats the previous put with the same
// key and value, so it has no effect.
superACIP2unicode.put("`", "\u0F08");
superACIP2unicode.put("*", "\u0F04\u0F05");
superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
superACIP2unicode.put("%", "\u0F35"); // but might be U+0F14, so we warn.
superACIP2unicode.put("o", "\u0F37");
superACIP2unicode.put(";", "\u0F11");
// whitespace and line endings map to themselves
superACIP2unicode.put("\r", "\r");
superACIP2unicode.put("\t", "\t");
superACIP2unicode.put("\r\n", "\r\n");
superACIP2unicode.put("\n", "\n");
superACIP2unicode.put("\\", "\u0F84");
superACIP2unicode.put("^", "\u0F38");
// DLC FIXME: "^ GONG" is "^GONG", right?
// DLC FIXME: what's the Unicode for x? RC said there is none in plain-text Unicode for x. But what about in RTF Unicode?
}
if (subscribed) {
String u = (String)subACIP2unicode.get(acip);
if (null != u) return u;
}
// Not subscribed, or no subscribed form exists: use the main table.
return (String)superACIP2unicode.get(acip);
}
/** Lazily built table from ACIP atoms that are neither consonants
    nor wowels (punctuation, digits, and the like) to EWTS. */
private HashMap acipOther2wylie = null;
/** Returns the EWTS for the ACIP atom acip, or null if acip is not
    in the table.  The table is built on the first call; building
    it also registers the reverse mappings via putMapping. */
public /* synchronized */ String getEwtsForOther(String acip) {
    if (null != acipOther2wylie)
        return (String)acipOther2wylie.get(acip);

    acipOther2wylie = new HashMap(20);
    // don't use putMapping for this.  We don't want TMW->ACIP
    // to produce "." for a U+0F0C because ACIP doesn't say
    // that "." means U+0F0C.  It just seems to in practice
    // for ACIP Release IV texts.
    acipOther2wylie.put(".", "*");

    // { ACIP, EWTS } pairs, registered in this order.
    // There is no glyph in TMW with the EWTS @##, so there is
    // deliberately no { "#", "@##" } row.
    final String[][] acipAndEwts = {
        { "m", "M" },
        { ":", "H" },
        { ",", "/" },
        { " ", " " },
        { ";", "|" },
        { "`", "!" },
        { "*", "@#" },
        { "%", "~X" },
        { "o", "X" },
        { "&", "&" },
        { "^", "\\u0F38" },
        { "0", "0" },
        { "1", "1" },
        { "2", "2" },
        { "3", "3" },
        { "4", "4" },
        { "5", "5" },
        { "6", "6" },
        { "7", "7" },
        { "8", "8" },
        { "9", "9" }
    };
    for (int i = 0; i < acipAndEwts.length; i++)
        putMapping(acipOther2wylie, acipAndEwts[i][0], acipAndEwts[i][1]);

    return (String)acipOther2wylie.get(acip);
}
/** Returns the ACIP scanner, ACIPTshegBarScanner.instance(). */
public TTshegBarScanner scanner() {
    return ACIPTshegBarScanner.instance();
}
/** A map from EWTS to ACIP.  Note that the EWTS "w" maps to both
    "V" and "W" in reality but this map will only give one or the
    other. */
private HashMap wylieToACIP = null;
/** Records the ACIP-to-EWTS mapping in toWylie and the reverse,
    EWTS-to-ACIP, mapping in {@link #wylieToACIP}.  The first call
    creates wylieToACIP and seeds it with a few reverse-only
    entries. */
private /* synchronized */ void putMapping(HashMap toWylie, String ACIP, String EWTS) {
    toWylie.put(ACIP, EWTS);
    HashMap reverse = wylieToACIP;
    if (null == reverse) {
        reverse = new HashMap(75);
        // We don't want to put "/" in toWylie:
        reverse.put("(", "/");
        reverse.put(")", "/");
        reverse.put("?", "\\");
        reverse.put("_", " "); // oddball.
        reverse.put("o'i", "O'I"); // oddball for TMW9.61.
        wylieToACIP = reverse;
    }
    reverse.put(EWTS, ACIP);
}
/** Returns the ACIP transliteration corresponding to the THDL
    Extended Wylie <em>atom</em> EWTS, or null if EWTS is null or
    is not recognized.  A whole-atom lookup in wylieToACIP is tried
    first; failing that, EWTS is split on "-" and "+" and each part
    is translated separately, with the delimiters copied through.
    If EWTS starts with "w", the first character of the result is
    forced to 'W'.
    @param EWTS the EWTS atom, possibly a stack such as "w+W"
    @return the ACIP, or null if EWTS is null or any part of it is
    unknown */
public String getACIPForEWTS(String EWTS) {
    getEwtsForConsonant(null); // inits wylieToACIP
    getEwtsForOther(null); // inits wylieToACIP
    getEwtsForWowel(null); // inits wylieToACIP
    if (null == EWTS)
        return null; // not recognized, rather than a NullPointerException below

    // We want W+NA, not V+NA; we want WA, not VA.
    final boolean useCapitalW = EWTS.startsWith("w");

    String ans = (String)wylieToACIP.get(EWTS);
    if (null != ans) {
        if (useCapitalW)
            return "W" + ans.substring(1);
        else
            return ans;
    }

    StringBuffer finalAns = new StringBuffer(EWTS.length());
    StringTokenizer sTok = new StringTokenizer(EWTS, "-+", true);
    while (sTok.hasMoreTokens()) {
        String part, tok = sTok.nextToken();
        if (tok.equals("-") || tok.equals("+"))
            part = tok;
        else {
            if ("w".equals(tok)) {
                // There are only two stacks in TMW that have
                // U+0FBA: R+Wa and w+Wa.  TMW->ACIP fails for
                // these unless we handle it here.  (FIXME:
                // add an automated test for this).
                // NOTE(review): the tokens of "R+W" are "R", "+",
                // and "W", none of which is "w", so the first
                // alternative below looks unreachable from this
                // branch -- confirm the intent.
                if ("R+W".equals(EWTS) || "w+W".equals(EWTS)) {
                    part = "W";
                } else {
                    part = "V";
                }
            } else {
                part = (String)wylieToACIP.get(tok);
            }
        }
        if (null == part) return null;
        finalAns.append(part);
    }
    // useCapitalW implies EWTS is non-empty and does not start with a
    // delimiter, so finalAns has at least one character here.
    if (useCapitalW)
        finalAns.setCharAt(0, 'W');
    return finalAns.toString();
}
/** Lazily built table from ACIP consonants to EWTS. */
private HashMap acipConsonant2wylie = null;
/** Returns the EWTS for the ACIP consonant acip, or null if acip is
 *  not an ACIP consonant.  Gives "W" for ACIP "W", "r" for ACIP
 *  "R", and "y" for ACIP "Y", even though sometimes the EWTS for
 *  those is "w", "R", or "Y".  Handle that in the caller.  The
 *  table is built on the first call; building it also registers
 *  the reverse mappings via putMapping. */
public /* synchronized */ String getEwtsForConsonant(String acip) {
    if (null == acipConsonant2wylie) {
        // { ACIP, EWTS } pairs, registered in this order.
        final String[][] acipAndEwts = {
            // oddball:
            { "V", "w" },
            // more oddballs:
            { "DH", "d+h" },
            { "BH", "b+h" },
            { "dH", "D+h" },
            { "DZH", "dz+h" }, // longest, MAX_CONSONANT_LENGTH characters
            { "Ksh", "k+Sh" }, // longest, MAX_CONSONANT_LENGTH characters
            { "GH", "g+h" },
            { "K", "k" },
            { "KH", "kh" },
            { "G", "g" },
            { "NG", "ng" },
            { "C", "c" },
            { "CH", "ch" },
            { "J", "j" },
            { "NY", "ny" },
            { "T", "t" },
            { "TH", "th" },
            { "D", "d" },
            { "N", "n" },
            { "P", "p" },
            { "PH", "ph" },
            { "B", "b" },
            { "M", "m" },
            { "TZ", "ts" },
            { "TS", "tsh" },
            { "DZ", "dz" },
            /* NOTE WELL: sometimes "w", sometimes "W".
               Handle this in the caller.
               Reasoning for "W" instead of "w": r-w and
               r+w are both known hash keys.  We sort 'em
               out this way.  (They are the only things
               like this according to bug report #800166.) */
            { "W", "W" },
            { "ZH", "zh" },
            { "Z", "z" },
            { "'", "'" },
            { "Y", "y" },
            { "R", "r" },
            { "L", "l" },
            { "SH", "sh" },
            { "S", "s" },
            { "H", "h" },
            { "A", "a" },
            { "t", "T" },
            { "th", "Th" },
            { "d", "D" },
            { "n", "N" },
            { "sh", "Sh" }
        };
        acipConsonant2wylie = new HashMap(37);
        for (int i = 0; i < acipAndEwts.length; i++)
            putMapping(acipConsonant2wylie, acipAndEwts[i][0], acipAndEwts[i][1]);
    }
    return (String)acipConsonant2wylie.get(acip);
}
/** Lazily built table from ACIP wowels to EWTS. */
private HashMap acipWowel2wylie = null;
/** Returns the EWTS for the ACIP wowel acip, or null if acip is not
    an ACIP wowel.  The table is built on the first call from
    baseVowels: each base vowel is registered plain and with a
    leading apostrophe, and each of those again with the endings
    "m", ":", and "m:" (EWTS "M", "H", and "MH"). */
public /* synchronized */ String getEwtsForWowel(String acip) {
    if (null != acipWowel2wylie)
        return (String)acipWowel2wylie.get(acip);

    acipWowel2wylie = new HashMap(baseVowels.length * 4);
    for (int row = 0; row < baseVowels.length; row++) {
        final String plain = baseVowels[row][0];
        final String quoted = "'" + plain;
        final String ewtsPlain = baseVowels[row][1];
        final String ewtsQuoted = baseVowels[row][2];

        putMapping(acipWowel2wylie, plain, ewtsPlain);
        putMapping(acipWowel2wylie, quoted, ewtsQuoted);
        putMapping(acipWowel2wylie, plain + "m", ewtsPlain + "M");
        putMapping(acipWowel2wylie, quoted + "m", ewtsQuoted + "M");
        putMapping(acipWowel2wylie, plain + ":", ewtsPlain + "H");
        putMapping(acipWowel2wylie, quoted + ":", ewtsQuoted + "H");
        putMapping(acipWowel2wylie, plain + "m:", ewtsPlain + "MH");
        putMapping(acipWowel2wylie, quoted + "m:", ewtsQuoted + "MH");
    }
    // {Pm} is treated just like {PAm}; {P:} is treated just
    // like {PA:}; {Pm:} is treated just like {PAm:}.  But
    // that happens thanks to
    // TPairListFactory.getFirstConsonantAndVowel(StringBuffer,int[]).
    // Keep this code in sync with getUnicodeFor.
    return (String)acipWowel2wylie.get(acip);
}
/** {Ksh}, the longest consonant, has 3 characters, so this is
 *  three. */
private static final int MAX_CONSONANT_LENGTH = 3;
/** {'EEm:}, the longest wowel, has 5 characters, so this is
 *  five. */
private static final int MAX_WOWEL_LENGTH = 5;
/** The base ACIP vowels.  Each row has exactly three elements:
 *  { ACIP, EWTS for that ACIP, EWTS for the ACIP with a leading
 *  apostrophe, i.e. {'\'' + ACIP} }.  The array is never
 *  reassigned; do not modify its contents either. */
private static final String[][] baseVowels = new String[][] {
    { "A", "a", "A" },
    { "I", "i", "I" },
    { "U", "u", "U" },
    { "E", "e", "Ae" },
    { "O", "o", "Ao" },
    { "EE", "ai", "Aai" },
    { "OO", "au", "Aau" },
    { "i", "-i", "A-i" }
};
/** Tells whether s is an ACIP wowel, i.e. whether getEwtsForWowel
 *  knows it.  You can't just call this any time -- A is both a
 *  consonant and a vowel in ACIP, so you have to call this in the
 *  right context. */
public boolean isWowel(String s) {
    // I'm on my own with 'O and 'E and 'OO and 'EE, but GANG'O
    // appears and I wonder... so here they are.  It's consistent
    // with 'I and 'A and 'U, at least: all the vowels may appear
    // as K'vowel.  DLC FIXME: ask.
    final String ewts = getEwtsForWowel(s);
    return ewts != null;
}
/** Tells whether s is an ACIP consonant, i.e. whether
 *  getEwtsForConsonant knows it. */
public boolean isConsonant(String s) {
    final String ewts = getEwtsForConsonant(s);
    return ewts != null;
}
/** Gets the duffcodes for wowel, such that they look good with
 * the preceding glyph, and appends them to duff. Does nothing if
 * wowel is null.
 * @param duff the list to which the glyphs are appended
 * @param preceding the glyph passed as context to TibTextUtils.getVowel
 * @param wowel an ACIP wowel known to getEwtsForWowel, or null
 * @throws IllegalArgumentException if wowel is non-null and
 * getEwtsForWowel(wowel) is null */
public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
if (null == wowel) return;
if (null == getEwtsForWowel(wowel)) // FIXME: expensive assertion! Use assert.
throw new IllegalArgumentException("Wowel " + wowel + " isn't in the small set of wowels we handle correctly.");
// Order matters here.
// context_added is a one-element array shared by every getVowel call below.
boolean context_added[] = new boolean[] { false };
if (wowel.startsWith("A")) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.WYLIE_aVOWEL, context_added);
} else if (wowel.indexOf("'U") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.U_VOWEL, context_added);
} else if (wowel.indexOf("'I") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.I_VOWEL, context_added);
} else {
// Any other wowel may contribute several pieces: the A_VOWEL for
// a leading apostrophe, then at most one of EE/E, at most one of
// OO/O, and then I, U, and i independently.
if (wowel.indexOf('\'') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.A_VOWEL, context_added);
}
if (wowel.indexOf("EE") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.ai_VOWEL, context_added);
} else if (wowel.indexOf('E') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.e_VOWEL, context_added);
}
if (wowel.indexOf("OO") >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.au_VOWEL, context_added);
} else if (wowel.indexOf('O') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.o_VOWEL, context_added);
}
if (wowel.indexOf('I') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.i_VOWEL, context_added);
}
if (wowel.indexOf('U') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.u_VOWEL, context_added);
}
if (wowel.indexOf('i') >= 0) {
TibTextUtils.getVowel(duff, preceding, THDLWylieConstants.reverse_i_VOWEL, context_added);
}
}
// FIXME: Use TMW9.61, the "o'i" special combination, when appropriate.
// An 'm' in the wowel: the last glyph in duff is taken off and handed
// to getBindu.
// NOTE(review): this assumes duff is non-empty here -- confirm callers
// guarantee that.
if (wowel.indexOf('m') >= 0) {
DuffCode last = (DuffCode)duff.get(duff.size() - 1);
duff.remove(duff.size() - 1); // getBindu will add it back...
TibTextUtils.getBindu(duff, last);
}
// A ':' in the wowel appends the glyph for the EWTS that
// getEwtsForOther gives for ":".
if (wowel.indexOf(':') >= 0)
duff.add(TibetanMachineWeb.getGlyph(getEwtsForOther(":")));
}
} }

View file

@ -18,11 +18,10 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import java.io.*; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Stack; import java.util.Stack;
import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions; import org.thdl.util.ThdlOptions;
/** /**
@ -36,8 +35,10 @@ import org.thdl.util.ThdlOptions;
* the parser, not here in the lexical analyzer. That'd be cleaner, * the parser, not here in the lexical analyzer. That'd be cleaner,
* and more like how you'd do things if you used lex and yacc. * and more like how you'd do things if you used lex and yacc.
* *
* This is not public because you should use {@link ACIPTraits#scanner()}.
*
* @author David Chandler */ * @author David Chandler */
public class ACIPTshegBarScanner extends TTshegBarScanner { class ACIPTshegBarScanner extends TTshegBarScanner {
/** True if those ACIP snippets inside square brackets (e.g., /** True if those ACIP snippets inside square brackets (e.g.,
"[THIS]") are to be passed through into the output unmodified "[THIS]") are to be passed through into the output unmodified
while retaining the brackets and if those ACIP snippets inside while retaining the brackets and if those ACIP snippets inside

View file

@ -18,11 +18,14 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import org.thdl.tib.text.DuffCode;
/** A singleton class that should contain (but due to laziness and /** A singleton class that should contain (but due to laziness and
* ignorance probably does not contain) all the traits that make EWTS * ignorance probably does not contain) all the traits that make EWTS
* transliteration different from other (say, ACIP) transliteration * transliteration different from other (say, ACIP) transliteration
* schemes. */ * schemes. */
final class EWTSTraits implements TTraits { public final class EWTSTraits implements TTraits {
/** sole instance of this class */ /** sole instance of this class */
private static EWTSTraits singleton = null; private static EWTSTraits singleton = null;
@ -30,7 +33,7 @@ final class EWTSTraits implements TTraits {
private EWTSTraits() { } private EWTSTraits() { }
/** */ /** */
public static EWTSTraits instance() { public static synchronized EWTSTraits instance() {
if (null == singleton) { if (null == singleton) {
singleton = new EWTSTraits(); singleton = new EWTSTraits();
} }
@ -79,4 +82,48 @@ final class EWTSTraits implements TTraits {
|| "H".equals(s) || "H".equals(s)
|| "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:??? || "M".equals(s)); // TODO(DLC)[EWTS->Tibetan]:???
} }
/** Returns "a". */
public String aVowel() {
    return "a";
}
/** Returns true if and only if s is "s" or "d", the EWTS
    postsuffixes.  A null s yields false. */
public boolean isPostsuffix(String s) {
    if ("s".equals(s))
        return true;
    return "d".equals(s);
}
/** Returns true if and only if l is one of the EWTS prefixes ', m,
    b, d, or g.  A null l yields false. */
public boolean isPrefix(String l) {
    if ("'".equals(l)) return true;
    if ("m".equals(l)) return true;
    if ("b".equals(l)) return true;
    if ("d".equals(l)) return true;
    return "g".equals(l);
}
/** Returns true if and only if l is one of the EWTS suffixes s, g,
    d, m, ', b, ng, n, l, or r.  A null l yields false. */
public boolean isSuffix(String l) {
    final String[] suffixes = { "s", "g", "d", "m", "'", "b", "ng", "n", "l", "r" };
    for (int i = 0; i < suffixes.length; i++) {
        if (suffixes[i].equals(l))
            return true;
    }
    return false;
}
/** Returns l unchanged: in the EWTS traits class the input is
    already EWTS. */
public String getEwtsForConsonant(String l) {
    return l;
}
/** Returns l unchanged: in the EWTS traits class the input is
    already EWTS. */
public String getEwtsForOther(String l) {
    return l;
}
/** Returns l unchanged: in the EWTS traits class the input is
    already EWTS. */
public String getEwtsForWowel(String l) {
    return l;
}
/** Returns the EWTS scanner, EWTSTshegBarScanner.instance(). */
public TTshegBarScanner scanner() {
    return EWTSTshegBarScanner.instance();
}
/** Not implemented yet for EWTS; always throws.
    @throws Error always */
public void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel) {
    final String todo = "TODO(DLC)[EWTS->Tibetan]";
    throw new Error(todo);
}
/** Not implemented yet for EWTS; always throws.
    @throws Error always */
public String getUnicodeFor(String l, boolean subscribed) {
    final String todo = "TODO(DLC)[EWTS->Tibetan]";
    throw new Error(todo);
}
} }

View file

@ -0,0 +1,56 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).
Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.
The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.
Contributor(s): ______________________________________.
*/
package org.thdl.tib.text.ttt;
import java.util.ArrayList;
/**
 * This singleton class is meant to break up Strings of EWTS text (for
 * example, an entire sutra file) into tsheg bars, comments, etc.
 * Non-Tibetan parts are to be segregated (so that consumers can
 * ensure that they remain non-Tibetan), and Tibetan passages are to
 * be broken up into tsheg bars.  Scanning is not implemented yet;
 * {@link #scan} always throws.
 *
 * This is not public because you should use {@link EWTSTraits#scanner()}.
 *
 * @author David Chandler */
class EWTSTshegBarScanner extends TTshegBarScanner {
    /** sole instance of this class, created on first use */
    private static EWTSTshegBarScanner singleton = null;

    /** non-public because this is a singleton */
    protected EWTSTshegBarScanner() { }

    /** Returns the sole instance of this class. */
    public synchronized static EWTSTshegBarScanner instance() {
        if (null == singleton) {
            singleton = new EWTSTshegBarScanner();
        }
        return singleton;
    }

    /** See the comment in TTshegBarScanner.  This does not find
        errors and warnings that you'd think of a parser finding (DLC
        DOES IT?).  Not implemented yet: this always throws,
        whatever the arguments, including a null s.
        @throws Error always */
    public ArrayList scan(String s, StringBuffer errors, int maxErrors,
                          boolean shortMessages, String warningLevel) {
        // Note for the implementer: the size of the result depends on
        // whether the text is mostly Tibetan or mostly Latin and a
        // number of other factors.  s.length() / 10 is meant to be an
        // underestimate, but not too much of an underestimate, for
        // the initial capacity of the returned list.
        throw new Error("DLC unimplemented");
    }
}

View file

@ -202,7 +202,7 @@ public class PackageTest extends TestCase {
message. */ message. */
static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) { static String ACIP2TMW2Translit(boolean EWTSNotACIP, String ACIP) {
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.instance().scan(ACIP, errors, -1, ArrayList al = ACIPTraits.instance().scanner().scan(ACIP, errors, -1,
false, "None"); false, "None");
if (null == al || errors.length() > 0) if (null == al || errors.length() > 0)
return null; return null;
@ -210,7 +210,8 @@ public class PackageTest extends TestCase {
= new org.thdl.tib.text.TibetanDocument(); = new org.thdl.tib.text.TibetanDocument();
int loc[] = new int[] { 0 }; int loc[] = new int[] { 0 };
try { try {
if (!TConverter.convertToTMW(al, if (!TConverter.convertToTMW(ACIPTraits.instance(),
al,
tdoc, tdoc,
null, null,
null, null,
@ -7358,7 +7359,7 @@ tstHelper("ZUR");
private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) { private static void shelp(String s, String expectedErrors, String expectedScan, String warningLevel) {
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
ArrayList al = ACIPTshegBarScanner.instance().scan(s, errors, -1, false, ArrayList al = ACIPTraits.instance().scanner().scan(s, errors, -1, false,
warningLevel); warningLevel);
if (null != expectedScan) { if (null != expectedScan) {
if (!al.toString().equals(expectedScan)) { if (!al.toString().equals(expectedScan)) {
@ -7392,7 +7393,7 @@ tstHelper("ZUR");
/** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer, /** Tests {@link ACIPTshegBarScanner#scan(String, StringBuffer,
int, boolean)}. */ int, boolean)}. */
public void testScanner() { public void testAcipScanner() {
shelp("Pm KA", "", "[TIBETAN_NON_PUNCTUATION:{Pm}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{KA}]"); shelp("Pm KA", "", "[TIBETAN_NON_PUNCTUATION:{Pm}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{KA}]");
shelp("KA (KHA\nGA)", "", "[TIBETAN_NON_PUNCTUATION:{KA}, TIBETAN_PUNCTUATION:{ }, START_PAREN:{(}, TIBETAN_NON_PUNCTUATION:{KHA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GA}, END_PAREN:{)}]"); shelp("KA (KHA\nGA)", "", "[TIBETAN_NON_PUNCTUATION:{KA}, TIBETAN_PUNCTUATION:{ }, START_PAREN:{(}, TIBETAN_NON_PUNCTUATION:{KHA}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{GA}, END_PAREN:{)}]");
@ -7682,7 +7683,8 @@ tstHelper("ZUR");
private static void uhelp(String acip, String expectedUnicode, private static void uhelp(String acip, String expectedUnicode,
String warningLevel, boolean shortMessages) { String warningLevel, boolean shortMessages) {
StringBuffer errors = new StringBuffer(); StringBuffer errors = new StringBuffer();
String unicode = TConverter.convertToUnicodeText("ACIP", acip, errors, String unicode = TConverter.convertToUnicodeText(ACIPTraits.instance(),
acip, errors,
null, true, null, true,
warningLevel, warningLevel,
shortMessages); shortMessages);

View file

@ -69,7 +69,7 @@ public class TConverter {
boolean shortMessages = false; boolean shortMessages = false;
String warningLevel = "Most"; String warningLevel = "Most";
ArrayList al ArrayList al
= ACIPTshegBarScanner.instance().scanFile(args[0], errors, = ACIPTraits.instance().scanner().scanFile(args[0], errors,
maxErrors - 1, maxErrors - 1,
shortMessages, shortMessages,
warningLevel); warningLevel);
@ -103,8 +103,9 @@ public class TConverter {
warnings = new StringBuffer(); warnings = new StringBuffer();
putWarningsInOutput = true; putWarningsInOutput = true;
} }
convertToTMW(al, System.out, errors, warnings, null, convertToTMW(ACIPTraits.instance(), al, System.out, errors, warnings,
putWarningsInOutput, warningLevel, shortMessages, colors); null, putWarningsInOutput, warningLevel, shortMessages,
colors);
int retCode = 0; int retCode = 0;
if (errors.length() > 0) { if (errors.length() > 0) {
System.err.println("Errors converting ACIP input file: "); System.err.println("Errors converting ACIP input file: ");
@ -139,7 +140,8 @@ public class TConverter {
* prefix rules in another * prefix rules in another
* @throws IOException if we cannot write to out * @throws IOException if we cannot write to out
*/ */
public static boolean convertToTMW(ArrayList scan, public static boolean convertToTMW(TTraits ttraits,
ArrayList scan,
OutputStream out, OutputStream out,
StringBuffer errors, StringBuffer errors,
StringBuffer warnings, StringBuffer warnings,
@ -152,7 +154,8 @@ public class TConverter {
{ {
TibetanDocument tdoc = new TibetanDocument(); TibetanDocument tdoc = new TibetanDocument();
boolean rv boolean rv
= convertToTMW(scan, tdoc, errors, warnings, hasWarnings, = convertToTMW(ttraits,
scan, tdoc, errors, warnings, hasWarnings,
writeWarningsToResult, warningLevel, writeWarningsToResult, warningLevel,
shortMessages, colors, shortMessages, colors,
new int[] { tdoc.getLength() }); new int[] { tdoc.getLength() });
@ -169,7 +172,8 @@ public class TConverter {
offset from zero inside tdoc at which conversion results will offset from zero inside tdoc at which conversion results will
be placed. On output, loc[0] is one past the offset of the be placed. On output, loc[0] is one past the offset of the
last of the conversion results. */ last of the conversion results. */
public static boolean convertToTMW(ArrayList scan, public static boolean convertToTMW(TTraits ttraits,
ArrayList scan,
TibetanDocument tdoc, TibetanDocument tdoc,
StringBuffer errors, StringBuffer errors,
StringBuffer warnings, StringBuffer warnings,
@ -181,7 +185,8 @@ public class TConverter {
int[] loc) int[] loc)
throws IOException throws IOException
{ {
return convertTo(false, true, scan, null, tdoc, errors, warnings, return convertTo(false, true,
ttraits, scan, null, tdoc, errors, warnings,
hasWarnings, writeWarningsToResult, warningLevel, hasWarnings, writeWarningsToResult, warningLevel,
shortMessages, colors, loc, shortMessages, colors, loc,
loc[0] == tdoc.getLength()); loc[0] == tdoc.getLength());
@ -189,33 +194,30 @@ public class TConverter {
/** Returns UTF-8 encoded Unicode. A bit indirect, so use this /** Returns UTF-8 encoded Unicode. A bit indirect, so use this
* for testing only if performance is a concern. If errors occur * for testing only if performance is a concern. If errors occur
* in scanning the ACIP or in converting a tsheg bar, then they * in scanning the transliteration or in converting a tsheg bar,
* are appended to errors if errors is non-null, as well as * then they are appended to errors if errors is non-null, as
* written to the result. If warnings occur in scanning the ACIP * well as written to the result. If warnings occur in scanning
* or in converting a tsheg bar, then they are appended to * the transliteration or in converting a tsheg bar, then they
* warnings if warnings is non-null, and they are written to the * are appended to warnings if warnings is non-null, and they are
* result if writeWarningsToResult is true. Error and warning * written to the result if writeWarningsToResult is true. Error
* messages are long and self-contained unless shortMessages is * and warning messages are long and self-contained unless
* true. Returns the conversion upon perfect success or if there * shortMessages is true. Returns the conversion upon perfect
* were merely warnings, null if errors occurred. */ * success or if there were merely warnings, null if errors
public static String convertToUnicodeText(String transliteration, * occurred. */
String acip, public static String convertToUnicodeText(TTraits ttraits,
String translit,
StringBuffer errors, StringBuffer errors,
StringBuffer warnings, StringBuffer warnings,
boolean writeWarningsToResult, boolean writeWarningsToResult,
String warningLevel, String warningLevel,
boolean shortMessages) { boolean shortMessages) {
if (transliteration != "ACIP") {
ThdlDebug.noteIffyCode();
throw new IllegalArgumentException("Unsupported transliteration");
}
ByteArrayOutputStream sw = new ByteArrayOutputStream(); ByteArrayOutputStream sw = new ByteArrayOutputStream();
ArrayList al ArrayList al
= ACIPTshegBarScanner.instance().scan(acip, errors, -1, = ttraits.scanner().scan(translit, errors, -1, shortMessages,
shortMessages, warningLevel); warningLevel);
try { try {
if (null != al) { if (null != al) {
convertToUnicodeText(al, sw, errors, convertToUnicodeText(ttraits, al, sw, errors,
warnings, null, writeWarningsToResult, warnings, null, writeWarningsToResult,
warningLevel, shortMessages); warningLevel, shortMessages);
return sw.toString("UTF-8"); return sw.toString("UTF-8");
@ -236,7 +238,8 @@ public class TConverter {
* writeWarningsToOut is true, then warnings also will be written * writeWarningsToOut is true, then warnings also will be written
* to out. * to out.
* @return true upon perfect success, false if errors occurred. * @return true upon perfect success, false if errors occurred.
* @param scan result of ACIPTshegBarScanner.scan(..) * @param scan result of using ttraits.scanner() to break up the
* original string of transliteration
* @param out stream to which to write converted text * @param out stream to which to write converted text
* @param errors if non-null, all error messages are appended * @param errors if non-null, all error messages are appended
* @param warnings if non-null, all warning messages appropriate * @param warnings if non-null, all warning messages appropriate
@ -246,9 +249,9 @@ public class TConverter {
* false otherwise * false otherwise
* @param writeWarningsToOut if true, then all warning messages * @param writeWarningsToOut if true, then all warning messages
* are written to out in the appropriate places * are written to out in the appropriate places
* @throws IOException if we cannot write to out * @throws IOException if we cannot write to out */
*/ public static boolean convertToUnicodeText(TTraits ttraits,
public static boolean convertToUnicodeText(ArrayList scan, ArrayList scan,
OutputStream out, OutputStream out,
StringBuffer errors, StringBuffer errors,
StringBuffer warnings, StringBuffer warnings,
@ -258,7 +261,8 @@ public class TConverter {
boolean shortMessages) boolean shortMessages)
throws IOException throws IOException
{ {
return convertTo(true, false, scan, out, null, errors, warnings, return convertTo(true, false,
ttraits, scan, out, null, errors, warnings,
hasWarnings, writeWarningsToOut, warningLevel, hasWarnings, writeWarningsToOut, warningLevel,
shortMessages, false, new int[] { -1 } , true); shortMessages, false, new int[] { -1 } , true);
} }
@ -283,6 +287,7 @@ public class TConverter {
private static boolean convertTo(boolean toUnicode, // else to TMW private static boolean convertTo(boolean toUnicode, // else to TMW
boolean toRTF, // else to UTF-8-encoded text boolean toRTF, // else to UTF-8-encoded text
TTraits ttraits,
ArrayList scan, ArrayList scan,
OutputStream out, // for (toUnicode && !toRTF) mode OutputStream out, // for (toUnicode && !toRTF) mode
TibetanDocument tdoc, // for !toUnicode mode or (toUnicode && toRTF) mode TibetanDocument tdoc, // for !toUnicode mode or (toUnicode && toRTF) mode
@ -368,7 +373,7 @@ public class TConverter {
if (lastGuyWasNonPunct) { if (lastGuyWasNonPunct) {
String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText()) + "]"; String err = "[#ERROR " + ErrorsAndWarnings.getMessage(133, shortMessages, s.getText()) + "]";
if (null != writer) { if (null != writer) {
String uni = ACIPRules.getUnicodeFor(s.getText(), false); String uni = ttraits.getUnicodeFor(s.getText(), false);
if (null == uni) { if (null == uni) {
hasErrors = true; hasErrors = true;
uni = err; uni = err;
@ -377,7 +382,7 @@ public class TConverter {
} }
if (null != tdoc) { if (null != tdoc) {
String wylie String wylie
= ACIPRules.getWylieForACIPOther(s.getText()); = ttraits.getEwtsForOther(s.getText());
if (null == wylie) { if (null == wylie) {
hasErrors = true; hasErrors = true;
tdoc.appendRoman(tdocLocation[0], err, Color.RED); tdoc.appendRoman(tdocLocation[0], err, Color.RED);
@ -658,7 +663,7 @@ public class TConverter {
} }
if (!done) { if (!done) {
if (null != writer) unicode = ACIPRules.getUnicodeFor(s.getText(), false); if (null != writer) unicode = ttraits.getUnicodeFor(s.getText(), false);
if (null != tdoc) { if (null != tdoc) {
if (s.getText().equals("\r") if (s.getText().equals("\r")
|| s.getText().equals("\t") || s.getText().equals("\t")
@ -675,7 +680,7 @@ public class TConverter {
TibetanMachineWeb.getGlyph("#") TibetanMachineWeb.getGlyph("#")
}; // hard-coded EWTS values }; // hard-coded EWTS values
} else { } else {
String wy = ACIPRules.getWylieForACIPOther(s.getText()); String wy = ttraits.getEwtsForOther(s.getText());
if (null == wy) throw new Error("No wylie for ACIP " + s.getText()); if (null == wy) throw new Error("No wylie for ACIP " + s.getText());
duff = new Object[] { TibetanMachineWeb.getGlyph(wy) }; duff = new Object[] { TibetanMachineWeb.getGlyph(wy) };
} }

View file

@ -26,22 +26,27 @@ import java.util.ArrayList;
/** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The /** An ordered pair used in ACIP/EWTS-to-TMW/Unicode conversion. The
* left side is the consonant or empty; the right side is either the * left side is the consonant or empty; the right side is either the
* vowel or '+' (indicating stacking) or a disambiguator (i.e., '-' * vowel or '+' (indicating stacking in both ACIP and EWTS) or a
* in ACIP or '.' in EWTS). * disambiguator (e.g., '-' in ACIP or '.' in EWTS).
* @author David Chandler */ * @author David Chandler */
/* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */ /* BIG FIXME: make this package work for EWTS, not just ACIP. (TODO(DLC)[EWTS->Tibetan]: does it?) */
class TPair { class TPair {
/** The left side, or null if there is no left side. That is, the /** the part that knows ACIP from EWTS */
* non-vowel, non-'m', non-':', non-'-', non-'+' guy. */ private TTraits traits;
/** Returns the part that knows ACIP from EWTS. */
public TTraits getTraits() { return traits; }
/** The left side, or null if there is no left side. I.e., the
* non-wowel, non-disambiguator, non-'+' guy. */
private String l; private String l;
String getLeft() { String getLeft() {
ThdlDebug.verify(!"".equals(l)); ThdlDebug.verify(!"".equals(l));
return l; return l;
} }
/** The right side. That is, the vowel, with 'm' or ':' "vowel" /** The right side. That is, the wowel or disambiguator or "+"
* after it if appropriate, or "-" (disambiguator), or "+" * (for stacking) or null otherwise. */
* (stacking), or null otherwise. */
private String r; private String r;
String getRight() { String getRight() {
ThdlDebug.verify(!"".equals(r)); ThdlDebug.verify(!"".equals(r));
@ -50,13 +55,14 @@ class TPair {
/** Constructs a new TPair with left side l and right side r. /** Constructs a new TPair with left side l and right side r.
* Use null or the empty string to represent an absence. */ * Use null or the empty string to represent an absence. */
TPair(String l, String r) { TPair(TTraits traits, String l, String r) {
// Normalize: // Normalize:
if (null != l && l.equals("")) l = null; if (null != l && l.equals("")) l = null;
if (null != r && r.equals("")) r = null; if (null != r && r.equals("")) r = null;
this.l = l; this.l = l;
this.r = r; this.r = r;
this.traits = traits;
} }
/** Returns a nice String representation. Returns "(D . E)" for /** Returns a nice String representation. Returns "(D . E)" for
@ -67,8 +73,8 @@ class TPair {
+ ((null == r) ? "" : r) + ")"; + ((null == r) ? "" : r) + ")";
} }
/** Returns the number of ACIP characters that make up this /** Returns the number of transliteration characters that make up
* TPair. */ * this TPair. */
int size() { int size() {
return (((l == null) ? 0 : l.length()) return (((l == null) ? 0 : l.length())
+ ((r == null) ? 0 : r.length())); + ((r == null) ? 0 : r.length()));
@ -98,18 +104,18 @@ class TPair {
sz = l.length(); sz = l.length();
newL = l.substring(0, sz - N); newL = l.substring(0, sz - N);
} }
return new TPair(newL, newR); return new TPair(traits, newL, newR);
} }
/** Returns true if and only if this is nonempty and is l, if /** Returns true if and only if this is nonempty and if l, if
* present, is a legal ACIP consonant, and is r, if present, is a * present, is a legal consonant, and if r, if present, is a
* legal ACIP vowel. */ * legal wowel. */
boolean isLegal() { boolean isLegal() {
if (size() < 1) if (size() < 1)
return false; return false;
if (null != l && !ACIPRules.isConsonant(l)) if (null != l && !traits.isConsonant(l))
return false; return false;
if (null != r && !ACIPRules.isWowel(r)) if (null != r && !traits.isWowel(r))
return false; return false;
return true; return true;
} }
@ -119,9 +125,9 @@ class TPair {
boolean isPrefix() { boolean isPrefix() {
return (null != l return (null != l
&& ((null == r || "".equals(r)) && ((null == r || "".equals(r))
|| "-".equals(r) // TODO(DLC)[EWTS->Tibetan] || traits.disambiguator().equals(r)
|| "A".equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common || traits.aVowel().equals(r)) // FIXME: though check for BASKYABS and warn because BSKYABS is more common
&& ACIPRules.isACIPPrefix(l)); && traits.isPrefix(l));
} }
/** Returns true if and only if this pair could be a Tibetan /** Returns true if and only if this pair could be a Tibetan
@ -129,25 +135,25 @@ class TPair {
boolean isPostSuffix() { boolean isPostSuffix() {
return (null != l return (null != l
&& ((null == r || "".equals(r)) && ((null == r || "".equals(r))
|| "-".equals(r) || traits.disambiguator().equals(r)
|| "A".equals(r)) // FIXME: though warn about GAMASA vs. GAMS || traits.aVowel().equals(r)) // FIXME: though warn about GAMASA vs. GAMS
&& ACIPRules.isACIPPostsuffix(l)); && traits.isPostsuffix(l));
} }
/** Returns true if and only if this pair could be a Tibetan /** Returns true if and only if this pair could be a Tibetan
* suffix. FIXME: ACIP specific, just like isPostSuffix() and isPrefix() */ * suffix. */
boolean isSuffix() { boolean isSuffix() {
return (null != l return (null != l
&& ((null == r || "".equals(r)) && ((null == r || "".equals(r))
|| "-".equals(r) || traits.disambiguator().equals(r)
|| "A".equals(r)) || traits.aVowel().equals(r))
&& ACIPRules.isACIPSuffix(l)); && traits.isSuffix(l));
} }
/** Returns true if and only if this pair is merely a /** Returns true if and only if this pair is merely a
* disambiguator. */ * disambiguator. */
boolean isDisambiguator() { boolean isDisambiguator() {
return ("-".equals(r) && getLeft() == null); return (traits.disambiguator().equals(r) && getLeft() == null);
} }
/** Yep, this works for TPairs. */ /** Yep, this works for TPairs. */
@ -160,16 +166,16 @@ class TPair {
return false; return false;
} }
/** Returns a TPair that is like this pair except that it has /** Returns a TPair that is like this pair except that it has a
* a "+" on the right if this pair is empty on the right and is * "+" on the right if this pair is empty on the right and is
* empty on the right if this pair has a disambiguator (i.e., a * empty on the right if this pair has a disambiguator on the
* '-') on the right. May return itself (but never mutates this * right. May return itself (but never mutates this
* instance). */ * instance). */
TPair insideStack() { TPair insideStack() {
if (null == getRight()) if (null == getRight())
return new TPair(getLeft(), "+"); return new TPair(traits, getLeft(), "+");
else if ("-".equals(getRight())) else if (traits.disambiguator().equals(getRight()))
return new TPair(getLeft(), null); return new TPair(traits, getLeft(), null);
else else
return this; return this;
} }
@ -194,7 +200,7 @@ class TPair {
String getWylie(boolean justLeft) { String getWylie(boolean justLeft) {
String leftWylie = null; String leftWylie = null;
if (getLeft() != null) { if (getLeft() != null) {
leftWylie = ACIPRules.getWylieForACIPConsonant(getLeft()); leftWylie = traits.getEwtsForConsonant(getLeft());
if (leftWylie == null) { if (leftWylie == null) {
if (isNumeric()) if (isNumeric())
leftWylie = getLeft(); leftWylie = getLeft();
@ -208,7 +214,7 @@ class TPair {
else if ("+".equals(getRight())) else if ("+".equals(getRight()))
rightWylie = "+"; rightWylie = "+";
else if (getRight() != null) else if (getRight() != null)
rightWylie = ACIPRules.getWylieForACIPVowel(getRight()); rightWylie = traits.getEwtsForWowel(getRight());
if (null == rightWylie) rightWylie = ""; if (null == rightWylie) rightWylie = "";
return leftWylie + rightWylie; return leftWylie + rightWylie;
} }
@ -227,18 +233,19 @@ class TPair {
void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB, void getUnicode(StringBuffer consonantSB, StringBuffer vowelSB,
boolean subscribed) { boolean subscribed) {
if (null != getLeft()) { if (null != getLeft()) {
String x = ACIPRules.getUnicodeFor(getLeft(), subscribed); String x = traits.getUnicodeFor(getLeft(), subscribed);
if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni"); if (null == x) throw new Error("TPair: " + getLeft() + " has no Uni");
consonantSB.append(x); consonantSB.append(x);
} }
if (null != getRight() if (null != getRight()
&& !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) { && !("-".equals(getRight()) || "+".equals(getRight()) || "A".equals(getRight()))) {
String x = ACIPRules.getUnicodeFor(getRight(), subscribed); String x = traits.getUnicodeFor(getRight(), subscribed);
if (null == x) throw new Error("TPair: " + getRight() + " has no Uni"); if (null == x) throw new Error("TPair: " + getRight() + " has no Uni");
vowelSB.append(x); vowelSB.append(x);
} }
} }
// TODO(DLC)[EWTS->Tibetan]
/** Returns true if this pair is surely the last pair in an ACIP /** Returns true if this pair is surely the last pair in an ACIP
* stack. Stacking continues through (* . ) and (* . +), but * stack. Stacking continues through (* . ) and (* . +), but
* stops anywhere else. */ * stops anywhere else. */

View file

@ -33,6 +33,9 @@ import java.util.ArrayList;
* *
* @author David Chandler */ * @author David Chandler */
class TPairList { class TPairList {
/** the part that knows ACIP from EWTS */
private TTraits traits;
/** FIXME: change me and see if performance improves. */ /** FIXME: change me and see if performance improves. */
private static final int INITIAL_SIZE = 1; private static final int INITIAL_SIZE = 1;
@ -41,17 +44,20 @@ class TPairList {
/** Creates a new list containing just p. */ /** Creates a new list containing just p. */
public TPairList(TPair p) { public TPairList(TPair p) {
this.traits = p.getTraits();
al = new ArrayList(1); al = new ArrayList(1);
add(p); add(p);
} }
/** Creates an empty list. */ /** Creates an empty list. */
public TPairList() { public TPairList(TTraits traits) {
this.traits = traits;
al = new ArrayList(INITIAL_SIZE); al = new ArrayList(INITIAL_SIZE);
} }
/** Creates an empty list with the capacity to hold N items. */ /** Creates an empty list with the capacity to hold N items. */
public TPairList(int N) { public TPairList(TTraits traits, int N) {
this.traits = traits;
al = new ArrayList(N); al = new ArrayList(N);
} }
@ -181,7 +187,7 @@ class TPairList {
return ErrorsAndWarnings.getMessage(125, shortMessages, translit); return ErrorsAndWarnings.getMessage(125, shortMessages, translit);
} else if ((null == p.getLeft() && !"-".equals(p.getRight())) } else if ((null == p.getLeft() && !"-".equals(p.getRight()))
|| (null != p.getLeft() || (null != p.getLeft()
&& !ACIPRules.isConsonant(p.getLeft()) && !traits.isConsonant(p.getLeft())
&& !p.isNumeric())) { && !p.isNumeric())) {
// FIXME: stop handling this outside of ErrorsAndWarnings: // FIXME: stop handling this outside of ErrorsAndWarnings:
if (null == p.getLeft()) { if (null == p.getLeft()) {
@ -406,12 +412,12 @@ class TPairList {
// and only if b1 is one, etc. // and only if b1 is one, etc.
for (int counter = 0; counter < (1<<numBreaks); counter++) { for (int counter = 0; counter < (1<<numBreaks); counter++) {
TStackList sl = new TStackList(); TStackList sl = new TStackList();
TPairList currentStack = new TPairList(); TPairList currentStack = new TPairList(traits);
for (int k = startLoc; k <= i; k++) { for (int k = startLoc; k <= i; k++) {
if (!get(k).isDisambiguator()) { if (!get(k).isDisambiguator()) {
if (get(k).isNumeric() if (get(k).isNumeric()
|| (get(k).getLeft() != null || (get(k).getLeft() != null
&& ACIPRules.isConsonant(get(k).getLeft()))) && traits.isConsonant(get(k).getLeft())))
currentStack.add(get(k).insideStack()); currentStack.add(get(k).insideStack());
else else
return null; // sA, for example, is illegal. return null; // sA, for example, is illegal.
@ -419,7 +425,7 @@ class TPairList {
if (k == i || get(k).endsACIPStack()) { if (k == i || get(k).endsACIPStack()) {
if (!currentStack.isEmpty()) if (!currentStack.isEmpty())
sl.add(currentStack.asStack()); sl.add(currentStack.asStack());
currentStack = new TPairList(); currentStack = new TPairList(traits);
} else { } else {
if (numBreaks > 0) { if (numBreaks > 0) {
for (int j = 0; breakStart+j < 3; j++) { for (int j = 0; breakStart+j < 3; j++) {
@ -427,7 +433,7 @@ class TPairList {
&& 1 == ((counter >> j) & 1)) { && 1 == ((counter >> j) & 1)) {
if (!currentStack.isEmpty()) if (!currentStack.isEmpty())
sl.add(currentStack.asStack()); sl.add(currentStack.asStack());
currentStack = new TPairList(); currentStack = new TPairList(traits);
break; // shouldn't matter, but you never know break; // shouldn't matter, but you never know
} }
} }
@ -460,9 +466,9 @@ class TPairList {
if (!isEmpty()) { if (!isEmpty()) {
TPair lastPair = get(size() - 1); TPair lastPair = get(size() - 1);
if ("+".equals(lastPair.getRight())) if ("+".equals(lastPair.getRight()))
al.set(size() - 1, new TPair(lastPair.getLeft(), null)); al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
else if ("-".equals(lastPair.getRight())) else if ("-".equals(lastPair.getRight()))
al.set(size() - 1, new TPair(lastPair.getLeft(), null)); al.set(size() - 1, new TPair(traits, lastPair.getLeft(), null));
} }
return this; return this;
} }
@ -506,10 +512,10 @@ class TPairList {
add_U0F7F = true; add_U0F7F = true;
StringBuffer rr = new StringBuffer(p.getRight()); StringBuffer rr = new StringBuffer(p.getRight());
rr.deleteCharAt(where); rr.deleteCharAt(where);
p = new TPair(p.getLeft(), rr.toString()); p = new TPair(traits, p.getLeft(), rr.toString());
} }
boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight()); boolean hasNonAVowel = (!"A".equals(p.getRight()) && null != p.getRight());
String thislWylie = ACIPRules.getWylieForACIPConsonant(p.getLeft()); String thislWylie = traits.getEwtsForConsonant(p.getLeft());
if (thislWylie == null) { if (thislWylie == null) {
char ch; char ch;
if (p.isNumeric()) { if (p.isNumeric()) {
@ -528,21 +534,21 @@ class TPairList {
boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString()); boolean isTibetan = TibetanMachineWeb.isWylieTibetanConsonantOrConsonantStack(ll.toString());
boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString()); boolean isSanskrit = TibetanMachineWeb.isWylieSanskritConsonantStack(lWylie.toString());
if (ddebug && !isTibetan && !isSanskrit && !isNumeric) { if (ddebug && !isTibetan && !isSanskrit && !isNumeric) {
System.out.println("OTHER for " + lWylie + " with vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight()); System.out.println("OTHER for " + lWylie + " with vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight());
} }
if (isTibetan && isSanskrit) { if (isTibetan && isSanskrit) {
// RVA, e.g. It must be Tibetan because RWA is what // RVA, e.g. It must be Tibetan because RWA is what
// you'd use for RA over fixed-form WA. // you'd use for RA over fixed-form WA.
isSanskrit = false; isSanskrit = false;
} }
if (ddebug && hasNonAVowel && ACIPRules.getWylieForACIPVowel(p.getRight()) == null) { if (ddebug && hasNonAVowel && traits.getEwtsForWowel(p.getRight()) == null) {
System.out.println("vowel " + ACIPRules.getWylieForACIPVowel(p.getRight()) + " and p.getRight()=" + p.getRight()); System.out.println("vowel " + traits.getEwtsForWowel(p.getRight()) + " and p.getRight()=" + p.getRight());
} }
TGCPair tp; TGCPair tp;
indexList.add(new Integer(index)); indexList.add(new Integer(index));
tp = new TGCPair(lWylie.toString(), tp = new TGCPair(lWylie.toString(),
(hasNonAVowel (hasNonAVowel
? ACIPRules.getWylieForACIPVowel(p.getRight()) ? traits.getEwtsForWowel(p.getRight())
: ""), : ""),
(isNumeric (isNumeric
? TGCPair.TYPE_OTHER ? TGCPair.TYPE_OTHER
@ -697,7 +703,7 @@ class TPairList {
if (lastPair.getRight() == null || lastPair.equals("-")) { if (lastPair.getRight() == null || lastPair.equals("-")) {
duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey)); duffsAndErrors.add(TibetanMachineWeb.getGlyph(hashKey));
} else { } else {
ACIPRules.getDuffForACIPVowel(duffsAndErrors, traits.getDuffForWowel(duffsAndErrors,
TibetanMachineWeb.getGlyph(hashKey), TibetanMachineWeb.getGlyph(hashKey),
lastPair.getRight()); lastPair.getRight());
} }

View file

@ -121,7 +121,7 @@ class TPairListFactory {
// base case for our recursion: // base case for our recursion:
if ("".equals(acip)) if ("".equals(acip))
return new TPairList(); return new TPairList(ttraits);
StringBuffer acipBuf = new StringBuffer(acip); StringBuffer acipBuf = new StringBuffer(acip);
int howMuchBuf[] = new int[1]; int howMuchBuf[] = new int[1];
@ -131,9 +131,9 @@ class TPairListFactory {
&& null != head.getLeft() && null != head.getLeft()
&& null != head.getRight() && null != head.getRight()
&& weHaveSeenVowelAlready && weHaveSeenVowelAlready
&& ACIPRules.isACIPSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}. && ttraits.isSuffix(head.getLeft()) // DKY'O should be two horizontal units, not three. -- {D}{KY'O}, not {D}{KY}{'O}.
&& head.getRight().startsWith("'")) { && head.getRight().startsWith("'")) {
head = new TPair(head.getLeft(), head = new TPair(ttraits, head.getLeft(),
// Without this disambiguator, we are // Without this disambiguator, we are
// less efficient (8 parses, not 4) and // less efficient (8 parses, not 4) and
// we can't handle PA'AM'ANG etc. // we can't handle PA'AM'ANG etc.
@ -177,11 +177,11 @@ class TPairListFactory {
} }
// TODO(DLC)[EWTS->Tibetan]: doc // TODO(DLC)[EWTS->Tibetan]: doc
private static TPairList breakHelperEWTS(String ewts, TTraits ttraits /* TODO(DLC)[EWTS->Tibetan]: use */) { private static TPairList breakHelperEWTS(String ewts, TTraits ttraits) {
// base case for our recursion: // base case for our recursion:
if ("".equals(ewts)) if ("".equals(ewts))
return new TPairList(); return new TPairList(ttraits);
StringBuffer ewtsBuf = new StringBuffer(ewts); StringBuffer ewtsBuf = new StringBuffer(ewts);
int howMuchBuf[] = new int[1]; int howMuchBuf[] = new int[1];
@ -238,11 +238,11 @@ class TPairListFactory {
int i, xl = acip.length(); int i, xl = acip.length();
if (0 == xl) { if (0 == xl) {
howMuch[0] = 0; howMuch[0] = 0;
return new TPair(null, null); return new TPair(ttraits, null, null);
} }
if (acip.charAt(0) == ttraits.disambiguatorChar()) { if (acip.charAt(0) == ttraits.disambiguatorChar()) {
howMuch[0] = 1; howMuch[0] = 1;
return new TPair(null, ttraits.disambiguator()); return new TPair(ttraits, null, ttraits.disambiguator());
} }
char ch = acip.charAt(0); char ch = acip.charAt(0);
@ -250,7 +250,7 @@ class TPairListFactory {
// like seeing 1-2-3-4. // like seeing 1-2-3-4.
if (ch >= '0' && ch <= '9') { if (ch >= '0' && ch <= '9') {
howMuch[0] = 1; // not 2... howMuch[0] = 1; // not 2...
return new TPair(acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator()); return new TPair(ttraits, acip.substring(0, 1), (xl == 1) ? null : ttraits.disambiguator());
} }
String l = null, r = null; String l = null, r = null;
@ -264,11 +264,11 @@ class TPairListFactory {
int ll = (null == l) ? 0 : l.length(); int ll = (null == l) ? 0 : l.length();
if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) { if (null != l && xl > ll && acip.charAt(ll) == ttraits.disambiguatorChar()) {
howMuch[0] = l.length() + 1; howMuch[0] = l.length() + 1;
return new TPair(l, ttraits.disambiguator()); return new TPair(ttraits, l, ttraits.disambiguator());
} }
if (null != l && xl > ll && acip.charAt(ll) == '+') { if (null != l && xl > ll && acip.charAt(ll) == '+') {
howMuch[0] = l.length() + 1; howMuch[0] = l.length() + 1;
return new TPair(l, "+"); return new TPair(ttraits, l, "+");
} }
for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) { for (i = Math.min(ttraits.maxWowelLength(), xl - ll); i >= 1; i--) {
String t = null; String t = null;
@ -289,7 +289,7 @@ class TPairListFactory {
&& acip.charAt(z) == '+') { && acip.charAt(z) == '+') {
acip.deleteCharAt(z-1); acip.deleteCharAt(z-1);
howMuch[0] = l.length() + 1; howMuch[0] = l.length() + 1;
return new TPair(l, "+"); return new TPair(ttraits, l, "+");
} }
// Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */ // Allow Pm to mean PAm, P: to mean PA:, Pm: to mean PAm:. /* TODO(DLC)[EWTS->Tibetan]: */
@ -305,14 +305,14 @@ class TPairListFactory {
if (null == l && null == r) { if (null == l && null == r) {
howMuch[0] = 1; // not 2... howMuch[0] = 1; // not 2...
// add a disambiguator to avoid exponential running time: // add a disambiguator to avoid exponential running time:
return new TPair(acip.substring(0, 1), return new TPair(ttraits, acip.substring(0, 1),
(xl == 1) ? null : ttraits.disambiguator()); (xl == 1) ? null : ttraits.disambiguator());
} }
howMuch[0] = (((l == null) ? 0 : l.length()) howMuch[0] = (((l == null) ? 0 : l.length())
+ ((r == null) ? 0 : r.length()) + ((r == null) ? 0 : r.length())
+ mod); + mod);
return new TPair(l, r); return new TPair(ttraits, l, r);
} // TODO(DLC)[EWTS->Tibetan]: } // TODO(DLC)[EWTS->Tibetan]:
} }

View file

@ -18,12 +18,18 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import java.util.ArrayList;
import org.thdl.tib.text.DuffCode;
/** A TTraits object encapsulates all the things that make a /** A TTraits object encapsulates all the things that make a
* particular Roman transliteration scheme unique. If both EWTS and * particular Roman transliteration scheme unique. For the most
* ACIP transliterations have a property in common, then it's likely * part, this difference is expressed at the finest granularity
* encoded in a manner that's hard to modify. But if they differ in * possible -- often single characters of Roman transliteration.
* some respect, then that difference should be encoded in a TTraits *
* object. * <p>If both EWTS and ACIP transliterations have a property in
* common, then it's likely encoded in a manner that's hard to
* modify. But if they differ in some respect, then that difference
* should be encoded in a TTraits object.
* *
* <p>It is very likely that classes that implement this interface * <p>It is very likely that classes that implement this interface
* will choose to use the design pattern 'singleton'. */ * will choose to use the design pattern 'singleton'. */
@ -62,9 +68,63 @@ interface TTraits {
/** Returns true if and only if <em>s</em> is a stretch of /** Returns true if and only if <em>s</em> is a stretch of
* transliteration corresponding to a Tibetan wowel (without any * transliteration corresponding to a Tibetan wowel (without any
* [achen or other] consonant) */ * [achen or other] consonant) */
boolean isWowel(String s); boolean isWowel(String s); // TODO(DLC)[EWTS->Tibetan]: what about "m:" as opposed to "m" or ":"
/** Returns true if and only if the pair given has a simple error /** Returns true if and only if the pair given has a simple error
* other than being a mere disambiguator. */ * other than being a mere disambiguator. */
boolean hasSimpleError(TPair p); boolean hasSimpleError(TPair p);
/** The implicit 'ahhh' vowel, the one you see when you write the
human-friendly transliteration for "\u0f40\u0f0b". */
String aVowel();
/** Returns true if s is a valid postsuffix. s must not have a
wowel on it. */
boolean isPostsuffix(String s);
/** Returns true if and only if l is the representation of a
letter that can be a suffix. Note that all postsuffixes are
also suffixes. l should not have a wowel. */
boolean isSuffix(String l);
/** Returns true if and only if l is the representation of a
letter that can be a prefix. l should not have a wowel. */
boolean isPrefix(String l);
/** Returns the EWTS transliteration corresponding to the
* consonant l, which should not have a vowel. Returns null if
* there is no such EWTS.
*
* <p>May return "W" instead of "w", "r" instead of "R", and "y"
* instead of "Y" because we sometimes don't have enough context
* to decide.
*
* <p>The reasoning for "W" instead of "w" is that r-w and r+w
* are both known hash keys (as {@link
* org.thdl.tib.text.TibetanMachineWeb} would call them). We
* sort 'em out this way. (They are the only things like this
* according to bug report #800166.) */
String getEwtsForConsonant(String l);
/** Returns the EWTS corresponding to the given punctuation or
* mark. Returns null if there is no such EWTS. */
String getEwtsForOther(String l);
/** Returns the EWTS corresponding to the given "wowel". Returns
* null if there is no such EWTS. */
String getEwtsForWowel(String l);
/** If l is a consonant or vowel or punctuation mark, then this
* returns the Unicode for it. The Unicode for the subscribed
* form of the glyph is returned if subscribed is true. Returns
* null if l is unknown. */
String getUnicodeFor(String l, boolean subscribed);
/** Returns a scanner that can break up a string of
transliteration. */
TTshegBarScanner scanner();
/** Gets the duffcodes for wowel, such that they look good with
* the preceding glyph, and appends them to duff. */
void getDuffForWowel(ArrayList duff, DuffCode preceding, String wowel);
} }

View file

@ -18,7 +18,11 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.ttt; package org.thdl.tib.text.ttt;
import java.io.*; import java.io.IOException;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.InputStream;
import java.io.BufferedReader;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Stack; import java.util.Stack;
@ -40,7 +44,7 @@ public abstract class TTshegBarScanner {
* If errors is non-null, error messages will be appended to it. * If errors is non-null, error messages will be appended to it.
* Returns a list of TStrings that is the scan. Warning and * Returns a list of TStrings that is the scan. Warning and
* error messages in the result will be long and self-contained * error messages in the result will be long and self-contained
* unless shortMessagse is true. * unless shortMessages is true.
* *
* <p>This is not so efficient; copies the whole file into memory * <p>This is not so efficient; copies the whole file into memory
* first. * first.