ACIP->Unicode conversion, without going through TMW, is now possible, so long as \ (the Sanskrit virama) is not used. Of the 1370-odd ACIP texts I have here, about 57% make it through the gauntlet (fewer if you demand a vowel or disambiguator on every stack of a non-Tibetan tsheg bar).
parent 245aac4911
commit 1afb3a0fdd
12 changed files with 646 additions and 40 deletions
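The heart of the commit is the new ACIPConverter.convertToUnicode(String, StringBuffer) entry point (full source below). As a rough sketch of the direct ACIP->Unicode path it enables -- the sample ACIP string here is made up, not from the commit:

    // A minimal sketch of the new library-level ACIP->Unicode path.
    // "BKRA SHIS " is only a sample ACIP fragment; any ACIP text works the same way.
    StringBuffer errors = new StringBuffer();
    String unicode = org.thdl.tib.text.ttt.ACIPConverter.convertToUnicode("BKRA SHIS ", errors);
    if (null == unicode) {
        // scanning or conversion failed; the reasons have been appended to errors
        System.err.println(errors);
    }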
@@ -318,13 +318,18 @@ Contributor(s): ______________________________________.
        <param name="my.included.source.file"
               value="org/thdl/tib/text/TibetanHTML.java"/>
      </antcall>
      <!-- Put TibetanConverter in Jskad's jar for those who want
           to use it. -->
      <!-- Put TibetanConverter and ACIPConverter in Jskad's jar for
           those who want to use them. -->
      <antcall target="our-internal-javac-task">
        <param name="mybin" value="${jskadbin}"/>
        <param name="my.included.source.file"
               value="org/thdl/tib/input/TibetanConverter.java"/>
      </antcall>
      <antcall target="our-internal-javac-task">
        <param name="mybin" value="${jskadbin}"/>
        <param name="my.included.source.file"
               value="org/thdl/tib/text/ttt/ACIPConverter.java"/>
      </antcall>
      <antcall target="our-internal-javac-task">
        <param name="mybin" value="${jskadbin}"/>
        <param name="my.included.source.file"
@@ -833,7 +833,7 @@ public final class LegalTshegBar
                return internalThrowThing(throwIfIllegal,
                                          errorBuf,
                                          "Illegal suffix -- not one of the ten legal suffixes: "
                                          + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0)));
                                          + UnicodeUtils.unicodeCodepointToString(suffix.charAt(0), false));
            }
        }
    }
@@ -286,8 +286,41 @@ public class UnicodeUtils implements UnicodeConstants {
    }

    /** Returns a human-readable, ASCII form of the Unicode codepoint
        cp. */
    public static String unicodeCodepointToString(char cp) {
        cp. If shortenIfPossible is true, then printable ASCII
        characters will appear as themselves. */
    public static String unicodeCodepointToString(char cp,
                                                  boolean shortenIfPossible) {
        if (shortenIfPossible) {
            if ((cp >= 'a' && cp <= 'z')
                || (cp >= 'A' && cp <= 'Z')
                || (cp >= '0' && cp <= '9')
                || cp == '.'
                || cp == ','
                || cp == ' '
                || cp == '\''
                || cp == '"'
                || cp == '+'
                || cp == '-'
                || cp == '='
                || cp == '_'
                || cp == '@'
                || cp == '!'
                || cp == '#'
                || cp == '$'
                || cp == '%'
                || cp == '^'
                || cp == '&'
                || cp == '*'
                || cp == '\t'
                || cp == ':'
                || cp == '['
                || cp == ']'
                || cp == '('
                || cp == ')'
                || cp == '{'
                || cp == '}')
                return new String(new char[] { cp });
        }
        if (cp < '\u0010')
            return "\\u000" + Integer.toHexString((int)cp);
        else if (cp < '\u0100')
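To make the new shortenIfPossible flag concrete, a small usage sketch (not part of the diff; the expected results follow the code above and the tests below):

    // Sketch: the two-argument overload added above.
    UnicodeUtils.unicodeCodepointToString('K', false);      // yields "\u004b" -- always escaped
    UnicodeUtils.unicodeCodepointToString('K', true);       // yields "K" -- printable ASCII passes through
    UnicodeUtils.unicodeCodepointToString('\u0f40', true);  // yields "\u0f40" -- non-ASCII is still escaped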
@@ -304,7 +337,19 @@ public class UnicodeUtils implements UnicodeConstants {
    public static String unicodeStringToString(String s) {
        StringBuffer sb = new StringBuffer(s.length() * 6);
        for (int i = 0; i < s.length(); i++) {
            sb.append(unicodeCodepointToString(s.charAt(i)));
            sb.append(unicodeCodepointToString(s.charAt(i), false));
        }
        return sb.toString();
    }

    /**
     * Returns the most succinct possible, human-readable, ASCII form
     * of the String s of Unicode codepoints. */
    public static String unicodeStringToPrettyString(String s) {
        if (s == null) return "null";
        StringBuffer sb = new StringBuffer(s.length() * 6);
        for (int i = 0; i < s.length(); i++) {
            sb.append(unicodeCodepointToString(s.charAt(i), true));
        }
        return sb.toString();
    }
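And the corresponding string-level helpers, contrasted (sketch only):

    // unicodeStringToString always escapes; unicodeStringToPrettyString keeps printable ASCII.
    UnicodeUtils.unicodeStringToString("K\u0f72");        // yields "\u004b\u0f72"
    UnicodeUtils.unicodeStringToPrettyString("K\u0f72");  // yields "K\u0f72"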
@@ -321,15 +321,15 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants {
     * Tests the {@link UnicodeUtils#unicodeCodepointToString(char)}
     * method. */
    public void testUnicodeCodepointToString() {
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000').equals("\\u0000"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001').equals("\\u0001"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F').equals("\\u000f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F').equals("\\u001f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF').equals("\\u00ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF').equals("\\u01ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF').equals("\\u0fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF').equals("\\u1fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF').equals("\\uffff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000', false).equals("\\u0000"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001', false).equals("\\u0001"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F', false).equals("\\u000f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F', false).equals("\\u001f"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF', false).equals("\\u00ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF', false).equals("\\u01ff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF', false).equals("\\u0fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF', false).equals("\\u1fff"));
        assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF', false).equals("\\uffff"));
    }

    /**
source/org/thdl/tib/text/ttt/ACIPConverter.java (new file, 208 lines added)
@@ -0,0 +1,208 @@
/*
The contents of this file are subject to the THDL Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the THDL web site
(http://www.thdl.org/).

Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the
License.

The Initial Developer of this software is the Tibetan and Himalayan Digital
Library (THDL). Portions created by the THDL are Copyright 2003 THDL.
All Rights Reserved.

Contributor(s): ______________________________________.
*/

package org.thdl.tib.text.ttt;

import java.io.*;
import java.util.ArrayList;
import java.util.Stack;

import org.thdl.util.ThdlDebug;
import org.thdl.util.ThdlOptions;

/**
 * This class is able to convert an ACIP file into Tibetan Machine Web.
 * From there, TMW->Unicode takes you to Unicode.
 * @author David Chandler
 */
public class ACIPConverter {
    static {
        // We don't want to load the TM or TMW font files ourselves:
        ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true);
        ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true);
        ThdlOptions.setUserPreference("thdl.debug", true);
    }

    /** Command-line converter.  Gives error messages on standard
     * output about why we can't convert the document perfectly and
     * exits with non-zero return code, or is silent otherwise and
     * exits with code zero.  <p>FIXME: not so efficient; copies the
     * whole file into memory first. */
    public static void main(String[] args)
        throws IOException // DLC FIXME: give nice error messages
    {
        boolean verbose = true;
        boolean strict = true;
        if (args.length != 2
            || (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) {
            System.err.println("Bad args!  Need '--strict filename' or '--lenient filename'.");
            System.exit(1);
        }
        StringBuffer errors = new StringBuffer();
        int maxErrors = 250;
        ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1);

        if (null == al) {
            System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this");
            System.err.println("Tibetan or English input?");
            System.err.println("");
            System.err.println("First " + maxErrors + " errors scanning ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
            System.exit(1);
        }
        if (errors.length() > 0) {
            System.err.println("Errors scanning ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting; please fix input file and try again.");
            System.exit(1);
        }

        convertToUnicode(al, System.out, errors);
        if (errors.length() > 0) {
            System.err.println("Errors converting ACIP input file: ");
            System.err.println(errors);
            System.err.println("Exiting; please fix input file and try again.");
            System.exit(2);
        }
        if (verbose) System.err.println("Converted " + args[1] + " perfectly.");
        System.exit(0);
    }

    /** Writes TMW/Latin to out.  If errors occur in converting a
     * tsheg bar, then they are appended to errors if errors is
     * non-null.  Returns true upon perfect success, false if errors
     * occurred.
     * @throws IOException if we cannot write to out
     */
    public static boolean convertToTMW(ArrayList scan, String latinFont,
                                       OutputStream out, StringBuffer errors)
        throws IOException
    {
        throw new Error("DLC UNIMPLEMENTED");
    }

    /** Returns UTF-8 encoded Unicode.  A bit indirect, so use this
     * for testing only if performance is a concern.  If errors occur
     * in scanning the ACIP or in converting a tsheg bar, then they
     * are appended to errors if errors is non-null.  Returns the
     * conversion upon perfect success, null if errors occurred.
     */
    public static String convertToUnicode(String acip,
                                          StringBuffer errors) {
        ByteArrayOutputStream sw = new ByteArrayOutputStream();
        ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1);
        try {
            if (null != al && convertToUnicode(al, sw, errors)) {
                return sw.toString("UTF-8");
            } else {
                System.out.println("DLC al is " + al + " and convertToUnicode returned null.");
                return null;
            }
        } catch (Exception e) {
            throw new Error(e.toString());
        }
    }

    /** Writes Unicode to out.  If errors occur in converting a
     * tsheg bar, then they are appended to errors if errors is
     * non-null.  Returns true upon perfect success, false if errors
     * occurred.
     * @throws IOException if we cannot write to out
     */
    public static boolean convertToUnicode(ArrayList scan,
                                           OutputStream out,
                                           StringBuffer errors)
        throws IOException
    {
        int sz = scan.size();
        boolean hasErrors = false;
        BufferedWriter writer
            = new BufferedWriter(new OutputStreamWriter(out, "UTF-8"));
        for (int i = 0; i < sz; i++) {
            ACIPString s = (ACIPString)scan.get(i);
            int stype = s.getType();
            if (stype == ACIPString.ERROR) {
                hasErrors = true;
                writer.write("[#ERROR CONVERTING ACIP DOCUMENT: ");
                writer.write(s.getText());
                writer.write("]");
            } else {
                // DLC FIXME: what about 'no A on root stack' and 'no A on such-and-such stack' warnings?
                if (s.isLatin(stype)) {
                    if (stype == ACIPString.FOLIO_MARKER)
                        writer.write("{");
                    writer.write(s.getText());
                    if (stype == ACIPString.FOLIO_MARKER)
                        writer.write("}");
                } else {
                    String unicode = null;
                    if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) {
                        TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText());
                        String acipError;

                        if ((acipError = pl.getACIPError()) != null) {
                            hasErrors = true;
                            String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]";
                            writer.write(errorMessage);
                            if (null != errors)
                                errors.append(errorMessage + "\n");
                        } else {
                            TParseTree pt = pl.getParseTree();
                            if (null == pt) {
                                hasErrors = true;
                                String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]";
                                writer.write(errorMessage);
                                if (null != errors)
                                    errors.append(errorMessage + "\n");
                            } else {
                                TStackList sl = pt.getBestParse();
                                if (null == sl) {
                                    hasErrors = true;
                                    String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]";
                                    writer.write(errorMessage);
                                    if (null != errors)
                                        errors.append(errorMessage + "\n");
                                } else {
                                    unicode = sl.getUnicode();
                                    if (null == unicode) throw new Error("DLC: HOW?");
                                }
                            }
                        }
                    } else {
                        if (stype == ACIPString.START_SLASH)
                            unicode = "\u0F3C";
                        else if (stype == ACIPString.END_SLASH)
                            unicode = "\u0F3D";
                        else
                            unicode = ACIPRules.getUnicodeFor(s.getText(), false);
                        if (null == unicode) throw new Error("DLC: HOW?");
                    }
                    if (null != unicode) {
                        writer.write(unicode);
                    }
                }
            }
        }
        writer.close();
        return !hasErrors;
    }
}
// DLC FIXME: putting Tibetan in black, Sanskrit in green, and Latin
// in yellow would help you quickly decide if ZHIGN maybe should've
// been ZHING.
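Given the build.xml change above, the class rides along in Jskad's jar, so the command-line contract defined by main() can be exercised with something like `java -cp Jskad.jar org.thdl.tib.text.ttt.ACIPConverter --strict MYTEXT.ACT > MYTEXT.txt` (the jar name and file name here are placeholders): Unicode goes to standard output, complaints go to standard error, and the exit code is non-zero whenever the conversion was not perfect.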
@@ -28,9 +28,9 @@ class ACIPRules {
     * three. */
    public static int MAX_CONSONANT_LENGTH = 3;

    /** {'im:}, the longest "vowel", has 4 characters, so this is
     * four. */
    public static int MAX_VOWEL_LENGTH = 4;
    /** {'EEm:}, the longest "vowel", has 5 characters, so this is
     * five. */
    public static int MAX_VOWEL_LENGTH = 5;

    /** For O(1) {@link #isVowel(String)} calls. */
    private static HashSet acipVowels = null;

@@ -42,18 +42,9 @@ class ACIPRules {
        { "U", "u" },
        { "E", "e" },
        { "O", "o" },
        { "'I", "I" },
        { "'U", "U" },
        { "EE", "ai" },
        { "OO", "au" },
        { "i", "-i" },
        { "'i", "-I" },
        { "'A", "A" },
        { "'O", "Ao" },
        { "'E", "Ae" }
        // DLC I'm on my own with 'O and 'E, but GANG'O appears
        // and I wonder... so here are 'O and 'E.  It's
        // consistent with 'I and 'A and 'U, at least.
        { "i", "-i" }
    };

    /** Returns true if and only if s is an ACIP "vowel".  You can't
@@ -61,14 +52,24 @@ class ACIPRules {
     * ACIP, so you have to call this in the right context. */
    public static boolean isVowel(String s) {
        if (null == acipVowels) {
            acipVowels = new HashSet();
            acipVowels = new HashSet(baseVowels.length * 8);
            for (int i = 0; i < baseVowels.length; i++) {
                acipVowels.add(baseVowels[i][0]);
                acipVowels.add(baseVowels[i][0] + 'm');
                acipVowels.add(baseVowels[i][0] + ':');
                acipVowels.add(baseVowels[i][0] + "m:");
                // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
                // DLC I'm on my own with 'O and 'E and 'OO and 'EE, but
                // GANG'O appears and I wonder... so here they are.  It's
                // consistent with 'I and 'A and 'U, at least: all the vowels
                // may appear as K'vowel.

                acipVowels.add(baseVowels[i][0]);
                acipVowels.add('\'' + baseVowels[i][0]);
                acipVowels.add(baseVowels[i][0] + 'm');
                acipVowels.add('\'' + baseVowels[i][0] + 'm');
                acipVowels.add(baseVowels[i][0] + ':');
                acipVowels.add('\'' + baseVowels[i][0] + ':');
                acipVowels.add(baseVowels[i][0] + "m:");
                acipVowels.add('\'' + baseVowels[i][0] + "m:");
                // DLC keep this code in sync with getUnicodeFor.

                // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not?
            }
        }
        return (acipVowels.contains(s));
@@ -204,4 +205,212 @@ class ACIPRules {
        }
        return (String)acipVowel2wylie.get(acip);
    }

    private static HashMap superACIP2unicode = null;
    private static HashMap subACIP2unicode = null;
    /** If acip is an ACIP consonant or vowel or punctuation mark,
     * then this returns the Unicode for it.  The Unicode for the
     * subscribed form of the glyph is returned if subscribed is
     * true.  Returns null if acip is unknown. */
    static String getUnicodeFor(String acip, boolean subscribed) {
        if (superACIP2unicode == null) {
            superACIP2unicode = new HashMap(144);
            subACIP2unicode = new HashMap(42);

            // oddball:
            subACIP2unicode.put("V", "\u0FAD");

            superACIP2unicode.put("DH", "\u0F52");
            subACIP2unicode.put("DH", "\u0FA2");
            superACIP2unicode.put("BH", "\u0F57");
            subACIP2unicode.put("BH", "\u0FA7");
            superACIP2unicode.put("dH", "\u0F4D");
            subACIP2unicode.put("dH", "\u0F9D");
            superACIP2unicode.put("DZH", "\u0F5C");
            subACIP2unicode.put("DZH", "\u0FAC");
            superACIP2unicode.put("Ksh", "\u0F69");
            subACIP2unicode.put("Ksh", "\u0FB9");
            superACIP2unicode.put("GH", "\u0F43");
            subACIP2unicode.put("GH", "\u0F93");
            superACIP2unicode.put("K", "\u0F40");
            subACIP2unicode.put("K", "\u0F90");
            superACIP2unicode.put("KH", "\u0F41");
            subACIP2unicode.put("KH", "\u0F91");
            superACIP2unicode.put("G", "\u0F42");
            subACIP2unicode.put("G", "\u0F92");
            superACIP2unicode.put("NG", "\u0F44");
            subACIP2unicode.put("NG", "\u0F94");
            superACIP2unicode.put("C", "\u0F45");
            subACIP2unicode.put("C", "\u0F95");
            superACIP2unicode.put("CH", "\u0F46");
            subACIP2unicode.put("CH", "\u0F96");
            superACIP2unicode.put("J", "\u0F47");
            subACIP2unicode.put("J", "\u0F97");
            superACIP2unicode.put("NY", "\u0F49");
            subACIP2unicode.put("NY", "\u0F99");
            superACIP2unicode.put("T", "\u0F4F");
            subACIP2unicode.put("T", "\u0F9F");
            superACIP2unicode.put("TH", "\u0F50");
            subACIP2unicode.put("TH", "\u0FA0");
            superACIP2unicode.put("D", "\u0F51");
            subACIP2unicode.put("D", "\u0FA1");
            superACIP2unicode.put("N", "\u0F53");
            subACIP2unicode.put("N", "\u0FA3");
            superACIP2unicode.put("P", "\u0F54");
            subACIP2unicode.put("P", "\u0FA4");
            superACIP2unicode.put("PH", "\u0F55");
            subACIP2unicode.put("PH", "\u0FA5");
            superACIP2unicode.put("B", "\u0F56");
            subACIP2unicode.put("B", "\u0FA6");
            superACIP2unicode.put("M", "\u0F58");
            subACIP2unicode.put("M", "\u0FA8");
            superACIP2unicode.put("TZ", "\u0F59");
            subACIP2unicode.put("TZ", "\u0FA9");
            superACIP2unicode.put("TS", "\u0F5A");
            subACIP2unicode.put("TS", "\u0FAA");
            superACIP2unicode.put("DZ", "\u0F5B");
            subACIP2unicode.put("DZ", "\u0FAB");
            superACIP2unicode.put("W", "\u0F5D");
            subACIP2unicode.put("W", "\u0FBA"); // oddball
            superACIP2unicode.put("ZH", "\u0F5E");
            subACIP2unicode.put("ZH", "\u0FAE");
            superACIP2unicode.put("Z", "\u0F5F");
            subACIP2unicode.put("Z", "\u0FAF");
            superACIP2unicode.put("'", "\u0F60");
            subACIP2unicode.put("'", "\u0FB0");
            superACIP2unicode.put("Y", "\u0F61");
            subACIP2unicode.put("Y", "\u0FB1");
            superACIP2unicode.put("R", "\u0F62");
            subACIP2unicode.put("R", "\u0FB2");
            superACIP2unicode.put("L", "\u0F63");
            subACIP2unicode.put("L", "\u0FB3");
            superACIP2unicode.put("SH", "\u0F64");
            subACIP2unicode.put("SH", "\u0FB4");
            superACIP2unicode.put("S", "\u0F66");
            subACIP2unicode.put("S", "\u0FB6");
            superACIP2unicode.put("H", "\u0F67");
            subACIP2unicode.put("H", "\u0FB7");
            superACIP2unicode.put("A", "\u0F68");
            subACIP2unicode.put("A", "\u0FB8");
            superACIP2unicode.put("t", "\u0F4A");
            subACIP2unicode.put("t", "\u0F9A");
            superACIP2unicode.put("th", "\u0F4B");
            subACIP2unicode.put("th", "\u0F9B");
            superACIP2unicode.put("d", "\u0F4C");
            subACIP2unicode.put("d", "\u0F9C");
            superACIP2unicode.put("n", "\u0F4E");
            subACIP2unicode.put("n", "\u0F9E");
            superACIP2unicode.put("sh", "\u0F65");
            subACIP2unicode.put("sh", "\u0FB5");

            superACIP2unicode.put("I", "\u0F72");
            superACIP2unicode.put("E", "\u0F7A");
            superACIP2unicode.put("O", "\u0F7C");
            superACIP2unicode.put("U", "\u0F74");
            superACIP2unicode.put("OO", "\u0F7D");
            superACIP2unicode.put("EE", "\u0F7B");
            superACIP2unicode.put("i", "\u0F80");
            superACIP2unicode.put("'A", "\u0F71");
            superACIP2unicode.put("'I", "\u0F71\u0F72");
            superACIP2unicode.put("'E", "\u0F71\u0F7A");
            superACIP2unicode.put("'O", "\u0F71\u0F7C");
            superACIP2unicode.put("'U", "\u0F71\u0F74");
            superACIP2unicode.put("'OO", "\u0F71\u0F7D");
            superACIP2unicode.put("'EE", "\u0F71\u0F7B");
            superACIP2unicode.put("'i", "\u0F71\u0F80");

            superACIP2unicode.put("Im", "\u0F72\u0F7E");
            superACIP2unicode.put("Em", "\u0F7A\u0F7E");
            superACIP2unicode.put("Om", "\u0F7C\u0F7E");
            superACIP2unicode.put("Um", "\u0F74\u0F7E");
            superACIP2unicode.put("OOm", "\u0F7D\u0F7E");
            superACIP2unicode.put("EEm", "\u0F7B\u0F7E");
            superACIP2unicode.put("im", "\u0F80\u0F7E");
            superACIP2unicode.put("'Am", "\u0F71\u0F7E");
            superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E");
            superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E");
            superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E");
            superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E");
            superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E");
            superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E");
            superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E");

            superACIP2unicode.put("I:", "\u0F72\u0F7F");
            superACIP2unicode.put("E:", "\u0F7A\u0F7F");
            superACIP2unicode.put("O:", "\u0F7C\u0F7F");
            superACIP2unicode.put("U:", "\u0F74\u0F7F");
            superACIP2unicode.put("OO:", "\u0F7D\u0F7F");
            superACIP2unicode.put("EE:", "\u0F7B\u0F7F");
            superACIP2unicode.put("i:", "\u0F80\u0F7F");
            superACIP2unicode.put("'A:", "\u0F71\u0F7F");
            superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F");
            superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F");
            superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F");
            superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F");
            superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F");
            superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F");
            superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F");

            superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F");
            superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F");
            superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F");
            superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F");
            superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F");
            superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F");
            superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F");
            superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F");
            superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F");
            superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F");
            superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F");
            superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F");
            superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F");
            superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F");
            superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F");
            // :m does not appear, though you'd think it's as valid as m:.

            // I doubt these will occur alone:
            superACIP2unicode.put("m", "\u0F7E");
            superACIP2unicode.put(":", "\u0F7F");

            superACIP2unicode.put("Am", "\u0F7E");
            superACIP2unicode.put("A:", "\u0F7F");

            superACIP2unicode.put("0", "\u0F20");
            superACIP2unicode.put("1", "\u0F21");
            superACIP2unicode.put("2", "\u0F22");
            superACIP2unicode.put("3", "\u0F23");
            superACIP2unicode.put("4", "\u0F24");
            superACIP2unicode.put("5", "\u0F25");
            superACIP2unicode.put("6", "\u0F26");
            superACIP2unicode.put("7", "\u0F27");
            superACIP2unicode.put("8", "\u0F28");
            superACIP2unicode.put("9", "\u0F29");

            // DLC punctuation
            superACIP2unicode.put("&", "\u0F85");
            superACIP2unicode.put(",", "\u0F0D");
            superACIP2unicode.put(" ", "\u0F0B");
            superACIP2unicode.put(".", "\u0F0C");
            superACIP2unicode.put("`", "\u0F08");
            superACIP2unicode.put("`", "\u0F08");
            superACIP2unicode.put("*", "\u0F04\u0F05");
            superACIP2unicode.put("#", "\u0F04\u0F05\u0F05");
            superACIP2unicode.put("%", "\u0F35");
            superACIP2unicode.put(";", "\u0F11");
            superACIP2unicode.put("\r", "\r");
            superACIP2unicode.put("\t", "\t");
            superACIP2unicode.put("\n", "\n");
            superACIP2unicode.put("\\", "\u0F84"); // DLC FIXME: make this like a vowel
            // DLC FIXME: what's the Unicode for caret, ^?
            // DLC FIXME: what's the Unicode for o?
            // DLC FIXME: what's the Unicode for x?

        }
        if (subscribed) {
            String u = (String)subACIP2unicode.get(acip);
            if (null != u) return u;
        }
        return (String)superACIP2unicode.get(acip);

    }
}
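A quick illustration of what the superscribed/subjoined split in getUnicodeFor buys you (a sketch, with values read straight off the tables above):

    // Same ACIP consonant, different codepoint depending on its position in the stack.
    ACIPRules.getUnicodeFor("K", false);  // "\u0F40" -- K as the full-form letter
    ACIPRules.getUnicodeFor("K", true);   // "\u0F90" -- K subjoined under another letter
    ACIPRules.getUnicodeFor("I", true);   // "\u0F72" -- vowels fall through to the super map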
@@ -30,6 +30,15 @@ public class ACIPString {
    private int type;
    private String text;

    /** Returns true if and only if an ACIPString with type type is to
     * be converted to Latin, not Tibetan, text. */
    public static boolean isLatin(int type) {
        return (type != TIBETAN_NON_PUNCTUATION
                && type != TIBETAN_PUNCTUATION
                && type != START_SLASH
                && type != END_SLASH);
    }

    /** For [#COMMENTS] */
    public static final int COMMENT = 0;
    /** For Folio markers like @012B */
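For instance, under the new predicate (sketch only):

    // Only the Tibetan text and slash types are non-Latin; everything else passes through as Latin.
    ACIPString.isLatin(ACIPString.COMMENT);                  // true
    ACIPString.isLatin(ACIPString.TIBETAN_NON_PUNCTUATION);  // false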
@@ -57,7 +57,6 @@ public class ACIPTshegBarScanner {
                System.out.println(errors);
                System.out.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again.");
                System.exit(1);
            } else {
            }
            if (errors.length() > 0) {
                System.out.println("Errors scanning ACIP input file: ");
@@ -90,6 +89,7 @@ public class ACIPTshegBarScanner {
        while (-1 != (amt = in.read(ch))) {
            s.append(ch, 0, amt);
        }
        in.close();
        return scan(s.toString(), errors, !strict, maxErrors);
    }

@@ -621,6 +621,18 @@ public class ACIPTshegBarScanner {
                }

                if (startSlashIndex >= 0) {
                    if (startSlashIndex + 1 == i) {
                        /* //NYA\\ appears in ACIP input, and I think
                         * it means /NYA/.  We warn about // for this
                         * reason.  \\ causes a tsheg-bar error (DLC
                         * FIXME: verify this is so). */
                        al.add(new ACIPString("//", ACIPString.ERROR));
                        if (errors != null) {
                            errors.append("Offset " + i + ": "
                                          + "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n");
                        }
                        if (maxErrors >= 0 && ++numErrors >= maxErrors) return null;
                    }
                    al.add(new ACIPString(s.substring(i, i+1),
                                          ACIPString.END_SLASH));
                    startOfString = i+1;
@@ -766,6 +778,9 @@ public class ACIPTshegBarScanner {
                    if ((int)ch == 65533) {
                        errors.append("Offset " + i + ": "
                                      + "Found an illegal, unprintable character.\n");
                    } else if ('\\' == ch) {
                        errors.append("Offset " + i + ": "
                                      + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n");
                    } else {
                        errors.append("Offset " + i + ": "
                                      + "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n");
@@ -849,7 +864,7 @@ public class ACIPTshegBarScanner {
                   || ch == 'x'
                   || ch == ':'
                   || ch == '^'
                   || ch == '\\'
                   // DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\'

                   || ch == '-'
                   || ch == '+'
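The scanner is the front half of the pipeline; ACIPConverter drives it roughly like this sketch (the input string is a placeholder, the boolean mirrors the DLC-FIXME'd argument in ACIPConverter.convertToUnicode, and -1 disables the error cap):

    // Tokenize ACIP into typed ACIPStrings before conversion.
    StringBuffer errors = new StringBuffer();
    ArrayList al = ACIPTshegBarScanner.scan("/NYA/", errors, true, -1);
    // null means the error cap was hit; otherwise al holds typed ACIPStrings
    // (START_SLASH, TIBETAN_NON_PUNCTUATION, END_SLASH, ...) and errors any complaints.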
@@ -292,6 +292,12 @@ public class PackageTest extends TestCase {
                  new String[] { "{SH}{LO}", "{SH+LO}" },
                  new String[] { "{SH+LO}" });
        tstHelper("ZLUM", "{Z}{LU}{M}", new String[] { "{Z}{LU}{M}", "{Z+LU}{M}" }, new String[] { "{Z+LU}{M}" });
        tstHelper("K'EE", "{K'EE}");
        tstHelper("K'O", "{K'O}");
        tstHelper("K'OO", "{K'OO}");
        tstHelper("K'II", "{K'I}{I}");
        tstHelper("K'i", "{K'i}");
        tstHelper("K'A", "{K'A}");
        tstHelper("B+DDZ", "{B+}{D}{DZ}",
                  new String[] { "{B+D}{DZ}",
                                 "{B+D+DZ}" }); // we're conservative.
@@ -6984,7 +6990,7 @@ tstHelper("ZUR");
        shelp("DD]",
              "Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");

        shelp("///NYA", "Offset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
        shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n");
        shelp("/NYA/", "");
        shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", "");
        shelp("[LS][# A [[[[[COMMENT][LS]",
@@ -7029,14 +7035,26 @@ tstHelper("ZUR");
        shelp("?", "", "[QUESTION:{?}]");
        shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n");
        shelp("[* Correction with []]",
              "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 15: Found an illegal character, i, with ordinal 105.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");
              "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n");

        // DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter.

        // DLC FIXME: @0B1 isn't handled correctly!

        shelp(",NGES ? PA", "", "[TIBETAN_PUNCTUATION:{,}, TIBETAN_NON_PUNCTUATION:{NGES}, TIBETAN_PUNCTUATION:{ }, QUESTION:{?}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{PA}]");
        shelp("K\\,", "", "[TIBETAN_NON_PUNCTUATION:{K\\}, TIBETAN_PUNCTUATION:{,}]");

        // FIXME: just until we treat viramas correctly:
        if (false) {
            uhelp("1\\", "\u0f21\u0f84");
            uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b");
        }
        shelp("K\\,",
              "Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
              "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]");

        shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]");
        shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]");
        shelp("......,DAM ",
@@ -7078,8 +7096,70 @@ tstHelper("ZUR");

        shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT
        shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT
        shelp("//NYA\\\\",
              "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n",
              "[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]");

    }
    private static void uhelp(String acip) {
        uhelp(acip, null);
    }
    private static void uhelp(String acip, String expectedUnicode) {
        StringBuffer errors = new StringBuffer();
        String unicode = ACIPConverter.convertToUnicode(acip, errors);
        if (null == unicode) {
            if (null != expectedUnicode && "none" != expectedUnicode) {
                System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
                assertTrue(false);
            }
            System.out.println("DLC: Unicode for " + acip + " can't be had; errors are " + errors);
        } else {
            if (null != expectedUnicode && !expectedUnicode.equals(unicode)) {
                System.out.println("The unicode for " + acip + " is " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(unicode) + ", but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode));
                assertTrue(false);
            }
        }
    }

    public void testACIPConversion() {
        uhelp("G+DHA", "\u0f42\u0fa2");
        uhelp("P'EE", "\u0f54\u0f71\u0f7b");

        uhelp("KA", "\u0f40");
        uhelp("KI", "\u0f40\u0f72");
        uhelp("KO", "\u0f40\u0f7c");
        uhelp("KE", "\u0f40\u0f7a");
        uhelp("KU", "\u0f40\u0f74");
        uhelp("KOO", "\u0f40\u0f7d");
        uhelp("KEE", "\u0f40\u0f7b");
        uhelp("KEEm", "\u0f40\u0f7b\u0f7e");
        uhelp("KEEm:", "\u0f40\u0f7b\u0f7e\u0f7f");
        uhelp("KEE:", "\u0f40\u0f7b\u0f7f");

        uhelp("K'I", "\u0f40\u0f71\u0f72");
        uhelp("K'O", "\u0f40\u0f71\u0f7c");
        uhelp("K'E", "\u0f40\u0f71\u0f7a");
        uhelp("K'U", "\u0f40\u0f71\u0f74");
        uhelp("K'OO", "\u0f40\u0f71\u0f7d");
        uhelp("K'EE", "\u0f40\u0f71\u0f7b");
        uhelp("K'EEm", "\u0f40\u0f71\u0f7b\u0f7e");
        tstHelper("K'EEm:", "{K'EEm:}",
                  new String[] { "{K'EEm:}" },
                  new String[] { },
                  "{K'EEm:}");
        uhelp("K'EEm:", "\u0f40\u0f71\u0f7b\u0f7e\u0f7f");
        uhelp("K'EE:", "\u0f40\u0f71\u0f7b\u0f7f");

        uhelp("K'A:", "\u0f40\u0f71\u0f7f");

        // DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make
        // text go from 24-point to 18-point. Thus, ACIP->Unicode.txt
        // is fundamentally flawed, whereas ACIP->Unicode.rtf is OK.

        uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D");
        uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d");
        uhelp("*#HUm: K+DHA GRO`;.,", "none");
    }

    /** Tests some more tsheg bars, these from Dr. Lacey's critical
        edition of Mahavyutpatti.
@@ -167,4 +167,19 @@ class TPair {
        if (null == rightWylie) rightWylie = "";
        return leftWylie + rightWylie;
    }

    /** Appends legal Unicode corresponding to this (possible
     * subscribed) pair to sb.  DLC FIXME: which normalization form,
     * if any? */
    void getUnicode(StringBuffer sb, boolean subscribed) {
        if (null != getLeft()) {
            String x = ACIPRules.getUnicodeFor(getLeft(), subscribed);
            if (null != x) sb.append(x);
        }
        if (null != getRight()
            && !("-".equals(getRight()) || "A".equals(getRight()))) {
            String x = ACIPRules.getUnicodeFor(getRight(), subscribed);
            if (null != x) sb.append(x);
        }
    }
}
@@ -603,5 +603,16 @@ class TPairList {
            }
        }
    }

    /** Appends legal Unicode corresponding to this stack to sb.  DLC
     * FIXME: which normalization form, if any? */
    void getUnicode(StringBuffer sb) {
        boolean subscribed = false;
        for (int i = 0; i < size(); i++) {
            get(i).getUnicode(sb, subscribed);
            subscribed = true;
        }
    }

}
// DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx.
@@ -205,6 +205,15 @@ class TStackList {
            throw new IllegalArgumentException("opl (" + opl + ") is bad for this stack list (" + toString() + ")");
        return false;
    }

    /** Returns legal Unicode corresponding to this tsheg bar.  DLC FIXME: which normalization form, if any? */
    String getUnicode() {
        StringBuffer u = new StringBuffer(size());
        for (int i = 0; i < size(); i++) {
            get(i).getUnicode(u);
        }
        return u.toString();
    }
}

class BoolPair {
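Taken together, the getUnicode hooks in TPair, TPairList, and TStackList complete the chain that ACIPConverter.convertToUnicode walks for every Tibetan tsheg bar. In outline (a sketch that just restates the calls ACIPConverter already makes, with a placeholder syllable):

    // Per-tsheg-bar pipeline used by ACIPConverter.convertToUnicode.
    TPairList pl = TPairListFactory.breakACIPIntoChunks("BSGRIBS");  // placeholder syllable
    if (pl.getACIPError() == null) {
        TParseTree pt = pl.getParseTree();                        // candidate stackings
        TStackList sl = (null == pt) ? null : pt.getBestParse();  // the one legal parse, if any
        if (null != sl) {
            String unicode = sl.getUnicode();  // TStackList -> TPairList -> TPair -> ACIPRules
        }
    }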