diff --git a/build.xml b/build.xml index 5da41fb..723f596 100644 --- a/build.xml +++ b/build.xml @@ -318,13 +318,18 @@ Contributor(s): ______________________________________. - + + + + + = 'a' && cp <= 'z') + || (cp >= 'A' && cp <= 'Z') + || (cp >= '0' && cp <= '9') + || cp == '.' + || cp == ',' + || cp == ' ' + || cp == '\'' + || cp == '"' + || cp == '+' + || cp == '-' + || cp == '=' + || cp == '_' + || cp == '@' + || cp == '!' + || cp == '#' + || cp == '$' + || cp == '%' + || cp == '^' + || cp == '&' + || cp == '*' + || cp == '\t' + || cp == ':' + || cp == '[' + || cp == ']' + || cp == '(' + || cp == ')' + || cp == '{' + || cp == '}') + return new String(new char[] { cp }); + } if (cp < '\u0010') return "\\u000" + Integer.toHexString((int)cp); else if (cp < '\u0100') @@ -304,7 +337,19 @@ public class UnicodeUtils implements UnicodeConstants { public static String unicodeStringToString(String s) { StringBuffer sb = new StringBuffer(s.length() * 6); for (int i = 0; i < s.length(); i++) { - sb.append(unicodeCodepointToString(s.charAt(i))); + sb.append(unicodeCodepointToString(s.charAt(i), false)); + } + return sb.toString(); + } + + /** + * Returns the most succinct possible, human-readable, ASCII form + * of the String s of Unicode codepoints. */ + public static String unicodeStringToPrettyString(String s) { + if (s == null) return "null"; + StringBuffer sb = new StringBuffer(s.length() * 6); + for (int i = 0; i < s.length(); i++) { + sb.append(unicodeCodepointToString(s.charAt(i), true)); } return sb.toString(); } diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java index 309869d..ae987f6 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeUtilsTest.java @@ -321,15 +321,15 @@ public class UnicodeUtilsTest extends TestCase implements UnicodeConstants { * Tests the {@link UnicodeUtils#unicodeCodepointToString(char)} * method. 
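// A quick sketch (not part of this patch) of what the new prettyPrint path buys
// at the String level. UnicodeUtils is public, so this compiles anywhere; the
// class name PrettyDemo is hypothetical.
import org.thdl.tib.text.tshegbar.UnicodeUtils;

public class PrettyDemo {
    public static void main(String[] args) {
        // Old behavior: every codepoint is escaped.
        System.out.println(UnicodeUtils.unicodeStringToString("K\u0F40"));
        // prints \u004b\u0f40

        // New pretty form: printable ASCII such as 'K' stays literal, while the
        // Tibetan letter KA is still escaped.
        System.out.println(UnicodeUtils.unicodeStringToPrettyString("K\u0F40"));
        // prints K\u0f40
    }
}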
*/ public void testUnicodeCodepointToString() { - assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000').equals("\\u0000")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001').equals("\\u0001")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F').equals("\\u000f")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F').equals("\\u001f")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF').equals("\\u00ff")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF').equals("\\u01ff")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF').equals("\\u0fff")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF').equals("\\u1fff")); - assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF').equals("\\uffff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u0000', false).equals("\\u0000")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u0001', false).equals("\\u0001")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u000F', false).equals("\\u000f")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u001F', false).equals("\\u001f")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u00fF', false).equals("\\u00ff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u01fF', false).equals("\\u01ff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u0ffF', false).equals("\\u0fff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\u1ffF', false).equals("\\u1fff")); + assertTrue(UnicodeUtils.unicodeCodepointToString('\ufffF', false).equals("\\uffff")); } /** diff --git a/source/org/thdl/tib/text/ttt/ACIPConverter.java b/source/org/thdl/tib/text/ttt/ACIPConverter.java new file mode 100644 index 0000000..03c40fa --- /dev/null +++ b/source/org/thdl/tib/text/ttt/ACIPConverter.java @@ -0,0 +1,208 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). Portions created by the THDL are Copyright 2003 THDL. +All Rights Reserved. + +Contributor(s): ______________________________________. +*/ + +package org.thdl.tib.text.ttt; + +import java.io.*; +import java.util.ArrayList; +import java.util.Stack; + +import org.thdl.util.ThdlDebug; +import org.thdl.util.ThdlOptions; + +/** +* This class is able to convert an ACIP file into Tibetan Machine Web. +* From there, TMW->Unicode takes you to Unicode. +* @author David Chandler +*/ +public class ACIPConverter { + static { + // We don't want to load the TM or TMW font files ourselves: + ThdlOptions.setUserPreference("thdl.rely.on.system.tmw.fonts", true); + ThdlOptions.setUserPreference("thdl.rely.on.system.tm.fonts", true); + ThdlOptions.setUserPreference("thdl.debug", true); + } + + /** Command-line converter. Gives error messages on standard + * output about why we can't convert the document perfectly and + * exits with non-zero return code, or is silent otherwise and + * exits with code zero.
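// A minimal programmatic sketch of the same conversion the command line above
// performs, using the public String-based wrapper defined later in this file.
// The class name AcipToUnicodeDemo and the sample input are hypothetical.
import org.thdl.tib.text.ttt.ACIPConverter;

public class AcipToUnicodeDemo {
    public static void main(String[] args) {
        StringBuffer errors = new StringBuffer();
        // Returns the Unicode text, or null if scanning/conversion failed.
        String unicode = ACIPConverter.convertToUnicode("KA GA,", errors);
        if (unicode == null) {
            System.err.println("Could not convert:\n" + errors);
        } else {
            System.out.println(unicode);
        }
    }
}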

FIXME: not so efficient; copies the + * whole file into memory first. */ + public static void main(String[] args) + throws IOException // DLC FIXME: give nice error messages + { + boolean verbose = true; + boolean strict = true; + if (args.length != 2 + || (!(strict = "--strict".equals(args[0])) && !"--lenient".equals(args[0]))) { + System.err.println("Bad args! Need '--strict filename' or '--lenient filename'."); + System.exit(1); + } + StringBuffer errors = new StringBuffer(); + int maxErrors = 250; + ArrayList al = ACIPTshegBarScanner.scanFile(args[1], errors, strict, maxErrors - 1); + + if (null == al) { + System.err.println(maxErrors + " or more errors occurred while scanning ACIP input file; is this"); + System.err.println("Tibetan or English input?"); + System.err.println(""); + System.err.println("First " + maxErrors + " errors scanning ACIP input file: "); + System.err.println(errors); + System.err.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again."); + System.exit(1); + } + if (errors.length() > 0) { + System.err.println("Errors scanning ACIP input file: "); + System.err.println(errors); + System.err.println("Exiting; please fix input file and try again."); + System.exit(1); + } + + convertToUnicode(al, System.out, errors); + if (errors.length() > 0) { + System.err.println("Errors converting ACIP input file: "); + System.err.println(errors); + System.err.println("Exiting; please fix input file and try again."); + System.exit(2); + } + if (verbose) System.err.println("Converted " + args[1] + " perfectly."); + System.exit(0); + } + + /** Writes TMW/Latin to out. If errors occur in converting a + * tsheg bar, then they are appended to errors if errors is + * non-null. Returns true upon perfect success, false if errors + * occurred. + * @throws IOException if we cannot write to out + */ + public static boolean convertToTMW(ArrayList scan, String latinFont, + OutputStream out, StringBuffer errors) + throws IOException + { + throw new Error("DLC UNIMPLEMENTED"); + } + + /** Returns UTF-8 encoded Unicode. A bit indirect, so use this + * for testing only if performance is a concern. If errors occur + * in scanning the ACIP or in converting a tsheg bar, then they + * are appended to errors if errors is non-null. Returns the + * conversion upon perfect success, null if errors occurred. + */ + public static String convertToUnicode(String acip, + StringBuffer errors) { + ByteArrayOutputStream sw = new ByteArrayOutputStream(); + ArrayList al = ACIPTshegBarScanner.scan(acip, errors, true /* DLC FIXME */, -1); + try { + if (null != al && convertToUnicode(al, sw, errors)) { + return sw.toString("UTF-8"); + } else { + System.out.println("DLC al is " + al + " and convertToUnicode returned null."); + return null; + } + } catch (Exception e) { + throw new Error(e.toString()); + } + } + + /** Writes Unicode to out. If errors occur in converting a + * tsheg bar, then they are appended to errors if errors is + * non-null. Returns true upon perfect success, false if errors + * occurred. 
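// Sketch of driving the stream-based variant directly (assumed to live in
// package org.thdl.tib.text.ttt so that ACIPTshegBarScanner.scan(...) is
// accessible; the flags passed to scan mirror the String-based wrapper above).
package org.thdl.tib.text.ttt;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;

class StreamConversionSketch {
    public static void main(String[] args) throws IOException {
        StringBuffer errors = new StringBuffer();
        ArrayList scan = ACIPTshegBarScanner.scan("KA ,", errors, true, -1);
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        if (scan != null && ACIPConverter.convertToUnicode(scan, out, errors)) {
            System.out.println(out.toString("UTF-8")); // UTF-8-encoded Unicode
        } else {
            System.err.println("Problems:\n" + errors);
        }
    }
}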
+ * @throws IOException if we cannot write to out + */ + public static boolean convertToUnicode(ArrayList scan, + OutputStream out, + StringBuffer errors) + throws IOException + { + int sz = scan.size(); + boolean hasErrors = false; + BufferedWriter writer + = new BufferedWriter(new OutputStreamWriter(out, "UTF-8")); + for (int i = 0; i < sz; i++) { + ACIPString s = (ACIPString)scan.get(i); + int stype = s.getType(); + if (stype == ACIPString.ERROR) { + hasErrors = true; + writer.write("[#ERROR CONVERTING ACIP DOCUMENT: "); + writer.write(s.getText()); + writer.write("]"); + } else { + // DLC FIXME: what about 'no A on root stack' and 'no A on such-and-such stack' warnings? + if (s.isLatin(stype)) { + if (stype == ACIPString.FOLIO_MARKER) + writer.write("{"); + writer.write(s.getText()); + if (stype == ACIPString.FOLIO_MARKER) + writer.write("}"); + } else { + String unicode = null; + if (stype == ACIPString.TIBETAN_NON_PUNCTUATION) { + TPairList pl = TPairListFactory.breakACIPIntoChunks(s.getText()); + String acipError; + + if ((acipError = pl.getACIPError()) != null) { + hasErrors = true; + String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS THESE ERRORS: " + acipError + "]"; + writer.write(errorMessage); + if (null != errors) + errors.append(errorMessage + "\n"); + } else { + TParseTree pt = pl.getParseTree(); + if (null == pt) { + hasErrors = true; + String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " IS ESSENTIALLY NOTHING.]"; + writer.write(errorMessage); + if (null != errors) + errors.append(errorMessage + "\n"); + } else { + TStackList sl = pt.getBestParse(); + if (null == sl) { + hasErrors = true; + String errorMessage = "[#ERROR CONVERTING ACIP DOCUMENT: THE TSHEG BAR (\"SYLLABLE\") " + s.getText() + " HAS NO LEGAL PARSES.]"; + writer.write(errorMessage); + if (null != errors) + errors.append(errorMessage + "\n"); + } else { + unicode = sl.getUnicode(); + if (null == unicode) throw new Error("DLC: HOW?"); + } + } + } + } else { + if (stype == ACIPString.START_SLASH) + unicode = "\u0F3C"; + else if (stype == ACIPString.END_SLASH) + unicode = "\u0F3D"; + else + unicode = ACIPRules.getUnicodeFor(s.getText(), false); + if (null == unicode) throw new Error("DLC: HOW?"); + } + if (null != unicode) { + writer.write(unicode); + } + } + } + } + writer.close(); + return !hasErrors; + } +} +// DLC FIXME: putting Tibetan in black, Sanskrit in green, and Latin +// in yellow would help you quickly decide if ZHIGN maybe should've +// been ZHING. diff --git a/source/org/thdl/tib/text/ttt/ACIPRules.java b/source/org/thdl/tib/text/ttt/ACIPRules.java index 2d1db42..bcdbca8 100644 --- a/source/org/thdl/tib/text/ttt/ACIPRules.java +++ b/source/org/thdl/tib/text/ttt/ACIPRules.java @@ -28,9 +28,9 @@ class ACIPRules { * three. */ public static int MAX_CONSONANT_LENGTH = 3; - /** {'im:}, the longest "vowel", has 4 characters, so this is - * four. */ - public static int MAX_VOWEL_LENGTH = 4; + /** {'EEm:}, the longest "vowel", has 5 characters, so this is + * five. */ + public static int MAX_VOWEL_LENGTH = 5; /** For O(1) {@link #isVowel(String)} calls. 
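// Sketch of how MAX_VOWEL_LENGTH and isVowel(String) would typically cooperate
// in a scanner: try the longest candidate first, since "'EEm:" (5 characters)
// is itself a single "vowel". Assumed to sit in package org.thdl.tib.text.ttt
// because ACIPRules is package-private; this is not the actual scanner code.
package org.thdl.tib.text.ttt;

class VowelScanSketch {
    /** Returns the longest ACIP "vowel" starting at offset, or null if none. */
    static String longestVowelAt(String acip, int offset) {
        int max = Math.min(ACIPRules.MAX_VOWEL_LENGTH, acip.length() - offset);
        for (int len = max; len >= 1; len--) {
            String candidate = acip.substring(offset, offset + len);
            if (ACIPRules.isVowel(candidate)) return candidate;
        }
        return null;
    }

    public static void main(String[] args) {
        System.out.println(longestVowelAt("K'EEm:", 1)); // prints 'EEm:
    }
}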
*/ private static HashSet acipVowels = null; @@ -42,18 +42,9 @@ class ACIPRules { { "U", "u" }, { "E", "e" }, { "O", "o" }, - { "'I", "I" }, - { "'U", "U" }, { "EE", "ai" }, { "OO", "au" }, - { "i", "-i" }, - { "'i", "-I" }, - { "'A", "A" }, - { "'O", "Ao" }, - { "'E", "Ae" } - // DLC I'm on my own with 'O and 'E, but GANG'O appears - // and I wonder... so here are 'O and 'E. It's - // consistent with 'I and 'A and 'U, at least. + { "i", "-i" } }; /** Returns true if and only if s is an ACIP "vowel". You can't @@ -61,14 +52,24 @@ class ACIPRules { * ACIP, so you have to call this in the right context. */ public static boolean isVowel(String s) { if (null == acipVowels) { - acipVowels = new HashSet(); + acipVowels = new HashSet(baseVowels.length * 8); for (int i = 0; i < baseVowels.length; i++) { - acipVowels.add(baseVowels[i][0]); - acipVowels.add(baseVowels[i][0] + 'm'); - acipVowels.add(baseVowels[i][0] + ':'); - acipVowels.add(baseVowels[i][0] + "m:"); - // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not? + // DLC I'm on my own with 'O and 'E and 'OO and 'EE, but + // GANG'O appears and I wonder... so here they are. It's + // consistent with 'I and 'A and 'U, at least: all the vowels + // may appear as K'vowel. + acipVowels.add(baseVowels[i][0]); + acipVowels.add('\'' + baseVowels[i][0]); + acipVowels.add(baseVowels[i][0] + 'm'); + acipVowels.add('\'' + baseVowels[i][0] + 'm'); + acipVowels.add(baseVowels[i][0] + ':'); + acipVowels.add('\'' + baseVowels[i][0] + ':'); + acipVowels.add(baseVowels[i][0] + "m:"); + acipVowels.add('\'' + baseVowels[i][0] + "m:"); + // DLC keep this code in sync with getUnicodeFor. + + // DLC '\' for visarga? how shall we do \ the visarga? like a vowel or not? } } return (acipVowels.contains(s)); @@ -204,4 +205,212 @@ class ACIPRules { } return (String)acipVowel2wylie.get(acip); } + + private static HashMap superACIP2unicode = null; + private static HashMap subACIP2unicode = null; + /** If acip is an ACIP consonant or vowel or punctuation mark, + * then this returns the Unicode for it. The Unicode for the + * subscribed form of the glyph is returned if subscribed is + * true. Returns null if acip is unknown. 
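// Usage sketch for getUnicodeFor (assumed to run inside package
// org.thdl.tib.text.ttt, since ACIPRules and this method are package-private):
package org.thdl.tib.text.ttt;

class GetUnicodeForSketch {
    public static void main(String[] args) {
        System.out.println(ACIPRules.getUnicodeFor("DH", false)); // "\u0F52", full-form DHA
        System.out.println(ACIPRules.getUnicodeFor("DH", true));  // "\u0FA2", subjoined DHA
        System.out.println(ACIPRules.getUnicodeFor("xyz", false)); // null: not an ACIP token
    }
}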
*/ + static String getUnicodeFor(String acip, boolean subscribed) { + if (superACIP2unicode == null) { + superACIP2unicode = new HashMap(144); + subACIP2unicode = new HashMap(42); + + // oddball: + subACIP2unicode.put("V", "\u0FAD"); + + superACIP2unicode.put("DH", "\u0F52"); + subACIP2unicode.put("DH", "\u0FA2"); + superACIP2unicode.put("BH", "\u0F57"); + subACIP2unicode.put("BH", "\u0FA7"); + superACIP2unicode.put("dH", "\u0F4D"); + subACIP2unicode.put("dH", "\u0F9D"); + superACIP2unicode.put("DZH", "\u0F5C"); + subACIP2unicode.put("DZH", "\u0FAC"); + superACIP2unicode.put("Ksh", "\u0F69"); + subACIP2unicode.put("Ksh", "\u0FB9"); + superACIP2unicode.put("GH", "\u0F43"); + subACIP2unicode.put("GH", "\u0F93"); + superACIP2unicode.put("K", "\u0F40"); + subACIP2unicode.put("K", "\u0F90"); + superACIP2unicode.put("KH", "\u0F41"); + subACIP2unicode.put("KH", "\u0F91"); + superACIP2unicode.put("G", "\u0F42"); + subACIP2unicode.put("G", "\u0F92"); + superACIP2unicode.put("NG", "\u0F44"); + subACIP2unicode.put("NG", "\u0F94"); + superACIP2unicode.put("C", "\u0F45"); + subACIP2unicode.put("C", "\u0F95"); + superACIP2unicode.put("CH", "\u0F46"); + subACIP2unicode.put("CH", "\u0F96"); + superACIP2unicode.put("J", "\u0F47"); + subACIP2unicode.put("J", "\u0F97"); + superACIP2unicode.put("NY", "\u0F49"); + subACIP2unicode.put("NY", "\u0F99"); + superACIP2unicode.put("T", "\u0F4F"); + subACIP2unicode.put("T", "\u0F9F"); + superACIP2unicode.put("TH", "\u0F50"); + subACIP2unicode.put("TH", "\u0FA0"); + superACIP2unicode.put("D", "\u0F51"); + subACIP2unicode.put("D", "\u0FA1"); + superACIP2unicode.put("N", "\u0F53"); + subACIP2unicode.put("N", "\u0FA3"); + superACIP2unicode.put("P", "\u0F54"); + subACIP2unicode.put("P", "\u0FA4"); + superACIP2unicode.put("PH", "\u0F55"); + subACIP2unicode.put("PH", "\u0FA5"); + superACIP2unicode.put("B", "\u0F56"); + subACIP2unicode.put("B", "\u0FA6"); + superACIP2unicode.put("M", "\u0F58"); + subACIP2unicode.put("M", "\u0FA8"); + superACIP2unicode.put("TZ", "\u0F59"); + subACIP2unicode.put("TZ", "\u0FA9"); + superACIP2unicode.put("TS", "\u0F5A"); + subACIP2unicode.put("TS", "\u0FAA"); + superACIP2unicode.put("DZ", "\u0F5B"); + subACIP2unicode.put("DZ", "\u0FAB"); + superACIP2unicode.put("W", "\u0F5D"); + subACIP2unicode.put("W", "\u0FBA"); // oddball + superACIP2unicode.put("ZH", "\u0F5E"); + subACIP2unicode.put("ZH", "\u0FAE"); + superACIP2unicode.put("Z", "\u0F5F"); + subACIP2unicode.put("Z", "\u0FAF"); + superACIP2unicode.put("'", "\u0F60"); + subACIP2unicode.put("'", "\u0FB0"); + superACIP2unicode.put("Y", "\u0F61"); + subACIP2unicode.put("Y", "\u0FB1"); + superACIP2unicode.put("R", "\u0F62"); + subACIP2unicode.put("R", "\u0FB2"); + superACIP2unicode.put("L", "\u0F63"); + subACIP2unicode.put("L", "\u0FB3"); + superACIP2unicode.put("SH", "\u0F64"); + subACIP2unicode.put("SH", "\u0FB4"); + superACIP2unicode.put("S", "\u0F66"); + subACIP2unicode.put("S", "\u0FB6"); + superACIP2unicode.put("H", "\u0F67"); + subACIP2unicode.put("H", "\u0FB7"); + superACIP2unicode.put("A", "\u0F68"); + subACIP2unicode.put("A", "\u0FB8"); + superACIP2unicode.put("t", "\u0F4A"); + subACIP2unicode.put("t", "\u0F9A"); + superACIP2unicode.put("th", "\u0F4B"); + subACIP2unicode.put("th", "\u0F9B"); + superACIP2unicode.put("d", "\u0F4C"); + subACIP2unicode.put("d", "\u0F9C"); + superACIP2unicode.put("n", "\u0F4E"); + subACIP2unicode.put("n", "\u0F9E"); + superACIP2unicode.put("sh", "\u0F65"); + subACIP2unicode.put("sh", "\u0FB5"); + + superACIP2unicode.put("I", "\u0F72"); + 
superACIP2unicode.put("E", "\u0F7A"); + superACIP2unicode.put("O", "\u0F7C"); + superACIP2unicode.put("U", "\u0F74"); + superACIP2unicode.put("OO", "\u0F7D"); + superACIP2unicode.put("EE", "\u0F7B"); + superACIP2unicode.put("i", "\u0F80"); + superACIP2unicode.put("'A", "\u0F71"); + superACIP2unicode.put("'I", "\u0F71\u0F72"); + superACIP2unicode.put("'E", "\u0F71\u0F7A"); + superACIP2unicode.put("'O", "\u0F71\u0F7C"); + superACIP2unicode.put("'U", "\u0F71\u0F74"); + superACIP2unicode.put("'OO", "\u0F71\u0F7D"); + superACIP2unicode.put("'EE", "\u0F71\u0F7B"); + superACIP2unicode.put("'i", "\u0F71\u0F80"); + + superACIP2unicode.put("Im", "\u0F72\u0F7E"); + superACIP2unicode.put("Em", "\u0F7A\u0F7E"); + superACIP2unicode.put("Om", "\u0F7C\u0F7E"); + superACIP2unicode.put("Um", "\u0F74\u0F7E"); + superACIP2unicode.put("OOm", "\u0F7D\u0F7E"); + superACIP2unicode.put("EEm", "\u0F7B\u0F7E"); + superACIP2unicode.put("im", "\u0F80\u0F7E"); + superACIP2unicode.put("'Am", "\u0F71\u0F7E"); + superACIP2unicode.put("'Im", "\u0F71\u0F72\u0F7E"); + superACIP2unicode.put("'Em", "\u0F71\u0F7A\u0F7E"); + superACIP2unicode.put("'Om", "\u0F71\u0F7C\u0F7E"); + superACIP2unicode.put("'Um", "\u0F71\u0F74\u0F7E"); + superACIP2unicode.put("'OOm", "\u0F71\u0F7D\u0F7E"); + superACIP2unicode.put("'EEm", "\u0F71\u0F7B\u0F7E"); + superACIP2unicode.put("'im", "\u0F71\u0F80\u0F7E"); + + superACIP2unicode.put("I:", "\u0F72\u0F7F"); + superACIP2unicode.put("E:", "\u0F7A\u0F7F"); + superACIP2unicode.put("O:", "\u0F7C\u0F7F"); + superACIP2unicode.put("U:", "\u0F74\u0F7F"); + superACIP2unicode.put("OO:", "\u0F7D\u0F7F"); + superACIP2unicode.put("EE:", "\u0F7B\u0F7F"); + superACIP2unicode.put("i:", "\u0F80\u0F7F"); + superACIP2unicode.put("'A:", "\u0F71\u0F7F"); + superACIP2unicode.put("'I:", "\u0F71\u0F72\u0F7F"); + superACIP2unicode.put("'E:", "\u0F71\u0F7A\u0F7F"); + superACIP2unicode.put("'O:", "\u0F71\u0F7C\u0F7F"); + superACIP2unicode.put("'U:", "\u0F71\u0F74\u0F7F"); + superACIP2unicode.put("'OO:", "\u0F71\u0F7D\u0F7F"); + superACIP2unicode.put("'EE:", "\u0F71\u0F7B\u0F7F"); + superACIP2unicode.put("'i:", "\u0F71\u0F80\u0F7F"); + + superACIP2unicode.put("Im:", "\u0F72\u0F7E\u0F7F"); + superACIP2unicode.put("Em:", "\u0F7A\u0F7E\u0F7F"); + superACIP2unicode.put("Om:", "\u0F7C\u0F7E\u0F7F"); + superACIP2unicode.put("Um:", "\u0F74\u0F7E\u0F7F"); + superACIP2unicode.put("OOm:", "\u0F7D\u0F7E\u0F7F"); + superACIP2unicode.put("EEm:", "\u0F7B\u0F7E\u0F7F"); + superACIP2unicode.put("im:", "\u0F80\u0F7E\u0F7F"); + superACIP2unicode.put("'Am:", "\u0F71\u0F7E\u0F7F"); + superACIP2unicode.put("'Im:", "\u0F71\u0F72\u0F7E\u0F7F"); + superACIP2unicode.put("'Em:", "\u0F71\u0F7A\u0F7E\u0F7F"); + superACIP2unicode.put("'Om:", "\u0F71\u0F7C\u0F7E\u0F7F"); + superACIP2unicode.put("'Um:", "\u0F71\u0F74\u0F7E\u0F7F"); + superACIP2unicode.put("'OOm:", "\u0F71\u0F7D\u0F7E\u0F7F"); + superACIP2unicode.put("'EEm:", "\u0F71\u0F7B\u0F7E\u0F7F"); + superACIP2unicode.put("'im:", "\u0F71\u0F80\u0F7E\u0F7F"); + // :m does not appear, though you'd think it's as valid as m:. 
+ + // I doubt these will occur alone: + superACIP2unicode.put("m", "\u0F7E"); + superACIP2unicode.put(":", "\u0F7F"); + + superACIP2unicode.put("Am", "\u0F7E"); + superACIP2unicode.put("A:", "\u0F7F"); + + superACIP2unicode.put("0", "\u0F20"); + superACIP2unicode.put("1", "\u0F21"); + superACIP2unicode.put("2", "\u0F22"); + superACIP2unicode.put("3", "\u0F23"); + superACIP2unicode.put("4", "\u0F24"); + superACIP2unicode.put("5", "\u0F25"); + superACIP2unicode.put("6", "\u0F26"); + superACIP2unicode.put("7", "\u0F27"); + superACIP2unicode.put("8", "\u0F28"); + superACIP2unicode.put("9", "\u0F29"); + + // DLC punctuation + superACIP2unicode.put("&", "\u0F85"); + superACIP2unicode.put(",", "\u0F0D"); + superACIP2unicode.put(" ", "\u0F0B"); + superACIP2unicode.put(".", "\u0F0C"); + superACIP2unicode.put("`", "\u0F08"); + superACIP2unicode.put("`", "\u0F08"); + superACIP2unicode.put("*", "\u0F04\u0F05"); + superACIP2unicode.put("#", "\u0F04\u0F05\u0F05"); + superACIP2unicode.put("%", "\u0F35"); + superACIP2unicode.put(";", "\u0F11"); + superACIP2unicode.put("\r", "\r"); + superACIP2unicode.put("\t", "\t"); + superACIP2unicode.put("\n", "\n"); + superACIP2unicode.put("\\", "\u0F84"); // DLC FIXME: make this like a vowel + // DLC FIXME: what's the Unicode for caret, ^? + // DLC FIXME: what's the Unicode for o? + // DLC FIXME: what's the Unicode for x? + + } + if (subscribed) { + String u = (String)subACIP2unicode.get(acip); + if (null != u) return u; + } + return (String)superACIP2unicode.get(acip); + + } } diff --git a/source/org/thdl/tib/text/ttt/ACIPString.java b/source/org/thdl/tib/text/ttt/ACIPString.java index f05c0b5..ef404fe 100644 --- a/source/org/thdl/tib/text/ttt/ACIPString.java +++ b/source/org/thdl/tib/text/ttt/ACIPString.java @@ -30,6 +30,15 @@ public class ACIPString { private int type; private String text; + /** Returns true if and only if an ACIPString with type type is to + * be converted to Latin, not Tibetan, text. */ + public static boolean isLatin(int type) { + return (type != TIBETAN_NON_PUNCTUATION + && type != TIBETAN_PUNCTUATION + && type != START_SLASH + && type != END_SLASH); + } + /** For [#COMMENTS] */ public static final int COMMENT = 0; /** For Folio markers like @012B */ diff --git a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java index 8048a12..2879683 100644 --- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java @@ -57,7 +57,6 @@ public class ACIPTshegBarScanner { System.out.println(errors); System.out.println("Exiting with " + maxErrors + " or more errors; please fix input file and try again."); System.exit(1); - } else { } if (errors.length() > 0) { System.out.println("Errors scanning ACIP input file: "); @@ -90,6 +89,7 @@ public class ACIPTshegBarScanner { while (-1 != (amt = in.read(ch))) { s.append(ch, 0, amt); } + in.close(); return scan(s.toString(), errors, !strict, maxErrors); } @@ -621,6 +621,18 @@ public class ACIPTshegBarScanner { } if (startSlashIndex >= 0) { + if (startSlashIndex + 1 == i) { + /* //NYA\\ appears in ACIP input, and I think + * it means /NYA/. We warn about // for this + * reason. \\ causes a tsheg-bar error (DLC + * FIXME: verify this is so). 
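// Side note on the isLatin(int) predicate added to ACIPString above -- a small
// sketch (assumed to live in package org.thdl.tib.text.ttt) of how it splits
// token types between pass-through Latin text and Tibetan output:
package org.thdl.tib.text.ttt;

class IsLatinSketch {
    public static void main(String[] args) {
        // [#COMMENT] text stays Latin:
        System.out.println(ACIPString.isLatin(ACIPString.COMMENT));                 // true
        // Ordinary tsheg bars get converted to Tibetan:
        System.out.println(ACIPString.isLatin(ACIPString.TIBETAN_NON_PUNCTUATION)); // false
    }
}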
*/ + al.add(new ACIPString("//", ACIPString.ERROR)); + if (errors != null) { + errors.append("Offset " + i + ": " + + "Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\n"); + } + if (maxErrors >= 0 && ++numErrors >= maxErrors) return null; + } al.add(new ACIPString(s.substring(i, i+1), ACIPString.END_SLASH)); startOfString = i+1; @@ -766,6 +778,9 @@ public class ACIPTshegBarScanner { if ((int)ch == 65533) { errors.append("Offset " + i + ": " + "Found an illegal, unprintable character.\n"); + } else if ('\\' == ch) { + errors.append("Offset " + i + ": " + + "Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n"); } else { errors.append("Offset " + i + ": " + "Found an illegal character, " + ch + ", with ordinal " + (int)ch + ".\n"); @@ -849,7 +864,7 @@ public class ACIPTshegBarScanner { || ch == 'x' || ch == ':' || ch == '^' - || ch == '\\' + // DLC FIXME: we must treat this guy like a vowel, a special vowel that numerals can take on. Until then, warn. || ch == '\\' || ch == '-' || ch == '+' diff --git a/source/org/thdl/tib/text/ttt/PackageTest.java b/source/org/thdl/tib/text/ttt/PackageTest.java index 39b4155..b447da1 100644 --- a/source/org/thdl/tib/text/ttt/PackageTest.java +++ b/source/org/thdl/tib/text/ttt/PackageTest.java @@ -292,6 +292,12 @@ public class PackageTest extends TestCase { new String[] { "{SH}{LO}", "{SH+LO}" }, new String[] { "{SH+LO}" }); tstHelper("ZLUM", "{Z}{LU}{M}", new String[] { "{Z}{LU}{M}", "{Z+LU}{M}" }, new String[] { "{Z+LU}{M}" }); + tstHelper("K'EE", "{K'EE}"); + tstHelper("K'O", "{K'O}"); + tstHelper("K'OO", "{K'OO}"); + tstHelper("K'II", "{K'I}{I}"); + tstHelper("K'i", "{K'i}"); + tstHelper("K'A", "{K'A}"); tstHelper("B+DDZ", "{B+}{D}{DZ}", new String[] { "{B+D}{DZ}", "{B+D+DZ}" }); // we're conservative. @@ -6984,7 +6990,7 @@ tstHelper("ZUR"); shelp("DD]", "Offset 2: Found a truly unmatched close bracket, ] or }.\nOffset 2: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); - shelp("///NYA", "Offset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); + shelp("///NYA", "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset END: Slashes are supposed to occur in pairs, but the input had an unmatched '/' character.\n"); shelp("/NYA/", ""); shelp("[?][BP][LS][DD1][DD2][DDD][DR][# (<{A COMMENT)}>]", ""); shelp("[LS][# A [[[[[COMMENT][LS]", @@ -7029,14 +7035,26 @@ tstHelper("ZUR"); shelp("?", "", "[QUESTION:{?}]"); shelp("KHAN~ BAR ", "Offset 4: Found an illegal character, ~, with ordinal 126.\n"); shelp("[* Correction with []]", - "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 15: Found an illegal character, i, with ordinal 105.\nOffset 19: Found an illegal open bracket (in context, this is []]). 
Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); + "Offset 5: Found an illegal character, r, with ordinal 114.\nOffset 6: Found an illegal character, r, with ordinal 114.\nOffset 7: Found an illegal character, e, with ordinal 101.\nOffset 8: Found an illegal character, c, with ordinal 99.\nOffset 14: Found an illegal character, w, with ordinal 119.\nOffset 19: Found an illegal open bracket (in context, this is []]). Perhaps there is a [#COMMENT] written incorrectly as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], or an unmatched open bracket?\nOffset 21: Found a closing bracket without a matching open bracket. Perhaps a [#COMMENT] incorrectly written as [COMMENT], or a [*CORRECTION] written incorrectly as [CORRECTION], caused this.\n"); // DLC FIXME: the line SDIG PA'I GROGS PO'I LAG TU SON PAR 'GYUR PA is followed by a blank line. Note that it's "PA", not "PA ", ending it. Autocorrect to the latter. // DLC FIXME: @0B1 isn't handled correctly! shelp(",NGES ? PA", "", "[TIBETAN_PUNCTUATION:{,}, TIBETAN_NON_PUNCTUATION:{NGES}, TIBETAN_PUNCTUATION:{ }, QUESTION:{?}, TIBETAN_PUNCTUATION:{ }, TIBETAN_NON_PUNCTUATION:{PA}]"); - shelp("K\\,", "", "[TIBETAN_NON_PUNCTUATION:{K\\}, TIBETAN_PUNCTUATION:{,}]"); + + + + // FIXME: just until we treat viramas correctly: + if (false) { + uhelp("1\\", "\u0f21\u0f84"); + uhelp(" 1\\ ", "\u0f0b\u0f21\u0f84\u0f0b"); + } + shelp("K\\,", + "Offset 1: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\n", + "[TIBETAN_NON_PUNCTUATION:{K}, ERROR:{\\}, TIBETAN_PUNCTUATION:{,}]"); + + shelp("MTHAR%", "", "[TIBETAN_NON_PUNCTUATION:{MTHAR%}]"); shelp("PHYIR;", "", "[TIBETAN_NON_PUNCTUATION:{PHYIR}, TIBETAN_PUNCTUATION:{;}]"); shelp("......,DAM ", @@ -7078,8 +7096,70 @@ tstHelper("ZUR"); shelp("{ DD }", "", "[DD:{{ DD }}]"); // TD3790E2.ACT shelp("{ BP }", "", "[BP:{{ BP }}]"); // TD3790E2.ACT + shelp("//NYA\\\\", + "Offset 1: Found //, which could be legal (the Unicode would be \\u0F3C\\u0F3D), but is likely in an illegal construct like //NYA\\\\.\nOffset 5: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! Please do complain to the maintainers.\nOffset 6: Found a Sanskrit virama, \\, but the converter currently doesn't treat these properly. Sorry! 
Please do complain to the maintainers.\n", + "[START_SLASH:{/}, ERROR:{//}, END_SLASH:{/}, TIBETAN_NON_PUNCTUATION:{NYA}, ERROR:{\\}, ERROR:{\\}]"); } + private static void uhelp(String acip) { + uhelp(acip, null); + } + private static void uhelp(String acip, String expectedUnicode) { + StringBuffer errors = new StringBuffer(); + String unicode = ACIPConverter.convertToUnicode(acip, errors); + if (null == unicode) { + if (null != expectedUnicode && "none" != expectedUnicode) { + System.out.println("No unicode exists for " + acip + " but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); + assertTrue(false); + } + System.out.println("DLC: Unicode for " + acip + " can't be had; errors are " + errors); + } else { + if (null != expectedUnicode && !expectedUnicode.equals(unicode)) { + System.out.println("The unicode for " + acip + " is " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(unicode) + ", but you expected " + org.thdl.tib.text.tshegbar.UnicodeUtils.unicodeStringToPrettyString(expectedUnicode)); + assertTrue(false); + } + } + } + + public void testACIPConversion() { + uhelp("G+DHA", "\u0f42\u0fa2"); + uhelp("P'EE", "\u0f54\u0f71\u0f7b"); + + uhelp("KA", "\u0f40"); + uhelp("KI", "\u0f40\u0f72"); + uhelp("KO", "\u0f40\u0f7c"); + uhelp("KE", "\u0f40\u0f7a"); + uhelp("KU", "\u0f40\u0f74"); + uhelp("KOO", "\u0f40\u0f7d"); + uhelp("KEE", "\u0f40\u0f7b"); + uhelp("KEEm", "\u0f40\u0f7b\u0f7e"); + uhelp("KEEm:", "\u0f40\u0f7b\u0f7e\u0f7f"); + uhelp("KEE:", "\u0f40\u0f7b\u0f7f"); + + uhelp("K'I", "\u0f40\u0f71\u0f72"); + uhelp("K'O", "\u0f40\u0f71\u0f7c"); + uhelp("K'E", "\u0f40\u0f71\u0f7a"); + uhelp("K'U", "\u0f40\u0f71\u0f74"); + uhelp("K'OO", "\u0f40\u0f71\u0f7d"); + uhelp("K'EE", "\u0f40\u0f71\u0f7b"); + uhelp("K'EEm", "\u0f40\u0f71\u0f7b\u0f7e"); + tstHelper("K'EEm:", "{K'EEm:}", + new String[] { "{K'EEm:}" }, + new String[] { }, + "{K'EEm:}"); + uhelp("K'EEm:", "\u0f40\u0f71\u0f7b\u0f7e\u0f7f"); + uhelp("K'EE:", "\u0f40\u0f71\u0f7b\u0f7f"); + + uhelp("K'A:", "\u0f40\u0f71\u0f7f"); + + // DLC FIXME: in ACIP RTF files, (PARENTHESES) seem to make + // text go from 24-point to 18-point. Thus, ACIP->Unicode.txt + // is fundamentally flawed, whereas ACIP->Unicode.rtf is OK. + + uhelp("/NY'EE/", "\u0f3C\u0f49\u0F71\u0F7B\u0f3D"); + uhelp("*#HUm: G+DHOO GRO`;.,", "\u0f04\u0f05\u0f04\u0f05\u0f05\u0f67\u0f74\u0f7e\u0f7f\u0f0b\u0f42\u0fa2\u0f7d\u0f0b\u0f42\u0fb2\u0f7c\u0f08\u0f11\u0f0c\u0f0d"); + uhelp("*#HUm: K+DHA GRO`;.,", "none"); + } /** Tests some more tsheg bars, these from Dr. Lacey's critical edition of Mahavyutpatti. diff --git a/source/org/thdl/tib/text/ttt/TPair.java b/source/org/thdl/tib/text/ttt/TPair.java index 065d14a..e598e20 100644 --- a/source/org/thdl/tib/text/ttt/TPair.java +++ b/source/org/thdl/tib/text/ttt/TPair.java @@ -167,4 +167,19 @@ class TPair { if (null == rightWylie) rightWylie = ""; return leftWylie + rightWylie; } + + /** Appends legal Unicode corresponding to this (possible + * subscribed) pair to sb. DLC FIXME: which normalization form, + * if any? 
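// For example (a sketch mirroring uhelp("G+DHA", "\u0f42\u0fa2") in the
// PackageTest changes above): the first pair of G+DHA is emitted in full form
// and the second in subjoined form, so end to end:
//
//     StringBuffer errs = new StringBuffer();
//     String gdha = ACIPConverter.convertToUnicode("G+DHA", errs);
//     // gdha is "\u0f42\u0fa2" (GA followed by subjoined DHA)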
*/ + void getUnicode(StringBuffer sb, boolean subscribed) { + if (null != getLeft()) { + String x = ACIPRules.getUnicodeFor(getLeft(), subscribed); + if (null != x) sb.append(x); + } + if (null != getRight() + && !("-".equals(getRight()) || "A".equals(getRight()))) { + String x = ACIPRules.getUnicodeFor(getRight(), subscribed); + if (null != x) sb.append(x); + } + } } diff --git a/source/org/thdl/tib/text/ttt/TPairList.java b/source/org/thdl/tib/text/ttt/TPairList.java index 5169142..1d97639 100644 --- a/source/org/thdl/tib/text/ttt/TPairList.java +++ b/source/org/thdl/tib/text/ttt/TPairList.java @@ -603,5 +603,16 @@ class TPairList { } } } + + /** Appends legal Unicode corresponding to this stack to sb. DLC + * FIXME: which normalization form, if any? */ + void getUnicode(StringBuffer sb) { + boolean subscribed = false; + for (int i = 0; i < size(); i++) { + get(i).getUnicode(sb, subscribed); + subscribed = true; + } + } + } // DLC FIXME: handle 'o' and 'x', e.g. KAo and NYAx. diff --git a/source/org/thdl/tib/text/ttt/TStackList.java b/source/org/thdl/tib/text/ttt/TStackList.java index 7111ba8..1b01308 100644 --- a/source/org/thdl/tib/text/ttt/TStackList.java +++ b/source/org/thdl/tib/text/ttt/TStackList.java @@ -205,6 +205,15 @@ class TStackList { throw new IllegalArgumentException("opl (" + opl + ") is bad for this stack list (" + toString() + ")"); return false; } + + /** Returns legal Unicode corresponding to this tsheg bar. DLC FIXME: which normalization form, if any? */ + String getUnicode() { + StringBuffer u = new StringBuffer(size()); + for (int i = 0; i < size(); i++) { + get(i).getUnicode(u); + } + return u.toString(); + } } class BoolPair {
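// End-to-end sketch (assumed to live in package org.thdl.tib.text.ttt, since
// TPairList and friends are package-private) of the tsheg-bar path this patch
// wires together: chunk the ACIP, parse it, take the best parse, and ask the
// winning TStackList for Unicode. This mirrors the TIBETAN_NON_PUNCTUATION
// branch of ACIPConverter.convertToUnicode(ArrayList, OutputStream, StringBuffer).
package org.thdl.tib.text.ttt;

class TshegBarSketch {
    /** Returns Unicode for one ACIP tsheg bar, or null if it cannot be parsed. */
    static String tshegBarToUnicode(String acipSyllable) {
        TPairList pl = TPairListFactory.breakACIPIntoChunks(acipSyllable);
        if (pl.getACIPError() != null) return null; // malformed syllable
        TParseTree pt = pl.getParseTree();
        if (pt == null) return null;                // "essentially nothing"
        TStackList sl = pt.getBestParse();
        if (sl == null) return null;                // no legal parse
        return sl.getUnicode();                     // the method added above
    }

    public static void main(String[] args) {
        System.out.println(tshegBarToUnicode("G+DHA")); // expected: "\u0f42\u0fa2"
    }
}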