A reverter that converts Unicode to computer-friendly (but not, yet,

human-friendly) EWTS is here in alpha mode.  It probably doesn't deal
well with non-Tibetan.
This commit is contained in:
dchandler 2005-08-01 05:54:20 +00:00
parent 00afd75362
commit 5788416629
13 changed files with 496 additions and 47 deletions

View file

@ -18,6 +18,16 @@ Contributor(s): ______________________________________.
package org.thdl.tib.text.reverter;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.thdl.tib.text.tshegbar.UnicodeUtils;
/** Static methods for converting Unicode to EWTS and
* (TODO(dchandler): ACIP).
* @author David Chandler
@ -28,11 +38,110 @@ public class Converter {
throw new Error("There's no point in instantiating this class.");
}
/** Converts Tibetan Unicode to EWTS transliteration. If errors
* is non-null, error messages are appended to it. (Errors are
* always inline.) */
public static String convertToEwts(String unicode,
StringBuffer errors /* DLC: use it */) {
throw new Error("DLC not yet");
/** Finds combining character sequences. */
private static BreakIterator breaker
= BreakIterator.getCharacterInstance(new Locale("bo"));
private static final boolean debug = false;
// TODO(dchandler): use this to create LegalTshegBar objects, it's
// unused right now.
private static Pattern mightBeLegalTshegBarRegex = Pattern.compile(
"^"
+ "([\u0f42\u0f51\u0f56\u0f58\u0f60])?"
// root stack: consonant w/ optional wowels:
+ "(" + GC.consonantStackRegexString + ")"
+ "(([\u0f42\u0f51\u0f56\u0f58\u0f60\u0f44\u0f53\u0f62\u0f63\u0f66][\u0f51\u0f66]?)"
+ "|(\u0f60[\u0f72\u0f74\u0f7c\u0f44\u0f58])+)?"
+ "$");
/** Splits nfthdl into grapheme clusters. Let's define a grapheme
* cluster as something an end user would say cannot be
* decomposed into two separate pieces sensibly. For the most
* part this is just figuring out the <em>combining character
* sequences</em> as defined by Unicode, but (U+0F04 U+0F05*) is
* an example of a grapheme cluster that is not a combining
* character sequence (TODO(dchandler): (0f04 0f05*), is it
* really worth it? We don't handle it right now, might be good
* for Unicode->ACIP anyway.)
* @param nfthdl Unicode in NFTHDL decomposition form
* @return List of GC objects */
private static List/*<GC>*/ SplitGC(String nfthdl) {
if (debug) {
System.out.println("debug: "
+ UnicodeUtils.unicodeStringToPrettyString(nfthdl));
}
ArrayList al = new ArrayList();
breaker.setText(nfthdl);
int start = breaker.first();
boolean just_saw_0f7f = false;
for (int end = breaker.next();
end != BreakIterator.DONE;
start = end, end = breaker.next()) {
if ((just_saw_0f7f
&& (Character.getType(nfthdl.charAt(start))
== Character.NON_SPACING_MARK))
|| (end > start && '\u0f7f' == nfthdl.charAt(start)
&& !al.isEmpty())) {
// U+0F7F is a COMBINING_SPACING_MARK, not a
// NON_SPACING_MARK, but we want to treat it like a
// NON_SPACING_MARK.
GC gc = new GC(((GC)al.get(al.size() - 1)).getNfthdl()
+ nfthdl.substring(start,end));
if (debug) {
System.out.println("debug: setting last el, "
+ al.get(al.size() - 1) + " to " + gc);
}
al.set(al.size() - 1, gc);
} else {
al.add(new GC(nfthdl.substring(start,end)));
}
just_saw_0f7f
= (end > start && '\u0f7f' == nfthdl.charAt(end - 1));
}
return al;
}
/** Converts Tibetan Unicode to computer-friendly EWTS
* transliteration. Computer-friendly is not human-friendly but
* hopefully even poorly written EWTS->Tibetan converters could
* handle the output. If errors is non-null, error messages are
* appended to it. (Errors are always inline.) */
public static String convertToEwtsForComputers(String unicode,
StringBuffer errors) {
// First, normalize as much as we can to reduce the number of
// cases we must handle.
String decomposed
= UnicodeUtils.toMostlyDecomposedUnicode(unicode,
UnicodeUtils.NORM_NFTHDL);
// TODO(dchandler): optionally warn if we see
// "\u0f40\u0f74\u0f71" which is in the wrong order.
List gcs = SplitGC(decomposed);
StringBuffer sb = new StringBuffer();
for (Iterator it = gcs.iterator(); it.hasNext(); ) {
GC gc = (GC)it.next();
StringBuffer ewts = gc.getEwtsForComputers();
if (null == ewts) {
// TODO(dchandler): use ErrorsAndWarnings?
ewts = new StringBuffer("[#ERROR 301: The Unicode '"
+ gc.getNfthdl()
+ "' (has no EWTS transliteration]");
if (null != errors) {
errors.append(ewts);
errors.append('\n');
}
}
sb.append(ewts);
}
return sb.toString();
}
}
// TODO(dchandler): give a mode where an error is given if non-Tibetan
// or at least non-EWTS (think U+534D, e.g.) is found