From 835e74c0cd46a1765d4c9e0f92e3ca7cf02cae3e Mon Sep 17 00:00:00 2001 From: amontano Date: Fri, 20 Feb 2009 23:11:17 +0000 Subject: [PATCH] Changed converters from unicode non-breaking tsheg to unicode non-breaking wylie space. --- .../BasicTibetanTranscriptionConverter.java | 19 ++------ source/org/thdl/tib/scanner/Manipulate.java | 45 +++++++++++++++++++ .../org/thdl/tib/text/TibetanMachineWeb.java | 7 +-- source/org/thdl/tib/text/tibwn.ini | 2 +- .../tshegbar/UnicodeCodepointToThdlWylie.java | 2 +- .../tib/text/ttt/EWTSTshegBarScanner.java | 2 +- source/org/thdl/util/Trie.java | 2 +- 7 files changed, 56 insertions(+), 23 deletions(-) diff --git a/source/org/thdl/tib/scanner/BasicTibetanTranscriptionConverter.java b/source/org/thdl/tib/scanner/BasicTibetanTranscriptionConverter.java index bc56b2f..8d018b9 100644 --- a/source/org/thdl/tib/scanner/BasicTibetanTranscriptionConverter.java +++ b/source/org/thdl/tib/scanner/BasicTibetanTranscriptionConverter.java @@ -46,7 +46,6 @@ public class BasicTibetanTranscriptionConverter implements FontConverterConstant private static final int WYLIE_TO_ACIP=2; private static final int UNICODE_TO_WYLIE=3; private static final int WYLIE_TO_UNICODE=4; - private static final int TIBETAN_UNICODE_RANGE[] = {3840, 4095}; /** Converts from the Acip transliteration scheme to EWTS.*/ public static String acipToWylie(String acip) @@ -253,19 +252,7 @@ public class BasicTibetanTranscriptionConverter implements FontConverterConstant nuevaPalabra = Manipulate.fixWazur(nuevaPalabra); return nuevaPalabra;*/ } - - private static int getTibetanUnicodeStart(String unicode, int pos) - { - for(; pos < unicode.length(); pos++ ) if(unicode.codePointAt(pos)>=TIBETAN_UNICODE_RANGE[0] && unicode.codePointAt(pos)<=TIBETAN_UNICODE_RANGE[1]) return pos; - return -1; - } - - private static int getTibetanUnicodeEnd(String unicode, int pos) - { - for(; pos < unicode.length(); pos++ ) if(unicode.codePointAt(pos)<TIBETAN_UNICODE_RANGE[0] || unicode.codePointAt(pos)>TIBETAN_UNICODE_RANGE[1]) return pos; - return 
pos; - } - + /** Converts Tibetan Unicode to EWTS. */ public static String unicodeToWylie(String unicode) { @@ -274,9 +261,9 @@ public class BasicTibetanTranscriptionConverter implements FontConverterConstant TibetanDocument tibDoc; StringBuffer errors; int posStart=0, posEnd; - while((posStart = getTibetanUnicodeStart(unicode, posStart))>=0) + while((posStart = Manipulate.getTibetanUnicodeStart(unicode, posStart))>=0) { - posEnd = getTibetanUnicodeEnd(unicode, posStart+1); + posEnd = Manipulate.getTibetanUnicodeEnd(unicode, posStart+1); startString = unicode.substring(0, posStart); tibetanString = unicode.substring(posStart, posEnd); endString = unicode.substring(posEnd); diff --git a/source/org/thdl/tib/scanner/Manipulate.java b/source/org/thdl/tib/scanner/Manipulate.java index f6450fb..ec15fa3 100644 --- a/source/org/thdl/tib/scanner/Manipulate.java +++ b/source/org/thdl/tib/scanner/Manipulate.java @@ -28,6 +28,7 @@ public class Manipulate private static String bracketMarks = "<>(){}[]"; private static String endOfSyllableMarks = " _\t"; private static String allStopMarkers = endOfSyllableMarks + endOfParagraphMarks + bracketMarks; + private static final int TIBETAN_UNICODE_RANGE[] = {3840, 4095}; /* public static String[] parseFields (String s, char delimiter) { @@ -204,6 +205,18 @@ public class Manipulate return ch>=0xF00 && ch<=0xFFF; } + public static boolean isTibetanUnicodeLetter(char ch) + { + + return ch>=0xF40 && ch<=0xFBC || ch>=0xF00 && ch<=0xF03; + } + + public static boolean isTibetanUnicodeDigit(char ch) + { + + return ch>=0xF20 && ch<=0xF33; + } + public static boolean guessIfUnicode(String line) { char ch; @@ -415,4 +428,36 @@ public class Manipulate } return ncr.toString(); } + + public static String unescape(String s) { + int i=0,len=s.length(); + char c; + StringBuffer sb = new StringBuffer(len); + while (i=TIBETAN_UNICODE_RANGE[0] && unicode.codePointAt(pos)<=TIBETAN_UNICODE_RANGE[1]) return pos; + return -1; + } + + public static int 
getTibetanUnicodeEnd(String unicode, int pos) + { + for(; pos < unicode.length(); pos++ ) if(unicode.codePointAt(pos)<TIBETAN_UNICODE_RANGE[0] || unicode.codePointAt(pos)>TIBETAN_UNICODE_RANGE[1]) return pos; + return pos; + } + } diff --git a/source/org/thdl/tib/text/TibetanMachineWeb.java b/source/org/thdl/tib/text/TibetanMachineWeb.java index 6a1fc33..0038ed4 100644 --- a/source/org/thdl/tib/text/TibetanMachineWeb.java +++ b/source/org/thdl/tib/text/TibetanMachineWeb.java @@ -40,6 +40,7 @@ import org.thdl.util.ThdlDebug; import org.thdl.util.ThdlOptions; import org.thdl.util.Trie; import org.thdl.tib.scanner.BasicTibetanTranscriptionConverter; +import org.thdl.tib.scanner.Manipulate; /** * Interfaces between Extended Wylie and the TibetanMachineWeb fonts. @@ -221,7 +222,7 @@ public class TibetanMachineWeb implements THDLWylieConstants { /** comma-delimited list of supported punctuation and miscellaneous characters: */ private static final String others - = "_, ,/,|,!,:,;,@,#,$,%,(,),H,M,&,@#,?,=,{,},*,~X,X"; // FIXME: not yet supporting all these... + = "_, ,/,|,!,:,;,@,#,$,%,(,),H,M,&,@#,?,=,{,},\u00A0,~X,X"; // FIXME: not yet supporting all these... /** comma-delimited list of supported vowels: */ private static final String vowels @@ -760,7 +761,7 @@ public class TibetanMachineWeb implements THDLWylieConstants { + DELIMITER + " which means that no Wylie is assigned. 
That isn't supported."); if (hashOn) { - tibHash.put(wylie, duffCodes); + tibHash.put(Manipulate.unescape(wylie), duffCodes); } if (isTibetan) { // Delete the dashes: @@ -783,7 +784,7 @@ public class TibetanMachineWeb implements THDLWylieConstants { + " has a line with wylie " + wylie + " but no TMW; that's not allowed"); int font = duffCodes[TMW].getFontNum(); int code = duffCodes[TMW].getCharNum()-32; - toHashKey[font][code] = wylie; + toHashKey[font][code] = Manipulate.unescape(wylie); } } } diff --git a/source/org/thdl/tib/text/tibwn.ini b/source/org/thdl/tib/text/tibwn.ini index 9f1f036..79e6ba9 100644 --- a/source/org/thdl/tib/text/tibwn.ini +++ b/source/org/thdl/tib/text/tibwn.ini @@ -113,7 +113,7 @@ __TILDE__M`~242,1~~8,95~~~~~~~0F82 // dzud.rtags.me.long.can: \u0F13~94,5~~9,92~~~~~~~0F13 // hard tsheg: -*~205,1~~1,108~~~~~~~0F0C +\u00A0~205,1~~1,108~~~~~~~0F0C diff --git a/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java index c998cd8..0ee2a31 100644 --- a/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java +++ b/source/org/thdl/tib/text/tshegbar/UnicodeCodepointToThdlWylie.java @@ -85,7 +85,7 @@ public class UnicodeCodepointToThdlWylie { case '\u0F09': return "\\u0F09"; case '\u0F0A': return "\\u0F0A"; case '\u0F0B': return " "; - case '\u0F0C': return "*"; // DLC NOW: Jskad does not support this! + case '\u0F0C': return "\\u00A0"; // AMP: Non-break space. Does Jskad support this? 
case '\u0F0D': return "/"; case '\u0F0E': return "//"; // DLC FIXME: this is kind of a hack-- the Unicode standard says the spacing for this construct is different than the spacing for "\u0F0D\u0F0D" case '\u0F0F': return ";"; diff --git a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java index 78a336b..11f28bf 100644 --- a/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java +++ b/source/org/thdl/tib/text/ttt/EWTSTshegBarScanner.java @@ -115,7 +115,7 @@ class EWTSTshegBarScanner extends TTshegBarScanner { || (sb.charAt(i) >= '\u0fcf' && sb.charAt(i) <= '\u0fd1') || (THDLWylieConstants.SAUVASTIKA == sb.charAt(i)) || (THDLWylieConstants.SWASTIKA == sb.charAt(i)) - || (" /;|!:=_@#$%<>(){}*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b".indexOf(sb.charAt(i)) + || (" /;|!:=_@#$%<>(){}*&\r\n\t\u0f36\u0f38\u0f89\u0f8a\u0f8b\u00a0".indexOf(sb.charAt(i)) >= 0)) { al.add(new TString("EWTS", sb.substring(i, i+1), TString.TIBETAN_PUNCTUATION)); diff --git a/source/org/thdl/util/Trie.java b/source/org/thdl/util/Trie.java index 64f02bc..e11e857 100644 --- a/source/org/thdl/util/Trie.java +++ b/source/org/thdl/util/Trie.java @@ -90,7 +90,7 @@ public class Trie { /** Size of the m_nextChar array. */ - public static final int ALPHA_SIZE = 128; + public static final int ALPHA_SIZE = 161; /** The root node of the tree. */ Node m_Root;