I'm now stricter about accepting alphabetic characters. F, Q, X, a, b, c, d, e, ... do not belong in ACIP, so the scanner rejects them. This should make it even easier to distinguish automatically between Tibetan and English texts.
2003-08-17 02:38:58 +00:00 · 2003-08-17 02:38:58 +00:00 · 245aac4911
commit 245aac4911
parent 39451d8879
2 changed files with 10 additions and 4 deletions
--- a/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
+++ b/source/org/thdl/tib/text/ttt/ACIPTshegBarScanner.java
@@ -845,6 +845,7 @@ public class ACIPTshegBarScanner {
            // combining punctuation, vowels:
            || ch == '%'
            || ch == 'o'
+            || ch == 'm'
            || ch == 'x'
            || ch == ':'
            || ch == '^'
@@ -852,8 +853,13 @@ public class ACIPTshegBarScanner {

            || ch == '-'
            || ch == '+'
-            
-            || (ch >= 'A' && ch <= 'Z')
-            || (ch >= 'a' && ch <= 'z');
+            || ((ch >= 'A' && ch <= 'Z') && ch != 'X' && ch != 'Q' && ch != 'F')
+            || ch == 'i'
+            || ch == 't'
+            || ch == 'h'
+            || ch == 'd'
+            || ch == 'n'
+            || ch == 's'
+            || ch == 'h';
    }
 }