From 7379440520324b95a85e49f3b4c3f558e3277f00 Mon Sep 17 00:00:00 2001 From: amontano Date: Tue, 26 Apr 2005 05:28:02 +0000 Subject: [PATCH] Updated "deleteQuotes" method to get rid of the double quotations properly. --- src/java/org/thdl/tib/scanner/Manipulate.java | 701 +++++++++++------- 1 file changed, 430 insertions(+), 271 deletions(-) diff --git a/src/java/org/thdl/tib/scanner/Manipulate.java b/src/java/org/thdl/tib/scanner/Manipulate.java index 300ecdf..a00fe21 100644 --- a/src/java/org/thdl/tib/scanner/Manipulate.java +++ b/src/java/org/thdl/tib/scanner/Manipulate.java @@ -1,103 +1,233 @@ /* - The contents of this file are subject to the AMP Open Community License - Version 1.0 (the "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License on the AMP web site - (http://www.tibet.iteso.mx/Guatemala/). +The contents of this file are subject to the AMP Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the AMP web site +(http://www.tibet.iteso.mx/Guatemala/). - Software distributed under the License is distributed on an "AS IS" basis, - WITHOUT WARRANTY OF ANY KIND, either express or implied. See the - License for the specific terms governing rights and limitations under the - License. +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. - The Initial Developer of this software is Andres Montano Pellegrini. Portions - created by Andres Montano Pellegrini are Copyright 2001 Andres Montano - Pellegrini. All Rights Reserved. +The Initial Developer of this software is Andres Montano Pellegrini. Portions +created by Andres Montano Pellegrini are Copyright 2001 Andres Montano +Pellegrini. All Rights Reserved. - Contributor(s): ______________________________________. - */ +Contributor(s): ______________________________________. +*/ package org.thdl.tib.scanner; -/** - * Miscelaneous static methods for the manipulation of Tibetan text. - * - * @author Andrés Montano Pellegrini - */ +/** Miscelaneous static methods for the manipulation of Tibetan text. + + @author Andrés Montano Pellegrini +*/ -public class Manipulate { +public class Manipulate +{ - /* - * public static String[] parseFields (String s, char delimiter) { int pos; - * String field; SimplifiedLinkedList ll = new SimplifiedLinkedList(); - * - * while ((pos = s.indexOf(delimiter))>=0) { field = s.substring(0, - * pos).trim(); ll.addLast(field); s = s.substring(pos+1); } - * - * ll.addLast(s.trim()); return ll.toStringArray(); } - */ + private static String endOfParagraphMarks = "/;|!:^@#$%="; + private static String bracketMarks = "<>(){}[]"; + private static String endOfSyllableMarks = " _\t"; + private static String allStopMarkers = endOfSyllableMarks + endOfParagraphMarks + bracketMarks; - public static String replace(String linea, String origSub, String newSub) { + /* public static String[] parseFields (String s, char delimiter) + { + int pos; + String field; + SimplifiedLinkedList ll = new SimplifiedLinkedList(); + + while ((pos = s.indexOf(delimiter))>=0) + { + field = s.substring(0, pos).trim(); + ll.addLast(field); + s = s.substring(pos+1); + } + + ll.addLast(s.trim()); + return ll.toStringArray(); + }*/ + + public static int indexOfAnyChar(String str, String chars) + { + int i; + for (i=0; i=0) + return i; + } + + return -1; + } + + public static int indexOfExtendedEndOfSyllableMark(String word) + { + return indexOfAnyChar(word, allStopMarkers); + } + + public static int indexOfBracketMarks(String word) + { + return indexOfAnyChar(word, bracketMarks); + } + + public static boolean isPunctuationMark(int ch) + { + return endOfParagraphMarks.indexOf(ch)>=0 || bracketMarks.indexOf(ch)>=0; + } + + public static boolean isEndOfParagraphMark(int ch) + { + return endOfParagraphMarks.indexOf(ch)>=0; + } + + public static boolean isEndOfSyllableMark(int ch) + { + return endOfSyllableMarks.indexOf(ch)>=0; + } + + public static boolean isMeaningful(String s) + { + for (int i=0; i0) { i--; break; } default: - */ + /*ciclo: + while(true) // para manejar excepciones; que honda! + { + switch(caract[i]) + { + case 'A': + if (i>0) + { + i--; + break; + } + default:*/ if (Character.isLowerCase(caract[i])) caract[i] = Character.toUpperCase(caract[i]); else if (Character.isUpperCase(caract[i])) caract[i] = Character.toLowerCase(caract[i]); - /* - * break ciclo; } } - */ + /* break ciclo; + } + }*/ } nuevaPalabra = new String(caract); // nuevaPalabra = palabra.toUpperCase(); - + // ahora hacer los cambios de Michael Roach - + nuevaPalabra = replace(nuevaPalabra, "TSH", "TQQ"); nuevaPalabra = replace(nuevaPalabra, "TS", "TZ"); nuevaPalabra = replace(nuevaPalabra, "TQQ", "TS"); @@ -111,32 +241,53 @@ public class Manipulate { nuevaPalabra = fixWazur(nuevaPalabra); return nuevaPalabra; } - - public static String acipToWylie(String linea) { + + /** If more than half of the first letters among the first are 10 characters + are uppercase assume its acip */ + public static boolean guessIfAcip(String line) + { + char ch; + int letters=0, upperCase=0, i, n; + n = line.length(); + if (n>10) n = 10; + for (i=0; i tsh, tz -> ts, v -> w, - * TH -> Th, kSH, kaSH -> k+Sh, SH -> Sh, : -> H, dh -> d+h, gh -> g+h, - * bh -> b+h, dzh -> dz+h, aa -> a, a'a -> A, ai->i, aee ->ai, au->u, - * aoo->au, ae->e, ao->o, ee->ai, oo->au, 'I->-I I->-i, a'i->I, a'u->U, - * a'e->E, a'o->O, a'i->I, a'u->U, a'e->E, a'o->O, ,->/, # -> @##, * -> - * @#, \ -> ?, ` -> !, /-/ -> (-), ga-y -> g.y, g-y -> g.y, na-y -> n+y - */ - + + /* ahora hacer los cambios de Michael Roach ts -> tsh, tz -> ts, v -> w, + TH -> Th, kSH, kaSH -> k+Sh, SH -> Sh, : -> H, dh -> d+h, gh -> g+h, bh -> b+h, dzh -> dz+h, + aa -> a, a'a -> A, ai->i, aee ->ai, au->u, aoo->au, ae->e, + ao->o, ee->ai, oo->au, 'I->-I I->-i, a'i->I, a'u->U, a'e->E, a'o->O, + a'i->I, a'u->U, a'e->E, a'o->O, ,->/, # -> @##, * -> @#, \ -> ?, ` -> !, + /-/ -> (-), ga-y -> g.y, g-y -> g.y, na-y -> n+y */ + nuevaLinea = replace(nuevaLinea, "ts", "tq"); nuevaLinea = replace(nuevaLinea, "tz", "ts"); nuevaLinea = replace(nuevaLinea, "tq", "tsh"); @@ -146,6 +297,7 @@ public class Manipulate { nuevaLinea = replace(nuevaLinea, "kaSH", "k+Sh"); nuevaLinea = replace(nuevaLinea, "SH", "Sh"); nuevaLinea = replace(nuevaLinea, ":", "H"); + nuevaLinea = replace(nuevaLinea, "NH", "NaH"); nuevaLinea = replace(nuevaLinea, "dh", "d+h"); nuevaLinea = replace(nuevaLinea, "gh", "g+h"); nuevaLinea = replace(nuevaLinea, "bh", "b+h"); @@ -169,206 +321,213 @@ public class Manipulate { nuevaLinea = replace(nuevaLinea, "na-y", "n+y"); len = nuevaLinea.length(); - for (i = 0; i < len; i++) { - ch = nuevaLinea.charAt(i); - switch (ch) { - case '#': - nuevaLinea = nuevaLinea.substring(0, i) + "@##" - + nuevaLinea.substring(i + 1); - i += 3; - len += 2; - break; - case '*': - nuevaLinea = nuevaLinea.substring(0, i) + "@#" - + nuevaLinea.substring(i + 1); - i += 2; - len++; - break; - case '\'': - if (i > 0 && i < len - 1) { - chP = nuevaLinea.charAt(i - 1); - chN = nuevaLinea.charAt(i + 1); - if (Character.isLetter(chP) && !isVowel(chP) - && isVowel(chN)) { - nuevaLinea = nuevaLinea.substring(0, i) - + Character.toUpperCase(chN) - + nuevaLinea.substring(i + 2); - len--; - } - } - break; - case 'a': - if ((i < len - 3 && nuevaLinea.charAt(i + 1) == '\'' && isVowel(nuevaLinea - .charAt(i + 2))) - && (i == 0 || !Character.isLetter(nuevaLinea - .charAt(i - 1)))) { - nuevaLinea = nuevaLinea.substring(0, i) - + Character.toUpperCase(nuevaLinea.charAt(i + 2)) - + nuevaLinea.substring(i + 3); - len -= 2; - } - } + for (i=0; i0 && i1 && !Character.isLetter(nuevaLinea.charAt(i-2)) || chN == 'a' && (i+2==len || !Character.isLetter(nuevaLinea.charAt(i+2))))) + { + nuevaLinea = nuevaLinea.substring(0,i-1) + Character.toUpperCase(chN) + nuevaLinea.substring(i+2); + len-=2; + } + } + } + } } - + open = false; - for (i = 0; i < len; i++) { - ch = nuevaLinea.charAt(i); - if (ch == '/') { - if (open) { - nuevaLinea = nuevaLinea.substring(0, i) + ")" - + nuevaLinea.substring(i + 1); - open = false; - } + for (i=0; i 0) { - switch (ch) { - case 'r': - case 'l': - case 'w': - i--; - break; - case 'y': - ch2 = sil.charAt(i - 1); - switch (ch2) { - case '.': - return "y"; - case 'n': - return "ny"; - default: - i--; - } - } - } - if (i == 0) - return sil.substring(i, i + 1); - ch = sil.charAt(i); - ch2 = sil.charAt(i - 1); - - switch (ch) { - case 'h': - switch (ch2) { - case 'k': - case 'c': - case 't': - case 'p': - case 'z': - return sil.substring(i - 1, i + 1); - case 's': - if (i - 2 >= 0 && sil.charAt(i - 2) == 't') - return "tsh"; - else - return "sh"; - default: - return "h"; - } - case 's': - if (ch2 == 't') - return "ts"; - else - return "s"; - case 'g': - if (ch2 == 'n') - return "ng"; - else - return "g"; - case 'z': - if (ch2 == 'd') - return "dz"; - else - return "z"; - } - return sil.substring(i, i + 1); + + /** Returns the base letter of a syllable. Does not include the vowel! + Ignoring cases for now. */ + public static String getBaseLetter (String sil) + { + sil = sil.toLowerCase(); + + int i=0; + char ch, ch2; + + while (!isVowel(sil.charAt(i))) i++; + if (i==0) return ""; + + i--; + if (i==-1) return ""; + + if (sil.charAt(i)=='-') i--; + + ch = sil.charAt(i); + + // check to see if it is a subscript (y, r, l, w) + if (i>0) + { + switch (ch) + { + case 'r': case 'l': case 'w': i--; + break; + case 'y': + ch2 = sil.charAt(i-1); + switch (ch2) + { + case '.': return "y"; + case 'n': return "ny"; + default: i--; + } + } + } + if (i==0) return sil.substring(i,i+1); + ch = sil.charAt(i); + ch2 = sil.charAt(i-1); + + switch(ch) + { + case 'h': + switch (ch2) + { + case 'k': case 'c': case 't': case 'p': case 'z': + return sil.substring(i-1,i+1); + case 's': + if (i-2>=0 && sil.charAt(i-2)=='t') return "tsh"; + else return "sh"; + default: return "h"; + } + case 's': + if (ch2=='t') return "ts"; + else return "s"; + case 'g': + if (ch2=='n') return "ng"; + else return "g"; + case 'z': + if (ch2=='d') return "dz"; + else return "z"; + } + return sil.substring(i,i+1); } - - public static String deleteQuotes(String s) { - int length = s.length(); - if (length > 2) { - if ((s.charAt(0) == '\"') && (s.charAt(length - 1) == '\"')) - return s.substring(1, length - 1); + + public static String deleteQuotes(String s) + { + int length = s.length(), pos; + if (length>2) + { + if ((s.charAt(0)=='\"') && (s.charAt(length-1)=='\"')) + s = s.substring(1,length-1); + + do + { + pos = s.indexOf("\"\""); + if (pos<0) break; + s = Manipulate.deleteSubstring(s, pos, pos+1); + } while (true); + } + + return s; + } + + + + /** Syntax: java Manipulate [word-file] < source-dic-entries > dest-dic-entries + + Takes the output of ConsoleScannerFilter + (in RY format), converts the Wylie to Acip + and displays the result in csv format. + arch-palabras es usado solo cuando deseamos las palabras cambiadas + a otro archivo. + + + public static void main (String[] args) throws Exception + { + String linea, palabra, definicion, nuevaPalabra; + int marker; + PrintWriter psPalabras = null; + + BufferedReader keyb = new BufferedReader(new InputStreamReader(System.in)); + + if (args.length==1) + psPalabras = new PrintWriter(new FileOutputStream(args[0])); + + while ((linea=keyb.readLine())!=null) + { + if (linea.trim().equals("")) continue; + marker = linea.indexOf('-'); + if (marker<0) // linea tiene error + { + palabra = linea; + definicion = ""; + } + else + { + palabra = linea.substring(0, marker).trim(); + definicion = linea.substring(marker+1).trim(); + } + + nuevaPalabra = wylieToAcip(palabra); + + if (psPalabras!=null) + psPalabras.println(nuevaPalabra); + else System.out.print(nuevaPalabra + '\t'); + if (definicion.equals("")) + System.out.println(palabra); + else + System.out.println(palabra + '\t' + definicion); } - return s; - } - - /** - * Syntax: java Manipulate [word-file] < source-dic-entries > - * dest-dic-entries - * - * Takes the output of ConsoleScannerFilter (in RY format), converts the - * Wylie to Acip and displays the result in csv format. arch-palabras es - * usado solo cuando deseamos las palabras cambiadas a otro archivo. - * - * - * public static void main (String[] args) throws Exception { String linea, - * palabra, definicion, nuevaPalabra; int marker; PrintWriter psPalabras = - * null; - * - * BufferedReader keyb = new BufferedReader(new - * InputStreamReader(System.in)); - * - * if (args.length==1) psPalabras = new PrintWriter(new - * FileOutputStream(args[0])); - * - * while ((linea=keyb.readLine())!=null) { if (linea.trim().equals("")) - * continue; marker = linea.indexOf('-'); if (marker <0) // linea tiene - * error { palabra = linea; definicion = ""; } else { palabra = - * linea.substring(0, marker).trim(); definicion = - * linea.substring(marker+1).trim(); } - * - * nuevaPalabra = wylieToAcip(palabra); - * - * if (psPalabras!=null) psPalabras.println(nuevaPalabra); else - * System.out.print(nuevaPalabra + '\t'); if (definicion.equals("")) - * System.out.println(palabra); else System.out.println(palabra + '\t' + - * definicion); } if (psPalabras!=null) psPalabras.flush(); } - */ -} \ No newline at end of file + if (psPalabras!=null) psPalabras.flush(); + }*/ +}