Updated "deleteQuotes" method to get rid of the double quotations properly.

This commit is contained in:
amontano 2005-04-26 05:28:02 +00:00
parent 3f378c7a66
commit 7379440520

View file

@ -1,97 +1,227 @@
/* /*
The contents of this file are subject to the AMP Open Community License The contents of this file are subject to the AMP Open Community License
Version 1.0 (the "License"); you may not use this file except in compliance Version 1.0 (the "License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License on the AMP web site with the License. You may obtain a copy of the License on the AMP web site
(http://www.tibet.iteso.mx/Guatemala/). (http://www.tibet.iteso.mx/Guatemala/).
Software distributed under the License is distributed on an "AS IS" basis, Software distributed under the License is distributed on an "AS IS" basis,
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
License for the specific terms governing rights and limitations under the License for the specific terms governing rights and limitations under the
License. License.
The Initial Developer of this software is Andres Montano Pellegrini. Portions The Initial Developer of this software is Andres Montano Pellegrini. Portions
created by Andres Montano Pellegrini are Copyright 2001 Andres Montano created by Andres Montano Pellegrini are Copyright 2001 Andres Montano
Pellegrini. All Rights Reserved. Pellegrini. All Rights Reserved.
Contributor(s): ______________________________________. Contributor(s): ______________________________________.
*/ */
package org.thdl.tib.scanner; package org.thdl.tib.scanner;
/** /** Miscellaneous static methods for the manipulation of Tibetan text.
* Miscellaneous static methods for the manipulation of Tibetan text.
*
* @author Andrés Montano Pellegrini
*/
public class Manipulate { @author Andrés Montano Pellegrini
*/
/* public class Manipulate
* public static String[] parseFields (String s, char delimiter) { int pos; {
* String field; SimplifiedLinkedList ll = new SimplifiedLinkedList();
*
* while ((pos = s.indexOf(delimiter))>=0) { field = s.substring(0,
* pos).trim(); ll.addLast(field); s = s.substring(pos+1); }
*
* ll.addLast(s.trim()); return ll.toStringArray(); }
*/
public static String replace(String linea, String origSub, String newSub) { private static String endOfParagraphMarks = "/;|!:^@#$%=";
private static String bracketMarks = "<>(){}[]";
private static String endOfSyllableMarks = " _\t";
private static String allStopMarkers = endOfSyllableMarks + endOfParagraphMarks + bracketMarks;
/* public static String[] parseFields (String s, char delimiter)
{
int pos;
String field;
SimplifiedLinkedList ll = new SimplifiedLinkedList();
while ((pos = s.indexOf(delimiter))>=0)
{
field = s.substring(0, pos).trim();
ll.addLast(field);
s = s.substring(pos+1);
}
ll.addLast(s.trim());
return ll.toStringArray();
}*/
/** Finds the first position in str holding a character that also occurs in chars.

@param str the text that is scanned from left to right
@param chars the set of characters being looked for
@return the index of the first matching character, or -1 if there is none
*/
public static int indexOfAnyChar(String str, String chars)
{
    final int total = str.length();
    int position = 0;
    while (position < total)
    {
        char candidate = str.charAt(position);
        if (chars.indexOf(candidate) != -1)
        {
            return position;
        }
        position++;
    }
    return -1;
}
/** Locates the first character of word that is listed in allStopMarkers.

@param word the text to scan
@return the index of that character, or -1 if word contains none of them
*/
public static int indexOfExtendedEndOfSyllableMark(String word)
{
    final int firstStop = indexOfAnyChar(word, allStopMarkers);
    return firstStop;
}
/** Locates the first character of word that is listed in bracketMarks.

@param word the text to scan
@return the index of that character, or -1 if word contains none of them
*/
public static int indexOfBracketMarks(String word)
{
    final int firstBracket = indexOfAnyChar(word, bracketMarks);
    return firstBracket;
}
/** Tells whether ch is listed in endOfParagraphMarks or in bracketMarks.

@param ch the character (code point) to classify
@return true if ch occurs in either of the two strings
*/
public static boolean isPunctuationMark(int ch)
{
    if (endOfParagraphMarks.indexOf(ch) != -1)
    {
        return true;
    }
    return bracketMarks.indexOf(ch) != -1;
}
/** Tells whether ch is listed in endOfParagraphMarks.

@param ch the character (code point) to classify
@return true if ch occurs in endOfParagraphMarks
*/
public static boolean isEndOfParagraphMark(int ch)
{
    final int where = endOfParagraphMarks.indexOf(ch);
    return where != -1;
}
/** Tells whether ch is listed in endOfSyllableMarks.

@param ch the character (code point) to classify
@return true if ch occurs in endOfSyllableMarks
*/
public static boolean isEndOfSyllableMark(int ch)
{
    final int where = endOfSyllableMarks.indexOf(ch);
    return where != -1;
}
/** Tells whether s contains at least one letter or digit.

@param s the text to inspect
@return true as soon as a letter or digit is found; false for an empty
string or one made only of other characters
*/
public static boolean isMeaningful(String s)
{
    final int total = s.length();
    int position = 0;
    while (position < total)
    {
        char current = s.charAt(position);
        if (Character.isLetterOrDigit(current))
        {
            return true;
        }
        position++;
    }
    return false;
}
public static String replace(String linea, String origSub, String newSub)
{
int pos, lenOrig = origSub.length(); int pos, lenOrig = origSub.length();
while ((pos = linea.indexOf(origSub)) != -1) { while ((pos = linea.indexOf(origSub))!=-1)
linea = linea.substring(0, pos).concat(newSub).concat( {
linea.substring(pos + lenOrig)); linea = linea.substring(0, pos).concat(newSub).concat(linea.substring(pos+lenOrig));
} }
return linea; return linea;
} }
public static boolean isVowel(char ch) { public static String deleteSubstring (String string, int pos, int posEnd)
ch = Character.toLowerCase(ch); {
return ch == 'a' || ch == 'e' || ch == 'i' || ch == 'o' || ch == 'u'; if (pos<0) return string;
if (pos==0)
{
return string.substring(posEnd).trim();
}
else
{
if (posEnd<string.length())
return string.substring(0, pos).concat(string.substring(posEnd)).trim();
else
return string.substring(0, pos).trim();
}
} }
public static String wylieToAcip(String palabra) { public static String replace(String string, int pos, int posEnd, String newSub)
{
if (pos<0) return string;
if (pos==0)
{
return newSub.concat(string.substring(posEnd)).trim();
}
else
{
if (posEnd<string.length())
return string.substring(0, pos).concat(newSub).concat(string.substring(posEnd)).trim();
else
return string.substring(0, pos).concat(newSub).trim();
}
}
/** Looks up the first occurrence of sub in string and hands the resulting
range to deleteSubstring(String, int, int).

@param string the text to search
@param sub the substring to look for
@return whatever deleteSubstring(String, int, int) returns for the range
[start, start + sub.length()); start is -1 when sub does not occur
*/
public static String deleteSubstring (String string, String sub)
{
    final int start = string.indexOf(sub);
    final int end = start + sub.length();
    return deleteSubstring(string, start, end);
}
/** Builds a copy of array that is one element longer, with s placed at index n.

@param array the source array; it is not modified
@param s the string to insert
@param n the index the new string gets in the result (0 to array.length)
@return a new array holding array[0..n-1], then s, then array[n..]
*/
public static String[] addString(String array[], String s, int n)
{
    final String[] grown = new String[array.length + 1];
    // Elements in front of the insertion point keep their index.
    System.arraycopy(array, 0, grown, 0, n);
    grown[n] = s;
    // Everything from n onwards moves one slot to the right.
    System.arraycopy(array, n, grown, n + 1, array.length - n);
    return grown;
}
/** Builds a copy of array that is one element shorter, leaving out index n.

@param array the source array; it is not modified
@param n the index of the element to drop (0 to array.length-1)
@return a new array holding array[0..n-1] followed by array[n+1..]
*/
public static String[] deleteString(String array[], int n)
{
    final String[] shrunk = new String[array.length - 1];
    // Elements in front of the removed one keep their index.
    System.arraycopy(array, 0, shrunk, 0, n);
    // Everything behind it moves one slot to the left.
    System.arraycopy(array, n + 1, shrunk, n, shrunk.length - n);
    return shrunk;
}
public static boolean isVowel (char ch)
{
ch = Character.toLowerCase(ch);
return ch=='a' || ch=='e' || ch=='i' || ch=='o' || ch=='u';
}
public static String wylieToAcip(String palabra)
{
// DLC FIXME: for unknown things, return null. // DLC FIXME: for unknown things, return null.
if (palabra.equals("@##")) if (palabra.equals("@##")) return "#";
return "#"; if (palabra.equals("@#")) return "*";
if (palabra.equals("@#")) if (palabra.equals("!")) return "`";
return "*"; if (palabra.equals("b+h")) return "BH";
if (palabra.equals("!")) if (palabra.equals("d+h")) return "DH";
return "`"; if (palabra.equals("X")) return null;
if (palabra.equals("b+h")) if (palabra.equals("iA")) return null;
return "BH"; if (palabra.equals("ai")) return "EE";
if (palabra.equals("d+h")) if (palabra.equals("au")) return "OO";
return "DH"; if (palabra.equals("$")) return null;
if (palabra.equals("X"))
return null;
if (palabra.equals("iA"))
return null;
if (palabra.equals("ai"))
return "EE";
if (palabra.equals("au"))
return "OO";
if (palabra.equals("$"))
return null;
if (palabra.startsWith("@") || palabra.startsWith("#")) if (palabra.startsWith("@") || palabra.startsWith("#"))
return null; // we can't convert this in isolation! We need context. return null; // we can't convert this in isolation! We need context.
char[] caract; char []caract;
int i, j, len; int i, j, len;
String nuevaPalabra; String nuevaPalabra;
caract = palabra.toCharArray(); caract = palabra.toCharArray();
len = palabra.length(); len = palabra.length();
for (j = 0; j < len; j++) { for (j=0; j<len; j++)
{
i = j; i = j;
/* /*ciclo:
* ciclo: while(true) // para manejar excepciones; que honda! { while(true) // para manejar excepciones; que honda!
* switch(caract[i]) { case 'A': if (i>0) { i--; break; } default: {
*/ switch(caract[i])
{
case 'A':
if (i>0)
{
i--;
break;
}
default:*/
if (Character.isLowerCase(caract[i])) if (Character.isLowerCase(caract[i]))
caract[i] = Character.toUpperCase(caract[i]); caract[i] = Character.toUpperCase(caract[i]);
else if (Character.isUpperCase(caract[i])) else if (Character.isUpperCase(caract[i]))
caract[i] = Character.toLowerCase(caract[i]); caract[i] = Character.toLowerCase(caract[i]);
/* /* break ciclo;
* break ciclo; } } }
*/ }*/
} }
nuevaPalabra = new String(caract); nuevaPalabra = new String(caract);
// nuevaPalabra = palabra.toUpperCase(); // nuevaPalabra = palabra.toUpperCase();
@ -112,7 +242,29 @@ public class Manipulate {
return nuevaPalabra; return nuevaPalabra;
} }
public static String acipToWylie(String linea) { /** If more than half of the letters among the first 10 characters
are uppercase, assume it is ACIP. */
public static boolean guessIfAcip(String line)
{
    // Only the first ten characters (fewer for short lines) are sampled.
    final int sampleSize = Math.min(line.length(), 10);
    int letterCount = 0;
    int upperCount = 0;
    for (int idx = 0; idx < sampleSize; idx++)
    {
        char current = line.charAt(idx);
        if (!Character.isLetter(current))
        {
            continue; // digits, spaces and punctuation do not count
        }
        letterCount++;
        if (Character.isUpperCase(current))
        {
            upperCount++;
        }
    }
    // Without any letter, or without any uppercase letter, the answer is no.
    if (letterCount == 0 || upperCount == 0)
    {
        return false;
    }
    // True exactly when the uppercase letters are more than half of the
    // letters seen; same result as the integer test letters / upperCase < 2.
    return letterCount < 2 * upperCount;
}
public static String acipToWylie(String linea)
{
char caract[], ch, chP, chN; char caract[], ch, chP, chN;
String nuevaLinea; String nuevaLinea;
int i, len; int i, len;
@ -120,7 +272,8 @@ public class Manipulate {
caract = linea.toCharArray(); caract = linea.toCharArray();
len = linea.length(); len = linea.length();
for (i = 0; i < len; i++) { for (i=0; i<len; i++)
{
if (Character.isLowerCase(caract[i])) if (Character.isLowerCase(caract[i]))
caract[i] = Character.toUpperCase(caract[i]); caract[i] = Character.toUpperCase(caract[i]);
else if (Character.isUpperCase(caract[i])) else if (Character.isUpperCase(caract[i]))
@ -128,14 +281,12 @@ public class Manipulate {
} }
nuevaLinea = new String(caract); nuevaLinea = new String(caract);
/* /* ahora hacer los cambios de Michael Roach ts -> tsh, tz -> ts, v -> w,
* ahora hacer los cambios de Michael Roach ts -> tsh, tz -> ts, v -> w, TH -> Th, kSH, kaSH -> k+Sh, SH -> Sh, : -> H, dh -> d+h, gh -> g+h, bh -> b+h, dzh -> dz+h,
* TH -> Th, kSH, kaSH -> k+Sh, SH -> Sh, : -> H, dh -> d+h, gh -> g+h, aa -> a, a'a -> A, ai->i, aee ->ai, au->u, aoo->au, ae->e,
* bh -> b+h, dzh -> dz+h, aa -> a, a'a -> A, ai->i, aee ->ai, au->u, ao->o, ee->ai, oo->au, 'I->-I I->-i, a'i->I, a'u->U, a'e->E, a'o->O,
* aoo->au, ae->e, ao->o, ee->ai, oo->au, 'I->-I I->-i, a'i->I, a'u->U, a'i->I, a'u->U, a'e->E, a'o->O, ,->/, # -> @##, * -> @#, \ -> ?, ` -> !,
* a'e->E, a'o->O, a'i->I, a'u->U, a'e->E, a'o->O, ,->/, # -> @##, * -> /-/ -> (-), ga-y -> g.y, g-y -> g.y, na-y -> n+y */
* @#, \ -> ?, ` -> !, /-/ -> (-), ga-y -> g.y, g-y -> g.y, na-y -> n+y
*/
nuevaLinea = replace(nuevaLinea, "ts", "tq"); nuevaLinea = replace(nuevaLinea, "ts", "tq");
nuevaLinea = replace(nuevaLinea, "tz", "ts"); nuevaLinea = replace(nuevaLinea, "tz", "ts");
@ -146,6 +297,7 @@ public class Manipulate {
nuevaLinea = replace(nuevaLinea, "kaSH", "k+Sh"); nuevaLinea = replace(nuevaLinea, "kaSH", "k+Sh");
nuevaLinea = replace(nuevaLinea, "SH", "Sh"); nuevaLinea = replace(nuevaLinea, "SH", "Sh");
nuevaLinea = replace(nuevaLinea, ":", "H"); nuevaLinea = replace(nuevaLinea, ":", "H");
nuevaLinea = replace(nuevaLinea, "NH", "NaH");
nuevaLinea = replace(nuevaLinea, "dh", "d+h"); nuevaLinea = replace(nuevaLinea, "dh", "d+h");
nuevaLinea = replace(nuevaLinea, "gh", "g+h"); nuevaLinea = replace(nuevaLinea, "gh", "g+h");
nuevaLinea = replace(nuevaLinea, "bh", "b+h"); nuevaLinea = replace(nuevaLinea, "bh", "b+h");
@ -169,60 +321,58 @@ public class Manipulate {
nuevaLinea = replace(nuevaLinea, "na-y", "n+y"); nuevaLinea = replace(nuevaLinea, "na-y", "n+y");
len = nuevaLinea.length(); len = nuevaLinea.length();
for (i = 0; i < len; i++) { for (i=0; i<len; i++)
{
ch = nuevaLinea.charAt(i); ch = nuevaLinea.charAt(i);
switch (ch) { switch(ch)
{
case '#': case '#':
nuevaLinea = nuevaLinea.substring(0, i) + "@##" nuevaLinea = nuevaLinea.substring(0,i) + "@##" + nuevaLinea.substring(i+1);
+ nuevaLinea.substring(i + 1); i+=3;
i += 3; len+=2;
len += 2;
break; break;
case '*': case '*':
nuevaLinea = nuevaLinea.substring(0, i) + "@#" nuevaLinea = nuevaLinea.substring(0,i) + "@#" + nuevaLinea.substring(i+1);
+ nuevaLinea.substring(i + 1); i+=2;
i += 2;
len++; len++;
break; break;
case '\'': case '\'':
if (i > 0 && i < len - 1) { if (i>0 && i<len-1)
chP = nuevaLinea.charAt(i - 1); {
chN = nuevaLinea.charAt(i + 1); chP = nuevaLinea.charAt(i-1);
if (Character.isLetter(chP) && !isVowel(chP) chN = nuevaLinea.charAt(i+1);
&& isVowel(chN)) { if (isVowel(chN))
nuevaLinea = nuevaLinea.substring(0, i) {
+ Character.toUpperCase(chN) if (Character.isLetter(chP) && !isVowel(chP))
+ nuevaLinea.substring(i + 2); {
nuevaLinea = nuevaLinea.substring(0, i) + Character.toUpperCase(chN) + nuevaLinea.substring(i+2);
len--; len--;
} }
else if (chP=='a' && (i==1 || i>1 && !Character.isLetter(nuevaLinea.charAt(i-2)) || chN == 'a' && (i+2==len || !Character.isLetter(nuevaLinea.charAt(i+2)))))
{
nuevaLinea = nuevaLinea.substring(0,i-1) + Character.toUpperCase(chN) + nuevaLinea.substring(i+2);
len-=2;
}
} }
break;
case 'a':
if ((i < len - 3 && nuevaLinea.charAt(i + 1) == '\'' && isVowel(nuevaLinea
.charAt(i + 2)))
&& (i == 0 || !Character.isLetter(nuevaLinea
.charAt(i - 1)))) {
nuevaLinea = nuevaLinea.substring(0, i)
+ Character.toUpperCase(nuevaLinea.charAt(i + 2))
+ nuevaLinea.substring(i + 3);
len -= 2;
} }
} }
} }
open = false; open = false;
for (i = 0; i < len; i++) { for (i=0; i<len; i++)
{
ch = nuevaLinea.charAt(i); ch = nuevaLinea.charAt(i);
if (ch == '/') { if (ch=='/')
if (open) { {
nuevaLinea = nuevaLinea.substring(0, i) + ")" if (open)
+ nuevaLinea.substring(i + 1); {
nuevaLinea = nuevaLinea.substring(0, i) + ")" + nuevaLinea.substring(i+1);
open = false; open = false;
} }
else { else
nuevaLinea = nuevaLinea.substring(0, i) + "(" {
+ nuevaLinea.substring(i + 1); nuevaLinea = nuevaLinea.substring(0, i) + "(" + nuevaLinea.substring(i+1);
open = true; open = true;
} }
} }
@ -232,143 +382,152 @@ public class Manipulate {
return nuevaLinea; return nuevaLinea;
} }
public static String fixWazur(String linea) { public static String fixWazur(String linea)
{
int i; int i;
for (i = 1; i < linea.length(); i++) { for (i=1; i<linea.length(); i++)
if (linea.charAt(i) == 'W') { {
if (Character.isLetter(linea.charAt(i - 1))) if (linea.charAt(i)=='W')
linea = linea.substring(0, i) + 'V' {
+ linea.substring(i + 1); if (Character.isLetter(linea.charAt(i-1)))
linea = linea.substring(0,i) + 'V' + linea.substring(i+1);
} }
} }
return linea; return linea;
} }
/** /** Returns the base letter of a syllable. Does not include the vowel!
* Returns the base letter of a syllable. Does not include the vowel! Ignoring cases for now. */
* Ignoring cases for now. public static String getBaseLetter (String sil)
*/ {
public static String getBaseLetter(String sil) {
sil = sil.toLowerCase(); sil = sil.toLowerCase();
int i = 0; int i=0;
char ch, ch2; char ch, ch2;
while (!isVowel(sil.charAt(i))) while (!isVowel(sil.charAt(i))) i++;
i++; if (i==0) return "";
if (i == 0)
return "";
i--; i--;
if (i == -1) if (i==-1) return "";
return "";
if (sil.charAt(i) == '-') if (sil.charAt(i)=='-') i--;
i--;
ch = sil.charAt(i); ch = sil.charAt(i);
// check to see if it is a subscript (y, r, l, w) // check to see if it is a subscript (y, r, l, w)
if (i > 0) { if (i>0)
switch (ch) { {
case 'r': switch (ch)
case 'l': {
case 'w': case 'r': case 'l': case 'w': i--;
i--;
break; break;
case 'y': case 'y':
ch2 = sil.charAt(i - 1); ch2 = sil.charAt(i-1);
switch (ch2) { switch (ch2)
case '.': {
return "y"; case '.': return "y";
case 'n': case 'n': return "ny";
return "ny"; default: i--;
default:
i--;
} }
} }
} }
if (i == 0) if (i==0) return sil.substring(i,i+1);
return sil.substring(i, i + 1);
ch = sil.charAt(i); ch = sil.charAt(i);
ch2 = sil.charAt(i - 1); ch2 = sil.charAt(i-1);
switch (ch) { switch(ch)
{
case 'h': case 'h':
switch (ch2) { switch (ch2)
case 'k': {
case 'c': case 'k': case 'c': case 't': case 'p': case 'z':
case 't': return sil.substring(i-1,i+1);
case 'p':
case 'z':
return sil.substring(i - 1, i + 1);
case 's': case 's':
if (i - 2 >= 0 && sil.charAt(i - 2) == 't') if (i-2>=0 && sil.charAt(i-2)=='t') return "tsh";
return "tsh"; else return "sh";
else default: return "h";
return "sh";
default:
return "h";
} }
case 's': case 's':
if (ch2 == 't') if (ch2=='t') return "ts";
return "ts"; else return "s";
else
return "s";
case 'g': case 'g':
if (ch2 == 'n') if (ch2=='n') return "ng";
return "ng"; else return "g";
else
return "g";
case 'z': case 'z':
if (ch2 == 'd') if (ch2=='d') return "dz";
return "dz"; else return "z";
else
return "z";
} }
return sil.substring(i, i + 1); return sil.substring(i,i+1);
} }
public static String deleteQuotes(String s) { public static String deleteQuotes(String s)
int length = s.length(); {
if (length > 2) { int length = s.length(), pos;
if ((s.charAt(0) == '\"') && (s.charAt(length - 1) == '\"')) if (length>2)
return s.substring(1, length - 1); {
if ((s.charAt(0)=='\"') && (s.charAt(length-1)=='\"'))
s = s.substring(1,length-1);
do
{
pos = s.indexOf("\"\"");
if (pos<0) break;
s = Manipulate.deleteSubstring(s, pos, pos+1);
} while (true);
} }
return s; return s;
} }
/**
* Syntax: java Manipulate [word-file] < source-dic-entries >
* dest-dic-entries /** Syntax: java Manipulate [word-file] < source-dic-entries > dest-dic-entries
*
* Takes the output of ConsoleScannerFilter (in RY format), converts the Takes the output of ConsoleScannerFilter
* Wylie to Acip and displays the result in csv format. arch-palabras es (in RY format), converts the Wylie to Acip
* usado solo cuando deseamos las palabras cambiadas a otro archivo. and displays the result in csv format.
* arch-palabras es usado solo cuando deseamos las palabras cambiadas
* a otro archivo.
* public static void main (String[] args) throws Exception { String linea,
* palabra, definicion, nuevaPalabra; int marker; PrintWriter psPalabras =
* null; public static void main (String[] args) throws Exception
* {
* BufferedReader keyb = new BufferedReader(new String linea, palabra, definicion, nuevaPalabra;
* InputStreamReader(System.in)); int marker;
* PrintWriter psPalabras = null;
* if (args.length==1) psPalabras = new PrintWriter(new
* FileOutputStream(args[0])); BufferedReader keyb = new BufferedReader(new InputStreamReader(System.in));
*
* while ((linea=keyb.readLine())!=null) { if (linea.trim().equals("")) if (args.length==1)
* continue; marker = linea.indexOf('-'); if (marker <0) // linea tiene psPalabras = new PrintWriter(new FileOutputStream(args[0]));
* error { palabra = linea; definicion = ""; } else { palabra =
* linea.substring(0, marker).trim(); definicion = while ((linea=keyb.readLine())!=null)
* linea.substring(marker+1).trim(); } {
* if (linea.trim().equals("")) continue;
* nuevaPalabra = wylieToAcip(palabra); marker = linea.indexOf('-');
* if (marker<0) // linea tiene error
* if (psPalabras!=null) psPalabras.println(nuevaPalabra); else {
* System.out.print(nuevaPalabra + '\t'); if (definicion.equals("")) palabra = linea;
* System.out.println(palabra); else System.out.println(palabra + '\t' + definicion = "";
* definicion); } if (psPalabras!=null) psPalabras.flush(); } }
*/ else
{
palabra = linea.substring(0, marker).trim();
definicion = linea.substring(marker+1).trim();
}
nuevaPalabra = wylieToAcip(palabra);
if (psPalabras!=null)
psPalabras.println(nuevaPalabra);
else System.out.print(nuevaPalabra + '\t');
if (definicion.equals(""))
System.out.println(palabra);
else
System.out.println(palabra + '\t' + definicion);
}
if (psPalabras!=null) psPalabras.flush();
}*/
} }