Updated "deleteQuotes" method to get rid of the double quotations properly.

This commit is contained in:
amontano 2005-04-26 05:28:02 +00:00
parent 3f378c7a66
commit 7379440520

View file

@ -17,60 +17,181 @@
*/ */
package org.thdl.tib.scanner; package org.thdl.tib.scanner;
/** /** Miscellaneous static methods for the manipulation of Tibetan text.
* Miscellaneous static methods for the manipulation of Tibetan text.
* @author Andrés Montano Pellegrini
* @author Andrés Montano Pellegrini
*/ */
public class Manipulate { public class Manipulate
{
/* private static String endOfParagraphMarks = "/;|!:^@#$%=";
* public static String[] parseFields (String s, char delimiter) { int pos; private static String bracketMarks = "<>(){}[]";
* String field; SimplifiedLinkedList ll = new SimplifiedLinkedList(); private static String endOfSyllableMarks = " _\t";
* private static String allStopMarkers = endOfSyllableMarks + endOfParagraphMarks + bracketMarks;
* while ((pos = s.indexOf(delimiter))>=0) { field = s.substring(0,
* pos).trim(); ll.addLast(field); s = s.substring(pos+1); }
*
* ll.addLast(s.trim()); return ll.toStringArray(); }
*/
public static String replace(String linea, String origSub, String newSub) { /* public static String[] parseFields (String s, char delimiter)
{
int pos;
String field;
SimplifiedLinkedList ll = new SimplifiedLinkedList();
while ((pos = s.indexOf(delimiter))>=0)
{
field = s.substring(0, pos).trim();
ll.addLast(field);
s = s.substring(pos+1);
}
ll.addLast(s.trim());
return ll.toStringArray();
}*/
/**
 * Finds the first position in <code>str</code> that holds any of the
 * characters listed in <code>chars</code>.
 *
 * @param str   text scanned from left to right
 * @param chars the characters being searched for
 * @return index of the first character of <code>str</code> that also occurs
 *         in <code>chars</code>, or -1 when no such character exists
 */
public static int indexOfAnyChar(String str, String chars)
{
    final int total = str.length();
    int index = 0;
    while (index < total)
    {
        final char current = str.charAt(index);
        if (chars.indexOf(current) != -1)
        {
            return index;
        }
        index++;
    }
    return -1;
}
/**
 * Locates the first stop marker in <code>word</code>, where the stop markers
 * are the characters of <code>allStopMarkers</code> (end-of-syllable,
 * end-of-paragraph and bracket marks combined).
 *
 * @param word text to scan
 * @return index of the first stop marker, or -1 if there is none
 */
public static int indexOfExtendedEndOfSyllableMark(String word)
{
    final int firstStop = indexOfAnyChar(word, allStopMarkers);
    return firstStop;
}
/**
 * Locates the first bracket character (any character of
 * <code>bracketMarks</code>) in <code>word</code>.
 *
 * @param word text to scan
 * @return index of the first bracket mark, or -1 if there is none
 */
public static int indexOfBracketMarks(String word)
{
    final int firstBracket = indexOfAnyChar(word, bracketMarks);
    return firstBracket;
}
/**
 * Tells whether <code>ch</code> is an end-of-paragraph mark or a bracket mark.
 *
 * @param ch character (code point) to classify
 * @return true if <code>ch</code> occurs in <code>endOfParagraphMarks</code>
 *         or in <code>bracketMarks</code>
 */
public static boolean isPunctuationMark(int ch)
{
    if (endOfParagraphMarks.indexOf(ch) != -1)
    {
        return true;
    }
    return bracketMarks.indexOf(ch) != -1;
}
/**
 * Tells whether <code>ch</code> is one of the end-of-paragraph marks.
 *
 * @param ch character (code point) to classify
 * @return true if <code>ch</code> occurs in <code>endOfParagraphMarks</code>
 */
public static boolean isEndOfParagraphMark(int ch)
{
    final int where = endOfParagraphMarks.indexOf(ch);
    return where != -1;
}
/**
 * Tells whether <code>ch</code> is one of the end-of-syllable marks.
 *
 * @param ch character (code point) to classify
 * @return true if <code>ch</code> occurs in <code>endOfSyllableMarks</code>
 */
public static boolean isEndOfSyllableMark(int ch)
{
    final int where = endOfSyllableMarks.indexOf(ch);
    return where != -1;
}
/**
 * Tells whether <code>s</code> contains at least one letter or digit.
 *
 * @param s text to inspect
 * @return true as soon as a letter or digit is found; false for an empty
 *         string or one made up only of other characters
 */
public static boolean isMeaningful(String s)
{
    boolean found = false;
    int idx = 0;
    // Stop scanning at the first letter or digit.
    while (!found && idx < s.length())
    {
        found = Character.isLetterOrDigit(s.charAt(idx));
        idx++;
    }
    return found;
}
public static String replace(String linea, String origSub, String newSub)
{
int pos, lenOrig = origSub.length(); int pos, lenOrig = origSub.length();
while ((pos = linea.indexOf(origSub)) != -1) { while ((pos = linea.indexOf(origSub))!=-1)
linea = linea.substring(0, pos).concat(newSub).concat( {
linea.substring(pos + lenOrig)); linea = linea.substring(0, pos).concat(newSub).concat(linea.substring(pos+lenOrig));
} }
return linea; return linea;
} }
public static boolean isVowel(char ch) { public static String deleteSubstring (String string, int pos, int posEnd)
{
if (pos<0) return string;
if (pos==0)
{
return string.substring(posEnd).trim();
}
else
{
if (posEnd<string.length())
return string.substring(0, pos).concat(string.substring(posEnd)).trim();
else
return string.substring(0, pos).trim();
}
}
/**
 * Replaces the characters of <code>string</code> in the range
 * [<code>pos</code>, <code>posEnd</code>) with <code>newSub</code> and trims
 * the result.
 *
 * A <code>posEnd</code> at or beyond the end of the string means "up to the
 * end", whatever the value of <code>pos</code>. Previously this was only
 * tolerated when <code>pos</code> was greater than 0; with <code>pos</code>
 * equal to 0 an exception was thrown instead.
 *
 * @param string text to modify
 * @param pos    index of the first character to replace; when negative the
 *               original string is returned untouched (and untrimmed)
 * @param posEnd index just past the last character to replace
 * @param newSub text inserted in place of the removed range
 * @return the modified text with leading and trailing whitespace removed
 */
public static String replace(String string, int pos, int posEnd, String newSub)
{
    if (pos < 0)
    {
        return string;
    }
    // Clamp the tail so that an end index past the string simply drops
    // everything from pos onwards.
    String tail = "";
    if (posEnd < string.length())
    {
        tail = string.substring(posEnd);
    }
    return string.substring(0, pos).concat(newSub).concat(tail).trim();
}
/**
 * Removes the first occurrence of <code>sub</code> from <code>string</code>
 * by delegating to the position-based <code>deleteSubstring</code> overload.
 *
 * @param string text to modify
 * @param sub    text to remove; when it does not occur, the start position
 *               handed to the overload is -1
 * @return whatever the position-based overload returns for that range
 */
public static String deleteSubstring (String string, String sub)
{
    final int start = string.indexOf(sub);
    final int end = start + sub.length();
    return deleteSubstring(string, start, end);
}
/**
 * Builds a copy of <code>array</code> that is one element longer, with
 * <code>s</code> inserted at index <code>n</code>.
 *
 * @param array source array; it is not modified
 * @param s     element to insert
 * @param n     index the new element occupies in the result
 * @return a new array holding the elements before <code>n</code>, then
 *         <code>s</code>, then the remaining elements shifted up by one
 */
public static String[] addString(String array[], String s, int n)
{
    String[] grown = new String[array.length + 1];
    // Elements before the insertion point keep their index.
    System.arraycopy(array, 0, grown, 0, n);
    grown[n] = s;
    // Everything from n onwards moves one slot up.
    System.arraycopy(array, n, grown, n + 1, array.length - n);
    return grown;
}
/**
 * Builds a copy of <code>array</code> that is one element shorter, leaving
 * out the element at index <code>n</code>.
 *
 * @param array source array; it is not modified
 * @param n     index of the element to drop
 * @return a new array with the elements before <code>n</code> followed by
 *         the elements after <code>n</code>
 */
public static String[] deleteString(String array[], int n)
{
    String[] shrunk = new String[array.length - 1];
    // Elements before the removed slot keep their index.
    System.arraycopy(array, 0, shrunk, 0, n);
    // Elements after the removed slot move one slot down.
    System.arraycopy(array, n + 1, shrunk, n, shrunk.length - n);
    return shrunk;
}
public static boolean isVowel (char ch)
{
ch = Character.toLowerCase(ch); ch = Character.toLowerCase(ch);
return ch=='a' || ch=='e' || ch=='i' || ch=='o' || ch=='u'; return ch=='a' || ch=='e' || ch=='i' || ch=='o' || ch=='u';
} }
public static String wylieToAcip(String palabra) { public static String wylieToAcip(String palabra)
{
// DLC FIXME: for unknown things, return null. // DLC FIXME: for unknown things, return null.
if (palabra.equals("@##")) if (palabra.equals("@##")) return "#";
return "#"; if (palabra.equals("@#")) return "*";
if (palabra.equals("@#")) if (palabra.equals("!")) return "`";
return "*"; if (palabra.equals("b+h")) return "BH";
if (palabra.equals("!")) if (palabra.equals("d+h")) return "DH";
return "`"; if (palabra.equals("X")) return null;
if (palabra.equals("b+h")) if (palabra.equals("iA")) return null;
return "BH"; if (palabra.equals("ai")) return "EE";
if (palabra.equals("d+h")) if (palabra.equals("au")) return "OO";
return "DH"; if (palabra.equals("$")) return null;
if (palabra.equals("X"))
return null;
if (palabra.equals("iA"))
return null;
if (palabra.equals("ai"))
return "EE";
if (palabra.equals("au"))
return "OO";
if (palabra.equals("$"))
return null;
if (palabra.startsWith("@") || palabra.startsWith("#")) if (palabra.startsWith("@") || palabra.startsWith("#"))
return null; // we can't convert this in isolation! We need context. return null; // we can't convert this in isolation! We need context.
char []caract; char []caract;
@ -79,19 +200,28 @@ public class Manipulate {
caract = palabra.toCharArray(); caract = palabra.toCharArray();
len = palabra.length(); len = palabra.length();
for (j = 0; j < len; j++) { for (j=0; j<len; j++)
{
i = j; i = j;
/* /*ciclo:
* ciclo: while(true) // para manejar excepciones; que honda! { while(true) // para manejar excepciones; que honda!
* switch(caract[i]) { case 'A': if (i>0) { i--; break; } default: {
*/ switch(caract[i])
{
case 'A':
if (i>0)
{
i--;
break;
}
default:*/
if (Character.isLowerCase(caract[i])) if (Character.isLowerCase(caract[i]))
caract[i] = Character.toUpperCase(caract[i]); caract[i] = Character.toUpperCase(caract[i]);
else if (Character.isUpperCase(caract[i])) else if (Character.isUpperCase(caract[i]))
caract[i] = Character.toLowerCase(caract[i]); caract[i] = Character.toLowerCase(caract[i]);
/* /* break ciclo;
* break ciclo; } } }
*/ }*/
} }
nuevaPalabra = new String(caract); nuevaPalabra = new String(caract);
// nuevaPalabra = palabra.toUpperCase(); // nuevaPalabra = palabra.toUpperCase();
@ -112,7 +242,29 @@ public class Manipulate {
return nuevaPalabra; return nuevaPalabra;
} }
public static String acipToWylie(String linea) { /** If more than half of the letters among the first 10 characters
are uppercase, assume the text is ACIP. */
public static boolean guessIfAcip(String line)
{
    // Only the first 10 characters (or fewer, for a short line) are sampled.
    final int limit = line.length() > 10 ? 10 : line.length();
    int letterCount = 0;
    int upperCount = 0;
    for (int k = 0; k < limit; k++)
    {
        final char c = line.charAt(k);
        if (!Character.isLetter(c))
        {
            continue; // digits, spaces and punctuation do not vote
        }
        letterCount++;
        if (Character.isUpperCase(c))
        {
            upperCount++;
        }
    }
    // True only when strictly more than half of the sampled letters are
    // uppercase; no letters, or no uppercase letters, yields false.
    return upperCount > 0 && letterCount < 2 * upperCount;
}
public static String acipToWylie(String linea)
{
char caract[], ch, chP, chN; char caract[], ch, chP, chN;
String nuevaLinea; String nuevaLinea;
int i, len; int i, len;
@ -120,7 +272,8 @@ public class Manipulate {
caract = linea.toCharArray(); caract = linea.toCharArray();
len = linea.length(); len = linea.length();
for (i = 0; i < len; i++) { for (i=0; i<len; i++)
{
if (Character.isLowerCase(caract[i])) if (Character.isLowerCase(caract[i]))
caract[i] = Character.toUpperCase(caract[i]); caract[i] = Character.toUpperCase(caract[i]);
else if (Character.isUpperCase(caract[i])) else if (Character.isUpperCase(caract[i]))
@ -128,14 +281,12 @@ public class Manipulate {
} }
nuevaLinea = new String(caract); nuevaLinea = new String(caract);
/* /* ahora hacer los cambios de Michael Roach ts -> tsh, tz -> ts, v -> w,
* ahora hacer los cambios de Michael Roach ts -> tsh, tz -> ts, v -> w, TH -> Th, kSH, kaSH -> k+Sh, SH -> Sh, : -> H, dh -> d+h, gh -> g+h, bh -> b+h, dzh -> dz+h,
* TH -> Th, kSH, kaSH -> k+Sh, SH -> Sh, : -> H, dh -> d+h, gh -> g+h, aa -> a, a'a -> A, ai->i, aee ->ai, au->u, aoo->au, ae->e,
* bh -> b+h, dzh -> dz+h, aa -> a, a'a -> A, ai->i, aee ->ai, au->u, ao->o, ee->ai, oo->au, 'I->-I I->-i, a'i->I, a'u->U, a'e->E, a'o->O,
* aoo->au, ae->e, ao->o, ee->ai, oo->au, 'I->-I I->-i, a'i->I, a'u->U, a'i->I, a'u->U, a'e->E, a'o->O, ,->/, # -> @##, * -> @#, \ -> ?, ` -> !,
* a'e->E, a'o->O, a'i->I, a'u->U, a'e->E, a'o->O, ,->/, # -> @##, * -> /-/ -> (-), ga-y -> g.y, g-y -> g.y, na-y -> n+y */
* @#, \ -> ?, ` -> !, /-/ -> (-), ga-y -> g.y, g-y -> g.y, na-y -> n+y
*/
nuevaLinea = replace(nuevaLinea, "ts", "tq"); nuevaLinea = replace(nuevaLinea, "ts", "tq");
nuevaLinea = replace(nuevaLinea, "tz", "ts"); nuevaLinea = replace(nuevaLinea, "tz", "ts");
@ -146,6 +297,7 @@ public class Manipulate {
nuevaLinea = replace(nuevaLinea, "kaSH", "k+Sh"); nuevaLinea = replace(nuevaLinea, "kaSH", "k+Sh");
nuevaLinea = replace(nuevaLinea, "SH", "Sh"); nuevaLinea = replace(nuevaLinea, "SH", "Sh");
nuevaLinea = replace(nuevaLinea, ":", "H"); nuevaLinea = replace(nuevaLinea, ":", "H");
nuevaLinea = replace(nuevaLinea, "NH", "NaH");
nuevaLinea = replace(nuevaLinea, "dh", "d+h"); nuevaLinea = replace(nuevaLinea, "dh", "d+h");
nuevaLinea = replace(nuevaLinea, "gh", "g+h"); nuevaLinea = replace(nuevaLinea, "gh", "g+h");
nuevaLinea = replace(nuevaLinea, "bh", "b+h"); nuevaLinea = replace(nuevaLinea, "bh", "b+h");
@ -169,60 +321,58 @@ public class Manipulate {
nuevaLinea = replace(nuevaLinea, "na-y", "n+y"); nuevaLinea = replace(nuevaLinea, "na-y", "n+y");
len = nuevaLinea.length(); len = nuevaLinea.length();
for (i = 0; i < len; i++) { for (i=0; i<len; i++)
{
ch = nuevaLinea.charAt(i); ch = nuevaLinea.charAt(i);
switch (ch) { switch(ch)
{
case '#': case '#':
nuevaLinea = nuevaLinea.substring(0, i) + "@##" nuevaLinea = nuevaLinea.substring(0,i) + "@##" + nuevaLinea.substring(i+1);
+ nuevaLinea.substring(i + 1);
i+=3; i+=3;
len+=2; len+=2;
break; break;
case '*': case '*':
nuevaLinea = nuevaLinea.substring(0, i) + "@#" nuevaLinea = nuevaLinea.substring(0,i) + "@#" + nuevaLinea.substring(i+1);
+ nuevaLinea.substring(i + 1);
i+=2; i+=2;
len++; len++;
break; break;
case '\'': case '\'':
if (i > 0 && i < len - 1) { if (i>0 && i<len-1)
{
chP = nuevaLinea.charAt(i-1); chP = nuevaLinea.charAt(i-1);
chN = nuevaLinea.charAt(i+1); chN = nuevaLinea.charAt(i+1);
if (Character.isLetter(chP) && !isVowel(chP) if (isVowel(chN))
&& isVowel(chN)) { {
nuevaLinea = nuevaLinea.substring(0, i) if (Character.isLetter(chP) && !isVowel(chP))
+ Character.toUpperCase(chN) {
+ nuevaLinea.substring(i + 2); nuevaLinea = nuevaLinea.substring(0, i) + Character.toUpperCase(chN) + nuevaLinea.substring(i+2);
len--; len--;
} }
} else if (chP=='a' && (i==1 || i>1 && !Character.isLetter(nuevaLinea.charAt(i-2)) || chN == 'a' && (i+2==len || !Character.isLetter(nuevaLinea.charAt(i+2)))))
break; {
case 'a': nuevaLinea = nuevaLinea.substring(0,i-1) + Character.toUpperCase(chN) + nuevaLinea.substring(i+2);
if ((i < len - 3 && nuevaLinea.charAt(i + 1) == '\'' && isVowel(nuevaLinea
.charAt(i + 2)))
&& (i == 0 || !Character.isLetter(nuevaLinea
.charAt(i - 1)))) {
nuevaLinea = nuevaLinea.substring(0, i)
+ Character.toUpperCase(nuevaLinea.charAt(i + 2))
+ nuevaLinea.substring(i + 3);
len-=2; len-=2;
} }
} }
} }
}
}
open = false; open = false;
for (i = 0; i < len; i++) { for (i=0; i<len; i++)
{
ch = nuevaLinea.charAt(i); ch = nuevaLinea.charAt(i);
if (ch == '/') { if (ch=='/')
if (open) { {
nuevaLinea = nuevaLinea.substring(0, i) + ")" if (open)
+ nuevaLinea.substring(i + 1); {
nuevaLinea = nuevaLinea.substring(0, i) + ")" + nuevaLinea.substring(i+1);
open = false; open = false;
} }
else { else
nuevaLinea = nuevaLinea.substring(0, i) + "(" {
+ nuevaLinea.substring(i + 1); nuevaLinea = nuevaLinea.substring(0, i) + "(" + nuevaLinea.substring(i+1);
open = true; open = true;
} }
} }
@ -232,143 +382,152 @@ public class Manipulate {
return nuevaLinea; return nuevaLinea;
} }
public static String fixWazur(String linea) { public static String fixWazur(String linea)
{
int i; int i;
for (i = 1; i < linea.length(); i++) { for (i=1; i<linea.length(); i++)
if (linea.charAt(i) == 'W') { {
if (linea.charAt(i)=='W')
{
if (Character.isLetter(linea.charAt(i-1))) if (Character.isLetter(linea.charAt(i-1)))
linea = linea.substring(0, i) + 'V' linea = linea.substring(0,i) + 'V' + linea.substring(i+1);
+ linea.substring(i + 1);
} }
} }
return linea; return linea;
} }
/** /** Returns the base letter of a syllable. Does not include the vowel!
* Returns the base letter of a syllable. Does not include the vowel! Ignoring cases for now. */
* Ignoring cases for now. public static String getBaseLetter (String sil)
*/ {
public static String getBaseLetter(String sil) {
sil = sil.toLowerCase(); sil = sil.toLowerCase();
int i=0; int i=0;
char ch, ch2; char ch, ch2;
while (!isVowel(sil.charAt(i))) while (!isVowel(sil.charAt(i))) i++;
i++; if (i==0) return "";
if (i == 0)
return "";
i--; i--;
if (i == -1) if (i==-1) return "";
return "";
if (sil.charAt(i) == '-') if (sil.charAt(i)=='-') i--;
i--;
ch = sil.charAt(i); ch = sil.charAt(i);
// check to see if it is a subscript (y, r, l, w) // check to see if it is a subscript (y, r, l, w)
if (i > 0) { if (i>0)
switch (ch) { {
case 'r': switch (ch)
case 'l': {
case 'w': case 'r': case 'l': case 'w': i--;
i--;
break; break;
case 'y': case 'y':
ch2 = sil.charAt(i-1); ch2 = sil.charAt(i-1);
switch (ch2) { switch (ch2)
case '.': {
return "y"; case '.': return "y";
case 'n': case 'n': return "ny";
return "ny"; default: i--;
default:
i--;
} }
} }
} }
if (i == 0) if (i==0) return sil.substring(i,i+1);
return sil.substring(i, i + 1);
ch = sil.charAt(i); ch = sil.charAt(i);
ch2 = sil.charAt(i-1); ch2 = sil.charAt(i-1);
switch (ch) { switch(ch)
{
case 'h': case 'h':
switch (ch2) { switch (ch2)
case 'k': {
case 'c': case 'k': case 'c': case 't': case 'p': case 'z':
case 't':
case 'p':
case 'z':
return sil.substring(i-1,i+1); return sil.substring(i-1,i+1);
case 's': case 's':
if (i - 2 >= 0 && sil.charAt(i - 2) == 't') if (i-2>=0 && sil.charAt(i-2)=='t') return "tsh";
return "tsh"; else return "sh";
else default: return "h";
return "sh";
default:
return "h";
} }
case 's': case 's':
if (ch2 == 't') if (ch2=='t') return "ts";
return "ts"; else return "s";
else
return "s";
case 'g': case 'g':
if (ch2 == 'n') if (ch2=='n') return "ng";
return "ng"; else return "g";
else
return "g";
case 'z': case 'z':
if (ch2 == 'd') if (ch2=='d') return "dz";
return "dz"; else return "z";
else
return "z";
} }
return sil.substring(i,i+1); return sil.substring(i,i+1);
} }
public static String deleteQuotes(String s) { public static String deleteQuotes(String s)
int length = s.length(); {
if (length > 2) { int length = s.length(), pos;
if (length>2)
{
if ((s.charAt(0)=='\"') && (s.charAt(length-1)=='\"')) if ((s.charAt(0)=='\"') && (s.charAt(length-1)=='\"'))
return s.substring(1, length - 1); s = s.substring(1,length-1);
do
{
pos = s.indexOf("\"\"");
if (pos<0) break;
s = Manipulate.deleteSubstring(s, pos, pos+1);
} while (true);
} }
return s; return s;
} }
/**
* Syntax: java Manipulate [word-file] < source-dic-entries >
* dest-dic-entries /** Syntax: java Manipulate [word-file] < source-dic-entries > dest-dic-entries
*
* Takes the output of ConsoleScannerFilter (in RY format), converts the Takes the output of ConsoleScannerFilter
* Wylie to Acip and displays the result in csv format. arch-palabras es (in RY format), converts the Wylie to Acip
* usado solo cuando deseamos las palabras cambiadas a otro archivo. and displays the result in csv format.
* arch-palabras es usado solo cuando deseamos las palabras cambiadas
* a otro archivo.
* public static void main (String[] args) throws Exception { String linea,
* palabra, definicion, nuevaPalabra; int marker; PrintWriter psPalabras =
* null; public static void main (String[] args) throws Exception
* {
* BufferedReader keyb = new BufferedReader(new String linea, palabra, definicion, nuevaPalabra;
* InputStreamReader(System.in)); int marker;
* PrintWriter psPalabras = null;
* if (args.length==1) psPalabras = new PrintWriter(new
* FileOutputStream(args[0])); BufferedReader keyb = new BufferedReader(new InputStreamReader(System.in));
*
* while ((linea=keyb.readLine())!=null) { if (linea.trim().equals("")) if (args.length==1)
* continue; marker = linea.indexOf('-'); if (marker <0) // linea tiene psPalabras = new PrintWriter(new FileOutputStream(args[0]));
* error { palabra = linea; definicion = ""; } else { palabra =
* linea.substring(0, marker).trim(); definicion = while ((linea=keyb.readLine())!=null)
* linea.substring(marker+1).trim(); } {
* if (linea.trim().equals("")) continue;
* nuevaPalabra = wylieToAcip(palabra); marker = linea.indexOf('-');
* if (marker<0) // linea tiene error
* if (psPalabras!=null) psPalabras.println(nuevaPalabra); else {
* System.out.print(nuevaPalabra + '\t'); if (definicion.equals("")) palabra = linea;
* System.out.println(palabra); else System.out.println(palabra + '\t' + definicion = "";
* definicion); } if (psPalabras!=null) psPalabras.flush(); } }
*/ else
{
palabra = linea.substring(0, marker).trim();
definicion = linea.substring(marker+1).trim();
}
nuevaPalabra = wylieToAcip(palabra);
if (psPalabras!=null)
psPalabras.println(nuevaPalabra);
else System.out.print(nuevaPalabra + '\t');
if (definicion.equals(""))
System.out.println(palabra);
else
System.out.println(palabra + '\t' + definicion);
}
if (psPalabras!=null) psPalabras.flush();
}*/
} }