2002-10-03 19:28:09 +00:00
|
|
|
/*
|
|
|
|
The contents of this file are subject to the AMP Open Community License
|
|
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
|
|
with the License. You may obtain a copy of the License on the AMP web site
|
|
|
|
(http://www.tibet.iteso.mx/Guatemala/).
|
|
|
|
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
|
|
License for the specific terms governing rights and limitations under the
|
|
|
|
License.
|
|
|
|
|
|
|
|
The Initial Developer of this software is Andres Montano Pellegrini. Portions
|
|
|
|
created by Andres Montano Pellegrini are Copyright 2001 Andres Montano
|
|
|
|
Pellegrini. All Rights Reserved.
|
|
|
|
|
|
|
|
Contributor(s): ______________________________________.
|
|
|
|
*/
|
|
|
|
|
|
|
|
package org.thdl.tib.scanner;
|
2003-04-01 13:08:38 +00:00
|
|
|
import org.thdl.util.*;
|
2002-10-03 19:28:09 +00:00
|
|
|
import java.io.*;
|
2003-04-01 13:08:38 +00:00
|
|
|
import java.util.Vector;
|
|
|
|
import java.util.Enumeration;
|
2002-10-03 19:28:09 +00:00
|
|
|
|
2002-11-03 08:56:11 +00:00
|
|
|
/** Loads dictionary stored in tree format and searches for words recursively.
|
|
|
|
How the dictionary is loaded depends on which implementation of
|
|
|
|
{@link SyllableListTree} is invoked.
|
|
|
|
|
|
|
|
@author Andrés Montano Pellegrini
|
|
|
|
@see SyllableListTree
|
|
|
|
*/
|
2002-11-29 08:08:54 +00:00
|
|
|
public class LocalTibetanScanner implements TibetanScanner
|
2002-10-03 19:28:09 +00:00
|
|
|
{
|
|
|
|
public static String archivo;
|
|
|
|
private SyllableListTree raiz, silActual, lastCompSil, silAnterior;
|
|
|
|
private String wordActual, lastCompWord;
|
|
|
|
private Vector floatingSil;
|
2003-04-01 13:08:38 +00:00
|
|
|
private SimplifiedLinkedList wordList;
|
2002-10-03 19:28:09 +00:00
|
|
|
private static String endOfParagraphMarks = "/;|!:[]^@#$%=<>(){}";
|
|
|
|
private static String endOfSyllableMarks = " _\t";
|
|
|
|
|
|
|
|
static
|
|
|
|
{
|
|
|
|
archivo = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void clearTokens()
|
|
|
|
{
|
2003-04-01 13:08:38 +00:00
|
|
|
wordList = new SimplifiedLinkedList();
|
2002-10-03 19:28:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
public DictionarySource getDictionarySource()
|
|
|
|
{
|
|
|
|
return raiz.getDictionarySource();
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Builds a scanner over the dictionary identified by <code>arch</code>.
    The path is also stored in the static field {@link #archivo}.

    @param arch dictionary path passed on to the syllable tree
    @throws Exception declared because building the tree may fail
*/
public LocalTibetanScanner(String arch) throws Exception
{
    // The static path is published first, before the tree is built.
    archivo = arch;

    // A cached tree is used here; MemorySyllableListTree and
    // FileSyllableListTree are the alternatives that were tried before.
    this.raiz = new CachedSyllableListTree(archivo);

    this.floatingSil = new Vector();
    this.wordList = new SimplifiedLinkedList();
    this.resetAll();
}
|
|
|
|
|
|
|
|
private void resetAll()
|
|
|
|
{
|
|
|
|
silAnterior = silActual = lastCompSil = null;
|
|
|
|
wordActual = lastCompWord = null;
|
|
|
|
}
|
|
|
|
|
|
|
|
private void scanSyllable(String sil)
|
|
|
|
{
|
|
|
|
SyllableListTree resultado=null;
|
|
|
|
Enumeration enum;
|
|
|
|
Word w;
|
|
|
|
String silSinDec;
|
2003-03-23 20:27:54 +00:00
|
|
|
boolean aadded;
|
2002-10-03 19:28:09 +00:00
|
|
|
|
|
|
|
if (silActual==null)
|
|
|
|
silActual = raiz;
|
|
|
|
|
|
|
|
silAnterior = silActual;
|
|
|
|
silActual = silActual.lookUp(sil);
|
|
|
|
|
|
|
|
if (silActual != null)
|
|
|
|
{
|
|
|
|
if (silActual.hasDef())
|
|
|
|
{
|
|
|
|
lastCompWord = concatWithSpace(wordActual, sil);
|
|
|
|
lastCompSil = silActual;
|
|
|
|
floatingSil.removeAllElements();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
silSinDec = withOutDec(sil);
|
2003-03-23 20:27:54 +00:00
|
|
|
resultado=null;
|
|
|
|
// while to take into account very weird cases like le'u'i'o
|
|
|
|
while (resultado == null && silSinDec!=null)
|
2002-10-03 19:28:09 +00:00
|
|
|
{
|
|
|
|
resultado = silAnterior.lookUp(silSinDec);
|
|
|
|
if (resultado == null)
|
|
|
|
{
|
|
|
|
silSinDec += "\'";
|
|
|
|
resultado = silAnterior.lookUp(silSinDec);
|
2003-03-23 20:27:54 +00:00
|
|
|
aadded=true;
|
2002-10-03 19:28:09 +00:00
|
|
|
}
|
2003-03-23 20:27:54 +00:00
|
|
|
else aadded=false;
|
2002-10-03 19:28:09 +00:00
|
|
|
|
|
|
|
if (resultado!=null && resultado.hasDef())
|
|
|
|
{
|
|
|
|
lastCompWord = concatWithSpace(wordActual, silSinDec);
|
|
|
|
lastCompSil = resultado;
|
|
|
|
wordActual = concatWithSpace(wordActual, sil);
|
2003-03-23 20:27:54 +00:00
|
|
|
floatingSil.removeAllElements();
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
resultado = null;
|
|
|
|
if (aadded) silSinDec = silSinDec.substring(0, silSinDec.length()-1);
|
|
|
|
silSinDec = withOutDec(silSinDec);
|
2002-10-03 19:28:09 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (resultado!=null) return;
|
|
|
|
|
|
|
|
if (lastCompSil!=null)
|
|
|
|
floatingSil.addElement(sil);
|
|
|
|
}
|
|
|
|
wordActual = concatWithSpace(wordActual, sil);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
silSinDec = withOutDec(sil);
|
2003-03-23 20:27:54 +00:00
|
|
|
resultado = null;
|
|
|
|
// while to take into account very weird cases like le'u'i'o
|
|
|
|
while (resultado==null && silSinDec!=null)
|
2002-10-03 19:28:09 +00:00
|
|
|
{
|
|
|
|
resultado = silAnterior.lookUp(silSinDec);
|
|
|
|
if (resultado == null)
|
|
|
|
{
|
|
|
|
silSinDec += "\'";
|
|
|
|
resultado = silAnterior.lookUp(silSinDec);
|
2003-03-23 20:27:54 +00:00
|
|
|
aadded=true;
|
2002-10-03 19:28:09 +00:00
|
|
|
}
|
2003-03-23 20:27:54 +00:00
|
|
|
else aadded=false;
|
2002-10-03 19:28:09 +00:00
|
|
|
// si funciona sin declension arreglado problema
|
|
|
|
if (resultado!=null && resultado.hasDef())
|
|
|
|
{
|
|
|
|
wordList.addLast(new Word(concatWithSpace(wordActual, silSinDec), concatWithSpace(wordActual,sil), resultado.getDefs()));
|
|
|
|
resetAll();
|
|
|
|
floatingSil.removeAllElements();
|
|
|
|
}
|
2003-03-23 20:27:54 +00:00
|
|
|
else
|
|
|
|
{
|
|
|
|
resultado = null;
|
|
|
|
if (aadded) silSinDec = silSinDec.substring(0, silSinDec.length()-1);
|
|
|
|
silSinDec = withOutDec(silSinDec);
|
|
|
|
}
|
|
|
|
|
2002-10-03 19:28:09 +00:00
|
|
|
}
|
|
|
|
if (resultado!=null) return;
|
|
|
|
|
|
|
|
if (lastCompSil!=null)
|
|
|
|
{
|
|
|
|
w = new Word(lastCompWord, lastCompSil.getDefs());
|
|
|
|
wordList.addLast(w);
|
|
|
|
this.resetAll();
|
|
|
|
|
|
|
|
enum = floatingSil.elements();
|
|
|
|
floatingSil = new Vector();
|
|
|
|
while (enum.hasMoreElements())
|
|
|
|
scanSyllable((String)enum.nextElement());
|
|
|
|
|
|
|
|
scanSyllable(sil);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
if (silAnterior!=raiz)
|
|
|
|
{
|
|
|
|
w = new Word(wordActual, "[incomplete word]");
|
|
|
|
wordList.addLast(w);
|
|
|
|
this.resetAll();
|
|
|
|
scanSyllable(sil);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
w = new Word(sil, "[not found]");
|
|
|
|
wordList.addLast(w);
|
|
|
|
this.resetAll();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
public void finishUp()
|
|
|
|
{
|
|
|
|
Enumeration enum;
|
|
|
|
Word w;
|
|
|
|
|
|
|
|
while (lastCompSil!=null)
|
|
|
|
{
|
|
|
|
w = new Word(lastCompWord, lastCompSil.getDefs());
|
|
|
|
wordList.addLast(w);
|
|
|
|
this.resetAll();
|
|
|
|
|
|
|
|
enum = floatingSil.elements();
|
|
|
|
floatingSil = new Vector();
|
|
|
|
while (enum.hasMoreElements())
|
|
|
|
scanSyllable((String)enum.nextElement());
|
|
|
|
}
|
|
|
|
|
|
|
|
if (silActual!=null)
|
|
|
|
{
|
|
|
|
wordList.addLast(new Word(wordActual, "[incomplete word]"));
|
|
|
|
this.resetAll();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String concatWithSpace(String s1, String s2)
|
|
|
|
{
|
|
|
|
if (s1==null || s1.equals(""))
|
|
|
|
return s2;
|
|
|
|
else
|
|
|
|
return s1 + ' ' + s2;
|
|
|
|
}
|
|
|
|
|
|
|
|
private static String withOutDec(String sil)
|
|
|
|
{
|
|
|
|
boolean isDeclined =false;
|
2003-03-23 20:27:54 +00:00
|
|
|
int len = sil.length(), apos;
|
2002-10-03 19:28:09 +00:00
|
|
|
|
|
|
|
if (len<3) return null;
|
|
|
|
|
2002-10-18 23:28:49 +00:00
|
|
|
char lastCar = Character.toLowerCase(sil.charAt(len-1));
|
|
|
|
if ((lastCar == 's' || lastCar == 'r') && Manipulate.isVowel(sil.charAt(len-2)))
|
2002-10-03 19:28:09 +00:00
|
|
|
{
|
|
|
|
isDeclined=true;
|
|
|
|
sil = sil.substring(0, len-1);
|
|
|
|
}
|
2003-03-23 20:27:54 +00:00
|
|
|
else
|
2002-10-03 19:28:09 +00:00
|
|
|
{
|
2003-03-23 20:27:54 +00:00
|
|
|
apos = sil.lastIndexOf('\'');
|
|
|
|
if (apos>0 && apos < len-1 && Manipulate.isVowel(sil.charAt(apos-1)) && sil.charAt(apos+1)!='u')
|
|
|
|
{
|
|
|
|
isDeclined=true;
|
|
|
|
sil = sil.substring(0, apos);
|
|
|
|
}
|
|
|
|
/* if ((lastCar == 'i' || lastCar == 'o') && sil.charAt(len-2)=='\'')
|
|
|
|
{
|
|
|
|
isDeclined=true;
|
|
|
|
sil = sil.substring(0, len-2);
|
|
|
|
}*/
|
2002-10-03 19:28:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!isDeclined) return null;
|
|
|
|
return sil;
|
|
|
|
}
|
|
|
|
|
|
|
|
public void scanBody(String in)
|
|
|
|
{
|
|
|
|
boolean hayMasLineas=true;
|
|
|
|
|
|
|
|
if (in.equals("")) finishUp();
|
|
|
|
else
|
|
|
|
{
|
|
|
|
int init = 0, fin;
|
|
|
|
String linea;
|
|
|
|
|
|
|
|
while (hayMasLineas)
|
|
|
|
{
|
|
|
|
fin = in.indexOf("\n",init);
|
|
|
|
if (fin<0)
|
|
|
|
{
|
|
|
|
linea = in.substring(init).trim();
|
|
|
|
hayMasLineas=false;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
linea = in.substring(init, fin).trim();
|
|
|
|
|
|
|
|
if (linea.equals(""))
|
|
|
|
{
|
|
|
|
finishUp();
|
|
|
|
wordList.addLast(new PunctuationMark('\n'));
|
|
|
|
}
|
|
|
|
else
|
|
|
|
scanLine(linea);
|
|
|
|
|
|
|
|
init = fin+1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
private boolean isEndOfSyllable(int ch)
|
|
|
|
{
|
|
|
|
return (endOfSyllableMarks.indexOf(ch)>-1);
|
|
|
|
}
|
|
|
|
|
|
|
|
public void scanLine(String linea)
|
|
|
|
{
|
2002-11-02 16:01:40 +00:00
|
|
|
int init = 0, fin;
|
2002-10-03 19:28:09 +00:00
|
|
|
char ch;
|
|
|
|
String sil;
|
|
|
|
boolean doNotFinishUp;
|
|
|
|
|
|
|
|
if (linea.equals(""))
|
|
|
|
{
|
|
|
|
finishUp();
|
|
|
|
wordList.addLast(new PunctuationMark('\n'));
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
|
|
|
outAHere:
|
|
|
|
while(true)
|
|
|
|
{
|
|
|
|
doNotFinishUp=true;
|
|
|
|
|
|
|
|
// Make init skip all punctuation marks
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
if (init>=linea.length())
|
|
|
|
break outAHere;
|
|
|
|
ch = linea.charAt(init);
|
|
|
|
if (endOfParagraphMarks.indexOf(ch)>=0)
|
|
|
|
{
|
|
|
|
if (doNotFinishUp)
|
|
|
|
{
|
|
|
|
finishUp();
|
|
|
|
doNotFinishUp=false;
|
|
|
|
}
|
|
|
|
wordList.addLast(new PunctuationMark(ch));
|
|
|
|
}
|
|
|
|
else if (endOfSyllableMarks.indexOf(ch)<0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
init++;
|
|
|
|
}
|
|
|
|
|
|
|
|
doNotFinishUp = true;
|
|
|
|
|
|
|
|
/* move fin to the end of the next syllable. If finishing
|
|
|
|
up is necessary it is done after scanSyllable
|
|
|
|
*/
|
|
|
|
|
|
|
|
fin = init+1;
|
|
|
|
while (true)
|
|
|
|
{
|
|
|
|
ch = linea.charAt(fin);
|
|
|
|
if (endOfParagraphMarks.indexOf(ch)>=0)
|
|
|
|
{
|
|
|
|
doNotFinishUp = false;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else if (endOfSyllableMarks.indexOf(ch)>=0)
|
|
|
|
{
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
fin++;
|
|
|
|
if (fin>=linea.length())
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
sil = linea.substring(init, fin);
|
|
|
|
scanSyllable(sil);
|
|
|
|
|
|
|
|
if (!doNotFinishUp)
|
|
|
|
{
|
|
|
|
finishUp();
|
|
|
|
wordList.addLast(new PunctuationMark(ch));
|
|
|
|
}
|
|
|
|
init = fin+1;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2003-04-01 13:08:38 +00:00
|
|
|
public SimplifiedLinkedList getTokenLinkedList()
|
2002-10-03 19:28:09 +00:00
|
|
|
{
|
|
|
|
return wordList;
|
|
|
|
}
|
|
|
|
|
|
|
|
public Token[] getTokenArray()
|
|
|
|
{
|
|
|
|
int i=wordList.size();
|
|
|
|
Token token[] = new Token[i];
|
2003-04-01 13:08:38 +00:00
|
|
|
SimplifiedListIterator li = wordList.listIterator();
|
2002-10-03 19:28:09 +00:00
|
|
|
while(li.hasNext())
|
|
|
|
token[--i] = (Token)li.next();
|
|
|
|
return token;
|
|
|
|
}
|
|
|
|
|
|
|
|
/** Looks for .dic file, and returns the dictionary descriptions.
|
|
|
|
Also updates the definitionTags in the Definitions class.
|
|
|
|
*/
|
|
|
|
public String[] getDictionaryDescriptions()
|
|
|
|
{
|
|
|
|
int n;
|
|
|
|
try
|
|
|
|
{
|
|
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(archivo + ".dic")));
|
2003-04-01 13:08:38 +00:00
|
|
|
SimplifiedLinkedList ll1 = new SimplifiedLinkedList(), ll2 = new SimplifiedLinkedList();
|
2002-10-03 19:28:09 +00:00
|
|
|
String s;
|
|
|
|
while ((s=br.readLine())!=null)
|
|
|
|
{
|
|
|
|
n = s.indexOf(",");
|
|
|
|
if (n < 0)
|
|
|
|
{
|
|
|
|
ll1.addLast(null);
|
|
|
|
ll2.addLast(s);
|
|
|
|
}
|
|
|
|
else
|
|
|
|
{
|
|
|
|
ll1.addLast(s.substring(0,n).trim());
|
|
|
|
ll2.addLast(s.substring(n+1).trim());
|
|
|
|
}
|
|
|
|
}
|
|
|
|
Definitions.defTags = ll2.toStringArray();
|
|
|
|
return ll1.toStringArray();
|
|
|
|
}
|
|
|
|
catch (Exception e)
|
|
|
|
{
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|