8ccd68789a
imports. It made two errors, but the compiler found them. I've cvs tagged the tree before doing this, just in case.
1012 lines
38 KiB
Java
1012 lines
38 KiB
Java
/*
|
|
The contents of this file are subject to the AMP Open Community License
|
|
Version 1.0 (the "License"); you may not use this file except in compliance
|
|
with the License. You may obtain a copy of the License on the AMP web site
|
|
(http://www.tibet.iteso.mx/Guatemala/).
|
|
|
|
Software distributed under the License is distributed on an "AS IS" basis,
|
|
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
|
License for the specific terms governing rights and limitations under the
|
|
License.
|
|
|
|
The Initial Developer of this software is Andres Montano Pellegrini. Portions
|
|
created by Andres Montano Pellegrini are Copyright 2001 Andres Montano
|
|
Pellegrini. All Rights Reserved.
|
|
|
|
Contributor(s): ______________________________________.
|
|
*/
|
|
package org.thdl.tib.scanner;
|
|
|
|
import java.io.BufferedReader;
|
|
import java.io.File;
|
|
import java.io.FileInputStream;
|
|
import java.io.InputStreamReader;
|
|
import java.io.RandomAccessFile;
|
|
|
|
import org.thdl.util.Link;
|
|
import org.thdl.util.SimplifiedLinkedList;
|
|
import org.thdl.util.SimplifiedListIterator;
|
|
|
|
|
|
/** Converts Tibetan dictionaries stored in text files
|
|
into a binary file tree structure format, to be used
|
|
by some implementations of the SyllableListTree.
|
|
|
|
<p>Syntax (Dictionary files are assumed to be .txt. Don't include extensions!):<ul>
|
|
<li><b>For one dictionary</b>, to read the definitions stored in <i>
|
|
dic-name.txt</i> and organize them into <i>dic-name.wrd</i> and <i>
|
|
dic-name.def</i>:<pre>java -cp DictionarySearchStandalone.jar org.thdl.tib.scanner.BinaryFileGenerator [-delimiter] dict-name</pre>
|
|
</li>
|
|
<li><b>For multiple dictionaries</b>, to read the definitions stored in <i>
|
|
dict-name1.txt</i>, <i>dict-name2.txt</i>, etc. and organize them into <i>
|
|
dest-file-name.wrd</i> and <i>dest-file-name.def</i>:<pre>java -cp DictionarySearchStandalone.jar org.thdl.tib.scanner.BinaryFileGenerator dest-file-name [-delimiter1] dict-name1 [[-delimiter2] dict-name2 ...]</pre>
|
|
</li>
|
|
</ul>
|
|
<p>-delimiter<ul>
|
|
<li><b>If this option is omitted</b>, it is assumed that each line is an entry
|
|
(no multiple-line entries) and the definition and definiendum are separated
|
|
by '-' (a dash). Even though it is not
|
|
required, it is highly recommended to include a space before and afterwards
|
|
(to eliminate any possible ambiguity with regards to the transliteration of
|
|
reverse vowels in <a href="http://iris.lib.virginia.edu/tibet/tools/ewts.pdf" target="_blank">
|
|
Extended Wylie</a>). A sample entry for the dictionary is:
|
|
<hr>
|
|
<pre>bkra shis - 1) auspiciousness, good luck, good fortune, goodness, prosperity, happiness. 2) auspicious, favorable, fortunate, successful, felicitous, lucky. 3) verse of auspiciousness; benediction, blessing. 4) a personal name.
|
|
bde legs - 1) goodness, happiness, well-being, wellfare, auspiciousness, good fortune. 2) well, fine.</pre>
|
|
<hr>
|
|
<p>If this were the content of a file called "<i>my-glossary.txt</i>" the
|
|
binary tree file would be generated with the command:</p>
|
|
<pre>java -cp DictionarySearchStandalone.jar org.thdl.tib.scanner.BinaryFileGenerator my-glossary</pre>
|
|
</li>
|
|
<li>-<b>tab</b>: it is assumed that each line is an entry (no multiple-line
|
|
entries) and the definition and definiendum are separated by '\t' (horizontal tabulation).
|
|
One tabulation is enough; don't feel the need to "align" the definitions in your
|
|
word-processor. A sample entry for the dictionary is:<hr>
|
|
<pre>bkra shis 1) auspiciousness, good luck, good fortune, goodness, prosperity, happiness. 2) auspicious, favorable, fortunate, successful, felicitous, lucky. 3) verse of auspiciousness; benediction, blessing. 4) a personal name.
|
|
bde legs 1) goodness, happiness, well-being, wellfare, auspiciousness, good fortune. 2) well, fine.</pre>
|
|
<hr>
|
|
<p>Here, the
|
|
binary tree file would be generated with the command:</p>
|
|
<pre>java -cp DictionarySearchStandalone.jar org.thdl.tib.scanner.BinaryFileGenerator -tab my-glossary</pre>
|
|
</li>
|
|
<li>
|
|
<b>-<i>string</i></b>: it is assumed that each line is an entry (no multiple-line
|
|
entries) and the definition and definiendum are separated by the character or
|
|
string of characters specified by the user. A sample entry for the dictionary
|
|
is:<hr>
|
|
<pre>bkra shis ** 1) auspiciousness, good luck, good fortune, goodness, prosperity, happiness. 2) auspicious, favorable, fortunate, successful, felicitous, lucky. 3) verse of auspiciousness; benediction, blessing. 4) a personal name.
|
|
bde legs ** 1) goodness, happiness, well-being, wellfare, auspiciousness, good fortune. 2) well, fine.</pre>
|
|
<hr>
|
|
<p>Here, the
|
|
binary tree file would be generated with the command:</p>
|
|
<pre>java -cp DictionarySearchStandalone.jar org.thdl.tib.scanner.BinaryFileGenerator -** my-glossary</pre>
|
|
</li>
|
|
<li>-<b>acip</b>: it is assumed that the electronic file is a transliteration of
|
|
a Tibetan dictionary. It is called "acip" because it accepts Acip's comment
|
|
codes ('@' to mark page numbers, brackets to mark comments, etc). Nevertheless,
|
|
it still requires the files to be in <a href="http://iris.lib.virginia.edu/tibet/tools/ewts.pdf" target="_blank">
|
|
Extended Wylie</a>, so if your file is in Acip's transliteration scheme make
|
|
sure to run <i><a href="#org.thdl.tib.scanner.AcipToWylie">org.thdl.tib.scanner.AcipToWylie</a></i> first. Definitions here can
|
|
be of multiple lines, but with no blank lines in between. It is assumed that the
|
|
definiendum starts after a blank line (except at the beginning of a new page
|
|
where it could start with the last part of the previous definition) up to the <i>
|
|
shad</i> (except when the <i>shad</i> is omitted because of grammar rules as for
|
|
instance no shad after a "ga" suffix without a secondary suffix). Each
|
|
time a new letter starts, it should be clearly marked in brackets ('[', ']'),
|
|
parentheses ('(', ')') or braces ('{','}'). A sample entry for the dictionary is:
|
|
<hr>
|
|
<pre>@1
|
|
|
|
(ka)
|
|
|
|
ka ba/ gdung 'degs don byed nus pa/
|
|
|
|
rkyen/ grogs byed
|
|
|
|
@2
|
|
|
|
(kha)
|
|
|
|
khyod dngos po dang de byung 'brel/ khyod dngos po las byung
|
|
zhing/ dngos po ldog stops kyis khyod ldog pa/
|
|
|
|
khyod dngos po dang bdag gcig 'brel/ khyod ngos po dang bdag
|
|
nyid gcig pa'i sgo nas tha dad gang zhig/ dngos po ldog
|
|
stops kyis khyod ldog pa/
|
|
|
|
khyod dngos po dang 'brel pa/ khyod dngos po dang tha dad gang
|
|
|
|
@3
|
|
|
|
zhig/ ngos po ldog stobs kyis khyod ldog pa/
|
|
|
|
kha dog mdog du rung ba'am/ sngo ser dkar dmar sogs mdog tu
|
|
rung ba'i gzugs/</pre>
|
|
<hr>
|
|
<p>Here the
|
|
binary tree file would be generated with the command:</p>
|
|
<pre>java -cp DictionarySearchStandalone.jar org.thdl.tib.scanner.BinaryFileGenerator -acip my-glossary</pre>
|
|
<p><i>Comments:</i> Notice in the sample text that at the beginning of page 2, "<i>zhig</i>" is not a
|
|
new definiendum, but still is part of the definition of "<i>khyod dngos po dang 'brel
|
|
pa</i>". Also the definiendum of the last entry is "<i>kha dog</i>"
|
|
(the <i>shad</i> was omitted after "<i>ga</i>" suffix) and not "<i>kha dog mdog du rung ba'am</i>".
|
|
Nevertheless the definiendum of the second term is not "<i>khyod dngos po dang bdag</i>"
|
|
since there is no omitted <i>shad</i> after that "<i>ga</i>" suffix; the
|
|
definiendum is "<i>khyod dngos po dang bdag gcig 'brel</i>". As is clear from the
|
|
sample text, the tool has to make a series of "smart guesses" to try to figure
|
|
out where each definiendum ends and its definition starts. This process is
|
|
not 100% foolproof, so expect some mistakes.<br>
|
|
</p>
|
|
</li>
|
|
<li>
|
|
<p>Dictionaries in different formats can be processed together. For instance the
|
|
command:
|
|
<pre>java -cp DictionarySearchStandalone.jar org.thdl.tib.scanner.BinaryFileGenerator alldicts ry-dic99 -acip myglossary_uma -tab myglossary_rdzogs-chen</pre>
|
|
<p>would generate <i>alldicts.def</i> and <i>alldicts.wrd</i> processing <i>ry-dic99.txt</i>
|
|
as dash-separated, <i>myglossary_rdzogs-chen.txt</i> as tab-separated and <i>
|
|
myglossary_uma.txt</i> in the transliteration format explained above.<br>
|
|
</li>
|
|
</ul>
|
|
|
|
@author Andrés Montano Pellegrini
|
|
@see SyllableListTree
|
|
@see FileSyllableListTree
|
|
@see CachedSyllableListTree
|
|
*/
|
|
public class BinaryFileGenerator extends SimplifiedLinkedList
|
|
{
|
|
private static final int versionNumber = 3;
|
|
|
|
private long posHijos;
|
|
private String sil, def[];
|
|
public final static int delimiterGeneric=0;
|
|
public final static int delimiterAcip=1;
|
|
public final static int delimiterDash=2;
|
|
|
|
/** Number of dictionary. If 0, partial word (no definition).
|
|
*/
|
|
private ByteDictionarySource sourceDef;
|
|
public static RandomAccessFile wordRaf;
|
|
private static RandomAccessFile defRaf;
|
|
|
|
static
|
|
{
|
|
wordRaf = null;
|
|
defRaf = null;
|
|
}
|
|
|
|
/**
    Creates an empty node with no syllable, no definitions and no
    dictionary bookkeeping. main uses such a node as the container that
    dictionaries are added to.
*/
public BinaryFileGenerator() {
    super();
    this.posHijos = -1;
    this.sil = null;
    this.def = null;
    this.sourceDef = null;
}
|
|
|
|
private BinaryFileGenerator(String sil, String def, int numDef)
|
|
{
|
|
super();
|
|
int marker;
|
|
while (true)
|
|
{
|
|
marker = Manipulate.indexOfExtendedEndOfSyllableMark(sil);
|
|
if (marker==0) sil = sil.substring(1);
|
|
else if (marker==sil.length()-1) sil = sil.substring(0,sil.length()-1);
|
|
else break;
|
|
}
|
|
|
|
// fix for updates
|
|
this.sourceDef = new ByteDictionarySource();
|
|
|
|
if (marker<0)
|
|
{
|
|
this.sil = sil;
|
|
this.def = new String[1];
|
|
this.def[0] = def;
|
|
this.sourceDef.addNewDef(numDef);
|
|
}
|
|
else
|
|
{
|
|
this.sil = sil.substring(0, marker);
|
|
this.def = null;
|
|
addLast(new BinaryFileGenerator(sil.substring(marker+1).trim(), def, numDef));
|
|
}
|
|
posHijos=-1;
|
|
}
|
|
|
|
public String toString()
|
|
{
|
|
return sil;
|
|
}
|
|
|
|
public void addFile(String archivo, int delimiterType, String delimiter, int defNum) throws Exception
|
|
{
|
|
final short newDefiniendum=1, halfDefiniendum=2, definition=3;
|
|
short status=newDefiniendum;
|
|
int marker, len, marker2, currentPage=0, currentLine=1;
|
|
char ch;
|
|
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(archivo)));
|
|
String entrada="", s1="", s2="", currentLetter="", temp="", lastWeirdDefiniendum="", alternateWords[];
|
|
boolean markerNotFound;
|
|
|
|
// used for acip dict
|
|
switch(delimiterType)
|
|
{
|
|
case delimiterAcip:
|
|
outAHere:
|
|
while (true)
|
|
{
|
|
entrada=br.readLine();
|
|
if (entrada==null) break;
|
|
currentLine++;
|
|
|
|
entrada = entrada.trim();
|
|
len = entrada.length();
|
|
if (len<=0) continue;
|
|
|
|
// get page number
|
|
if (entrada.charAt(0)=='@')
|
|
{
|
|
marker = 1;
|
|
while(marker<len && Character.isDigit(entrada.charAt(marker)))
|
|
marker++;
|
|
temp = entrada.substring(1, marker);
|
|
if (temp.length()>0)
|
|
currentPage=Integer.parseInt(temp);
|
|
if (marker<len)
|
|
{
|
|
entrada = entrada.substring(marker).trim();
|
|
len = entrada.length();
|
|
}
|
|
else continue;
|
|
}
|
|
|
|
// get current letter
|
|
if (entrada.charAt(0)=='(' || entrada.charAt(0)=='{' || entrada.charAt(0)=='?')
|
|
{
|
|
currentLetter = entrada.substring(1, entrada.length()-2);
|
|
/*out.println(currentPage + ": " + currentLetter);
|
|
n++;*/
|
|
continue;
|
|
}
|
|
|
|
if (entrada.charAt(0)=='[')
|
|
{
|
|
marker=1;
|
|
markerNotFound=true;
|
|
do
|
|
{
|
|
while (marker<len && markerNotFound)
|
|
{
|
|
if (entrada.charAt(marker)==']') markerNotFound=false;
|
|
else marker++;
|
|
}
|
|
if (markerNotFound)
|
|
{
|
|
entrada=br.readLine();
|
|
if (entrada==null) break outAHere;
|
|
currentLine++;
|
|
len = entrada.length();
|
|
marker=0;
|
|
}
|
|
else break;
|
|
} while (true);
|
|
if (marker<len)
|
|
{
|
|
entrada = entrada.substring(marker+1).trim();
|
|
len = entrada.length();
|
|
if (len<=0) continue;
|
|
}
|
|
else continue;
|
|
}
|
|
|
|
// skip stuff. Add to previous definition.
|
|
if (entrada.startsWith("..."))
|
|
{
|
|
entrada=entrada.substring(3);
|
|
len = entrada.length();
|
|
if (len<=0) continue;
|
|
}
|
|
|
|
// find definiendum
|
|
ch = entrada.charAt(0);
|
|
if (Character.isLetter(ch) || ch=='\'')
|
|
{
|
|
/* first criteria: if it is not the root letter of section it is part of the
|
|
previous definition, probably a page change, else go for it with following
|
|
code: */
|
|
|
|
// get first syllable to check base letter
|
|
marker=1;
|
|
while (marker<len)
|
|
{
|
|
ch = entrada.charAt(marker);
|
|
if (Manipulate.isEndOfSyllableMark(ch) || Manipulate.isEndOfParagraphMark(ch)) break;
|
|
marker++;
|
|
}
|
|
|
|
if (status!=halfDefiniendum) temp = Manipulate.getBaseLetter(entrada.substring(0, marker));
|
|
|
|
// if line begins with current letter, probably it is a definiendum
|
|
if (status==halfDefiniendum || currentLetter.equals(temp))
|
|
{
|
|
/* Since new definiendum was found, update last and collect new. No need to update
|
|
status because it will be updated below. */
|
|
if (status==definition)
|
|
{
|
|
add(s1, s2, defNum);
|
|
s1=""; s2="";
|
|
}
|
|
|
|
marker=marker2=1;
|
|
markerNotFound=true;
|
|
|
|
while (marker < len)
|
|
{
|
|
ch = entrada.charAt(marker);
|
|
|
|
if (Manipulate.isEndOfParagraphMark(ch))
|
|
{
|
|
markerNotFound=false;
|
|
marker2=marker+1;
|
|
}
|
|
else if (Manipulate.isEndOfSyllableMark(ch))
|
|
{
|
|
if (marker+1<len && Manipulate.isEndOfSyllableMark(entrada.charAt(marker+1))) // verify " "
|
|
{
|
|
markerNotFound=false;
|
|
marker2=++marker;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
switch(ch)
|
|
{
|
|
case '(': case '<':
|
|
markerNotFound=false;
|
|
marker2=marker;
|
|
break;
|
|
case 'g': // verify "g "
|
|
if (marker+1<len && Manipulate.isVowel(entrada.charAt(marker-1)) && Manipulate.isEndOfSyllableMark(entrada.charAt(marker+1)))
|
|
{
|
|
temp = entrada.substring(0, marker+1);
|
|
if (!lastWeirdDefiniendum.startsWith(temp))
|
|
{
|
|
markerNotFound=false;
|
|
marker2=++marker;
|
|
lastWeirdDefiniendum=temp;
|
|
}
|
|
}
|
|
break;
|
|
case '.':
|
|
if (marker+2<len && entrada.charAt(marker+1)=='.' && entrada.charAt(marker+2)=='.')
|
|
{
|
|
markerNotFound=false;
|
|
marker2=marker;
|
|
}
|
|
break;
|
|
default:
|
|
if (Character.isDigit(ch))
|
|
{
|
|
markerNotFound=false;
|
|
marker2=marker;
|
|
}
|
|
}
|
|
}
|
|
if (markerNotFound) marker++;
|
|
else break;
|
|
}
|
|
|
|
/* either this is a definiendum that consists of several lines or
|
|
it is part of the last definition. */
|
|
if (markerNotFound)
|
|
{
|
|
/* assume that the definiendum goes on to the next line. */
|
|
s1 = s1 + entrada + " ";
|
|
status=halfDefiniendum;
|
|
}
|
|
else
|
|
{
|
|
s1 = s1 + entrada.substring(0,marker).trim();
|
|
s2 = "[" + currentPage + "] " + entrada.substring(marker2).trim();
|
|
status=definition;
|
|
|
|
while (true)
|
|
{
|
|
entrada=br.readLine();
|
|
|
|
if (entrada==null)
|
|
{
|
|
add(s1, s2, defNum);
|
|
break outAHere;
|
|
}
|
|
|
|
currentLine++;
|
|
entrada = entrada.trim();
|
|
|
|
if (entrada.equals("")) break;
|
|
else
|
|
{
|
|
s2 = s2 + " " + entrada;
|
|
}
|
|
}
|
|
|
|
}
|
|
}
|
|
else // last line did not start with the current letter, it must still be part of the definition
|
|
{
|
|
s2 = s2 + " " + entrada;
|
|
while (true)
|
|
{
|
|
entrada=br.readLine();
|
|
|
|
if (entrada==null)
|
|
{
|
|
add(s1, s2, defNum);
|
|
break outAHere;
|
|
}
|
|
|
|
currentLine++;
|
|
entrada = entrada.trim();
|
|
|
|
if (entrada.equals("")) break;
|
|
else
|
|
{
|
|
s2 = s2 + " " + entrada;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else // if first character was not a letter, it must still be part of definition
|
|
{
|
|
s2 = s2 + " " + entrada;
|
|
while (true)
|
|
{
|
|
entrada=br.readLine();
|
|
|
|
if (entrada==null)
|
|
{
|
|
add(s1, s2, defNum);
|
|
break outAHere;
|
|
}
|
|
|
|
currentLine++;
|
|
entrada = entrada.trim();
|
|
|
|
if (entrada.equals("")) break;
|
|
else
|
|
{
|
|
s2 = s2 + " " + entrada;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
break;
|
|
default:
|
|
while ((entrada = br.readLine())!=null)
|
|
{
|
|
entrada = entrada.trim();
|
|
if (!entrada.equals(""))
|
|
{
|
|
switch(delimiterType)
|
|
{
|
|
/* this is needed to make sure that the dash used in reverse vowels with extended
|
|
wylie is not confused with the dash that separates definiendum and definition. */
|
|
case delimiterDash:
|
|
marker=entrada.indexOf('-');
|
|
len = entrada.length();
|
|
while (marker>=0 && marker<len-1 && Manipulate.isVowel(entrada.charAt(marker+1)) && !Character.isWhitespace(entrada.charAt(marker-1)))
|
|
{
|
|
marker = entrada.indexOf('-', marker+1);
|
|
}
|
|
break;
|
|
default:
|
|
marker = entrada.indexOf(delimiter);
|
|
}
|
|
if (marker<=0)
|
|
{
|
|
System.out.println("Error loading line " + currentLine + ", in file " + archivo + ":");
|
|
System.out.println(entrada);
|
|
}
|
|
else
|
|
{
|
|
marker2 = Manipulate.indexOfBracketMarks(entrada.substring(0,marker));
|
|
if (marker2>0) marker = marker2;
|
|
|
|
s1 = Manipulate.deleteQuotes(entrada.substring(0,marker).trim());
|
|
s2 = Manipulate.deleteQuotes(entrada.substring(marker+delimiter.length())).trim();
|
|
|
|
if (Manipulate.isMeaningful(s2))
|
|
{
|
|
if (currentLine%5000==0)
|
|
{
|
|
System.out.println("Adding " + s1 + "...");
|
|
System.out.flush();
|
|
}
|
|
marker2 = s1.indexOf(';');
|
|
if (marker2>0)
|
|
{
|
|
alternateWords = s1.split(";");
|
|
for (marker2=0; marker2<alternateWords.length; marker2++)
|
|
{
|
|
add(alternateWords[marker2],s2, defNum);
|
|
}
|
|
|
|
}
|
|
else add(s1, s2 , defNum);
|
|
}
|
|
}
|
|
}
|
|
currentLine++;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
private void add(String word, String def, int defNum)
|
|
{
|
|
Link link, newLink;
|
|
BinaryFileGenerator ultimo;
|
|
String firstSillable;
|
|
int marker, comp;
|
|
|
|
while (true)
|
|
{
|
|
marker = Manipulate.indexOfExtendedEndOfSyllableMark(word);
|
|
if (marker==0) word = word.substring(1);
|
|
else if (marker==word.length()-1) word = word.substring(0,word.length()-1);
|
|
else break;
|
|
}
|
|
|
|
if (marker<0)
|
|
firstSillable = word;
|
|
else firstSillable = word.substring(0,marker);
|
|
|
|
/* usa orden alfabetico */
|
|
if (isEmpty() || ((comp = firstSillable.compareTo((ultimo = (BinaryFileGenerator) getLast()).toString()))<0))
|
|
{
|
|
super.addLast(new BinaryFileGenerator(word, def, defNum));
|
|
}
|
|
else
|
|
{
|
|
if (comp==0)
|
|
if (marker<0) ultimo.addMoreDef(def, defNum);
|
|
else ultimo.add(word.substring(marker+1).trim(), def, defNum);
|
|
else
|
|
{
|
|
link = cabeza;
|
|
while(link.siguiente!=null)
|
|
{
|
|
comp = firstSillable.compareTo(link.siguiente.toString());
|
|
if (comp<0)
|
|
{
|
|
newLink = new Link(new BinaryFileGenerator(word, def, defNum));
|
|
newLink.siguiente = link.siguiente;
|
|
link.siguiente = newLink;
|
|
return;
|
|
}
|
|
else
|
|
if (comp==0)
|
|
{
|
|
ultimo = (BinaryFileGenerator) link.siguiente.get();
|
|
if (marker<0) ultimo.addMoreDef(def, defNum);
|
|
else ultimo.add(word.substring(marker+1).trim(), def, defNum);
|
|
return;
|
|
}
|
|
link = link.siguiente;
|
|
}
|
|
newLink = new Link(new BinaryFileGenerator(word, def, defNum));
|
|
link.siguiente = newLink;
|
|
}
|
|
}
|
|
}
|
|
|
|
private void reGroup (int n)
|
|
{
|
|
int i, pos, posEnd;
|
|
|
|
for (i=0; i<def.length; i++)
|
|
{
|
|
if (i!=n)
|
|
{
|
|
// deal with repetitions of definitions
|
|
if (def[i].length()>=def[n].length())
|
|
{
|
|
pos = def[i].indexOf(def[n]);
|
|
|
|
// if it is the same String exactly
|
|
if (pos==0 && def[i].length()==def[n].length())
|
|
{
|
|
if (i<n)
|
|
{
|
|
sourceDef.addDictToDef(sourceDef.getDef(n), i);
|
|
def = Manipulate.deleteString(def, n);
|
|
sourceDef.deleteDef(n);
|
|
n = i;
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
sourceDef.addDictToDef(sourceDef.getDef(i), n);
|
|
def = Manipulate.deleteString(def, i);
|
|
sourceDef.deleteDef(i);
|
|
i--;
|
|
continue;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
posEnd = pos + def[n].length();
|
|
|
|
if ((pos==0 || (pos>0 && !Character.isLetter(def[i].charAt(pos-1)))) && (posEnd==def[i].length() || !Character.isLetter(def[i].charAt(posEnd))))
|
|
{
|
|
if(sourceDef.getDef(i).contains(sourceDef.getDef(n)))
|
|
{
|
|
def = Manipulate.deleteString(def, n);
|
|
sourceDef.deleteDef(n);
|
|
return;
|
|
}
|
|
|
|
// else
|
|
sourceDef.addDictToDef(sourceDef.getDef(i), n);
|
|
|
|
do
|
|
{
|
|
def[i] = Manipulate.replace(def[i], pos, posEnd, "*");
|
|
pos = def[i].indexOf(def[n]);
|
|
posEnd = pos + def[n].length();
|
|
} while ((pos==0 || (pos>0 && !Character.isLetter(def[i].charAt(pos-1)))) && (posEnd==def[i].length() || !Character.isLetter(def[i].charAt(posEnd))));
|
|
|
|
if (i<n)
|
|
{
|
|
def = Manipulate.addString(def, def[n], i);
|
|
def = Manipulate.deleteString(def, n+1);
|
|
sourceDef.insertDef(sourceDef.getDef(n), i);
|
|
sourceDef.deleteDef(n+1);
|
|
n = i;
|
|
reGroup(i+1);
|
|
}
|
|
else
|
|
{
|
|
reGroup(i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
pos = def[n].indexOf(def[i]);
|
|
posEnd = pos + def[i].length();
|
|
|
|
if ((pos==0 || (pos>0 && !Character.isLetter(def[n].charAt(pos-1)))) && (posEnd==def[n].length() || !Character.isLetter(def[n].charAt(posEnd))))
|
|
{
|
|
if (sourceDef.getDef(n).contains(sourceDef.getDef(i)))
|
|
{
|
|
def = Manipulate.deleteString(def, i);
|
|
sourceDef.deleteDef(i);
|
|
i--;
|
|
continue;
|
|
}
|
|
|
|
sourceDef.addDictToDef(sourceDef.getDef(n), i);
|
|
|
|
do
|
|
{
|
|
def[n] = Manipulate.replace(def[n], pos, posEnd, "*");
|
|
pos = def[n].indexOf(def[i]);
|
|
posEnd = pos + def[i].length();
|
|
} while ((pos==0 || (pos>0 && !Character.isLetter(def[n].charAt(pos-1)))) && (posEnd==def[n].length() || !Character.isLetter(def[n].charAt(posEnd))));
|
|
|
|
i=-1; // start over
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// deal with repetition of dictionaries
|
|
|
|
if (sourceDef.getDef(i).equals(sourceDef.getDef(n)))
|
|
{
|
|
if (i<n)
|
|
{
|
|
def[i] = def[i] + ". " + def[n];
|
|
def = Manipulate.deleteString(def, n);
|
|
sourceDef.deleteDef(n);
|
|
n = i;
|
|
continue;
|
|
}
|
|
else
|
|
{
|
|
def[n] = def[n] + ". " + def[i];
|
|
def = Manipulate.deleteString(def, i);
|
|
sourceDef.deleteDef(i);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
private void addMoreDef(String def, int numDef)
|
|
{
|
|
String temp;
|
|
boolean notAlreadyThere, changed;
|
|
int i, pos, posEnd;
|
|
|
|
if (this.def==null)
|
|
{
|
|
// add a new definition for this dictionary
|
|
this.def = new String[1];
|
|
this.def[0] = def;
|
|
//sourceDef.add(numDef);
|
|
sourceDef.addNewDef(numDef);
|
|
}
|
|
else
|
|
{
|
|
notAlreadyThere = true;
|
|
do
|
|
{
|
|
i=0;
|
|
changed = false;
|
|
|
|
while (notAlreadyThere && i<this.def.length)
|
|
{
|
|
if (this.def[i].length()>=def.length())
|
|
{
|
|
pos = this.def[i].indexOf(def);
|
|
posEnd = pos + def.length();
|
|
if ((pos==0 || (pos>0 && !Character.isLetter(this.def[i].charAt(pos-1)))) && (posEnd==this.def[i].length() || !Character.isLetter(this.def[i].charAt(posEnd))))
|
|
{
|
|
if (!sourceDef.isDictInDef(numDef, i))
|
|
{
|
|
if (this.def[i].length()>def.length())
|
|
{
|
|
//temp = Manipulate.deleteSubstring(this.def[i], pos, posEnd);
|
|
temp = this.def[i];
|
|
do
|
|
{
|
|
temp = Manipulate.replace(temp, pos, posEnd, "*");
|
|
pos = temp.indexOf(def);
|
|
posEnd = pos + def.length();
|
|
} while ((pos==0 || (pos>0 && !Character.isLetter(temp.charAt(pos-1)))) && (posEnd==temp.length() || !Character.isLetter(temp.charAt(posEnd))));
|
|
|
|
this.def[i] = def;
|
|
this.def = Manipulate.addString(this.def, temp, i+1);
|
|
sourceDef.dubDef(i);
|
|
sourceDef.addDictToDef(numDef, i);
|
|
|
|
reGroup(i);
|
|
if (i+1<this.def.length) reGroup(i+1);
|
|
else reGroup(this.def.length-1);
|
|
}
|
|
else sourceDef.addDictToDef(numDef, i);
|
|
}
|
|
notAlreadyThere = false;
|
|
changed = false;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
pos = def.indexOf(this.def[i]);
|
|
posEnd = pos + this.def[i].length();
|
|
|
|
if ((pos==0 || (pos>0 && !Character.isLetter(def.charAt(pos-1)))) && (posEnd==def.length() || !Character.isLetter(def.charAt(posEnd))))
|
|
{
|
|
if (sourceDef.isDictInDefAlone(numDef, i))
|
|
{
|
|
this.def[i] = def;
|
|
reGroup(i);
|
|
}
|
|
else
|
|
{
|
|
sourceDef.addDictToDef(numDef, i);
|
|
do
|
|
{
|
|
//def = Manipulate.deleteSubstring(def, pos, posEnd);
|
|
def = Manipulate.replace(def, pos, posEnd, "*");
|
|
pos = def.indexOf(this.def[i]);
|
|
posEnd = pos + this.def[i].length();
|
|
} while ((pos==0 || (pos>0 && !Character.isLetter(def.charAt(pos-1)))) && (posEnd==def.length() || !Character.isLetter(def.charAt(posEnd))));
|
|
}
|
|
changed = true;
|
|
}
|
|
}
|
|
i++;
|
|
}
|
|
} while (changed);
|
|
|
|
if (notAlreadyThere)
|
|
{
|
|
// check if it is a duplicate for the same dictionary.
|
|
i = sourceDef.containsAlone(numDef);
|
|
if (i>-1)
|
|
{
|
|
this.def[i] = this.def[i] + ". " + def;
|
|
reGroup(i);
|
|
}
|
|
else
|
|
{
|
|
this.def = Manipulate.addString(this.def, def, this.def.length);
|
|
sourceDef.addNewDef(numDef);
|
|
reGroup(this.def.length-1);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
public boolean equals (Object o)
|
|
{
|
|
if (o instanceof String)
|
|
{
|
|
return sil.equals((String)o);
|
|
}
|
|
else return false;
|
|
}
|
|
|
|
|
|
private void printMe(boolean hasNext) throws Exception
|
|
{
|
|
int i;
|
|
|
|
wordRaf.writeInt((int) posHijos);
|
|
wordRaf.writeUTF(sil);
|
|
sourceDef.print(hasNext, wordRaf);
|
|
|
|
if (def!=null)
|
|
for (i=0; i<def.length; i++)
|
|
{
|
|
try
|
|
{
|
|
wordRaf.writeInt((int)defRaf.getFilePointer());
|
|
defRaf.writeUTF(def[i]);
|
|
}
|
|
catch (Exception e)
|
|
{
|
|
System.out.println(def[i]);
|
|
}
|
|
}
|
|
}
|
|
|
|
private void print() throws Exception
|
|
{
|
|
long pos;
|
|
SimplifiedListIterator i = listIterator();
|
|
|
|
BinaryFileGenerator silHijos;
|
|
boolean hasNext;
|
|
|
|
while (i.hasNext())
|
|
{
|
|
silHijos = (BinaryFileGenerator) i.next();
|
|
if (!silHijos.isEmpty()) silHijos.print();
|
|
}
|
|
pos = wordRaf.getFilePointer();
|
|
if (!isEmpty())
|
|
{
|
|
posHijos=pos;
|
|
i = listIterator();
|
|
hasNext = true;
|
|
while (hasNext)
|
|
{
|
|
silHijos = (BinaryFileGenerator) i.next();
|
|
hasNext=i.hasNext();
|
|
silHijos.printMe(hasNext);
|
|
}
|
|
}
|
|
}
|
|
|
|
private static void printSintax()
|
|
{
|
|
System.out.println("Stores multiple dictionaries into a binary tree file.");
|
|
System.out.println("Sintaxis:");
|
|
System.out.println("-For multiple dictionary sources:");
|
|
System.out.println(" java BinaryFileGenerator arch-dest [-delimiter1] arch-dict1");
|
|
System.out.println(" [[-delimiter2] arch-dict2 ...]");
|
|
System.out.println("-For one dictionary");
|
|
System.out.println(" java BinaryFileGenerator [-delimiter] arch-dict");
|
|
System.out.println("Dictionary files are assumed to be .txt. Don't include extensions!");
|
|
System.out.println(" -delimiter: default value is \'-\'. -tab takes \'\\t\' as delimiter.");
|
|
System.out.println(" -acip: use this to process dictionaries entered using the ACIP standard");
|
|
System.out.println(" to mark page numbers, comments, etc. Make sure to convert it to");
|
|
System.out.println(" THDL's extended Wylie scheme first using the AcipToWylie class.");
|
|
}
|
|
|
|
public void generateDatabase(String name) throws Exception
|
|
{
|
|
File wordF = new File(name + ".wrd"), defF = new File(name + ".def");
|
|
wordF.delete();
|
|
defF.delete();
|
|
wordRaf = new RandomAccessFile(wordF,"rw");
|
|
defRaf = new RandomAccessFile(defF,"rw");
|
|
print();
|
|
wordRaf.writeInt((int)posHijos);
|
|
|
|
// write version marker
|
|
wordRaf.writeShort(-1);
|
|
wordRaf.writeByte(-1);
|
|
|
|
// write version number
|
|
wordRaf.writeByte(versionNumber);
|
|
}
|
|
|
|
public static void main(String args[]) throws Exception
|
|
{
|
|
int delimiterType;
|
|
String delimiter;
|
|
|
|
int i, n=0, a;
|
|
|
|
delimiter = "-";
|
|
delimiterType=delimiterDash;
|
|
|
|
if (args.length==0)
|
|
{
|
|
printSintax();
|
|
return;
|
|
}
|
|
BinaryFileGenerator sl = new BinaryFileGenerator();
|
|
if (args[0].charAt(0)=='-')
|
|
{
|
|
if (args[0].equals("-tab"))
|
|
{
|
|
delimiterType = delimiterGeneric;
|
|
delimiter="\t";
|
|
}
|
|
else if (args[0].equals("-acip"))
|
|
delimiterType=delimiterAcip;
|
|
else
|
|
{
|
|
delimiterType=delimiterGeneric;
|
|
delimiter=args[0].substring(1);
|
|
}
|
|
if (args.length>2)
|
|
{
|
|
printSintax();
|
|
return;
|
|
}
|
|
sl.addFile(args[1] + ".txt",delimiterType, delimiter, 0);
|
|
a=1;
|
|
}
|
|
else
|
|
{
|
|
a=0;
|
|
if (args.length==1)
|
|
{
|
|
sl.addFile(args[0] + ".txt", delimiterType, delimiter, 0);
|
|
}
|
|
else
|
|
{
|
|
i=1;
|
|
|
|
while(i< args.length)
|
|
{
|
|
if (args[i].charAt(0)=='-')
|
|
{
|
|
if (args[i].equals("-tab"))
|
|
{
|
|
delimiterType=delimiterGeneric;
|
|
delimiter="\t";
|
|
}
|
|
else if (args[i].equals("-acip"))
|
|
delimiterType=delimiterAcip;
|
|
else
|
|
{
|
|
delimiterType=delimiterGeneric;
|
|
delimiter=args[i].substring(1);
|
|
}
|
|
i++;
|
|
}
|
|
else
|
|
{
|
|
delimiterType=delimiterDash;
|
|
}
|
|
System.out.println("\nProcessing " + args[i] + "...");
|
|
sl.addFile(args[i] + ".txt", delimiterType, delimiter, n);
|
|
n++; i++;
|
|
}
|
|
}
|
|
}
|
|
System.out.println("Writing to file " + args[a] + "...");
|
|
System.out.flush();
|
|
sl.generateDatabase(args[a]);
|
|
}
|
|
}
|