/* The contents of this file are subject to the AMP Open Community License Version 1.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License on the AMP web site (http://www.tibet.iteso.mx/Guatemala/). Software distributed under the License is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for the specific terms governing rights and limitations under the License. The Initial Developer of this software is Andres Montano Pellegrini. Portions created by Andres Montano Pellegrini are Copyright 2001 Andres Montano Pellegrini. All Rights Reserved. Contributor(s): ______________________________________. */ package org.thdl.tib.scanner; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.InputStreamReader; import java.io.RandomAccessFile; import org.thdl.util.Link; import org.thdl.util.SimplifiedLinkedList; import org.thdl.util.SimplifiedListIterator; /** Converts Tibetan dictionaries stored in text files into a binary file tree structure format, to be used by some implementations of the SyllableListTree.

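Syntax (dictionary files are assumed to end in .txt; do not include the extension in the arguments):

java org.thdl.tib.scanner.BinaryFileGenerator dictionary
java org.thdl.tib.scanner.BinaryFileGenerator output [-tab | -acip | -delimiter] dictionary [[-tab | -acip | -delimiter] dictionary ...]

With a single argument the dictionary name is also used as the output name. With several arguments the first one is the output name, and each following dictionary may be preceded by one option: -tab (tab-delimited), -acip (ACIP format), or -delimiter (the text after the dash is used as the delimiter). A dictionary with no option before it is read with the dash delimiter type.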

@author Andrés Montano Pellegrini @see SyllableListTree @see FileSyllableListTree @see CachedSyllableListTree */ public class BinaryFileGenerator extends SimplifiedLinkedList {
    /* NOTE(review): the text of this class has lost its line breaks and, further down, whole
       spans of code (for example a loop condition now reads "while(marker0)"), so it does not
       compile as it stands. Restore the original from version control before changing any logic. */

    /** Constant version number (3). NOTE(review): no use of it is visible in the surviving text;
        presumably it is stamped into the generated file -- confirm. */
    private static final int versionNumber = 3;
    /** Written by printMe as an int ahead of the node text; set to -1 by both constructors.
        Presumably the file position of this node's children ("hijos" is Spanish for
        "children") -- confirm. */
    private long posHijos;
    /** sil: text of this node, returned by toString and compared by equals.
        def: definitions attached to this node, or null when the node carries none. */
    private String sil, def[];
    /** Delimiter type that main selects for "-tab" and for any other custom "-xxx" option. */
    public final static int delimiterGeneric=0;
    /** Delimiter type that main selects for the "-acip" option. */
    public final static int delimiterAcip=1;
    /** Delimiter type that main selects for a dictionary argument with no option before it. */
    public final static int delimiterDash=2;
    /** Number of dictionary. If 0, partial word (no definition).
        NOTE(review): the field is a ByteDictionarySource rather than a plain number; judging by
        the addNewDef / addDictToDef / deleteDef calls it presumably records which dictionaries
        supplied each entry of def -- confirm against ByteDictionarySource. */
    private ByteDictionarySource sourceDef;
    /** Output file that printMe writes node records to. It starts as null; the code that opens
        it is not visible in the surviving text. */
    public static RandomAccessFile wordRaf;
    /** Second output file. It starts as null; no use of it is visible in the surviving text. */
    private static RandomAccessFile defRaf;

    /* Both output files start out unset. */
    static { wordRaf = null; defRaf = null; }

    /** Creates an empty node: no text, no definitions, no dictionary-source record, and
        posHijos set to -1. */
    public BinaryFileGenerator() { super(); sil = null; def = null; posHijos=-1; sourceDef = null; }

    /** Builds the node (and, when needed, a chain of child nodes) for one dictionary entry.
        Marks found at the very start or very end of sil are trimmed first. If no mark remains,
        this node keeps the whole text together with the single definition def and registers
        numDef with sourceDef. Otherwise this node keeps only the text before the first mark,
        holds no definition, and a child node built from the trimmed remainder is appended with
        addLast.
        @param sil text of the entry; marks are located with
               Manipulate.indexOfExtendedEndOfSyllableMark
        @param def definition stored on the last node of the chain
        @param numDef value handed to ByteDictionarySource.addNewDef; presumably the number of
               the dictionary the definition comes from -- confirm */
    private BinaryFileGenerator(String sil, String def, int numDef) {
        super();
        int marker;
        /* Strip marks sitting at either end of sil; stop once the first mark is interior or absent.
           NOTE(review): if sil becomes empty and the helper reports "not found" as -1, the second
           test below matches (-1 == length-1) and substring(0,-1) would throw -- confirm. */
        while (true) {
            marker = Manipulate.indexOfExtendedEndOfSyllableMark(sil);
            if (marker==0) sil = sil.substring(1);
            else if (marker==sil.length()-1) sil = sil.substring(0,sil.length()-1);
            else break;
        }
        // fix for updates this.sourceDef = new ByteDictionarySource(); if (marker<0) { this.sil = sil; this.def = new String[1]; this.def[0] = def; this.sourceDef.addNewDef(numDef); } else { this.sil = sil.substring(0, marker); this.def = null; addLast(new BinaryFileGenerator(sil.substring(marker+1).trim(), def, numDef)); } posHijos=-1; } public String toString() { return sil; } public void addFile(String archivo, int delimiterType, String delimiter, int defNum) throws Exception { final short newDefiniendum=1, halfDefiniendum=2, definition=3; short status=newDefiniendum; int marker, len, marker2, currentPage=0, currentLine=1; char ch; BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(archivo))); String entrada="", s1="", s2="", currentLetter="", temp="", lastWeirdDefiniendum="", alternateWords[]; boolean markerNotFound; // used for acip dict switch(delimiterType) { case delimiterAcip: outAHere: while (true) { entrada=br.readLine(); if (entrada==null) 
break; currentLine++; entrada = entrada.trim(); len = entrada.length(); if (len<=0) continue; // get page number if (entrada.charAt(0)=='@') { marker = 1; while(marker0) currentPage=Integer.parseInt(temp); if (marker=0 && marker0) marker = marker2; s1 = Manipulate.deleteQuotes(entrada.substring(0,marker).trim()); s2 = Manipulate.deleteQuotes(entrada.substring(marker+delimiter.length())).trim(); if (Manipulate.isMeaningful(s2)) { if (currentLine%5000==0) { System.out.println("Adding " + s1 + "..."); System.out.flush(); } marker2 = s1.indexOf(';'); if (marker2>0) { alternateWords = s1.split(";"); for (marker2=0; marker2=def[n].length()) { pos = def[i].indexOf(def[n]); // if it is the same String exactly if (pos==0 && def[i].length()==def[n].length()) { if (i0 && !Character.isLetter(def[i].charAt(pos-1)))) && (posEnd==def[i].length() || !Character.isLetter(def[i].charAt(posEnd)))) { if(sourceDef.getDef(i).contains(sourceDef.getDef(n))) { def = Manipulate.deleteString(def, n); sourceDef.deleteDef(n); return; } // else sourceDef.addDictToDef(sourceDef.getDef(i), n); do { def[i] = Manipulate.replace(def[i], pos, posEnd, "*"); pos = def[i].indexOf(def[n]); posEnd = pos + def[n].length(); } while ((pos==0 || (pos>0 && !Character.isLetter(def[i].charAt(pos-1)))) && (posEnd==def[i].length() || !Character.isLetter(def[i].charAt(posEnd)))); if (i0 && !Character.isLetter(def[n].charAt(pos-1)))) && (posEnd==def[n].length() || !Character.isLetter(def[n].charAt(posEnd)))) { if (sourceDef.getDef(n).contains(sourceDef.getDef(i))) { def = Manipulate.deleteString(def, i); sourceDef.deleteDef(i); i--; continue; } sourceDef.addDictToDef(sourceDef.getDef(n), i); do { def[n] = Manipulate.replace(def[n], pos, posEnd, "*"); pos = def[n].indexOf(def[i]); posEnd = pos + def[i].length(); } while ((pos==0 || (pos>0 && !Character.isLetter(def[n].charAt(pos-1)))) && (posEnd==def[n].length() || !Character.isLetter(def[n].charAt(posEnd)))); i=-1; // start over continue; } } // deal with repetition 
of dictionaries if (sourceDef.getDef(i).equals(sourceDef.getDef(n))) { if (i=def.length()) { pos = this.def[i].indexOf(def); posEnd = pos + def.length(); if ((pos==0 || (pos>0 && !Character.isLetter(this.def[i].charAt(pos-1)))) && (posEnd==this.def[i].length() || !Character.isLetter(this.def[i].charAt(posEnd)))) { if (!sourceDef.isDictInDef(numDef, i)) { if (this.def[i].length()>def.length()) { //temp = Manipulate.deleteSubstring(this.def[i], pos, posEnd); temp = this.def[i]; do { temp = Manipulate.replace(temp, pos, posEnd, "*"); pos = temp.indexOf(def); posEnd = pos + def.length(); } while ((pos==0 || (pos>0 && !Character.isLetter(temp.charAt(pos-1)))) && (posEnd==temp.length() || !Character.isLetter(temp.charAt(posEnd)))); this.def[i] = def; this.def = Manipulate.addString(this.def, temp, i+1); sourceDef.dubDef(i); sourceDef.addDictToDef(numDef, i); reGroup(i); if (i+10 && !Character.isLetter(def.charAt(pos-1)))) && (posEnd==def.length() || !Character.isLetter(def.charAt(posEnd)))) { if (sourceDef.isDictInDefAlone(numDef, i)) { this.def[i] = def; reGroup(i); } else { sourceDef.addDictToDef(numDef, i); do { //def = Manipulate.deleteSubstring(def, pos, posEnd); def = Manipulate.replace(def, pos, posEnd, "*"); pos = def.indexOf(this.def[i]); posEnd = pos + this.def[i].length(); } while ((pos==0 || (pos>0 && !Character.isLetter(def.charAt(pos-1)))) && (posEnd==def.length() || !Character.isLetter(def.charAt(posEnd)))); } changed = true; } } i++; } } while (changed); if (notAlreadyThere) { // check if it is a duplicate for the same dictionary. i = sourceDef.containsAlone(numDef); if (i>-1) { this.def[i] = this.def[i] + ". 
" + def; reGroup(i); } else { this.def = Manipulate.addString(this.def, def, this.def.length); sourceDef.addNewDef(numDef); reGroup(this.def.length-1); } } } } public boolean equals (Object o) { if (o instanceof String) { return sil.equals((String)o); } else return false; } private void printMe(boolean hasNext) throws Exception { int i; wordRaf.writeInt((int) posHijos); wordRaf.writeUTF(sil); sourceDef.print(hasNext, wordRaf); if (def!=null) for (i=0; i2) { printSintax(); return; } sl.addFile(args[1] + ".txt",delimiterType, delimiter, 0); a=1; } else { a=0; if (args.length==1) { sl.addFile(args[0] + ".txt", delimiterType, delimiter, 0); } else { i=1; while(i< args.length) { if (args[i].charAt(0)=='-') { if (args[i].equals("-tab")) { delimiterType=delimiterGeneric; delimiter="\t"; } else if (args[i].equals("-acip")) delimiterType=delimiterAcip; else { delimiterType=delimiterGeneric; delimiter=args[i].substring(1); } i++; } else { delimiterType=delimiterDash; } System.out.println("\nProcessing " + args[i] + "..."); sl.addFile(args[i] + ".txt", delimiterType, delimiter, n); n++; i++; } } } System.out.println("Writing to file " + args[a] + "..."); System.out.flush(); sl.generateDatabase(args[a]); } }