This classes were for private use for processing some dictionaries. They are out of place here.
This commit is contained in:
parent
d4dc004c2a
commit
a86bad152f
2 changed files with 0 additions and 466 deletions
|
@ -1,440 +0,0 @@
|
|||
/*
|
||||
The contents of this file are subject to the AMP Open Community License
|
||||
Version 1.0 (the "License"); you may not use this file except in compliance
|
||||
with the License. You may obtain a copy of the License on the AMP web site
|
||||
(http://www.tibet.iteso.mx/Guatemala/).
|
||||
|
||||
Software distributed under the License is distributed on an "AS IS" basis,
|
||||
WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
|
||||
License for the specific terms governing rights and limitations under the
|
||||
License.
|
||||
|
||||
The Initial Developer of this software is Andres Montano Pellegrini. Portions
|
||||
created by Andres Montano Pellegrini are Copyright 2001 Andres Montano
|
||||
Pellegrini. All Rights Reserved.
|
||||
|
||||
Contributor(s): ______________________________________.
|
||||
*/
|
||||
|
||||
package org.thdl.tib.scanner;
|
||||
|
||||
import org.thdl.util.*;
|
||||
import org.thdl.tib.text.*;
|
||||
import java.net.*;
|
||||
import java.io.*;
|
||||
|
||||
public class DictionaryBreakDown
|
||||
{
|
||||
private static final int TOMERAIDER=1;
|
||||
private static final int HTML=2;
|
||||
private static final int SIMPLEBREAK=3;
|
||||
private int mode;
|
||||
private BufferedReader in;
|
||||
private int numberOfFields;
|
||||
private int numberOfMergedFields;
|
||||
|
||||
private static final int UNIQUE=0;
|
||||
private static final int MERGE_HEAD=1;
|
||||
private static final int MERGE_FOLLOWER=2;
|
||||
|
||||
public DictionaryBreakDown(BufferedReader in, int mode)
|
||||
{
|
||||
this.mode = mode;
|
||||
this.in = in;
|
||||
}
|
||||
|
||||
public static void main (String[] args) throws Exception
|
||||
{
|
||||
PrintWriter out;
|
||||
BufferedReader in=null;
|
||||
int argNum = args.length, currentArg=0, mode = SIMPLEBREAK;
|
||||
String option, format=null;
|
||||
|
||||
if (argNum<=currentArg)
|
||||
{
|
||||
System.out.println("Syntax: DictionaryBreakDown [-format format] [-tomeraider | -html] input-file");
|
||||
return;
|
||||
}
|
||||
|
||||
while (args[currentArg].charAt(0)=='-')
|
||||
{
|
||||
option = args[currentArg].substring(1);
|
||||
currentArg++;
|
||||
if (option.equals("format"))
|
||||
{
|
||||
format=args[currentArg];
|
||||
currentArg++;
|
||||
} else if (option.equals("tomeraider"))
|
||||
{
|
||||
mode=TOMERAIDER;
|
||||
} else if (option.equals("html"))
|
||||
{
|
||||
mode=HTML;
|
||||
}
|
||||
}
|
||||
if (argNum<=currentArg)
|
||||
{
|
||||
System.out.println("Syntax error. Input file expected.");
|
||||
return;
|
||||
}
|
||||
|
||||
in = getBufferedReader(args[currentArg], format);
|
||||
|
||||
new DictionaryBreakDown(in, mode).run(format);
|
||||
}
|
||||
|
||||
public static BufferedReader getBufferedReader(String s, String format) throws Exception
|
||||
{
|
||||
InputStream is;
|
||||
|
||||
if (s.indexOf("http://") >= 0)
|
||||
is = new BufferedInputStream((new URL(s)).openStream());
|
||||
else
|
||||
is = new FileInputStream(s);
|
||||
|
||||
if (format==null)
|
||||
return new BufferedReader(new InputStreamReader(is));
|
||||
else
|
||||
return new BufferedReader(new InputStreamReader(is, format));
|
||||
|
||||
}
|
||||
|
||||
private int[] buildMergeMatrix(String fields[])
|
||||
{
|
||||
int i;
|
||||
int matrix[] = new int[fields.length];
|
||||
boolean sameRoot=false;
|
||||
String root;
|
||||
|
||||
for (i=0; i<matrix.length; i++)
|
||||
matrix[i] = UNIQUE;
|
||||
|
||||
i=0;
|
||||
while (i<fields.length)
|
||||
{
|
||||
if (Character.isDigit(fields[i].charAt(fields[i].length()-1)))
|
||||
{
|
||||
matrix[i] = MERGE_HEAD;
|
||||
root = fields[i].substring(0,fields[i].length()-1);
|
||||
i++;
|
||||
while (i<fields.length && (sameRoot = fields[i].substring(0,fields[i].length()-1).equals(root)))
|
||||
{
|
||||
matrix[i] = MERGE_FOLLOWER;
|
||||
i++;
|
||||
}
|
||||
if (!sameRoot) i--;
|
||||
}
|
||||
i++;
|
||||
}
|
||||
return matrix;
|
||||
}
|
||||
|
||||
private static String deleteComas(String s)
|
||||
{
|
||||
if (s.length()>1 && s.charAt(0)=='\"' && s.charAt(s.length()-1)=='\"')
|
||||
return s.substring(1,s.length()-1);
|
||||
else return s;
|
||||
}
|
||||
|
||||
private String[] getFields(String linea)
|
||||
{
|
||||
int i=0, j, pos;
|
||||
String fields[] = new String[this.numberOfFields], tokens[];
|
||||
|
||||
tokens = linea.split("\t");
|
||||
for (i=0; i<tokens.length; i++)
|
||||
{
|
||||
fields[i] = deleteComas(tokens[i]);
|
||||
}
|
||||
|
||||
for (j=i; j<fields.length; j++)
|
||||
fields[j]="";
|
||||
|
||||
return fields;
|
||||
}
|
||||
|
||||
private String[] buildMergedFieldNames(String fieldNames[], int mergeMatrix[])
|
||||
{
|
||||
int i;
|
||||
SimplifiedLinkedList ll = new SimplifiedLinkedList();
|
||||
for (i=0; i<mergeMatrix.length; i++)
|
||||
{
|
||||
switch (mergeMatrix[i])
|
||||
{
|
||||
case UNIQUE: ll.addLast(fieldNames[i]);
|
||||
break;
|
||||
case MERGE_HEAD: ll.addLast(fieldNames[i].substring(0,fieldNames[i].length()-1));
|
||||
}
|
||||
|
||||
}
|
||||
return ll.toStringArray();
|
||||
}
|
||||
|
||||
public String [] getMergeFields(String fields[], int mergeMatrix[])
|
||||
{
|
||||
// I can safely assume that fields contains info and are non null.
|
||||
int i=0, j=0, fieldNum;
|
||||
String mergedFields[] = new String[numberOfMergedFields];
|
||||
|
||||
outAHere:
|
||||
while(i<fields.length && j<mergedFields.length)
|
||||
{
|
||||
switch(mergeMatrix[i])
|
||||
{
|
||||
case UNIQUE:
|
||||
mergedFields[j] = fields[i];
|
||||
i++; j++;
|
||||
break;
|
||||
case MERGE_HEAD:
|
||||
fieldNum=1;
|
||||
if (!fields[i].equals("") && !fields[i+1].equals(""))
|
||||
mergedFields[j] = fieldNum + ". " + fields[i];
|
||||
else mergedFields[j]=fields[i];
|
||||
i++; fieldNum++;
|
||||
while (mergeMatrix[i]==MERGE_FOLLOWER)
|
||||
{
|
||||
if (!fields[i].equals(""))
|
||||
{
|
||||
if (!mergedFields[j].equals(""))
|
||||
mergedFields[j]+="; " + fieldNum + ". " + fields[i];
|
||||
else mergedFields[j] = fieldNum + ". " + fields[i];
|
||||
}
|
||||
i++; fieldNum++;
|
||||
if (i>=fields.length) break outAHere;
|
||||
}
|
||||
j++;
|
||||
}
|
||||
}
|
||||
return mergedFields;
|
||||
}
|
||||
|
||||
public void run(String format) throws Exception
|
||||
{
|
||||
String linea, fieldNames[], fields[], mergedFieldNames[], mergedFields[], tail;
|
||||
int i, pos, j;
|
||||
int mergeMatrix[];
|
||||
PrintWriter out[] = null, outOne = null;
|
||||
SimplifiedLinkedList ll = null, llOdd = null;
|
||||
SimplifiedListIterator sli;
|
||||
char ch;
|
||||
|
||||
linea=in.readLine();
|
||||
if (linea==null) throw new Exception("File is empty!");
|
||||
fieldNames = linea.split("\t");
|
||||
numberOfFields = fieldNames.length;
|
||||
mergeMatrix = buildMergeMatrix(fieldNames);
|
||||
mergedFieldNames = buildMergedFieldNames(fieldNames, mergeMatrix);
|
||||
numberOfMergedFields = mergedFieldNames.length;
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case TOMERAIDER:
|
||||
if (format!=null) outOne = new PrintWriter(new OutputStreamWriter(new FileOutputStream("dict-in-tomeraider-format.txt"), format));
|
||||
else outOne = new PrintWriter(new FileOutputStream("dict-in-tomeraider-format.txt"));
|
||||
ll = new SimplifiedLinkedList();
|
||||
llOdd = new SimplifiedLinkedList();
|
||||
break;
|
||||
|
||||
case HTML:
|
||||
if (format != null) outOne = new PrintWriter(new OutputStreamWriter(new FileOutputStream("dict-in-tab-format.txt"), format));
|
||||
else outOne = new PrintWriter(new FileOutputStream("dict-in-tab-format.txt"));
|
||||
break;
|
||||
|
||||
case SIMPLEBREAK:
|
||||
out = new PrintWriter[mergedFieldNames.length-1];
|
||||
for (i=0; i<out.length; i++)
|
||||
{
|
||||
if (format!=null) out[i] = new PrintWriter(new OutputStreamWriter(new FileOutputStream(mergedFieldNames[i+1] + ".txt"), format));
|
||||
else out[i] = new PrintWriter(new FileOutputStream(mergedFieldNames[i+1] + ".txt"));
|
||||
}
|
||||
}
|
||||
|
||||
while ((linea=in.readLine())!=null)
|
||||
{
|
||||
fields = getFields(linea);
|
||||
mergedFields = getMergeFields(fields, mergeMatrix);
|
||||
switch (mode)
|
||||
{
|
||||
case TOMERAIDER:
|
||||
linea = fields[0];
|
||||
|
||||
forBreak:
|
||||
for (i=0; i<linea.length(); i++)
|
||||
{
|
||||
ch = linea.charAt(i);
|
||||
if (ch==';' || ch=='/')
|
||||
{
|
||||
linea = linea.substring(0, i);
|
||||
break forBreak;
|
||||
}
|
||||
}
|
||||
pos = linea.indexOf("...");
|
||||
if (pos>0) linea = linea.substring(0,pos).trim();
|
||||
pos = linea.indexOf('+');
|
||||
if (pos>0)
|
||||
{
|
||||
llOdd.addLast(mergedFields);
|
||||
continue;
|
||||
}
|
||||
try
|
||||
{
|
||||
ll.addSorted(new SortingTibetanEntry(linea, mergedFields));
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
llOdd.addLast(mergedFields);
|
||||
}
|
||||
|
||||
break;
|
||||
case HTML:
|
||||
tail = null;
|
||||
|
||||
pos = mergedFields[0].indexOf("...");
|
||||
|
||||
if (pos==0) mergedFields[0] = mergedFields[0].substring(3).trim();
|
||||
else
|
||||
if (pos>0)
|
||||
{
|
||||
tail = mergedFields[0].substring(pos + 3).trim();
|
||||
mergedFields[0] = mergedFields[0].substring(0,pos-1).trim();
|
||||
}
|
||||
outOne.print(mergedFields[0] + "\t");
|
||||
|
||||
for (i=1; i<=2; i++) // tenses & sanskrit
|
||||
{
|
||||
if (!mergedFields[i].equals(""))
|
||||
{
|
||||
if (tail!=null) outOne.print("... " + tail + ": " + mergedFields[i] + "<br>");
|
||||
else outOne.print(mergedFields[i] + "<br>");
|
||||
}
|
||||
}
|
||||
|
||||
if (!mergedFields[3].equals("")) // english
|
||||
{
|
||||
if (tail!=null) outOne.print("... " + tail + ": <b>" + mergedFields[3] + "</b><br>");
|
||||
else outOne.print("<b>" + mergedFields[3] + "</b><br>");
|
||||
}
|
||||
if (!mergedFields[4].equals("")) // english-others
|
||||
{
|
||||
if (tail!=null) outOne.print("... " + tail + ": <i>" + mergedFields[4] + "</i><br>");
|
||||
else outOne.print("<i>" + mergedFields[4] + "</i><br>");
|
||||
}
|
||||
|
||||
for (i=5; i<=8; i++)
|
||||
{
|
||||
if (!mergedFields[i].equals(""))
|
||||
{
|
||||
if (tail!=null) outOne.print("... " + tail + ": " + mergedFields[i] + "<br>");
|
||||
else outOne.print(mergedFields[i] + "<br>");
|
||||
}
|
||||
}
|
||||
|
||||
if (!mergedFields[9].equals("")) // synonyms-tibetan
|
||||
{
|
||||
if (tail!=null) outOne.print("... " + tail + ": <i>Synonyms: </i>" + mergedFields[9] + "<br>");
|
||||
else outOne.print("<i>Synonyms: </i>" + mergedFields[9] + "<br>");
|
||||
}
|
||||
|
||||
for (i=10; i<=13; i++)
|
||||
{
|
||||
if (!mergedFields[i].equals(""))
|
||||
{
|
||||
if (tail!=null) outOne.print("... " + tail + ": " + mergedFields[i] + "<br>");
|
||||
else outOne.print(mergedFields[i] + "<br>");
|
||||
}
|
||||
}
|
||||
outOne.println();
|
||||
break;
|
||||
case SIMPLEBREAK:
|
||||
tail = null;
|
||||
|
||||
pos = mergedFields[0].indexOf("...");
|
||||
|
||||
if (pos==0) mergedFields[0] = mergedFields[0].substring(3).trim();
|
||||
else
|
||||
if (pos>0)
|
||||
{
|
||||
tail = mergedFields[0].substring(pos + 3).trim();
|
||||
mergedFields[0] = mergedFields[0].substring(0,pos-1).trim();
|
||||
}
|
||||
|
||||
for (i=1; i<mergedFields.length; i++)
|
||||
{
|
||||
if (!mergedFields[i].equals(""))
|
||||
{
|
||||
if (tail!=null)
|
||||
out[i-1].println(mergedFields[0] + '\t' + "... " + tail + ": " + mergedFields[i]);
|
||||
else out[i-1].println(mergedFields[0] + '\t' + mergedFields[i]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
switch (mode)
|
||||
{
|
||||
case TOMERAIDER:
|
||||
outOne.println("<new>\nAbout\n");
|
||||
outOne.println(TibetanScanner.aboutTomeraider);
|
||||
|
||||
sli = ll.listIterator();
|
||||
while (sli.hasNext())
|
||||
{
|
||||
mergedFields = ((SortingTibetanEntry)sli.next()).get();
|
||||
outOne.println("\n<new>\n" + mergedFields[0] + "\n");
|
||||
if (!mergedFields[1].equals("")) outOne.println(mergedFields[1] + "<br>"); // tenses
|
||||
if (!mergedFields[2].equals("")) outOne.println(mergedFields[2] + "<br>"); // sanskrit
|
||||
if (!mergedFields[3].equals("")) outOne.println("<b>" + mergedFields[3] + "</b><br>"); // english
|
||||
if (!mergedFields[4].equals("")) outOne.println("<i>" + mergedFields[4] + "</i><br>"); // english-others
|
||||
for (i=5; i<=8; i++)
|
||||
{
|
||||
if (!mergedFields[i].equals("")) outOne.println(mergedFields[i] + "<br>");
|
||||
}
|
||||
if (!mergedFields[9].equals("")) outOne.println("<i>Synonyms: </i>" + mergedFields[9] + "<br>"); // synonyms-tibetan
|
||||
|
||||
for (i=10; i<=13; i++)
|
||||
{
|
||||
if (!mergedFields[i].equals("")) outOne.println(mergedFields[i] + "<br>");
|
||||
}
|
||||
}
|
||||
|
||||
sli = llOdd.listIterator();
|
||||
if (sli.hasNext())
|
||||
{
|
||||
outOne.println("\n<new>\nUnsorted entries start here.\n");
|
||||
outOne.println("Because of errors in the entries with regards to:");
|
||||
outOne.println("<ol>");
|
||||
outOne.println("<li>Invalid tibetan");
|
||||
outOne.println("<li>Conversion from Tibetan script to extended wylie");
|
||||
outOne.println("<li>or the sorting algorithm");
|
||||
outOne.println("</ol>");
|
||||
outOne.println("the following entries could not be sorted. Hopefully these errors will be corrected in future versions.");
|
||||
}
|
||||
while (sli.hasNext())
|
||||
{
|
||||
mergedFields = (String[])sli.next();
|
||||
outOne.println("\n<new>\n" + mergedFields[0] + "\n");
|
||||
if (!mergedFields[1].equals("")) outOne.println(mergedFields[1] + "<br>"); // tenses
|
||||
if (!mergedFields[2].equals("")) outOne.println(mergedFields[2] + "<br>"); // sanskrit
|
||||
if (!mergedFields[3].equals("")) outOne.println("<b>" + mergedFields[3] + "</b><br>"); // english
|
||||
if (!mergedFields[4].equals("")) outOne.println("<i>" + mergedFields[4] + "</i><br>"); // english-others
|
||||
for (i=5; i<=8; i++)
|
||||
{
|
||||
if (!mergedFields[i].equals("")) outOne.println(mergedFields[i] + "<br>");
|
||||
}
|
||||
if (!mergedFields[9].equals("")) outOne.println("<i>Synonyms: </i>" + mergedFields[9] + "<br>"); // synonyms-tibetan
|
||||
|
||||
for (i=10; i<=13; i++)
|
||||
{
|
||||
if (!mergedFields[i].equals("")) outOne.println(mergedFields[i] + "<br>");
|
||||
}
|
||||
}
|
||||
case HTML:
|
||||
outOne.flush(); // no break above so that both flush.
|
||||
break;
|
||||
|
||||
case SIMPLEBREAK:
|
||||
for (i=0; i<out.length; i++)
|
||||
out[i].flush();
|
||||
}
|
||||
}
|
||||
}
|
|
@ -1,26 +0,0 @@
|
|||
package org.thdl.tib.scanner;
|
||||
|
||||
import org.thdl.tib.text.*;
|
||||
|
||||
class SortingTibetanEntry implements Comparable
|
||||
{
|
||||
private String fields[];
|
||||
private TibetanString header;
|
||||
|
||||
public SortingTibetanEntry (String header, String fields[])
|
||||
{
|
||||
this.header = new TibetanString(header);
|
||||
this.fields = fields;
|
||||
}
|
||||
|
||||
public int compareTo(Object o)
|
||||
{
|
||||
SortingTibetanEntry ste = (SortingTibetanEntry)o;
|
||||
return header.compareTo(ste.header);
|
||||
}
|
||||
|
||||
public String[] get()
|
||||
{
|
||||
return fields;
|
||||
}
|
||||
}
|
Loading…
Reference in a new issue