/*
 * AcipToTab: reads dictionary text line by line from a BufferedReader and
 * writes one record per entry to a PrintWriter, formatted by add() as
 *     currentDefiniendum + TAB + currentDefinition
 *
 * main(String[] args):
 *   - no arguments : reads System.in, writes System.out.
 *   - one argument : reads args[0], writes System.out.
 *   - two or more  : reads args[0], writes the file named by args[1].
 *   args[0] is opened through java.net.URL when it contains "http://",
 *   otherwise through FileInputStream.
 *
 * run(), as far as the visible text shows:
 *   - each input line is trimmed; empty lines are skipped;
 *   - a line starting with '@' is treated as carrying a page number, which
 *     is parsed with Integer.parseInt into currentPage;
 *   - the line is scanned character by character using
 *     Manipulate.isEndOfParagraphMark / Manipulate.isEndOfSyllableMark;
 *     when a paragraph mark is found, the text before `marker` becomes the
 *     definiendum and the text from `marker2` becomes the definition,
 *     prefixed with "[" + currentPage + "] ";
 *   - when no mark is found, a space is appended to the definiendum and the
 *     status is set to halfDefiniendum (the entry is assumed to continue on
 *     the next line);
 *   - following lines are appended to the definition, separated by single
 *     spaces, until a blank line; at end of input add() is called and the
 *     outer loop is left via the outAHere label;
 *   - out.flush() is called once the outer loop ends.
 *
 * NOTE(review): this copy of the file appears to be damaged and will not
 * compile as it stands:
 *   - the original line breaks are gone, so every end-of-line comment now
 *     swallows the rest of its physical line;
 *   - text between '<' and '>' characters seems to have been stripped
 *     (see `while(marker0)`, `if (marker0) n++`, `if (marker+10 && ...`),
 *     taking loop conditions and whole branches with it;
 *   - a block-comment terminator appears in run() with no matching opener.
 * Restore the file from version control before changing any logic; the
 * missing conditions cannot be recovered from what is left here.
 *
 * NOTE(review), once the file is restored:
 *   - both InputStreamReader instances use the platform default charset;
 *     confirm the expected encoding of the input and pass it explicitly;
 *   - indexOf("http://") >= 0 matches the text anywhere in args[0], not
 *     only as a prefix, and does not cover "https://";
 *   - neither the reader nor the FileOutputStream-backed writer is closed;
 *     the writer is only flushed at the end of run().
 */
package org.thdl.tib.scanner; import java.net.*; import java.io.*; class AcipToTab { private BufferedReader in; private PrintWriter out; private String currentDefiniendum, currentDefinition; public AcipToTab(BufferedReader in, PrintWriter out) { this.in = in; this.out = out; } public void add() { out.println(currentDefiniendum + '\t' + currentDefinition); } public static void main (String[] args) throws Exception { PrintWriter out; BufferedReader in=null; boolean file=false; switch (args.length) { case 0: out = new PrintWriter(System.out); in = new BufferedReader(new InputStreamReader(System.in)); break; case 1: out = new PrintWriter(System.out); file = true; break; default: out = new PrintWriter(new FileOutputStream(args[1])); file = true; } if (file) { if (args[0].indexOf("http://") >= 0) in = new BufferedReader(new InputStreamReader(new BufferedInputStream((new URL(args[0])).openStream()))); else in = new BufferedReader(new InputStreamReader(new FileInputStream(args[0]))); } new AcipToTab(in, out).run(); } public void run() throws Exception { final short newDefiniendum=1, halfDefiniendum=2, definition=3; short status=newDefiniendum; int marker, len, marker2, n=0, total=0, currentPage=0, currentLine=1, pos; char ch; String entrada="", currentLetter="", temp="", lastDefiniendum="", lastWeirdDefiniendum=""; boolean markerNotFound; currentDefiniendum=""; currentDefinition=""; outAHere: while (true) { entrada=in.readLine(); if (entrada==null) break; currentLine++; entrada = entrada.trim(); len = entrada.length(); if (len<=0) continue; // get page number if (entrada.charAt(0)=='@') { marker = 1; while(marker0) { currentPage=Integer.parseInt(temp); } if (marker0) n++; lastDefiniendum=currentDefiniendum; currentDefiniendum=""; currentDefinition=""; } marker=marker2=1; markerNotFound=true; while (marker < len) { ch = entrada.charAt(marker); if (Manipulate.isEndOfParagraphMark(ch)) { markerNotFound=false; marker2=marker+1; } else if (Manipulate.isEndOfSyllableMark(ch)) { 
if (marker+10 && Manipulate.isVowel(entrada.charAt(pos-1)) && (markerNotFound || entrada.substring(0,pos+1).length() < entrada.substring(0, marker).trim().length())) { // out.println(currentPage + ": " + entrada); n++; }*/ /* either this is a definiendum that consists of several lines or it is part of the last definition. */ if (markerNotFound) { /* assume that the definiendum goes on to the next line. */ currentDefiniendum = currentDefiniendum + " "; status=halfDefiniendum; } else { // total++; currentDefiniendum = currentDefiniendum + entrada.substring(0,marker).trim(); currentDefinition = "[" + currentPage + "] " + entrada.substring(marker2).trim(); status=definition; while (true) { entrada=in.readLine(); if (entrada==null) { // add here add(); // if (new TibetanString(lastDefiniendum).compareTo(new TibetanString(currentDefiniendum))>0) n++; break outAHere; } currentLine++; entrada = entrada.trim(); if (entrada.equals("")) break; else { currentDefinition = currentDefinition + " " + entrada; } } } } else // last line did not start with the current letter, it must still be part of the definition { currentDefinition = currentDefinition + " " + entrada; while (true) { entrada=in.readLine(); if (entrada==null) { // add here add(); // if (new TibetanString(lastDefiniendum).compareTo(new TibetanString(currentDefiniendum))>0) n++; break outAHere; } currentLine++; entrada = entrada.trim(); if (entrada.equals("")) break; { currentDefinition = currentDefinition + " " + entrada; } } } } else // if first character was not a letter, it must still be part of definition { currentDefinition = currentDefinition + " " + entrada; while (true) { entrada=in.readLine(); if (entrada==null) { // add here add(); break outAHere; } currentLine++; entrada = entrada.trim(); if (entrada.equals("")) break; else { currentDefinition = currentDefinition + " " + entrada; } } } } // out.println(n + " / " + total); out.flush(); } }