Fixed the import of dictionaries that use '-' as a separator, so that the separator is no longer confused with the dash that marks a reverse vowel in Tibetanized Sanskrit.

2002-11-27 23:30:44 +00:00 · 2002-11-27 23:30:44 +00:00 · c12088ce5d
commit c12088ce5d
parent c13adf9d14
1 changed files with 264 additions and 229 deletions
--- a/source/org/thdl/tib/scanner/BinaryFileGenerator.java
+++ b/source/org/thdl/tib/scanner/BinaryFileGenerator.java
@ -36,6 +36,10 @@ public class BinaryFileGenerator extends LinkedList
 	private long posHijos;
 	private String sil, def[];
    private static String delimiter;
    private static int delimiterType;
    private final static int delimiterGeneric=0;
    private final static int delimiterAcip=1;
    private final static int delimiterDash=2;
 	/** Number of dictionary. If 0, partial word (no definition).
 	*/
@ -47,7 +51,8 @@ public class BinaryFileGenerator extends LinkedList
 	{
 		wordRaf = null;
 		defRaf = null;
-        delimiter = " - ";
+        delimiter = null;
        delimiterType=delimiterDash;
 	}
 	public BinaryFileGenerator()
@ -110,186 +115,212 @@ public class BinaryFileGenerator extends LinkedList
 		boolean markerNotFound;
        // used for acip dict 
-        if (delimiter==null)
+        switch(delimiterType)
        {
-		    outAHere:
+            case delimiterAcip:
-		    while (true)
+    		    outAHere:
-		    {
+	    	    while (true)
 		        entrada=br.readLine();
 		        if (entrada==null) break;
 		        currentLine++;
 		        entrada = entrada.trim();
 		        len = entrada.length();
 		        if (len<=0) continue;
                // get page number
 	            if (entrada.charAt(0)=='@')
 	            {
 	                marker = 1;
 	                while(marker<len && Character.isDigit(entrada.charAt(marker)))
 	                    marker++;
 	                temp = entrada.substring(1, marker);
 	                if (temp.length()>0)
 	                currentPage=Integer.parseInt(temp);
 	                if (marker<len)
 	                {
 	                    entrada = entrada.substring(marker).trim();
 	                    len = entrada.length();
 	                }
 	                else continue;   
 		        }
 	            // get current letter
 	            if (entrada.charAt(0)=='(' || entrada.charAt(0)=='{' || entrada.charAt(0)=='?')
 	            {
 	                currentLetter = entrada.substring(1, entrada.length()-2);		            
 	                /*out.println(currentPage + ": " + currentLetter);
 	                n++;*/
 	                continue;
 	            }
 	            if (entrada.charAt(0)=='[')
 	            {
 	                marker=1;
 	                markerNotFound=true;
 	                do
 	                {
    	                while (marker<len && markerNotFound)
 	                    {
 	                        if (entrada.charAt(marker)==']') markerNotFound=false;
 	                        else marker++;
 	                    }
 	                    if (markerNotFound)
                        {
            		        entrada=br.readLine();
 		                    if (entrada==null) break outAHere;
 		                    currentLine++;
            		        len = entrada.length();
            		        marker=0;
                        }
                        else break;
 	                } while (true);
 	                if (marker<len)
 	                {
 	                    entrada = entrada.substring(marker+1).trim();
 	                    len = entrada.length();
 	                    if (len<=0) continue;
 	                }
 	                else continue;
 	            }
 		        // skip stuff. Add to previous definition.
 		        if (entrada.startsWith("..."))
 		        {
-		            entrada=entrada.substring(3);
+		            entrada=br.readLine();
 		            if (entrada==null) break;
    		        currentLine++;
 	    	        entrada = entrada.trim();
 		            len = entrada.length();
 		            if (len<=0) continue;
 		        }
-		        // find definiendum
+                    // get page number
-		        ch = entrada.charAt(0);
+    	            if (entrada.charAt(0)=='@')
-                if (Character.isLetter(ch) || ch=='\'')
+	                {
-                {
+	                    marker = 1;
-                    /* first criteria: if it is not the root letter of section it is part of the
+	                    while(marker<len && Character.isDigit(entrada.charAt(marker)))
-                    previous definition, probably a page change, else go for it with following
+	                        marker++;
-                    code: */
+    	                temp = entrada.substring(1, marker);
 	                    if (temp.length()>0)
 	                    currentPage=Integer.parseInt(temp);
 	                    if (marker<len)
 	                    {
 	                        entrada = entrada.substring(marker).trim();
 	                        len = entrada.length();
    	                }
 	                    else continue;   
 		            }
-                    // get first syllable to check base letter
+	                // get current letter
-                    marker=1;
+    	            if (entrada.charAt(0)=='(' || entrada.charAt(0)=='{' || entrada.charAt(0)=='?')
-                    while (marker<len)
+	                {
 	                    currentLetter = entrada.substring(1, entrada.length()-2);		            
 	                    /*out.println(currentPage + ": " + currentLetter);
 	                    n++;*/
 	                    continue;
    	            }
 	                if (entrada.charAt(0)=='[')
 	                {
 	                    marker=1;
 	                    markerNotFound=true;
 	                    do
    	                {
        	                while (marker<len && markerNotFound)
 	                        {
 	                            if (entrada.charAt(marker)==']') markerNotFound=false;
 	                            else marker++;
 	                        }
 	                        if (markerNotFound)
                            {
                		        entrada=br.readLine();
 		                        if (entrada==null) break outAHere;
 		                        currentLine++;
            	    	        len = entrada.length();
            		            marker=0;
                            }
                            else break;
 	                    } while (true);
 	                    if (marker<len)
 	                    {
 	                        entrada = entrada.substring(marker+1).trim();
 	                        len = entrada.length();
 	                        if (len<=0) continue;
    	                }
 	                    else continue;
 	                }
    		        // skip stuff. Add to previous definition.
 	    	        if (entrada.startsWith("..."))
 		            {
 		                entrada=entrada.substring(3);
 		                len = entrada.length();
 		                if (len<=0) continue;
    		        }
    		        // find definiendum
 	    	        ch = entrada.charAt(0);
                    if (Character.isLetter(ch) || ch=='\'')
                    {
-                        ch = entrada.charAt(marker);
+                        /* first criteria: if it is not the root letter of section it is part of the
-                        if (ch==' ' || ch=='/') break;
+                        previous definition, probably a page change, else go for it with following
-                        marker++;
+                        code: */
                    }
-                    if (status!=halfDefiniendum) temp = Manipulate.getBaseLetter(entrada.substring(0, marker));
+                        // get first syllable to check base letter
                        marker=1;
                        while (marker<len)
                        {
                            ch = entrada.charAt(marker);
                            if (ch==' ' || ch=='/') break;
                            marker++;
                        }
-                    // if line begins with current letter, probably it is a definiendum
+                        if (status!=halfDefiniendum) temp = Manipulate.getBaseLetter(entrada.substring(0, marker));
-                    if (status==halfDefiniendum || currentLetter.equals(temp))
+                        
-   	                {
+                        // if line begins with current letter, probably it is a definiendum
-   	                    /* Since new definiendum was found, update last and collect new. No need to update
+                        if (status==halfDefiniendum || currentLetter.equals(temp))
   	                    status because it will be updated below. */
   	                    if (status==definition)
   	                    {
-                            add(s1, s2, defNum);
+   	                        /* Since new definiendum was found, update last and collect new. No need to update
-		                    s1=""; s2="";
+   	                        status because it will be updated below. */
-   	                    }
+   	                        if (status==definition)
       	                marker=marker2=1;
   	                    markerNotFound=true;
   	                    while (marker < len)
   	                    {
       	                    ch = entrada.charAt(marker);
   	                        switch(ch)
   	                        {
-   	                            case '/':
+                                add(s1, s2, defNum);
-   	                                markerNotFound=false;
+		                        s1=""; s2="";
-   	                                marker2=marker+1;
+       	                    }
-   	                            break;
+   	                    
-   	                            case '(': case '<':
+           	                marker=marker2=1;
-   	                                markerNotFound=false;
+   	                        markerNotFound=true;
-   	                                marker2=marker;
+       	                    
-   	                            break;
+   	                        while (marker < len)
-   	                            case 'g': // verify "g "
+   	                        {
-       	                            if (marker+1<len && Manipulate.isVowel(entrada.charAt(marker-1)) && entrada.charAt(marker+1)==' ')
+       	                        ch = entrada.charAt(marker);
-       	                            {
+       	                        switch(ch)
-       	                                temp = entrada.substring(0, marker+1);
+   	                            {
-       	                                if (!lastWeirdDefiniendum.startsWith(temp))
+   	                                case '/':
   	                                    markerNotFound=false;
   	                                    marker2=marker+1;
   	                                break;
   	                                case '(': case '<':
   	                                    markerNotFound=false;
       	                                marker2=marker;
   	                                break;
   	                                case 'g': // verify "g "
       	                                if (marker+1<len && Manipulate.isVowel(entrada.charAt(marker-1)) && entrada.charAt(marker+1)==' ')
       	                                {
       	                                    temp = entrada.substring(0, marker+1);
       	                                    if (!lastWeirdDefiniendum.startsWith(temp))
           	                                {
   	                                            markerNotFound=false;
   	                                            marker2=++marker;
                                                lastWeirdDefiniendum=temp;
                                            }
   	                                    }
   	                                break;
       	                            case ' ': // verify "  "
           	                            if (marker+1<len && entrada.charAt(marker+1)==' ')
       	                                {
   	                                        markerNotFound=false;
   	                                        marker2=++marker;
-                                            lastWeirdDefiniendum=temp;
+   	                                    }
-                                        }
+   	                                break;
-   	                                }
+   	                                case '.':
-   	                            break;
+       	                                if (marker+2<len && entrada.charAt(marker+1)=='.' && entrada.charAt(marker+2)=='.')
-   	                            case ' ': // verify "  "
+   	                                    {
-       	                            if (marker+1<len && entrada.charAt(marker+1)==' ')
+   	                                        markerNotFound=false;
-       	                            {
+   	                                        marker2=marker;
-   	                                    markerNotFound=false;
+   	                                    }
-   	                                    marker2=++marker;
+   	                                break;
-   	                                }
+       	                            default:
-   	                            break;
+   	                                    if (Character.isDigit(ch))
-   	                            case '.':
+   	                                    {
-   	                                if (marker+2<len && entrada.charAt(marker+1)=='.' && entrada.charAt(marker+2)=='.')
+   	                                        markerNotFound=false;
-   	                                {
+   	                                        marker2=marker;
-   	                                    markerNotFound=false;
+       	                                }
-   	                                    marker2=marker;
+   	                            }
-   	                                }
+   	                            if (markerNotFound) marker++;
-   	                            break;
+   	                            else break;
       	                        default:
   	                                if (Character.isDigit(ch))
   	                                {
   	                                    markerNotFound=false;
   	                                    marker2=marker;
   	                                }
   	                        }
   	                        if (markerNotFound) marker++;
   	                        else break;
   	                    }
-   	                    /* either this is a definiendum that consists of several lines or
+   	                        /* either this is a definiendum that consists of several lines or
-   	                    it is part of the last definition. */
+       	                    it is part of the last definition. */
-   	                    if (markerNotFound) 
+       	                    if (markerNotFound) 
-       	                {
+           	                {
-   	                        /* assume that the definiendum goes on to the next line. */
+   	                            /* assume that the definiendum goes on to the next line. */
-   	                        s1 = s1 + entrada + " ";
+   	                            s1 = s1 + entrada + " ";
-   	                        status=halfDefiniendum;
+   	                            status=halfDefiniendum;
-   	                    }
+   	                        }
-       	                else
+       	                    else
-   	                    {
+   	                        {
-   	                        s1 = s1 + entrada.substring(0,marker).trim();
+       	                        s1 = s1 + entrada.substring(0,marker).trim();
-   	                        s2 = "[" + currentPage + "] " + entrada.substring(marker2).trim();
+   	                            s2 = "[" + currentPage + "] " + entrada.substring(marker2).trim();
-   	                        status=definition;
+   	                            status=definition;
   	                            while (true)
   	                            {
            		                entrada=br.readLine();
 		                            if (entrada==null)
    		                        {
 	    	                            add(s1, s2, defNum);
 		                                break outAHere;
 		                            }
                		            currentLine++;
                		            entrada = entrada.trim();
                		            if (entrada.equals("")) break;
                		            else
                		            {
 		                                s2 = s2 + " " + entrada;
 		                            }
 		                        }
       	                    }   	            
 	                    }
 	                    else // last line did not start with the current letter, it must still be part of the definition
 	                    {
                            s2 = s2 + " " + entrada;
   	                        while (true)
   	                        {
            		            entrada=br.readLine();
-		                        if (entrada==null)
+    		                    if (entrada==null)
-		                        {
+	    	                    {
 		                            add(s1, s2, defNum);
 		                            break outAHere;
 		                        }
@ -297,86 +328,75 @@ public class BinaryFileGenerator extends LinkedList
            		            currentLine++;
            		            entrada = entrada.trim();
-            		            if (entrada.equals("")) break;
+                		        if (entrada.equals("")) break;
-            		            else
+                		        else
-            		            {
+                		        {
 		                            s2 = s2 + " " + entrada;
 		                        }
 		                    }
-   	                        
+    	                }
   	                    }   	            
 	                }
-	                else // last line did not start with the current letter, it must still be part of the definition
+	                else // if first character was not a letter, it must still be part of definition
-	                {
+    	            {
                        s2 = s2 + " " + entrada;
   	                    while (true)
   	                    {
-            		        entrada=br.readLine();
+            	    	    entrada=br.readLine();
 		                    if (entrada==null)
-		                    {
+    		                {
-		                        add(s1, s2, defNum);
+	    	                    add(s1, s2, defNum);
 		                        break outAHere;
 		                    }
            		        currentLine++;
            		        entrada = entrada.trim();
-            		        if (entrada.equals("")) break;
+                		    if (entrada.equals("")) break;
-            		        else
+                		    else
-            		        {
+                		    {
 		                        s2 = s2 + " " + entrada;
 		                    }
 		                }
 	                }
-	            }
+		        }
-	            else // if first character was not a letter, it must still be part of definition
+		    break;
-	            {
+		    default:
-                    s2 = s2 + " " + entrada;
+        		while ((entrada = br.readLine())!=null)
-   	                while (true)
+	        	{
-   	                {
+		        	entrada = entrada.trim();
-            		    entrada=br.readLine();
+			        if (!entrada.equals(""))
-            		            
+        			{
-		                if (entrada==null)
+        			    switch(delimiterType)
        			    {
        			        /* this is needed to make sure that the dash used in reverse vowels with extended
        			        wylie is not confused with the dash that separates definiendum and definition. */
        			        case delimiterDash:
        			            marker=entrada.indexOf('-');
        			            len = entrada.length(); 
        			            while (marker>=0 && marker<len-1 && Manipulate.isVowel(entrada.charAt(marker+1)) && !Character.isWhitespace(entrada.charAt(marker-1)))
        			            {
        			                marker = entrada.indexOf('-', marker+1);
        			            }
        			        break;
        			        default:
 	        		        marker = entrada.indexOf(delimiter);
 	        		    }
 		                if (marker<0)
 		                {
-		                    add(s1, s2, defNum);
+		                    System.out.println("Error loading line " + currentLine + ", in file " + archivo + ":");
-		                    break outAHere;
+		                    System.out.println(entrada);
-		                }
+        		        }
-		                        
+	        	        else
-            		    currentLine++;
+		                {
-            		    entrada = entrada.trim();
+		                    s1 = deleteQuotes(entrada.substring(0,marker).trim());
-            		            
+		                    s2 = deleteQuotes(entrada.substring(marker+1).trim());
-            		    if (entrada.equals("")) break;
+		                    add(s1, s2 , defNum);
-            		    else
+    		            }
-            		    {
+    		        }
-		                    s2 = s2 + " " + entrada;
+	    		}
-		                }
+		    	currentLine++;            
 		            }
 	            }
 		    }
        }
        else
 		while ((entrada = br.readLine())!=null)
 		{
 			entrada = entrada.trim();
 			if (!entrada.equals(""))
 			{
 			    marker = entrada.indexOf(delimiter);
 		        if (marker<0)
 		        {
 		            System.out.println("Error loading line " + currentLine + ", in file " + archivo + ":");
 		            System.out.println(entrada);
 		        }
 		        else
 		        {
 		            s1 = deleteQuotes(entrada.substring(0,marker).trim());
 		            s2 = deleteQuotes(entrada.substring(marker+1).trim());
 		            add(s1, s2 , defNum);
 		        }
 			}
 			currentLine++;
 		}
 	}
@ -547,11 +567,17 @@ public class BinaryFileGenerator extends LinkedList
        if (args[0].charAt(0)=='-')
        {
            if (args[0].equals("-tab"))
            {
                delimiterType = delimiterGeneric;
                delimiter="\t";
            }
            else if (args[0].equals("-acip"))
-                delimiter=null;
+                delimiterType=delimiterAcip;
            else
            {
                delimiterType=delimiterGeneric;
                delimiter=args[0].substring(1);
            }
            if (args.length>2)
            {
                printSintax();
@ -576,14 +602,23 @@ public class BinaryFileGenerator extends LinkedList
                    if (args[i].charAt(0)=='-')
                    {
                        if (args[i].equals("-tab"))
-                        delimiter="\t";
+                        {
                            delimiterType=delimiterGeneric;
                            delimiter="\t";
                        }
                        else if (args[i].equals("-acip"))
-                            delimiter=null;
+                            delimiterType=delimiterAcip;
                        else
                        {
                            delimiterType=delimiterGeneric;
                            delimiter=args[i].substring(1);
                        }
                        i++;
                    }
-                    else delimiter=" -";
+                    else 
                    {
                        delimiterType=delimiterDash;
                    }
                    sl.addFile(args[i] + ".txt", n);
                    n++; i++;
                }