From 9eedfcd909a9dbfda1c7be11b55e6ea125296b4f Mon Sep 17 00:00:00 2001 From: eg3p Date: Thu, 5 Dec 2002 01:48:41 +0000 Subject: [PATCH] This is Tashi's TibetanSyllable class for sorting Wylie Tibetan. It does not have many methods for determining the root letter, suffix, and so on, but these should be easy to add. David, please use this class to the extent that it and your new work overlap. --- source/org/thdl/tib/text/TibetanSyllable.java | 1074 +++++++++++++++++ 1 file changed, 1074 insertions(+) create mode 100644 source/org/thdl/tib/text/TibetanSyllable.java diff --git a/source/org/thdl/tib/text/TibetanSyllable.java b/source/org/thdl/tib/text/TibetanSyllable.java new file mode 100644 index 0000000..d808344 --- /dev/null +++ b/source/org/thdl/tib/text/TibetanSyllable.java @@ -0,0 +1,1074 @@ +/* +The contents of this file are subject to the THDL Open Community License +Version 1.0 (the "License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License on the THDL web site +(http://www.thdl.org/). + +Software distributed under the License is distributed on an "AS IS" basis, +WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +License for the specific terms governing rights and limitations under the +License. + +The Initial Developer of this software is the Tibetan and Himalayan Digital +Library (THDL). + +Copyright 2001 Tashi Tsering +All Rights Reserved + +Contributor(s): ______________________________________. +*/ +/* +//************************************************************************** +// +// This is the java version of the class of TibetanSyllable +// represented by Wylie translateration system. Use the class, Tibetan +// syllables can be compared. And also one can sort Tibetan syllables, words, +// phrases and sentences by comparison of two syllables. The order of the +// consonants and the order of the vowels are based on "tshig mdzod chen mo" +// (The Big Dictionary). +// +// +// Author: Tashi Tsering +// Date: 11/27/2002 +// Written at: University of Virginia +// +//*********************************************************************************** +//The interface of the class of TibetanSyllable: +// +//class name: TibetanSyllable +// +//constructor: TibetanSyllable ( String S ) +// String S is the representation of a Tibetan syllable by string. +// +//method: +// int CompareWith( TibetanSyllable thatSyllable ) +// +// return: 0 if this syllable is the same with thatSyllable; +// 1 if this syllable is bigger than thatSyllable, +// i.e. this syllable goes after thatSyllable in the order of a dictionary; +// -1 if this syllable is less than thatSyllable, +// i.e. this syllable goes before thatSyllable in the order of a dictionary; +// Those invalid syllables will be treated as the biggest syllable, that they are always +// bigger than valid syllables.; +//************************************************************************************ +// +*/ + + +package org.thdl.tib.text; + + +class TibetanSyllable { + + + String theSyllable; // The character String of the syllable. + boolean TibetanSyllableFlag; //True for Tibetan syllable, false for Sanskrit syllable. + int nComponents; // Number of Tibetan characters represented by Wylie system in the syllable. + int nVowels; // The number of vowels in the syllable. + String [][] Components; // Components of a syllable consists of Tibetan Wylie "characters". + // Components[0][0]-----Base letter + // Components[0][1]-----Superscript + // Components[0][2]-----Prefix + // Components[0][3]-----Subscript + // Components[0][4]-----vowel + // Components[0][5]-----Suffix + // Components[0][6]-----Second suffix + //You can add your own method to return different component of a syllable. + +// The constructor +TibetanSyllable ( String s ){ + //Filter the spaces that are at the beginning or end of the syllable. + while (s.length()>0) if(s.charAt(0) == ' ' && s.length()>1) s = s.substring(1,s.length()); + else if(s.charAt(s.length()-1) == ' ' && s.length()>1) s = s.substring(0, s.length()-1); + else break; + theSyllable = s; + ItsComponents(); +} + + + + +int CompareWith( TibetanSyllable s ){ + + int n = ( nVowels > s.GetnVowels()) ? nVowels : s.GetnVowels(); + + String [][] temp; + + temp = s.GetComponents(); + + for (int i=0; i ValueOfTibetanCharacter(temp[i][j])) + return 1; // This syllable is bigger than that syllable s. + + else if(ValueOfTibetanCharacter(Components[i][j]) < ValueOfTibetanCharacter(temp[i][j])) + return -1; // This syllable is smaller than that syllable s. + + else; + } + } + + return 0; // They are the same syllable. +} + + +//Return the base letter of a syllable +String BaseLetter(){ + return Components[0][0]; +} + + +boolean IsTibetanSyllable(){ + return TibetanSyllableFlag; +} + + + +String GetTheSyllable(){ + return theSyllable; +} + + + +void SetTheSyllable( String s ){ + theSyllable = s; +} + + +String [][] GetComponents( ){ + return Components; +} + + + +int GetnComponents( ){ + return nComponents; +} + + + +int ItsLength( ){ + return nComponents; +} + + + +int GetnVowels( ){ + return nVowels; +} + + + +boolean GetTibetanSyllableFlag( ){ + return TibetanSyllableFlag; +} + + + +//To examine a component to see if it is a vowel. Return true if the component is vowel. +boolean IsTibetanVowel(String thecomponent) +{ + + if( (thecomponent.equals("a")) || + (thecomponent.equals("i")) || + (thecomponent.equals("u")) || + (thecomponent.equals("e")) || + (thecomponent.equals("o")) ) + return true; + else return false; +} + + +//To examine a component, see if it is a prefix. Return true if the component is prefix. +boolean IsPrefix(String thecomponent) +{ + + if( (thecomponent.equals("g")) || + (thecomponent.equals("d")) || + (thecomponent.equals("b")) || + (thecomponent.equals("m")) || + (thecomponent.equals("'")) ) + return true; + else return false; + +} + + +//To examine a component, see if it is a base letter. Return true if it is. +boolean IsBaseLetter(String thecomponent) +{ + + if( (thecomponent.equals("k")) || + (thecomponent.equals("kh")) || + (thecomponent.equals("g")) || + (thecomponent.equals("ng")) || + (thecomponent.equals("c")) || + (thecomponent.equals("ch")) || + (thecomponent.equals("j")) || + (thecomponent.equals("ny")) || + (thecomponent.equals("t")) || + (thecomponent.equals("th")) || + (thecomponent.equals("d")) || + (thecomponent.equals("n")) || + (thecomponent.equals("p")) || + (thecomponent.equals("ph")) || + (thecomponent.equals("b")) || + (thecomponent.equals("m")) || + (thecomponent.equals("ts")) || + (thecomponent.equals("tsh")) || + (thecomponent.equals("dz")) || + (thecomponent.equals("w")) || + (thecomponent.equals("zh")) || + (thecomponent.equals("z")) || + (thecomponent.equals("'")) || + (thecomponent.equals("y")) || + (thecomponent.equals(".y")) || //Special for making "g.ya" different from gya. + (thecomponent.equals("r")) || + (thecomponent.equals("l")) || + (thecomponent.equals("sh")) || + (thecomponent.equals("s")) || + (thecomponent.equals("h")) || + (thecomponent.equals("a")) ) + return true; + else return false; +} + + +//To examine a component, see if it is a supperscript. Return true if it is, otherwise false. +boolean IsSuperscript(String thecomponent) +{ + + if( (thecomponent.equals("r")) || + (thecomponent.equals("l")) || + (thecomponent.equals("s")) ) + return true; + else return false; +} + + +//To examine a component, see if it is a subscript. Return true if it is, otherwise false. +boolean IsSubscript(String thecomponent) +{ + + if( (thecomponent.equals("w")) || + (thecomponent.equals("y")) || + (thecomponent.equals("r")) || + (thecomponent.equals("l")) ) + return true; + else return false; +} + + +//To examine a component, see if it is a suffix. Return true if it is, otherwise false. +boolean IsSuffix(String thecomponent) +{ + + if( (thecomponent.equals("g")) || + (thecomponent.equals("ng")) || + (thecomponent.equals("d")) || + (thecomponent.equals("n")) || + (thecomponent.equals("b")) || + (thecomponent.equals("m")) || + (thecomponent.equals("'")) || + (thecomponent.equals("r")) || + (thecomponent.equals("l")) || + (thecomponent.equals("s")) ) + return true; + else return false; +} + + +//To examine a component, see if it is a the second suffix. Return true if it is, otherwise false. +boolean IsSecondSuffix(String thecomponent) +{ + if(thecomponent.equals("s")) + return true; + else return false; +} + + +//To examine a component, see if it is a Sanskrit consonant. Return true if it is, otherwise false. +boolean IsSanskritConsonant(String thecomponent) +{ + + if( (thecomponent.equals("T")) || + (thecomponent.equals("Th")) || + (thecomponent.equals("D")) || + (thecomponent.equals("N")) || + (thecomponent.equals("Sh")) || + (thecomponent.equals("M")) || + (thecomponent.equals("`")) || + (thecomponent.equals("f")) || + (thecomponent.equals("v"))) + return true; + else return false; +} + + +//To examine a component, see if it is a Sanskrit vowel. Return true if it is, otherwise false. +boolean IsSanskritVowel(String thecomponent) +{ + + if( (thecomponent.equals("A")) || + (thecomponent.equals("I")) || + (thecomponent.equals("U")) || + (thecomponent.equals("-i"))|| + (thecomponent.equals("-I"))|| + (thecomponent.equals("ai"))|| + (thecomponent.equals("au"))) + return true; + else return false; +} + + +//To examine a component, see if it is a Sanskrit symbole. Return true if it is, otherwise false. +boolean IsSanskritSpecialSymbol(String thecomponent) +{ + if( (thecomponent.equals("+")) ) + return true; + else return false; +} + + +//To examine a component, see if it is a Tibetan symbol. Return true if it is, otherwise false. +boolean IsThisTibetanSymbol(String thecomponent) +{ + if( IsTibetanVowel(thecomponent) || + IsBaseLetter(thecomponent) ) + return true; + else return false; +} + + +//To examine a component, see if it is a Sanskrit symbol. Return true if it is, otherwise false. +boolean IsThisSanskritSymbol(String thecomponent) +{ + if( IsSanskritVowel(thecomponent) || + IsSanskritConsonant( thecomponent) || + IsSanskritSpecialSymbol(thecomponent) ) + return true; + else return false; +} + + +//To examine a component, see if it is a Tibetan symbol. Return true if it is, otherwise false. +boolean IsThisTibetanOrSanskritSymbol(String thecomponent) +{ + if( IsThisTibetanSymbol(thecomponent) || + IsThisSanskritSymbol(thecomponent) ) + return true; + else return false; +} + + +//To examine a component, see if it is a Tibetan or Sanskrit symbol. Return true if it is, otherwise false. +boolean IsThisTibetanOrSanskritVowel(String thecomponent) +{ + if( IsSanskritVowel( thecomponent) || + IsTibetanVowel( thecomponent) ) + return true; + else return false; +} + + +//To examine a pair of components, see if one of them is a prefix and the other one is +//a base letter that can follow the prefix. Return true if it is, otherwise false. +boolean PrefixBaseletterMatch(String prefix, String baseletter) +{ + char c; + if(prefix.length()!=1) return false; //No prefix. + else c = prefix.charAt(0); + + switch(c){ + case 'g': + if( (baseletter.equals("c")) || + (baseletter.equals("ny")) || + (baseletter.equals("t")) || + (baseletter.equals("d")) || + (baseletter.equals("n")) || + (baseletter.equals("ts")) || + (baseletter.equals("zh")) || + (baseletter.equals("z")) || + (baseletter.equals("sh")) || + (baseletter.equals("s")) || + (baseletter.equals(".y")) ) + return true; + else return false; + + case 'd': + if( (baseletter.equals("k")) || + (baseletter.equals("p")) || + (baseletter.equals("g")) || + (baseletter.equals("b")) || + (baseletter.equals("ng")) || + (baseletter.equals("m")) ) + return true; + else return false; + + case 'b': + if( (baseletter.equals("c")) || + (baseletter.equals("g")) || + (baseletter.equals("t")) || + (baseletter.equals("d")) || + (baseletter.equals("ts")) || + (baseletter.equals("zh")) || + (baseletter.equals("z")) || + (baseletter.equals("sh")) || + (baseletter.equals("s")) || + (baseletter.equals("k")) ) + return true; + else return false; + + case 'm': + if( (baseletter.equals("kh")) || + (baseletter.equals("ch")) || + (baseletter.equals("th")) || + (baseletter.equals("tsh")) || + (baseletter.equals("g")) || + (baseletter.equals("j")) || + (baseletter.equals("d")) || + (baseletter.equals("dz")) || + (baseletter.equals("ng")) || + (baseletter.equals("ny")) || + (baseletter.equals("n")) ) + return true; + else return false; + + case '\'': + if( (baseletter.equals("kh")) || + (baseletter.equals("ch")) || + (baseletter.equals("th")) || + (baseletter.equals("ph")) || + (baseletter.equals("tsh")) || + (baseletter.equals("g")) || + (baseletter.equals("j")) || + (baseletter.equals("d")) || + (baseletter.equals("b")) || + (baseletter.equals("dz") )) + return true; + else return false; + + + } + return false; +} + + +//To examine a pair of components, see if one of them is a subscript and the other one is +//a base letter that can be followed by the subscript. Return true if it is, otherwise false. +boolean BaseletterSubscriptMatch(String baseletter, String subscript) +{ + char c; + if(subscript.length()!=1) return false; //No subscript. + else c = subscript.charAt(0); + + switch(c){ + case 'y': + if( (baseletter.equals("k")) || + (baseletter.equals("kh")) || + (baseletter.equals("g")) || + (baseletter.equals("p")) || + (baseletter.equals("ph")) || + (baseletter.equals("b")) || + (baseletter.equals("m")) ) + return true; + else return false; + + case 'r': + if( (baseletter.equals("k")) || + (baseletter.equals("t")) || + (baseletter.equals("p")) || + (baseletter.equals("kh")) || + (baseletter.equals("ph")) || + (baseletter.equals("g")) || + (baseletter.equals("d")) || + (baseletter.equals("b")) || + (baseletter.equals("h")) || + (baseletter.equals("m")) || + (baseletter.equals("s"))) + return true; + else return false; + + + case 'l': + if( (baseletter.equals("k")) || + (baseletter.equals("g")) || + (baseletter.equals("b")) || + (baseletter.equals("r")) || + (baseletter.equals("s")) || + (baseletter.equals("z") )) + return true; + else return false; + + case 'w': + if( (baseletter.equals("k")) || + (baseletter.equals("kh")) || + (baseletter.equals("g")) || + (baseletter.equals("ny")) || + (baseletter.equals("d")) || + (baseletter.equals("ch")) || + (baseletter.equals("zh")) || + (baseletter.equals("z")) || + (baseletter.equals("r")) || + (baseletter.equals("l")) || + (baseletter.equals("sh")) || + (baseletter.equals("s")) || + (baseletter.equals("h") )) + return true; + else return false; + + } + return false; +} + + +//To examine a pair of components, see if one of them is a superscript and the other one is +//a base letter that can follow the superscript. Return true if it is, otherwise false. +boolean SuperscriptBaseletterMatch(String superscript, String baseletter) +{ + char c; + if(superscript.length()!=1) return false; //No superscript. + else c = superscript.charAt(0); + + switch(c){ + case 'r': + if( (baseletter.equals("k")) || + (baseletter.equals("t")) || + (baseletter.equals("ts")) || + (baseletter.equals("g")) || + (baseletter.equals("j")) || + (baseletter.equals("d")) || + (baseletter.equals("b")) || + (baseletter.equals("dz")) || + (baseletter.equals("ng")) || + (baseletter.equals("ny")) || + (baseletter.equals("n")) || + (baseletter.equals("m") )) + return true; + else return false; + + case 'l': + if( (baseletter.equals("k")) || + (baseletter.equals("c")) || + (baseletter.equals("t")) || + (baseletter.equals("p")) || + (baseletter.equals("g")) || + (baseletter.equals("j")) || + (baseletter.equals("d")) || + (baseletter.equals("b")) || + (baseletter.equals("ng")) || + (baseletter.equals("h") )) + return true; + else return false; + + case 's': + if( (baseletter.equals("k")) || + (baseletter.equals("t")) || + (baseletter.equals("p")) || + (baseletter.equals("ts")) || + (baseletter.equals("g")) || + (baseletter.equals("d")) || + (baseletter.equals("b")) || + (baseletter.equals("ng")) || + (baseletter.equals("ny")) || + (baseletter.equals("n")) || + (baseletter.equals("m"))) + return true; + else return false; + + } + return false; +} + + + +//Assign values for Tibetan Wylie characters for comparison. +int ValueOfTibetanCharacter(String theCharacter){ + + if(theCharacter == null ) return 0; + if(theCharacter.equals("$")) return 0; // For non-presence. + + if(theCharacter.equals("k")) return 1; + if(theCharacter.equals("kh")) return 2; + if(theCharacter.equals("g")) return 3; + if(theCharacter.equals("ng")) return 4; + if(theCharacter.equals("c")) return 5; + if(theCharacter.equals("ch")) return 6; + if(theCharacter.equals("j")) return 7; + if(theCharacter.equals("ny")) return 8; + + + if(theCharacter.equals("T")) return 9; + if(theCharacter.equals("Th")) return 10; + if(theCharacter.equals("D")) return 11; + if(theCharacter.equals("N")) return 12; + + if(theCharacter.equals("t")) return 13; + if(theCharacter.equals("th")) return 14; + if(theCharacter.equals("d")) return 15; + if(theCharacter.equals("n")) return 16; + if(theCharacter.equals("p")) return 17; + if(theCharacter.equals("ph")) return 18; + if(theCharacter.equals("b")) return 19; + if(theCharacter.equals("m")) return 20; + if(theCharacter.equals("ts")) return 21; + if(theCharacter.equals("tsh")) return 22; + if(theCharacter.equals("dz")) return 23; + if(theCharacter.equals("w")) return 24; + if(theCharacter.equals("zh")) return 25; + if(theCharacter.equals("z")) return 26; + if(theCharacter.equals("'")) return 27; + if(theCharacter.equals("y")) return 28; + if(theCharacter.equals(".y")) return 28; + if(theCharacter.equals("r")) return 29; + if(theCharacter.equals("l")) return 30; + if(theCharacter.equals("sh")) return 31; + if(theCharacter.equals("Sh")) return 32; + if(theCharacter.equals("s")) return 33; + if(theCharacter.equals("h")) return 34; + if(theCharacter.equals("a")) return 35; + +// if(theCharacter.equals("a")) return 41; + if(theCharacter.equals("A")) return 42; + if(theCharacter.equals("i")) return 43; + if(theCharacter.equals("I")) return 44; + if(theCharacter.equals("u")) return 47; + if(theCharacter.equals("U")) return 48; + if(theCharacter.equals("-i")) return 45; + if(theCharacter.equals("-I")) return 46; + if(theCharacter.equals("e")) return 49; + if(theCharacter.equals("ai")) return 50; + if(theCharacter.equals("o")) return 51; + if(theCharacter.equals("au")) return 52; + + if(theCharacter.equals("invalid")) return 100; + + return 100; +} + + +//This is the key function in the class, which extracts the components of a syllable +//from the Wylie string of the syllable and put them into the order in that we compare +//syllables each other. + +void ItsComponents(){ + + String thisString; + String SyllableByComponents[] = new String[100]; // Syllable consist of and ordered by components represented + // by Tibetan Wylie characters. Assume there are no more than + Components = new String[10][20]; // 20 components in a syllable. + + + int s = 0; + nComponents = 0; // Number of Tibetan characters represented by Wylie system in the syllable. + int i=0; + + //Cut the String of the syllable into the consequence of Tibetan Wylie characters of the syllable. + while ( i= (i+j); j++){ + thisString = theSyllable.substring(i,i+j); + if ( IsThisTibetanOrSanskritSymbol(thisString) ) { s = j; continue;} + if ( theSyllable.length() > (i+j) && j<3 ) continue; + if ( s != 0) break; + else { InValidSyllable(); return; } + } + if ( s == 0) { InValidSyllable(); return; } + + if(theSyllable.substring(i,i+s).equals("+")) { s=0; continue;} //Take off the Sanskrit stacking symbol "+" from the String. + + SyllableByComponents[nComponents++] = theSyllable.substring(i,i+s); + i = i + s; + s = 0; + } + + + int nVowel=0; // Number of vowels in a syllable. + + int nCBV[] = new int[6]; // Number of components before a vowel, assume there are 5 vowels in the syllable. + // Normallly, there is only one vowel, sometimes two vowels in a syllable. + int nCAV[] = new int[6]; // Number of components after vowel, assume there are 5 vowels in the syllable. + // Normallly, there is only one vowel, sometimes two vewls in a syllable. + boolean SanskritFlag = false; // Is the syllable Sanskrit? + + TibetanSyllableFlag = true; + + //Calculate nVowel, nCBV and nCAV. + for(i=0; i=2 ) { //For more than two vowel Tibetan syllable, like "nga'i", "tshu'u": + + int StartPoint = nCBV[0]; + + for(int j=0; j