changes to build file and xsl transformations--more reliance on saxon and XSLT 2.0

This commit is contained in:
eg3p 2007-05-22 08:56:06 +00:00
parent 0440ef5ffb
commit dfce6c24e8

View file

@ -33,76 +33,196 @@
</fileset> </fileset>
</path> </path>
<!--
<target name="segment-videos">
<fileset dir="${segmentation.instructions.dir}" id="segmentation.instructions">
<include name="**/*.xml"/>
</fileset>
<pathconvert pathsep=" " property="about.to.segment" refid="segmentation.instructions"/>
<java classname="fieldling.quicktime.MovieSegmenter" dir="${segmented.video.dir}" fork="yes">
<arg line="${about.to.segment}"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
</target>
-->
<!-- archive tasks --> <!-- archive tasks -->
<!-- FIXME: title_metadata.xml is not being retrieved as UTF-8!! -->
<target name="archive-get-metadata"> <target name="archive-get-metadata">
<mkdir dir="${archive}"/> <mkdir dir="${archive}"/>
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/> <get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
</target> </target>
<target name="archive-get-list-of-transcripts"> <target name="archive-get-list-of-transcripts">
<mkdir dir="${wylie}"/> <!-- <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-s"/>
<arg value="${archive}/title_metadata.xml"/>
<arg value="-o"/>
<arg value="${archive}/titles_as_list.txt"/>
<arg value="${styles}/get-list-of-transcripts.xsl"/>
<classpath>
<path refid="saxon.classpath"/>
</classpath>
</java>-->
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/> <xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/>
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list2.txt" style="${styles}/get-list-of-transcripts2.xsl"/>
</target> </target>
<target name="archive-get-transcripts"> <target name="archive-get-transcripts">
<loadfile property="transcript-list" srcFile="${archive}/titles_as_list.txt"/> <!-- encoding="UTF-8" --> <mkdir dir="${wylie}"/>
<loadfile property="transcript-list" srcFile="${archive}/titles_as_list2.txt"/> <!-- encoding="UTF-8" -->
<foreach list="${transcript-list}" delimiter=" " param="filename" target="archive-get-one-transcript"/> <foreach list="${transcript-list}" delimiter=" " param="filename" target="archive-get-one-transcript"/>
<!--<foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-get-one-transcript"/>-->
</target> </target>
<target name="archive-get-one-transcript"> <target name="archive-get-one-transcript">
<!--<propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>-->
<get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/> <get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/>
</target> </target>
<!-- saxon-test: creates ${unicode}, then runs net.sf.saxon.Transform in a forked JVM
     with ${archive}/title_metadata.xml as the source and
     ${styles}/mergeMetadataAndData.xsl as the stylesheet. The two trailing
     name=value arguments are handed to Saxon after the stylesheet; presumably they
     are stylesheet parameters (transcript location URL and output directory).
     TODO confirm against mergeMetadataAndData.xsl.
     The classpath is saxon.classpath plus ${vanillalib}/Jskad.jar. -->
<target name="saxon-test">
<mkdir dir="${unicode}"/>
<java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-s"/>
<arg value="${archive}/title_metadata.xml"/>
<arg value="${styles}/mergeMetadataAndData.xsl"/>
<arg value="transcript.location=http://www.thdl.org/avarch/transcripts/"/>
<arg value="transform.to.dir=${unicode}"/>
<classpath>
<path refid="saxon.classpath"/>
<pathelement location="${vanillalib}/Jskad.jar"/>
</classpath>
</java>
</target>
<target name="archive-transcripts-to-unicode"> <target name="archive-transcripts-to-unicode">
<mkdir dir="${unicode}"/> <mkdir dir="${unicode}"/>
<antcall target="jskad-dist"> <antcall target="jskad-dist">
<param name="my.jar.suffix" value=""/> <param name="my.jar.suffix" value=""/>
</antcall> </antcall>
<!-- why not use ant task for saxon?? --> <mkdir dir="${lucene-thdl.bin}"/>
<java classname="net.sf.saxon.Transform" fork="yes"> <javac srcdir="${source}" destdir="${lucene-thdl.bin}" includes="org/thdl/util/ant/SaxonLiaison.java" debug="on"/>
<arg value="-o"/> <loadfile property="transcript-list" srcFile="${archive}/titles_as_list.txt"/> <!-- encoding="UTF-8" -->
<arg value="${unicode}"/> <foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-one-transcript-to-unicode"/>
<arg value="${wylie}"/> </target>
<arg value="${styles}/qdToUnicode.xsl"/>
<target name="archive-one-transcript-to-unicode">
<propertyregex property="title.id" input="${id.plus.filename}" regexp="(.+)/.+" select="\1"/>
<propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>
<!-- Note: the custom processor org.thdl.util.ant.SaxonLiaison is a hack used here
to work around Ant bug #41314: http://issues.apache.org/bugzilla/show_bug.cgi?id=41314
See Trevor's bike shed: http://www.vocaro.com/trevor/blog/2007/01/08/how-to-use-saxon-with-ant/
-->
<if>
<available file="${wylie}/${filename}" property="transcript.exists"/>
<then>
<xslt in="${wylie}/${filename}" out="${unicode}/${filename}" style="${styles}/qdToUnicode.xsl" processor="org.thdl.util.ant.SaxonLiaison">
<param name="title.id" expression="${title.id}"/>
<classpath> <classpath>
<pathelement location="${lucene-thdl.bin}"/>
<pathelement location="${vanillalib}/Jskad.jar"/> <pathelement location="${vanillalib}/Jskad.jar"/>
<path refid="saxon.classpath"/> <path refid="saxon.classpath"/>
</classpath> </classpath>
</java> </xslt>
</then>
</if>
</target> </target>
<!--
<target name="archive-transcripts-to-unicode">
<!-- solr tasks -->
<!-- Placeholder: this target currently contains no tasks. -->
<target name="solr-prepare-transcripts">
</target>
<!-- solr stuff -->
<!-- <target name="solr-1:clean-local" depends="clean">
<delete dir="${solarized.transcript.dir.prefinal}"/>
<delete dir="${solarized.transcript.dir.final}"/>
<mkdir dir="${solarized.transcript.dir.prefinal}"/>
<mkdir dir="${solarized.transcript.dir.final}"/>
</target>-->
<!-- <target name="solr-2:prepare-documents" depends="init, dbxml-4:ngram-magic">-->
<!-- create xml data file used to assign tags to mono tsheg bars -->
<!--
<java classname="net.sf.saxon.Transform" fork="yes"> <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/> <arg value="-o"/>
<arg value="${unicode.transcript.dir}"/> <arg value="${build.dir}/tshegbartags.xml"/>
<arg value="${wylie.transcript.dir}"/> <arg value="${lexicon.dir}/lhasa-verbs.xml"/>
<arg value="${stylesheet.dir}/qdToUnicode.xsl"/> <arg value="${stylesheet.dir}/prepareTshegBarTagger.xsl"/>
<classpath> <classpath>
<pathelement location="${bin.dir}"/> <pathelement location="${bin.dir}"/>
<path refid="classpath"/> <path refid="classpath"/>
</classpath> </classpath>
</java> </java>
</target>
--> -->
<!-- create xml file used to assign synonyms to certain mono tsheg bars -->
<!--
<java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${build.dir}/synonyms.xml"/>
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
<arg value="${stylesheet.dir}/makeSynonymFile.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
-->
<!-- <loadfile property="xquery" srcfile="${stylesheet.dir}/solarizeTranscriptDatabase.xql" encoding="UTF-8"/>
<java classname="org.thdl.dbxml.QueryTools" fork="yes">
<arg value="${dbxml.environment.dir}"/>
<arg value="${dbxml.container}"/>
<arg value="${xquery}"/>
<arg value="${solarized.transcript.dir.prefinal}"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
<jvmarg value="-Djava.library.path=${dbxml.lib}"/>
</java>-->
<!-- insert whether or not media exists for segment -->
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
<arg value="-o"/>
<arg value="${solarized.transcript.dir.final}"/>
<arg value="${solarized.transcript.dir.prefinal}"/>
<arg value="${stylesheet.dir}/movieInfoToSolrAdd.xsl"/>
<classpath>
<pathelement location="${bin.dir}"/>
<path refid="classpath"/>
</classpath>
</java>
<copy todir="${solarized.transcript.dir.final}">
<fileset dir="." includes="*.sh"/>
</copy>
</target>-->
<!--
This is because the example schema.xml specifies a "uniqueKey" field called "id".
Whenever you POST instructions to Solr to add a document with the same value for
the uniqueKey as an existing document, it automatically replaces it for you.
However, for us, uniqueKeys combine the document id (which won't change) with
sentence ids (which may change), e.g. <field name="id">2291_d629e12</field>.
So, to replace, we'll find XML document by name (document id) in dbxml database,
then get all sentence ids for that document, then combine docId_sentenceId and
remove/replace from lucene.
-->
<!--
<target name="solr-3:commit-documents">
<exec executable="sh" dir="${solarized.transcript.dir.final}">
<arg value="post.sh"/>
<arg value="*.xml"/>
</exec>
</target>
<target name="solr-4:delete-documents" depends="solr-1:clean-local">
<exec executable="curl">
<arg value="${solr.update}"/>
<arg value="-data-binary"/> NOTE: curl expects this flag with two leading dashes; only one is written here because an XML comment cannot contain a double dash
<arg value="&lt;delete&gt;&lt;query&gt;id:[* TO *]&lt;/query&gt;&lt;/delete&gt;"/>
</exec>
<exec executable="curl">
<arg value="${solr.update}"/>
<arg value="-data-binary"/> NOTE: as above, restore the second leading dash if this target is uncommented
<arg value="&lt;commit/&gt;"/>
</exec>
</target>
-->
<!-- concordance program --> <!-- concordance program -->
<target name="lucene-thdl-compile" depends="init"> <target name="lucene-thdl-compile" depends="init">
<mkdir dir="${lucene-thdl.bin}"/> <mkdir dir="${lucene-thdl.bin}"/>