<?xml version="1.0" encoding="utf-8"?>
|
|
|
|
<!--
|
|
Some THDL titles are problematic:
|
|
* 691, A New Script, claims to have a transcript but there is no
|
|
transcript at the URL.
|
|
* 2069, Husked Barley, has a zero KB transcript.
|
|
* 2116, Nasal Congestion, also has a zero KB transcript.
|
|
Each of these is handled here as a title with metadata but no data,
|
|
just like any other title without a transcription.
|
|
-->
|
|
|
|
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
|
<!-- Bring in everything declared by the main build file. -->
<import file="build.xml"/>

<!-- Register the ant-contrib tasks; the foreach task used further down in
     this file is not a core Ant task and comes from this jar. -->
<taskdef resource="net/sf/antcontrib/antcontrib.properties"
         classpath="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
|
|
|
|
<!-- Remote THDL endpoints. Only get.title.metadata and url.to.transcripts
     are referenced by the targets visible in this file. -->
<property name="get.title.metadata"
          value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<property name="url.to.transcripts"
          value="http://www.thdl.org/avarch/transcripts"/>
<property name="url.to.media.high"
          value="http://www.thdl.org/media/high"/>
<property name="url.to.media.low"
          value="http://www.thdl.org/media/low"/>
<property name="url.to.media.audio"
          value="http://www.thdl.org/media/audio"/>

<!-- Local directories. ${bin} is not defined in this file. The archive
     property must stay ahead of the three properties derived from it. -->
<property name="lucene-thdl.bin"
          location="${bin}/lucene-thdl"/>
<property name="archive"
          location="archive"/>
<property name="styles"
          location="${archive}/styles"/>
<property name="transcripts"
          location="${archive}/transcripts-with-metadata"/>
<property name="solr"
          location="${archive}/solr"/>
|
|
|
|
<!-- All jars directly under ${ext}/apache; used as the javac classpath by
     the lucene-thdl-compile target. -->
<path id="lucene.classpath">
  <fileset id="lucene.extensions" dir="${ext}/apache" includes="*.jar"/>
</path>
|
|
|
|
<!-- All jars directly under ${ext}/saxon; used as the classpath when
     running net.sf.saxon.Transform. -->
<path id="saxon.classpath">
  <fileset id="saxon.extensions" dir="${ext}/saxon" includes="*.jar"/>
</path>
|
|
|
|
<!-- Runs the jskad-dist target with an empty my.jar.suffix parameter.
     jskad-dist is not defined in this file (presumably it comes from the
     imported build.xml; confirm). -->
<target name="compile-and-jar-libraries">
  <antcall target="jskad-dist">
    <param name="my.jar.suffix" value=""/>
  </antcall>
</target>
|
|
|
|
<!-- archive tasks -->
|
|
<!-- Creates ${archive} if needed, then downloads the title metadata from
     ${get.title.metadata} into ${archive}/title_metadata.xml. -->
<target name="archive-get-metadata">
  <mkdir dir="${archive}"/>
  <get src="${get.title.metadata}"
       dest="${archive}/title_metadata.xml"
       verbose="on"/>
</target>
|
|
|
|
<!-- FIXME: due to encoding issues, transcripts whose filenames need to be
     URL-encoded are not retrieved correctly by this task; for example, the
     umlaut in title 00007 breaks retrieval. -->
|
|
<!-- Runs the Saxon command-line transformer in a forked JVM, applying
     mergeMetadataAndData.xsl to ${archive}/title_metadata.xml. The
     libraries are built first because Jskad.jar is on the classpath. -->
<target name="archive-get-and-transform-data" depends="compile-and-jar-libraries">
  <mkdir dir="${transcripts}"/>
  <java classname="net.sf.saxon.Transform" fork="yes">
    <!-- -s: the source document -->
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>

    <!-- -o: the output file. Judging by its name, this one exists only to
         anchor a base URI inside ${transcripts}; confirm against the
         stylesheet. -->
    <arg value="-o"/>
    <arg value="${transcripts}/DUMMY_FOR_BASE_URI"/>

    <!-- the stylesheet, followed by a name=value parameter for it -->
    <arg value="${styles}/mergeMetadataAndData.xsl"/>
    <arg value="transcript.location=${url.to.transcripts}/"/>

    <classpath>
      <path refid="saxon.classpath"/>
      <pathelement location="${vanillalib}/Jskad.jar"/>
    </classpath>
  </java>
</target>
|
|
|
|
<!-- solr tasks -->
|
|
<!-- Applies solarizeTranscript.xsl to the files under ${transcripts} and
     writes the results, with an .xml extension, into ${solr}. -->
<target name="solr-prepare-transcripts">
  <mkdir dir="${solr}"/>
  <xslt basedir="${transcripts}"
        destdir="${solr}"
        extension=".xml"
        style="${styles}/solarizeTranscript.xsl"/>
</target>
|
|
|
|
|
|
<!-- Transforms ${archive}/title_metadata.xml into
     ${archive}/titles_as_list.txt with get-list-of-transcripts.xsl, using
     Ant's xslt task. -->
<target name="archive-get-list-of-transcripts">
  <!-- Disabled alternative that ran the same transformation through the
       Saxon command line:
  <java classname="net.sf.saxon.Transform" fork="yes">
    <arg value="-s"/>
    <arg value="${archive}/title_metadata.xml"/>
    <arg value="-o"/>
    <arg value="${archive}/titles_as_list.txt"/>
    <arg value="${styles}/get-list-of-transcripts.xsl"/>
    <classpath>
      <path refid="saxon.classpath"/>
    </classpath>
  </java>
  -->
  <xslt in="${archive}/title_metadata.xml"
        out="${archive}/titles_as_list.txt"
        style="${styles}/get-list-of-transcripts.xsl"/>
</target>
|
|
|
|
<!-- Reads a space-delimited list of transcript filenames and calls
     archive-get-one-transcript once per entry, passing it as "filename".

     NOTE(review): the list is read from titles_as_list2.txt, whereas
     archive-get-list-of-transcripts writes titles_as_list.txt. Confirm
     whether the "2" file is a deliberately separate list. ${wylie} is not
     defined in this file either. -->
<target name="archive-get-transcripts">
  <mkdir dir="${wylie}"/>

  <!-- An encoding="UTF-8" attribute was considered here but is not set. -->
  <loadfile property="transcript-list"
            srcFile="${archive}/titles_as_list2.txt"/>

  <foreach list="${transcript-list}"
           delimiter=" "
           param="filename"
           target="archive-get-one-transcript"/>

  <!-- Disabled variant that passed an id plus the filename:
  <foreach list="${transcript-list}" delimiter=" " param="id.plus.filename" target="archive-get-one-transcript"/>
  -->
</target>
|
|
|
|
<!-- Downloads one transcript, named by ${filename}, from
     ${url.to.transcripts} into ${wylie}. Download errors are ignored, so a
     missing transcript does not stop the enclosing loop. -->
<target name="archive-get-one-transcript">
  <!-- Disabled: derive the filename from an "id/filename" value.
  <propertyregex property="filename" input="${id.plus.filename}" regexp=".+/(.+)" select="\1"/>
  -->
  <get src="${url.to.transcripts}/${filename}"
       dest="${wylie}/${filename}"
       ignoreerrors="true"/>
</target>
|
|
|
|
|
|
|
|
<!-- solr stuff -->
|
|
<!-- <target name="solr-1:clean-local" depends="clean">
|
|
<delete dir="${solarized.transcript.dir.prefinal}"/>
|
|
<delete dir="${solarized.transcript.dir.final}"/>
|
|
<mkdir dir="${solarized.transcript.dir.prefinal}"/>
|
|
<mkdir dir="${solarized.transcript.dir.final}"/>
|
|
</target>-->
|
|
|
|
<!-- <target name="solr-2:prepare-documents" depends="init, dbxml-4:ngram-magic">-->
|
|
<!-- create xml data file used to assign tags to mono tsheg bars -->
|
|
<!--
|
|
<java classname="net.sf.saxon.Transform" fork="yes">
|
|
<arg value="-o"/>
|
|
<arg value="${build.dir}/tshegbartags.xml"/>
|
|
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
|
|
<arg value="${stylesheet.dir}/prepareTshegBarTagger.xsl"/>
|
|
<classpath>
|
|
<pathelement location="${bin.dir}"/>
|
|
<path refid="classpath"/>
|
|
</classpath>
|
|
</java>
|
|
-->
|
|
|
|
<!-- create xml file used to assign synonyms to certain mono tsheg bars -->
|
|
<!--
|
|
<java classname="net.sf.saxon.Transform" fork="yes">
|
|
<arg value="-o"/>
|
|
<arg value="${build.dir}/synonyms.xml"/>
|
|
<arg value="${lexicon.dir}/lhasa-verbs.xml"/>
|
|
<arg value="${stylesheet.dir}/makeSynonymFile.xsl"/>
|
|
<classpath>
|
|
<pathelement location="${bin.dir}"/>
|
|
<path refid="classpath"/>
|
|
</classpath>
|
|
</java>
|
|
-->
|
|
|
|
<!-- <loadfile property="xquery" srcfile="${stylesheet.dir}/solarizeTranscriptDatabase.xql" encoding="UTF-8"/>
|
|
<java classname="org.thdl.dbxml.QueryTools" fork="yes">
|
|
<arg value="${dbxml.environment.dir}"/>
|
|
<arg value="${dbxml.container}"/>
|
|
<arg value="${xquery}"/>
|
|
<arg value="${solarized.transcript.dir.prefinal}"/>
|
|
<classpath>
|
|
<pathelement location="${bin.dir}"/>
|
|
<path refid="classpath"/>
|
|
</classpath>
|
|
<jvmarg value="-Djava.library.path=${dbxml.lib}"/>
|
|
</java>-->
|
|
|
|
<!-- insert whether or not media exists for segment -->
|
|
<!-- <java classname="net.sf.saxon.Transform" fork="yes">
|
|
<arg value="-o"/>
|
|
<arg value="${solarized.transcript.dir.final}"/>
|
|
<arg value="${solarized.transcript.dir.prefinal}"/>
|
|
<arg value="${stylesheet.dir}/movieInfoToSolrAdd.xsl"/>
|
|
<classpath>
|
|
<pathelement location="${bin.dir}"/>
|
|
<path refid="classpath"/>
|
|
</classpath>
|
|
</java>
|
|
|
|
<copy todir="${solarized.transcript.dir.final}">
|
|
<fileset dir="." includes="*.sh"/>
|
|
</copy>
|
|
</target>-->
|
|
|
|
<!--
|
|
This is because the example schema.xml specifies a "uniqueKey" field called "id".
|
|
Whenever you POST instructions to Solr to add a document with the same value for
|
|
the uniqueKey as an existing document, it automatically replaces it for you.
|
|
|
|
However, for us, uniqueKeys combine the document id (which won't change) with
|
|
sentence ids (which may change), e.g. <field name="id">2291_d629e12</field>.
|
|
|
|
So, to replace, we'll find XML document by name (document id) in dbxml database,
|
|
then get all sentence ids for that document, then combine docId_sentenceId and
|
|
remove/replace from lucene.
|
|
-->
|
|
<!--
|
|
<target name="solr-3:commit-documents">
|
|
<exec executable="sh" dir="${solarized.transcript.dir.final}">
|
|
<arg value="post.sh"/>
|
|
<arg value="*.xml"/>
|
|
</exec>
|
|
</target>
|
|
|
|
<target name="solr-4:delete-documents" depends="solr-1:clean-local">
|
|
<exec executable="curl">
|
|
<arg value="${solr.update}"/>
|
|
<arg value="-data-binary"/> should have double dash at beginning
|
|
<arg value="<delete><query>id:[* TO *]</query></delete>"/>
|
|
</exec>
|
|
<exec executable="curl">
|
|
<arg value="${solr.update}"/>
|
|
<arg value="-data-binary"/> double dash again!
|
|
<arg value="<commit/>"/>
|
|
</exec>
|
|
</target>
|
|
-->
|
|
|
|
<!-- concordance program -->
|
|
<!--
  Compiles the org.thdl.lucene sources found under ${source} into
  ${lucene-thdl.bin}, with debug information, against the jars in
  lucene.classpath. Runs after the init target, which is not defined in
  this file.

  The include pattern is "**/*.java" on purpose: in Ant patterns "**" only
  spans directories when it is a whole path segment, so the former "**.java"
  behaved like "*.java" and selected only the files directly inside
  org/thdl/lucene, skipping any subpackages.
-->
<target name="lucene-thdl-compile" depends="init">
  <mkdir dir="${lucene-thdl.bin}"/>
  <javac srcdir="${source}"
         destdir="${lucene-thdl.bin}"
         includes="org/thdl/lucene/**/*.java"
         debug="on">
    <classpath refid="lucene.classpath"/>
  </javac>
</target>
|
|
|
|
<!-- Default target of this project: compiles the concordance classes and
     then packages ${lucene-thdl.bin} as ${vanillalib}/lucene-thdl.jar. -->
<target name="lucene-thdl-jar" depends="lucene-thdl-compile">
  <jar destfile="${vanillalib}/lucene-thdl.jar"
       basedir="${lucene-thdl.bin}/"/>
</target>
|
|
|
|
</project>
|