Added Saxon 8 to the repository to support XSLT 2.0 transformations; these are used to transform THDL's Wylie transcripts into Unicode transcripts.
This commit is contained in:
parent
4f553caf54
commit
0440ef5ffb
3 changed files with 54 additions and 44 deletions
|
@ -1,6 +1,9 @@
|
||||||
<?xml version="1.0" encoding="UTF-8"?>
|
<?xml version="1.0" encoding="UTF-8"?>
|
||||||
|
|
||||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
|
<xsl:stylesheet version="1.0"
|
||||||
|
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||||
|
xmlns:java="java" >
|
||||||
|
|
||||||
<xsl:output method="text" encoding="utf-8"/>
|
<xsl:output method="text" encoding="utf-8"/>
|
||||||
|
|
||||||
<xsl:template match="/">
|
<xsl:template match="/">
|
||||||
|
@ -8,6 +11,6 @@
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
|
|
||||||
<xsl:template match="transcript">
|
<xsl:template match="transcript">
|
||||||
<xsl:value-of select="."/><xsl:text> </xsl:text>
|
<xsl:value-of select="java:net.URLEncoder.encode(.,'UTF-8')"/><xsl:text> </xsl:text>
|
||||||
</xsl:template>
|
</xsl:template>
|
||||||
</xsl:stylesheet>
|
</xsl:stylesheet>
|
||||||
|
|
BIN
extensions/saxon/saxon.jar
Normal file
BIN
extensions/saxon/saxon.jar
Normal file
Binary file not shown.
|
@ -3,10 +3,10 @@
|
||||||
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
||||||
<import file="build.xml"/>
|
<import file="build.xml"/>
|
||||||
|
|
||||||
<taskdef resource="net/sf/antcontrib/antcontrib.properties">
|
<taskdef resource="net/sf/antcontrib/antcontrib.properties">
|
||||||
<classpath>
|
<classpath>
|
||||||
<pathelement location="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
|
<pathelement location="${ext}/to-be-installed-with-ant/ant-contrib.jar"/>
|
||||||
</classpath>
|
</classpath>
|
||||||
</taskdef>
|
</taskdef>
|
||||||
|
|
||||||
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
|
<property name="lucene-thdl.bin" location="${bin}/lucene-thdl"/>
|
||||||
|
@ -27,6 +27,12 @@
|
||||||
</fileset>
|
</fileset>
|
||||||
</path>
|
</path>
|
||||||
|
|
||||||
|
<path id="saxon.classpath">
|
||||||
|
<fileset id="saxon.extensions" dir="${ext}/saxon">
|
||||||
|
<include name="*.jar"/>
|
||||||
|
</fileset>
|
||||||
|
</path>
|
||||||
|
|
||||||
<!--
|
<!--
|
||||||
<target name="segment-videos">
|
<target name="segment-videos">
|
||||||
<fileset dir="${segmentation.instructions.dir}" id="segmentation.instructions">
|
<fileset dir="${segmentation.instructions.dir}" id="segmentation.instructions">
|
||||||
|
@ -43,6 +49,45 @@
|
||||||
</target>
|
</target>
|
||||||
-->
|
-->
|
||||||
|
|
||||||
|
<!-- archive tasks -->
|
||||||
|
|
||||||
|
<!-- FIXME: title_metadata.xml is not being retrieved as UTF-8!! -->
|
||||||
|
<target name="archive-get-metadata">
|
||||||
|
<mkdir dir="${archive}"/>
|
||||||
|
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="archive-get-list-of-transcripts">
|
||||||
|
<mkdir dir="${wylie}"/>
|
||||||
|
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="archive-get-transcripts">
|
||||||
|
<loadfile property="transcript-list" srcFile="${archive}/titles_as_list.txt"/> <!-- encoding="UTF-8" -->
|
||||||
|
<foreach list="${transcript-list}" delimiter=" " param="filename" target="archive-get-one-transcript"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="archive-get-one-transcript">
|
||||||
|
<get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}" ignoreerrors="true"/>
|
||||||
|
</target>
|
||||||
|
|
||||||
|
<target name="archive-transcripts-to-unicode">
|
||||||
|
<mkdir dir="${unicode}"/>
|
||||||
|
<antcall target="jskad-dist">
|
||||||
|
<param name="my.jar.suffix" value=""/>
|
||||||
|
</antcall>
|
||||||
|
<!-- why not use ant task for saxon?? -->
|
||||||
|
<java classname="net.sf.saxon.Transform" fork="yes">
|
||||||
|
<arg value="-o"/>
|
||||||
|
<arg value="${unicode}"/>
|
||||||
|
<arg value="${wylie}"/>
|
||||||
|
<arg value="${styles}/qdToUnicode.xsl"/>
|
||||||
|
<classpath>
|
||||||
|
<pathelement location="${vanillalib}/Jskad.jar"/>
|
||||||
|
<path refid="saxon.classpath"/>
|
||||||
|
</classpath>
|
||||||
|
</java>
|
||||||
|
</target>
|
||||||
<!--
|
<!--
|
||||||
|
|
||||||
<target name="archive-transcripts-to-unicode">
|
<target name="archive-transcripts-to-unicode">
|
||||||
|
@ -58,44 +103,6 @@
|
||||||
</java>
|
</java>
|
||||||
</target>
|
</target>
|
||||||
-->
|
-->
|
||||||
|
|
||||||
<!-- archive tasks -->
|
|
||||||
|
|
||||||
<!-- title_metadata.xml is not being retrieved as UTF-8. need to fix this.
|
|
||||||
perhaps we don't need to decodeUTF the transcript file name???
|
|
||||||
-->
|
|
||||||
<target name="archive-get-metadata">
|
|
||||||
<mkdir dir="${archive}"/>
|
|
||||||
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<target name="archive-get-list-of-transcripts">
|
|
||||||
<mkdir dir="${wylie}"/>
|
|
||||||
<xslt in="${archive}/title_metadata.xml" out="${archive}/titles_as_list.txt" style="${styles}/get-list-of-transcripts.xsl"/>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<!-- must delete following transcripts from titles_as_list.txt
|
|
||||||
02069_clip-21-husked-barle_00.xml
|
|
||||||
02116_nasal-congestion_00.xml
|
|
||||||
A_New_Script02.xml
|
|
||||||
|
|
||||||
00007_06-dawa-and-purdrön_08.xml
|
|
||||||
or, should put in protection against URLs that cannot be "got"
|
|
||||||
-->
|
|
||||||
<target name="archive-get-transcripts">
|
|
||||||
<loadfile property="transcript-list" srcFile="${archive}/titles_as_list.txt"/>
|
|
||||||
<!--<loadfile property="transcript-list" srcFile="${archive}/titles_as_list.txt" encoding="UTF-8"/>-->
|
|
||||||
<foreach list="${transcript-list}" delimiter=" " param="filename" target="archive-get-one-transcript"/>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
<!-- problem: destination filename should not be URLEncoded -->
|
|
||||||
<target name="archive-get-one-transcript">
|
|
||||||
<!--<urlencode property="file.location" value="00007_06-dawa-and-purdrön_08.xml" />
|
|
||||||
<get src="${url.to.transcripts}/${file.location}" dest="${wylie}/TEST.xml"/>-->
|
|
||||||
<get src="${url.to.transcripts}/${filename}" dest="${wylie}/${filename}"/>
|
|
||||||
</target>
|
|
||||||
|
|
||||||
|
|
||||||
<!-- concordance program -->
|
<!-- concordance program -->
|
||||||
<target name="lucene-thdl-compile" depends="init">
|
<target name="lucene-thdl-compile" depends="init">
|
||||||
<mkdir dir="${lucene-thdl.bin}"/>
|
<mkdir dir="${lucene-thdl.bin}"/>
|
||||||
|
|
Loading…
Reference in a new issue