added a few stylesheets for lucene-based indexing of THDL's avdb

This commit is contained in:
eg3p 2007-05-15 17:48:15 +00:00
parent 3dd452a298
commit f8a97cce4e
3 changed files with 138 additions and 3 deletions

View file

@ -0,0 +1,16 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Writes a plain-text list with one line per <transcript> element found
  anywhere in the input document. Each line is the value of the "prefix"
  stylesheet parameter immediately followed by the string value of the
  element, terminated by a line feed.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">

  <xsl:output method="text" encoding="utf-8"/>

  <!-- Text placed in front of every transcript value; empty unless the caller supplies it. -->
  <xsl:param name="prefix" select="''"/>

  <!-- Entry point: visit every <transcript> in document order, whatever its depth. -->
  <xsl:template match="/">
    <xsl:apply-templates select="//transcript"/>
  </xsl:template>

  <!-- One output line per transcript: prefix + element text + LF (&#10;). -->
  <xsl:template match="transcript">
    <xsl:value-of select="concat($prefix, ., '&#10;')"/>
  </xsl:template>

</xsl:stylesheet>

View file

@ -0,0 +1,109 @@
<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:thdl="java:org.thdl.tib.text.ttt.EwtsToUnicodeForXslt"
exclude-result-prefixes="thdl"
version="2.0">
<!-- <xsl:param name="mediaref"/> -->
<!-- Document node: hand every top-level node to the templates in this stylesheet. -->
<xsl:template match="/">
  <xsl:apply-templates select="node()"/>
</xsl:template>
<!-- <xsl:template match="SOUNDFILE">
<SOUNDFILE href="{$mediaref}"/>
</xsl:template> -->
<!-- TEXT: emit a fresh TEXT element holding the processed children.
     Attributes on the source TEXT element are not carried over. -->
<xsl:template match="TEXT">
  <xsl:element name="TEXT">
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!--
  S: rebuild the element with a freshly generated id.
  All source attributes except "id" are passed through the attribute
  templates, then a new id attribute is written from generate-id(), and
  finally the child elements are processed. Only child ELEMENTS are
  selected, so text directly inside S is not copied.
-->
<xsl:template match="S">
  <xsl:element name="S">
    <!-- Filter on the attribute NAME. The earlier predicate not(. = 'id')
         tested the attribute VALUE, so it still selected the source id
         attribute and dropped any attribute whose value was literally "id". -->
    <xsl:apply-templates select="@*[name() != 'id']"/>
    <!-- generate-id() is assigned by the processor; treat the value as an
         opaque identifier that may differ between transformation runs. -->
    <xsl:attribute name="id">
      <xsl:value-of select="generate-id(.)"/>
    </xsl:attribute>
    <xsl:apply-templates select="*"/>
  </xsl:element>
</xsl:template>
<!--
  FORM: replace one source FORM with two FORM elements.
    1. xml:lang="bo"      : the result of thdl:convertEwtsTo() applied to the
                            element's text, with error tokens removed.
    2. xml:lang="bo-Latn" : the element's original text, unchanged.
  NOTE(review): thdl:convertEwtsTo is a Java extension function; judging by
  the bound class name it converts EWTS transliteration to Unicode. Confirm
  against the Java source.
-->
<xsl:template match="FORM">
  <xsl:variable name="wylie" select="string(.)"/>
  <xsl:variable name="rawOutput" select="thdl:convertEwtsTo($wylie)"/>
  <!-- Split the converter output at every "[" or "]" and keep only the
       pieces that do not start with "#ERROR". -->
  <xsl:variable name="kept"
                select="tokenize($rawOutput, '[\[\]]')[not(starts-with(., '#ERROR'))]"/>
  <FORM xml:lang="bo">
    <xsl:value-of select="string-join($kept, '')"/>
  </FORM>
  <!-- Disabled alternative: report each "#ERROR" piece instead of dropping it.
       <ERROR number="{substring(., 8, 3)}" offense="{substring-before(substring-after(., '{'), '}')}">
       <MSG><xsl:value-of select="."/></MSG>
       <SRC><xsl:value-of select="$wylie"/></SRC>
       </ERROR>
  -->
  <FORM xml:lang="bo-Latn">
    <xsl:value-of select="$wylie"/>
  </FORM>
</xsl:template>
<!-- TRANSL: re-emit as TRANSL tagged xml:lang="en"; children are processed
     by the other templates. Source attributes are not carried over. -->
<xsl:template match="TRANSL">
  <xsl:element name="TRANSL">
    <xsl:attribute name="xml:lang">en</xsl:attribute>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!-- TRANSL_ZH: renamed to TRANSL on output and tagged xml:lang="zh";
     children are processed by the other templates. -->
<xsl:template match="TRANSL_ZH">
  <xsl:element name="TRANSL">
    <xsl:attribute name="xml:lang">zh</xsl:attribute>
    <xsl:apply-templates select="node()"/>
  </xsl:element>
</xsl:template>
<!--
  SPEAKER: emit a SPEAKER element tagged xml:lang="bo" whose content is the
  result of thdl:convertEwtsTo() applied to the source text, with error
  tokens removed. The original text is kept in the "wylie" attribute and the
  source personId attribute is copied across (it is written as an empty
  attribute when the source element has none).
-->
<xsl:template match="SPEAKER">
  <xsl:variable name="wylie" select="string(.)"/>
  <xsl:variable name="rawOutput" select="thdl:convertEwtsTo($wylie)"/>
  <!-- Split the converter output at every "[" or "]" and keep only the
       pieces that do not start with "#ERROR". -->
  <xsl:variable name="kept"
                select="tokenize($rawOutput, '[\[\]]')[not(starts-with(., '#ERROR'))]"/>
  <SPEAKER xml:lang="bo" personId="{@personId}" wylie="{$wylie}">
    <xsl:value-of select="string-join($kept, '')"/>
    <!-- Disabled alternative: report each "#ERROR" piece instead of dropping it.
         <ERROR number="{substring(., 8, 3)}" offense="{substring-before(substring-after(., '{'), '}')}">
         <MSG><xsl:value-of select="."/></MSG>
         <SRC><xsl:value-of select="$wylie"/></SRC>
         </ERROR>
    -->
  </SPEAKER>
</xsl:template>
<!-- Fallback for every attribute and every child node that has no more
     specific template: shallow-copy it, then process its attributes and
     children so the specific templates still apply further down. -->
<xsl:template match="@* | node()">
  <xsl:copy>
    <xsl:apply-templates select="node() | @*"/>
  </xsl:copy>
</xsl:template>
<!-- <xsl:template match="*">
<xsl:element name="{name(.)}">
<xsl:for-each select="@*">
<xsl:attribute name="{name(.)}">
<xsl:value-of select="."/>
</xsl:attribute>
</xsl:for-each>
<xsl:apply-templates/>
</xsl:element>
</xsl:template> -->
</xsl:stylesheet>

View file

@ -1,6 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<project name="thdl-concordancer" default="index-for-solr" basedir=".">
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
<import file="build.xml"/>
@ -9,6 +9,7 @@
<!-- Sub-directories of ${archive}. -->
<property name="wylie" location="${archive}/wylie"/>
<property name="unicode" location="${archive}/unicode"/>
<property name="solr" location="${archive}/solr"/>
<!-- Directory the archive-get-transcripts target loads get-list-of-transcripts.xsl from. -->
<property name="styles" location="${archive}/styles"/>
<!-- URL downloaded by a <get> task into ${archive}/title_metadata.xml. -->
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
<!-- Base URL; archive-get-transcripts appends "/" and passes it to the stylesheet as the "prefix" parameter. -->
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
<!-- NOTE(review): no use of this property is visible in this part of the build file. -->
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
@ -69,15 +70,24 @@
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
</target>
<!-- Runs get-list-of-transcripts.xsl over ${archive}/title_metadata.xml and
     writes the result to ${archive}/titles_as_list.txt. The transcripts base
     URL, with a trailing slash, is passed as the stylesheet's "prefix"
     parameter. This target declares no dependencies, so the metadata file
     must already exist when it runs. -->
<target name="archive-get-transcripts">
  <xslt style="${styles}/get-list-of-transcripts.xsl"
        in="${archive}/title_metadata.xml"
        out="${archive}/titles_as_list.txt">
    <param name="prefix" expression="${url.to.transcripts}/"/>
  </xslt>
</target>
<!--
<filelist
id="docfiles"
dir="${doc.src}">
<file name="foo.xml"/>
<file name="bar.xml"/>
</filelist>
</filelist>
<filterreader classname="org.apache.tools.ant.filters.PrefixLines">
<param name="prefix" value="Foo"/>
</filterreader>
-->
<!-- concordance program -->
<target name="lucene-thdl-compile" depends="init">
<mkdir dir="${lucene-thdl.bin}"/>