added a few stylesheets for lucene-based indexing of THDL's avdb
This commit is contained in:
parent
3dd452a298
commit
f8a97cce4e
3 changed files with 138 additions and 3 deletions
16
archive/styles/get-list-of-transcripts.xsl
Normal file
16
archive/styles/get-list-of-transcripts.xsl
Normal file
|
@ -0,0 +1,16 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" version="1.0">
|
||||
<xsl:output method="text" encoding="utf-8"/>
|
||||
|
||||
<xsl:param name="prefix" select="''"/>
|
||||
|
||||
<!-- Entry point: visit every <transcript> element anywhere in the input
     document, in document order. All other content is ignored. -->
<xsl:template match="/">
    <xsl:apply-templates select="descendant::transcript"/>
</xsl:template>
|
||||
|
||||
<!-- Emits one line of text per <transcript>: the value of the $prefix
     stylesheet parameter immediately followed by the element's string value.
     The line terminator is written as a character reference so that
     re-indenting this stylesheet can never leak whitespace into the
     generated list. -->
<xsl:template match="transcript">
    <xsl:value-of select="$prefix"/>
    <xsl:value-of select="."/>
    <xsl:text>&#10;</xsl:text>
</xsl:template>
|
||||
</xsl:stylesheet>
|
109
archive/styles/qdToUnicode.xsl
Normal file
109
archive/styles/qdToUnicode.xsl
Normal file
|
@ -0,0 +1,109 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
|
||||
<xsl:stylesheet
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:thdl="java:org.thdl.tib.text.ttt.EwtsToUnicodeForXslt"
|
||||
exclude-result-prefixes="thdl"
|
||||
version="2.0">
|
||||
|
||||
<!-- <xsl:param name="mediaref"/> -->
|
||||
|
||||
<!-- Entry point: hand every child node of the document root to the
     matching templates below. -->
<xsl:template match="/">
    <xsl:apply-templates select="node()"/>
</xsl:template>
|
||||
|
||||
<!-- <xsl:template match="SOUNDFILE">
|
||||
<SOUNDFILE href="{$mediaref}"/>
|
||||
</xsl:template> -->
|
||||
|
||||
<!-- Rebuilds the TEXT element around its transformed children.
     Attributes on the source TEXT element are not carried over. -->
<xsl:template match="TEXT">
    <xsl:element name="TEXT">
        <xsl:apply-templates select="node()"/>
    </xsl:element>
</xsl:template>
|
||||
|
||||
<!-- Rebuilds each S element with a freshly generated id.
     - Every attribute except the one *named* "id" is passed through the
       attribute templates (the identity template copies them).
     - A new id attribute is created from generate-id().
     - Only child elements are processed; text directly inside S is dropped.
     The attribute filter tests name(), not the attribute value: the earlier
     test (. = 'id') discarded any attribute whose value was the string "id"
     and did not actually exclude the id attribute itself. -->
<xsl:template match="S">
    <xsl:element name="S">
        <xsl:apply-templates select="@*[not(name() = 'id')]"/>
        <xsl:attribute name="id">
            <xsl:value-of select="generate-id(.)"/>
        </xsl:attribute>
        <xsl:apply-templates select="*"/>
    </xsl:element>
</xsl:template>
|
||||
|
||||
<!-- Replaces each FORM with two FORM elements:
       1. xml:lang="bo"      - the result of thdl:convertEwtsTo() on the
                               element's string value;
       2. xml:lang="bo-Latn" - the original string value, unchanged.
     The converter output is split on '[' and ']'. Any piece that starts
     with '#ERROR' is replaced by the placeholder mark; all other pieces
     are emitted verbatim. -->
<xsl:template match="FORM">
    <xsl:variable name="source" select="string(.)"/>
    <xsl:variable name="unicode" select="thdl:convertEwtsTo($source)"/>
    <FORM xml:lang="bo">
        <!-- separator="" is required: the default separator for a sequence
             selected by xsl:value-of is a single space. -->
        <xsl:value-of separator=""
                      select="for $piece in tokenize($unicode, '[\[\]]')
                              return if (starts-with($piece, '#ERROR')) then '࿐' else $piece"/>
        <!-- Disabled alternative: report each failed piece as markup
             instead of the placeholder mark.
             <ERROR number="{substring(., 8, 3)}" offense="{substring-before(substring-after(., '{'), '}')}">
                 <MSG><xsl:value-of select="."/></MSG>
                 <SRC><xsl:value-of select="$source"/></SRC>
             </ERROR>
        -->
    </FORM>
    <FORM xml:lang="bo-Latn">
        <xsl:value-of select="$source"/>
    </FORM>
</xsl:template>
|
||||
|
||||
<!-- Rebuilds TRANSL with xml:lang="en" and transformed children.
     Attributes on the source element are not carried over. -->
<xsl:template match="TRANSL">
    <xsl:element name="TRANSL">
        <xsl:attribute name="xml:lang">en</xsl:attribute>
        <xsl:apply-templates select="node()"/>
    </xsl:element>
</xsl:template>
|
||||
|
||||
<!-- Renames TRANSL_ZH to TRANSL and tags it with xml:lang="zh"; children
     are transformed as usual. Attributes on the source element are not
     carried over. -->
<xsl:template match="TRANSL_ZH">
    <xsl:element name="TRANSL">
        <xsl:attribute name="xml:lang">zh</xsl:attribute>
        <xsl:apply-templates select="node()"/>
    </xsl:element>
</xsl:template>
|
||||
|
||||
<!-- Rebuilds SPEAKER with:
       xml:lang = "bo"
       personId = the source personId attribute (empty if absent)
       wylie    = the source element's string value
     The element content is the result of thdl:convertEwtsTo() on that
     string value, split on '[' and ']'. Pieces starting with '#ERROR' are
     replaced by the placeholder mark; all others are emitted verbatim. -->
<xsl:template match="SPEAKER">
    <xsl:variable name="source" select="string(.)"/>
    <xsl:variable name="unicode" select="thdl:convertEwtsTo($source)"/>
    <SPEAKER xml:lang="bo" personId="{@personId}" wylie="{$source}">
        <!-- separator="" is required: the default separator for a sequence
             selected by xsl:value-of is a single space. -->
        <xsl:value-of separator=""
                      select="for $piece in tokenize($unicode, '[\[\]]')
                              return if (starts-with($piece, '#ERROR')) then '࿐' else $piece"/>
        <!-- Disabled alternative: report each failed piece as markup
             instead of the placeholder mark.
             <ERROR number="{substring(., 8, 3)}" offense="{substring-before(substring-after(., '{'), '}')}">
                 <MSG><xsl:value-of select="."/></MSG>
                 <SRC><xsl:value-of select="$source"/></SRC>
             </ERROR>
        -->
    </SPEAKER>
</xsl:template>
|
||||
|
||||
<!-- Identity rule: anything not handled by a more specific template
     (elements, attributes, text, comments, processing instructions) is
     copied as-is, recursing into its attributes and children. -->
<xsl:template match="@*|node()">
    <xsl:copy>
        <xsl:apply-templates select="node()|@*"/>
    </xsl:copy>
</xsl:template>
|
||||
|
||||
<!-- <xsl:template match="*">
|
||||
<xsl:element name="{name(.)}">
|
||||
<xsl:for-each select="@*">
|
||||
<xsl:attribute name="{name(.)}">
|
||||
<xsl:value-of select="."/>
|
||||
</xsl:attribute>
|
||||
</xsl:for-each>
|
||||
<xsl:apply-templates/>
|
||||
</xsl:element>
|
||||
</xsl:template> -->
|
||||
|
||||
</xsl:stylesheet>
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version="1.0" encoding="utf-8"?>
|
||||
|
||||
<project name="thdl-concordancer" default="index-for-solr" basedir=".">
|
||||
<project name="lucene-thdl" default="lucene-thdl-jar" basedir=".">
|
||||
|
||||
<import file="build.xml"/>
|
||||
|
||||
|
@ -9,6 +9,7 @@
|
|||
<property name="wylie" location="${archive}/wylie"/>
|
||||
<property name="unicode" location="${archive}/unicode"/>
|
||||
<property name="solr" location="${archive}/solr"/>
|
||||
<property name="styles" location="${archive}/styles"/>
|
||||
<property name="get.title.metadata" value="http://thdl.org/avarch/mediaflowcat/title_metadata.php"/>
|
||||
<property name="url.to.transcripts" value="http://www.thdl.org/avarch/transcripts"/>
|
||||
<property name="url.to.media.high" value="http://www.thdl.org/media/high"/>
|
||||
|
@ -69,6 +70,12 @@
|
|||
<get src="${get.title.metadata}" dest="${archive}/title_metadata.xml" verbose="on"/>
|
||||
</target>
|
||||
|
||||
<!-- Runs get-list-of-transcripts.xsl over ${archive}/title_metadata.xml and
     writes the result to ${archive}/titles_as_list.txt. The stylesheet's
     "prefix" parameter is set to ${url.to.transcripts} plus a trailing
     slash. This target only builds the list; it does not fetch anything. -->
<target name="archive-get-transcripts">
    <xslt style="${styles}/get-list-of-transcripts.xsl"
          in="${archive}/title_metadata.xml"
          out="${archive}/titles_as_list.txt">
        <param name="prefix" expression="${url.to.transcripts}/"/>
    </xslt>
</target>
|
||||
|
||||
<!--
|
||||
<filelist
|
||||
id="docfiles"
|
||||
|
@ -76,8 +83,11 @@
|
|||
<file name="foo.xml"/>
|
||||
<file name="bar.xml"/>
|
||||
</filelist>
|
||||
-->
|
||||
|
||||
<filterreader classname="org.apache.tools.ant.filters.PrefixLines">
|
||||
<param name="prefix" value="Foo"/>
|
||||
</filterreader>
|
||||
-->
|
||||
<!-- concordance program -->
|
||||
<target name="lucene-thdl-compile" depends="init">
|
||||
<mkdir dir="${lucene-thdl.bin}"/>
|
||||
|
|
Loading…
Reference in a new issue