Conflicts: source/net/yacy/document/TextParser.javapull/1/head
parent
9ef5a80f4e
commit
bcbd8eee33
@ -0,0 +1,970 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<stylesheet
|
||||
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
|
||||
version="1.0" xmlns:h="http://www.w3.org/1999/xhtml"
|
||||
xmlns="http://www.w3.org/1999/XSL/Transform"
|
||||
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
|
||||
xmlns:xalan="http://xml.apache.org/xalan"
|
||||
xmlns:java="http://xml.apache.org/xalan/java"
|
||||
exclude-result-prefixes="java"
|
||||
>
|
||||
|
||||
|
||||
<!-- Version 0.21 by Fabien.Gandon@sophia.inria.fr -->
|
||||
<!-- This software is distributed under either the CeCILL-C license or the
|
||||
GNU Lesser General Public License version 3 license. -->
|
||||
<!-- This program is free software: you can redistribute it and/or modify
|
||||
it under the terms of the GNU Lesser General Public License -->
|
||||
<!-- as published by the Free Software Foundation version 3 of the License
|
||||
or under the terms of the CeCILL-C license. -->
|
||||
<!-- This program is distributed in the hope that it will be useful, but
|
||||
WITHOUT ANY WARRANTY; without even the implied -->
|
||||
<!-- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -->
|
||||
<!-- See the GNU Lesser General Public License version 3 at http://www.gnu.org/licenses/ -->
|
||||
<!-- and the CeCILL-C license at http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html
|
||||
for more details -->
|
||||
|
||||
|
||||
<output indent="yes" method="xml" media-type="application/rdf+xml"
|
||||
encoding="UTF-8" omit-xml-declaration="yes" />
|
||||
|
||||
<!-- href of the first base element found in a head element of the source document (empty node-set if none) -->
<variable name="html_base" select="//*/h:head/h:base[1]/@href" />

<!-- namespace used for predicates that carry no prefix of their own -->
<variable name="default_voc" select="'http://www.w3.org/1999/xhtml/vocab#'" />

<!-- parser instance handed to every java: extension call; defaults to the empty string -->
<param name="parser" select="''" />

<!-- URL of the page being transformed, when the caller of the stylesheet supplies it -->
<param name="url" select="''" />
|
||||
|
||||
<!-- URL of the source document, e.g. http://example.org/bla/file.html.
     The base href declared in the document wins; the 'url' parameter is
     only used when no base href is present. -->
<variable name="this">
  <choose>
    <when test="string-length($html_base) = 0">
      <value-of select="$url" />
    </when>
    <otherwise>
      <value-of select="$html_base" />
    </otherwise>
  </choose>
</variable>
|
||||
|
||||
<!-- directory part of $this, e.g. http://example.org/bla/ (computed by get-location) -->
<variable name="this_location">
  <call-template name="get-location">
    <with-param name="url" select="$this" />
  </call-template>
</variable>
|
||||
|
||||
<!-- scheme and host part of $this, e.g. http://example.org/ (computed by get-root) -->
<variable name="this_root">
  <call-template name="get-root">
    <with-param name="url" select="$this" />
  </call-template>
</variable>
|
||||
|
||||
|
||||
<!-- templates for parsing - - - - - - - - - - - - - - - - - - - - - - -
|
||||
- - - - - - - - - - - - - -->
|
||||
|
||||
<!-- Entry point: everything produced for the document is wrapped in one rdf:RDF element -->
<template match="/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <!-- a dedicated mode keeps these rules from clashing with other XSLT templates -->
    <apply-templates mode="rdf2rdfxml" />
  </rdf:RDF>
</template>
|
||||
|
||||
|
||||
<!-- Matches every element carrying RDFa markup (property, rel, rev or typeof).
     For such an element it
       1. determines the subject (template 'subject'),
       2. emits object-property statements for rel / rev (template 'relrev'),
       3. emits data-property statements for property (template 'property'),
       4. emits type statements for typeof (template 'class', defined elsewhere in this stylesheet),
       5. continues with the element content in the same mode. -->
<template
    match="*[@property or @rel or @rev or @typeof]"
    mode="rdf2rdfxml">

  <!-- identify subject -->
  <variable name="subject">
    <call-template name="subject" />
  </variable>

  <!-- do we have object properties? -->
  <if test="string-length(@rel)>0 or string-length(@rev)>0">
    <!-- identify the object(s): resource, then href, then nested RDFa
         elements, and finally the element itself -->
    <variable name="object">
      <choose>
        <when test="@resource">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="@resource" />
          </call-template>
        </when>
        <when test="@href">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="@href" />
          </call-template>
        </when>
        <when
            test="descendant::*[@about or @src or @typeof or
                                @href or @resource or
                                @rel or @rev or @property]">
          <call-template name="recurse-objects" />
        </when>
        <otherwise>
          <call-template name="self-curie-or-uri">
            <with-param name="node" select="." />
          </call-template>
        </otherwise>
      </choose>
    </variable>

    <call-template name="relrev">
      <with-param name="subject" select="$subject" />
      <with-param name="object" select="$object" />
    </call-template>
  </if>

  <!-- do we have data properties? -->
  <if test="string-length(@property)>0">

    <!-- Language of the literal: the xml:lang of the NEAREST element that
         declares one. ancestor-or-self is a reverse axis, so [1] is the
         closest element; the former expression took the first attribute in
         document order, i.e. the outermost declaration. -->
    <variable name="language"
        select="string(ancestor-or-self::*[@xml:lang][1]/@xml:lang)" />

    <!-- Datatype declared on the element, expanded to a URI; empty when the
         datatype attribute is missing or empty. -->
    <variable name="declared-datatype">
      <if test="string-length(@datatype)>0">
        <call-template name="expand-ns">
          <with-param name="qname" select="@datatype" />
        </call-template>
      </if>
    </variable>

    <choose>
      <when test="@content"> <!-- there is a specific content -->
        <call-template name="property">
          <with-param name="subject" select="$subject" />
          <with-param name="object" select="@content" />
          <!-- no declared datatype means a plain literal -->
          <with-param name="datatype" select="$declared-datatype" />
          <with-param name="predicate" select="@property" />
          <with-param name="attrib" select="'true'" />
          <with-param name="language" select="$language" />
        </call-template>
      </when>
      <when test="not(*)"> <!-- no specific content and no child elements in the content -->
        <call-template name="property">
          <with-param name="subject" select="$subject" />
          <with-param name="object" select="." />
          <!-- no declared datatype means a plain literal -->
          <with-param name="datatype" select="$declared-datatype" />
          <with-param name="predicate" select="@property" />
          <with-param name="attrib" select="'true'" />
          <with-param name="language" select="$language" />
        </call-template>
      </when>
      <otherwise> <!-- no specific content; the value of the element is used -->
        <call-template name="property">
          <with-param name="subject" select="$subject" />
          <with-param name="object" select="." />
          <with-param name="datatype">
            <choose>
              <when test="string-length($declared-datatype)>0">
                <value-of select="$declared-datatype" />
              </when>
              <!-- enforcing XML literal; emitted through 'text' so that no
                   surrounding whitespace ends up in the datatype value, which
                   is later compared for exact equality with this URI -->
              <otherwise>
                <text>http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral</text>
              </otherwise>
            </choose>
          </with-param>
          <with-param name="predicate" select="@property" />
          <with-param name="attrib" select="'false'" />
          <with-param name="language" select="$language" />
        </call-template>
      </otherwise>
    </choose>
  </if>

  <!-- do we have classes? -->
  <if test="@typeof">
    <call-template name="class">
      <with-param name="resource">
        <call-template name="self-curie-or-uri">
          <with-param name="node" select="." />
        </call-template>
      </with-param>
      <with-param name="class" select="@typeof" />
    </call-template>
  </if>

  <apply-templates mode="rdf2rdfxml" />

</template>
|
||||
|
||||
|
||||
<!-- named templates to process URIs and token lists - - - - - - - - - -
|
||||
- - - - - - - - - - - - - - - - - - - - - - - - - - -->
|
||||
|
||||
<!-- Walks a space-delimited string token by token and writes each token to
     the output. Tokens are written back to back, without any separator. -->
<template name="tokenize">
  <param name="string" />
  <choose>
    <!-- at least one delimiter left: emit the head, recurse on the tail -->
    <when test="contains($string,' ')">
      <variable name="head" select="substring-before($string,' ')" />
      <variable name="tail" select="substring-after($string,' ')" />
      <value-of select="normalize-space($head)" />
      <call-template name="tokenize">
        <with-param name="string" select="normalize-space($tail)" />
      </call-template>
    </when>
    <!-- last token -->
    <when test="string-length($string) &gt; 0">
      <value-of select="$string" />
    </when>
  </choose>
</template>
|
||||
|
||||
<!-- Returns the given URL up to and including its last '/', i.e. the URL
     without its final path segment. Works by copying one '/'-terminated
     segment per recursion step; whatever follows the last '/' is dropped. -->
<template name="get-location">
  <param name="url" />
  <!-- a string that contains '/' is necessarily non-empty -->
  <if test="contains($url,'/')">
    <variable name="segment" select="substring-before($url,'/')" />
    <variable name="remainder" select="substring-after($url,'/')" />
    <value-of select="concat($segment,'/')" />
    <call-template name="get-location">
      <with-param name="url" select="$remainder" />
    </call-template>
  </if>
</template>
|
||||
|
||||
<!-- Returns the root of a URL: scheme, '//', authority and a closing '/',
     e.g. http://example.org/bla/file.html gives http://example.org/ .
     A URL that stops right after the authority (http://example.org) is
     handled as well; previously it produced 'http:///' because there was no
     '/' to cut at. When the URL contains no '//' the literal marker
     UNKNOWN ROOT is returned, without surrounding whitespace. -->
<template name="get-root">
  <param name="url" />
  <choose>
    <when test="contains($url,'//')">
      <variable name="scheme" select="substring-before($url,'//')" />
      <variable name="after-scheme" select="substring-after($url,'//')" />
      <!-- authority = everything up to the first '/', or the whole rest
           when the URL has no path at all -->
      <variable name="authority">
        <choose>
          <when test="contains($after-scheme,'/')">
            <value-of select="substring-before($after-scheme,'/')" />
          </when>
          <otherwise>
            <value-of select="$after-scheme" />
          </otherwise>
        </choose>
      </variable>
      <value-of select="concat($scheme,'//',$authority,'/')" />
    </when>
    <otherwise>
      <text>UNKNOWN ROOT</text>
    </otherwise>
  </choose>
</template>
|
||||
|
||||
<!-- Returns the namespace URI bound to the prefix of a qname, looked up on
     the element the template is called for.
       - qname with a prefix ('dc:title'): the URI bound to that prefix
       - qname without a prefix, or starting with ':': the default namespace
     Nothing is returned when no matching binding is in scope.

     The lookup uses the namespace nodes of the nearest element only. Every
     element carries a namespace node for each binding in scope, so this is
     the binding that actually applies. The former lookup over all ancestors
     returned the first node in document order, i.e. the outermost
     declaration, which is wrong when a prefix is redeclared further down. -->
<template name="return-ns">
  <param name="qname" />
  <variable name="ns_prefix" select="substring-before($qname,':')" />
  <!-- the default namespace node has an empty name, so an empty prefix
       selects it without a separate branch -->
  <value-of
      select="ancestor-or-self::*[1]/namespace::*[name()=$ns_prefix]" />
</template>
|
||||
|
||||
|
||||
<!-- Expands a qname into namespace URI + local part, where the local part is
     whatever follows the first ':' of the qname.
       - with a prefix: always emits concat(URI, local part); the URI is empty
         when the prefix is not bound
       - without a prefix: emits only when a default namespace is in scope

     The namespace is taken from the nearest element (see the namespace axis
     of XPath 1.0), so a prefix redeclared on an inner element resolves to
     the inner binding; the former lookup picked the outermost one.

     NOTE(review): for a qname with no ':' at all the local part is empty, so
     only the default namespace URI is emitted. This is kept as is; confirm
     with the callers whether the bare name should be appended. -->
<template name="expand-ns">
  <param name="qname" />
  <variable name="ns_prefix" select="substring-before($qname,':')" />
  <variable name="name" select="substring-after($qname,':')" />
  <variable name="ns_uri"
      select="ancestor-or-self::*[1]/namespace::*[name()=$ns_prefix]" />
  <if test="string-length($ns_prefix)>0 or $ns_uri">
    <value-of select="concat($ns_uri,$name)" />
  </if>
</template>
|
||||
|
||||
<!-- Determines the URI, or blank node identifier, that stands for a node.
     The first rule that applies wins:
       1. about attribute
       2. src attribute
       3. resource attribute, when the node has neither rel nor rev
       4. href attribute, when the node has neither rel nor rev
       5. head, body and html elements stand for the document ($this)
       6. id attribute: $this + '#' + id
       7. otherwise a blank node: 'blank:node:' + generate-id($node)

     The blank node identifier is built with concat() so that it starts
     exactly with 'blank:node:'. As literal text spread over several lines it
     carried leading whitespace, and the starts-with(…,'blank:node:') checks
     made by the statement-generating templates did not recognise it. -->
<template name="self-curie-or-uri">
  <param name="node" />
  <choose>
    <when test="$node/@about"> <!-- we have an about attribute to extend -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/@about" />
      </call-template>
    </when>
    <when test="$node/@src"> <!-- we have an src attribute to extend -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/@src" />
      </call-template>
    </when>
    <when test="$node/@resource and not($node/@rel or $node/@rev)"> <!-- enforcing the resource as subject if no rel or rev -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/@resource" />
      </call-template>
    </when>
    <when test="$node/@href and not($node/@rel or $node/@rev)"> <!-- enforcing the href as subject if no rel or rev -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/@href" />
      </call-template>
    </when>
    <when test="$node/self::h:head or $node/self::h:body or $node/self::h:html"> <!-- enforcing the doc as subject -->
      <value-of select="$this" />
    </when>
    <when test="$node/@id"> <!-- we have an id attribute to extend -->
      <value-of select="concat($this,'#',$node/@id)" />
    </when>
    <otherwise>
      <value-of select="concat('blank:node:',generate-id($node))" />
    </otherwise>
  </choose>
</template>
|
||||
|
||||
|
||||
<!-- Turns the value of about / src / resource / href into an absolute
     identifier. Rules are tried in this order:
       [_:name]        blank node, returned as 'blank:node:name'
       [prefix:name]   safe CURIE, expanded through expand-ns
       #fragment       anchor inside the current document ($this)
       empty value     the current document ($this)
       contains ':'    taken as an absolute URI and returned unchanged
       no leading '/'  relative URL, resolved against $this_location
       leading '/'     URL relative to the site root, resolved against $this_root
     Anything else yields the marker text UNKNOWN CURIE URI. -->
<template name="expand-curie-or-uri">
  <param name="curie_or_uri" />
  <!-- text in front of the closing bracket of a [CURIE] -->
  <variable name="before-close" select="substring-before($curie_or_uri,']')" />
  <choose>
    <when test="starts-with($curie_or_uri,'[_:')">
      <value-of select="concat('blank:node:',substring-after($before-close,'[_:'))" />
    </when>
    <when test="starts-with($curie_or_uri,'[')">
      <call-template name="expand-ns">
        <with-param name="qname" select="substring-after($before-close,'[')" />
      </call-template>
    </when>
    <when test="starts-with($curie_or_uri,'#')">
      <value-of select="concat($this,$curie_or_uri)" />
    </when>
    <when test="string-length($curie_or_uri) = 0">
      <value-of select="$this" />
    </when>
    <when test="contains($curie_or_uri,':') and not(starts-with($curie_or_uri,'['))">
      <value-of select="$curie_or_uri" />
    </when>
    <when test="not(contains($curie_or_uri,'://') or starts-with($curie_or_uri,'/'))">
      <value-of select="concat($this_location,$curie_or_uri)" />
    </when>
    <when test="starts-with($curie_or_uri,'/') and not(contains($curie_or_uri,'://'))">
      <!-- $this_root already ends with '/', so the leading one is dropped -->
      <value-of select="concat($this_root,substring-after($curie_or_uri,'/'))" />
    </when>
    <otherwise>
      UNKNOWN CURIE URI
    </otherwise>
  </choose>
</template>
|
||||
|
||||
<!-- Returns the first item of a space-separated list: the text in front of
     the first space, or the whole string when it contains no space. -->
<template name="get-first-token">
  <param name="tokens" />
  <choose>
    <when test="contains($tokens,' ')">
      <variable name="first" select="substring-before($tokens,' ')" />
      <value-of select="normalize-space($first)" />
    </when>
    <!-- single token; an empty list simply yields nothing -->
    <when test="string-length($tokens) &gt; 0">
      <value-of select="$tokens" />
    </when>
  </choose>
</template>
|
||||
|
||||
<!-- Returns the namespace of a rel / rev value.
       - prefixed value (square brackets ignored when looking for the
         prefix): the namespace reported by return-ns
       - unprefixed value: $default_voc when check-reserved answers 'true'
         for the value (leading colon removed), nothing otherwise -->
<template name="get-relrev-ns">
  <param name="qname" />
  <variable name="ns_prefix"
      select="substring-before(translate($qname,'[]',''),':')" />
  <choose>
    <when test="string-length($ns_prefix) = 0">
      <!-- no prefix: only reserved values get a namespace -->
      <variable name="bare-name">
        <call-template name="no-leading-colon">
          <with-param name="name" select="$qname" />
        </call-template>
      </variable>
      <variable name="is-reserved">
        <call-template name="check-reserved">
          <with-param name="nonprefixed" select="$bare-name" />
        </call-template>
      </variable>
      <if test="$is-reserved='true'">
        <value-of select="$default_voc" />
      </if>
    </when>
    <otherwise>
      <call-template name="return-ns">
        <with-param name="qname" select="$qname" />
      </call-template>
    </otherwise>
  </choose>
</template>
|
||||
|
||||
<!-- Returns the namespace of a property value: the namespace reported by
     return-ns when the value has a prefix (square brackets ignored when
     looking for the prefix), $default_voc in every other case. -->
<template name="get-property-ns">
  <param name="qname" />
  <variable name="ns_prefix"
      select="substring-before(translate($qname,'[]',''),':')" />
  <choose>
    <when test="string-length($ns_prefix) = 0">
      <value-of select="$default_voc" />
    </when>
    <otherwise>
      <call-template name="return-ns">
        <with-param name="qname" select="$qname" />
      </call-template>
    </otherwise>
  </choose>
</template>
|
||||
|
||||
<!-- Returns the qname of a predicate with square brackets removed and
     without a leading colon. -->
<template name="get-predicate-name">
  <param name="qname" />
  <call-template name="no-leading-colon">
    <!-- translate() with an empty replacement deletes '[' and ']' -->
    <with-param name="name" select="translate($qname,'[]','')" />
  </call-template>
</template>
|
||||
|
||||
<!-- Returns the name without its leading ':' if it has one, unchanged otherwise. -->
<template name="no-leading-colon">
  <param name="name" />
  <choose>
    <when test="not(starts-with($name,':'))">
      <value-of select="$name" />
    </when>
    <otherwise>
      <!-- the colon is the first character, so the rest starts at position 2 -->
      <value-of select="substring($name,2)" />
    </otherwise>
  </choose>
</template>
|
||||
|
||||
<!-- Tells whether an unprefixed rel / rev value is one of the reserved link
     types. Returns exactly 'true' or 'false'.

     The answer is emitted through 'text' elements: get-relrev-ns compares
     the result with the string 'true', and an answer written as literal text
     on its own line carries the surrounding line breaks and indentation, so
     that comparison could never succeed. -->
<template name="check-reserved">
  <param name="nonprefixed" />
  <choose>
    <when
        test="$nonprefixed='alternate' or $nonprefixed='appendix' or $nonprefixed='bookmark' or $nonprefixed='cite'
           or $nonprefixed='chapter' or $nonprefixed='contents' or $nonprefixed='copyright' or $nonprefixed='first'
           or $nonprefixed='glossary' or $nonprefixed='help' or $nonprefixed='icon' or $nonprefixed='index'
           or $nonprefixed='last' or $nonprefixed='license' or $nonprefixed='meta' or $nonprefixed='next'
           or $nonprefixed='p3pv1' or $nonprefixed='prev' or $nonprefixed='role' or $nonprefixed='section'
           or $nonprefixed='stylesheet' or $nonprefixed='subsection' or $nonprefixed='start' or $nonprefixed='top'
           or $nonprefixed='up'">
      <text>true</text>
    </when>
    <!-- added because they are frequent -->
    <when
        test="$nonprefixed='made' or $nonprefixed='previous' or $nonprefixed='search'">
      <text>true</text>
    </when>
    <otherwise>
      <text>false</text>
    </otherwise>
  </choose>
</template>
|
||||
|
||||
<!-- named templates to generate RDF - - - - - - - - - - - - - - - - - -
|
||||
- - - - - - - - - - - - - - -->
|
||||
|
||||
<template name="recursive-copy">
  <!-- Full copy of the current node: the node itself is copied, then the
       template is applied again to each of its attributes and child nodes. -->
  <copy>
    <for-each select="@* | node()">
      <call-template name="recursive-copy" />
    </for-each>
  </copy>
</template>
|
||||
|
||||
|
||||
<template name="subject">
  <!-- Determines the subject of the statements produced for the current
       node. The first rule that applies wins:
         1. link / meta inside head without about: the document ($this)
         2. about attribute on the node
         3. src attribute on the node
         4. typeof attribute on the node: self-curie-or-uri of the node
         5. link / meta outside head without about: self-curie-or-uri of the parent
         6. nearest ancestor with about, src, typeof, resource, href, rel or rev
         7. otherwise the document ($this) -->

  <!-- nearest ancestor that can provide a subject (rule 6) -->
  <variable name="anchor"
      select="ancestor::*[@about or @src or @typeof or @resource or @href or @rel or @rev][1]" />
  <variable name="is-link-or-meta" select="self::h:link or self::h:meta" />

  <choose>

    <!-- current node is a meta or a link in the head and with no about attribute -->
    <when test="$is-link-or-meta and ancestor::h:head and not(@about)">
      <value-of select="$this" />
    </when>

    <!-- an attribute about was specified on the node -->
    <when test="@about">
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="@about" />
      </call-template>
    </when>

    <!-- an attribute src was specified on the node -->
    <when test="@src">
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="@src" />
      </call-template>
    </when>

    <!-- an attribute typeof was specified on the node -->
    <when test="@typeof">
      <call-template name="self-curie-or-uri">
        <with-param name="node" select="." />
      </call-template>
    </when>

    <!-- current node is a meta or a link in the body and with no about attribute -->
    <when test="$is-link-or-meta and not(ancestor::h:head) and not(@about)">
      <call-template name="self-curie-or-uri">
        <with-param name="node" select="parent::*" />
      </call-template>
    </when>

    <!-- the subject comes from the nearest ancestor carrying RDFa attributes -->
    <when test="$anchor">
      <choose>
        <!-- ancestor has rel or rev but neither resource nor href: the
             subject is the blank node that stands for its content -->
        <when test="($anchor/@rel or $anchor/@rev) and not($anchor/@resource or $anchor/@href)">
          <value-of select="concat('blank:node:INSIDE_',generate-id($anchor))" />
        </when>
        <when test="$anchor/@about">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$anchor/@about" />
          </call-template>
        </when>
        <when test="$anchor/@src">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$anchor/@src" />
          </call-template>
        </when>
        <when test="$anchor/@resource">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$anchor/@resource" />
          </call-template>
        </when>
        <when test="$anchor/@href">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$anchor/@href" />
          </call-template>
        </when>
        <otherwise>
          <call-template name="self-curie-or-uri">
            <with-param name="node" select="$anchor" />
          </call-template>
        </otherwise>
      </choose>
    </when>

    <!-- it must be about the current document -->
    <otherwise>
      <value-of select="$this" />
    </otherwise>

  </choose>
</template>
|
||||
|
||||
<!-- Collects the objects of a rel / rev that has neither resource nor href
     by walking the child elements of the current node. Each object found is
     written followed by one space, so the result is a space-separated list.
     Per child element:
       - about or src: that resource
       - resource or href without rel, rev or property: that resource
         (an incomplete triple)
       - typeof without about: the implicit resource of the child
       - rel, rev or property: the subject of the child, written only for the
         first such sibling
       - anything else: descend into the child -->
<template name="recurse-objects">
  <for-each select="*">
    <choose>
      <when test="@about or @src"> <!-- there is a known resource -->
        <call-template name="expand-curie-or-uri">
          <with-param name="curie_or_uri" select="@about | @src" />
        </call-template>
        <text> </text>
      </when>
      <when test="(@resource or @href) and not(@rel or @rev or @property)"> <!-- there is an incomplete triple -->
        <call-template name="expand-curie-or-uri">
          <with-param name="curie_or_uri" select="@resource | @href" />
        </call-template>
        <text> </text>
      </when>
      <when test="@typeof and not(@about)"> <!-- there is an implicit resource -->
        <call-template name="self-curie-or-uri">
          <with-param name="node" select="." />
        </call-template>
        <text> </text>
      </when>
      <when test="@rel or @rev or @property"> <!-- there is an implicit resource -->
        <!-- generate the triple only once -->
        <if test="not(preceding-sibling::*[@rel or @rev or @property])">
          <call-template name="subject" />
          <text> </text>
        </if>
      </when>
      <otherwise> <!-- nothing at that level thus consider children -->
        <call-template name="recurse-objects" />
      </otherwise>
    </choose>
  </for-each>
</template>
|
||||
|
||||
<!-- Produces the rel / rev statements of the current element for every
     object in a space-separated object list. The first object is handled
     here; the template then calls itself with the rest of the list.
       rel: subject, @rel, object
       rev: object, @rev, subject (roles swapped) -->
<template name="relrev">
  <param name="subject" />
  <param name="object" />

  <!-- first object of the list -->
  <variable name="single-object">
    <call-template name="get-first-token">
      <with-param name="tokens" select="$object" />
    </call-template>
  </variable>

  <if test="string-length(@rel) &gt; 0">
    <call-template name="relation">
      <with-param name="predicate" select="@rel" />
      <with-param name="subject" select="$subject" />
      <with-param name="object" select="$single-object" />
    </call-template>
  </if>

  <if test="string-length(@rev) &gt; 0">
    <call-template name="relation">
      <with-param name="predicate" select="@rev" />
      <with-param name="subject" select="$single-object" />
      <with-param name="object" select="$subject" />
    </call-template>
  </if>

  <!-- remaining objects, if any -->
  <variable name="other-objects"
      select="normalize-space(substring-after($object,' '))" />
  <if test="string-length($other-objects) &gt; 0">
    <call-template name="relrev">
      <with-param name="subject" select="$subject" />
      <with-param name="object" select="$other-objects" />
    </call-template>
  </if>

</template>
|
||||
|
||||
|
||||
<!-- Reports one object-property statement per predicate of a space-separated
     predicate list to the parser, through the static methods of
     net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.
     For the first predicate of the list:
       - when a namespace is found for it (get-relrev-ns), subject, property
         and object are set, in that order; values starting with
         'blank:node:' are passed as node identifiers, all others as URIs
       - otherwise a comment naming the statement is written to the output
     flushObjectProperty is called in both cases, then the template calls
     itself with the remaining predicates. -->
<template name="relation">
  <param name="subject" />
  <param name="predicate" />
  <param name="object" />

  <!-- first predicate of the list -->
  <variable name="single-predicate">
    <call-template name="get-first-token">
      <with-param name="tokens" select="$predicate" />
    </call-template>
  </variable>

  <!-- namespace of that predicate -->
  <variable name="predicate-ns">
    <call-template name="get-relrev-ns">
      <with-param name="qname" select="$single-predicate" />
    </call-template>
  </variable>

  <!-- name of that predicate -->
  <variable name="predicate-name">
    <call-template name="get-predicate-name">
      <with-param name="qname" select="$single-predicate" />
    </call-template>
  </variable>

  <choose>
    <when test="string-length($predicate-ns) = 0"> <!-- no namespace: generate a comment for debug -->
      <comment>
        No namespace for the rel or rev value ; could not produce the
        triple for:
        <value-of select="$subject" />
        -
        <value-of select="$single-predicate" />
        -
        <value-of select="$object" />
      </comment>
    </when>
    <otherwise> <!-- there is a known namespace for the predicate -->

      <!-- subject -->
      <choose>
        <when test="not(starts-with($subject,'blank:node:'))">
          <value-of
              select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectURI($parser,$subject)" />
        </when>
        <otherwise>
          <value-of
              select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectNodeID($parser,substring-after($subject,'blank:node:'))" />
        </otherwise>
      </choose>

      <!-- full predicate -->
      <variable name="expanded-predicate">
        <call-template name="expand-ns">
          <with-param name="qname" select="$single-predicate" />
        </call-template>
      </variable>
      <value-of
          select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setThePropertyURI($parser,$expanded-predicate)" />

      <!-- object -->
      <choose>
        <when test="not(starts-with($object,'blank:node:'))">
          <value-of
              select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheObjectURI($parser,$object)" />
        </when>
        <otherwise>
          <value-of
              select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheObjectNodeID($parser,substring-after($object,'blank:node:'))" />
        </otherwise>
      </choose>

    </otherwise>
  </choose>

  <value-of
      select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.flushObjectProperty($parser)" />

  <!-- remaining predicates, if any -->
  <variable name="other-predicates"
      select="normalize-space(substring-after($predicate,' '))" />
  <if test="string-length($other-predicates) &gt; 0">
    <call-template name="relation">
      <with-param name="subject" select="$subject" />
      <with-param name="predicate" select="$other-predicates" />
      <with-param name="object" select="$object" />
    </call-template>
  </if>

</template>
|
||||
|
||||
|
||||
<!-- generate an RDF statement for a property -->
|
||||
<template name="property">
|
||||
<param name="subject" />
|
||||
<param name="predicate" />
|
||||
<param name="object" />
|
||||
<param name="datatype" />
|
||||
<param name="attrib" /> <!-- is the content from an attribute ? true /false -->
|
||||
<param name="language" />
|
||||
|
||||
<!-- test for multiple predicates -->
|
||||
<variable name="single-predicate">
|
||||
<call-template name="get-first-token">
|
||||
<with-param name="tokens" select="$predicate" />
|
||||
</call-template>
|
||||
</variable>
|
||||
|
||||
<!-- get namespace of the predicate -->
|
||||
<variable name="predicate-ns">
|
||||
<call-template name="get-property-ns">
|
||||
<with-param name="qname" select="$single-predicate" />
|
||||
</call-template>
|
||||
</variable>
|
||||
|
||||
|
||||
<!-- get name of the predicate -->
|
||||
<variable name="predicate-name">
|
||||
<call-template name="get-predicate-name">
|
||||
<with-param name="qname" select="$single-predicate" />
|
||||
</call-template>
|
||||
</variable>
|
||||
|
||||
<choose>
|
||||
<when test="string-length($predicate-ns)>0"> <!-- there is a known namespace for the predicate -->
|
||||
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setThePropertyURI($parser,$predicate-name)" />
|
||||
|
||||
<choose>
|
||||
<when test="starts-with($subject,'blank:nod: ')">
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectNodeID($parser,substring-after($subject,'blank:node:'))" />
|
||||
</when>
|
||||
<otherwise>
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectURI($parser,$subject)" />
|
||||
</otherwise>
|
||||
</choose>
|
||||
|
||||
<if test="string-length($language)>0">
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheLanguage($parser,$language)" />
|
||||
</if>
|
||||
|
||||
<choose>
|
||||
<when
|
||||
test="$datatype='http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral'">
|
||||
<choose>
|
||||
<when test="$attrib='true'"> <!-- content is in an attribute -->
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheDatatype($parser,$datatype)" />
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,normalize-space(string($object)))" />
|
||||
</when>
|
||||
<otherwise> <!-- content is in the element and may include some tags -->
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheDatatype($parser,$datatype)" />
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,$object)" />
|
||||
</otherwise>
|
||||
</choose>
|
||||
</when>
|
||||
<when test="string-length($datatype)>0">
|
||||
<!-- there is a datatype other than XMLLiteral -->
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheDatatype($parser,$datatype)" />
|
||||
<choose>
|
||||
<when test="$attrib='true'"> <!-- content is in an attribute -->
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,normalize-space(string($object)))" />
|
||||
</when>
|
||||
<otherwise> <!-- content is in the text nodes of the element -->
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,$object)" />
|
||||
</otherwise>
|
||||
</choose>
|
||||
</when>
|
||||
<otherwise> <!-- there is no datatype -->
|
||||
<choose>
|
||||
<when test="$attrib='true'"> <!-- content is in an attribute -->
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,normalize-space(string($object)))" />
|
||||
</when>
|
||||
<otherwise> <!-- content is in the text nodes of the element -->
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,$object)" />
|
||||
</otherwise>
|
||||
</choose>
|
||||
</otherwise>
|
||||
</choose>
|
||||
</when>
|
||||
<otherwise> <!-- generate a comment for debug -->
|
||||
<xsl:comment>
|
||||
Could not produce the triple for:
|
||||
<value-of select="$subject" />
|
||||
-
|
||||
<value-of select="$single-predicate" />
|
||||
-
|
||||
<value-of select="$object" />
|
||||
</xsl:comment>
|
||||
</otherwise>
|
||||
</choose>
|
||||
|
||||
<value-of
|
||||
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.flushDataProperty($parser)" />
|
||||
|
||||
<!-- recursive call for multiple predicates -->
|
||||
<variable name="other-predicates"
|
||||
select="normalize-space(substring-after($predicate,' '))" />
|
||||
<if test="string-length($other-predicates)>0">
|
||||
<call-template name="property">
|
||||
<with-param name="subject" select="$subject" />
|
||||
<with-param name="predicate" select="$other-predicates" />
|
||||
<with-param name="object" select="$object" />
|
||||
<with-param name="datatype" select="$datatype" />
|
||||
<with-param name="attrib" select="$attrib" />
|
||||
<with-param name="language" select="$language" />
|
||||
</call-template>
|
||||
</if>
|
||||
|
||||
</template>
|
||||
|
||||
<!-- generate an RDF statement for a class -->
|
||||
<template name="class">
	<!-- resource: subject of the rdf:type statement (a URI, or 'blank:node:' followed by a node id) -->
	<param name="resource" />
	<!-- class: one or more space-separated class names -->
	<param name="class" />

	<!-- only the first token is handled in this invocation; the remaining ones are
	     processed by the recursive call at the end of the template -->
	<variable name="single-class">
		<call-template name="get-first-token">
			<with-param name="tokens" select="$class" />
		</call-template>
	</variable>

	<!-- namespace of that class; an empty result means the token cannot be expanded -->
	<variable name="class-ns">
		<call-template name="return-ns">
			<with-param name="qname" select="$single-class" />
		</call-template>
	</variable>

	<if test="string-length($class-ns) &gt; 0">
		<!-- the class token expanded by the expand-ns template -->
		<variable name="expanded-class">
			<call-template name="expand-ns">
				<with-param name="qname" select="$single-class" />
			</call-template>
		</variable>

		<!-- subject: either a blank node id or a plain URI -->
		<choose>
			<when test="starts-with($resource,'blank:node:')">
				<value-of
					select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectNodeID($parser,substring-after($resource,'blank:node:'))" />
			</when>
			<otherwise>
				<value-of
					select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectURI($parser,$resource)" />
			</otherwise>
		</choose>

		<!-- predicate is always rdf:type, object is the expanded class -->
		<value-of
			select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setThePropertyURI($parser,'http://www.w3.org/1999/02/22-rdf-syntax-ns#type')" />
		<value-of
			select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheObjectURI($parser,$expanded-class)" />
		<value-of
			select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.flushObjectProperty($parser)" />
	</if>

	<!-- everything after the first space still has to be processed -->
	<variable name="remaining-classes" select="normalize-space(substring-after($class,' '))" />
	<if test="string-length($remaining-classes) &gt; 0">
		<call-template name="class">
			<with-param name="resource" select="$resource" />
			<with-param name="class" select="$remaining-classes" />
		</call-template>
	</if>
</template>
|
||||
|
||||
|
||||
<!-- ignore the rest of the DOM -->
|
||||
<template mode="rdf2rdfxml" match="*|@*|text()">
	<!-- emit nothing for this node itself; just keep walking its children in the same mode -->
	<apply-templates mode="rdf2rdfxml" />
</template>
|
||||
|
||||
|
||||
</stylesheet>
|
@ -0,0 +1,135 @@
|
||||
package net.yacy.document.parser.augment;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import net.yacy.yacy;
|
||||
import net.yacy.cora.document.ASCII;
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.parser.htmlParser;
|
||||
import net.yacy.document.parser.rdfa.IRDFaTriple;
|
||||
import net.yacy.document.parser.rdfa.impl.RDFaParser;
|
||||
import net.yacy.document.parser.rdfa.impl.RDFaTripleImpl;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
|
||||
public class AugmentParser extends RDFaParser {
|
||||
|
||||
public AugmentParser(String name) {
|
||||
super(name);
|
||||
|
||||
System.out.println("augmented parser was initialized");
|
||||
|
||||
SUPPORTED_EXTENSIONS.remove("htm");
|
||||
SUPPORTED_EXTENSIONS.remove("html");
|
||||
SUPPORTED_EXTENSIONS.remove("shtml");
|
||||
SUPPORTED_EXTENSIONS.remove("xhtml");
|
||||
SUPPORTED_EXTENSIONS.remove("php");
|
||||
SUPPORTED_EXTENSIONS.remove("php3");
|
||||
SUPPORTED_EXTENSIONS.remove("php4");
|
||||
SUPPORTED_EXTENSIONS.remove("php5");
|
||||
SUPPORTED_EXTENSIONS.remove("cfm");
|
||||
SUPPORTED_EXTENSIONS.remove("asp");
|
||||
SUPPORTED_EXTENSIONS.remove("aspx");
|
||||
SUPPORTED_EXTENSIONS.remove("tex");
|
||||
SUPPORTED_EXTENSIONS.remove("txt");
|
||||
SUPPORTED_EXTENSIONS.remove("jsp");
|
||||
SUPPORTED_EXTENSIONS.remove("mf");
|
||||
SUPPORTED_EXTENSIONS.remove("pl");
|
||||
SUPPORTED_EXTENSIONS.remove("py");
|
||||
SUPPORTED_MIME_TYPES.remove("text/html");
|
||||
SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
|
||||
SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
|
||||
SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
|
||||
SUPPORTED_MIME_TYPES.remove("application/x-tex");
|
||||
SUPPORTED_MIME_TYPES.remove("text/plain");
|
||||
SUPPORTED_MIME_TYPES.remove("text/sgml");
|
||||
SUPPORTED_MIME_TYPES.remove("text/csv");
|
||||
|
||||
SUPPORTED_EXTENSIONS.add("html");
|
||||
SUPPORTED_EXTENSIONS.add("php");
|
||||
SUPPORTED_MIME_TYPES.add("text/html");
|
||||
SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
|
||||
SUPPORTED_EXTENSIONS.add("html");
|
||||
SUPPORTED_EXTENSIONS.add("htm");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document[] parse(MultiProtocolURI url, String mimeType,
|
||||
String charset, InputStream source) throws Failure,
|
||||
InterruptedException {
|
||||
|
||||
Document[] htmlDocs = super.parse(url, mimeType, charset, source);
|
||||
try {
|
||||
source.reset();
|
||||
} catch (IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
|
||||
String urlHash = String.valueOf(url.hashCode());
|
||||
|
||||
DigestURI durl;
|
||||
try {
|
||||
durl = new DigestURI(MultiProtocolURI.unescape(url.toString()));
|
||||
urlHash = ASCII.String(durl.hash());
|
||||
} catch (MalformedURLException e1) {
|
||||
// TODO Auto-generated catch block
|
||||
e1.printStackTrace();
|
||||
}
|
||||
|
||||
Document theDoc = htmlDocs[0];
|
||||
|
||||
|
||||
Document superDoc = new Document(url, mimeType, charset, null, null, null, "", "",
|
||||
"", null, "", 0, 0, null, null, null, null, false);
|
||||
|
||||
// if the magic word appears in the document, perform extra actions.
|
||||
|
||||
|
||||
// if (htmlDocs[0].getKeywords().contains("magicword")) {
|
||||
// String all = "";
|
||||
//
|
||||
// all = "yacylatest";
|
||||
// superDoc = new Document(url, mimeType, charset, null, null, null, "", "",
|
||||
// "", null, "", 0, 0, all.getBytes(), null, null, null, false);
|
||||
// }
|
||||
|
||||
Document augmentDoc = parseAndAugment(url, mimeType, charset, source);
|
||||
|
||||
|
||||
Document[] retDocs = new Document[htmlDocs.length + 2];
|
||||
for (int i = 0; i < htmlDocs.length; i++) {
|
||||
retDocs[i] = htmlDocs[i];
|
||||
}
|
||||
|
||||
retDocs[retDocs.length - 1] = augmentDoc;
|
||||
retDocs[retDocs.length - 2] = superDoc;
|
||||
|
||||
return retDocs;
|
||||
|
||||
}
|
||||
|
||||
|
||||
private Document parseAndAugment(MultiProtocolURI url,
|
||||
String mimeType, String charset, InputStream source) {
|
||||
|
||||
String all = "";
|
||||
|
||||
// add even more information to the document in external routines.
|
||||
|
||||
// all = "augmented";
|
||||
|
||||
Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
|
||||
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
|
||||
return doc;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,82 @@
|
||||
/**
|
||||
* rdfParser.java
|
||||
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
|
||||
* First released 20.08.2010 at http://yacy.net
|
||||
*
|
||||
* $LastChangedDate: 2011-04-21 15:58:49 +0200 (Do, 21 Apr 2011) $
|
||||
* $LastChangedRevision: 7672 $
|
||||
* $LastChangedBy: orbiter $
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
|
||||
package net.yacy.document.parser;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Properties;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.cora.document.RSSFeed;
|
||||
import net.yacy.cora.document.RSSReader;
|
||||
import net.yacy.cora.document.Hit;
|
||||
import net.yacy.document.AbstractParser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser;
|
||||
import net.yacy.document.TextParser;
|
||||
import net.yacy.document.parser.html.ImageEntry;
|
||||
|
||||
public class rdfParser extends AbstractParser implements Parser {
|
||||
|
||||
public rdfParser() {
|
||||
super("RDF Parser");
|
||||
|
||||
SUPPORTED_EXTENSIONS.add("rdf");
|
||||
SUPPORTED_MIME_TYPES.add("application/rdf+xml");
|
||||
}
|
||||
|
||||
public Document[] parse(final MultiProtocolURI url, final String mimeType,
|
||||
final String charset, final InputStream source)
|
||||
throws Failure, InterruptedException {
|
||||
|
||||
|
||||
// this function currently only registers detected rdf files.
|
||||
|
||||
// next step: load rdf content into triplestore.
|
||||
|
||||
final List<Document> docs = new ArrayList<Document>();
|
||||
|
||||
Document doc;
|
||||
|
||||
String all = "rdfdatasource";
|
||||
doc = new Document(url, mimeType, charset, null, null, null, "", "",
|
||||
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
|
||||
|
||||
docs.add(doc);
|
||||
|
||||
final Document[] da = new Document[docs.size()];
|
||||
docs.toArray(da);
|
||||
return da;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,14 @@
|
||||
package net.yacy.document.parser.rdfa;
|
||||
|
||||
/**
 * Read-only view of one triple extracted from an RDFa document.
 * Parts that were not set for a triple may be returned as {@code null}.
 */
public interface IRDFaTriple {

    // ---- subject ----

    /** @return the subject as a URI */
    String getSubjectURI();

    /** @return the identifier of a blank-node subject */
    String getSubjectNodeURI();

    // ---- predicate ----

    /** @return the URI of the property */
    String getPropertyURI();

    // ---- object (resource) ----

    /** @return the object as a URI */
    String getObjectURI();

    /** @return the identifier of a blank-node object */
    String getObjectNodeURI();

    // ---- object (literal) ----

    /** @return the literal value */
    String getValue();

    /** @return the datatype of the literal value */
    String getDataType();

    /** @return the language of the literal value */
    String getLanguage();

}
|
@ -0,0 +1,170 @@
|
||||
/**
|
||||
*
|
||||
*/
|
||||
package net.yacy.document.parser.rdfa.impl;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.util.HashSet;
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.Parser.Failure;
|
||||
import net.yacy.document.parser.htmlParser;
|
||||
import net.yacy.document.parser.rdfa.IRDFaTriple;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
/**
|
||||
* @author fgandon
|
||||
*
|
||||
*/
|
||||
public class RDFaParser extends htmlParser {
|
||||
|
||||
public RDFaParser(String name) {
|
||||
super(name);
|
||||
SUPPORTED_EXTENSIONS.remove("htm");
|
||||
SUPPORTED_EXTENSIONS.remove("html");
|
||||
SUPPORTED_EXTENSIONS.remove("shtml");
|
||||
SUPPORTED_EXTENSIONS.remove("xhtml");
|
||||
SUPPORTED_EXTENSIONS.remove("php");
|
||||
SUPPORTED_EXTENSIONS.remove("php3");
|
||||
SUPPORTED_EXTENSIONS.remove("php4");
|
||||
SUPPORTED_EXTENSIONS.remove("php5");
|
||||
SUPPORTED_EXTENSIONS.remove("cfm");
|
||||
SUPPORTED_EXTENSIONS.remove("asp");
|
||||
SUPPORTED_EXTENSIONS.remove("aspx");
|
||||
SUPPORTED_EXTENSIONS.remove("tex");
|
||||
SUPPORTED_EXTENSIONS.remove("txt");
|
||||
SUPPORTED_EXTENSIONS.remove("jsp");
|
||||
SUPPORTED_EXTENSIONS.remove("mf");
|
||||
SUPPORTED_EXTENSIONS.remove("pl");
|
||||
SUPPORTED_EXTENSIONS.remove("py");
|
||||
SUPPORTED_MIME_TYPES.remove("text/html");
|
||||
SUPPORTED_MIME_TYPES.remove("text/xhtml+xml");
|
||||
SUPPORTED_MIME_TYPES.remove("application/xhtml+xml");
|
||||
SUPPORTED_MIME_TYPES.remove("application/x-httpd-php");
|
||||
SUPPORTED_MIME_TYPES.remove("application/x-tex");
|
||||
SUPPORTED_MIME_TYPES.remove("text/plain");
|
||||
SUPPORTED_MIME_TYPES.remove("text/sgml");
|
||||
SUPPORTED_MIME_TYPES.remove("text/csv");
|
||||
|
||||
SUPPORTED_EXTENSIONS.add("html");
|
||||
SUPPORTED_EXTENSIONS.add("php");
|
||||
SUPPORTED_MIME_TYPES.add("text/html");
|
||||
SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
|
||||
SUPPORTED_EXTENSIONS.add("html");
|
||||
SUPPORTED_EXTENSIONS.add("htm");
|
||||
}
|
||||
|
||||
@Override
|
||||
public Document[] parse(MultiProtocolURI url, String mimeType,
|
||||
String charset, InputStream source) throws Failure,
|
||||
InterruptedException {
|
||||
|
||||
Document[] htmlDocs = parseHtml(url, mimeType, charset, source);
|
||||
|
||||
// TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
|
||||
|
||||
if (url.toString().contains(".yacy") || url.toString().contains("experiments")) {
|
||||
// if (true == false) {
|
||||
Document rdfaDoc = parseRDFa(url, mimeType, charset, source);
|
||||
Document[] retDocs = new Document[htmlDocs.length + 1];
|
||||
for (int i = 0; i < htmlDocs.length; i++) {
|
||||
retDocs[i] = htmlDocs[i];
|
||||
}
|
||||
retDocs[retDocs.length - 1] = rdfaDoc;
|
||||
return retDocs;
|
||||
} else {
|
||||
return htmlDocs;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
private Document parseRDFa(MultiProtocolURI url, String mimeType,
|
||||
String charset, InputStream source) {
|
||||
RDFaTripleImpl triple;
|
||||
IRDFaTriple[] allTriples = null;
|
||||
try {
|
||||
triple = new RDFaTripleImpl(new InputStreamReader(source), url
|
||||
.toString());
|
||||
allTriples = triple.parse();
|
||||
|
||||
} catch (Exception e) {
|
||||
Log.logWarning("RDFA PARSER", "Triple extraction failed");
|
||||
}
|
||||
|
||||
Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
|
||||
"", null, "", 0, 0, null, null, null, null, false);
|
||||
|
||||
try {
|
||||
if (allTriples.length > 0)
|
||||
doc = convertAllTriplesToDocument(url, mimeType, charset,
|
||||
allTriples);
|
||||
|
||||
} catch (Exception e) {
|
||||
Log.logWarning("RDFA PARSER",
|
||||
"Conversion triple to document failed");
|
||||
}
|
||||
return doc;
|
||||
|
||||
}
|
||||
|
||||
private Document[] parseHtml(MultiProtocolURI url, String mimeType,
|
||||
String charset, InputStream source) throws Failure,
|
||||
InterruptedException {
|
||||
|
||||
Document[] htmlDocs = null;
|
||||
try {
|
||||
htmlDocs = super.parse(url, mimeType, charset, source);
|
||||
source.reset();
|
||||
|
||||
} catch (IOException e1) {
|
||||
Log.logWarning("RDFA PARSER", "Super call failed");
|
||||
}
|
||||
return htmlDocs;
|
||||
|
||||
}
|
||||
|
||||
private Document convertAllTriplesToDocument(MultiProtocolURI url,
|
||||
String mimeType, String charset, IRDFaTriple[] allTriples) {
|
||||
|
||||
Set<String> languages = new HashSet<String>(2);
|
||||
Set<String> keywords = new HashSet<String>(allTriples.length);
|
||||
Set<String> sections = new HashSet<String>(5);
|
||||
String all = "";
|
||||
|
||||
for (IRDFaTriple irdFaTriple : allTriples) {
|
||||
// addNotEmptyValuesToSet(keywords, irdFaTriple.getLanguage());
|
||||
// addNotEmptyValuesToSet(keywords,
|
||||
// irdFaTriple.getSubjectNodeURI());
|
||||
// addNotEmptyValuesToSet(keywords, irdFaTriple.getSubjectURI());
|
||||
// addNotEmptyValuesToSet(keywords, irdFaTriple.getPropertyURI());
|
||||
// addNotEmptyValuesToSet(keywords, irdFaTriple.getObjectNodeURI());
|
||||
// addNotEmptyValuesToSet(keywords, irdFaTriple.getObjectURI());
|
||||
// addNotEmptyValuesToSet(keywords, irdFaTriple.getValue());
|
||||
addNotEmptyValuesToSet(keywords, irdFaTriple.getPropertyURI() + "Z"
|
||||
+ irdFaTriple.getValue());
|
||||
}
|
||||
for (String string : keywords) {
|
||||
string = string.replace(":", "X");
|
||||
string = string.replace("_", "Y");
|
||||
string = string.replace(" ", "Y");
|
||||
string = string.replace(".", "Y");
|
||||
string = string.replace(",", "Y");
|
||||
all += string + ",";
|
||||
}
|
||||
|
||||
Document doc = new Document(url, mimeType, charset, null, null, null, "", "",
|
||||
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
|
||||
return doc;
|
||||
}
|
||||
|
||||
private void addNotEmptyValuesToSet(Set<String> set, String value) {
|
||||
if (value != null) {
|
||||
set.add(value);
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,63 @@
|
||||
package net.yacy.document.parser.rdfa.impl;
|
||||
|
||||
import net.yacy.document.parser.rdfa.IRDFaTriple;
|
||||
|
||||
public class RDFaTripleContent implements IRDFaTriple {
|
||||
|
||||
private final String subjectURI;
|
||||
private final String subjectNodeURI;
|
||||
private final String propertyURI;
|
||||
private final String value;
|
||||
private final String dataType;
|
||||
private final String language;
|
||||
private final String objectNodeURI;
|
||||
private final String objectURI;
|
||||
|
||||
public RDFaTripleContent(String subjectURI, String subjectNodeURI,
|
||||
String propertyURI, String value, String dataType, String language, String objectNodeURI, String objectURI) {
|
||||
this.subjectURI = subjectURI;
|
||||
this.subjectNodeURI = subjectNodeURI;
|
||||
this.propertyURI = propertyURI;
|
||||
this.value = value;
|
||||
this.dataType = dataType;
|
||||
this.language = language;
|
||||
this.objectNodeURI = objectNodeURI;
|
||||
this.objectURI = objectURI;
|
||||
}
|
||||
|
||||
public String getSubjectURI() {
|
||||
return subjectURI;
|
||||
}
|
||||
|
||||
public String getSubjectNodeURI() {
|
||||
return subjectNodeURI;
|
||||
}
|
||||
|
||||
public String getPropertyURI() {
|
||||
return propertyURI;
|
||||
}
|
||||
|
||||
public String getValue() {
|
||||
return value;
|
||||
}
|
||||
|
||||
public String getDataType() {
|
||||
return dataType;
|
||||
}
|
||||
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getObjectURI() {
|
||||
return objectURI;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getObjectNodeURI() {
|
||||
return objectNodeURI;
|
||||
}
|
||||
|
||||
|
||||
}
|
@ -0,0 +1,159 @@
|
||||
package net.yacy.document.parser.rdfa.impl;
|
||||
|
||||
import java.io.BufferedReader;
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.io.Reader;
|
||||
import java.util.ArrayList;
|
||||
|
||||
import javax.xml.transform.Templates;
|
||||
import javax.xml.transform.Transformer;
|
||||
import javax.xml.transform.TransformerConfigurationException;
|
||||
import javax.xml.transform.TransformerException;
|
||||
import javax.xml.transform.TransformerFactory;
|
||||
import javax.xml.transform.stream.StreamResult;
|
||||
import javax.xml.transform.stream.StreamSource;
|
||||
|
||||
import net.yacy.document.parser.rdfa.IRDFaTriple;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
import net.yacy.yacy;
|
||||
|
||||
public class RDFaTripleImpl{
|
||||
|
||||
private static Templates templates = null;
|
||||
private String propertyURI = null;
|
||||
private String subjectURI = null;
|
||||
private String subjectNodeURI = null;
|
||||
private String objectURI = null;
|
||||
private String objectNodeURI = null;
|
||||
private String value = null;
|
||||
private String dataType = null;
|
||||
private String language = null;
|
||||
private Reader in;
|
||||
private Transformer aTransformer;
|
||||
private ArrayList<IRDFaTriple> allRDFaTriples = new ArrayList<IRDFaTriple>();
|
||||
|
||||
|
||||
public RDFaTripleImpl(Reader in, String base) throws IOException,
|
||||
TransformerException, TransformerConfigurationException {
|
||||
|
||||
BufferedReader bufReader = new BufferedReader(in);
|
||||
String readLine = bufReader.readLine();
|
||||
if (!readLine.toLowerCase().contains("<!doctype")){
|
||||
bufReader.reset();
|
||||
}
|
||||
|
||||
|
||||
if (templates == null) {
|
||||
|
||||
try{
|
||||
File f = new File (yacy.homedir+File.separatorChar+"RDFaParser"+File.separatorChar+"RDFaParser.xsl");
|
||||
|
||||
|
||||
StreamSource aSource = new StreamSource(f);
|
||||
|
||||
|
||||
TransformerFactory aFactory = TransformerFactory.newInstance();
|
||||
templates = aFactory.newTemplates(aSource);
|
||||
}
|
||||
catch(Exception e){
|
||||
Log.logSevere("RDFA PARSER", "XSL template could not be loaded from "+yacy.homedir+File.separatorChar+"RDFaParser"+File.separatorChar+"RDFaParser.xsl");
|
||||
}
|
||||
}
|
||||
this.aTransformer = templates.newTransformer();
|
||||
aTransformer.setParameter("parser", this);
|
||||
aTransformer.setParameter("url", base);
|
||||
|
||||
this.in = bufReader;
|
||||
}
|
||||
|
||||
public IRDFaTriple[] parse() {
|
||||
try {
|
||||
aTransformer.transform(new StreamSource(in), new StreamResult(System.out));
|
||||
} catch (TransformerException e) {
|
||||
Log.logWarning("RDFA PARSER", "Error while reading RDFa");
|
||||
// e.printStackTrace();
|
||||
}
|
||||
|
||||
return allRDFaTriples .toArray(new IRDFaTriple[]{});
|
||||
|
||||
}
|
||||
|
||||
public static boolean flushDataProperty(Object oparser) {
|
||||
RDFaTripleImpl parser = ((RDFaTripleImpl)oparser);
|
||||
|
||||
parser.reportDataProperty(parser.subjectURI, parser.subjectNodeURI, parser.propertyURI,
|
||||
parser.value, parser.dataType, parser.language, parser.objectNodeURI, parser.objectURI);
|
||||
nullAllValues(parser);
|
||||
return true;
|
||||
}
|
||||
|
||||
private void reportDataProperty(String subjectURI, String subjectNodeURI,
|
||||
String propertyURI, String value, String dataType,
|
||||
String language, String objectNodeURI, String objectURI) {
|
||||
IRDFaTriple triple = new RDFaTripleContent(subjectURI,subjectNodeURI,propertyURI,value,dataType,language, objectNodeURI,objectURI);
|
||||
allRDFaTriples.add(triple);
|
||||
}
|
||||
|
||||
private static void nullAllValues(RDFaTripleImpl parser) {
|
||||
parser.propertyURI = null;
|
||||
parser.subjectURI = null;
|
||||
parser.subjectNodeURI = null;
|
||||
parser.objectURI = null;
|
||||
parser.objectNodeURI = null;
|
||||
parser.value = null;
|
||||
parser.dataType = null;
|
||||
parser.language = null;
|
||||
}
|
||||
|
||||
public static boolean flushObjectProperty(Object oparser) {
|
||||
RDFaTripleImpl parser = ((RDFaTripleImpl)oparser);
|
||||
// System.out.println("parser added");
|
||||
nullAllValues(parser);
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean setTheDatatype(Object parser, String theDatatype) {
|
||||
((RDFaTripleImpl)parser).dataType = theDatatype;
|
||||
System.out.println(theDatatype);
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean setTheLanguage(Object parser, String theLanguage) {
|
||||
((RDFaTripleImpl)parser).language = theLanguage;
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean setTheObjectNodeID(Object parser, String theObjectNodeID) {
|
||||
((RDFaTripleImpl)parser).objectNodeURI = theObjectNodeID;
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean setTheObjectURI(Object parser, String theObjectURI) {
|
||||
((RDFaTripleImpl)parser).objectURI = theObjectURI;
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean setThePropertyURI(Object parser, String thePropertyURI) {
|
||||
((RDFaTripleImpl)parser).propertyURI = thePropertyURI;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
public static boolean setTheSubjectNodeID(Object parser, String theSubjectNodeID) {
|
||||
((RDFaTripleImpl)parser).subjectNodeURI = theSubjectNodeID;
|
||||
System.out.println(theSubjectNodeID);
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean setTheSubjectURI(Object parser, String theSubjectURI) {
|
||||
((RDFaTripleImpl)parser).subjectURI = theSubjectURI;
|
||||
return true;
|
||||
}
|
||||
|
||||
public static boolean setTheValue(Object parser, String theValue) {
|
||||
((RDFaTripleImpl)parser).value = theValue;
|
||||
return true;
|
||||
}
|
||||
}
|
@ -0,0 +1,67 @@
|
||||
package net.yacy.document.parser.rdfa;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.FileReader;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.Reader;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
|
||||
import net.yacy.cora.document.MultiProtocolURI;
|
||||
import net.yacy.document.Parser.Failure;
|
||||
import net.yacy.document.parser.rdfa.impl.RDFaParser;
|
||||
|
||||
public class main {
|
||||
/**
|
||||
* @param args
|
||||
*/
|
||||
public static void main(String[] args) {
|
||||
URL aURL = null;
|
||||
if (args.length < 1) {
|
||||
System.out
|
||||
.println("Usage: one and only one argument giving a file path or a URL.");
|
||||
} else {
|
||||
File aFile = new File(args[0]);
|
||||
Reader aReader = null;
|
||||
if (aFile.exists()) {
|
||||
try {
|
||||
aReader = new FileReader(aFile);
|
||||
} catch (FileNotFoundException e) {
|
||||
aReader = null;
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
aURL = new URL(args[0]);
|
||||
aReader = new InputStreamReader(aURL.openStream());
|
||||
} catch (MalformedURLException e) {
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
aReader = null;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
if (aReader != null) {
|
||||
RDFaParser aParser = new RDFaParser("html");
|
||||
try {
|
||||
aParser.parse(new MultiProtocolURI(args[0]),"","",aURL.openStream());
|
||||
} catch (FileNotFoundException e) {
|
||||
e.printStackTrace();
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
} catch (Failure e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
} catch (InterruptedException e) {
|
||||
// TODO Auto-generated catch block
|
||||
e.printStackTrace();
|
||||
}
|
||||
} else
|
||||
System.out.println("File or URL not recognized.");
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
}
|
Loading…
Reference in new issue