Add several parsers, for RDFa and rdf files.

Conflicts:
	source/net/yacy/document/TextParser.java
pull/1/head
cominch 13 years ago committed by Michael Peter Christen
parent 9ef5a80f4e
commit bcbd8eee33

@ -0,0 +1,970 @@
<?xml version="1.0" encoding="UTF-8"?>
<stylesheet
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
version="1.0" xmlns:h="http://www.w3.org/1999/xhtml"
xmlns="http://www.w3.org/1999/XSL/Transform"
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:xalan="http://xml.apache.org/xalan"
xmlns:java="http://xml.apache.org/xalan/java"
exclude-result-prefixes="java"
>
<!-- Version 0.21 by Fabien.Gandon@sophia.inria.fr -->
<!-- This software is distributed under either the CeCILL-C license or the
GNU Lesser General Public License version 3 license. -->
<!-- This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License -->
<!-- as published by the Free Software Foundation version 3 of the License
or under the terms of the CeCILL-C license. -->
<!-- This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied -->
<!-- warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. -->
<!-- See the GNU Lesser General Public License version 3 at http://www.gnu.org/licenses/ -->
<!-- and the CeCILL-C license at http://www.cecill.info/licences/Licence_CeCILL-C_V1-en.html
for more details -->
<output indent="yes" method="xml" media-type="application/rdf+xml"
encoding="UTF-8" omit-xml-declaration="yes" />
<!-- base of the current HTML doc: href of the first h:base child of an h:head element (empty node-set when the document declares none) -->
<variable name='html_base' select="//*/h:head/h:base[position()=1]/@href" />
<!-- default HTML vocabulary namespace -->
<variable name='default_voc' select="'http://www.w3.org/1999/xhtml/vocab#'" />
<!-- parser instance: handed as first argument to every RDFaTripleImpl extension call below; defaults to the empty string when the caller does not set it -->
<param name='parser' select="''" />
<!-- url of the current XHTML page if provided by the XSLT engine; defaults to the empty string -->
<param name='url' select="''" />
<!-- 'this' contains the URL of the source document, taken from the h:base href when
there is one and from the 'url' parameter otherwise, e.g. http://example.org/bla/file.html -->
<variable name='this'>
<choose>
<when test="string-length($html_base)>0">
<value-of select="$html_base" />
</when>
<otherwise>
<value-of select="$url" />
</otherwise>
</choose>
</variable>
<!-- this_location contains the location of the source document (everything up to and including the last '/'), e.g. http://example.org/bla/ -->
<variable name='this_location'>
<call-template name="get-location">
<with-param name="url" select="$this" />
</call-template>
</variable>
<!-- this_root contains the root location of the source document e.g. http://example.org/ -->
<variable name='this_root'>
<call-template name="get-root">
<with-param name="url" select="$this" />
</call-template>
</variable>
<!-- templates for parsing - - - - - - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - -->
<!-- Start the RDF generation: wrap the result in an rdf:RDF element and process
     every child of the document root in mode "rdf2rdfxml". The mode is used to
     ease integration with other XSLT templates. -->
<template match="/">
  <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
    <apply-templates select="node()" mode="rdf2rdfxml" />
  </rdf:RDF>
</template>
<!-- match RDFa element: any element carrying @property, @rel, @rev or @typeof.
     In order, the template
       1. determines the subject (template "subject"),
       2. for a non-empty @rel/@rev, determines the object(s) and calls "relrev",
       3. for a non-empty @property, calls "property" with the literal value,
          its datatype and its language,
       4. for @typeof, calls "class",
     and finally continues with the children in mode "rdf2rdfxml". -->
<template
  match="*[attribute::property or attribute::rel or attribute::rev or attribute::typeof]"
  mode="rdf2rdfxml">
  <!-- identify subject -->
  <variable name="subject">
    <call-template name="subject" />
  </variable>
  <!-- do we have object properties? -->
  <if test="string-length(@rel)>0 or string-length(@rev)>0">
    <variable name="object"> <!-- identify the object(s) -->
      <choose>
        <when test="@resource">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="@resource" />
          </call-template>
        </when>
        <when test="@href">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="@href" />
          </call-template>
        </when>
        <when
          test="descendant::*[attribute::about or attribute::src or attribute::typeof or
                attribute::href or attribute::resource or
                attribute::rel or attribute::rev or attribute::property]">
          <!-- yields a space-separated list of objects -->
          <call-template name="recurse-objects" />
        </when>
        <otherwise>
          <call-template name="self-curie-or-uri">
            <with-param name="node" select="." />
          </call-template>
        </otherwise>
      </choose>
    </variable>
    <call-template name="relrev">
      <with-param name="subject" select="$subject" />
      <with-param name="object" select="$object" />
    </call-template>
  </if>
  <!-- do we have data properties ? -->
  <if test="string-length(@property)>0">
    <!-- identify language: the xml:lang closest to the current element wins.
         A predicate written inside the location step
         (.../attribute::xml:lang[position()=1]) keeps the xml:lang of every
         ancestor, and string() then takes the first one in document order,
         i.e. the outermost declaration. Taking the last node of the
         parenthesised set selects the nearest one instead. -->
    <variable name="language"
      select="string((ancestor-or-self::*/attribute::xml:lang)[last()])" />
    <choose>
      <when test="@content"> <!-- there is a specific content -->
        <call-template name="property">
          <with-param name="subject" select="$subject" />
          <with-param name="object" select="@content" />
          <with-param name="datatype">
            <!-- left empty when @datatype is absent or empty: enforcing plain literal -->
            <if test="string-length(@datatype)>0">
              <call-template name="expand-ns">
                <with-param name="qname" select="@datatype" />
              </call-template>
            </if>
          </with-param>
          <with-param name="predicate" select="@property" />
          <with-param name="attrib" select="'true'" />
          <with-param name="language" select="$language" />
        </call-template>
      </when>
      <when test="not(*)"> <!-- there is no specific content but there are no children elements in the content -->
        <call-template name="property">
          <with-param name="subject" select="$subject" />
          <with-param name="object" select="." />
          <with-param name="datatype">
            <!-- left empty when @datatype is absent or empty: enforcing plain literal -->
            <if test="string-length(@datatype)>0">
              <call-template name="expand-ns">
                <with-param name="qname" select="@datatype" />
              </call-template>
            </if>
          </with-param>
          <with-param name="predicate" select="@property" />
          <with-param name="attrib" select="'true'" />
          <with-param name="language" select="$language" />
        </call-template>
      </when>
      <otherwise> <!-- there is no specific content; we use the value of element -->
        <call-template name="property">
          <with-param name="subject" select="$subject" />
          <with-param name="object" select="." />
          <with-param name="datatype">
            <choose>
              <when test="string-length(@datatype)>0">
                <call-template name="expand-ns">
                  <with-param name="qname" select="@datatype" />
                </call-template>
              </when>
              <otherwise>
                <!-- enforcing XML literal. The URI is wrapped in a text
                     element so that no surrounding line breaks become part of
                     the value; "property" compares the datatype with this
                     exact string. -->
                <text>http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral</text>
              </otherwise>
            </choose>
          </with-param>
          <with-param name="predicate" select="@property" />
          <with-param name="attrib" select="'false'" />
          <with-param name="language" select="$language" />
        </call-template>
      </otherwise>
    </choose>
  </if>
  <!-- do we have classes ? -->
  <if test="@typeof">
    <call-template name="class">
      <with-param name="resource">
        <call-template name="self-curie-or-uri">
          <with-param name="node" select="." />
        </call-template>
      </with-param>
      <with-param name="class" select="@typeof" />
    </call-template>
  </if>
  <apply-templates mode="rdf2rdfxml" />
</template>
<!-- named templates to process URIs and token lists - - - - - - - - - -
- - - - - - - - - - - - - - - - - - - - - - - - - - -->
<!-- tokenize a string using space as a delimiter.
     Param "string": the text to split.
     Each token is written to the output with value-of, one after the other;
     no separator is emitted between tokens. -->
<template name="tokenize">
  <param name="string" />
  <choose>
    <when test="not(contains($string,' '))">
      <!-- last (or only) token; an empty string emits nothing -->
      <value-of select="$string" />
    </when>
    <otherwise>
      <variable name="head" select="normalize-space(substring-before($string,' '))" />
      <variable name="tail" select="normalize-space(substring-after($string,' '))" />
      <value-of select="$head" />
      <call-template name="tokenize">
        <with-param name="string" select="$tail" />
      </call-template>
    </otherwise>
  </choose>
</template>
<!-- get file location from URL.
     Param "url": the URL to cut.
     Emits every '/'-terminated segment of the URL, i.e. the URL up to and
     including its last '/'; emits nothing when the URL contains no '/'. -->
<template name="get-location">
  <param name="url" />
  <if test="contains($url,'/')">
    <variable name="segment" select="substring-before($url,'/')" />
    <variable name="remainder" select="substring-after($url,'/')" />
    <value-of select="concat($segment,'/')" />
    <call-template name="get-location">
      <with-param name="url" select="$remainder" />
    </call-template>
  </if>
</template>
<!-- get root location from URL.
     Param "url": the URL to cut, e.g. http://example.org/bla/file.html
     Emits scheme, '//', authority and a trailing '/', e.g. http://example.org/
     When the URL contains no '//' the sentinel text UNKNOWN ROOT is emitted. -->
<template name="get-root">
  <param name="url" />
  <choose>
    <when test="contains($url,'//')">
      <variable name="scheme" select="substring-before($url,'//')" />
      <variable name="after-scheme" select="substring-after($url,'//')" />
      <!-- A URL without any path (http://example.org) has no '/' after the
           authority; substring-before would then return the empty string and
           the root would come out as "http:///". Use the whole remainder in
           that case. -->
      <variable name="authority">
        <choose>
          <when test="contains($after-scheme,'/')">
            <value-of select="substring-before($after-scheme,'/')" />
          </when>
          <otherwise>
            <value-of select="$after-scheme" />
          </otherwise>
        </choose>
      </variable>
      <value-of select="concat($scheme,'//',$authority,'/')" />
    </when>
    <otherwise>
      <!-- sentinel kept exactly as before, including the line break on each side -->
      <text>&#10;UNKNOWN ROOT&#10;</text>
    </otherwise>
  </choose>
</template>
<!-- return namespace of a qname.
     Param "qname": a name of the form prefix:local, :local or local.
     Emits the namespace URI bound to the prefix, or the default namespace URI
     when the prefix is empty; emits nothing when no such binding is in scope.
     The lookup is made relative to the current context element. -->
<template name="return-ns">
  <param name="qname" />
  <variable name="ns_prefix" select="substring-before($qname,':')" />
  <!-- The nearest binding must win. Namespace nodes are collected over the
       whole ancestor-or-self chain and the last one in document order, the
       one closest to the context element, is used. Filtering with
       [position()=1] inside the location step instead yields the binding of
       the outermost element, which is wrong when a prefix is redeclared. -->
  <choose>
    <when test="string-length($ns_prefix)>0"> <!-- prefix must be explicit -->
      <value-of
        select="(ancestor-or-self::*/namespace::*[name()=$ns_prefix])[last()]" />
    </when>
    <otherwise> <!-- no prefix: default namespace, if any -->
      <value-of select="(ancestor-or-self::*/namespace::*[name()=''])[last()]" />
    </otherwise>
  </choose>
</template>
<!-- expand namespace of a qname.
     Param "qname": a name of the form prefix:local, :local or local.
     Emits the namespace URI concatenated with the part of the name after the
     first ':'. With an explicit prefix that has no binding in scope only the
     local part is emitted; without a prefix nothing is emitted unless a
     default namespace is in scope.
     NOTE(review): for a name without any ':' the local part is the empty
     string, so only the default namespace URI is emitted - confirm that this
     is intended. -->
<template name="expand-ns">
  <param name="qname" />
  <variable name="ns_prefix" select="substring-before($qname,':')" />
  <variable name="name" select="substring-after($qname,':')" />
  <!-- The nearest binding must win: take the last namespace node, in document
       order, found on the ancestor-or-self chain. A [position()=1] predicate
       inside the location step would select the outermost binding instead,
       which is wrong when a prefix is redeclared on a descendant. -->
  <choose>
    <when test="string-length($ns_prefix)>0"> <!-- prefix must be explicit -->
      <variable name="ns_uri"
        select="(ancestor-or-self::*/namespace::*[name()=$ns_prefix])[last()]" />
      <value-of select="concat($ns_uri,$name)" />
    </when>
    <when test="ancestor-or-self::*/namespace::*[name()='']"> <!-- no prefix -->
      <variable name="ns_uri"
        select="(ancestor-or-self::*/namespace::*[name()=''])[last()]" />
      <value-of select="concat($ns_uri,$name)" />
    </when>
  </choose>
</template>
<!-- determines the CURIE / URI of a node.
     Param "node": the element to identify.
     The first matching rule wins: @about, @src, @resource (without rel/rev),
     @href (without rel/rev), the document URL for h:head / h:body / h:html,
     the document URL plus '#id' for an element with @id, and otherwise a
     blank node label 'blank:node:' followed by generate-id of the node. -->
<template name="self-curie-or-uri">
  <param name="node" />
  <choose>
    <when test="$node/attribute::about"> <!-- we have an about attribute to extend -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/attribute::about" />
      </call-template>
    </when>
    <when test="$node/attribute::src"> <!-- we have an src attribute to extend -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/attribute::src" />
      </call-template>
    </when>
    <when
      test="$node/attribute::resource and not($node/attribute::rel or $node/attribute::rev)"> <!-- enforcing the resource as subject if no rel or rev -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/attribute::resource" />
      </call-template>
    </when>
    <when
      test="$node/attribute::href and not($node/attribute::rel or $node/attribute::rev)"> <!-- enforcing the href as subject if no rel or rev -->
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="$node/attribute::href" />
      </call-template>
    </when>
    <when
      test="$node/self::h:head or $node/self::h:body or $node/self::h:html"> <!-- enforcing the doc as subject -->
      <value-of select="$this" />
    </when>
    <when test="$node/attribute::id"> <!-- we have an id attribute to extend -->
      <value-of select="concat($this,'#',$node/attribute::id)" />
    </when>
    <otherwise>
      <!-- The label is built with concat so that it starts with exactly
           'blank:node:'. Written as literal text on its own line it carries
           the surrounding line breaks, and the starts-with(...,'blank:node:')
           tests in the triple-generating templates then never match. -->
      <value-of select="concat('blank:node:',generate-id($node))" />
    </otherwise>
  </choose>
</template>
<!-- expand CURIE / URI.
     Param "curie_or_uri": the raw attribute value.
     The first matching rule wins:
       [_:x]      -> blank:node:x
       [p:name]   -> namespace expansion through "expand-ns"
       #frag      -> document URL followed by the fragment
       (empty)    -> document URL
       has a ':'  -> taken as an absolute URI, returned unchanged
       /path      -> resolved against the root of the document URL
       other      -> resolved against the location of the document URL -->
<template name="expand-curie-or-uri">
  <param name="curie_or_uri" />
  <choose>
    <when test="starts-with($curie_or_uri,'[_:')"> <!-- we have a CURIE blank node -->
      <variable name="label"
        select="substring-after(substring-before($curie_or_uri,']'),'[_:')" />
      <value-of select="concat('blank:node:',$label)" />
    </when>
    <when test="starts-with($curie_or_uri,'[')"> <!-- we have a CURIE between square brackets -->
      <variable name="inner"
        select="substring-after(substring-before($curie_or_uri,']'),'[')" />
      <call-template name="expand-ns">
        <with-param name="qname" select="$inner" />
      </call-template>
    </when>
    <when test="starts-with($curie_or_uri,'#')"> <!-- we have an anchor -->
      <value-of select="concat($this,$curie_or_uri)" />
    </when>
    <when test="string-length($curie_or_uri)=0"> <!-- empty anchor means the document itself -->
      <value-of select="$this" />
    </when>
    <!-- values starting with '[' were all handled above, so a ':' now means a URI -->
    <when test="contains($curie_or_uri,':')"> <!-- it is a URI -->
      <value-of select="$curie_or_uri" />
    </when>
    <!-- from here on the value holds no ':' and therefore no '://' either -->
    <when test="not(starts-with($curie_or_uri,'/'))"> <!-- relative URL -->
      <value-of select="concat($this_location,$curie_or_uri)" />
    </when>
    <when test="starts-with($curie_or_uri,'/')"> <!-- URL from root domain -->
      <value-of select="concat($this_root,substring-after($curie_or_uri,'/'))" />
    </when>
    <otherwise>
      <!-- fallback sentinel, emitted with a line break on each side -->
      <text>&#10;UNKNOWN CURIE URI&#10;</text>
    </otherwise>
  </choose>
</template>
<!-- returns the first token in a list separated by spaces.
     Param "tokens": the list.
     Emits the text in front of the first space, or the whole value when it
     holds no space; an empty list emits nothing. -->
<template name="get-first-token">
  <param name="tokens" />
  <choose>
    <when test="contains($tokens,' ')">
      <variable name="head" select="substring-before($tokens,' ')" />
      <value-of select="normalize-space($head)" />
    </when>
    <otherwise>
      <!-- single token, or empty input for which value-of writes nothing -->
      <value-of select="$tokens" />
    </otherwise>
  </choose>
</template>
<!-- returns the namespace for an object property.
     Param "qname": one rel / rev token, optionally wrapped in square brackets.
     With a prefix, emits what "return-ns" finds for it. Without a prefix,
     emits the default vocabulary when the name is one of the reserved values
     known to "check-reserved", and nothing otherwise. -->
<template name="get-relrev-ns">
  <param name="qname" />
  <!-- Square brackets are stripped once and the stripped name is used
       everywhere below; handing the bracketed name to "return-ns" would make
       it look up a prefix that still starts with '['. -->
  <variable name="clean_name" select="translate($qname,'[]','')" />
  <variable name="ns_prefix" select="substring-before($clean_name,':')" />
  <choose>
    <when test="string-length($ns_prefix)>0">
      <call-template name="return-ns">
        <with-param name="qname" select="$clean_name" />
      </call-template>
    </when>
    <!-- returns default_voc if the predicate is a reserved value -->
    <otherwise>
      <variable name="is-reserved">
        <call-template name="check-reserved">
          <with-param name="nonprefixed">
            <call-template name="no-leading-colon">
              <with-param name="name" select="$clean_name" />
            </call-template>
          </with-param>
        </call-template>
      </variable>
      <!-- normalize-space makes the comparison independent of any white
           space emitted around the answer of "check-reserved" -->
      <if test="normalize-space($is-reserved)='true'">
        <value-of select="$default_voc" />
      </if>
    </otherwise>
  </choose>
</template>
<!-- returns the namespace for a data property.
     Param "qname": one property token, optionally wrapped in square brackets.
     With a prefix, emits what "return-ns" finds for it; without a prefix,
     emits the default vocabulary. -->
<template name="get-property-ns">
  <param name="qname" />
  <!-- Square brackets are stripped once and the stripped name is also the one
       handed to "return-ns"; with the bracketed name "return-ns" would look
       up a prefix that still starts with '['. -->
  <variable name="clean_name" select="translate($qname,'[]','')" />
  <variable name="ns_prefix" select="substring-before($clean_name,':')" />
  <choose>
    <when test="string-length($ns_prefix)>0">
      <call-template name="return-ns">
        <with-param name="qname" select="$clean_name" />
      </call-template>
    </when>
    <!-- returns default_voc otherwise -->
    <otherwise>
      <value-of select="$default_voc" />
    </otherwise>
  </choose>
</template>
<!-- returns the qname for a predicate.
     Param "qname": one predicate token.
     Square brackets are removed and the result is passed through
     "no-leading-colon". -->
<template name="get-predicate-name">
  <param name="qname" />
  <call-template name="no-leading-colon">
    <with-param name="name" select="translate($qname,'[]','')" />
  </call-template>
</template>
<!-- no leading colon.
     Param "name": a name that may start with ':'.
     Emits the name without its first character when that character is ':',
     and the name unchanged otherwise. -->
<template name="no-leading-colon">
  <param name="name" />
  <choose>
    <when test="not(starts-with($name,':'))">
      <value-of select="$name" />
    </when>
    <otherwise> <!-- drop the colon in first position -->
      <value-of select="substring($name,2)" />
    </otherwise>
  </choose>
</template>
<!-- check if a predicate is reserved.
     Param "nonprefixed": the predicate name without prefix or leading colon.
     Emits exactly 'true' when the name is one of the reserved link types
     listed below and exactly 'false' otherwise.
     The answer is written through text elements so that no line break from
     the stylesheet layout ends up in it: callers compare the result with the
     string 'true', and a value such as "(newline)true(newline)" never equals
     it. -->
<template name="check-reserved">
  <param name="nonprefixed" />
  <choose>
    <when
      test="$nonprefixed='alternate' or $nonprefixed='appendix' or $nonprefixed='bookmark' or $nonprefixed='cite'
            or $nonprefixed='chapter' or $nonprefixed='contents' or $nonprefixed='copyright' or $nonprefixed='first'
            or $nonprefixed='glossary' or $nonprefixed='help' or $nonprefixed='icon' or $nonprefixed='index'
            or $nonprefixed='last' or $nonprefixed='license' or $nonprefixed='meta' or $nonprefixed='next'
            or $nonprefixed='p3pv1' or $nonprefixed='prev' or $nonprefixed='role' or $nonprefixed='section'
            or $nonprefixed='stylesheet' or $nonprefixed='subsection' or $nonprefixed='start' or $nonprefixed='top'
            or $nonprefixed='up'">
      <text>true</text>
    </when>
    <when
      test="$nonprefixed='made' or $nonprefixed='previous' or $nonprefixed='search'"> <!-- added because they are frequent -->
      <text>true</text>
    </when>
    <otherwise>
      <text>false</text>
    </otherwise>
  </choose>
</template>
<!-- named templates to generate RDF - - - - - - - - - - - - - - - - - -
- - - - - - - - - - - - - - -->
<!-- full copy: copies the current node and, recursively, all of its
     attributes and child nodes -->
<template name="recursive-copy">
  <copy>
    <for-each select="@*|node()">
      <call-template name="recursive-copy" />
    </for-each>
  </copy>
</template>
<!-- determines current subject for the context element.
     The first matching rule wins:
       1. h:link / h:meta inside h:head without @about -> the document
       2. @about on the element
       3. @src on the element
       4. @typeof on the element -> "self-curie-or-uri" of the element
       5. h:link / h:meta outside h:head without @about -> "self-curie-or-uri" of the parent
       6. the nearest ancestor carrying about, src, typeof, resource, href, rel or rev
       7. otherwise the document -->
<template name="subject">
  <!-- true for an h:link or h:meta element that has no about attribute -->
  <variable name="link-or-meta-without-about"
    select="(self::h:link or self::h:meta) and not(@about)" />
  <!-- nearest ancestor that can provide a subject; empty when there is none -->
  <variable name="selected_ancestor"
    select="ancestor::*[@about or @src or @typeof or @resource or @href or @rel or @rev][1]" />
  <choose>
    <!-- current node is a meta or a link in the head and with no about attribute -->
    <when test="$link-or-meta-without-about and ancestor::h:head">
      <value-of select="$this" />
    </when>
    <!-- an attribute about was specified on the node -->
    <when test="@about">
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="@about" />
      </call-template>
    </when>
    <!-- an attribute src was specified on the node -->
    <when test="@src">
      <call-template name="expand-curie-or-uri">
        <with-param name="curie_or_uri" select="@src" />
      </call-template>
    </when>
    <!-- an attribute typeof was specified on the node -->
    <when test="@typeof">
      <call-template name="self-curie-or-uri">
        <with-param name="node" select="." />
      </call-template>
    </when>
    <!-- current node is a meta or a link in the body and with no about attribute -->
    <when test="$link-or-meta-without-about and not(ancestor::h:head)">
      <call-template name="self-curie-or-uri">
        <with-param name="node" select="parent::*" />
      </call-template>
    </when>
    <!-- an about was specified on an ancestor, or the ancestor had a rel or
         a rev attribute but no href, or a typeof -->
    <when test="$selected_ancestor">
      <choose>
        <!-- rel / rev without an explicit object: the subject is the blank
             node standing for the inside of that ancestor -->
        <when test="$selected_ancestor[(@rel or @rev) and not(@resource or @href)]">
          <value-of
            select="concat('blank:node:INSIDE_',generate-id($selected_ancestor))" />
        </when>
        <when test="$selected_ancestor/@about">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$selected_ancestor/@about" />
          </call-template>
        </when>
        <when test="$selected_ancestor/@src">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$selected_ancestor/@src" />
          </call-template>
        </when>
        <when test="$selected_ancestor/@resource">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$selected_ancestor/@resource" />
          </call-template>
        </when>
        <when test="$selected_ancestor/@href">
          <call-template name="expand-curie-or-uri">
            <with-param name="curie_or_uri" select="$selected_ancestor/@href" />
          </call-template>
        </when>
        <otherwise>
          <call-template name="self-curie-or-uri">
            <with-param name="node" select="$selected_ancestor" />
          </call-template>
        </otherwise>
      </choose>
    </when>
    <otherwise> <!-- it must be about the current document -->
      <value-of select="$this" />
    </otherwise>
  </choose>
</template>
<!-- recursive call for object(s) of object properties.
     Walks the child elements of the context element and emits one
     identifier per object found, each followed by a single space. A child
     that carries none of the RDFa attributes tested below is searched
     recursively. -->
<template name="recurse-objects">
  <for-each select="*">
    <choose>
      <when test="@about or @src"> <!-- there is a known resource -->
        <call-template name="expand-curie-or-uri">
          <with-param name="curie_or_uri" select="@about | @src" />
        </call-template>
        <text> </text>
      </when>
      <when
        test="(@resource or @href) and not(@rel or @rev or @property)"> <!-- there is an incomplete triple -->
        <call-template name="expand-curie-or-uri">
          <with-param name="curie_or_uri" select="@resource | @href" />
        </call-template>
        <text> </text>
      </when>
      <when test="@typeof and not(@about)"> <!-- there is an implicit resource -->
        <call-template name="self-curie-or-uri">
          <with-param name="node" select="." />
        </call-template>
        <text> </text>
      </when>
      <when test="@rel or @rev or @property"> <!-- there is an implicit resource -->
        <!-- generate the triple only once: only the first sibling carrying
             rel, rev or property contributes the subject -->
        <if test="not(preceding-sibling::*[@rel or @rev or @property])">
          <call-template name="subject" />
          <text> </text>
        </if>
      </when>
      <otherwise> <!-- nothing at that level thus consider children -->
        <call-template name="recurse-objects" />
      </otherwise>
    </choose>
  </for-each>
</template>
<!-- generate recursive call for multiple objects in rel or rev.
     Param "subject": identifier of the subject.
     Param "object":  space-separated list of object identifiers.
     For the first object of the list a relation is generated for @rel
     (subject to object) and for @rev (object to subject) of the context
     element; the template then calls itself with the rest of the list. -->
<template name="relrev">
  <param name="subject" />
  <param name="object" />
  <!-- first object of the list -->
  <variable name="single-object">
    <call-template name="get-first-token">
      <with-param name="tokens" select="$object" />
    </call-template>
  </variable>
  <!-- objects still to be handled after this one -->
  <variable name="other-objects"
    select="normalize-space(substring-after($object,' '))" />
  <if test="@rel != ''">
    <call-template name="relation">
      <with-param name="subject" select="$subject" />
      <with-param name="object" select="$single-object" />
      <with-param name="predicate" select="@rel" />
    </call-template>
  </if>
  <if test="@rev != ''">
    <!-- rev: subject and object swap roles -->
    <call-template name="relation">
      <with-param name="subject" select="$single-object" />
      <with-param name="object" select="$subject" />
      <with-param name="predicate" select="@rev" />
    </call-template>
  </if>
  <!-- recursive call for the remaining objects -->
  <if test="string-length($other-objects)>0">
    <call-template name="relrev">
      <with-param name="subject" select="$subject" />
      <with-param name="object" select="$other-objects" />
    </call-template>
  </if>
</template>
<!-- generate an RDF statement for a relation (object property).
Param "subject": identifier of the subject; a value starting with 'blank:node:' is handed over as node id, anything else as URI.
Param "predicate": space-separated list of rel / rev tokens; the first one is handled here, the rest by a recursive call.
Param "object": identifier of the object, treated like the subject.
The statement is handed to the parser through the RDFaTripleImpl extension functions. -->
<template name="relation">
<param name="subject" />
<param name="predicate" />
<param name="object" />
<!-- test for multiple predicates: isolate the first one -->
<variable name="single-predicate">
<call-template name="get-first-token">
<with-param name="tokens" select="$predicate" />
</call-template>
</variable>
<!-- get namespace of the predicate -->
<variable name="predicate-ns">
<call-template name="get-relrev-ns">
<with-param name="qname" select="$single-predicate" />
</call-template>
</variable>
<!-- get name of the predicate -->
<!-- NOTE(review): predicate-name is not referenced again inside this template -->
<variable name="predicate-name">
<call-template name="get-predicate-name">
<with-param name="qname" select="$single-predicate" />
</call-template>
</variable>
<choose>
<when test="string-length($predicate-ns)>0"> <!-- there is a known namespace for the predicate -->
<!-- subject: node id for a blank node, URI otherwise -->
<choose>
<when test="starts-with($subject,'blank:node:')">
<value-of
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectNodeID($parser,substring-after($subject,'blank:node:'))" />
</when>
<otherwise>
<value-of
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectURI($parser,$subject)" />
</otherwise>
</choose>
<!-- get full predicate -->
<variable name="expanded-predicate">
<call-template name="expand-ns">
<with-param name="qname" select="$single-predicate" />
</call-template>
</variable>
<value-of
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setThePropertyURI($parser,$expanded-predicate)" />
<!-- object: node id for a blank node, URI otherwise -->
<choose>
<when test="starts-with($object,'blank:node:')">
<value-of
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheObjectNodeID($parser,substring-after($object,'blank:node:'))" />
</when>
<otherwise>
<value-of
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheObjectURI($parser,$object)" />
</otherwise>
</choose>
</when>
<otherwise> <!-- no namespace generate a comment for debug -->
<xsl:comment>
No namespace for the rel or rev value ; could not produce the
triple for:
<value-of select="$subject" />
-
<value-of select="$single-predicate" />
-
<value-of select="$object" />
</xsl:comment>
</otherwise>
</choose>
<!-- NOTE(review): the flush is reached on both branches above, i.e. also when no statement was set because the namespace is unknown - confirm that flushObjectProperty copes with that -->
<value-of
select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.flushObjectProperty($parser)" />
<!-- recursive call for multiple predicates -->
<variable name="other-predicates"
select="normalize-space(substring-after($predicate,' '))" />
<if test="string-length($other-predicates)>0">
<call-template name="relation">
<with-param name="subject" select="$subject" />
<with-param name="predicate" select="$other-predicates" />
<with-param name="object" select="$object" />
</call-template>
</if>
</template>
<!-- generate an RDF statement for a property (data property).
     Param "subject":   identifier of the subject; a value starting with
                        'blank:node:' is handed over as node id, anything else
                        as URI.
     Param "predicate": space-separated list of property tokens; the first one
                        is handled here, the rest by a recursive call.
     Param "object":    the literal value (attribute or element).
     Param "datatype":  datatype URI, empty for a plain literal.
     Param "attrib":    'true' when the value is to be taken as normalized
                        text, 'false' when the node itself is handed over.
     Param "language":  language tag, may be empty.
     The statement is handed to the parser through the RDFaTripleImpl
     extension functions. -->
<template name="property">
  <param name="subject" />
  <param name="predicate" />
  <param name="object" />
  <param name="datatype" />
  <param name="attrib" /> <!-- is the content from an attribute ? true /false -->
  <param name="language" />
  <!-- test for multiple predicates: isolate the first one -->
  <variable name="single-predicate">
    <call-template name="get-first-token">
      <with-param name="tokens" select="$predicate" />
    </call-template>
  </variable>
  <!-- get namespace of the predicate -->
  <variable name="predicate-ns">
    <call-template name="get-property-ns">
      <with-param name="qname" select="$single-predicate" />
    </call-template>
  </variable>
  <!-- get name of the predicate -->
  <variable name="predicate-name">
    <call-template name="get-predicate-name">
      <with-param name="qname" select="$single-predicate" />
    </call-template>
  </variable>
  <!-- The datatype may reach this template with line breaks around it when
       the caller wrote it as literal text; trim it once so that it is handed
       to the parser as a clean URI. -->
  <variable name="datatype-uri" select="normalize-space($datatype)" />
  <choose>
    <when test="string-length($predicate-ns)>0"> <!-- there is a known namespace for the predicate -->
      <!-- NOTE(review): the predicate is handed over as its qname, whereas
           "relation" hands over the expanded URI - confirm what
           setThePropertyURI expects here -->
      <value-of
        select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setThePropertyURI($parser,$predicate-name)" />
      <!-- subject: node id for a blank node, URI otherwise. The prefix tested
           must be the same 'blank:node:' that is cut off below. -->
      <choose>
        <when test="starts-with($subject,'blank:node:')">
          <value-of
            select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectNodeID($parser,substring-after($subject,'blank:node:'))" />
        </when>
        <otherwise>
          <value-of
            select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectURI($parser,$subject)" />
        </otherwise>
      </choose>
      <if test="string-length($language)>0">
        <value-of
          select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheLanguage($parser,$language)" />
      </if>
      <!-- datatype, XMLLiteral or any other; skipped for a plain literal -->
      <if test="string-length($datatype-uri)>0">
        <value-of
          select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheDatatype($parser,$datatype-uri)" />
      </if>
      <!-- value: handled the same way whatever the datatype -->
      <choose>
        <when test="$attrib='true'"> <!-- content is in an attribute -->
          <value-of
            select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,normalize-space(string($object)))" />
        </when>
        <otherwise> <!-- content is in the element and may include some tags -->
          <value-of
            select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheValue($parser,$object)" />
        </otherwise>
      </choose>
    </when>
    <otherwise> <!-- generate a comment for debug -->
      <!-- the content of the comment element is kept flush left so that the
           generated debug text stays exactly what it was -->
      <xsl:comment>
Could not produce the triple for:
<value-of select="$subject" />
-
<value-of select="$single-predicate" />
-
<value-of select="$object" />
      </xsl:comment>
    </otherwise>
  </choose>
  <value-of
    select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.flushDataProperty($parser)" />
  <!-- recursive call for multiple predicates -->
  <variable name="other-predicates"
    select="normalize-space(substring-after($predicate,' '))" />
  <if test="string-length($other-predicates)>0">
    <call-template name="property">
      <with-param name="subject" select="$subject" />
      <with-param name="predicate" select="$other-predicates" />
      <with-param name="object" select="$object" />
      <with-param name="datatype" select="$datatype" />
      <with-param name="attrib" select="$attrib" />
      <with-param name="language" select="$language" />
    </call-template>
  </if>
</template>
<!-- generate an RDF statement for a class.
     Param "resource": identifier of the typed resource; a value starting with
                       'blank:node:' is handed over as node id, anything else
                       as URI.
     Param "class":    space-separated list of class names; the first one is
                       handled here, the rest by a recursive call.
     A class whose namespace cannot be determined is skipped. -->
<template name="class">
  <param name="resource" />
  <param name="class" />
  <!-- case multiple classes: isolate the first one -->
  <variable name="first-class">
    <call-template name="get-first-token">
      <with-param name="tokens" select="$class" />
    </call-template>
  </variable>
  <!-- get namespace of the class -->
  <variable name="first-class-ns">
    <call-template name="return-ns">
      <with-param name="qname" select="$first-class" />
    </call-template>
  </variable>
  <!-- classes still to be handled after this one -->
  <variable name="remaining-classes"
    select="normalize-space(substring-after($class,' '))" />
  <if test="string-length($first-class-ns)>0"> <!-- we have a qname for the class -->
    <variable name="expanded-class">
      <call-template name="expand-ns">
        <with-param name="qname" select="$first-class" />
      </call-template>
    </variable>
    <!-- subject: node id for a blank node, URI otherwise -->
    <choose>
      <when test="starts-with($resource,'blank:node:')">
        <value-of
          select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectNodeID($parser,substring-after($resource,'blank:node:'))" />
      </when>
      <otherwise>
        <value-of
          select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheSubjectURI($parser,$resource)" />
      </otherwise>
    </choose>
    <!-- predicate rdf:type, object the expanded class, then flush -->
    <value-of
      select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setThePropertyURI($parser,'http://www.w3.org/1999/02/22-rdf-syntax-ns#type')" />
    <value-of
      select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.setTheObjectURI($parser,$expanded-class)" />
    <value-of
      select="java:net.yacy.document.parser.rdfa.impl.RDFaTripleImpl.flushObjectProperty($parser)" />
  </if>
  <!-- recursive call for multiple classes -->
  <if test="string-length($remaining-classes)>0">
    <call-template name="class">
      <with-param name="resource" select="$resource" />
      <with-param name="class" select="$remaining-classes" />
    </call-template>
  </if>
</template>
<!-- ignore the rest of the DOM: any other element, text or attribute node
     produces no output of its own; only its children are visited, in the
     same mode -->
<template match="*|text()|@*" mode="rdf2rdfxml">
  <apply-templates select="node()" mode="rdf2rdfxml" />
</template>
</stylesheet>

@ -52,7 +52,7 @@ border-bottom: 1px solid #ffffff;
</div>
<div id="sidebar-document" class="sci_panel" style="top: 70px;">
<div id="sidebar-document" class="sci_panel" style="top: 110px;">
<p>document</p> <img src="/currentyacypeer/env/grafics/bad.png" onclick="contribution (document.location.href, 'like document', '#[username]#'); return false">

@ -48,6 +48,7 @@ import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.pptParser;
import net.yacy.document.parser.psParser;
import net.yacy.document.parser.rdfParser;
import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
@ -59,7 +60,9 @@ import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.augment.AugmentParser;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
@ -81,7 +84,7 @@ public final class TextParser {
initParser(new csvParser());
initParser(new docParser());
initParser(new gzipParser());
initParser(new htmlParser());
initParser(new htmlParser("HTML Parser"));
initParser(new genericImageParser());
initParser(new mmParser());
initParser(new odtParser());
@ -100,6 +103,11 @@ public final class TextParser {
initParser(new vsdParser());
initParser(new xlsParser());
initParser(new zipParser());
initParser(new RDFaParser("RDFa Parser"));
initParser(new rdfParser());
initParser(new AugmentParser("Augment Parser"));
}
public static Set<Parser> parsers() {

@ -0,0 +1,135 @@
package net.yacy.document.parser.augment;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import net.yacy.yacy;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.rdfa.IRDFaTriple;
import net.yacy.document.parser.rdfa.impl.RDFaParser;
import net.yacy.document.parser.rdfa.impl.RDFaTripleImpl;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
/**
 * Parser that runs the inherited {@link RDFaParser} parsing and appends two
 * additional placeholder documents to its result.
 *
 * <p>The result of {@link #parse} is laid out as: all documents returned by the
 * super class, then an empty "super" document, then the document produced by
 * {@link #parseAndAugment}. Both extra documents currently carry no content;
 * they are hooks for external augmentation routines.</p>
 */
public class AugmentParser extends RDFaParser {

    /** Application name used for log entries of this parser. */
    private static final String LOG_NAME = "AUGMENT PARSER";

    /** File extensions removed from the inherited registration. */
    private static final String[] REMOVED_EXTENSIONS = {
        "htm", "html", "shtml", "xhtml", "php", "php3", "php4", "php5", "cfm",
        "asp", "aspx", "tex", "txt", "jsp", "mf", "pl", "py"
    };

    /** MIME types removed from the inherited registration. */
    private static final String[] REMOVED_MIME_TYPES = {
        "text/html", "text/xhtml+xml", "application/xhtml+xml",
        "application/x-httpd-php", "application/x-tex", "text/plain",
        "text/sgml", "text/csv"
    };

    /** File extensions this parser registers itself for. */
    private static final String[] ADDED_EXTENSIONS = { "html", "php", "htm" };

    /** MIME types this parser registers itself for. */
    private static final String[] ADDED_MIME_TYPES = { "text/html", "text/xhtml+xml" };

    /**
     * Creates the parser and restricts the inherited registration to the
     * extensions and MIME types listed in {@link #ADDED_EXTENSIONS} and
     * {@link #ADDED_MIME_TYPES}.
     *
     * @param name the parser name handed to the super class
     */
    public AugmentParser(String name) {
        super(name);
        Log.logInfo(LOG_NAME, "augmented parser was initialized");

        for (String extension : REMOVED_EXTENSIONS) {
            SUPPORTED_EXTENSIONS.remove(extension);
        }
        for (String mimeType : REMOVED_MIME_TYPES) {
            SUPPORTED_MIME_TYPES.remove(mimeType);
        }
        for (String extension : ADDED_EXTENSIONS) {
            SUPPORTED_EXTENSIONS.add(extension);
        }
        for (String mimeType : ADDED_MIME_TYPES) {
            SUPPORTED_MIME_TYPES.add(mimeType);
        }
    }

    /**
     * Parses the source with the super class and appends the two placeholder
     * documents.
     *
     * @param url      location of the parsed resource
     * @param mimeType MIME type of the resource
     * @param charset  character set name of the resource
     * @param source   the content stream; it is reset after the super call
     * @return the super class documents followed by the "super" document and
     *         the augment document (never {@code null})
     * @throws Failure              propagated from the super class
     * @throws InterruptedException propagated from the super class
     */
    @Override
    public Document[] parse(MultiProtocolURI url, String mimeType,
            String charset, InputStream source) throws Failure,
            InterruptedException {

        Document[] parsedDocs = super.parse(url, mimeType, charset, source);
        if (parsedDocs == null) {
            // The super class may hand back null; treat that as "no documents"
            // instead of failing with a NullPointerException below.
            parsedDocs = new Document[0];
        }

        try {
            source.reset();
        } catch (IOException e) {
            Log.logWarning(LOG_NAME, "Could not reset source stream of " + url
                    + ": " + e.getMessage());
        }

        // Placeholder without text content. It is meant to be filled when a
        // trigger keyword is found in the parsed document; that logic is not
        // implemented yet.
        Document superDoc = new Document(url, mimeType, charset, null, null, null, "", "",
                "", null, "", 0, 0, null, null, null, null, false);

        Document augmentDoc = parseAndAugment(url, mimeType, charset, source);

        Document[] retDocs = new Document[parsedDocs.length + 2];
        System.arraycopy(parsedDocs, 0, retDocs, 0, parsedDocs.length);
        retDocs[retDocs.length - 2] = superDoc;
        retDocs[retDocs.length - 1] = augmentDoc;
        return retDocs;
    }

    /**
     * Builds the augment document. It currently has empty content; external
     * routines are expected to add information here later.
     *
     * @param url      location of the parsed resource
     * @param mimeType MIME type of the resource
     * @param charset  character set name of the resource
     * @param source   the content stream (not read at the moment)
     * @return a document for {@code url} with empty text content
     */
    private Document parseAndAugment(MultiProtocolURI url,
            String mimeType, String charset, InputStream source) {
        // add even more information to the document in external routines.
        byte[] content = new byte[0];
        return new Document(url, mimeType, charset, null, null, null, "", "",
                "", null, "", 0, 0, content, null, null, null, false);
    }
}

@ -52,8 +52,8 @@ public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
public htmlParser() {
super("HTML Parser");
public htmlParser(String name) {
super(name);
this.SUPPORTED_EXTENSIONS.add("htm");
this.SUPPORTED_EXTENSIONS.add("html");
this.SUPPORTED_EXTENSIONS.add("phtml");
@ -298,7 +298,7 @@ public class htmlParser extends AbstractParser implements Parser {
try {
url = new MultiProtocolURI(args[0]);
final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
final Document[] document = new htmlParser("HTML Parser").parse(url, "text/html", null, new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);
System.out.println(CharacterCoding.unicode2html(title, false));

@ -0,0 +1,82 @@
/**
* rdfParser.java
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
* First released 20.08.2010 at http://yacy.net
*
* $LastChangedDate: 2011-04-21 15:58:49 +0200 (Do, 21 Apr 2011) $
* $LastChangedRevision: 7672 $
* $LastChangedBy: orbiter $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.Hit;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.html.ImageEntry;
public class rdfParser extends AbstractParser implements Parser {
public rdfParser() {
super("RDF Parser");
SUPPORTED_EXTENSIONS.add("rdf");
SUPPORTED_MIME_TYPES.add("application/rdf+xml");
}
public Document[] parse(final MultiProtocolURI url, final String mimeType,
final String charset, final InputStream source)
throws Failure, InterruptedException {
// this function currently only registers detected rdf files.
// next step: load rdf content into triplestore.
final List<Document> docs = new ArrayList<Document>();
Document doc;
String all = "rdfdatasource";
doc = new Document(url, mimeType, charset, null, null, null, "", "",
"", null, "", 0, 0, all.getBytes(), null, null, null, false);
docs.add(doc);
final Document[] da = new Document[docs.size()];
docs.toArray(da);
return da;
}
}

@ -0,0 +1,14 @@
package net.yacy.document.parser.rdfa;
/**
 * Read-only view of one extracted RDFa triple. All parts are plain strings;
 * a part may be {@code null} when it was not set for the triple.
 */
public interface IRDFaTriple {

    // --- subject ---

    /** @return the URI of the subject */
    String getSubjectURI();

    /** @return the node identifier of the subject (presumably used for blank nodes; verify against the stylesheet) */
    String getSubjectNodeURI();

    // --- property ---

    /** @return the URI of the property */
    String getPropertyURI();

    // --- object ---

    /** @return the URI of the object */
    String getObjectURI();

    /** @return the node identifier of the object (presumably used for blank nodes; verify against the stylesheet) */
    String getObjectNodeURI();

    // --- literal ---

    /** @return the value of the triple */
    String getValue();

    /** @return the data type of the value */
    String getDataType();

    /** @return the language of the value */
    String getLanguage();
}

@ -0,0 +1,170 @@
/**
*
*/
package net.yacy.document.parser.rdfa.impl;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.rdfa.IRDFaTriple;
import net.yacy.kelondro.logging.Log;
/**
 * HTML parser that additionally extracts RDFa triples from selected sources.
 *
 * <p>Every resource is first parsed by {@link htmlParser}. For URLs containing
 * {@code ".yacy"} or {@code "experiments"} the source is read a second time
 * through {@link RDFaTripleImpl}, and the extracted triples are appended to the
 * result as one more document.</p>
 *
 * @author fgandon
 */
public class RDFaParser extends htmlParser {

    /** Application name used for log entries of this parser. */
    private static final String LOG_NAME = "RDFA PARSER";

    /** File extensions removed from the registration inherited from htmlParser. */
    private static final String[] REMOVED_EXTENSIONS = {
        "htm", "html", "shtml", "xhtml", "php", "php3", "php4", "php5", "cfm",
        "asp", "aspx", "tex", "txt", "jsp", "mf", "pl", "py"
    };

    /** MIME types removed from the registration inherited from htmlParser. */
    private static final String[] REMOVED_MIME_TYPES = {
        "text/html", "text/xhtml+xml", "application/xhtml+xml",
        "application/x-httpd-php", "application/x-tex", "text/plain",
        "text/sgml", "text/csv"
    };

    /** File extensions this parser registers itself for. */
    private static final String[] ADDED_EXTENSIONS = { "html", "php", "htm" };

    /** MIME types this parser registers itself for. */
    private static final String[] ADDED_MIME_TYPES = { "text/html", "text/xhtml+xml" };

    /**
     * Creates the parser and narrows the registration inherited from
     * {@link htmlParser} to {@link #ADDED_EXTENSIONS} and {@link #ADDED_MIME_TYPES}.
     *
     * @param name the parser name handed to the super class
     */
    public RDFaParser(String name) {
        super(name);

        for (String extension : REMOVED_EXTENSIONS) {
            SUPPORTED_EXTENSIONS.remove(extension);
        }
        for (String mimeType : REMOVED_MIME_TYPES) {
            SUPPORTED_MIME_TYPES.remove(mimeType);
        }
        for (String extension : ADDED_EXTENSIONS) {
            SUPPORTED_EXTENSIONS.add(extension);
        }
        for (String mimeType : ADDED_MIME_TYPES) {
            SUPPORTED_MIME_TYPES.add(mimeType);
        }
    }

    /**
     * Parses the HTML content and, for selected URLs, appends a document built
     * from the RDFa triples found in the same source.
     *
     * @param url      location of the parsed resource
     * @param mimeType MIME type of the resource
     * @param charset  character set name of the resource, may be {@code null}
     * @param source   the content stream; it must support {@code reset()} for the
     *                 RDFa pass to see the content again
     * @return the HTML documents, followed by the RDFa document when the URL is
     *         selected for RDFa parsing
     * @throws Failure              propagated from the HTML parser
     * @throws InterruptedException propagated from the HTML parser
     */
    @Override
    public Document[] parse(MultiProtocolURI url, String mimeType,
            String charset, InputStream source) throws Failure,
            InterruptedException {

        Document[] htmlDocs = parseHtml(url, mimeType, charset, source);

        // TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
        String location = url.toString();
        boolean selected = location.contains(".yacy") || location.contains("experiments");
        if (!selected) {
            return htmlDocs;
        }

        Document rdfaDoc = parseRDFa(url, mimeType, charset, source);
        if (htmlDocs == null) {
            // No HTML result available: still deliver the RDFa document instead
            // of failing with a NullPointerException.
            return new Document[] { rdfaDoc };
        }

        Document[] retDocs = new Document[htmlDocs.length + 1];
        System.arraycopy(htmlDocs, 0, retDocs, 0, htmlDocs.length);
        retDocs[htmlDocs.length] = rdfaDoc;
        return retDocs;
    }

    /**
     * Extracts the RDFa triples of the source and converts them to a document.
     * Failures are logged and lead to a document without text content.
     */
    private Document parseRDFa(MultiProtocolURI url, String mimeType,
            String charset, InputStream source) {
        IRDFaTriple[] allTriples = null;
        try {
            RDFaTripleImpl triple = new RDFaTripleImpl(openReader(source, charset),
                    url.toString());
            allTriples = triple.parse();
        } catch (Exception e) {
            Log.logWarning(LOG_NAME, "Triple extraction failed: " + e.getMessage());
        }

        // Only attempt the conversion when extraction delivered something;
        // a failed extraction must not be reported as a conversion failure.
        if (allTriples != null && allTriples.length > 0) {
            try {
                return convertAllTriplesToDocument(url, mimeType, charset, allTriples);
            } catch (Exception e) {
                Log.logWarning(LOG_NAME,
                        "Conversion triple to document failed: " + e.getMessage());
            }
        }

        return new Document(url, mimeType, charset, null, null, null, "", "",
                "", null, "", 0, 0, null, null, null, null, false);
    }

    /**
     * Wraps the source in a reader that decodes with the declared charset.
     * Falls back to the platform default when no usable charset name is given.
     */
    private static InputStreamReader openReader(InputStream source, String charset) {
        if (charset != null && charset.length() > 0) {
            try {
                return new InputStreamReader(source,
                        java.nio.charset.Charset.forName(charset));
            } catch (IllegalArgumentException e) {
                // illegal or unsupported charset name: use the default below
                Log.logWarning(LOG_NAME, "Unusable charset '" + charset
                        + "', falling back to platform default");
            }
        }
        return new InputStreamReader(source);
    }

    /**
     * Runs the HTML parser of the super class and resets the source so that it
     * can be read again for the RDFa pass.
     */
    private Document[] parseHtml(MultiProtocolURI url, String mimeType,
            String charset, InputStream source) throws Failure,
            InterruptedException {

        Document[] htmlDocs = null;
        try {
            htmlDocs = super.parse(url, mimeType, charset, source);
            source.reset();
        } catch (IOException e1) {
            if (htmlDocs == null) {
                Log.logWarning(LOG_NAME, "Super call failed");
            } else {
                // the HTML parse succeeded, only the rewind of the stream failed
                Log.logWarning(LOG_NAME, "Could not reset source stream of " + url
                        + ": " + e1.getMessage());
            }
        }
        return htmlDocs;
    }

    /**
     * Builds a document whose text is a comma-terminated list of keywords, one
     * per distinct triple. A keyword is the property URI, the letter "Z" and the
     * value, with ':' replaced by 'X' and '_', ' ', '.', ',' replaced by 'Y'.
     */
    private Document convertAllTriplesToDocument(MultiProtocolURI url,
            String mimeType, String charset, IRDFaTriple[] allTriples) {

        Set<String> keywords = new HashSet<String>(allTriples.length);
        for (IRDFaTriple irdFaTriple : allTriples) {
            // NOTE(review): a null property or value ends up as the text "null"
            // inside the keyword because of the string concatenation.
            addNotEmptyValuesToSet(keywords, irdFaTriple.getPropertyURI() + "Z"
                    + irdFaTriple.getValue());
        }

        StringBuilder all = new StringBuilder();
        for (String keyword : keywords) {
            String encoded = keyword
                    .replace(':', 'X')
                    .replace('_', 'Y')
                    .replace(' ', 'Y')
                    .replace('.', 'Y')
                    .replace(',', 'Y');
            all.append(encoded).append(',');
        }

        // Explicit charset: the encoded bytes must not depend on the platform default.
        byte[] content = all.toString().getBytes(java.nio.charset.Charset.forName("UTF-8"));

        return new Document(url, mimeType, charset, null, null, null, "", "",
                "", null, "", 0, 0, content, null, null, null, false);
    }

    /** Adds the value to the set unless it is {@code null}. */
    private void addNotEmptyValuesToSet(Set<String> set, String value) {
        if (value != null) {
            set.add(value);
        }
    }
}

@ -0,0 +1,63 @@
package net.yacy.document.parser.rdfa.impl;
import net.yacy.document.parser.rdfa.IRDFaTriple;
/**
 * Immutable value holder for one RDFa triple. Every part is stored exactly as
 * passed to the constructor and returned unchanged by the matching getter.
 */
public class RDFaTripleContent implements IRDFaTriple {

    // subject
    private final String subjectURI;
    private final String subjectNodeURI;

    // property
    private final String propertyURI;

    // object
    private final String objectURI;
    private final String objectNodeURI;

    // literal
    private final String value;
    private final String dataType;
    private final String language;

    /**
     * Stores all parts of the triple.
     *
     * @param subjectURI     URI of the subject
     * @param subjectNodeURI node identifier of the subject
     * @param propertyURI    URI of the property
     * @param value          value of the triple
     * @param dataType       data type of the value
     * @param language       language of the value
     * @param objectNodeURI  node identifier of the object
     * @param objectURI      URI of the object
     */
    public RDFaTripleContent(String subjectURI, String subjectNodeURI,
            String propertyURI, String value, String dataType, String language,
            String objectNodeURI, String objectURI) {
        this.subjectURI = subjectURI;
        this.subjectNodeURI = subjectNodeURI;
        this.propertyURI = propertyURI;
        this.objectURI = objectURI;
        this.objectNodeURI = objectNodeURI;
        this.value = value;
        this.dataType = dataType;
        this.language = language;
    }

    /** @return the URI of the subject */
    @Override
    public String getSubjectURI() {
        return this.subjectURI;
    }

    /** @return the node identifier of the subject */
    @Override
    public String getSubjectNodeURI() {
        return this.subjectNodeURI;
    }

    /** @return the URI of the property */
    @Override
    public String getPropertyURI() {
        return this.propertyURI;
    }

    /** @return the URI of the object */
    @Override
    public String getObjectURI() {
        return this.objectURI;
    }

    /** @return the node identifier of the object */
    @Override
    public String getObjectNodeURI() {
        return this.objectNodeURI;
    }

    /** @return the value of the triple */
    @Override
    public String getValue() {
        return this.value;
    }

    /** @return the data type of the value */
    @Override
    public String getDataType() {
        return this.dataType;
    }

    /** @return the language of the value */
    @Override
    public String getLanguage() {
        return this.language;
    }
}

@ -0,0 +1,159 @@
package net.yacy.document.parser.rdfa.impl;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import javax.xml.transform.Templates;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import net.yacy.document.parser.rdfa.IRDFaTriple;
import net.yacy.kelondro.logging.Log;
import net.yacy.yacy;
public class RDFaTripleImpl{
private static Templates templates = null;
private String propertyURI = null;
private String subjectURI = null;
private String subjectNodeURI = null;
private String objectURI = null;
private String objectNodeURI = null;
private String value = null;
private String dataType = null;
private String language = null;
private Reader in;
private Transformer aTransformer;
private ArrayList<IRDFaTriple> allRDFaTriples = new ArrayList<IRDFaTriple>();
public RDFaTripleImpl(Reader in, String base) throws IOException,
TransformerException, TransformerConfigurationException {
BufferedReader bufReader = new BufferedReader(in);
String readLine = bufReader.readLine();
if (!readLine.toLowerCase().contains("<!doctype")){
bufReader.reset();
}
if (templates == null) {
try{
File f = new File (yacy.homedir+File.separatorChar+"RDFaParser"+File.separatorChar+"RDFaParser.xsl");
StreamSource aSource = new StreamSource(f);
TransformerFactory aFactory = TransformerFactory.newInstance();
templates = aFactory.newTemplates(aSource);
}
catch(Exception e){
Log.logSevere("RDFA PARSER", "XSL template could not be loaded from "+yacy.homedir+File.separatorChar+"RDFaParser"+File.separatorChar+"RDFaParser.xsl");
}
}
this.aTransformer = templates.newTransformer();
aTransformer.setParameter("parser", this);
aTransformer.setParameter("url", base);
this.in = bufReader;
}
public IRDFaTriple[] parse() {
try {
aTransformer.transform(new StreamSource(in), new StreamResult(System.out));
} catch (TransformerException e) {
Log.logWarning("RDFA PARSER", "Error while reading RDFa");
// e.printStackTrace();
}
return allRDFaTriples .toArray(new IRDFaTriple[]{});
}
public static boolean flushDataProperty(Object oparser) {
RDFaTripleImpl parser = ((RDFaTripleImpl)oparser);
parser.reportDataProperty(parser.subjectURI, parser.subjectNodeURI, parser.propertyURI,
parser.value, parser.dataType, parser.language, parser.objectNodeURI, parser.objectURI);
nullAllValues(parser);
return true;
}
private void reportDataProperty(String subjectURI, String subjectNodeURI,
String propertyURI, String value, String dataType,
String language, String objectNodeURI, String objectURI) {
IRDFaTriple triple = new RDFaTripleContent(subjectURI,subjectNodeURI,propertyURI,value,dataType,language, objectNodeURI,objectURI);
allRDFaTriples.add(triple);
}
private static void nullAllValues(RDFaTripleImpl parser) {
parser.propertyURI = null;
parser.subjectURI = null;
parser.subjectNodeURI = null;
parser.objectURI = null;
parser.objectNodeURI = null;
parser.value = null;
parser.dataType = null;
parser.language = null;
}
public static boolean flushObjectProperty(Object oparser) {
RDFaTripleImpl parser = ((RDFaTripleImpl)oparser);
// System.out.println("parser added");
nullAllValues(parser);
return true;
}
public static boolean setTheDatatype(Object parser, String theDatatype) {
((RDFaTripleImpl)parser).dataType = theDatatype;
System.out.println(theDatatype);
return true;
}
public static boolean setTheLanguage(Object parser, String theLanguage) {
((RDFaTripleImpl)parser).language = theLanguage;
return true;
}
public static boolean setTheObjectNodeID(Object parser, String theObjectNodeID) {
((RDFaTripleImpl)parser).objectNodeURI = theObjectNodeID;
return true;
}
public static boolean setTheObjectURI(Object parser, String theObjectURI) {
((RDFaTripleImpl)parser).objectURI = theObjectURI;
return true;
}
public static boolean setThePropertyURI(Object parser, String thePropertyURI) {
((RDFaTripleImpl)parser).propertyURI = thePropertyURI;
return true;
}
public static boolean setTheSubjectNodeID(Object parser, String theSubjectNodeID) {
((RDFaTripleImpl)parser).subjectNodeURI = theSubjectNodeID;
System.out.println(theSubjectNodeID);
return true;
}
public static boolean setTheSubjectURI(Object parser, String theSubjectURI) {
((RDFaTripleImpl)parser).subjectURI = theSubjectURI;
return true;
}
public static boolean setTheValue(Object parser, String theValue) {
((RDFaTripleImpl)parser).value = theValue;
return true;
}
}

@ -0,0 +1,67 @@
package net.yacy.document.parser.rdfa;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.Parser.Failure;
import net.yacy.document.parser.rdfa.impl.RDFaParser;
/**
 * Command-line entry point that runs {@link RDFaParser} on a local file or a
 * URL given as the single argument.
 */
public class main {

    /**
     * Loads the content named by the first argument and parses it.
     *
     * @param args exactly one element: a file path or a URL
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out
                    .println("Usage: one and only one argument giving a file path or a URL.");
            return;
        }

        URL aURL = toUrl(args[0]);
        if (aURL == null) {
            System.out.println("File or URL not recognized.");
            return;
        }

        RDFaParser aParser = new RDFaParser("html");
        try {
            // The parser rewinds the stream between its HTML and RDFa passes,
            // so the content is buffered in memory and handed over as a
            // ByteArrayInputStream, which supports reset().
            byte[] content = readFully(aURL);
            // NOTE(review): for a local file the "file:" URL form is passed on;
            // confirm that MultiProtocolURI accepts it.
            aParser.parse(new MultiProtocolURI(aURL.toString()), "", "",
                    new java.io.ByteArrayInputStream(content));
        } catch (IOException e) {
            e.printStackTrace();
        } catch (Failure e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    /**
     * Interprets the argument as an existing local file first and as a URL
     * otherwise.
     *
     * @return the URL to read from, or {@code null} if the argument is neither
     */
    private static URL toUrl(String location) {
        try {
            File aFile = new File(location);
            if (aFile.exists()) {
                return aFile.toURI().toURL();
            }
            return new URL(location);
        } catch (MalformedURLException e) {
            // not a usable URL; the caller reports this to the user
            return null;
        }
    }

    /** Reads the complete content behind the URL and closes the stream. */
    private static byte[] readFully(URL location) throws IOException {
        java.io.InputStream in = location.openStream();
        try {
            java.io.ByteArrayOutputStream buffer = new java.io.ByteArrayOutputStream();
            byte[] chunk = new byte[8192];
            int count;
            while ((count = in.read(chunk)) != -1) {
                buffer.write(chunk, 0, count);
            }
            return buffer.toByteArray();
        } finally {
            in.close();
        }
    }
}
Loading…
Cancel
Save