alpha version of surrogate reading and indexing.

see the example file for an explanation. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5815 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 9050a3c4c5
parent 870066ab35
commit 9050a3c4c5
3 changed files with 94 additions and 5 deletions
--- a/examples/surrogate_dublin_core.xml
+++ b/examples/surrogate_dublin_core.xml
@ -0,0 +1,28 @@
 <?xml version="1.0" encoding="utf-8"?>
 <!-- YaCy surrogate file using dublin core notion -->
 <!-- 
  This is a surrogate file which is an intermediate document description
  file for index generation. Once you have YaCy started, you can copy a file
  like this (or this very file) into DATA/SURROGATE/in and then the indexing
  process will read the file, store the content into the search index and move
  the file into DATA/SURROGATE/out.
  Using surrogate files and the surrogate file format you can easily create your
  own data harvesting sources for the YaCy indexer. Just write a file generator
  that generates files like this. The xml schema is very similar to that
  described in
    http://dublincore.org/documents/dc-xml-guidelines/
  using the Dublin Core metadata element set.
 -->
 <surrogates
  xmlns:dc="http://purl.org/dc/elements/1.1/">
  <record>
    <dc:title><![CDATA[Alan Smithee]]></dc:title>
    <dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
    <dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
    <dc:language>de</dc:language>
    <dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
  </record>
 </surrogates>
--- a/source/de/anomic/crawler/Surrogate.java
+++ b/source/de/anomic/crawler/Surrogate.java
@ -41,9 +41,31 @@ public class Surrogate extends HashMap<String, String> {
    public Surrogate() {
        super();
    }
    /*
    DC according to rfc 5013
    * dc_title
    * dc_creator
    * dc_subject
    * dc_description
    * dc_publisher
    dc_contributor
    dc_date
    dc_type
    * dc_format
    * dc_identifier
    * dc_source
    dc_language
    dc_relation
    dc_coverage
    dc_rights
         */
    public Date date() {
-        String d = this.get("date");
+        String d = this.get("dateISO8601");
        if (d == null) d = this.get("docdatetime");
        if (d == null) d = this.get("dc:date");
        if (d == null) return null;
        try {
            return DateFormatter.parseISO8601(d);
@ -54,6 +76,7 @@ public class Surrogate extends HashMap<String, String> {
    }
    public yacyURL url() {
        String u = this.get("url");
        if (u == null) u = this.get("dc:identifier");
        if (u == null) return null;
        try {
            return new yacyURL(u, null);
@ -64,19 +87,28 @@ public class Surrogate extends HashMap<String, String> {
    }
    /**
     * Returns the language stored for this surrogate.
     * The "language" key is consulted first, then the Dublin Core key "dc:language".
     *
     * @return the stored language value, or "en" if neither key is present
     */
    public String language() {
        String lang = this.get("language");
        if (lang == null) {
            lang = this.get("dc:language");
        }
        return (lang != null) ? lang : "en";
    }
    /**
     * Returns the title of this surrogate.
     * The "title" key is consulted first, then the Dublin Core key "dc:title";
     * the value found is passed through stripCDATA before it is returned.
     *
     * @return the title, or an empty string if stripCDATA returns null
     */
    public String title() {
        String t = this.get("title");
-        return stripCDATA(t);
+        if (t == null) t = this.get("dc:title");
        t = stripCDATA(t);
        // never hand null to the caller
        if (t == null) return "";
        return t;
    }
    /**
     * Returns the body text of this surrogate.
     * The "body" key is consulted first, then the Dublin Core key "dc:description";
     * the value found is passed through stripCDATA before it is returned.
     *
     * @return the body text, or an empty string if stripCDATA returns null
     */
    public String body() {
        String t = this.get("body");
-        return stripCDATA(t);
+        if (t == null) t = this.get("dc:description"); // assign the fallback; the lookup result was previously discarded
        t = stripCDATA(t);
        // never hand null to the caller
        if (t == null) return "";
        return t;
    }
    /**
     * Returns the categories of this surrogate.
     * The "categories" key is consulted first, then the Dublin Core key "dc:subject";
     * the value found is passed through stripCDATA and split at every ';'.
     *
     * @return the category strings, or an empty array if stripCDATA returns null
     */
    public String[] categories() {
        String t = this.get("categories");
        // assign the fallback; the lookup result was previously discarded
        if (t == null) t = this.get("dc:subject");
        t = stripCDATA(t);
        if (t == null) return new String[]{};
        return t.split(";");
    }
    private String stripCDATA(String s) {
--- a/source/de/anomic/xml/SurrogateReader.java
+++ b/source/de/anomic/xml/SurrogateReader.java
@ -89,19 +89,23 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
    }
    /**
     * Handles the start of an XML element while reading a surrogate file.
     * A "record" or "document" element starts a new Surrogate; "element" stores the
     * value of its "name" attribute as the current element name; "value" clears the
     * text buffer and switches value parsing on; any tag starting with "dc:" becomes
     * the current element name itself and switches value parsing on.
     *
     * @param uri  not used by this method
     * @param name not used by this method
     * @param tag  the element name that is matched against the names above
     * @param atts the attributes of the element; only "name" is read
     * @throws SAXException declared by the signature; not thrown by the code in this method
     */
    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
-        if ("document".equals(tag)) {
+        if ("record".equals(tag) || "document".equals(tag)) {
            // a new record begins: collect its properties in a fresh surrogate
            this.surrogate = new Surrogate();
        } else if ("element".equals(tag)) {
            this.elementName = atts.getValue("name");
        } else if ("value".equals(tag)) {
            this.buffer.setLength(0);
            this.parsingValue = true;
        } else if (tag.startsWith("dc:")) {
            // NOTE(review): unlike endElement, tag is not null-checked before startsWith - confirm the parser never passes null
            // NOTE(review): the buffer is not cleared here as it is for "value"; this presumably relies on
            // endElement having emptied it after the previous element - confirm
            // parse dublin core attribute
            this.elementName = tag;
            this.parsingValue = true;
        }
    }
    public void endElement(final String uri, final String name, final String tag) {
        if (tag == null) return;
-        if ("document".equals(tag)) {
+        if ("record".equals(tag) || "document".equals(tag)) {
            //System.out.println("A Title: " + this.surrogate.title());
            try {
                this.surrogates.put(this.surrogate);
@ -124,6 +128,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
            }
            this.buffer.setLength(0);
            this.parsingValue = false;
        } else if (tag.startsWith("dc:")) {
            final String value = buffer.toString().trim();
            if (this.elementName != null) {
                this.surrogate.put(this.elementName, value);
            }
            this.buffer.setLength(0);
            this.parsingValue = false;
        }
    }
@ -177,6 +188,24 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
 }
    /*
 Example surrogate
 <?xml version="1.0" encoding="utf-8"?>
 <!-- YaCy surrogate file using dublin core notion -->
 <!-- see http://dublincore.org/documents/dc-xml-guidelines/ -->
 <surrogates
  xmlns:dc="http://purl.org/dc/elements/1.1/">
  <record>
    <dc:title><![CDATA[Alan Smithee]]></dc:title>
    <dc:identifier>http://de.wikipedia.org/wiki/Alan_Smithee</dc:identifier>
    <dc:description><![CDATA[Der als Filmregisseur oft genannte '''Alan Smithee''' ist ein Anagramm von „The Alias Men“.]]></dc:description>
    <dc:language>de</dc:language>
    <dc:date>2009-03-02T11:12:36Z</dc:date> <!-- date is in ISO 8601 -->
  </record>
 </surrogates>
 or
 <?xml version="1.0" encoding="utf-8"?>
 <documents>