diff --git a/examples/surrogate_dublin_core.xml b/examples/surrogate_dublin_core.xml
new file mode 100644
index 000000000..d71058f72
--- /dev/null
+++ b/examples/surrogate_dublin_core.xml
@@ -0,0 +1,28 @@
+
+
+
+
+
+
+
+
+ http://de.wikipedia.org/wiki/Alan_Smithee
+
+ de
+ 2009-03-02T11:12:36Z
+
+
+
diff --git a/source/de/anomic/crawler/Surrogate.java b/source/de/anomic/crawler/Surrogate.java
index 45819928f..35ed67112 100644
--- a/source/de/anomic/crawler/Surrogate.java
+++ b/source/de/anomic/crawler/Surrogate.java
@@ -41,9 +41,31 @@ public class Surrogate extends HashMap {
public Surrogate() {
super();
}
+
+ /*
+ DC according to rfc 5013
+
+ * dc_title
+ * dc_creator
+ * dc_subject
+ * dc_description
+ * dc_publisher
+ dc_contributor
+ dc_date
+ dc_type
+ * dc_format
+ * dc_identifier
+ * dc_source
+ dc_language
+ dc_relation
+ dc_coverage
+ dc_rights
+ */
+
public Date date() {
- String d = this.get("date");
+ String d = this.get("dateISO8601");
if (d == null) d = this.get("docdatetime");
+ if (d == null) d = this.get("dc:date");
if (d == null) return null;
try {
return DateFormatter.parseISO8601(d);
@@ -54,6 +76,7 @@ public class Surrogate extends HashMap {
}
public yacyURL url() {
String u = this.get("url");
+ if (u == null) u = this.get("dc:identifier");
if (u == null) return null;
try {
return new yacyURL(u, null);
@@ -64,19 +87,28 @@ public class Surrogate extends HashMap {
}
public String language() {
String l = this.get("language");
+ if (l == null) l = this.get("dc:language"); // no "language" entry: try the Dublin Core key
if (l == null) return "en"; else return l;
}
public String title() {
String t = this.get("title");
- return stripCDATA(t);
+ if (t == null) t = this.get("dc:title"); // no "title" entry: try the Dublin Core key
+ t = stripCDATA(t);
+ if (t == null) return ""; // callers receive "" instead of null
+ return t;
}
public String body() {
String t = this.get("body");
- return stripCDATA(t);
+ if (t == null) t = this.get("dc:description"); // fall back to the Dublin Core key; the lookup result must be assigned to take effect
+ t = stripCDATA(t);
+ if (t == null) return ""; // callers receive "" instead of null
+ return t;
}
public String[] categories() {
String t = this.get("categories");
+ if (t == null) t = this.get("dc:subject"); // fall back to the Dublin Core key; the lookup result must be assigned to take effect
t = stripCDATA(t);
+ if (t == null) return new String[]{}; // nothing to split: return an empty array instead of failing on null
return t.split(";");
}
private String stripCDATA(String s) {
diff --git a/source/de/anomic/xml/SurrogateReader.java b/source/de/anomic/xml/SurrogateReader.java
index 09d86c48c..833feae05 100644
--- a/source/de/anomic/xml/SurrogateReader.java
+++ b/source/de/anomic/xml/SurrogateReader.java
@@ -89,19 +89,23 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
}
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
- if ("document".equals(tag)) {
+ if ("record".equals(tag) || "document".equals(tag)) { // either wrapper tag starts a fresh surrogate
this.surrogate = new Surrogate();
} else if ("element".equals(tag)) {
this.elementName = atts.getValue("name");
} else if ("value".equals(tag)) {
this.buffer.setLength(0);
this.parsingValue = true;
+ } else if (tag.startsWith("dc:")) { // any tag whose qualified name begins with the dc: prefix
+ // Dublin Core element: the tag itself is used as the element name
+ this.elementName = tag;
+ this.parsingValue = true; // NOTE(review): unlike the "value" branch, the buffer is not cleared here; confirm it is always empty at this point
}
}
public void endElement(final String uri, final String name, final String tag) {
if (tag == null) return;
- if ("document".equals(tag)) {
+ if ("record".equals(tag) || "document".equals(tag)) {
//System.out.println("A Title: " + this.surrogate.title());
try {
this.surrogates.put(this.surrogate);
@@ -124,6 +128,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
}
this.buffer.setLength(0);
this.parsingValue = false;
+ } else if (tag.startsWith("dc:")) {
+ final String value = buffer.toString().trim();
+ if (this.elementName != null) {
+ this.surrogate.put(this.elementName, value);
+ }
+ this.buffer.setLength(0);
+ this.parsingValue = false;
}
}
@@ -177,6 +188,24 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
}
/*
Example surrogate
+
+
+
+
+
+
+
+ http://de.wikipedia.org/wiki/Alan_Smithee
+
+ de
+ 2009-03-02T11:12:36Z
+
+
+
+
+
+or