From fbcbcc5bdb2cd7ea37d5079adea1b3deea7839f7 Mon Sep 17 00:00:00 2001 From: orbiter Date: Fri, 17 Apr 2009 14:20:12 +0000 Subject: [PATCH] export of yacy document objects as dublin core record in xml git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5826 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- .../de/anomic/kelondro/order/Base64Order.java | 6 ++--- .../anomic/plasma/plasmaParserDocument.java | 22 +++++++++++++++++-- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- source/de/anomic/tools/mediawikiIndex.java | 5 +++++ 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/source/de/anomic/kelondro/order/Base64Order.java b/source/de/anomic/kelondro/order/Base64Order.java index 0c7fdf7b9..c1c13badc 100644 --- a/source/de/anomic/kelondro/order/Base64Order.java +++ b/source/de/anomic/kelondro/order/Base64Order.java @@ -238,16 +238,16 @@ public class Base64Order extends AbstractOrder implements ByteOrder, Cod // now there may be remaining bytes if (in.length % 3 != 0 && writepos < sublen) { if (in.length % 3 == 2) { - System.arraycopy(encodeLong((((0XffL & in[pos]) << 8) + (0XffL & in[pos + 1])) << 8, 4), 0, out, writepos, 3); + System.arraycopy(encodeLong((((0XffL & in[pos]) << 8) + (0XffL & in[pos + 1])) << 8, 4).getBytes(), 0, out, writepos, 3); writepos += 3; } else { - System.arraycopy(encodeLong((((0XffL & in[pos])) << 8) << 8, 4).substring(0, 2), 0, out, writepos, 2); + System.arraycopy(encodeLong((((0XffL & in[pos])) << 8) << 8, 4).substring(0, 2).getBytes(), 0, out, writepos, 2); writepos += 2; } } if (rfc1113compliant) while (writepos % 4 > 0 && writepos < sublen) out[writepos] = '='; - //assert encode(in).substring(0, sublen).equals(new String(out)); + assert encode(in).substring(0, sublen).equals(new String(out)); return out; } diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index 7994859d9..5f76d01d0 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -29,6 +29,7 @@ import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStreamWriter; import java.util.Arrays; import java.util.Date; import java.util.HashMap; @@ -41,6 +42,7 @@ import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; +import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.FileUtils; import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.Condenser; @@ -137,7 +139,7 @@ public class plasmaParserDocument { * If there is no metadata at all, null is returned * @return a string with a language name using the alpha-2 code of ISO 639 */ - public String languageByMetadata() { + public String dc_language() { if (this.languages == null) return null; if (this.languages.size() == 0) return null; if (this.languages.size() == 1) return languages.iterator().next(); @@ -210,7 +212,7 @@ dc_rights } public String dc_identifier() { - return "yacy.net:" + this.source.hash(); + return this.source.toNormalform(true, false); } public yacyURL dc_source() { @@ -456,6 +458,22 @@ dc_rights return (this.outboundLinks < 0) ? 0 : this.outboundLinks; } + public void writeXML(OutputStreamWriter os, Date date) throws IOException { + os.write("\n"); + os.write("\n"); + os.write("" + this.dc_identifier() + "\n"); + os.write(" 0) os.write(new String(buffer, 0, c)); + is.close(); + os.write("]]>\n"); + os.write("" + this.dc_language() + "\n"); + os.write("" + DateFormatter.formatISO8601(date) + "\n"); + os.write("\n"); + } + public void close() { // try close the output stream if (this.textStream != null) { diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 88fa347b8..3e838679a 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -497,7 +497,7 @@ public final class plasmaWordIndex { // do a identification of the language String language = condenser.language(); // this is a statistical analysation of the content: will be compared with other attributes - String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration + String bymetadata = document.dc_language(); // the languageByMetadata may return null if there was no declaration if (language == null) { // no statistics available, we take either the metadata (if given) or the TLD language = (bymetadata == null) ? entry.url().language() : bymetadata; diff --git a/source/de/anomic/tools/mediawikiIndex.java b/source/de/anomic/tools/mediawikiIndex.java index ee88e831c..eda977faa 100644 --- a/source/de/anomic/tools/mediawikiIndex.java +++ b/source/de/anomic/tools/mediawikiIndex.java @@ -37,10 +37,12 @@ import java.io.FileNotFoundException; import java.io.FileWriter; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStreamWriter; import java.io.PrintWriter; import java.io.RandomAccessFile; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; +import java.util.Date; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Callable; @@ -309,6 +311,9 @@ public class mediawikiIndex { public void genDocument() throws InterruptedException, ParserException { document = hparser.parseSource(url, "text/html", "utf-8", html.getBytes()); } + public void writeXML(OutputStreamWriter os) throws IOException { + document.writeXML(os, new Date()); + } } private static class PositionAwareReader {