Export of YaCy document objects as Dublin Core records in XML

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5826 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent d7cbf4cdd4
commit fbcbcc5bdb

@ -238,16 +238,16 @@ public class Base64Order extends AbstractOrder<byte[]> implements ByteOrder, Cod
// now there may be remaining bytes
if (in.length % 3 != 0 && writepos < sublen) {
if (in.length % 3 == 2) {
System.arraycopy(encodeLong((((0XffL & in[pos]) << 8) + (0XffL & in[pos + 1])) << 8, 4), 0, out, writepos, 3);
System.arraycopy(encodeLong((((0XffL & in[pos]) << 8) + (0XffL & in[pos + 1])) << 8, 4).getBytes(), 0, out, writepos, 3);
writepos += 3;
} else {
System.arraycopy(encodeLong((((0XffL & in[pos])) << 8) << 8, 4).substring(0, 2), 0, out, writepos, 2);
System.arraycopy(encodeLong((((0XffL & in[pos])) << 8) << 8, 4).substring(0, 2).getBytes(), 0, out, writepos, 2);
writepos += 2;
}
}
if (rfc1113compliant) while (writepos % 4 > 0 && writepos < sublen) out[writepos] = '=';
//assert encode(in).substring(0, sublen).equals(new String(out));
assert encode(in).substring(0, sublen).equals(new String(out));
return out;
}

@ -29,6 +29,7 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
@ -41,6 +42,7 @@ import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.FileUtils;
import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.Condenser;
@ -137,7 +139,7 @@ public class plasmaParserDocument {
* If there is no metadata at all, null is returned
* @return a string with a language name using the alpha-2 code of ISO 639
*/
public String languageByMetadata() {
public String dc_language() {
if (this.languages == null) return null;
if (this.languages.size() == 0) return null;
if (this.languages.size() == 1) return languages.iterator().next();
@ -210,7 +212,7 @@ dc_rights
}
public String dc_identifier() {
return "yacy.net:" + this.source.hash();
return this.source.toNormalform(true, false);
}
public yacyURL dc_source() {
@ -456,6 +458,22 @@ dc_rights
return (this.outboundLinks < 0) ? 0 : this.outboundLinks;
}
/**
 * Writes this document to the given writer as a single XML
 * <code>&lt;record&gt;</code> element with Dublin Core style children
 * (title, identifier, description, language, date).
 *
 * The title and the document text are written as CDATA sections; the
 * identifier and language are written as escaped character data. The
 * language element is left out when {@link #dc_language()} returns null.
 *
 * @param os   the writer that receives the XML; it is neither flushed nor closed here
 * @param date the date written into the dc:Date element
 * @throws IOException if reading the document text or writing to <code>os</code> fails
 */
public void writeXML(OutputStreamWriter os, Date date) throws IOException {
    os.write("<record>\n");
    os.write("<dc:Title><![CDATA[" + cdataSafe(String.valueOf(this.dc_title())) + "]]></dc:Title>\n");
    // the identifier is a URL and may contain '&', which must not appear raw in XML
    os.write("<dc:Identifier>" + xmlEscape(String.valueOf(this.dc_identifier())) + "</dc:Identifier>\n");
    os.write("<dc:Description><![CDATA[");
    writeTextAsCDATA(os);
    os.write("]]></dc:Description>\n");
    final String language = this.dc_language();
    if (language != null) {
        // without this check the literal text "null" would be exported as language
        os.write("<dc:Language>" + xmlEscape(language) + "</dc:Language>\n");
    }
    os.write("<dc:Date>" + DateFormatter.formatISO8601(date) + "</dc:Date>\n");
    os.write("</record>\n");
}

/**
 * Copies the document text into an already opened CDATA section.
 * The text stream is closed in all cases, also when writing fails.
 */
private void writeTextAsCDATA(OutputStreamWriter os) throws IOException {
    final InputStream is = this.getText();
    if (is == null) return;
    // Decode through a Reader instead of converting raw byte chunks to strings:
    // a multi-byte character that straddles a chunk boundary would otherwise be corrupted.
    // NOTE(review): this still uses the platform default charset, as the byte-chunk
    // conversion did before - confirm which encoding getText() actually delivers.
    final java.io.Reader reader = new java.io.InputStreamReader(is);
    try {
        final char[] buffer = new char[1000];
        final StringBuilder pending = new StringBuilder();
        int c;
        while ((c = reader.read(buffer)) != -1) {
            pending.append(buffer, 0, c);
            // hold back up to two trailing ']' because they may be the start of a
            // "]]>" sequence that is completed by the next chunk
            int keep = 0;
            while (keep < 2 && keep < pending.length() && pending.charAt(pending.length() - 1 - keep) == ']') keep++;
            final int cut = pending.length() - keep;
            os.write(cdataSafe(pending.substring(0, cut)));
            pending.delete(0, cut);
        }
        // only held-back ']' characters can be left here; they cannot form "]]>" any more
        os.write(pending.toString());
    } finally {
        reader.close(); // also closes the underlying text stream
    }
}

/**
 * Makes a string safe for use inside a CDATA section by splitting every
 * "]]&gt;" over two adjacent CDATA sections.
 */
private static String cdataSafe(String s) {
    return s.replace("]]>", "]]]]><![CDATA[>");
}

/**
 * Escapes the characters that are not allowed in XML character data.
 */
private static String xmlEscape(String s) {
    // '&' must be replaced first, otherwise the entities produced below would be escaped again
    return s.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;");
}
public void close() {
// try close the output stream
if (this.textStream != null) {

@ -497,7 +497,7 @@ public final class plasmaWordIndex {
// do a identification of the language
String language = condenser.language(); // this is a statistical analysation of the content: will be compared with other attributes
String bymetadata = document.languageByMetadata(); // the languageByMetadata may return null if there was no declaration
String bymetadata = document.dc_language(); // the languageByMetadata may return null if there was no declaration
if (language == null) {
// no statistics available, we take either the metadata (if given) or the TLD
language = (bymetadata == null) ? entry.url().language() : bymetadata;

@ -37,10 +37,12 @@ import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.util.Date;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.Callable;
@ -309,6 +311,9 @@ public class mediawikiIndex {
/**
 * Parses the html content of this record with the html parser and stores
 * the result in <code>document</code>.
 *
 * The content is encoded as UTF-8 before it is handed to the parser, so
 * that the bytes match the "utf-8" charset that is announced in the same call.
 *
 * @throws InterruptedException declared by this method; presumably passed on from the parser
 * @throws ParserException declared by this method; presumably passed on from the parser
 */
public void genDocument() throws InterruptedException, ParserException {
    byte[] source;
    try {
        // the platform default charset may differ from UTF-8, which would garble
        // every non-ASCII character once the parser decodes the bytes as utf-8
        source = html.getBytes("UTF-8");
    } catch (UnsupportedEncodingException e) {
        // not expected, UTF-8 is a mandatory charset; keep the former behaviour as fallback
        source = html.getBytes();
    }
    document = hparser.parseSource(url, "text/html", "utf-8", source);
}
/**
 * Writes the document of this record as XML to the given writer,
 * using the current time as the record date.
 *
 * @param os the writer that receives the XML
 * @throws IOException if the document cannot be written
 */
public void writeXML(OutputStreamWriter os) throws IOException {
    final Date now = new Date();
    this.document.writeXML(os, now);
}
}
private static class PositionAwareReader {

Loading…
Cancel
Save