Added an experimental OAI-PMH reader and integrated it with the existing Dublin Core parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6366 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 0c17b600c6
commit 3671c37989

@ -90,8 +90,7 @@ public class DCEntry extends TreeMap<String, String> {
dc_coverage
dc_rights
*/
public Date date() {
public Date getDate() {
String d = this.get("docdatetime");
if (d == null) d = this.get("dc:date");
if (d == null) return null;
@ -103,7 +102,7 @@ public class DCEntry extends TreeMap<String, String> {
}
}
public yacyURL url() {
public yacyURL getIdentifier() {
String u = this.get("url");
if (u == null) u = this.get("dc:identifier");
if (u == null) return null;
@ -115,14 +114,38 @@ public class DCEntry extends TreeMap<String, String> {
}
}
public String language() {
public String getLanguage() {
String l = this.get("language");
if (l == null) l = this.get("dc:language");
if (l == null) return url().language();
if (l == null) return getIdentifier().language();
return l;
}
public String title() {
/**
 * Looks up the value stored under the <code>dc:type</code> key of this entry.
 * @return the stored value, or an empty string if no value is stored
 */
public String getType() {
    final String type = this.get("dc:type");
    return (type == null) ? "" : type;
}
/**
 * Looks up the value stored under the <code>dc:format</code> key of this entry.
 * @return the stored value, or an empty string if no value is stored
 */
public String getFormat() {
    final String format = this.get("dc:format");
    return (format == null) ? "" : format;
}
/**
 * Looks up the value stored under the <code>dc:source</code> key of this entry.
 * @return the stored value, or an empty string if no value is stored
 */
public String getSource() {
    final String source = this.get("dc:source");
    return (source == null) ? "" : source;
}
/**
 * Looks up the value stored under the <code>dc:rights</code> key of this entry.
 * @return the stored value, or an empty string if no value is stored
 */
public String getRights() {
    final String rights = this.get("dc:rights");
    return (rights == null) ? "" : rights;
}
public String getTitle() {
String t = this.get("title");
if (t == null) t = this.get("dc:title");
t = stripCDATA(t);
@ -130,7 +153,14 @@ public class DCEntry extends TreeMap<String, String> {
return t;
}
public String author() {
/**
 * Looks up the value stored under the <code>dc:publisher</code> key and passes it
 * through <code>stripCDATA</code>.
 * @return the processed value, or an empty string if <code>stripCDATA</code> yields null
 */
public String getPublisher() {
    final String publisher = stripCDATA(this.get("dc:publisher"));
    return (publisher == null) ? "" : publisher;
}
public String getCreator() {
String t = this.get("author");
if (t == null) t = this.get("dc:creator");
t = stripCDATA(t);
@ -138,7 +168,7 @@ public class DCEntry extends TreeMap<String, String> {
return t;
}
public String body() {
public String getDescription() {
String t = this.get("body");
if (t == null) t = this.get("dc:description");
t = stripCDATA(t);
@ -146,7 +176,7 @@ public class DCEntry extends TreeMap<String, String> {
return t;
}
public String[] categories() {
public String[] getSubject() {
String t = this.get("categories");
if (t == null) this.get("dc:subject");
t = stripCDATA(t);
@ -164,20 +194,20 @@ public class DCEntry extends TreeMap<String, String> {
public Document document() {
HashSet<String> languages = new HashSet<String>();
languages.add(language());
languages.add(getLanguage());
try {
return new Document(
url(),
getIdentifier(),
"text/html",
"UTF-8",
languages,
categories(),
title(),
author(),
getSubject(),
getTitle(),
getCreator(),
null,
"",
body().getBytes("UTF-8"),
getDescription().getBytes("UTF-8"),
null,
null);
} catch (UnsupportedEncodingException e) {
@ -189,7 +219,7 @@ public class DCEntry extends TreeMap<String, String> {
public void writeXML(OutputStreamWriter os) throws IOException {
Document doc = document();
if (doc != null) {
doc.writeXML(os, this.date());
doc.writeXML(os, this.getDate());
}
}
}

@ -136,7 +136,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
} else if (tag.startsWith("dc:")) {
final String value = buffer.toString().trim();
if (this.elementName != null) {
this.surrogate.put(this.elementName, value);
value.replaceAll(";", ",");
String oldcontent = this.surrogate.get(this.elementName);
if (oldcontent == null) {
this.surrogate.put(this.elementName, value);
} else {
this.surrogate.put(this.elementName, oldcontent + ";" + value);
}
}
this.buffer.setLength(0);
this.parsingValue = false;
@ -169,12 +175,12 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
DCEntry s;
System.out.println("1");
while ((s = sr.take()) != DCEntry.poison) {
System.out.println("Title: " + s.title());
System.out.println("Date: " + s.date());
System.out.println("URL: " + s.url());
System.out.println("Language: " + s.language());
System.out.println("Body: " + s.body());
System.out.println("Categories: " + s.categories());
System.out.println("Title: " + s.getTitle());
System.out.println("Date: " + s.getDate());
System.out.println("URL: " + s.getIdentifier());
System.out.println("Language: " + s.getLanguage());
System.out.println("Body: " + s.getDescription());
System.out.println("Categories: " + s.getSubject());
}
System.out.println("2");
} catch (IOException e) {

@ -0,0 +1,88 @@
// PMHReader
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.09.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
// $LastChangedRevision: 6340 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.oai;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import de.anomic.content.DCEntry;
import de.anomic.content.file.SurrogateReader;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.LoaderDispatcher;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.yacy.yacyURL;
/**
 * Experimental reader for OAI-PMH record lists.
 * A document is fetched from a url, its bytes are handed to a {@link SurrogateReader}
 * and every {@link DCEntry} produced by that reader is printed to standard output.
 */
public class PMHReader {

    /** loader used by the instance method {@link #load(yacyURL)} */
    LoaderDispatcher loader;

    /**
     * @param loader the dispatcher used to fetch documents in {@link #load(yacyURL)}
     */
    public PMHReader(LoaderDispatcher loader) {
        this.loader = loader;
    }

    /**
     * Fetches the source through the configured {@link LoaderDispatcher}
     * (with the NOCACHE cache strategy) and prints all parsed entries.
     * @param source the url to load
     * @throws IOException if loading fails or no content was received
     */
    public void load(yacyURL source) throws IOException {
        Response response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
        load(response);
    }

    /**
     * Fetches the source directly through the static {@link HTTPLoader} method,
     * without a {@link LoaderDispatcher}, and prints all parsed entries.
     * @param source the url to load
     * @throws IOException if loading fails or no content was received
     */
    public static void load0(yacyURL source) throws IOException {
        Response response = HTTPLoader.load(new Request(source, null));
        load(response);
    }

    /**
     * Parses the content of a response with a {@link SurrogateReader} running in its
     * own thread and prints every entry until the poison entry is taken.
     * @param response the loaded response
     * @throws IOException if the response or its content is missing
     */
    private static void load(Response response) throws IOException {
        // fail with a descriptive error instead of a NullPointerException
        if (response == null) {
            throw new IOException("no response received");
        }
        final byte[] b = response.getContent();
        if (b == null) {
            throw new IOException("response has no content");
        }

        // NOTE(review): the second argument (100) is presumably a queue size - confirm
        final SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100);
        final Thread srt = new Thread(sr);
        srt.start();

        DCEntry dce;
        while ((dce = sr.take()) != DCEntry.poison) {
            System.out.println(dce.toString());
        }

        try {
            srt.join();
        } catch (InterruptedException e) {
            // restore the interrupt flag so that callers can see the interruption
            Thread.currentThread().interrupt();
        }
    }

    /**
     * Command line entry point; expects the url to load as first argument.
     * @param args args[0] is the url of an OAI-PMH request
     */
    public static void main(String[] args) {
        // get one server with
        // http://roar.eprints.org/index.php?action=csv
        // list records from oai-pmh like
        // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
        if (args == null || args.length < 1) {
            System.err.println("usage: PMHReader <oai-pmh-url>");
            return;
        }
        try {
            load0(new yacyURL(args[0], null));
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

@ -220,4 +220,101 @@ public final class HTTPLoader {
return response;
}
/**
 * Loads the given request without needing a loader instance.
 * Delegates to the two-argument <code>load</code> with an initial retry counter of 3.
 * @param request the request to load
 * @return the response produced by the delegate
 * @throws IOException if the delegate fails
 */
public static Response load(final Request request) throws IOException {
    final int initialRetryCount = 3;
    return load(request, initialRetryCount);
}
/**
 * Fetches the url of the given request with an HTTP GET and wraps the result in a Response.
 * A response with status 200 or 203 is returned as a Response object; a status line
 * starting with "30" causes the request to be redirected to the Location header and
 * loaded again with a decremented retry counter; every other status is rejected.
 * @param request the request to load; its url is replaced on redirection
 * @param retryCount number of redirections that may still be followed
 * @return the loaded response, never null
 * @throws IOException if the retry counter is exhausted, the host is malformed, the url
 *         is blacklisted, a redirect carries no usable Location header, the thread was
 *         interrupted before a retry, or the status is not accepted
 */
private static Response load(final Request request, int retryCount) throws IOException {
    if (retryCount < 0) {
        throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted.");
    }

    final String host = request.url().getHost();
    if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'");
    final String path = request.url().getFile();

    // check if url is in blacklist
    // lower-case with a fixed locale: the default locale may map 'I' to a non-ASCII character
    final String hostlow = host.toLowerCase(java.util.Locale.ENGLISH);
    if (Switchboard.urlBlacklist != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) {
        throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
    }

    // create a request header
    final RequestHeader requestHeader = new RequestHeader();
    requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent);
    requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, DEFAULT_LANGUAGE);
    requestHeader.put(HeaderFramework.ACCEPT_CHARSET, DEFAULT_CHARSET);
    requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING);

    // HTTP-Client
    final Client client = new Client(20000, requestHeader);

    ResponseContainer res = null;
    try {
        // send request
        res = client.GET(request.url().toString(), Long.MAX_VALUE);
        // FIXME: 30*-handling (bottom) is never reached
        // we always get the final content because httpClient.followRedirects = true

        if (res.getStatusCode() == 200 || res.getStatusCode() == 203) {
            // the transfer is ok
            res.setAccountingName("CRAWLER");
            final byte[] responseBody = res.getData();

            // wrap header, status line and body into a response object
            return new Response(
                    request,
                    requestHeader,
                    res.getResponseHeader(),
                    res.getStatusLine(),
                    null,
                    responseBody
            );
        } else if (res.getStatusLine().startsWith("30")) {
            // a redirect without a target cannot be followed; fail loudly instead of returning null
            if (!res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) {
                throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is missing.");
            }

            // getting redirection URL
            String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION);
            redirectionUrlString = redirectionUrlString.trim();
            if (redirectionUrlString.length() == 0) {
                throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. Location header is empty.");
            }

            // normalizing URL
            final yacyURL redirectionUrl = yacyURL.newURL(request.url(), redirectionUrlString);

            // if we are already doing a shutdown we don't need to retry crawling
            if (Thread.currentThread().isInterrupted()) {
                throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown.");
            }

            // retry crawling with new url
            request.redirectURL(redirectionUrl);
            return load(request, retryCount - 1);
        } else {
            // if the response has not the right response type then reject file
            throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString());
        }
    } finally {
        if (res != null) {
            // release connection
            res.closeStream();
        }
    }
}
}

@ -96,6 +96,13 @@ public final class LoaderDispatcher {
return load(request(url, forText, global), forText, cacheStratgy);
}
/**
* generate a request object
* @param url the target url
* @param forText shows that this was a for-text crawling request
* @param global shows that this was a global crawling request
* @return the request object
*/
public Request request(
final yacyURL url,
final boolean forText,

@ -78,6 +78,14 @@ public class Request extends serverProcessorJob {
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
/**
 * Convenience constructor that needs only a url and a referrer hash.
 * Delegates to the ten-argument constructor and passes <code>null</code> or
 * <code>0</code> for every argument other than <code>url</code> and
 * <code>referrerhash</code>.
 * @param url the url of the request; forwarded as second argument
 * @param referrerhash forwarded unchanged as third argument
 *        (presumably the hash of the referring url - verify against the full constructor)
 */
public Request(final yacyURL url, final String referrerhash) {
this(null, url, referrerhash, null, null, null, null, 0, 0, 0);
}
/**
* A Request Entry is a object that is created to provide

@ -274,7 +274,7 @@ public class Response {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable
// in caches
if (this.url().isPOST() && !this.profile.crawlingQ()) {
if (this.url().isPOST() && this.profile != null && !this.profile.crawlingQ()) {
return "dynamic_post";
}

@ -1206,9 +1206,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
// check if url is in accepted domain
assert surrogate != null;
assert crawlStacker != null;
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.url());
final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier());
if (urlRejectReason != null) {
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.url() + "': " + urlRejectReason);
if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason);
continue;
}
@ -1216,7 +1216,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi
Document document = surrogate.document();
Request request = new Request(
peers.mySeed().hash,
surrogate.url(),
surrogate.getIdentifier(),
null,
"",
new Date(),

Loading…
Cancel
Save