From 3671c37989c0a4e085fe17a43e1d544e946e3aef Mon Sep 17 00:00:00 2001 From: orbiter Date: Wed, 30 Sep 2009 22:11:00 +0000 Subject: [PATCH] added experimental oai-pmh reader and integrated it with the existing dublin core parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6366 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/content/DCEntry.java | 62 +++++++++--- .../anomic/content/file/SurrogateReader.java | 20 ++-- source/de/anomic/content/oai/PMHReader.java | 88 +++++++++++++++++ .../anomic/crawler/retrieval/HTTPLoader.java | 97 +++++++++++++++++++ .../crawler/retrieval/LoaderDispatcher.java | 7 ++ .../de/anomic/crawler/retrieval/Request.java | 8 ++ .../de/anomic/crawler/retrieval/Response.java | 2 +- source/de/anomic/search/Switchboard.java | 6 +- 8 files changed, 263 insertions(+), 27 deletions(-) create mode 100644 source/de/anomic/content/oai/PMHReader.java diff --git a/source/de/anomic/content/DCEntry.java b/source/de/anomic/content/DCEntry.java index 9439269da..0909aebc0 100644 --- a/source/de/anomic/content/DCEntry.java +++ b/source/de/anomic/content/DCEntry.java @@ -90,8 +90,7 @@ public class DCEntry extends TreeMap { dc_coverage dc_rights */ - - public Date date() { + public Date getDate() { String d = this.get("docdatetime"); if (d == null) d = this.get("dc:date"); if (d == null) return null; @@ -103,7 +102,7 @@ public class DCEntry extends TreeMap { } } - public yacyURL url() { + public yacyURL getIdentifier() { String u = this.get("url"); if (u == null) u = this.get("dc:identifier"); if (u == null) return null; @@ -115,14 +114,38 @@ public class DCEntry extends TreeMap { } } - public String language() { + public String getLanguage() { String l = this.get("language"); if (l == null) l = this.get("dc:language"); - if (l == null) return url().language(); + if (l == null) return getIdentifier().language(); return l; } - public String title() { + public String getType() { + String t = this.get("dc:type"); + if (t == null) return 
""; + return t; + } + + public String getFormat() { + String t = this.get("dc:format"); + if (t == null) return ""; + return t; + } + + public String getSource() { + String t = this.get("dc:source"); + if (t == null) return ""; + return t; + } + + public String getRights() { + String t = this.get("dc:rights"); + if (t == null) return ""; + return t; + } + + public String getTitle() { String t = this.get("title"); if (t == null) t = this.get("dc:title"); t = stripCDATA(t); @@ -130,7 +153,14 @@ public class DCEntry extends TreeMap { return t; } - public String author() { + public String getPublisher() { + String t = this.get("dc:publisher"); + t = stripCDATA(t); + if (t == null) return ""; + return t; + } + + public String getCreator() { String t = this.get("author"); if (t == null) t = this.get("dc:creator"); t = stripCDATA(t); @@ -138,7 +168,7 @@ public class DCEntry extends TreeMap { return t; } - public String body() { + public String getDescription() { String t = this.get("body"); if (t == null) t = this.get("dc:description"); t = stripCDATA(t); @@ -146,7 +176,7 @@ public class DCEntry extends TreeMap { return t; } - public String[] categories() { + public String[] getSubject() { String t = this.get("categories"); if (t == null) this.get("dc:subject"); t = stripCDATA(t); @@ -164,20 +194,20 @@ public class DCEntry extends TreeMap { public Document document() { HashSet languages = new HashSet(); - languages.add(language()); + languages.add(getLanguage()); try { return new Document( - url(), + getIdentifier(), "text/html", "UTF-8", languages, - categories(), - title(), - author(), + getSubject(), + getTitle(), + getCreator(), null, "", - body().getBytes("UTF-8"), + getDescription().getBytes("UTF-8"), null, null); } catch (UnsupportedEncodingException e) { @@ -189,7 +219,7 @@ public class DCEntry extends TreeMap { public void writeXML(OutputStreamWriter os) throws IOException { Document doc = document(); if (doc != null) { - doc.writeXML(os, this.date()); + 
doc.writeXML(os, this.getDate()); } } } \ No newline at end of file diff --git a/source/de/anomic/content/file/SurrogateReader.java b/source/de/anomic/content/file/SurrogateReader.java index 9b9b7dd85..8bb0f15dd 100644 --- a/source/de/anomic/content/file/SurrogateReader.java +++ b/source/de/anomic/content/file/SurrogateReader.java @@ -136,7 +136,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable { } else if (tag.startsWith("dc:")) { final String value = buffer.toString().trim(); if (this.elementName != null) { - this.surrogate.put(this.elementName, value); + final String cleaned = value.replace(';', ','); /* ';' joins the values of a repeated element below, so it must not occur inside a single value; String is immutable, the result has to be kept */ + String oldcontent = this.surrogate.get(this.elementName); + if (oldcontent == null) { + this.surrogate.put(this.elementName, cleaned); + } else { + this.surrogate.put(this.elementName, oldcontent + ";" + cleaned); + } } this.buffer.setLength(0); this.parsingValue = false; @@ -169,12 +175,12 @@ public class SurrogateReader extends DefaultHandler implements Runnable { DCEntry s; System.out.println("1"); while ((s = sr.take()) != DCEntry.poison) { - System.out.println("Title: " + s.title()); - System.out.println("Date: " + s.date()); - System.out.println("URL: " + s.url()); - System.out.println("Language: " + s.language()); - System.out.println("Body: " + s.body()); - System.out.println("Categories: " + s.categories()); + System.out.println("Title: " + s.getTitle()); + System.out.println("Date: " + s.getDate()); + System.out.println("URL: " + s.getIdentifier()); + System.out.println("Language: " + s.getLanguage()); + System.out.println("Body: " + s.getDescription()); + System.out.println("Categories: " + s.getSubject()); } System.out.println("2"); } catch (IOException e) { diff --git a/source/de/anomic/content/oai/PMHReader.java b/source/de/anomic/content/oai/PMHReader.java new file mode 100644 index 000000000..84997c483 --- /dev/null +++ b/source/de/anomic/content/oai/PMHReader.java @@ -0,0 +1,88 @@ +// PMHReader +// (C) 2009 by Michael Peter Christen;
mc@yacy.net, Frankfurt a. M., Germany +// first published 30.09.2009 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $ +// $LastChangedRevision: 6340 $ +// $LastChangedBy: low012 $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.content.oai; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.net.MalformedURLException; + +import de.anomic.content.DCEntry; +import de.anomic.content.file.SurrogateReader; +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.HTTPLoader; +import de.anomic.crawler.retrieval.LoaderDispatcher; +import de.anomic.crawler.retrieval.Request; +import de.anomic.crawler.retrieval.Response; +import de.anomic.yacy.yacyURL; + +public class PMHReader { + + LoaderDispatcher loader; + + public PMHReader(LoaderDispatcher loader) { + this.loader = loader; + } + + public void load(yacyURL source) throws IOException { + Response response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE); + load(response); + } + + public static void load0(yacyURL source) throws IOException { + Response response = HTTPLoader.load(new Request(source, null)); + load(response); + } + + 
private static void load(Response response) throws IOException { + byte[] b = response.getContent(); + SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100); + Thread srt = new Thread(sr); + srt.start(); + DCEntry dce; + while ((dce = sr.take()) != DCEntry.poison) { + System.out.println(dce.toString()); + } + try { + srt.join(); + } catch (InterruptedException e) { Thread.currentThread().interrupt(); /* restore the interrupt flag so the caller can still observe the interruption */ } + } + + public static void main(String[] args) { + // get one server with + // http://roar.eprints.org/index.php?action=csv + // list records from oai-pmh like + // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc + try { + load0(new yacyURL(args[0], null)); + } catch (MalformedURLException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + } + +} diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java index 090504bc2..d46d36824 100644 --- a/source/de/anomic/crawler/retrieval/HTTPLoader.java +++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java @@ -220,4 +220,101 @@ public final class HTTPLoader { return response; } + public static Response load(final Request request) throws IOException { + return load(request, 3); + } + + private static Response load(final Request request, int retryCount) throws IOException { + + if (retryCount < 0) { + throw new IOException("Redirection counter exceeded for URL " + request.url().toString() + ". Processing aborted."); + } + + final String host = request.url().getHost(); + if (host == null || host.length() < 2) throw new IOException("host is not well-formed: '" + host + "'"); + final String path = request.url().getFile(); + int port = request.url().getPort(); + final boolean ssl = request.url().getProtocol().equals("https"); + if (port < 0) port = (ssl) ?
443 : 80; + + // check if url is in blacklist + final String hostlow = host.toLowerCase(); + if (Switchboard.urlBlacklist != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, hostlow, path)) { + throw new IOException("CRAWLER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist."); + } + + // take a file from the net + Response response = null; + + // create a request header + final RequestHeader requestHeader = new RequestHeader(); + requestHeader.put(HeaderFramework.USER_AGENT, crawlerUserAgent); + requestHeader.put(HeaderFramework.ACCEPT_LANGUAGE, DEFAULT_LANGUAGE); + requestHeader.put(HeaderFramework.ACCEPT_CHARSET, DEFAULT_CHARSET); + requestHeader.put(HeaderFramework.ACCEPT_ENCODING, DEFAULT_ENCODING); + + // HTTP-Client + final Client client = new Client(20000, requestHeader); + + ResponseContainer res = null; + try { + // send request + res = client.GET(request.url().toString(), Long.MAX_VALUE); + // FIXME: 30*-handling (bottom) is never reached + // we always get the final content because httpClient.followRedirects = true + + if (res.getStatusCode() == 200 || res.getStatusCode() == 203) { + // the transfer is ok + + // we write the new cache entry to file system directly + res.setAccountingName("CRAWLER"); + final byte[] responseBody = res.getData(); + + // create a new cache entry + response = new Response( + request, + requestHeader, + res.getResponseHeader(), + res.getStatusLine(), + null, + responseBody + ); + + return response; + } else if (res.getStatusLine().startsWith("30")) { + if (res.getResponseHeader().containsKey(HeaderFramework.LOCATION)) { + // getting redirection URL + String redirectionUrlString = res.getResponseHeader().get(HeaderFramework.LOCATION); + redirectionUrlString = redirectionUrlString.trim(); + + if (redirectionUrlString.length() == 0) { + throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " aborted. 
Location header is empty."); + } + + // normalizing URL + final yacyURL redirectionUrl = yacyURL.newURL(request.url(), redirectionUrlString); + + + // if we are already doing a shutdown we don't need to retry crawling + if (Thread.currentThread().isInterrupted()) { + throw new IOException("CRAWLER Retry of URL=" + request.url().toString() + " aborted because of server shutdown."); + } + + // retry crawling with new url + request.redirectURL(redirectionUrl); + return load(request, retryCount - 1); + } + } else { + // if the response has not the right response type then reject file + throw new IOException("REJECTED WRONG STATUS TYPE '" + res.getStatusLine() + "' for URL " + request.url().toString()); + } + } finally { + if(res != null) { + // release connection + res.closeStream(); + } + } + return response; + } + } diff --git a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java index fa1f7d1da..02feb6c2d 100644 --- a/source/de/anomic/crawler/retrieval/LoaderDispatcher.java +++ b/source/de/anomic/crawler/retrieval/LoaderDispatcher.java @@ -96,6 +96,13 @@ public final class LoaderDispatcher { return load(request(url, forText, global), forText, cacheStratgy); } + /** + * generate a request object + * @param url the target url + * @param forText shows that this was a for-text crawling request + * @param global shows that this was a global crawling request + * @return the request object + */ public Request request( final yacyURL url, final boolean forText, diff --git a/source/de/anomic/crawler/retrieval/Request.java b/source/de/anomic/crawler/retrieval/Request.java index 495dbdf12..37cbb0883 100755 --- a/source/de/anomic/crawler/retrieval/Request.java +++ b/source/de/anomic/crawler/retrieval/Request.java @@ -78,6 +78,14 @@ public class Request extends serverProcessorJob { private String statusMessage; private int initialHash; // to provide a object hash that does not change even if the url changes because 
of redirection + /** + * convenience constructor: delegates to the 'full' constructor, passing null or 0 for every argument except url and referrerhash + * @param url forwarded unchanged to the 'full' constructor + * @param referrerhash forwarded unchanged to the 'full' constructor + */ + public Request(final yacyURL url, final String referrerhash) { + this(null, url, referrerhash, null, null, null, null, 0, 0, 0); + } /** * A Request Entry is a object that is created to provide diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java index 768e176a4..3e251217d 100755 --- a/source/de/anomic/crawler/retrieval/Response.java +++ b/source/de/anomic/crawler/retrieval/Response.java @@ -274,7 +274,7 @@ public class Response { // -CGI access in request // CGI access makes the page very individual, and therefore not usable // in caches - if (this.url().isPOST() && !this.profile.crawlingQ()) { + if (this.url().isPOST() && this.profile != null && !this.profile.crawlingQ()) { return "dynamic_post"; } diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index 362bf1f22..5b2fd5cd2 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1206,9 +1206,9 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi // check if url is in accepted domain assert surrogate != null; assert crawlStacker != null; - final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.url()); + final String urlRejectReason = crawlStacker.urlInAcceptedDomain(surrogate.getIdentifier()); if (urlRejectReason != null) { - if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.url() + "': " + urlRejectReason); + if (this.log.isFine()) this.log.logInfo("Rejected URL '" + surrogate.getIdentifier() + "': " + urlRejectReason); continue; } @@ -1216,7 +1216,7 @@ public final class Switchboard extends serverAbstractSwitch implements serverSwi Document document = surrogate.document(); Request request = new Request( peers.mySeed().hash, - surrogate.url(), + surrogate.getIdentifier(), null,
"", new Date(),