Very first working version of the OAI-PMH importer: when given a suitable URL, the importer reads and indexes ListRecords XML responses and computes the correct resumption URL, which is then offered as the default starting point in the importer's URL input field.

No automatic harvesting yet; this will be added later.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6443 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 58616d99e4
commit 350d13e153

@ -3,7 +3,7 @@
<head>
<title>YaCy '#[clientname]#': OAI-PMH Import</title>
#%env/templates/metas.template%#
#(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
#(import)#::<!--<meta http-equiv="REFRESH" content="10" />-->#(/import)#
</head>
<body id="IndexImportOAIPMH">
#%env/templates/header.template%#
@ -15,7 +15,7 @@
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>OAI-PMH Import: set a OAI-PMH URL</legend>
<input name="oaipmhurl" type="text" value="http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc" size="100" />
<input name="oaipmhurl" type="text" value="#[defaulturl]#" size="100" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
</fieldset>
</form>
@ -24,6 +24,7 @@
<dl>
<dt>Thread:</dt><dd>#[thread]#</dd>
<dt>Source:</dt><dd>#[source]#</dd>
<dt>ResumptionToken:</dt><dd>#[rt]#</dd>
<dt>Processed:</dt><dd>#[count]# Wiki Entries</dd>
<dt>Speed:</dt><dd>#[speed]# articles per second</dd>
<dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>

@ -22,9 +22,11 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.document.importer.OAIPMHImporter;
import net.yacy.document.importer.ResumptionToken;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.http.server.RequestHeader;
@ -38,6 +40,20 @@ public class IndexImportOAIPMH_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
prop.put("import_defaulturl", "");
if (OAIPMHImporter.job != null) {
// show result from finished import
try {
ResumptionToken rt = OAIPMHImporter.job.getResumptionToken();
if (rt != null) prop.put("import_defaulturl", rt.resumptionURL(new DigestURI(OAIPMHImporter.job.source(), null)).toNormalform(true, false));
} catch (MalformedURLException e) {
prop.put("import_defaulturl", e.getMessage());
} catch (IOException e) {
// reached end of resumption
prop.put("import_defaulturl", e.getMessage());
}
}
if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) {
// one import is running, no option to insert anything
prop.put("import", 1);
@ -49,35 +65,37 @@ public class IndexImportOAIPMH_p {
prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
} else {
prop.put("import", 0);
if (post == null) {
prop.put("import_status", 0);
} else {
if (post.containsKey("oaipmhurl")) {
String oaipmhurl = post.get("oaipmhurl");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
OAIPMHImporter.job.start();
prop.put("import", 0);
prop.put("import_thread", "started");
prop.put("import_source", OAIPMHImporter.job.source());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
prop.put("import_runningMinutes", 0);
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import", 0);
prop.put("import_status", 1);
prop.put("import_status_message", e.getMessage());
}
}
return prop;
return prop;
}
prop.put("import", 0);
if (post == null) {
prop.put("import_status", 0);
return prop;
}
if (post.containsKey("oaipmhurl")) {
String oaipmhurl = post.get("oaipmhurl");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
OAIPMHImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_source", OAIPMHImporter.job.source());
prop.put("import_rt", OAIPMHImporter.job.status());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
prop.put("import_runningMinutes", 0);
prop.put("import_remainingHours", 0);
prop.put("import_remainingMinutes", 0);
} catch (MalformedURLException e) {
e.printStackTrace();
prop.put("import", 0);
prop.put("import_status", 1);
prop.put("import_status_message", e.getMessage());
}
}
return prop;

@ -107,6 +107,11 @@ public class DCEntry extends TreeMap<String, String> {
String u = this.get("url");
if (u == null) u = this.get("dc:identifier");
if (u == null) return null;
String[] urls = u.split(";");
if (urls.length > 1) {
// select one that fits
u = bestU(urls);
}
try {
return new DigestURI(u, null);
} catch (MalformedURLException e) {
@ -115,6 +120,21 @@ public class DCEntry extends TreeMap<String, String> {
}
}
/**
 * Selects the most usable identifier from several candidates that were
 * separated by ';' within one identifier value.
 * <p>
 * Candidates are trimmed before they are tested, because values such as
 * "urn:nbn:de:...; http://..." carry blanks after the separator.
 * Preference order: http, https, ftp, urn (rewritten to a resolver url).
 * If no candidate has a recognized scheme, the first non-empty candidate
 * is returned.
 *
 * @param urls the candidate identifiers; the caller passes at least one element
 * @return the preferred candidate
 */
private String bestU(String[] urls) {
    // normalize once so that every scheme test below sees clean values
    final String[] candidates = new String[urls.length];
    for (int i = 0; i < urls.length; i++) {
        candidates[i] = (urls[i] == null) ? "" : urls[i].trim();
    }
    for (String uu : candidates) {
        if (uu.startsWith("http://")) return uu;
    }
    for (String uu : candidates) {
        if (uu.startsWith("https://")) return uu;
    }
    for (String uu : candidates) {
        if (uu.startsWith("ftp://")) return uu;
    }
    for (String uu : candidates) {
        // urn identifiers can be resolved using the resolver of the d-nb:
        // http://nbn-resolving.de/urn:nbn:de:bsz:960-opus-1860
        if (uu.startsWith("urn:")) return "http://nbn-resolving.de/" + uu;
    }
    // nothing recognizable: prefer any non-empty candidate over an empty one
    for (String uu : candidates) {
        if (uu.length() > 0) return uu;
    }
    return urls[0];
}
public String getLanguage() {
String l = this.get("language");
if (l == null) l = this.get("dc:language");

@ -26,7 +26,7 @@ public interface Importer extends Runnable {
*/
public long remainingTime();
public String status();
public boolean isAlive();

@ -112,6 +112,10 @@ public class MediawikiImporter extends Thread implements Importer {
return this.sourcefile.getAbsolutePath();
}
/**
 * Reports the current status text of this importer.
 * This importer has no status to report and always answers with an empty string.
 *
 * @return the empty string
 */
public String status() {
    final String noStatus = "";
    return noStatus;
}
/**
* return the number of articles per second
* @return

@ -1,4 +1,4 @@
// PMHReader
// OAIPMHImporter
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 30.09.2009 on http://yacy.net
//
@ -27,6 +27,7 @@
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
@ -40,21 +41,31 @@ import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Switchboard;
// get one server with
// http://roar.eprints.org/index.php?action=csv
// list records from oai-pmh like
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
public class OAIPMHImporter extends Thread implements Importer {
public static Importer job; // if started from a servlet, this object is used to store the thread
public static OAIPMHImporter job; // if started from a servlet, this object is used to store the thread
private LoaderDispatcher loader;
private DigestURI source;
private int count;
private long startTime;
private ResumptionToken resumptionToken;
/**
 * Creates an importer for one OAI-PMH source.
 * The constructor only initializes the state: the record counter starts
 * at zero, the start time is taken from the system clock and no
 * resumption token is known yet.
 *
 * @param loader the loader dispatcher that is kept for this importer
 * @param source the url of the OAI-PMH resource
 */
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
    // collaborators
    this.source = source;
    this.loader = loader;

    // progress state
    this.resumptionToken = null;
    this.count = 0;
    this.startTime = System.currentTimeMillis();
}
@ -62,6 +73,14 @@ public class OAIPMHImporter extends Thread implements Importer {
return this.count;
}
/**
 * Describes the state of the import by means of the most recently read
 * resumption token.
 *
 * @return the string form of the resumption token, or the empty string
 *         if no token has been read so far
 */
public String status() {
    if (this.resumptionToken != null) {
        return this.resumptionToken.toString();
    }
    return "";
}
/**
 * Gives access to the resumption token held by this importer.
 *
 * @return the resumption token, or null if none has been set
 */
public ResumptionToken getResumptionToken() {
    final ResumptionToken current = this.resumptionToken;
    return current;
}
/**
 * Estimates the remaining time of the import.
 *
 * @return always Long.MAX_VALUE, because the remaining time is not known
 */
public long remainingTime() {
    // there is no way to tell how much is left, so report the largest possible value
    final long unknown = Long.MAX_VALUE;
    return unknown;
}
@ -88,14 +107,21 @@ public class OAIPMHImporter extends Thread implements Importer {
}
}
public static void load0(DigestURI source) throws IOException {
/**
 * Fetches the given url with the HTTPLoader and hands the response over
 * to {@code load(Response)}.
 *
 * @param source the url to fetch
 * @throws IOException if loading or processing the response fails
 */
public void load0(DigestURI source) throws IOException {
    final Request request = new Request(source, null);
    load(HTTPLoader.load(request));
}
private static void load(Response response) throws IOException {
//FileUtils.copy(source, dest)
/**
 * Processes one response from the OAI-PMH source: reads the resumption
 * token from the response content and stores the content as a file in the
 * switchboard's surrogatesInPath. The file name is built from the host of
 * the source url and the current time in milliseconds.
 *
 * @param response the response whose content is processed
 * @throws IOException declared by the method; presumably raised by the token
 *         parsing or the file copy - confirm against ResumptionToken and FileUtils
 */
private void load(Response response) throws IOException {
byte[] b = response.getContent();
// remember the resumption token that is contained in the content
this.resumptionToken = new ResumptionToken(new ByteArrayInputStream(b));
String file = this.source.getHost() + "_" + System.currentTimeMillis();
// the content is written to a ".tmp" file first and renamed to ".xml" afterwards;
// presumably so that a consumer of the directory never sees a partly written xml file - TODO confirm
File f0 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".tmp");
File f1 = new File(Switchboard.getSwitchboard().surrogatesInPath, file + ".xml");
FileUtils.copy(b, f0);
// NOTE(review): the boolean result of renameTo is ignored; a failed rename goes unnoticed
f0.renameTo(f1);
/*
SurrogateReader sr = new SurrogateReader(new ByteArrayInputStream(b), 100);
Thread srt = new Thread(sr);
srt.start();
@ -106,9 +132,8 @@ public class OAIPMHImporter extends Thread implements Importer {
try {
srt.join();
} catch (InterruptedException e) {}
ResumptionTokenReader rtr = new ResumptionTokenReader(new ByteArrayInputStream(b));
ResumptionToken token = rtr.getToken();
System.out.println("TOKEN: " + token.toString());
*/
// debug output of the token that was just read
System.out.println("TOKEN: " + resumptionToken.toString());
}
@ -165,20 +190,6 @@ public class OAIPMHImporter extends Thread implements Importer {
}
return sbuf.toString();
}
/**
 * Command line entry point: loads one OAI-PMH response from the url that
 * is given as first argument.
 *
 * @param args args[0] is the url of the OAI-PMH resource
 */
public static void main(String[] args) {
    // get one server with
    // http://roar.eprints.org/index.php?action=csv
    // list records from oai-pmh like
    // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
    if (args == null || args.length == 0) {
        // without this guard a missing argument ends in an ArrayIndexOutOfBoundsException
        System.err.println("usage: OAIPMHImporter <oai-pmh-url>");
        return;
    }
    try {
        load0(new DigestURI(args[0], null));
    } catch (IOException e) {
        // MalformedURLException is an IOException, so one handler covers both cases
        e.printStackTrace();
    }
}
}
/*

@ -1,11 +1,47 @@
// ResumptionToken
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 31.10.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.importer;
import java.io.IOException;
import java.io.InputStream;
import java.text.Collator;
import java.text.ParseException;
import java.util.Date;
import java.util.Locale;
import java.util.TreeMap;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.DateFormatter;
public class ResumptionToken extends TreeMap<String, String> {
@ -19,32 +55,76 @@ public class ResumptionToken extends TreeMap<String, String> {
insensitiveCollator.setDecomposition(Collator.NO_DECOMPOSITION);
}
/**
 * Creates a resumption token from an xml stream.
 * The map is set up with a clone of the case-insensitive collator; the
 * entries are then filled in by the Reader while it parses the stream
 * during its own construction.
 *
 * @param stream the xml content to read the token from
 * @throws IOException if the Reader signals a failure
 */
public ResumptionToken(final InputStream stream) throws IOException {
    super((Collator) insensitiveCollator.clone());
    // the reader instance itself is not needed afterwards, only its side effect on this map
    this.new Reader(stream);
}
public ResumptionToken(
Date expirationDate,
int completeListSize,
int cursor,
int token
String token
) {
super((Collator) insensitiveCollator.clone());
this.put("expirationDate", DateFormatter.formatISO8601(expirationDate));
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", Integer.toString(token));
this.put("token", token);
}
public ResumptionToken(
String expirationDate,
int completeListSize,
int cursor,
int token
String token
) {
super((Collator) insensitiveCollator.clone());
this.put("expirationDate", expirationDate);
this.put("completeListSize", Integer.toString(completeListSize));
this.put("cursor", Integer.toString(cursor));
this.put("token", Integer.toString(token));
this.put("token", token);
}
/**
 * Computes a url that can be used to resume the retrieval from the OAI-PMH resource.
 * <p>
 * The query part of the given url is replaced. A token that contains
 * "from=" is treated as an encoded request state and appended as it is.
 * Otherwise, if an expiration date is known and not yet exceeded, the token
 * is sent as "resumptionToken" argument. Without an expiration date the
 * token is again appended as it is.
 *
 * @param givenURL the url that was used for the previous request
 * @return the url for the follow-up request
 * @throws IOException in case that no follow-up url can be generated; i.e. if
 *         the token is missing or empty, or if the expiration date is exceeded
 */
public DigestURI resumptionURL(DigestURI givenURL) throws IOException {
    // decide which kind of encoding strategy was used to get a resumptionToken:
    final String rawToken = this.getToken();
    // the token is element text and may be surrounded by whitespace or line breaks
    final String token = (rawToken == null) ? "" : rawToken.trim();
    if (token.length() == 0) throw new IOException("end of resumption reached");

    // keep everything up to and including the '?'; if the given url has no
    // query part at all, the separator must be added before arguments are appended
    String url = givenURL.toNormalform(true, true);
    final int i = url.indexOf('?');
    if (i > 0) {
        url = url.substring(0, i + 1);
    } else if (i < 0) {
        url = url + "?";
    }

    // encoded state
    if (token.indexOf("from=") >= 0) {
        return new DigestURI(url + "verb=ListRecords&" + token, null);
    }

    // cached result set
    // can be detected with given expiration date
    final Date expiration = getExpirationDate();
    if (expiration != null) {
        final Date now = new Date();
        if (expiration.before(now)) {
            throw new IOException("the resumption is expired at " + DateFormatter.formatISO8601(expiration) + " (now: " + DateFormatter.formatISO8601(now) + ")");
        }
        // the resumption token is still fresh
        return new DigestURI(url + "verb=ListRecords&resumptionToken=" + token, null);
    }

    // may still be an encoded state
    return new DigestURI(url + "verb=ListRecords&" + token, null);
}
/**
* an expiration date of a resumption token that addresses how long a cached set will
* stay in the cache of the oai-pmh server. See:
* http://www.openarchives.org/OAI/2.0/guidelines-repository.htm#CachedResultSet
* @return
*/
public Date getExpirationDate() {
String d = this.get("expirationDate");
if (d == null) return null;
@ -56,22 +136,46 @@ public class ResumptionToken extends TreeMap<String, String> {
}
}
/**
 * The completeListSize attribute provides a place where the estimated number of results
 * in the complete list response may be announced. This is likely to be used for
 * status monitoring by harvesting software and implementation is recommended especially in
 * repositories with large numbers of records. The value of completeListSize can be reliably
 * accurate only in the case of a system where the result set is cached.
 * In other cases, it is permissible for repositories to revise
 * the estimate during a list request sequence.
 * An attribute according to
 * http://www.openarchives.org/OAI/2.0/guidelines-repository.htm#completeListSize
 *
 * @return the announced size of the complete list, or 0 if the attribute
 *         is missing or is not a valid integer
 */
public int getCompleteListSize() {
    final String t = this.get("completeListSize");
    if (t == null) return 0;
    try {
        return Integer.parseInt(t.trim());
    } catch (NumberFormatException e) {
        // the value is supplied by a remote server; an unreadable value is
        // treated like a missing one instead of failing the caller
        return 0;
    }
}
/**
 * The cursor attribute is the number of results returned so far in the complete list response,
 * thus it is always "0" in the first incomplete list response.
 * It should only be specified if it is consistently used in all responses.
 * An attribute according to
 * http://www.openarchives.org/OAI/2.0/guidelines-repository.htm#completeListSize
 *
 * @return the cursor position, or 0 if the attribute is missing or is
 *         not a valid integer
 */
public int getCursor() {
    final String t = this.get("cursor");
    if (t == null) return 0;
    try {
        return Integer.parseInt(t.trim());
    } catch (NumberFormatException e) {
        // the value is supplied by a remote server; an unreadable value is
        // treated like a missing one instead of failing the caller
        return 0;
    }
}
public int getToken() {
String t = this.get("token");
if (t == null) return 0;
return Integer.parseInt(t);
/**
 * Returns the token value; it carries the state of a stateless transfer
 * in case that no expiration date is given.
 * See:
 * http://www.openarchives.org/OAI/2.0/guidelines-repository.htm#StateInResumptionToken
 *
 * @return the value stored under the key "token", or null if there is none
 */
public String getToken() {
    final String token = this.get("token");
    return token;
}
public String toString() {
@ -79,4 +183,72 @@ public class ResumptionToken extends TreeMap<String, String> {
", cursor=" + this.getCursor() + ", token=" + this.getToken();
}
// get a resumption token using a SAX xml parser from an input stream
/**
 * SAX handler that looks for the resumptionToken element in an xml stream
 * and stores its attributes and its text content in the enclosing map
 * under the keys "expirationDate", "completeListSize", "cursor" and "token".
 * The stream is parsed and closed within the constructor.
 */
private class Reader extends DefaultHandler {

    // class variables
    private final StringBuilder buffer;
    private boolean parsingValue;
    private SAXParser saxParser;
    private InputStream stream;
    private Attributes atts; // private copy of the attributes of the resumptionToken element

    /**
     * Parses the given stream and closes it afterwards.
     * Parse errors (SAXException, IOException) are only printed; in that
     * case the enclosing map is left without the token entries.
     *
     * @param stream the xml content
     * @throws IOException if no SAX parser can be configured
     */
    public Reader(final InputStream stream) throws IOException {
        this.buffer = new StringBuilder();
        this.parsingValue = false;
        this.stream = stream;
        this.atts = null;
        // NOTE(review): the content comes from a remote server; consider
        // restricting DTDs/external entities on the parser factory
        final SAXParserFactory factory = SAXParserFactory.newInstance();
        try {
            this.saxParser = factory.newSAXParser();
            this.saxParser.parse(this.stream, this);
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
            // keep the original exception as cause of the one that is thrown
            final IOException ioe = new IOException(e.getMessage());
            ioe.initCause(e);
            throw ioe;
        } finally {
            try {
                this.stream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /*
    <resumptionToken expirationDate="2009-10-31T22:52:14Z"
    completeListSize="226"
    cursor="0">688</resumptionToken>
    */

    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
        if ("resumptionToken".equals(tag)) {
            this.parsingValue = true;
            // the Attributes object handed in by the parser is only valid during
            // this call, therefore a copy is kept for use in endElement
            this.atts = new org.xml.sax.helpers.AttributesImpl(atts);
        }
    }

    public void endElement(final String uri, final String name, final String tag) {
        if (tag == null) return;
        if ("resumptionToken".equals(tag)) {
            put("expirationDate", atts.getValue("expirationDate"));
            put("completeListSize", atts.getValue("completeListSize"));
            put("cursor", atts.getValue("cursor"));
            put("token", buffer.toString());
            this.buffer.setLength(0);
            this.parsingValue = false;
        }
    }

    public void characters(final char ch[], final int start, final int length) {
        // text is only collected while inside the resumptionToken element
        if (parsingValue) {
            buffer.append(ch, start, length);
        }
    }

}
}

@ -1,90 +0,0 @@
package net.yacy.document.importer;
import java.io.IOException;
import java.io.InputStream;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * SAX handler that reads the resumptionToken element from an xml stream
 * and makes it available as a ResumptionToken object.
 * The stream is parsed and closed within the constructor.
 */
public class ResumptionTokenReader extends DefaultHandler {

    // class variables
    private final StringBuilder buffer;
    private boolean parsingValue;
    private ResumptionToken token;
    private SAXParser saxParser;
    private InputStream stream;
    private Attributes atts; // private copy of the attributes of the resumptionToken element

    /**
     * Parses the given stream and closes it afterwards.
     * Parse errors (SAXException, IOException) are only printed; in that
     * case getToken() may return null.
     *
     * @param stream the xml content
     * @throws IOException if no SAX parser can be configured
     */
    public ResumptionTokenReader(final InputStream stream) throws IOException {
        this.buffer = new StringBuilder();
        this.parsingValue = false;
        this.token = null;
        this.stream = stream;
        this.atts = null;
        final SAXParserFactory factory = SAXParserFactory.newInstance();
        try {
            this.saxParser = factory.newSAXParser();
            this.saxParser.parse(this.stream, this);
        } catch (SAXException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } catch (ParserConfigurationException e) {
            e.printStackTrace();
            // keep the original exception as cause of the one that is thrown
            final IOException ioe = new IOException(e.getMessage());
            ioe.initCause(e);
            throw ioe;
        } finally {
            try {
                this.stream.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * @return the token that was found in the stream, or null if the
     *         stream did not contain a resumptionToken element
     */
    public ResumptionToken getToken() {
        return this.token;
    }

    /*
    <resumptionToken expirationDate="2009-10-31T22:52:14Z"
    completeListSize="226"
    cursor="0">688</resumptionToken>
    */

    public void run() {
    }

    public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
        if ("resumptionToken".equals(tag)) {
            this.parsingValue = true;
            // the Attributes object handed in by the parser is only valid during
            // this call, therefore a copy is kept for use in endElement
            this.atts = new org.xml.sax.helpers.AttributesImpl(atts);
        }
    }

    public void endElement(final String uri, final String name, final String tag) {
        if (tag == null) return;
        if ("resumptionToken".equals(tag)) {
            this.token = new ResumptionToken(
                    atts.getValue("expirationDate"),
                    intAttribute("completeListSize"),
                    intAttribute("cursor"),
                    Integer.parseInt(buffer.toString().trim()));
            this.buffer.setLength(0);
            this.parsingValue = false;
        }
    }

    public void characters(final char ch[], final int start, final int length) {
        // text is only collected while inside the resumptionToken element
        if (parsingValue) {
            buffer.append(ch, start, length);
        }
    }

    /**
     * Reads an integer attribute of the resumptionToken element.
     * The attributes completeListSize and cursor are optional, a missing
     * attribute is therefore reported as 0 instead of failing in parseInt.
     */
    private int intAttribute(final String attributeName) {
        final String value = this.atts.getValue(attributeName);
        if (value == null) return 0;
        return Integer.parseInt(value);
    }

}
Loading…
Cancel
Save