added stub of oai-pmh importer (not working yet)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6437 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 77c99e500f
commit 30f108f97d

@ -0,0 +1,37 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': OAI-PMH Import</title>
#%env/templates/metas.template%#
#(import)#::<meta http-equiv="REFRESH" content="10" />#(/import)#
</head>
<body id="IndexImportOAIPMH">
#%env/templates/header.template%#
#%env/templates/submenuIntegration.template%#
<h2>OAI-PMH Import</h2>
#(import)#
<p>#(status)#No import thread is running, you can start a new thread here::Bad input data: #[message]# #(/status)#</p>
<form action="IndexImportOAIPMH_p.html" method="get">
<fieldset>
<legend>OAI-PMH Import: set an OAI-PMH URL</legend>
<input name="oaipmhurl" type="text" value="" size="80" />
<input name="submit" type="submit" value="Import from an OAI-PMH source" />
</fieldset>
</form>
::
<form><fieldset><legend>Import Process</legend>
<dl>
<dt>Thread:</dt><dd>#[thread]#</dd>
<dt>Source:</dt><dd>#[source]#</dd>
<dt>Processed:</dt><dd>#[count]# records</dd>
<dt>Speed:</dt><dd>#[speed]# records per second</dd>
<dt>Running Time:</dt><dd>#[runningHours]# hours, #[runningMinutes]# minutes</dd>
<dt>Remaining Time:</dt><dd>#[remainingHours]# hours, #[remainingMinutes]# minutes</dd>
</dl>
</fieldset></form>
#(/import)#
#%env/templates/footer.template%#
</body>
</html>

@ -0,0 +1,86 @@
// IndexImportOAIPMH_p.java
// -------------------------
// (C) 2009 by Michael Peter Christen; mc@yacy.net
// first published 04.05.2009 on http://yacy.net
// Frankfurt, Germany
//
// $LastChangedDate: 2009-10-11 23:29:18 +0200 (So, 11 Okt 2009) $
// $LastChangedRevision: 6400 $
// $LastChangedBy: orbiter $
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.net.MalformedURLException;
import net.yacy.document.importer.OAIPMHImporter;
import net.yacy.kelondro.data.meta.DigestURI;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet behind IndexImportOAIPMH_p.html.
 * <p>
 * If an OAI-PMH import thread is alive, its progress is reported; otherwise
 * the input form is shown and, when a URL was submitted, a new import thread
 * is started for that URL.
 */
public class IndexImportOAIPMH_p {

    /**
     * Fills the template properties for the OAI-PMH import page.
     *
     * @param header the request header (not evaluated here)
     * @param post   the submitted form values, or <code>null</code> if the page was
     *               requested without parameters; the import URL is read from
     *               the field <code>oaipmhurl</code>
     * @param env    the server environment; must be the {@link Switchboard}
     * @return the properties for the template: <code>import</code> is 1 while a
     *         job is running or was just started, 0 if the form is shown;
     *         in the latter case <code>import_status</code> is 0 for the plain
     *         form and 1 if the submitted URL was malformed
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;

        if (OAIPMHImporter.job != null && OAIPMHImporter.job.isAlive()) {
            // one import is running, no option to insert anything
            prop.put("import", 1);
            prop.put("import_thread", "running");
            prop.put("import_source", OAIPMHImporter.job.source());
            prop.put("import_count", OAIPMHImporter.job.count());
            prop.put("import_speed", OAIPMHImporter.job.speed());
            // the hour/minute split below treats the reported times as seconds
            // NOTE(review): confirm that the importer reports seconds, not milliseconds
            prop.put("import_runningHours", (OAIPMHImporter.job.runningTime() / 60) / 60);
            prop.put("import_runningMinutes", (OAIPMHImporter.job.runningTime() / 60) % 60);
            prop.put("import_remainingHours", (OAIPMHImporter.job.remainingTime() / 60) / 60);
            prop.put("import_remainingMinutes", (OAIPMHImporter.job.remainingTime() / 60) % 60);
            return prop;
        }

        // no import is running: show the plain input form unless stated otherwise below
        prop.put("import", 0);
        prop.put("import_status", 0);

        // the form submits the field "oaipmhurl"; without it there is nothing to start
        if (post == null || !post.containsKey("oaipmhurl")) {
            return prop;
        }

        final String oaipmhurl = post.get("oaipmhurl");
        try {
            final DigestURI url = new DigestURI(oaipmhurl, null);
            OAIPMHImporter.job = new OAIPMHImporter(sb.loader, url);
            OAIPMHImporter.job.start();
            prop.put("import", 1);
            prop.put("import_thread", "started");
            // the template displays #[source]#, so the key must be import_source
            prop.put("import_source", OAIPMHImporter.job.source());
            prop.put("import_count", 0);
            prop.put("import_speed", 0);
            prop.put("import_runningHours", 0);
            prop.put("import_runningMinutes", 0);
            prop.put("import_remainingHours", 0);
            prop.put("import_remainingMinutes", 0);
        } catch (final MalformedURLException e) {
            // the submitted URL could not be parsed: show the form again with the reason
            e.printStackTrace();
            prop.put("import", 0);
            prop.put("import_status", 1);
            prop.put("import_status_message", e.getMessage());
        }
        return prop;
    }
}

@ -25,11 +25,12 @@
import java.io.File;
import java.net.MalformedURLException;
import net.yacy.document.importer.MediawikiImporter;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.mediawikiIndex;
public class IndexImportWikimedia_p {
@ -37,17 +38,17 @@ public class IndexImportWikimedia_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
if (mediawikiIndex.job != null && mediawikiIndex.job.isAlive()) {
if (MediawikiImporter.job != null && MediawikiImporter.job.isAlive()) {
// one import is running, no option to insert anything
prop.put("import", 1);
prop.put("import_thread", "running");
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_count", mediawikiIndex.job.count);
prop.put("import_speed", mediawikiIndex.job.speed());
prop.put("import_runningHours", (mediawikiIndex.job.runningTime() / 60) / 60);
prop.put("import_runningMinutes", (mediawikiIndex.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (mediawikiIndex.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (mediawikiIndex.job.remainingTime() / 60) % 60);
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", MediawikiImporter.job.count());
prop.put("import_speed", MediawikiImporter.job.speed());
prop.put("import_runningHours", (MediawikiImporter.job.runningTime() / 60) / 60);
prop.put("import_runningMinutes", (MediawikiImporter.job.runningTime() / 60) % 60);
prop.put("import_remainingHours", (MediawikiImporter.job.remainingTime() / 60) / 60);
prop.put("import_remainingMinutes", (MediawikiImporter.job.remainingTime() / 60) % 60);
} else {
prop.put("import", 0);
if (post == null) {
@ -64,11 +65,11 @@ public class IndexImportWikimedia_p {
}
String lang = name.substring(0, 2);
try {
mediawikiIndex.job = new mediawikiIndex(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
mediawikiIndex.job.start();
MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath, "http://" + lang + ".wikipedia.org/wiki/");
MediawikiImporter.job.start();
prop.put("import", 1);
prop.put("import_thread", "started");
prop.put("import_dump", mediawikiIndex.job.sourcefile.getName());
prop.put("import_dump", MediawikiImporter.job.source());
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);

@ -27,11 +27,12 @@
import java.io.File;
import java.io.IOException;
import net.yacy.document.importer.MediawikiImporter;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.mediawikiIndex;
public class mediawiki_p {
@ -53,12 +54,12 @@ public class mediawiki_p {
File dumpFile = new File(sb.getRootPath(), "DATA/HTCACHE/mediawiki/" + dump);
if (!dumpFile.exists()) return post;
mediawikiIndex.checkIndex(dumpFile);
mediawikiIndex.wikisourcerecord w = mediawikiIndex.find(title.replaceAll(" ", "_"), mediawikiIndex.idxFromWikimediaXML(dumpFile));
MediawikiImporter.checkIndex(dumpFile);
MediawikiImporter.wikisourcerecord w = MediawikiImporter.find(title.replaceAll(" ", "_"), MediawikiImporter.idxFromWikimediaXML(dumpFile));
if (w == null) {
return post;
}
String page = new String(mediawikiIndex.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
String page = new String(MediawikiImporter.read(dumpFile, w.start, (int) (w.end - w.start)), "UTF-8");
int p = page.indexOf("<text");
if (p < 0) return prop;
p = page.indexOf('>', p);

@ -0,0 +1,40 @@
package net.yacy.document.importer;
/**
 * Common view on a background import job, so that servlets can report
 * progress without knowing the concrete importer.
 */
public interface Importer extends Runnable {

    /**
     * @return a textual description of the source that is imported
     */
    String source();

    /**
     * @return the number of records processed so far
     */
    int count();

    /**
     * @return the number of records processed per second
     */
    int speed();

    /**
     * @return the time this import is already running
     *         NOTE(review): the servlets shown with this interface divide the
     *         value by 60 to get minutes, i.e. they treat it as seconds -
     *         confirm that every implementation reports seconds
     */
    long runningTime();

    /**
     * @return the estimated time until all records are processed
     *         NOTE(review): same unit question as for {@link #runningTime()};
     *         the servlets treat the value as seconds
     */
    long remainingTime();

    /**
     * @return true if the import is still running
     */
    boolean isAlive();

    /**
     * start the import
     */
    void start();

    /**
     * the run method from runnable
     */
    void run();

}

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.tools;
package net.yacy.document.importer;
import net.yacy.document.Document;
import net.yacy.document.TextParser;
@ -71,7 +71,7 @@ import de.anomic.data.wiki.wikiParser;
* as referenced with xmlns="http://www.mediawiki.org/xml/export-0.3/"
*/
public class mediawikiIndex extends Thread {
public class MediawikiImporter extends Thread implements Importer {
private static final String textstart = "<text";
private static final String textend = "</text>";
@ -79,6 +79,9 @@ public class mediawikiIndex extends Thread {
private static final String pageend = "</page>";
private static final byte[] pagestartb = pagestart.getBytes();
private static final byte[] pageendb = pageend.getBytes();
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
public static Importer job; // if started from a servlet, this object is used to store the thread
protected wikiParser wparser;
protected String urlStub;
@ -89,11 +92,8 @@ public class mediawikiIndex extends Thread {
private long docsize;
private int approxdocs;
private static final int docspermbinxmlbz2 = 800; // documents per megabyte in a xml.bz2 wikimedia dump
public static mediawikiIndex job; // if started from a servlet, this object is used to store the thread
public mediawikiIndex(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
public MediawikiImporter(File sourcefile, File targetdir, String baseURL) throws MalformedURLException {
this.sourcefile = sourcefile;
this.docsize = sourcefile.length();
this.approxdocs = (int) (this.docsize * (long) docspermbinxmlbz2 / 1024L / 1024L);
@ -104,6 +104,14 @@ public class mediawikiIndex extends Thread {
this.start = 0;
}
/**
 * @return the current value of the count field of this importer
 */
public int count() {
    return count;
}
/**
 * @return the absolute path of the source file of this import
 */
public String source() {
    return sourcefile.getAbsolutePath();
}
/**
* return the number of articles per second
* @return
@ -738,7 +746,7 @@ public class mediawikiIndex extends Thread {
String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/
//String language = urlStub.substring(7,9);
try {
mediawikiIndex mi = new mediawikiIndex(sourcefile, targetdir, urlStub);
MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir, urlStub);
mi.start();
mi.join();
} catch (InterruptedException e) {

@ -24,7 +24,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
package net.yacy.document.importer;
import java.io.ByteArrayInputStream;
import java.io.IOException;
@ -35,21 +35,56 @@ import net.yacy.document.content.file.SurrogateReader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
public class PMHReader {
public class OAIPMHImporter extends Thread implements Importer {
LoaderDispatcher loader;
public static Importer job; // if started from a servlet, this object is used to store the thread
public PMHReader(LoaderDispatcher loader) {
private LoaderDispatcher loader;
private DigestURI source;
private int count;
private long startTime;
/**
 * Creates an importer for one OAI-PMH source. The record counter starts at
 * zero and the start time is taken here, at construction, not when the
 * thread is started.
 *
 * @param loader the loader that is used to fetch the source
 * @param source the URL of the OAI-PMH source
 */
public OAIPMHImporter(LoaderDispatcher loader, DigestURI source) {
    this.source = source;
    this.loader = loader;
    this.count = 0;
    this.startTime = System.currentTimeMillis();
}
/**
 * @return the current value of the record counter
 */
public int count() {
    return count;
}
/**
 * The total number of records is not known in advance, so no estimate can
 * be given.
 *
 * @return always {@link Long#MAX_VALUE}, meaning "unknown"
 */
public long remainingTime() {
    return Long.MAX_VALUE; // we don't know
}

/**
 * Returns the time since this importer was constructed, in seconds.
 * The servlets split this value into hours and minutes by dividing by 60,
 * so it must be reported in seconds rather than milliseconds.
 *
 * @return the running time in seconds
 */
public long runningTime() {
    return (System.currentTimeMillis() - this.startTime) / 1000L;
}

/**
 * @return the source URL in its normal form
 */
public String source() {
    return source.toNormalform(true, false);
}

/**
 * Returns the number of records processed per second. The rate is computed
 * from the elapsed milliseconds to keep precision during the first seconds.
 *
 * @return records per second, or 0 if no measurable time has elapsed yet
 */
public int speed() {
    final long elapsedMillis = System.currentTimeMillis() - this.startTime;
    if (elapsedMillis <= 0) {
        // right after construction no time has passed; avoid a division by zero
        return 0;
    }
    return (int) (1000L * ((long) count()) / elapsedMillis);
}
public void load(DigestURI source) throws IOException {
Response response = this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
load(response);
/**
 * Thread body: fetches the source through the loader, bypassing the cache,
 * and passes the response on to load(). An IOException is not propagated;
 * its stack trace is printed and the thread ends.
 */
public void run() {
    try {
        load(this.loader.load(source, true, true, CrawlProfile.CACHE_STRATEGY_NOCACHE));
    } catch (IOException e) {
        e.printStackTrace();
    }
}
public static void load0(DigestURI source) throws IOException {
Loading…
Cancel
Save