- added a CSV ("comma-separated values") parser to parse OAI-PMH sources from

http://roar.eprints.org/index.php?action=csv
- integrated the csv parser into the crawlers parser list
- extended the OAI-PMH import function to download the ROAR CSV file and show it using the CSV parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6448 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 0f63de8236
commit 9b6762ec2e

@ -3,9 +3,24 @@
<head>
<title>YaCy '#[clientname]#': OAI-PMH source import list</title>
#%env/templates/metas.template%#
<meta http-equiv="REFRESH" content="6" />
#(refresh)#::<meta http-equiv="REFRESH" content="6" />#(/refresh)#
</head>
<body>
#(source)#::
<h3>OAI Source List</h3>
<table cellpadding="2" cellspacing="1" >
<tr class="TableHeader">
<td>Source</td>
</tr>
#{table}#
<tr class="TableCell#(dark)#Light::Dark#(/dark)#">
<td>#[source]#</td>
</tr>
#{/table}#
</table>
#(/source)#
#(import)#::
<h3>Import List</h3>
<table cellpadding="2" cellspacing="1" >
<tr class="TableHeader">
@ -25,5 +40,6 @@
</tr>
#{/table}#
</table>
#(/import)#
</body>
</html>

@ -23,10 +23,12 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.util.ArrayList;
import java.util.Set;
import net.yacy.document.importer.OAIPMHImporter;
import de.anomic.http.server.RequestHeader;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -34,25 +36,49 @@ public class IndexImportOAIPMHList_p {
public static serverObjects respond(final RequestHeader header, serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
prop.put("refresh", 0);
prop.put("import", 0);
prop.put("source", 0);
ArrayList<OAIPMHImporter> jobs = new ArrayList<OAIPMHImporter>();
for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job);
if (post != null && post.containsKey("source")) {
Set<String> oaiRoots = OAIPMHImporter.getOAIServer(sb.loader);
boolean dark = false;
int cnt = 0;
for (String root: oaiRoots) {
prop.put("source_table_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("source_table_" + cnt + "_source", root);
dark = !dark;
cnt++;
}
prop.put("source_table", cnt);
prop.put("source", 1);
}
boolean dark = false;
int cnt = 0;
for (OAIPMHImporter job: jobs) {
prop.put("table_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("table_" + cnt + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/crawl.gif\" alt=\"running\" />" : "finished");
prop.put("table_" + cnt + "_source", job.source());
prop.put("table_" + cnt + "_chunkCount", job.chunkCount());
prop.put("table_" + cnt + "_recordsCount", job.count());
prop.put("table_" + cnt + "_speed", job.speed());
dark = !dark;
cnt++;
if (post != null && post.containsKey("import")) {
ArrayList<OAIPMHImporter> jobs = new ArrayList<OAIPMHImporter>();
for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job);
for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job);
boolean dark = false;
int cnt = 0;
for (OAIPMHImporter job: jobs) {
prop.put("import_table_" + cnt + "_dark", (dark) ? "1" : "0");
prop.put("import_table_" + cnt + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/crawl.gif\" alt=\"running\" />" : "finished");
prop.put("import_table_" + cnt + "_source", job.source());
prop.put("import_table_" + cnt + "_chunkCount", job.chunkCount());
prop.put("import_table_" + cnt + "_recordsCount", job.count());
prop.put("import_table_" + cnt + "_speed", job.speed());
dark = !dark;
cnt++;
}
prop.put("import_table", cnt);
prop.put("import", 1);
prop.put("refresh", 1);
}
prop.put("table", cnt);
return prop;
}

@ -15,7 +15,7 @@
<fieldset>
<legend>Single request import</legend>
This will submit only a single request as given here to a OAI-PMH server and imports records into the index<br />
<input name="urlstartone" type="text" value="#[defaulturl]#" size="100" />
<input name="urlstartone" type="text" value="#[defaulturl]#" size="80" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
#(import-one)#::
<p><dl>
@ -32,14 +32,15 @@
<form action="IndexImportOAIPMH_p.html" method="post" enctype="multipart/form-data">
<fieldset>
<legend>Import all Records from a server</legend>
Import all records that follow acording to resumption elements into index<br />
<input name="urlstartall" type="text" value="" size="100" />
<input name="submit" type="submit" value="Import OAI-PMH source" />
Import all records that follow according to resumption elements into index<br />
<input name="urlstartall" type="text" value="" size="80" />
<input name="importroot" type="submit" value="import this source" />
or&nbsp;<input name="getlist" type="submit" value="import from a list" />
#(status)#::<p>Import started!</p>::<p>Bad input data: #[message]# </p>#(/status)#
</fieldset>
</form>
<iframe name="OAI-PMH Import List"
src="/IndexImportOAIPMHList_p.html"
src="/IndexImportOAIPMHList_p.html#(iframetype)#::?import=::?source=#(/iframetype)#"
width="100%"
height="340"
frameborder="0"

@ -44,6 +44,7 @@ public class IndexImportOAIPMH_p {
prop.put("import-one", 0);
prop.put("status", 0);
prop.put("defaulturl", "");
prop.put("iframetype", (OAIPMHImporter.runningJobs.size() + OAIPMHImporter.startedJobs.size() + OAIPMHImporter.finishedJobs.size() == 0) ? 0 : 1);
if (post != null) {
if (post.containsKey("urlstartone")) {
@ -80,8 +81,8 @@ public class IndexImportOAIPMH_p {
}
}
if (post.containsKey("urlstartall")) {
String oaipmhurl = post.get("urlstartall");
if (post.containsKey("importroot")) {
String oaipmhurl = post.get("urlstartall", "");
DigestURI url = null;
try {
url = new DigestURI(oaipmhurl, null);
@ -94,6 +95,10 @@ public class IndexImportOAIPMH_p {
prop.put("status_message", e.getMessage());
}
}
if (post.containsKey("getlist")) {
prop.put("iframetype", 2);
}
}
return prop;
}

@ -42,6 +42,7 @@ import java.util.TreeMap;
import java.util.TreeSet;
import net.yacy.document.parser.bzipParser;
import net.yacy.document.parser.csvParser;
import net.yacy.document.parser.docParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.htmlParser;
@ -83,6 +84,7 @@ public final class TextParser {
static {
initParser(new bzipParser());
initParser(new csvParser());
initParser(new docParser());
initParser(new gzipParser());
initParser(new htmlParser());

@ -26,18 +26,32 @@
package net.yacy.document.importer;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.List;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.document.parser.csvParser;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
import de.anomic.search.Switchboard;
// get one server with
// http://roar.eprints.org/index.php?action=csv
// or
// http://www.openarchives.org/Register/BrowseSites
// or
// http://www.openarchives.org/Register/ListFriends
//
// list records from oai-pmh like
// http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc
@ -156,4 +170,40 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
if (this.serialNumber < o.serialNumber) return -1;
return 0;
}
/**
 * Reads the ROAR registry csv file and returns the values of its third column.
 * The file is expected at DATA/SETTINGS/roar.csv below the application root path;
 * if it does not exist there, it is downloaded once from roar.eprints.org and stored
 * at that place.
 *
 * NOTE(review): the third column presumably holds the repository (OAI-PMH) url and the
 * first row of the file is presumably a header line whose cell ends up in the result
 * as well - verify against the actual roar.csv layout.
 *
 * @param loader the loader that is used to fetch the csv file if no local copy exists
 * @return a sorted set with the third cell of every row that has at least three cells;
 *         the set is empty if the file could neither be loaded nor read
 */
public static Set<String> getOAIServer(LoaderDispatcher loader) {
    // index of the csv column that is collected
    final int sourceColumn = 2;
    TreeSet<String> list = new TreeSet<String>();

    // read roar
    File roar = new File(Switchboard.getSwitchboard().getRootPath(), "DATA/SETTINGS/roar.csv");
    DigestURI roarSource;
    try {
        roarSource = new DigestURI("http://roar.eprints.org/index.php?action=csv", null);
    } catch (MalformedURLException e) {
        e.printStackTrace();
        roarSource = null;
    }

    // without a valid source url there is nothing that can be downloaded
    if (!roar.exists() && roarSource != null) try {
        // load the file from the net
        Response response = loader.load(roarSource, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
        byte[] b = (response == null) ? null : response.getContent();
        // do not store an empty file: it would suppress every later download attempt
        if (b != null && b.length > 0) FileUtils.copy(b, roar);
    } catch (IOException e) {
        e.printStackTrace();
    }

    if (roar.exists()) {
        csvParser parser = new csvParser();
        FileInputStream in = null;
        try {
            in = new FileInputStream(roar);
            List<String[]> table = parser.getTable(roarSource, "", "UTF-8", in);
            for (String[] row: table) {
                // the parser keeps rows of any width, skip those without the wanted column
                if (row.length > sourceColumn) list.add(row[sourceColumn]);
            }
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } finally {
            // the parser does not close the stream, so release the file handle here
            if (in != null) try {
                in.close();
            } catch (IOException e) {
                // nothing to recover from a failed close of a read-only stream
            }
        }
    }
    return list;
}
}

@ -0,0 +1,149 @@
// CSVParser
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 02.10.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
// $LastChangedRevision: 6340 $
// $LastChangedBy: low012 $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Idiom;
import net.yacy.document.ParserException;
import net.yacy.kelondro.data.meta.DigestURI;
/**
 * a parser for comma-separated values
 * The values may also be separated by semicolon or tab,
 * the separator character is detected automatically
 * from the first non-empty line of the input.
 */
public class csvParser extends AbstractParser implements Idiom {

    /**
     * a list of mime types that are supported by this parser class;
     * no mime type is registered here, only the file extension is
     * @see #supportedMimeTypes()
     */
    public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();

    /**
     * a list of file extensions that are supported by this parser class
     * @see #supportedExtensions()
     */
    public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
    static {
        SUPPORTED_EXTENSIONS.add("csv");
    }

    public csvParser() {
        super("Comma Separated Value Parser");
    }

    /**
     * @return the (shared, mutable) set {@link #SUPPORTED_MIME_TYPES}
     */
    public Set<String> supportedMimeTypes() {
        return SUPPORTED_MIME_TYPES;
    }

    /**
     * @return the (shared, mutable) set {@link #SUPPORTED_EXTENSIONS}
     */
    public Set<String> supportedExtensions() {
        return SUPPORTED_EXTENSIONS;
    }

    /**
     * construct a document using all cells of the csv table
     * @param location the url of the parsed resource, passed to the document and to error reports
     * @param mimeType the mime type, passed to the document unchanged
     * @param charset the name of the character encoding of the source; if null, the platform default is used
     * @param source the stream with the csv content; it is read but not closed
     * @return a document whose text is the concatenation of all rows
     * @throws ParserException if the source contains no non-empty line or if the charset is not supported
     */
    @Override
    public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
        // the first row is used as headline
        // all lines are artificially terminated by a '.' to separate them as sentence for the condenser.
        List<String[]> table = getTable(location, mimeType, charset, source);
        if (table.isEmpty()) throw new ParserException("document has no lines", location);
        StringBuilder sb = new StringBuilder();
        for (String[] row: table) sb.append(concatRow(row)).append(' ');
        try {
            // a null charset name would cause a NullPointerException in getBytes(String)
            byte[] text = (charset == null) ? sb.toString().getBytes() : sb.toString().getBytes(charset);
            return new Document(
                    location,
                    mimeType,
                    charset,
                    null,
                    null,
                    concatRow(table.get(0)),
                    "",
                    null,
                    null,
                    text,
                    null,
                    null);
        } catch (UnsupportedEncodingException e) {
            throw new ParserException("error in csvParser, getBytes: " + e.getMessage(), location);
        }
    }

    /**
     * join all cells of a row to a single sentence
     * @param column the cells of one row
     * @return the cells separated by a single space and terminated with a '.'
     */
    public String concatRow(String[] column) {
        StringBuilder sb = new StringBuilder(80);
        for (int i = 0; i < column.length; i++) {
            if (i != 0) sb.append(' ');
            sb.append(column[i]);
        }
        sb.append('.');
        return sb.toString();
    }

    /**
     * read the source line by line and split every non-empty line into cells.
     * The separator is detected once using the first non-empty line and then applied to all lines.
     * Rows are not normalized: they may have different numbers of cells, and trailing
     * empty cells of a line are dropped by the split operation.
     * @param location not used by this method
     * @param mimeType not used by this method
     * @param charset the name of the character encoding; if null or not supported, the platform default is used
     * @param source the stream with the csv content; it is read but not closed, closing is left to the caller
     * @return the list of rows in the order of the source; if reading fails, the rows that were read so far
     */
    public List<String[]> getTable(DigestURI location, String mimeType, String charset, InputStream source) {
        ArrayList<String[]> rows = new ArrayList<String[]>();
        BufferedReader reader;
        try {
            // a null charset name would cause a NullPointerException in the reader constructor
            reader = new BufferedReader((charset == null) ? new InputStreamReader(source) : new InputStreamReader(source, charset));
        } catch (UnsupportedEncodingException e1) {
            reader = new BufferedReader(new InputStreamReader(source));
        }
        String row;
        String separator = null;
        try {
            while ((row = reader.readLine()) != null) {
                row = row.trim();
                if (row.length() == 0) continue;
                if (separator == null) separator = detectSeparator(row);
                // lines with a deviating number of columns are kept on purpose,
                // callers must check the length of a row before they access a cell
                rows.add(row.split(separator));
            }
        } catch (IOException e) {
            // best effort: a read error ends the parsing, the rows collected so far are returned
        }
        return rows;
    }

    /**
     * try comma, semicolon and tab; take that one that results with more columns.
     * If two candidates produce the same number of columns, tab is preferred over
     * semicolon and semicolon is preferred over comma.
     * @param row a non-empty line of the csv source
     * @return one of ",", ";" or "\t"
     */
    private static String detectSeparator(String row) {
        int commas = row.split(",").length;
        int semicolons = row.split(";").length;
        int tabs = row.split("\t").length;
        if (tabs >= semicolons && tabs >= commas) return "\t";
        // tab is not the maximum here, so the larger one of the remaining two is
        if (semicolons >= commas) return ";";
        return ",";
    }
}
Loading…
Cancel
Save