- added a CSV ("comma-separated values") parser to parse the list of OAI-PMH sources from http://roar.eprints.org/index.php?action=csv
- integrated the CSV parser into the crawler's parser list
- added an extension to the OAI-PMH import function to download and show the ROAR CSV file using the CSV parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6448 6c8d7289-2bf4-0310-a012-ef5d649a1542
16 years ago · 9b6762ec2e
parent 0f63de8236
commit 9b6762ec2e
7 changed files with 273 additions and 24 deletions
--- a/htroot/IndexImportOAIPMHList_p.html
+++ b/htroot/IndexImportOAIPMHList_p.html
@ -3,9 +3,24 @@
  <head>
    <title>YaCy '#[clientname]#': OAI-PMH source import list</title>
    #%env/templates/metas.template%#
-    <meta http-equiv="REFRESH" content="6" />
+    #(refresh)#::<meta http-equiv="REFRESH" content="6" />#(/refresh)#
  </head>
  <body>
+    #(source)#::
+    <h3>OAI Source List</h3>
+    <table cellpadding="2" cellspacing="1" >
+      <tr class="TableHeader">
+        <td>Source</td>
+      </tr>
+      #{table}#
+      <tr class="TableCell#(dark)#Light::Dark#(/dark)#">
+        <td>#[source]#</td>
+      </tr>
+      #{/table}#
+    </table>
+    #(/source)#
+    
+    #(import)#::
    <h3>Import List</h3>
    <table cellpadding="2" cellspacing="1" >
      <tr class="TableHeader">
@ -25,5 +40,6 @@
      </tr>
      #{/table}#
    </table>
+    #(/import)#
  </body>
 </html>
--- a/htroot/IndexImportOAIPMHList_p.java
+++ b/htroot/IndexImportOAIPMHList_p.java
@ -23,10 +23,12 @@
 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

 import java.util.ArrayList;
+import java.util.Set;

 import net.yacy.document.importer.OAIPMHImporter;

 import de.anomic.http.server.RequestHeader;
+import de.anomic.search.Switchboard;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;

@ -34,25 +36,49 @@ public class IndexImportOAIPMHList_p {

    public static serverObjects respond(final RequestHeader header, serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
+        final Switchboard sb = (Switchboard) env;
+
+        prop.put("refresh", 0);
+        prop.put("import", 0);
+        prop.put("source", 0);
        
-        ArrayList<OAIPMHImporter> jobs = new ArrayList<OAIPMHImporter>();
-        for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job);
-        for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job);
-        for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job);
+        if (post != null && post.containsKey("source")) {
+            Set<String> oaiRoots = OAIPMHImporter.getOAIServer(sb.loader);
+            
+            boolean dark = false;
+            int cnt = 0;
+            for (String root: oaiRoots) {
+                prop.put("source_table_" + cnt + "_dark", (dark) ? "1" : "0");
+                prop.put("source_table_" + cnt + "_source", root);
+                dark = !dark;
+                cnt++;
+            }
+            prop.put("source_table", cnt);
+            prop.put("source", 1);
+        }
        
-        boolean dark = false;
-        int cnt = 0;
-        for (OAIPMHImporter job: jobs) {
-            prop.put("table_" + cnt + "_dark", (dark) ? "1" : "0");
-            prop.put("table_" + cnt + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/crawl.gif\" alt=\"running\" />" : "finished");
-            prop.put("table_" + cnt + "_source", job.source());
-            prop.put("table_" + cnt + "_chunkCount", job.chunkCount());
-            prop.put("table_" + cnt + "_recordsCount", job.count());
-            prop.put("table_" + cnt + "_speed", job.speed());
-            dark = !dark;
-            cnt++;
+        if (post != null && post.containsKey("import")) {
+            ArrayList<OAIPMHImporter> jobs = new ArrayList<OAIPMHImporter>();
+            for (OAIPMHImporter job: OAIPMHImporter.runningJobs) jobs.add(job);
+            for (OAIPMHImporter job: OAIPMHImporter.startedJobs) jobs.add(job);
+            for (OAIPMHImporter job: OAIPMHImporter.finishedJobs) jobs.add(job);
+            
+            boolean dark = false;
+            int cnt = 0;
+            for (OAIPMHImporter job: jobs) {
+                prop.put("import_table_" + cnt + "_dark", (dark) ? "1" : "0");
+                prop.put("import_table_" + cnt + "_thread", (job.isAlive()) ? "<img src=\"/env/grafics/crawl.gif\" alt=\"running\" />" : "finished");
+                prop.put("import_table_" + cnt + "_source", job.source());
+                prop.put("import_table_" + cnt + "_chunkCount", job.chunkCount());
+                prop.put("import_table_" + cnt + "_recordsCount", job.count());
+                prop.put("import_table_" + cnt + "_speed", job.speed());
+                dark = !dark;
+                cnt++;
+            }
+            prop.put("import_table", cnt);
+            prop.put("import", 1);
+            prop.put("refresh", 1);
        }
-        prop.put("table", cnt);
        return prop;
    }

--- a/htroot/IndexImportOAIPMH_p.html
+++ b/htroot/IndexImportOAIPMH_p.html
@ -15,7 +15,7 @@
      <fieldset>
        <legend>Single request import</legend>
        This will submit only a single request as given here to a OAI-PMH server and imports records into the index<br />
-        <input name="urlstartone" type="text" value="#[defaulturl]#" size="100" />
+        <input name="urlstartone" type="text" value="#[defaulturl]#" size="80" />
        <input name="submit" type="submit" value="Import OAI-PMH source" />
        #(import-one)#::
        <p><dl>
@ -32,14 +32,15 @@
    <form action="IndexImportOAIPMH_p.html" method="post" enctype="multipart/form-data">
        <fieldset>
          <legend>Import all Records from a server</legend>
-          Import all records that follow acording to resumption elements into index<br />
-          <input name="urlstartall" type="text" value="" size="100" />
-          <input name="submit" type="submit" value="Import OAI-PMH source" />
+          Import all records that follow according to resumption elements into index<br />
+          <input name="urlstartall" type="text" value="" size="80" />
+          <input name="importroot" type="submit" value="import this source" />
+          or&nbsp;<input name="getlist" type="submit" value="import from a list" />
          #(status)#::<p>Import started!</p>::<p>Bad input data: #[message]# </p>#(/status)#
        </fieldset>
    </form>
    <iframe name="OAI-PMH Import List"
-       src="/IndexImportOAIPMHList_p.html"
+       src="/IndexImportOAIPMHList_p.html#(iframetype)#::?import=::?source=#(/iframetype)#"
       width="100%"
       height="340"
       frameborder="0"
--- a/htroot/IndexImportOAIPMH_p.java
+++ b/htroot/IndexImportOAIPMH_p.java
@ -44,6 +44,7 @@ public class IndexImportOAIPMH_p {
        prop.put("import-one", 0);
        prop.put("status", 0);
        prop.put("defaulturl", "");
+        prop.put("iframetype", (OAIPMHImporter.runningJobs.size() + OAIPMHImporter.startedJobs.size() + OAIPMHImporter.finishedJobs.size() == 0) ? 0 : 1);
        
        if (post != null) {
            if (post.containsKey("urlstartone")) {
@ -80,8 +81,8 @@ public class IndexImportOAIPMH_p {
                }
            }
            
-            if (post.containsKey("urlstartall")) {
-                String oaipmhurl = post.get("urlstartall");
+            if (post.containsKey("importroot")) {
+                String oaipmhurl = post.get("urlstartall", "");
                DigestURI url = null;
                try {
                    url = new DigestURI(oaipmhurl, null);
@ -94,6 +95,10 @@ public class IndexImportOAIPMH_p {
                    prop.put("status_message", e.getMessage());
                }
            }
+            
+            if (post.containsKey("getlist")) {
+                prop.put("iframetype", 2);
+            }
        }
        return prop;
    }
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -42,6 +42,7 @@ import java.util.TreeMap;
 import java.util.TreeSet;

 import net.yacy.document.parser.bzipParser;
+import net.yacy.document.parser.csvParser;
 import net.yacy.document.parser.docParser;
 import net.yacy.document.parser.gzipParser;
 import net.yacy.document.parser.htmlParser;
@ -83,6 +84,7 @@ public final class TextParser {
    
    static {
        initParser(new bzipParser());
+        initParser(new csvParser());
        initParser(new docParser());
        initParser(new gzipParser());
        initParser(new htmlParser());
--- a/source/net/yacy/document/importer/OAIPMHImporter.java
+++ b/source/net/yacy/document/importer/OAIPMHImporter.java
@ -26,18 +26,32 @@

 package net.yacy.document.importer;

+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Set;
 import java.util.TreeSet;

 import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.util.FileUtils;
 import net.yacy.repository.LoaderDispatcher;
+import net.yacy.document.parser.csvParser;

+import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.retrieval.Response;
 import de.anomic.search.Switchboard;


 // get one server with
 // http://roar.eprints.org/index.php?action=csv
+// or
+// http://www.openarchives.org/Register/BrowseSites
+// or
+// http://www.openarchives.org/Register/ListFriends
+//
 // list records from oai-pmh like
 // http://opus.bsz-bw.de/fhhv/oai2/oai2.php?verb=ListRecords&metadataPrefix=oai_dc

@ -156,4 +170,40 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
        if (this.serialNumber < o.serialNumber) return -1;
        return 0;
    }
+    
+    /**
+     * Collects the entries of the third column of the ROAR registry csv export.
+     * The export is read from the local file DATA/SETTINGS/roar.csv below the
+     * application root path; if that file does not exist yet, it is downloaded
+     * once from http://roar.eprints.org/index.php?action=csv and stored there.
+     * Download and read errors are reported with a stack trace and result in a
+     * smaller (possibly empty) set; they are never propagated to the caller.
+     *
+     * NOTE(review): column index 2 is presumably the OAI-PMH interface url of
+     * the archive, and the header row of the export is not filtered out here -
+     * verify both against the actual csv file.
+     *
+     * @param loader the loader that is used to fetch the csv file if no local copy exists
+     * @return a sorted set of the third-column values of all rows that have at least three columns
+     */
+    public static Set<String> getOAIServer(LoaderDispatcher loader) {
+        final TreeSet<String> list = new TreeSet<String>();
+
+        // read roar
+        final File roar = new File(Switchboard.getSwitchboard().getRootPath(), "DATA/SETTINGS/roar.csv");
+        DigestURI roarSource;
+        try {
+            roarSource = new DigestURI("http://roar.eprints.org/index.php?action=csv", null);
+        } catch (MalformedURLException e) {
+            e.printStackTrace();
+            roarSource = null;
+        }
+        // without a valid source url there is nothing that could be downloaded
+        if (!roar.exists() && roarSource != null) try {
+            // load the file from the net
+            final Response response = loader.load(roarSource, false, true, CrawlProfile.CACHE_STRATEGY_NOCACHE);
+            final byte[] b = (response == null) ? null : response.getContent();
+            if (b != null) FileUtils.copy(b, roar);
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+        if (roar.exists()) {
+            final csvParser parser = new csvParser();
+            FileInputStream in = null;
+            try {
+                in = new FileInputStream(roar);
+                final List<String[]> table = parser.getTable(roarSource, "", "UTF-8", in);
+                for (final String[] row: table) {
+                    // the parser does not enforce a uniform column count, so short rows must be skipped
+                    if (row.length > 2) list.add(row[2]);
+                }
+            } catch (FileNotFoundException e) {
+                e.printStackTrace();
+            } finally {
+                // the stream is owned by this method; release the file handle in every case
+                if (in != null) try {
+                    in.close();
+                } catch (IOException e) {
+                    e.printStackTrace();
+                }
+            }
+        }
+        
+        return list;
+    }
+    
 }
--- a/source/net/yacy/document/parser/csvParser.java
+++ b/source/net/yacy/document/parser/csvParser.java
@ -0,0 +1,149 @@
+// CSVParser
+// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
+// first published 02.10.2009 on http://yacy.net
+//
+// This is a part of YaCy, a peer-to-peer based web search engine
+//
+// $LastChangedDate: 2009-09-23 23:26:14 +0200 (Mi, 23 Sep 2009) $
+// $LastChangedRevision: 6340 $
+// $LastChangedBy: low012 $
+//
+// LICENSE
+// 
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+
+package net.yacy.document.parser;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Idiom;
+import net.yacy.document.ParserException;
+import net.yacy.kelondro.data.meta.DigestURI;
+
+/**
+ * a parser for comma-separated values
+ * The values may also be separated by semicolon or tab,
+ * the separator character is detected automatically from the first non-empty line.
+ * Quoting and escaping of separator characters inside of cells is not handled;
+ * every line is simply split at the detected separator.
+ */
+public class csvParser extends AbstractParser implements Idiom {
+
+    /**
+     * a list of mime types that are supported by this parser class
+     * (currently empty, the parser is selected by file extension only)
+     * @see #supportedMimeTypes()
+     */    
+    public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
+
+    /**
+     * a list of file extensions that are supported by this parser class
+     * @see #supportedExtensions()
+     */
+    public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
+    static {
+        SUPPORTED_EXTENSIONS.add("csv");
+    }
+    
+    public csvParser() {
+        super("Comma Separated Value Parser");
+    }
+    
+    /**
+     * @return the set of mime types this parser is registered for
+     */
+    public Set<String> supportedMimeTypes() {
+        return SUPPORTED_MIME_TYPES;
+    }
+    
+    /**
+     * @return the set of file extensions this parser is registered for
+     */
+    public Set<String> supportedExtensions() {
+        return SUPPORTED_EXTENSIONS;
+    }
+    
+    /**
+     * Parses a csv source into a document that contains all cells as text.
+     *
+     * @param location the url of the source; used for error reporting and passed on to the document
+     * @param mimeType the mime type that is passed on to the document
+     * @param charset the name of the character encoding of the source; may be null, then the platform default is used
+     * @param source the stream to read the csv lines from
+     * @return a document made from all cells of the table
+     * @throws ParserException if the source contains no non-empty line or the charset is not supported
+     */
+    @Override
+    public Document parse(DigestURI location, String mimeType, String charset, InputStream source) throws ParserException, InterruptedException {
+        // construct a document using all cells of the document
+        // the first row is used as headline
+        // all lines are artificially terminated by a '.' to separate them as sentence for the condenser.
+        final List<String[]> table = getTable(location, mimeType, charset, source);
+        if (table.isEmpty()) throw new ParserException("document has no lines", location);
+        final StringBuilder sb = new StringBuilder();
+        for (final String[] row: table) sb.append(concatRow(row)).append(' ');
+        try {
+            // a missing charset must not cause a NullPointerException; use the platform default then
+            final byte[] text = (charset == null) ? sb.toString().getBytes() : sb.toString().getBytes(charset);
+            return new Document(
+                    location,
+                    mimeType,
+                    charset,
+                    null,
+                    null,
+                    concatRow(table.get(0)),
+                    "",
+                    null,
+                    null,
+                    text,
+                    null,
+                    null);
+        } catch (UnsupportedEncodingException e) {
+            throw new ParserException("error in csvParser, getBytes: " + e.getMessage(), location);
+        }
+    }
+
+    /**
+     * Joins the cells of one row into a single sentence.
+     *
+     * @param column the cells of the row
+     * @return the cells separated by a single space and terminated with a '.'
+     */
+    public String concatRow(String[] column) {
+        final StringBuilder sb = new StringBuilder(80);
+        for (int i = 0; i < column.length; i++) {
+            if (i != 0) sb.append(' ');
+            sb.append(column[i]);
+        }
+        sb.append('.');
+        return sb.toString();
+    }
+    
+    /**
+     * Reads the source line by line and splits every non-empty line into cells.
+     * The separator is detected once, using the first non-empty line, and is then
+     * applied to all lines. Lines are trimmed before they are split and rows may
+     * have different numbers of cells.
+     * If reading fails with an IOException, the rows that were read so far are returned.
+     * The source stream is not closed by this method.
+     *
+     * @param location not used by this method
+     * @param mimeType not used by this method
+     * @param charset the name of the character encoding of the source; if it is null or not supported, the platform default is used
+     * @param source the stream to read the csv lines from
+     * @return the rows of the table in the order of the lines; never null
+     */
+    public List<String[]> getTable(DigestURI location, String mimeType, String charset, InputStream source) {
+        final ArrayList<String[]> rows = new ArrayList<String[]>();
+        BufferedReader reader;
+        try {
+            if (charset == null) {
+                // InputStreamReader throws a NullPointerException for a null charset name
+                reader = new BufferedReader(new InputStreamReader(source));
+            } else {
+                reader = new BufferedReader(new InputStreamReader(source, charset));
+            }
+        } catch (UnsupportedEncodingException e1) {
+            reader = new BufferedReader(new InputStreamReader(source));
+        }
+        String row;
+        String separator = null;
+        try {
+            while ((row = reader.readLine()) != null) {
+                row = row.trim();
+                if (row.length() == 0) continue;
+                if (separator == null) separator = detectSeparator(row);
+                rows.add(row.split(separator));
+            }
+        } catch (IOException e) {
+            // deliberate best effort: the rows that could be read are still returned
+        }
+        return rows;
+    }
+
+    /**
+     * Chooses the separator that splits the given line into the most cells.
+     * If several separators produce the same number of cells, comma is preferred
+     * over semicolon and semicolon is preferred over tab.
+     *
+     * @param row a non-empty line of the source
+     * @return one of ",", ";" or "\t"
+     */
+    private static String detectSeparator(final String row) {
+        final int commaCells = row.split(",").length;
+        final int semicolonCells = row.split(";").length;
+        final int tabCells = row.split("\t").length;
+        if (commaCells >= semicolonCells && commaCells >= tabCells) return ",";
+        if (semicolonCells >= tabCells) return ";";
+        return "\t";
+    }
+
+}