- added a catch-all parser for all documents that cannot be parsed otherwise: they contribute only their document URL to the search index

- enhanced the pdf and torrent parsers: better document titles
- enhanced the ftp client: longer time-out
- fixed bugs in the JSON output for search results
- enhanced yacyinteractive.html: added a file type navigator and a download-script generator for search result files

Please have a look at yacyinteractive.html: this will become the hacker-download tool for 27c3!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7355 6c8d7289-2bf4-0310-a012-ef5d649a1542
15 years ago · b769cce433
parent 6692e862ae
commit b769cce433
13 changed files with 191 additions and 53 deletions
--- a/htroot/Network.html
+++ b/htroot/Network.html
@ -49,8 +49,7 @@
  <body id="Network">

 <div id="api">
-<script type="text/javascript" src="/js/sorttable.js">
-</script>
+<script type="text/javascript" src="/js/sorttable.js"></script>
 <a href="Network.xml" id="apilink"><img src="/env/grafics/api.png" width="60" height="40" alt="API"/></a>
 <script type="text/javascript">
 //<![CDATA[
--- a/htroot/js/yacyinteractive.js
+++ b/htroot/js/yacyinteractive.js
@ -5,6 +5,7 @@ function xmlhttpPost() {

 function search(query) {
 //    var xmlHttpReq = false;
+    start = new Date();
    var self = this;
    if (window.XMLHttpRequest) { // Mozilla/Safari
        self.xmlHttpReq = new XMLHttpRequest(); 
@ -12,11 +13,12 @@ function search(query) {
    else if (window.ActiveXObject) { // IE
        self.xmlHttpReq = new ActiveXObject("Microsoft.XMLHTTP");
    }
-    self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&maximumRecords=100&nav=none&query=" + query, true);
+    self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&maximumRecords=1000&nav=all&query=" + query, true);
    self.xmlHttpReq.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
    self.xmlHttpReq.onreadystatechange = function() {
        if (self.xmlHttpReq.readyState == 4) {
-            updatepage(self.xmlHttpReq.responseText);
+            stop = new Date();
+            updatepage(query, self.xmlHttpReq.responseText, stop.getTime() - start.getTime());
        }
    }
    self.xmlHttpReq.send(null);
@ -28,44 +30,130 @@ function navget(list, name) {
  }
 }

-function updatepage(str) {
+var searchresult;
+
+function makeDownloadScript() {
+  script = "<div style=\"float:left\"><pre>";
+  for (var i = 0; i < searchresult.length; i++) {
+        var item = searchresult[i];
+        script += "curl -OL \"" + item.link + "\"\n";
+  }
+  script += "</pre></div>";
+  document.getElementById("downloadscript").innerHTML = script;
+  document.getElementById("downloadbutton").innerHTML = "<input id=\"downloadbutton\" type=\"button\" value=\"hide the download script\" onClick=\"hideDownloadScript();\"/></form>";
+}
+
+function hideDownloadScript() {
+  document.getElementById("downloadscript").innerHTML = "";
+  var dlb = document.getElementById("downloadbutton");
+  if (dlb) dlb.innerHTML = "<input type=\"button\" value=\"create a download script\" onClick=\"makeDownloadScript();\"/></form>";
+}
+
+function updatepage(query, str, time) {
  var raw = document.getElementById("raw");
  if (raw != null) raw.innerHTML = str;
  var rsp = eval("("+str+")");
  var firstChannel = rsp.channels[0];
+  searchresult = firstChannel.items;
  var totalResults = firstChannel.totalResults.replace(/[,.]/,"");
 //  var startIndex = firstChannel.startIndex;
 //  var itemsPerPage = firstChannel.itemsPerPage;
  var navigation = firstChannel.navigation;
  var topics = navget(navigation, "topics");
  
-  var html = "<span id=\"resCounter\" style=\"display: inline;\">total results = " + totalResults;
+  // analyse the search result
+  var filetypes = {};
+  for (var i = 0; i < firstChannel.items.length; i++) {
+    item = firstChannel.items[i];
+    if (item.link && item.link.length > 4) {
+      ext = item.link.substring(item.link.length - 4);
+      if (ext.charAt(0) == ".") {
+        ext = ext.substring(1).toLowerCase();
+        var count = filetypes[ext];
+        if (count) filetypes[ext]++; else filetypes[ext] = 1;
+      }
+    }
+  }
+  for (var key in filetypes) {
+    if (query.indexOf("filetype:" + key) >= 0) delete filetypes[key];
+  }
+
+  // show statistics
+  var html = "<span id=\"resCounter\" style=\"display: inline;\">";
+  if (firstChannel.items.length > 0) {
+      html += "<form><div style=\"float:left\">" + firstChannel.items.length + " results from a total of " + totalResults + " docs in index; search time: " + time + " milliseconds. </div>";
+      html += "<div id=\"downloadbutton\" style=\"float:left\"></div></form>";
+  } else {
+      if (query == "") {
+         html += "please enter some search words";
+      } else {
+         html += "no results";
+      }
+  }
+  html += "<br>";
+
+  // add extension navigation
+  var extnav = "";
+  for (var key in filetypes) {
+      if (filetypes[key] > 0)  { extnav += "<a style=\"text-decoration:underline\" href=\"/yacyinteractive.html?query=" + query + "+filetype:"+ key + "\">" + key + "</a>(" + filetypes[key] + ")&nbsp;&nbsp;";}
+  }
+  if (extnav.length > 0) {
+	  html += "apply a <b>filter</b> by filetype:&nbsp;&nbsp;&nbsp;&nbsp;" + extnav;
+  }
+
+  // add topic navigation  
  if (topics && topics.length > 0) {
    var topwords = "";
    for (var i = 0; i < topics.elements.length; i++) {
-        topwords += "<a href=\"yacyinteractive.html?query=" + firstChannel.searchTerms + "+" + topics.elements[i].name + "\">" + topics.elements[i].name + "</a> ";
+        topwords += "<a href=\"/yacyinteractive.html?query=" + query + "+" + topics.elements[i].name + "\">" + topics.elements[i].name + "</a> ";
        if (i > 10) break;
    }
    html += "&nbsp;&nbsp;&nbsp;topwords: " + topwords;
  }
-  html += "</span><br>";
+  html += "<br><div id=\"downloadscript\"></div></span><br>";
  
-  if (totalResults > 0) {
+  // display result
+  if (firstChannel.items.length > 0) {
    var item;
-    html += "<table class=\"sortable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
+    html += "<table class=\"sortable\" id=\"sortable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
    html += "<tr class=\"TableHeader\" valign=\"bottom\">";
-    html += "<td>Name</td>";
-    html += "<td width=\"60\">Size</td>";
+    html += "<td width=\"40\">Protocol</td>";
+    html += "<td width=\"60\">Host</td>";
+    html += "<td width=\"60\">Path</td>";
+    html += "<td width=\"60\">Name</td>";
+    html += "<td width=\"50\">Size</td>";
    //html += "<td>Description</td>";
-    html += "<td width=\"180\">Date</td></tr>";
+    html += "<td width=\"50\">Date</td></tr>";
    for (var i = 0; i < firstChannel.items.length; i++) {
        item = firstChannel.items[i];
-        html += "<tr class=\"TableCellLight\"><td align=\"left\"><a href=\"" + item.link + "\">" + item.title + "</a></td>";
+        p = item.link.indexOf("//");
+	    protocol = "";
+        host = "";
+        path = item.link;
+        if (p > 0) {
+        	q = item.link.indexOf("/", p + 2);
+            protocol = item.link.substring(0, p - 1);
+            host = item.link.substring(p + 2, q);
+            path = item.link.substring(q + 1);
+        }
+        html += "<tr class=\"TableCellLight\">";
+        html += "<td align=\"left\">" + protocol + "</td>";
+        html += "<td align=\"left\"><a href=\"" + protocol + "://" + host + "/" + "\">" + host + "</a></td>";
+        html += "<td align=\"left\"><a href=\"" + item.link + "\">" + path + "</a></td>";
+        title = item.title;
+        if (title == "") title = path;
+        html += "<td align=\"left\"><a href=\"" + item.link + "\">" + title + "</a></td>";
        html += "<td align=\"right\">" + item.sizename + "</td>";
        //html += "<td>" + item.description + "</td>";
-        html += "<td align=\"right\">" + item.pubDate + "</td></tr>";
+        pd = item.pubDate;
+        if (pd.substring(pd.length - 6) == " +0000") pd = pd.substring(0, pd.length - 6);
+        if (pd.substring(pd.length - 9) == " 00:00:00") pd = pd.substring(0, pd.length - 9);
+        if (pd.substring(pd.length - 5) == " 2010") pd = pd.substring(0, pd.length - 5);
+        html += "<td align=\"right\">" + pd + "</td>";
+        html += "</tr>";
    }
    html += "</table>";
  }
  document.getElementById("searchresults").innerHTML = html;
+  hideDownloadScript();
 }
--- a/htroot/yacyinteractive.html
+++ b/htroot/yacyinteractive.html
@ -35,12 +35,16 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
  <div class="yacylogo">
    <a href="#[promoteSearchPageGreeting.homepage]#" class="yacylogo"><img src="#[promoteSearchPageGreeting.smallImage]#" alt="yacysearch"/></a>
  </div>
-  <fieldset class="yacys"><input name="query" type="text" value="#[query]#" size="50" maxlength="80" /></fieldset>
+  <fieldset class="yacys"><input id="query" name="query" type="text" value="#[query]#" size="50" maxlength="80" /></fieldset>

 <!--<pre>Raw JSON String: <div id="raw"></div></pre>-->
 </form>
 <div id="searchresults"></div>
-
+<script type="text/javascript">
+//<![CDATA[
+document.getElementById("query").focus();
+//]]>
+</script>
 #%env/templates/footer.template%#
 </body>
 </html>
--- a/htroot/yacysearchtrailer.java
+++ b/htroot/yacysearchtrailer.java
@ -73,7 +73,7 @@ public class yacysearchtrailer {
            while (i < 10 && navigatorIterator.hasNext()) {
                name = navigatorIterator.next();
                count = namespaceNavigator.get(name);
-                prop.put("nav-namespace_element_" + i + "_name", name);
+                prop.putJSON("nav-namespace_element_" + i + "_name", name);
                prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + " (" + count + ")</a>");
                prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators));
                prop.put("nav-namespace_element_" + i + "_count", count);
@ -97,7 +97,7 @@ public class yacysearchtrailer {
            while (i < 20 && navigatorIterator.hasNext()) {
                name = navigatorIterator.next();
                count = hostNavigator.get(name);
-                prop.put("nav-domains_element_" + i + "_name", name);
+                prop.putJSON("nav-domains_element_" + i + "_name", name);
                prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "site:" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + " (" + count + ")</a>");
                prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "site:" + name, theQuery.urlMask.toString(), theQuery.navigators));
                prop.put("nav-domains_element_" + i + "_count", count);
@ -120,10 +120,10 @@ public class yacysearchtrailer {
            int i = 0;
            String anav;
            while (i < 20 && navigatorIterator.hasNext()) {
-                name = navigatorIterator.next();
+                name = navigatorIterator.next().trim();
                count = authorNavigator.get(name);
                anav = (name.indexOf(' ') < 0) ? "author:" + name : "author:'" + name.replace(" ", "+") + "'";
-                prop.put("nav-authors_element_" + i + "_name", name);
+                prop.putJSON("nav-authors_element_" + i + "_name", name);
                prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + " (" + count + ")</a>");
                prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators));
                prop.put("nav-authors_element_" + i + "_count", count);
@ -149,7 +149,7 @@ public class yacysearchtrailer {
                count = topicNavigator.get(name);
                if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
                if (name != null) {
-                    prop.putHTML("nav-topics_element_" + i + "_name", name);
+                    prop.putJSON("nav-topics_element_" + i + "_name", name);
                    prop.put("nav-topics_element_" + i + "_url",
                            "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + "</a>");
                            //+"<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+-" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">-</a>")*/;
--- a/source/de/anomic/crawler/retrieval/FTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/FTPLoader.java
@ -95,7 +95,6 @@ public class FTPLoader {

        // create new ftp client
        final FTPClient ftpClient = new FTPClient();
-        ftpClient.setDataTimeoutByMaxFilesize(maxFileSize);
        
        // get a connection
        if (openConnection(ftpClient, entryUrl)) {
@ -250,9 +249,6 @@ public class FTPLoader {
                    url.toNormalform(true, true).getBytes());
            return response;
        }
-
-        // timeout for download
-        ftpClient.setDataTimeoutByMaxFilesize(size);
        
        // download the remote file
        byte[] b = ftpClient.get(path);
--- a/source/net/yacy/cora/protocol/ftp/FTPClient.java
+++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java
@ -86,7 +86,7 @@ public class FTPClient {
    private Socket ControlSocket = null;

    // socket timeout
-    private static final int ControlSocketTimeout = 1000;
+    private static final int ControlSocketTimeout = 10000;

    // data socket timeout
    private int DataSocketTimeout = 0; // in seconds (default infinite)
@ -2450,22 +2450,6 @@ public class FTPClient {
        return ControlSocketTimeout;
    }

-    /**
-     * set timeout for data connections calculated for a minimum data rate
-     * 
-     * @param maxFilesize
-     * @return timeout in seconds
-     */
-    public void setDataTimeoutByMaxFilesize(final int maxFilesize) {
-        int timeout = 1;
-        if (DataSocketRate > 0) {
-            // calculate by minDataRate and MaxFTPFileSize
-            timeout = maxFilesize / DataSocketRate;
-        }
-
-        setDataSocketTimeout(timeout);
-    }
-
    /**
     * after this time the data connection is closed
     * 
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -132,7 +132,7 @@ public final class Condenser {
        
        Map.Entry<MultiProtocolURI, String> entry;
        if (indexText) {
-            createCondensement(document.getText(), meaningLib);        
+            createCondensement(document.getText(), meaningLib);
            // the phrase counter:
            // phrase   0 are words taken from the URL
            // phrase   1 is the MainTitle
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -232,7 +232,7 @@ dc_rights

    public InputStream getText() {
        try {
-            if (this.text == null) return null;
+            if (this.text == null) return new ByteArrayInputStream("".getBytes());

            if (this.text instanceof File) {
                this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
@ -245,7 +245,7 @@ dc_rights
        } catch (final Exception e) {
            Log.logException(e);
        }
-        return null; 
+        return new ByteArrayInputStream("".getBytes());
    }
    
    public byte[] getTextBytes() {
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@ -38,6 +38,7 @@ import net.yacy.cora.document.MultiProtocolURI;
 import net.yacy.document.parser.bzipParser;
 import net.yacy.document.parser.csvParser;
 import net.yacy.document.parser.docParser;
+import net.yacy.document.parser.genericParser;
 import net.yacy.document.parser.gzipParser;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.odtParser;
@ -64,6 +65,7 @@ public final class TextParser {
    private static final Log log = new Log("PARSER");
    private static final Object v = new Object();

+    private static final Parser genericIdiom = new genericParser();
    private static final Map<String, Parser> mime2parser = new ConcurrentHashMap<String, Parser>();
    private static final Map<String, Parser> ext2parser = new ConcurrentHashMap<String, Parser>();
    private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
@ -196,11 +198,13 @@ public final class TextParser {
        
        // in case that we know more parsers we first transform the content into a byte[] and use that as base
        // for a number of different parse attempts.
+        byte[] b = null;
        try {
-            return parseSource(location, mimeType, idioms, charset, FileUtils.read(sourceStream, (int) contentLength));
+            b = FileUtils.read(sourceStream, (int) contentLength);
        } catch (IOException e) {
            throw new Parser.Failure(e.getMessage(), location);
        }
+        return parseSource(location, mimeType, idioms, charset, b);
    }

    private static Document[] parseSource(
@ -325,8 +329,9 @@ public final class TextParser {
        idiom = mime2parser.get(mimeType2);
        if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
        
-        // finall check if we found any parser
-        if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url);
+        // always add the generic parser
+        idioms.add(genericIdiom);
+        //if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url);
        
        return idioms;
    }
--- a/source/net/yacy/document/parser/genericParser.java
+++ b/source/net/yacy/document/parser/genericParser.java
@ -0,0 +1,60 @@
+/**
+ *  genericParser
+ *  Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
+ *  First released 30.11.2010 at http://yacy.net
+ *
+ *  This library is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  License as published by the Free Software Foundation; either
+ *  version 2.1 of the License, or (at your option) any later version.
+ *  
+ *  This library is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  Lesser General Public License for more details.
+ *  
+ *  You should have received a copy of the GNU Lesser General Public License
+ *  along with this program in the file lgpl21.txt
+ *  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+package net.yacy.document.parser;
+
+import java.io.InputStream;
+
+import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+
+/**
+ * this parser can parse just anything because it uses only the uri/file/path information
+ */
+public class genericParser extends AbstractParser implements Parser {
+
+    public genericParser() {
+        super("Generic Parser");
+        // no SUPPORTED_EXTENSIONS and no SUPPORTED_MIME_TYPES
+        // this parser is used if no other fits. This parser fits all
+    }
+    
+    public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException {
+
+        return new Document[]{new Document(
+                location,
+                mimeType,
+                charset,
+                null,
+                null,
+                location.getFileName(), // title
+                "", // author 
+                location.getHost(),
+                null,
+                null,
+                "",
+                null,
+                null,
+                null,
+                false)};
+    }
+}
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@ -185,7 +185,7 @@ public class genericImageParser extends AbstractParser implements Parser {
        String infoString = ii.info.toString();
        images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
        
-        if (title == null) title = location.toNormalform(true, true);
+        if (title == null || title.length() == 0) title = location.getFileName();
        
        return new Document[]{new Document(
             location,
--- a/source/net/yacy/document/parser/pdfParser.java
+++ b/source/net/yacy/document/parser/pdfParser.java
@ -118,6 +118,7 @@ public class pdfParser extends AbstractParser implements Parser {
            // info.getModificationDate();
        }
        
+        if (docTitle == null || docTitle.length() == 0) docTitle = location.getFileName();
        CharBuffer writer = null;
        try {
            // create a writer for output
--- a/source/net/yacy/document/parser/torrentParser.java
+++ b/source/net/yacy/document/parser/torrentParser.java
@ -67,7 +67,7 @@ public class torrentParser extends AbstractParser implements Parser {
        //Date creation = new Date(map.get("creation date").getInteger());
        BObject infoo = map.get("info");
        StringBuilder filenames = new StringBuilder();
-        String name = "";
+        String title = "";
        if (infoo != null) {
            Map<String, BObject> info = infoo.getMap();
            BObject fileso = info.get("files");
@ -82,8 +82,9 @@ public class torrentParser extends AbstractParser implements Parser {
                }
            }
            BObject nameo = info.get("name");
-            if (nameo != null) name = new String(nameo.getString());
+            if (nameo != null) title = new String(nameo.getString());
        }
+        if (title == null || title.length() == 0) title = location.getFileName();
        try {
            return new Document[]{new Document(
                    location,
@ -91,7 +92,7 @@ public class torrentParser extends AbstractParser implements Parser {
                    charset,
                    null,
                    null,
-                    name, // title
+                    title, // title
                    comment, // author 
                    location.getHost(),
                    null,