From b769cce433e58b77ba47e133575f476750d1af74 Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 30 Nov 2010 16:13:55 +0000 Subject: [PATCH] - added a catch-all parser for all documents that cannot be parsed: they will contribute only their document url to the search index - enhanced the pdf and torrent parser: better document titles - enhanced the ftp client: longer time-outs - fixed bugs in json for search results - enhanced yacyinteractive.html: added a file type navigator and a download-script generator for search result files Please have a look at yacyinteractive.html: this will become the hacker-download tool for 27c3! git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7355 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Network.html | 3 +- htroot/js/yacyinteractive.js | 114 ++++++++++++++++-- htroot/yacyinteractive.html | 8 +- htroot/yacysearchtrailer.java | 10 +- .../anomic/crawler/retrieval/FTPLoader.java | 4 - .../net/yacy/cora/protocol/ftp/FTPClient.java | 18 +-- source/net/yacy/document/Condenser.java | 2 +- source/net/yacy/document/Document.java | 4 +- source/net/yacy/document/TextParser.java | 11 +- .../yacy/document/parser/genericParser.java | 60 +++++++++ .../parser/images/genericImageParser.java | 2 +- .../net/yacy/document/parser/pdfParser.java | 1 + .../yacy/document/parser/torrentParser.java | 7 +- 13 files changed, 191 insertions(+), 53 deletions(-) create mode 100644 source/net/yacy/document/parser/genericParser.java diff --git a/htroot/Network.html b/htroot/Network.html index e21ccca66..8a7495225 100644 --- a/htroot/Network.html +++ b/htroot/Network.html @@ -49,8 +49,7 @@
- + API #%env/templates/footer.template%# \ No newline at end of file diff --git a/htroot/yacysearchtrailer.java b/htroot/yacysearchtrailer.java index 499138b27..a4f6d4860 100644 --- a/htroot/yacysearchtrailer.java +++ b/htroot/yacysearchtrailer.java @@ -73,7 +73,7 @@ public class yacysearchtrailer { while (i < 10 && navigatorIterator.hasNext()) { name = navigatorIterator.next(); count = namespaceNavigator.get(name); - prop.put("nav-namespace_element_" + i + "_name", name); + prop.putJSON("nav-namespace_element_" + i + "_name", name); prop.put("nav-namespace_element_" + i + "_url", "" + name + " (" + count + ")"); prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators)); prop.put("nav-namespace_element_" + i + "_count", count); @@ -97,7 +97,7 @@ public class yacysearchtrailer { while (i < 20 && navigatorIterator.hasNext()) { name = navigatorIterator.next(); count = hostNavigator.get(name); - prop.put("nav-domains_element_" + i + "_name", name); + prop.putJSON("nav-domains_element_" + i + "_name", name); prop.put("nav-domains_element_" + i + "_url", "" + name + " (" + count + ")"); prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "site:" + name, theQuery.urlMask.toString(), theQuery.navigators)); prop.put("nav-domains_element_" + i + "_count", count); @@ -120,10 +120,10 @@ public class yacysearchtrailer { int i = 0; String anav; while (i < 20 && navigatorIterator.hasNext()) { - name = navigatorIterator.next(); + name = navigatorIterator.next().trim(); count = authorNavigator.get(name); anav = (name.indexOf(' ') < 0) ? 
"author:" + name : "author:'" + name.replace(" ", "+") + "'"; - prop.put("nav-authors_element_" + i + "_name", name); + prop.putJSON("nav-authors_element_" + i + "_name", name); prop.put("nav-authors_element_" + i + "_url", "" + name + " (" + count + ")"); prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators)); prop.put("nav-authors_element_" + i + "_count", count); @@ -149,7 +149,7 @@ public class yacysearchtrailer { count = topicNavigator.get(name); if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break; if (name != null) { - prop.putHTML("nav-topics_element_" + i + "_name", name); + prop.putJSON("nav-topics_element_" + i + "_name", name); prop.put("nav-topics_element_" + i + "_url", "" + name + ""); //+"-")*/; diff --git a/source/de/anomic/crawler/retrieval/FTPLoader.java b/source/de/anomic/crawler/retrieval/FTPLoader.java index 6d25d5c16..1eda7c0a3 100644 --- a/source/de/anomic/crawler/retrieval/FTPLoader.java +++ b/source/de/anomic/crawler/retrieval/FTPLoader.java @@ -95,7 +95,6 @@ public class FTPLoader { // create new ftp client final FTPClient ftpClient = new FTPClient(); - ftpClient.setDataTimeoutByMaxFilesize(maxFileSize); // get a connection if (openConnection(ftpClient, entryUrl)) { @@ -250,9 +249,6 @@ public class FTPLoader { url.toNormalform(true, true).getBytes()); return response; } - - // timeout for download - ftpClient.setDataTimeoutByMaxFilesize(size); // download the remote file byte[] b = ftpClient.get(path); diff --git a/source/net/yacy/cora/protocol/ftp/FTPClient.java b/source/net/yacy/cora/protocol/ftp/FTPClient.java index 17bf656df..ca8916f9b 100644 --- a/source/net/yacy/cora/protocol/ftp/FTPClient.java +++ b/source/net/yacy/cora/protocol/ftp/FTPClient.java @@ -86,7 +86,7 @@ public class FTPClient { private Socket ControlSocket = null; // socket timeout - private static final int 
ControlSocketTimeout = 1000; + private static final int ControlSocketTimeout = 10000; // data socket timeout private int DataSocketTimeout = 0; // in seconds (default infinite) @@ -2450,22 +2450,6 @@ public class FTPClient { return ControlSocketTimeout; } - /** - * set timeout for data connections calculated for a minimum data rate - * - * @param maxFilesize - * @return timeout in seconds - */ - public void setDataTimeoutByMaxFilesize(final int maxFilesize) { - int timeout = 1; - if (DataSocketRate > 0) { - // calculate by minDataRate and MaxFTPFileSize - timeout = maxFilesize / DataSocketRate; - } - - setDataSocketTimeout(timeout); - } - /** * after this time the data connection is closed * diff --git a/source/net/yacy/document/Condenser.java b/source/net/yacy/document/Condenser.java index 67c25c4c6..cbd2dfe0d 100644 --- a/source/net/yacy/document/Condenser.java +++ b/source/net/yacy/document/Condenser.java @@ -132,7 +132,7 @@ public final class Condenser { Map.Entry entry; if (indexText) { - createCondensement(document.getText(), meaningLib); + createCondensement(document.getText(), meaningLib); // the phrase counter: // phrase 0 are words taken from the URL // phrase 1 is the MainTitle diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 6d1b2ca5d..081d4e94d 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -232,7 +232,7 @@ dc_rights public InputStream getText() { try { - if (this.text == null) return null; + if (this.text == null) return new ByteArrayInputStream("".getBytes()); if (this.text instanceof File) { this.textStream = new BufferedInputStream(new FileInputStream((File)this.text)); @@ -245,7 +245,7 @@ dc_rights } catch (final Exception e) { Log.logException(e); } - return null; + return new ByteArrayInputStream("".getBytes()); } public byte[] getTextBytes() { diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java 
index 98e5dede5..f63c72681 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -38,6 +38,7 @@ import net.yacy.cora.document.MultiProtocolURI; import net.yacy.document.parser.bzipParser; import net.yacy.document.parser.csvParser; import net.yacy.document.parser.docParser; +import net.yacy.document.parser.genericParser; import net.yacy.document.parser.gzipParser; import net.yacy.document.parser.htmlParser; import net.yacy.document.parser.odtParser; @@ -64,6 +65,7 @@ public final class TextParser { private static final Log log = new Log("PARSER"); private static final Object v = new Object(); + private static final Parser genericIdiom = new genericParser(); private static final Map mime2parser = new ConcurrentHashMap(); private static final Map ext2parser = new ConcurrentHashMap(); private static final Map ext2mime = new ConcurrentHashMap(); @@ -196,11 +198,13 @@ public final class TextParser { // in case that we know more parsers we first transform the content into a byte[] and use that as base // for a number of different parse attempts. 
+ byte[] b = null; try { - return parseSource(location, mimeType, idioms, charset, FileUtils.read(sourceStream, (int) contentLength)); + b = FileUtils.read(sourceStream, (int) contentLength); } catch (IOException e) { throw new Parser.Failure(e.getMessage(), location); } + return parseSource(location, mimeType, idioms, charset, b); } private static Document[] parseSource( @@ -325,8 +329,9 @@ public final class TextParser { idiom = mime2parser.get(mimeType2); if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom); - // finall check if we found any parser - if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url); + // always add the generic parser + idioms.add(genericIdiom); + //if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url); return idioms; } diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java new file mode 100644 index 000000000..eb0259603 --- /dev/null +++ b/source/net/yacy/document/parser/genericParser.java @@ -0,0 +1,60 @@ +/** + * genericParser + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany + * First released 30.11.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.document.parser; + +import java.io.InputStream; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Parser; + +/** + * this parser can parse just anything because it uses only the uri/file/path information + */ +public class genericParser extends AbstractParser implements Parser { + + public genericParser() { + super("Generic Parser"); + // no SUPPORTED_EXTENSIONS and no SUPPORTED_MIME_TYPES + // this parser is used if no other fits. This parser fits all + } + + public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException { + + return new Document[]{new Document( + location, + mimeType, + charset, + null, + null, + location.getFileName(), // title + "", // author + location.getHost(), + null, + null, + "", + null, + null, + null, + false)}; + } +} diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 484211e9b..9561acf5c 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -185,7 +185,7 @@ public class genericImageParser extends AbstractParser implements Parser { String infoString = ii.info.toString(); images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); - if (title == null) title = location.toNormalform(true, true); + if (title == null || title.length() == 0) title = location.getFileName(); return new Document[]{new Document( location, diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index a813d6fee..eb69e3f1a 100644 --- 
a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -118,6 +118,7 @@ public class pdfParser extends AbstractParser implements Parser { // info.getModificationDate(); } + if (docTitle == null || docTitle.length() == 0) docTitle = location.getFileName(); CharBuffer writer = null; try { // create a writer for output diff --git a/source/net/yacy/document/parser/torrentParser.java b/source/net/yacy/document/parser/torrentParser.java index 810b73e20..4c7babdd6 100644 --- a/source/net/yacy/document/parser/torrentParser.java +++ b/source/net/yacy/document/parser/torrentParser.java @@ -67,7 +67,7 @@ public class torrentParser extends AbstractParser implements Parser { //Date creation = new Date(map.get("creation date").getInteger()); BObject infoo = map.get("info"); StringBuilder filenames = new StringBuilder(); - String name = ""; + String title = ""; if (infoo != null) { Map info = infoo.getMap(); BObject fileso = info.get("files"); @@ -82,8 +82,9 @@ public class torrentParser extends AbstractParser implements Parser { } } BObject nameo = info.get("name"); - if (nameo != null) name = new String(nameo.getString()); + if (nameo != null) title = new String(nameo.getString()); } + if (title == null || title.length() == 0) title = location.getFileName(); try { return new Document[]{new Document( location, @@ -91,7 +92,7 @@ public class torrentParser extends AbstractParser implements Parser { charset, null, null, - name, // title + title, // title comment, // author location.getHost(), null,