- added a catch-all parser for all documents that cannot otherwise be parsed: they contribute only their document URL to the search index

- enhanced the pdf and torrent parsers: better document titles
- enhanced the ftp client: longer time-outs
- fixed bugs in json for search results
- enhanced yacyinteractive.html: added a file type navigator and a download-script generator for search result files

Please have a look at yacyinteractive.html: this will become the hacker-download tool for 27c3!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7355 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 6692e862ae
commit b769cce433

@ -49,8 +49,7 @@
<body id="Network">
<div id="api">
<script type="text/javascript" src="/js/sorttable.js">
</script>
<script type="text/javascript" src="/js/sorttable.js"></script>
<a href="Network.xml" id="apilink"><img src="/env/grafics/api.png" width="60" height="40" alt="API"/></a>
<script type="text/javascript">
//<![CDATA[

@ -5,6 +5,7 @@ function xmlhttpPost() {
function search(query) {
// var xmlHttpReq = false;
start = new Date();
var self = this;
if (window.XMLHttpRequest) { // Mozilla/Safari
self.xmlHttpReq = new XMLHttpRequest();
@ -12,11 +13,12 @@ function search(query) {
else if (window.ActiveXObject) { // IE
self.xmlHttpReq = new ActiveXObject("Microsoft.XMLHTTP");
}
self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&maximumRecords=100&nav=none&query=" + query, true);
self.xmlHttpReq.open('GET', "yacysearch.json?verify=false&resource=local&maximumRecords=1000&nav=all&query=" + query, true);
self.xmlHttpReq.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded');
self.xmlHttpReq.onreadystatechange = function() {
if (self.xmlHttpReq.readyState == 4) {
updatepage(self.xmlHttpReq.responseText);
stop = new Date();
updatepage(query, self.xmlHttpReq.responseText, stop.getTime() - start.getTime());
}
}
self.xmlHttpReq.send(null);
@ -28,44 +30,130 @@ function navget(list, name) {
}
}
function updatepage(str) {
var searchresult;
/**
 * Renders a shell script with one `curl -OL` line per current search result
 * into the "downloadscript" element and turns the content of the
 * "downloadbutton" element into a "hide the download script" button.
 *
 * Reads the global `searchresult` array. The links in it come from the
 * search response and are untrusted, so every link is first quoted for the
 * shell and then HTML-escaped before it is written through innerHTML.
 * Entries without a link are skipped; missing target elements are ignored.
 */
function makeDownloadScript() {
    // Single quotes disable every shell expansion ($(...), backticks, $VAR);
    // an embedded ' is written as '\'' (close quote, escaped quote, reopen).
    function shellQuote(s) {
        return "'" + String(s).replace(/'/g, "'\\''") + "'";
    }
    // Neutralise the characters innerHTML would otherwise treat as markup.
    function htmlEscape(s) {
        return String(s)
            .replace(/&/g, "&amp;")
            .replace(/</g, "&lt;")
            .replace(/>/g, "&gt;")
            .replace(/"/g, "&quot;");
    }

    var items = searchresult || [];
    // declared locally: the former bare assignment created a global `script`
    var script = "<div style=\"float:left\"><pre>";
    for (var i = 0; i < items.length; i++) {
        var item = items[i];
        if (!item || !item.link) continue; // nothing to download for this entry
        script += htmlEscape("curl -OL " + shellQuote(item.link)) + "\n";
    }
    script += "</pre></div>";

    var target = document.getElementById("downloadscript");
    if (target) target.innerHTML = script;

    // The button is placed inside the element that already owns the id
    // "downloadbutton", so the input itself must not repeat that id.
    var button = document.getElementById("downloadbutton");
    if (button) button.innerHTML = "<input type=\"button\" value=\"hide the download script\" onClick=\"hideDownloadScript();\"/>";
}
/**
 * Clears the generated download script and resets the content of the
 * "downloadbutton" element to the "create a download script" button.
 *
 * Both elements are optional: the button container is only rendered when a
 * search produced results, and this function may run before any result
 * markup exists. A missing element is skipped instead of raising an error.
 */
function hideDownloadScript() {
    var scriptBox = document.getElementById("downloadscript");
    if (scriptBox) scriptBox.innerHTML = "";

    var dlb = document.getElementById("downloadbutton");
    // the enclosing <form> is closed by the surrounding markup, so no closing tag here
    if (dlb) dlb.innerHTML = "<input type=\"button\" value=\"create a download script\" onClick=\"makeDownloadScript();\"/>";
}
// Renders the answer of a search request into the element with id "searchresults".
//   query - the search string; used to hide already-applied filetype filters and to build navigation links
//   str   - raw response text; evaluated as a JavaScript object expression
//   time  - elapsed search time, printed as milliseconds in the statistics line
// Side effects: stores the result items in the global `searchresult`, optionally mirrors
// `str` into an element with id "raw", and finally calls hideDownloadScript().
// NOTE(review): this listing looks like a rendered diff - for several statements both an
// earlier and a replacement form are present (two `var html` initialisations, two topword
// links, two result guards, two table-opening lines, duplicated header/date cells).
// Confirm which lines are live before changing any code here.
function updatepage(query, str, time) {
// optional debug sink: only written when an element with id "raw" exists
var raw = document.getElementById("raw");
if (raw != null) raw.innerHTML = str;
// NOTE(review): eval() executes whatever text the server returned; JSON.parse would be
// the safer choice if the response is strict JSON - confirm before switching
var rsp = eval("("+str+")");
var firstChannel = rsp.channels[0];
// keep the items in the global so the download-script generator can read them later
searchresult = firstChannel.items;
// the regex has no g flag, so only the first ',' or '.' is removed; the value stays a
// string and the `totalResults > 0` test further down relies on numeric coercion
var totalResults = firstChannel.totalResults.replace(/[,.]/,"");
// var startIndex = firstChannel.startIndex;
// var itemsPerPage = firstChannel.itemsPerPage;
var navigation = firstChannel.navigation;
var topics = navget(navigation, "topics");
var html = "<span id=\"resCounter\" style=\"display: inline;\">total results = " + totalResults;
// analyse the search result
// counts results per file extension; only links whose last four characters are a dot
// followed by three characters are counted, keyed by the lower-cased extension
// NOTE(review): `ext` (and later p, q, protocol, host, path, title, pd) is assigned
// without `var` and therefore becomes an implicit global
var filetypes = {};
for (var i = 0; i < firstChannel.items.length; i++) {
item = firstChannel.items[i];
if (item.link && item.link.length > 4) {
ext = item.link.substring(item.link.length - 4);
if (ext.charAt(0) == ".") {
ext = ext.substring(1).toLowerCase();
var count = filetypes[ext];
if (count) filetypes[ext]++; else filetypes[ext] = 1;
}
}
}
// drop every extension the query already filters on, so it is not offered again
for (var key in filetypes) {
if (query.indexOf("filetype:" + key) >= 0) delete filetypes[key];
}
// show statistics
var html = "<span id=\"resCounter\" style=\"display: inline;\">";
if (firstChannel.items.length > 0) {
// the empty "downloadbutton" container is filled by hideDownloadScript() at the end
html += "<form><div style=\"float:left\">" + firstChannel.items.length + " results from a total of " + totalResults + " docs in index; search time: " + time + " milliseconds. </div>";
html += "<div id=\"downloadbutton\" style=\"float:left\"></div></form>";
} else {
if (query == "") {
html += "please enter some search words";
} else {
html += "no results";
}
}
html += "<br>";
// add extension navigation
// NOTE(review): `query` and the extension key are concatenated into the href without
// URL-encoding or HTML-escaping - verify that callers pass a safe value
var extnav = "";
for (var key in filetypes) {
if (filetypes[key] > 0) { extnav += "<a style=\"text-decoration:underline\" href=\"/yacyinteractive.html?query=" + query + "+filetype:"+ key + "\">" + key + "</a>(" + filetypes[key] + ")&nbsp;&nbsp;";}
}
if (extnav.length > 0) {
html += "apply a <b>filter</b> by filetype:&nbsp;&nbsp;&nbsp;&nbsp;" + extnav;
}
// add topic navigation
// NOTE(review): the guard tests topics.length while the loop iterates topics.elements -
// verify the shape navget() returns
if (topics && topics.length > 0) {
var topwords = "";
for (var i = 0; i < topics.elements.length; i++) {
topwords += "<a href=\"yacyinteractive.html?query=" + firstChannel.searchTerms + "+" + topics.elements[i].name + "\">" + topics.elements[i].name + "</a> ";
topwords += "<a href=\"/yacyinteractive.html?query=" + query + "+" + topics.elements[i].name + "\">" + topics.elements[i].name + "</a> ";
// the check runs after appending, so the loop stops once index 11 has been added
if (i > 10) break;
}
html += "&nbsp;&nbsp;&nbsp;topwords: " + topwords;
}
html += "</span><br>";
// the "downloadscript" container is emitted regardless of the number of results
html += "<br><div id=\"downloadscript\"></div></span><br>";
if (totalResults > 0) {
// display result
if (firstChannel.items.length > 0) {
var item;
html += "<table class=\"sortable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
html += "<table class=\"sortable\" id=\"sortable\" border=\"0\" cellpadding=\"2\" cellspacing=\"1\" width=\"99%\">";
html += "<tr class=\"TableHeader\" valign=\"bottom\">";
html += "<td>Name</td>";
html += "<td width=\"60\">Size</td>";
html += "<td width=\"40\">Protocol</td>";
html += "<td width=\"60\">Host</td>";
html += "<td width=\"60\">Path</td>";
html += "<td width=\"60\">Name</td>";
html += "<td width=\"50\">Size</td>";
//html += "<td>Description</td>";
html += "<td width=\"180\">Date</td></tr>";
html += "<td width=\"50\">Date</td></tr>";
for (var i = 0; i < firstChannel.items.length; i++) {
item = firstChannel.items[i];
html += "<tr class=\"TableCellLight\"><td align=\"left\"><a href=\"" + item.link + "\">" + item.title + "</a></td>";
// split the link at the first "//" into protocol (text before the ':'), host and path;
// without a "//" the whole link is used as path and protocol/host stay empty
// NOTE(review): if no '/' follows the host, q is -1 and the substring calls below do
// not yield a clean host/path split - verify with such links
p = item.link.indexOf("//");
protocol = "";
host = "";
path = item.link;
if (p > 0) {
q = item.link.indexOf("/", p + 2);
protocol = item.link.substring(0, p - 1);
host = item.link.substring(p + 2, q);
path = item.link.substring(q + 1);
}
html += "<tr class=\"TableCellLight\">";
html += "<td align=\"left\">" + protocol + "</td>";
html += "<td align=\"left\"><a href=\"" + protocol + "://" + host + "/" + "\">" + host + "</a></td>";
html += "<td align=\"left\"><a href=\"" + item.link + "\">" + path + "</a></td>";
// fall back to the path when the result carries an empty title
title = item.title;
if (title == "") title = path;
html += "<td align=\"left\"><a href=\"" + item.link + "\">" + title + "</a></td>";
html += "<td align=\"right\">" + item.sizename + "</td>";
//html += "<td>" + item.description + "</td>";
html += "<td align=\"right\">" + item.pubDate + "</td></tr>";
// shorten the date for display: strip a trailing " +0000", then a trailing " 00:00:00",
// then a trailing " 2010" (the year is hard-coded)
pd = item.pubDate;
if (pd.substring(pd.length - 6) == " +0000") pd = pd.substring(0, pd.length - 6);
if (pd.substring(pd.length - 9) == " 00:00:00") pd = pd.substring(0, pd.length - 9);
if (pd.substring(pd.length - 5) == " 2010") pd = pd.substring(0, pd.length - 5);
html += "<td align=\"right\">" + pd + "</td>";
html += "</tr>";
}
html += "</table>";
}
document.getElementById("searchresults").innerHTML = html;
// resets the download area to its initial "create a download script" state
hideDownloadScript();
}

@ -35,12 +35,16 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
<div class="yacylogo">
<a href="#[promoteSearchPageGreeting.homepage]#" class="yacylogo"><img src="#[promoteSearchPageGreeting.smallImage]#" alt="yacysearch"/></a>
</div>
<fieldset class="yacys"><input name="query" type="text" value="#[query]#" size="50" maxlength="80" /></fieldset>
<fieldset class="yacys"><input id="query" name="query" type="text" value="#[query]#" size="50" maxlength="80" /></fieldset>
<!--<pre>Raw JSON String: <div id="raw"></div></pre>-->
</form>
<div id="searchresults"></div>
<script type="text/javascript">
//<![CDATA[
document.getElementById("query").focus();
//]]>
</script>
#%env/templates/footer.template%#
</body>
</html>

@ -73,7 +73,7 @@ public class yacysearchtrailer {
while (i < 10 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = namespaceNavigator.get(name);
prop.put("nav-namespace_element_" + i + "_name", name);
prop.putJSON("nav-namespace_element_" + i + "_name", name);
prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators));
prop.put("nav-namespace_element_" + i + "_count", count);
@ -97,7 +97,7 @@ public class yacysearchtrailer {
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = hostNavigator.get(name);
prop.put("nav-domains_element_" + i + "_name", name);
prop.putJSON("nav-domains_element_" + i + "_name", name);
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "site:" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + "site:" + name, theQuery.urlMask.toString(), theQuery.navigators));
prop.put("nav-domains_element_" + i + "_count", count);
@ -120,10 +120,10 @@ public class yacysearchtrailer {
int i = 0;
String anav;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
name = navigatorIterator.next().trim();
count = authorNavigator.get(name);
anav = (name.indexOf(' ') < 0) ? "author:" + name : "author:'" + name.replace(" ", "+") + "'";
prop.put("nav-authors_element_" + i + "_name", name);
prop.putJSON("nav-authors_element_" + i + "_name", name);
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators));
prop.put("nav-authors_element_" + i + "_count", count);
@ -149,7 +149,7 @@ public class yacysearchtrailer {
count = topicNavigator.get(name);
if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
if (name != null) {
prop.putHTML("nav-topics_element_" + i + "_name", name);
prop.putJSON("nav-topics_element_" + i + "_name", name);
prop.put("nav-topics_element_" + i + "_url",
"<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">" + name + "</a>");
//+"<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+-" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">-</a>")*/;

@ -95,7 +95,6 @@ public class FTPLoader {
// create new ftp client
final FTPClient ftpClient = new FTPClient();
ftpClient.setDataTimeoutByMaxFilesize(maxFileSize);
// get a connection
if (openConnection(ftpClient, entryUrl)) {
@ -250,9 +249,6 @@ public class FTPLoader {
url.toNormalform(true, true).getBytes());
return response;
}
// timeout for download
ftpClient.setDataTimeoutByMaxFilesize(size);
// download the remote file
byte[] b = ftpClient.get(path);

@ -86,7 +86,7 @@ public class FTPClient {
private Socket ControlSocket = null;
// socket timeout
private static final int ControlSocketTimeout = 1000;
private static final int ControlSocketTimeout = 10000;
// data socket timeout
private int DataSocketTimeout = 0; // in seconds (default infinite)
@ -2450,22 +2450,6 @@ public class FTPClient {
return ControlSocketTimeout;
}
/**
 * Sets the timeout for data connections, calculated from a maximum file size
 * and the minimum data rate <code>DataSocketRate</code>.
 *
 * The computed value is <code>maxFilesize / DataSocketRate</code> and is handed
 * to {@link #setDataSocketTimeout(int)}. If no positive rate is configured the
 * timeout is 1. The result is never smaller than 1: integer division would
 * otherwise yield 0 for sizes below the rate, and the comment on the
 * DataSocketTimeout field marks 0 as "infinite", which would turn the smallest
 * files into the ones that are allowed to hang forever.
 *
 * @param maxFilesize largest file size expected for the transfer, in the same
 *                    unit per second that DataSocketRate uses
 */
public void setDataTimeoutByMaxFilesize(final int maxFilesize) {
    int timeout = 1;
    if (DataSocketRate > 0) {
        // calculate by minDataRate and MaxFTPFileSize, but never fall to 0 (= no timeout)
        timeout = Math.max(1, maxFilesize / DataSocketRate);
    }
    setDataSocketTimeout(timeout);
}
/**
* after this time the data connection is closed
*

@ -132,7 +132,7 @@ public final class Condenser {
Map.Entry<MultiProtocolURI, String> entry;
if (indexText) {
createCondensement(document.getText(), meaningLib);
createCondensement(document.getText(), meaningLib);
// the phrase counter:
// phrase 0 are words taken from the URL
// phrase 1 is the MainTitle

@ -232,7 +232,7 @@ dc_rights
public InputStream getText() {
try {
if (this.text == null) return null;
if (this.text == null) return new ByteArrayInputStream("".getBytes());
if (this.text instanceof File) {
this.textStream = new BufferedInputStream(new FileInputStream((File)this.text));
@ -245,7 +245,7 @@ dc_rights
} catch (final Exception e) {
Log.logException(e);
}
return null;
return new ByteArrayInputStream("".getBytes());
}
public byte[] getTextBytes() {

@ -38,6 +38,7 @@ import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.parser.bzipParser;
import net.yacy.document.parser.csvParser;
import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.odtParser;
@ -64,6 +65,7 @@ public final class TextParser {
private static final Log log = new Log("PARSER");
private static final Object v = new Object();
private static final Parser genericIdiom = new genericParser();
private static final Map<String, Parser> mime2parser = new ConcurrentHashMap<String, Parser>();
private static final Map<String, Parser> ext2parser = new ConcurrentHashMap<String, Parser>();
private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
@ -196,11 +198,13 @@ public final class TextParser {
// in case that we know more parsers we first transform the content into a byte[] and use that as base
// for a number of different parse attempts.
byte[] b = null;
try {
return parseSource(location, mimeType, idioms, charset, FileUtils.read(sourceStream, (int) contentLength));
b = FileUtils.read(sourceStream, (int) contentLength);
} catch (IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
return parseSource(location, mimeType, idioms, charset, b);
}
private static Document[] parseSource(
@ -325,8 +329,9 @@ public final class TextParser {
idiom = mime2parser.get(mimeType2);
if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
// finall check if we found any parser
if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url);
// always add the generic parser
idioms.add(genericIdiom);
//if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url);
return idioms;
}

@ -0,0 +1,60 @@
/**
* genericParser
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 30.11.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser;
import java.io.InputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
/**
 * Fallback parser that can handle any document because it never looks at the
 * content: the resulting document is described by uri/file/path information only.
 */
public class genericParser extends AbstractParser implements Parser {

    /**
     * Passes the name "Generic Parser" to {@link AbstractParser}.
     * Neither SUPPORTED_EXTENSIONS nor SUPPORTED_MIME_TYPES are declared here:
     * this parser is used if no other fits, and it fits all.
     */
    public genericParser() {
        super("Generic Parser");
    }

    /**
     * Creates exactly one {@link Document} for the given location without
     * reading <code>source1</code>.
     *
     * @param location the document's address; supplies the title (file name) and the host value
     * @param mimeType passed through to the Document unchanged
     * @param charset  passed through to the Document unchanged
     * @param source1  the content stream; not used by this parser
     * @return an array holding the single Document
     */
    public Document[] parse(MultiProtocolURI location, String mimeType, String charset, InputStream source1) throws Parser.Failure, InterruptedException {
        final String title = location.getFileName();
        final String author = "";
        final Document onlyDocument = new Document(
                location,
                mimeType,
                charset,
                null,
                null,
                title,
                author,
                location.getHost(),
                null,
                null,
                "",
                null,
                null,
                null,
                false);
        return new Document[]{onlyDocument};
    }
}

@ -185,7 +185,7 @@ public class genericImageParser extends AbstractParser implements Parser {
String infoString = ii.info.toString();
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
if (title == null) title = location.toNormalform(true, true);
if (title == null || title.length() == 0) title = location.getFileName();
return new Document[]{new Document(
location,

@ -118,6 +118,7 @@ public class pdfParser extends AbstractParser implements Parser {
// info.getModificationDate();
}
if (docTitle == null || docTitle.length() == 0) docTitle = location.getFileName();
CharBuffer writer = null;
try {
// create a writer for output

@ -67,7 +67,7 @@ public class torrentParser extends AbstractParser implements Parser {
//Date creation = new Date(map.get("creation date").getInteger());
BObject infoo = map.get("info");
StringBuilder filenames = new StringBuilder();
String name = "";
String title = "";
if (infoo != null) {
Map<String, BObject> info = infoo.getMap();
BObject fileso = info.get("files");
@ -82,8 +82,9 @@ public class torrentParser extends AbstractParser implements Parser {
}
}
BObject nameo = info.get("name");
if (nameo != null) name = new String(nameo.getString());
if (nameo != null) title = new String(nameo.getString());
}
if (title == null || title.length() == 0) title = location.getFileName();
try {
return new Document[]{new Document(
location,
@ -91,7 +92,7 @@ public class torrentParser extends AbstractParser implements Parser {
charset,
null,
null,
name, // title
title, // title
comment, // author
location.getHost(),
null,

Loading…
Cancel
Save