Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

Conflicts:
	source/net/yacy/document/parser/pdfParser.java
Michael Peter Christen 13 years ago
commit 1f4f60654a

@@ -35,7 +35,7 @@
<classpathentry kind="lib" path="lib/json-simple-1.1.jar"/>
<classpathentry kind="lib" path="lib/fontbox-1.6.0.jar"/>
<classpathentry kind="lib" path="lib/jempbox-1.6.0.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.6.0.jar"/>
<classpathentry kind="lib" path="lib/pdfbox-1.6.0.jar" sourcepath="/Users/admin/.m2/repository/org/apache/pdfbox/pdfbox/1.6.0/pdfbox-1.6.0-sources.jar"/>
<classpathentry kind="lib" path="lib/commons-io-2.0.1.jar"/>
<classpathentry kind="lib" path="lib/xercesImpl.jar"/>
<classpathentry kind="lib" path="lib/xml-apis.jar"/>

@@ -273,7 +273,7 @@ minimumGlobalDelta = 500
# the following mime-types are a blacklist for indexing:
# parser.mime.deny: specifies mime-types that shall not be indexed
parser.mime.deny=
parser.extensions.deny=pdf
parser.extensions.deny=
# Promotion Strings
# These strings appear in the Web Mask of the YACY search client

@@ -42,6 +42,7 @@ import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -312,9 +313,9 @@ public class Crawler_p {
sb.crawlQueues.errorURL.remove(urlhash);
// get a scraper to get the title
final ContentScraper scraper = sb.loader.parseResource(url, CacheStrategy.IFFRESH);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.getTitle();
final String description = scraper.getDescription();
final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH);
final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title();
final String description = scraper.dc_description();
// stack url
sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it
@@ -357,7 +358,7 @@ public class Crawler_p {
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
final String[] keywords = scraper.getKeywords();
final String[] keywords = scraper.dc_subject();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);
@@ -534,8 +535,7 @@ public class Crawler_p {
try {
final DigestURI sitelistURL = new DigestURI(crawlingStart);
// download document
ContentScraper scraper = null;
scraper = sb.loader.parseResource(sitelistURL, CacheStrategy.IFFRESH);
Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH);
// String title = scraper.getTitle();
// String description = scraper.getDescription();

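The Crawler_p hunks above replace the HTML-only ContentScraper path (parseResource, getTitle, getDescription, getKeywords) with the mime-independent Document API (loadDocument, dc_title, dc_description, dc_subject). A minimal sketch of the call-site migration, assuming only the signatures visible in this diff; the helper name is hypothetical:

    import java.io.IOException;
    import net.yacy.cora.services.federated.yacy.CacheStrategy;
    import net.yacy.document.Document;
    import net.yacy.kelondro.data.meta.DigestURI;
    import net.yacy.repository.LoaderDispatcher;

    // 'loader' stands in for sb.loader as used in the hunks above
    static String titleOf(final LoaderDispatcher loader, final DigestURI url) throws IOException {
        // before: ContentScraper scraper = loader.parseResource(url, CacheStrategy.IFFRESH);  // HTML only
        // after:  one code path for every parseable mime type
        final Document doc = loader.loadDocument(url, CacheStrategy.IFFRESH);
        // guard before touching metadata, as the title line in Crawler_p does
        return doc == null ? url.toNormalform(true, true) : doc.dc_title();
    }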
@@ -1,3 +1,28 @@
// getpageinfo_p
// (C) 2011 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 11.11.2011 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.IOException;
import java.net.MalformedURLException;
@@ -10,7 +35,6 @@ import javax.xml.parsers.ParserConfigurationException;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
@@ -68,9 +92,9 @@ public class getpageinfo_p {
} catch (final MalformedURLException e) {
Log.logException(e);
}
ContentScraper scraper = null;
net.yacy.document.Document scraper = null;
if (u != null) try {
scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
scraper = sb.loader.loadDocument(u, CacheStrategy.IFEXIST);
} catch (final IOException e) {
Log.logException(e);
// bad things are possible, i.e. that the Server responds with "403 Bad Behavior"
@@ -78,13 +102,13 @@ public class getpageinfo_p {
}
if (scraper != null) {
// put the document title
prop.putXML("title", scraper.getTitle());
prop.putXML("title", scraper.dc_title());
// put the favicon that belongs to the document
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
// put keywords
final String list[] = scraper.getKeywords();
final String list[] = scraper.dc_subject();
int count = 0;
for (final String element: list) {
final String tag = element;
@@ -95,7 +119,7 @@ public class getpageinfo_p {
}
prop.put("tags", count);
// put description
prop.putXML("desc", scraper.getDescription());
prop.putXML("desc", scraper.dc_description());
// put language
final Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());

@@ -441,7 +441,7 @@ form.search.small h2 {
margin-bottom:5px;
}
li.menugroup h3{
li.menugroup h3 {
font-size: 1em; font-weight: bold;
margin: 0; padding: 1px 10px;
}
@@ -631,6 +631,31 @@ dd.hint {
padding-bottom: 10px;
}
dl.bplike
{
float: left;
margin: 0 0;
width: 184px;
padding: 0;
}
.bplike dt
{
clear: left;
float: left;
width: 20px;
margin: 0;
padding: 0px;
}
.bplike dd
{
float: left;
width: 155px;
margin: 0px;
padding: 0px;
}
/*----------
<form>
*/

@@ -28,10 +28,10 @@
case 9:
case 33:
window.location.href = document.getElementById("nextpage").href;
break;
break;
case 34:
window.location.href = document.getElementById("prevpage").href;
break;
break;
case 40:
}
}
@@ -48,13 +48,13 @@
function opensearch(data) {
var parsed = [];
data = eval('({"suggest":' + data + '})');
for (var i = 0; i < data.suggest[1].length; i++) {
for (var i = 0; i < data.suggest[1].length; i++) {
var row = data.suggest[1][i];
if (row) {
parsed[parsed.length] = {
data: [row],
value: row,
result: row
if (row) {
parsed[parsed.length] = {
data: [row],
value: row,
result: row
};
};
};

@@ -533,16 +533,16 @@ public class yacysearch {
String authorhash = null;
if ( authori >= 0 ) {
// check if the author was given with single quotes or without
final boolean quotes = (querystring.charAt(authori + 7) == (char) 39);
final boolean quotes = (querystring.charAt(authori + 7) == '%');
String author;
if ( quotes ) {
int ftb = querystring.indexOf((char) 39, authori + 8);
int ftb = querystring.indexOf('%', authori + 8);
if ( ftb == -1 ) {
ftb = querystring.length() + 1;
}
author = querystring.substring(authori + 8, ftb);
querystring = querystring.replace("author:'" + author + "'", "");
modifier.append("author:'").append(author).append("' ");
querystring = querystring.replace("author:%" + author + "%", "");
modifier.append("author:%").append(author).append("% ");
} else {
int ftb = querystring.indexOf(' ', authori);
if ( ftb == -1 ) {

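The yacysearch hunk above changes the delimiter for quoted author names from a single quote (char 39) to a percent sign, so a multi-word author is now written author:%Jane+Doe%. A hedged sketch of the resulting extraction rule as a standalone method (method name hypothetical; the servlet itself additionally rewrites querystring and appends to the modifier buffer):

    /** Minimal sketch of the '%'-quoted author: modifier introduced above. */
    static String parseAuthorModifier(final String querystring) {
        final int authori = querystring.indexOf("author:");
        if (authori < 0 || authori + 7 >= querystring.length()) return null;
        if (querystring.charAt(authori + 7) == '%') {
            // quoted form: author:%name%, spaces encoded as '+'
            int ftb = querystring.indexOf('%', authori + 8);
            if (ftb == -1) ftb = querystring.length(); // unterminated quote: take the rest
            return querystring.substring(authori + 8, ftb);
        }
        // unquoted form: a single token terminated by the next space
        int ftb = querystring.indexOf(' ', authori);
        if (ftb == -1) ftb = querystring.length();
        return querystring.substring(authori + 7, ftb);
    }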
@@ -1,7 +1,7 @@
<div style="float:right;width:25%;">
#(cat-location)#::
<div style="float: right; margin-top:5px; width: 220px;">
<div style="float:right; margin-top:5px; width:220px;">
<a href="yacysearch_location.html?query=#[queryenc]#">
<img src="/env/grafics/earthsearch.png" width="215" height="159" alt="earthsearchlogo" /></a>
<a href="yacysearch_location.html?query=#[queryenc]#">Show search results for "#[query]#" on map</a>
@@ -11,62 +11,68 @@
#(nav-topics)#::
<div style="float: right; margin-top:5px; width: 220px; height: 80px">
<div><ul id="sidebarTopics" style="padding-left: 0px;">#{element}#
<li value="#[count]#">#[url]#</li>
<li value="#[count]#"><a href="#[url]#">#[name]#</a></li>
#{/element}#</ul></div>
</div>
#(/nav-topics)#
#(nav-protocols)#::
<div id="sidebarProtocols" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarProtocols" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Protocol Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(/nav-protocols)#
#(nav-filetypes)#::
<div id="sidebarFiletypes" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarFiletypes" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Filetype Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(/nav-filetypes)#
#(nav-domains)#::
<div id="sidebarDomains" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarDomains" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Domain Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(/nav-domains)#
#(nav-namespace)#::
<div id="sidebarNameSpace" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarNameSpace" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Name Space Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(/nav-namespace)#
#(nav-authors)#::
<div id="sidebarAuthors" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarAuthors" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Author Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(/nav-authors)#
#{nav-vocabulary}#
<div id="sidebar#[navname]#" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebar#[navname]#" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">#[navname]# Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#{/nav-vocabulary}#

@@ -75,9 +75,9 @@ public class yacysearchtrailer {
while (i < 10 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = namespaceNavigator.get(name);
prop.put("nav-namespace_element_" + i + "_on", 1);
prop.put(fileType, "nav-namespace_element_" + i + "_name", name);
prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-namespace_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-namespace_element_" + i + "_count", count);
prop.put(fileType, "nav-namespace_element_" + i + "_modifier", "inurl:" + name);
prop.put("nav-namespace_element_" + i + "_nl", 1);
@@ -101,9 +101,9 @@ public class yacysearchtrailer {
name = navigatorIterator.next();
count = hostNavigator.get(name);
dnav = "site:" + name;
prop.put("nav-domains_element_" + i + "_on", 1);
prop.put(fileType, "nav-domains_element_" + i + "_name", name);
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + dnav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + dnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-domains_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, theQuery.queryStringForUrl() + "+" + dnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-domains_element_" + i + "_count", count);
prop.put(fileType, "nav-domains_element_" + i + "_modifier", dnav);
prop.put("nav-domains_element_" + i + "_nl", 1);
@@ -126,10 +126,10 @@ public class yacysearchtrailer {
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
count = authorNavigator.get(name);
anav = (name.indexOf(' ',0) < 0) ? "author:" + name : "author:'" + name.replace(" ", "+") + "'";
anav = (name.indexOf(' ', 0) < 0) ? "author:" + name : "author:%" + name.replace(" ", "+") + "%";
prop.put("nav-authors_element_" + i + "_on", 1);
prop.put(fileType, "nav-authors_element_" + i + "_name", name);
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-authors_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-authors_element_" + i + "_count", count);
prop.put(fileType, "nav-authors_element_" + i + "_modifier", anav);
prop.put("nav-authors_element_" + i + "_nl", 1);
@@ -153,11 +153,9 @@ public class yacysearchtrailer {
count = topicNavigator.get(name);
if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
if (name != null) {
prop.put("nav-topics_element_" + i + "_on", 1);
prop.put(fileType, "nav-topics_element_" + i + "_name", name);
prop.put("nav-topics_element_" + i + "_url",
"<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + name, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + "</a>");
//+"<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+-" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">-</a>")*/;
prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + name, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-topics_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, theQuery.queryStringForUrl() + "+" + name, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-topics_element_" + i + "_count", count);
prop.put(fileType, "nav-topics_element_" + i + "_modifier", name);
prop.put("nav-topics_element_" + i + "_nl", 1);
@@ -182,9 +180,9 @@ public class yacysearchtrailer {
name = navigatorIterator.next().trim();
count = protocolNavigator.get(name);
pnav = "/" + name;
prop.put("nav-protocols_element_" + i + "_on", 1);
prop.put(fileType, "nav-protocols_element_" + i + "_name", name);
prop.put("nav-protocols_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + pnav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-protocols_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + pnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-protocols_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, theQuery.queryStringForUrl() + "+" + pnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-protocols_element_" + i + "_count", count);
prop.put(fileType, "nav-protocols_element_" + i + "_modifier", pnav);
prop.put("nav-protocols_element_" + i + "_nl", 1);
@@ -208,9 +206,9 @@ public class yacysearchtrailer {
name = navigatorIterator.next().trim();
count = filetypeNavigator.get(name);
tnav = "filetype:" + name;
prop.put("nav-filetypes_element_" + i + "_on", 1);
prop.put(fileType, "nav-filetypes_element_" + i + "_name", name);
prop.put("nav-filetypes_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + tnav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-filetypes_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + tnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-filetypes_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, theQuery.queryStringForUrl() + "+" + tnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-filetypes_element_" + i + "_count", count);
prop.put(fileType, "nav-filetypes_element_" + i + "_modifier", tnav);
prop.put("nav-filetypes_element_" + i + "_nl", 1);
@@ -238,9 +236,9 @@ public class yacysearchtrailer {
name = navigatorIterator.next();
count = ve.getValue().get(name);
anav = "/vocabulary/" + navname + "/" + Autotagging.encodePrintname(name);
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_on", 1);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_name", name);
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-vocabulary_" + navvoccount + "_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_count", count);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", anav);
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 1);

@@ -8,7 +8,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-filetypes)##(nav-protocols)#::
@@ -21,7 +21,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-protocols)##(nav-domains)#::
@@ -34,7 +34,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-domains)##(nav-namespace)#::
@@ -47,7 +47,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-namespace)##(nav-authors)#::
@@ -60,7 +60,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-authors)##{nav-vocabulary}#
@@ -73,7 +73,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#{/nav-vocabulary}##(nav-topics)#::
@@ -86,7 +86,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
}#(/nav-topics)#

@@ -29,6 +29,8 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -293,6 +295,72 @@ public class Balancer {
}
}
/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to the size of the domain stack
*/
public Map<String, Integer> getDomainStackHosts() {
Map<String, Integer> map = new HashMap<String, Integer>();
for (Map.Entry<String, HandleSet> entry: this.domainStacks.entrySet()) {
map.put(entry.getKey(), entry.getValue().size());
}
return map;
}
/**
* compute the current sleep time for a given crawl entry
* @param cs
* @param crawlEntry
* @return
*/
public long getDomainSleepTime(final CrawlSwitchboard cs, Request crawlEntry) {
final CrawlProfile profileEntry = cs.getActive(UTF8.getBytes(crawlEntry.profileHandle()));
return getDomainSleepTime(cs, profileEntry, crawlEntry);
}
private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, Request crawlEntry) {
if (profileEntry == null) {
return 0;
}
long sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
return sleeptime;
}
/**
* get lists of crawl request entries for a specific host
* @param host
* @param maxcount
* @return a list of crawl loader requests
*/
public List<Request> getDomainStackReferences(String host, int maxcount) {
HandleSet domainList = this.domainStacks.get(host);
if (domainList == null || domainList.isEmpty()) return new ArrayList<Request>(0);
ArrayList<Request> cel = new ArrayList<Request>(maxcount);
for (int i = 0; i < maxcount; i++) {
if (domainList.size() <= i) break;
final byte[] urlhash = domainList.getOne(i);
if (urlhash == null) continue;
Row.Entry rowEntry;
try {
rowEntry = this.urlFileIndex.get(urlhash, true);
} catch (IOException e) {
continue;
}
if (rowEntry == null) continue;
Request crawlEntry;
try {
crawlEntry = new Request(rowEntry);
} catch (IOException e) {
continue;
}
cel.add(crawlEntry);
}
return cel;
}
private void pushHashToDomainStacks(String host, final byte[] urlhash) throws RowSpaceExceededException {
// extend domain stack
if (host == null) host = localhost;
@@ -417,11 +485,8 @@ public class Balancer {
return null;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry);
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());

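The new Balancer accessors expose the per-host domain stacks for monitoring. A usage sketch under the signatures added above ('balancer' is an existing instance; the output format is illustrative only):

    import java.util.List;
    import java.util.Map;
    import de.anomic.crawler.retrieval.Request;

    // list every queued host and peek at up to three of its pending crawl requests
    static void dumpDomainStacks(final Balancer balancer) {
        final Map<String, Integer> hosts = balancer.getDomainStackHosts();
        for (final Map.Entry<String, Integer> host : hosts.entrySet()) {
            System.out.println(host.getKey() + ": " + host.getValue() + " urls queued");
            for (final Request r : balancer.getDomainStackReferences(host.getKey(), 3)) {
                System.out.println("  " + r.url());
            }
        }
    }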
@@ -32,6 +32,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.kelondro.index.HandleSet;
@@ -228,6 +229,50 @@ public class NoticedURL {
return removed;
}
/**
* get a list of domains that are currently maintained as domain stacks
* @return a map of clear text strings of host names to the size of the domain stacks
*/
public Map<String, Integer> getDomainStackHosts(final StackType stackType) {
switch (stackType) {
case CORE: return this.coreStack.getDomainStackHosts();
case LIMIT: return this.limitStack.getDomainStackHosts();
case REMOTE: return this.remoteStack.getDomainStackHosts();
case NOLOAD: return this.noloadStack.getDomainStackHosts();
default: return null;
}
}
/**
* get a list of domains that are currently maintained as domain stacks
* @return a collection of clear text strings of host names
*/
public long getDomainSleepTime(final StackType stackType, final CrawlSwitchboard cs, Request crawlEntry) {
switch (stackType) {
case CORE: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
case LIMIT: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
case REMOTE: return this.remoteStack.getDomainSleepTime(cs, crawlEntry);
case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, crawlEntry);
default: return 0;
}
}
/**
* get lists of crawl request entries for a specific host
* @param host
* @param maxcount
* @return a list of crawl loader requests
*/
public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
switch (stackType) {
case CORE: return this.coreStack.getDomainStackReferences(host, maxcount);
case LIMIT: return this.limitStack.getDomainStackReferences(host, maxcount);
case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
default: return null;
}
}
public List<Request> top(final StackType stackType, final int count) {
switch (stackType) {
case CORE: return top(this.coreStack, count);
@@ -295,7 +340,7 @@ public class NoticedURL {
return null;
}
private List<Request> top(final Balancer balancer, int count) {
private static List<Request> top(final Balancer balancer, int count) {
// this is a filo - top
if (count > balancer.size()) count = balancer.size();
return balancer.top(count);

@@ -34,7 +34,6 @@ import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
@@ -126,32 +125,32 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null || docTitle.length() == 0) {
docTitle = MultiProtocolURI.unescape(location.getFileName());
}
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = UTF8.getBytes("");
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
try {
// create a writer for output
final PDFTextStripper stripper = new PDFTextStripper();
final PDFTextStripper stripper = new PDFTextStripper();
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
contentBytes = UTF8.getBytes(writer.toString()); // remember text in case of interrupting thread
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final Thread t = new Thread() {
@Override
public void run() {
try {
writer.append(stripper.getText(pdfDoc));
try {
writer.append(stripper.getText(pdfDoc));
} catch (final Throwable e) {}
}
};
t.start();
t.join(3000);
if (t.isAlive()) t.interrupt();
pdfDoc.close();
contentBytes = UTF8.getBytes(writer.toString()); // get final text before closing writer
if (t.isAlive()) t.interrupt();
pdfDoc.close();
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close();
} catch (final IOException e) {
// close the writer
@@ -176,8 +175,7 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null) {
docTitle = docSubject;
}
// clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351

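The restructured pdfParser keeps its watchdog pattern: the open-ended part of the text extraction runs in a worker thread, the caller joins with a deadline and interrupts on overrun, and whatever text was appended before the deadline is still indexed. The idiom reduced to a generic sketch (plain Java, all names hypothetical; a StringBuffer is used because it is safe to append to from the worker thread):

    /** Run 'extraction' for at most 'timeoutMillis'; keep whatever it wrote into 'sink'. */
    static String boundedExtract(final Runnable extraction, final StringBuffer sink, final long timeoutMillis)
            throws InterruptedException {
        final Thread worker = new Thread(extraction);
        worker.start();
        worker.join(timeoutMillis);               // wait, but never longer than the deadline
        if (worker.isAlive()) worker.interrupt(); // abandon a hanging extraction
        return sink.toString();                   // partial output is still usable
    }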
@@ -44,6 +44,8 @@ import net.yacy.kelondro.logging.Log;
public final class ReferenceContainerArray<ReferenceType extends Reference> {
private final static long METHOD_MAXRUNTIME = 5000L;
protected final ReferenceFactory<ReferenceType> factory;
protected final ArrayStack array;
@@ -295,21 +297,21 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
* @throws RowSpaceExceededException
*/
public ReferenceContainer<ReferenceType> get(final byte[] termHash) throws IOException, RowSpaceExceededException {
final long timeout = System.currentTimeMillis() + 3000;
final long timeout = System.currentTimeMillis() + METHOD_MAXRUNTIME;
final Iterator<byte[]> entries = this.array.getAll(termHash).iterator();
if (entries == null || !entries.hasNext()) return null;
final byte[] a = entries.next();
int k = 1;
ReferenceContainer<ReferenceType> c = new ReferenceContainer<ReferenceType>(this.factory, termHash, RowSet.importRowSet(a, this.factory.getRow()));
if (System.currentTimeMillis() > timeout) {
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (1): " + k + " tables searched. timeout = 3000");
Log.logWarning("ReferenceContainerArray", "timout in get() (1): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
return c;
}
while (entries.hasNext()) {
c = c.merge(new ReferenceContainer<ReferenceType>(this.factory, termHash, RowSet.importRowSet(entries.next(), this.factory.getRow())));
k++;
if (System.currentTimeMillis() > timeout) {
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (2): " + k + " tables searched. timeout = 3000");
Log.logWarning("ReferenceContainerArray", "timout in get() (2): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
return c;
}
}
@@ -317,7 +319,7 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
}
public int count(final byte[] termHash) throws IOException {
final long timeout = System.currentTimeMillis() + 3000;
final long timeout = System.currentTimeMillis() + METHOD_MAXRUNTIME;
final Iterator<Long> entries = this.array.lengthAll(termHash).iterator();
if (entries == null || !entries.hasNext()) return 0;
final Long a = entries.next();
@@ -325,7 +327,7 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
int c = RowSet.importRowCount(a, this.factory.getRow());
assert c >= 0;
if (System.currentTimeMillis() > timeout) {
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (1): " + k + " tables searched. timeout = 3000");
Log.logWarning("ReferenceContainerArray", "timout in count() (1): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
return c;
}
while (entries.hasNext()) {
@@ -333,7 +335,7 @@ public final class ReferenceContainerArray<ReferenceType extends Reference> {
assert c >= 0;
k++;
if (System.currentTimeMillis() > timeout) {
Log.logWarning("ReferenceContainerArray", "timout in index retrieval (2): " + k + " tables searched. timeout = 3000");
Log.logWarning("ReferenceContainerArray", "timout in count() (2): " + k + " tables searched. timeout = " + METHOD_MAXRUNTIME);
return c;
}
}

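Both get() and count() above now share a single METHOD_MAXRUNTIME constant instead of repeating the literal 3000, and the timeout warnings name the method that gave up. The shape of that deadline guard, reduced to a sketch (names hypothetical):

    private static final long METHOD_MAXRUNTIME = 5000L;

    // accumulate across the BLOB tables, but hand back a partial result at the deadline
    static int countWithDeadline(final java.util.Iterator<Integer> lengths) {
        final long timeout = System.currentTimeMillis() + METHOD_MAXRUNTIME;
        int c = 0, k = 0;
        while (lengths.hasNext()) {
            c += lengths.next();
            k++;
            if (System.currentTimeMillis() > timeout) {
                // log how many tables (k) were searched, then return what we have
                break;
            }
        }
        return c;
    }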
@@ -30,6 +30,7 @@ import java.io.File;
import java.io.IOException;
import java.util.Map;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.kelondro.blob.HeapReader;
import net.yacy.kelondro.index.RowSet;
@@ -55,6 +56,7 @@ public class ReferenceIterator <ReferenceType extends Reference> extends LookAhe
* return an index container
* because they may get very large, it is wise to deallocate some memory before calling next()
*/
@Override
public ReferenceContainer<ReferenceType> next0() {
if (this.blobs == null) return null;
RowSet row;
@@ -65,15 +67,15 @@
try {
row = RowSet.importRowSet(entry.getValue(), this.factory.getRow());
if (row == null) {
Log.logSevere("ReferenceIterator", "lost entry '" + entry.getKey() + "' because importRowSet returned null");
Log.logSevere("ReferenceIterator", "lost entry '" + UTF8.String(entry.getKey()) + "' because importRowSet returned null");
continue; // thats a fail but not as REALLY bad if the whole method would crash here
}
return new ReferenceContainer<ReferenceType>(this.factory, entry.getKey(), row);
} catch (final RowSpaceExceededException e) {
Log.logSevere("ReferenceIterator", "lost entry '" + entry.getKey() + "' because of too low memory: " + e.toString());
Log.logSevere("ReferenceIterator", "lost entry '" + UTF8.String(entry.getKey()) + "' because of too low memory: " + e.toString());
continue;
} catch (final Throwable e) {
Log.logSevere("ReferenceIterator", "lost entry '" + entry.getKey() + "' because of too error: " + e.toString());
Log.logSevere("ReferenceIterator", "lost entry '" + UTF8.String(entry.getKey()) + "' because of error: " + e.toString());
continue;
}
}
@@ -86,6 +88,7 @@ public class ReferenceIterator <ReferenceType extends Reference> extends LookAhe
this.blobs = null;
}
@Override
public CloneableIterator<ReferenceContainer<ReferenceType>> clone(final Object modifier) {
if (this.blobs != null) this.blobs.close();
this.blobs = null;

@@ -52,7 +52,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.document.Document;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
@@ -234,11 +234,11 @@ public final class yacyRelease extends yacyVersion {
// this is done by contacting a release location,
// parsing the content and filtering+parsing links
// returns the version info if successful, null otherwise
ContentScraper scraper;
Document scraper;
try {
final DigestURI uri = location.getLocationURL();
Thread.currentThread().setName("allReleaseFrom - host " + uri.getHost()); // makes it more easy to see which release blocks process in thread dump
scraper = Switchboard.getSwitchboard().loader.parseResource(uri, CacheStrategy.NOCACHE);
scraper = Switchboard.getSwitchboard().loader.loadDocument(uri, CacheStrategy.NOCACHE);
} catch (final IOException e) {
return null;
}

@@ -26,7 +26,6 @@
package net.yacy.repository;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
@@ -50,21 +49,19 @@ import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segments;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.FileLoader;
import de.anomic.crawler.retrieval.HTTPLoader;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.retrieval.Response;
import de.anomic.crawler.retrieval.SMBLoader;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.http.client.Cache;
public final class LoaderDispatcher {
@@ -192,7 +189,7 @@ public final class LoaderDispatcher {
final String host = url.getHost();
// check if url is in blacklist
if (checkBlacklist && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
if (checkBlacklist && host != null && Switchboard.urlBlacklist.isListed(Blacklist.BLACKLIST_CRAWLER, host.toLowerCase(), url.getFile())) {
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
throw new IOException("DISPATCHER Rejecting URL '" + request.url().toString() + "'. URL is in blacklist.");
}
@@ -290,7 +287,7 @@ public final class LoaderDispatcher {
if (response.getContent() == null) {
throw new IOException("empty response (code " + response.getStatus() + ") for url " + url);
}
// we got something. Now check if we want to store that to the cache
// first check looks if we want to store the content to the cache
if (crawlProfile == null || !crawlProfile.storeHTCache()) {
@@ -352,16 +349,22 @@ public final class LoaderDispatcher {
return response.parse();
}
public ContentScraper parseResource(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
// load page
final Response r = this.load(request(location, true, false), cachePolicy, true);
final byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
public Document loadDocument(final DigestURI location, final CacheStrategy cachePolicy) throws IOException {
// load resource
Request request = request(location, true, false);
final Response response = this.load(request, cachePolicy, 10000, true);
final DigestURI url = request.url();
if (response == null) throw new IOException("no Response for url " + url);
// if it is still not available, report an error
if (response.getContent() == null || response.getResponseHeader() == null) throw new IOException("no Content available for url " + url);
// parse resource
try {
return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
Document[] documents = response.parse();
return Document.mergeDocuments(location, response.getMimeType(), documents);
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
throw new IOException(e.getMessage());
}
}

@@ -69,4 +69,5 @@ net.yacy.kelondro.logging.LogalizerHandler.debug = false
net.yacy.kelondro.logging.LogalizerHandler.parserPackage = net.yacy.kelondro.logging
org.apache.http.level = OFF
org.apache.http.wire.level = OFF
org.apache.http.wire.level = OFF
org.apache.pdfbox.level = INFO