Merge rc1/master

pull/1/head
reger 13 years ago
commit 55518c600f

@ -734,7 +734,10 @@ search.items = 10
# possible values:
# "_blank" (new window), "_self" (same window), "_parent" (the parent frame of a frameset),
# "_top" (top of all frames), "searchresult" (a default custom page name for search results)
# a special pattern can be given for exceptions to the default target according to urls
search.target = _self
search.target.special = _self
search.target.special.pattern =
# search result lines may show additional information for each search hit
# these information pieces may be switched on or off

@ -101,14 +101,26 @@
<dt>Target for Click on Search Results</dt>
<dd>
<select name="target">
<option value="_blank" #(selected_blank)#::selected="selected"#(/selected_blank)#>"_blank" (new window)</option>
<option value="_self" #(selected_self)#::selected="selected"#(/selected_self)#>"_self" (same window)</option>
<option value="_parent" #(selected_parent)#::selected="selected"#(/selected_parent)#>"_parent" (the parent frame of a frameset)</option>
<option value="_top" #(selected_top)#::selected="selected"#(/selected_top)#>"_top" (top of all frames)</option>
<option value="searchresult" #(selected_searchresult)#::selected="selected"#(/selected_searchresult)#>"searchresult" (a default custom page name for search results)</option>
<option value="_blank" #(target_selected_blank)#::selected="selected"#(/target_selected_blank)#>"_blank" (new window)</option>
<option value="_self" #(target_selected_self)#::selected="selected"#(/target_selected_self)#>"_self" (same window)</option>
<option value="_parent" #(target_selected_parent)#::selected="selected"#(/target_selected_parent)#>"_parent" (the parent frame of a frameset)</option>
<option value="_top" #(target_selected_top)#::selected="selected"#(/target_selected_top)#>"_top" (top of all frames)</option>
<option value="searchresult" #(target_selected_searchresult)#::selected="selected"#(/target_selected_searchresult)#>"searchresult" (a default custom page name for search results)</option>
</select>
</dd>
<dt>Special Target as Exception for an URL-Pattern</dt>
<dd>
<select name="target_special">
<option value="_blank" #(target_selected_special_blank)#::selected="selected"#(/target_selected_special_blank)#>"_blank" (new window)</option>
<option value="_self" #(target_selected_special_self)#::selected="selected"#(/target_selected_special_self)#>"_self" (same window)</option>
<option value="_parent" #(target_selected_special_parent)#::selected="selected"#(/target_selected_special_parent)#>"_parent" (the parent frame of a frameset)</option>
<option value="_top" #(target_selected_special_top)#::selected="selected"#(/target_selected_special_top)#>"_top" (top of all frames)</option>
<option value="searchresult" #(target_selected_special_searchresult)#::selected="selected"#(/target_selected_special_searchresult)#>"searchresult" (a default custom page name for search results)</option>
</select>
&nbsp;Pattern:<input type="text" name="target_special_pattern" value="#[target_special_pattern]#" size="30" />
</dd>
<dt>Exclude Hosts</dt>
<dd>List of hosts that shall be excluded from search results by default but can be included using the site:&lt;host&gt; operator:<br/>
<input type="text" name="search.excludehosts" value="#[search.excludehosts]#" size="60" /><br/>

@ -68,7 +68,9 @@ public class ConfigPortal {
sb.setConfig(SwitchboardConstants.GREETING_HOMEPAGE, post.get(SwitchboardConstants.GREETING_HOMEPAGE, ""));
sb.setConfig(SwitchboardConstants.GREETING_LARGE_IMAGE, post.get(SwitchboardConstants.GREETING_LARGE_IMAGE, ""));
sb.setConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, post.get(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));
sb.setConfig(SwitchboardConstants.SEARCH_TARGET, post.get("target", "_self"));
sb.setConfig(SwitchboardConstants.SEARCH_TARGET_DEFAULT, post.get("target", "_self"));
sb.setConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL, post.get("target_special", "_self"));
// The special-target pattern is a URL regular expression, not a window target:
// when the form field is absent, fall back to the empty pattern (the same default
// used when this setting is reset or read back), not to the target name "_self".
sb.setConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, post.get("target_special_pattern", ""));
sb.setConfig(SwitchboardConstants.SEARCH_ITEMS, post.getInt("maximumRecords", 10));
sb.setConfig(SwitchboardConstants.INDEX_FORWARD, post.get(SwitchboardConstants.INDEX_FORWARD, ""));
HTTPDFileHandler.indexForward = post.get(SwitchboardConstants.INDEX_FORWARD, "");
@ -87,17 +89,17 @@ public class ConfigPortal {
sb.setConfig("search.result.show.metadata", post.getBoolean("search.result.show.metadata", false));
sb.setConfig("search.result.show.parser", post.getBoolean("search.result.show.parser", false));
sb.setConfig("search.result.show.pictures", post.getBoolean("search.result.show.pictures", false));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, post.get("search.verify", "ifexist"));
sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, post.getBoolean("search.verify.delete", false));
sb.setConfig("about.headline", post.get("about.headline", ""));
sb.setConfig("about.body", post.get("about.body", ""));
String excludehosts = post.get("search.excludehosts", "");
sb.setConfig("search.excludehosts", excludehosts);
sb.setConfig("search.excludehosth", DigestURI.hosthashes(excludehosts));
// construct navigation String
String nav = "";
if (post.getBoolean("search.navigation.hosts", false)) nav += "hosts,";
@ -114,7 +116,9 @@ public class ConfigPortal {
sb.setConfig(SwitchboardConstants.BROWSER_POP_UP_PAGE, "Status.html");
sb.setConfig(SwitchboardConstants.INDEX_FORWARD, "");
HTTPDFileHandler.indexForward = "";
sb.setConfig(SwitchboardConstants.SEARCH_TARGET, "_self");
sb.setConfig(SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
sb.setConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL, "_self");
sb.setConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, "");
sb.setConfig("publicTopmenu", true);
sb.setConfig("publicSearchpage", true);
sb.setConfig("search.navigation", "hosts,authors,namespace,topics");
@ -194,12 +198,20 @@ public class ConfigPortal {
prop.put("maximumRecords", sb.getConfigInt(SwitchboardConstants.SEARCH_ITEMS, 10));
final String target = sb.getConfig(SwitchboardConstants.SEARCH_TARGET, "_self");
prop.put("selected_blank", "_blank".equals(target) ? 1 : 0);
prop.put("selected_self", "_self".equals(target) ? 1 : 0);
prop.put("selected_parent", "_parent".equals(target) ? 1 : 0);
prop.put("selected_top", "_top".equals(target) ? 1 : 0);
prop.put("selected_searchresult", "searchresult".equals(target) ? 1 : 0);
final String target = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
prop.put("target_selected_blank", "_blank".equals(target) ? 1 : 0);
prop.put("target_selected_self", "_self".equals(target) ? 1 : 0);
prop.put("target_selected_parent", "_parent".equals(target) ? 1 : 0);
prop.put("target_selected_top", "_top".equals(target) ? 1 : 0);
prop.put("target_selected_searchresult", "searchresult".equals(target) ? 1 : 0);
final String target_special = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL, "_self");
prop.put("target_selected_special_blank", "_blank".equals(target_special) ? 1 : 0);
prop.put("target_selected_special_self", "_self".equals(target_special) ? 1 : 0);
prop.put("target_selected_special_parent", "_parent".equals(target_special) ? 1 : 0);
prop.put("target_selected_special_top", "_top".equals(target_special) ? 1 : 0);
prop.put("target_selected_special_searchresult", "searchresult".equals(target_special) ? 1 : 0);
prop.put("target_special_pattern", sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, ""));
String myaddress = (sb.peers == null) ? null : sb.peers.mySeed() == null ? null : sb.peers.mySeed().getPublicAddress();
if (myaddress == null) {

@ -441,7 +441,7 @@ form.search.small h2 {
margin-bottom:5px;
}
li.menugroup h3{
li.menugroup h3 {
font-size: 1em; font-weight: bold;
margin: 0; padding: 1px 10px;
}
@ -631,6 +631,31 @@ dd.hint {
padding-bottom: 10px;
}
/* Fixed-width, left-floated definition list (class "bplike"). */
dl.bplike {
    float: left;
    width: 184px;
    margin: 0 0;
    padding: 0;
}
/* Narrow left column of a "bplike" list; clear:left starts each term on a new row. */
.bplike dt {
    clear: left;
    float: left;
    width: 20px;
    margin: 0;
    padding: 0px;
}
/* Wide right column of a "bplike" list, floated next to its dt. */
.bplike dd {
    float: left;
    width: 155px;
    margin: 0px;
    padding: 0px;
}
/*----------
<form>
*/

@ -28,10 +28,10 @@
case 9:
case 33:
window.location.href = document.getElementById("nextpage").href;
break;
break;
case 34:
window.location.href = document.getElementById("prevpage").href;
break;
break;
case 40:
}
}
@ -48,13 +48,13 @@
function opensearch(data) {
var parsed = [];
data = eval('({"suggest":' + data + '})');
for (var i = 0; i < data.suggest[1].length; i++) {
for (var i = 0; i < data.suggest[1].length; i++) {
var row = data.suggest[1][i];
if (row) {
parsed[parsed.length] = {
data: [row],
value: row,
result: row
if (row) {
parsed[parsed.length] = {
data: [row],
value: row,
result: row
};
};
};
@ -93,13 +93,9 @@ $(function() {
#{/sidebarVocabulary}#
$("#sidebarDomains").accordion({});
$("#sidebarProtocols").accordion({});
$("#sidebarProtocols").accordion('activate', false);
$("#sidebarFiletypes").accordion({});
$("#sidebarFiletypes").accordion('activate', false);
$("#sidebarAuthors").accordion({});
$("#sidebarAuthors").accordion('activate', false);
$("#sidebarNameSpace").accordion({});
$("#sidebarNameSpace").accordion('activate', false);
$("#sidebarTopics").tagcloud({type:"sphere",power:.25,seed:0,sizemin:10,sizemax:20,height:80,colormin:"682",colormax:"20C"}).find("li").tsort();
$("#sidebarAbout").accordion({});
$("#search").focus();

@ -248,6 +248,7 @@ public class yacysearch {
|| sb.getConfigBool(SwitchboardConstants.INDEX_RECEIVE_AUTODISABLED, true)
|| clustersearch;
global = global && indexReceiveGranted; // if the user does not want indexes from remote peers, it cannot be a global search
final boolean intranetMode = sb.isIntranetMode() || sb.isAllIPMode();
// increase search statistic counter
if ( !global ) {
@ -298,7 +299,7 @@ public class yacysearch {
Log.logInfo("LOCAL_SEARCH", "ACCESS CONTROL: WHITELISTED CLIENT FROM "
+ client
+ " gets no search restrictions");
} else if ( !authenticated && !localhostAccess ) {
} else if ( !authenticated && !localhostAccess && !intranetMode ) {
// in case that we do a global search or we want to fetch snippets, we check for DoS cases
synchronized ( trackerHandles ) {
final int accInThreeSeconds =
@ -533,16 +534,16 @@ public class yacysearch {
String authorhash = null;
if ( authori >= 0 ) {
// check if the author was given with single quotes or without
final boolean quotes = (querystring.charAt(authori + 7) == (char) 39);
final boolean quotes = (querystring.charAt(authori + 7) == '(');
String author;
if ( quotes ) {
int ftb = querystring.indexOf((char) 39, authori + 8);
int ftb = querystring.indexOf(')', authori + 8);
if ( ftb == -1 ) {
    // No closing delimiter found: treat the rest of the query as the author name.
    // The end index must not exceed length(), otherwise the substring() call
    // that follows throws StringIndexOutOfBoundsException.
    ftb = querystring.length();
}
author = querystring.substring(authori + 8, ftb);
querystring = querystring.replace("author:'" + author + "'", "");
modifier.append("author:'").append(author).append("' ");
querystring = querystring.replace("author:(" + author + ")", "");
modifier.append("author:(").append(author).append(") ");
} else {
int ftb = querystring.indexOf(' ', authori);
if ( ftb == -1 ) {

@ -102,16 +102,18 @@ public class yacysearchitem {
prop.put("remoteIndexCount", Formatter.number(theSearch.getRankingResult().getRemoteIndexCount(), true));
prop.put("remotePeerCount", Formatter.number(theSearch.getRankingResult().getRemotePeerCount(), true));
prop.put("navurlBase", QueryParams.navurlBase("html", theQuery, null, theQuery.urlMask.toString(), theQuery.navigators).toString());
final String target_special_pattern = sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, "");
final String target = sb.getConfig(SwitchboardConstants.SEARCH_TARGET, "_self");
if (theQuery.contentdom == ContentDomain.TEXT) {
// text search
// generate result object
final ResultEntry result = theSearch.oneResult(item, theQuery.isLocal() ? 1000 : 5000);
if (result == null) return prop; // no content
final String resultUrlstring = result.urlstring();
final DigestURI resultURL = result.url();
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final int port = resultURL.getPort();
DigestURI faviconURL = null;
if ((fileType == FileType.HTML || fileType == FileType.JSON) && !sb.isIntranetMode() && !resultURL.isLocal()) try {
@ -131,7 +133,7 @@ public class yacysearchitem {
final String urlhash = ASCII.String(result.hash());
prop.put("content_authorized_bookmark", sb.tables.bookmarks.hasBookmark("admin", urlhash) ? "0" : "1");
prop.putHTML("content_authorized_bookmark_bookmarklink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=" + resource + "&time=3&bookmarkref=" + urlhash + "&urlmaskfilter=.*");
prop.put("content_authorized_recommend", (sb.peers.newsPool.getSpecific(NewsPool.OUTGOING_DB, NewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? "1" : "0");
prop.put("content_authorized_recommend", (sb.peers.newsPool.getSpecific(NewsPool.OUTGOING_DB, NewsPool.CATEGORY_SURFTIPP_ADD, "url", resultUrlstring) == null) ? "1" : "0");
prop.putHTML("content_authorized_recommend_deletelink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=" + resource + "&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*");
prop.putHTML("content_authorized_recommend_recommendlink", "/yacysearch.html?query=" + theQuery.queryString.replace(' ', '+') + "&Enter=Search&count=" + theQuery.displayResults() + "&offset=" + (theQuery.neededResults() - theQuery.displayResults()) + "&order=" + crypt.simpleEncode(theQuery.ranking.toExternalString()) + "&resource=" + resource + "&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*");
prop.put("content_authorized_urlhash", urlhash);
@ -139,8 +141,8 @@ public class yacysearchitem {
prop.putHTML("content_title", result.title());
prop.putXML("content_title-xml", result.title());
prop.putJSON("content_title-json", result.title());
prop.putHTML("content_link", result.urlstring());
prop.putHTML("content_showPictures_link", result.urlstring());
prop.putHTML("content_link", resultUrlstring);
prop.putHTML("content_showPictures_link", resultUrlstring);
prop.putHTML("content_target", target);
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10);
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading
@ -216,10 +218,13 @@ public class yacysearchitem {
if (ms == null) {
prop.put("content_item", "0");
} else {
final String resultUrlstring = ms.href.toNormalform(true, false);
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
final String license = sb.licensedURLs.aquireLicense(ms.href);
sb.loader.loadIfNotExistBackground(ms.href, 1024 * 1024 * 10);
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + ms.href.toNormalform(true, false) : ms.href.toNormalform(true, false));
prop.putHTML("content_item_href", ms.href.toNormalform(true, false));
prop.putHTML("content_item_hrefCache", (auth) ? "/ViewImage.png?url=" + resultUrlstring : resultUrlstring);
prop.putHTML("content_item_href", resultUrlstring);
prop.putHTML("content_item_target", target);
prop.put("content_item_code", license);
prop.putHTML("content_item_name", shorten(ms.name, MAX_NAME_LENGTH));
@ -254,8 +259,11 @@ public class yacysearchitem {
if (media != null) {
int c = 0;
for (final MediaSnippet ms : media) {
prop.putHTML("content_items_" + c + "_href", ms.href.toNormalform(true, false));
prop.putHTML("content_items_" + c + "_hrefshort", nxTools.shortenURLString(ms.href.toNormalform(true, false), MAX_URL_LENGTH));
final String resultUrlstring = ms.href.toNormalform(true, false);
final String target = sb.getConfig(resultUrlstring.matches(target_special_pattern) ? SwitchboardConstants.SEARCH_TARGET_SPECIAL : SwitchboardConstants.SEARCH_TARGET_DEFAULT, "_self");
prop.putHTML("content_items_" + c + "_href", resultUrlstring);
prop.putHTML("content_items_" + c + "_hrefshort", nxTools.shortenURLString(resultUrlstring, MAX_URL_LENGTH));
prop.putHTML("content_items_" + c + "_target", target);
prop.putHTML("content_items_" + c + "_name", shorten(ms.name, MAX_NAME_LENGTH));
prop.put("content_items_" + c + "_col", (col) ? "0" : "1");

@ -1,7 +1,7 @@
<div style="float:right;width:25%;">
#(cat-location)#::
<div style="float: right; margin-top:5px; width: 220px;">
<div style="float:right; margin-top:5px; width:220px;">
<a href="yacysearch_location.html?query=#[queryenc]#">
<img src="/env/grafics/earthsearch.png" width="215" height="159" alt="earthsearchlogo" /></a>
<a href="yacysearch_location.html?query=#[queryenc]#">Show search results for "#[query]#" on map</a>
@ -11,62 +11,103 @@
#(nav-topics)#::
<div style="float: right; margin-top:5px; width: 220px; height: 80px">
<div><ul id="sidebarTopics" style="padding-left: 0px;">#{element}#
<li value="#[count]#">#[url]#</li>
<li value="#[count]#"><a href="#[url]#">#[name]#</a></li>
#{/element}#</ul></div>
</div>
#(/nav-topics)#
#(nav-protocols)#::
<div id="sidebarProtocols" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarProtocols" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Protocol Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(activate)#
<script type="text/javascript">
//<![CDATA[
$(function() { $("#sidebarProtocols").accordion('activate', false); });
//]]>
</script>
::#(/activate)#
#(/nav-protocols)#
#(nav-filetypes)#::
<div id="sidebarFiletypes" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarFiletypes" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Filetype Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(activate)#
<script type="text/javascript">
//<![CDATA[
$(function() { $("#sidebarFiletypes").accordion('activate', false); });
//]]>
</script>
::#(/activate)#
#(/nav-filetypes)#
#(nav-domains)#::
<div id="sidebarDomains" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarDomains" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Domain Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(activate)#
<script type="text/javascript">
//<![CDATA[
$(function() { $("#sidebarDomains").accordion('activate', false); });
//]]>
</script>
::#(/activate)#
#(/nav-domains)#
#(nav-namespace)#::
<div id="sidebarNameSpace" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarNameSpace" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Name Space Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(activate)#
<script type="text/javascript">
//<![CDATA[
$(function() { $("#sidebarNameSpace").accordion('activate', false); });
//]]>
</script>
::#(/activate)#
#(/nav-namespace)#
#(nav-authors)#::
<div id="sidebarAuthors" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebarAuthors" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">Author Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#(activate)#
<script type="text/javascript">
//<![CDATA[
$(function() { $("#sidebarAuthors").accordion('activate', false); });
//]]>
</script>
::#(/activate)#
#(/nav-authors)#
#{nav-vocabulary}#
<div id="sidebar#[navname]#" style="float: right; margin-top:5px; width: 220px;">
<div id="sidebar#[navname]#" style="float:right; margin-top:5px; width:220px;">
<h3 style="padding-left:25px;">#[navname]# Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
<dl class="bplike" style="padding-left:5px;">#{element}#
<dt><input type="checkbox" onchange="window.location.href='#[url]#'"#(on)# checked="checked"::#(/on)#/></dt>
<dd>#[name]# (#[count]#)</dd>
#{/element}#</dl>
</div>
#{/nav-vocabulary}#

@ -27,6 +27,7 @@
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.document.Autotagging;
@ -44,6 +45,8 @@ import de.anomic.server.serverSwitch;
public class yacysearchtrailer {
private static final int MAX_TOPWORDS = 12;
private static final int MAXLIMIT_NAV_LOW = 5;
private static final int MAXLIMIT_NAV_HIGH = 20;
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
@ -71,21 +74,36 @@ public class yacysearchtrailer {
} else {
prop.put("nav-namespace", 1);
navigatorIterator = namespaceNavigator.keys(false);
int i = 0;
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 10 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = namespaceNavigator.get(name);
nav = "inurl%3A" + name;
queryStringForUrl = theQuery.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
queryStringForUrl += "+" + nav;
prop.put("nav-namespace_element_" + i + "_on", 1);
prop.put(fileType, "nav-namespace_element_" + i + "_modifier", nav);
} else {
neg++;
prop.put("nav-namespace_element_" + i + "_on", 0);
prop.put(fileType, "nav-namespace_element_" + i + "_modifier", "-" + nav);
queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim();
}
prop.put(fileType, "nav-namespace_element_" + i + "_name", name);
prop.put("nav-namespace_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-namespace_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + "inurl:" + name, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-namespace_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, queryStringForUrl, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-namespace_element_" + i + "_count", count);
prop.put(fileType, "nav-namespace_element_" + i + "_modifier", "inurl:" + name);
prop.put("nav-namespace_element_" + i + "_nl", 1);
i++;
}
prop.put("nav-namespace_element", i);
prop.put("nav-namespace_activate", on(pos, neg, MAXLIMIT_NAV_LOW) ? 1 : 0);
i--;
prop.put("nav-namespace_element_" + i + "_nl", 0);
if (pos == 1 && neg == 0) prop.put("nav-namespace", 0); // this navigation is not useful
}
// host navigators
@ -95,23 +113,36 @@ public class yacysearchtrailer {
} else {
prop.put("nav-domains", 1);
navigatorIterator = hostNavigator.keys(false);
int i = 0;
String dnav;
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = hostNavigator.get(name);
dnav = "site:" + name;
nav = "site%3A" + name;
queryStringForUrl = theQuery.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
// Build the toggle link for this host: add the "site:" modifier when it is not
// part of the query yet, otherwise strip it from the query again.
if (p < 0) {
    // modifier not present: following the link adds it
    pos++;
    queryStringForUrl += "+" + nav;
    prop.put("nav-domains_element_" + i + "_on", 1);
    prop.put(fileType, "nav-domains_element_" + i + "_modifier", nav);
} else {
    // modifier already present: following the link removes it
    neg++;
    queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim();
    // these must be the nav-domains keys; writing nav-authors keys here left the
    // active host entry without its "_on"/"_modifier" values
    prop.put("nav-domains_element_" + i + "_on", 0);
    prop.put(fileType, "nav-domains_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-domains_element_" + i + "_name", name);
prop.put("nav-domains_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + dnav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-domains_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + dnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-domains_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, queryStringForUrl, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-domains_element_" + i + "_count", count);
prop.put(fileType, "nav-domains_element_" + i + "_modifier", dnav);
prop.put("nav-domains_element_" + i + "_nl", 1);
i++;
}
prop.put("nav-domains_element", i);
prop.put("nav-domains_activate", on(pos, neg, MAXLIMIT_NAV_HIGH) ? 1 : 0);
i--;
prop.put("nav-domains_element_" + i + "_nl", 0);
if (pos == 1 && neg == 0) prop.put("nav-domains", 0); // this navigation is not useful
}
// author navigators
@ -121,23 +152,36 @@ public class yacysearchtrailer {
} else {
prop.put("nav-authors", 1);
navigatorIterator = authorNavigator.keys(false);
int i = 0;
String anav;
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
count = authorNavigator.get(name);
anav = (name.indexOf(' ',0) < 0) ? "author:" + name : "author:'" + name.replace(" ", "+") + "'";
nav = (name.indexOf(' ', 0) < 0) ? "author%3A" + name : "author%3A%28" + name.replace(" ", "+") + "%29";
queryStringForUrl = theQuery.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
queryStringForUrl += "+" + nav;
prop.put("nav-authors_element_" + i + "_on", 1);
prop.put(fileType, "nav-authors_element_" + i + "_modifier", nav);
} else {
neg++;
queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim();
prop.put("nav-authors_element_" + i + "_on", 0);
prop.put(fileType, "nav-authors_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-authors_element_" + i + "_name", name);
prop.put("nav-authors_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-authors_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-authors_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, queryStringForUrl, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-authors_element_" + i + "_count", count);
prop.put(fileType, "nav-authors_element_" + i + "_modifier", anav);
prop.put("nav-authors_element_" + i + "_nl", 1);
i++;
}
prop.put("nav-authors_element", i);
prop.put("nav-authors_activate", neg > 0 ? 1 : 0); // by default off
i--;
prop.put("nav-authors_element_" + i + "_nl", 0);
if (pos == 1 && neg == 0) prop.put("nav-authors", 0); // this navigation is not useful
}
// topics navigator
@ -148,18 +192,18 @@ public class yacysearchtrailer {
prop.put("nav-topics", "1");
navigatorIterator = topicNavigator.keys(false);
int i = 0;
String queryStringForUrl;
while (i < MAX_TOPWORDS && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = topicNavigator.get(name);
if (/*(theQuery == null) ||*/ (theQuery.queryString == null)) break;
if (theQuery.queryString == null) break;
if (name != null) {
queryStringForUrl = theQuery.queryStringForUrl();
prop.put("nav-topics_element_" + i + "_on", 1);
prop.put(fileType, "nav-topics_element_" + i + "_modifier", name);
prop.put(fileType, "nav-topics_element_" + i + "_name", name);
prop.put("nav-topics_element_" + i + "_url",
"<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + name, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + "</a>");
//+"<a href=\"" + QueryParams.navurl("html", 0, display, theQuery, theQuery.queryStringForUrl() + "+-" + name, theQuery.urlMask.toString(), theQuery.navigators) + "\">-</a>")*/;
prop.putJSON("nav-topics_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + name, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-topics_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, queryStringForUrl + "+" + name, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-topics_element_" + i + "_count", count);
prop.put(fileType, "nav-topics_element_" + i + "_modifier", name);
prop.put("nav-topics_element_" + i + "_nl", 1);
i++;
}
@ -176,23 +220,36 @@ public class yacysearchtrailer {
} else {
prop.put("nav-protocols", 1);
navigatorIterator = protocolNavigator.keys(false);
int i = 0;
String pnav;
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
count = protocolNavigator.get(name);
pnav = "/" + name;
nav = "%2F" + name;
queryStringForUrl = theQuery.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
queryStringForUrl += "+" + nav;
prop.put("nav-protocols_element_" + i + "_on", 1);
prop.put(fileType, "nav-protocols_element_" + i + "_modifier", nav);
} else {
neg++;
queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim();
prop.put("nav-protocols_element_" + i + "_on", 0);
prop.put(fileType, "nav-protocols_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-protocols_element_" + i + "_name", name);
prop.put("nav-protocols_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + pnav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-protocols_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + pnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-protocols_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, queryStringForUrl, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-protocols_element_" + i + "_count", count);
prop.put(fileType, "nav-protocols_element_" + i + "_modifier", pnav);
prop.put("nav-protocols_element_" + i + "_nl", 1);
i++;
}
prop.put("nav-protocols_element", i);
prop.put("nav-protocols_activate", neg > 0 ? 1 : 0); // by default off
i--;
prop.put("nav-protocols_element_" + i + "_nl", 0);
if (pos == 1 && neg == 0) prop.put("nav-protocols", 0); // this navigation is not useful
}
// filetype navigators
@ -202,23 +259,36 @@ public class yacysearchtrailer {
} else {
prop.put("nav-filetypes", 1);
navigatorIterator = filetypeNavigator.keys(false);
int i = 0;
String tnav;
int i = 0, p, pos = 0, neg = 0;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next().trim();
count = filetypeNavigator.get(name);
tnav = "filetype:" + name;
nav = "filetype%3A" + name;
queryStringForUrl = theQuery.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
pos++;
queryStringForUrl += "+" + nav;
prop.put("nav-filetypes_element_" + i + "_on", 1);
prop.put(fileType, "nav-filetypes_element_" + i + "_modifier", nav);
} else {
neg++;
queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim();
prop.put("nav-filetypes_element_" + i + "_on", 0);
prop.put(fileType, "nav-filetypes_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-filetypes_element_" + i + "_name", name);
prop.put("nav-filetypes_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + tnav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-filetypes_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + tnav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-filetypes_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, queryStringForUrl, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-filetypes_element_" + i + "_count", count);
prop.put(fileType, "nav-filetypes_element_" + i + "_modifier", tnav);
prop.put("nav-filetypes_element_" + i + "_nl", 1);
i++;
}
prop.put("nav-filetypes_element", i);
prop.put("nav-filetypes_activate", neg > 0 ? 1 : 0); // by default off
i--;
prop.put("nav-filetypes_element_" + i + "_nl", 0);
if (pos == 1 && neg == 0) prop.put("nav-filetypes", 0); // this navigation is not useful
}
// vocabulary navigators
@ -232,17 +302,26 @@ public class yacysearchtrailer {
}
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_navname", navname);
navigatorIterator = ve.getValue().keys(false);
int i = 0;
String anav;
int i = 0, p;
String nav, queryStringForUrl;
while (i < 20 && navigatorIterator.hasNext()) {
name = navigatorIterator.next();
count = ve.getValue().get(name);
anav = "/vocabulary/" + navname + "/" + Autotagging.encodePrintname(name);
nav = "%2Fvocabulary%2F" + navname + "%2F" + MultiProtocolURI.escape(Autotagging.encodePrintname(name)).toString();
queryStringForUrl = theQuery.queryStringForUrl();
p = queryStringForUrl.indexOf(nav);
if (p < 0) {
queryStringForUrl += "+" + nav;
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_on", 1);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", nav);
} else {
queryStringForUrl = (queryStringForUrl.substring(0, p) + queryStringForUrl.substring(p + nav.length())).trim();
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_on", 0);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", "-" + nav);
}
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_name", name);
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_url", "<a href=\"" + QueryParams.navurl("html", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString() + "\">" + name + " (" + count + ")</a>");
prop.putJSON("nav-vocabulary_" + navvoccount + "_element_" + i + "_url-json", QueryParams.navurl("json", 0, theQuery, theQuery.queryStringForUrl() + "+" + anav, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_url", QueryParams.navurl(fileType.name().toLowerCase(), 0, theQuery, queryStringForUrl, theQuery.urlMask.toString(), theQuery.navigators).toString());
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_count", count);
prop.put(fileType, "nav-vocabulary_" + navvoccount + "_element_" + i + "_modifier", anav);
prop.put("nav-vocabulary_" + navvoccount + "_element_" + i + "_nl", 1);
i++;
}
@ -255,42 +334,6 @@ public class yacysearchtrailer {
} else {
prop.put("nav-vocabulary", 0);
}
/*
html
#{nav-vocabulary}#
<div id="sidebar#[navname]#" style="float: right; margin-top:5px; width: 220px;">
<h3 style="padding-left:25px;">#[navname]# Navigator</h3>
<div><ul style="padding-left: 0px;">#{element}#
<li>#[url]#</li>
#{/element}#</ul></div>
</div>
#{/nav-vocabulary}#
xml
#{nav-vocabulary}#
<yacy:facet name="#[navname]#" displayname="#[navname]#" type="String" min="0" max="0" mean="0">
#{element}#
<yacy:element name="#[name]#" count="#[count]#" modifier="#[modifier]#" />
#{/element}#
</yacy:facet>
#{/nav-vocabulary}#
json
#{nav-vocabulary}#
{
"facetname": "#[navname]#",
"displayname": "#[navname]#",
"type": "String",
"min": "0",
"max": "0",
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#{nav-vocabulary}#
*/
// about box
final String aboutBody = env.getConfig("about.body", "");
@ -323,4 +366,8 @@ json
return prop;
}
/**
 * Decide whether a navigator is switched on.
 * NOTE(review): pos and neg are presumably the counters kept by the navigator loops
 * (entries not yet part of the query / entries already part of the query) - confirm against callers.
 * @param pos first counter; only relevant if neg is 0
 * @param neg second counter; any positive value switches the navigator on
 * @param maxlimit upper bound (inclusive) for pos
 * @return true if neg is positive, or if pos is greater than 1 and not greater than maxlimit
 */
private final static boolean on(int pos, int neg, int maxlimit) {
    if (neg > 0) {
        return true;
    }
    final boolean moreThanOne = pos > 1;
    final boolean withinLimit = pos <= maxlimit;
    return moreThanOne && withinLimit;
}
}

@ -8,7 +8,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-filetypes)##(nav-protocols)#::
@ -21,7 +21,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-protocols)##(nav-domains)#::
@ -34,7 +34,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-domains)##(nav-namespace)#::
@ -47,7 +47,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-namespace)##(nav-authors)#::
@ -60,7 +60,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#(/nav-authors)##{nav-vocabulary}#
@ -73,7 +73,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
},#{/nav-vocabulary}##(nav-topics)#::
@ -86,7 +86,7 @@
"mean": "0",
"elements": [
#{element}#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url-json]#"}#(nl)#::,#(/nl)#
{"name": "#[name]#", "count": "#[count]#", "modifier": "#[modifier]#", "url": "#[url]#"}#(nl)#::,#(/nl)#
#{/element}#
]
}#(/nav-topics)#

@ -29,6 +29,8 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -293,6 +295,72 @@ public class Balancer {
}
}
/**
 * Get the hosts that are currently maintained as domain stacks, together with the stack sizes.
 * @return a newly created map from the key of each domain stack (the host name)
 *         to the number of entries in that stack
 */
public Map<String, Integer> getDomainStackHosts() {
    final Map<String, Integer> hostSizes = new HashMap<String, Integer>();
    for (final Map.Entry<String, HandleSet> stack : this.domainStacks.entrySet()) {
        final String host = stack.getKey();
        final HandleSet handles = stack.getValue();
        hostSizes.put(host, handles.size());
    }
    return hostSizes;
}
/**
 * Compute the current sleep time for a given crawl entry.
 * The crawl profile is looked up in the crawl switchboard using the profile handle of the entry;
 * the actual computation is done by the private overload of this method.
 * @param cs the crawl switchboard that is asked for the active crawl profile
 * @param crawlEntry the crawl request whose profile handle and url are evaluated
 * @return the sleep time for the entry; 0 if the profile lookup yields null
 */
public long getDomainSleepTime(final CrawlSwitchboard cs, Request crawlEntry) {
    final byte[] profileKey = UTF8.getBytes(crawlEntry.profileHandle());
    final CrawlProfile activeProfile = cs.getActive(profileKey);
    return getDomainSleepTime(cs, activeProfile, crawlEntry);
}
/**
 * Compute the sleep time for a crawl entry under a given crawl profile.
 * No sleep time is needed if there is no profile, if the profile loads from the cache only,
 * or if the profile prefers the cache and the cache has the url of the entry.
 * @param cs the crawl switchboard (not used by this computation)
 * @param profileEntry the crawl profile of the entry; may be null
 * @param crawlEntry the crawl request whose url is evaluated
 * @return 0 in the cases listed above, otherwise the remaining waiting time reported by Latency
 */
private long getDomainSleepTime(final CrawlSwitchboard cs, final CrawlProfile profileEntry, Request crawlEntry) {
    if (profileEntry == null) {
        return 0;
    }
    if (profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY) {
        return 0;
    }
    if (profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url())) {
        return 0;
    }
    // this uses the robots.txt database and may cause a loading of robots.txt from the server
    return Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta);
}
/**
 * Get crawl request entries that are queued in the domain stack of a specific host.
 * Entries whose url hash is missing, whose row cannot be read from the url file index,
 * or whose row cannot be turned into a request are skipped, so the result may contain
 * fewer entries than requested.
 * @param host the host name that is used as key into the domain stacks
 * @param maxcount the maximum number of stack positions to look at; values <= 0 yield an empty list
 * @return a list of at most maxcount crawl loader requests; never null
 */
public List<Request> getDomainStackReferences(String host, int maxcount) {
    final HandleSet domainList = this.domainStacks.get(host);
    if (maxcount <= 0 || domainList == null || domainList.isEmpty()) return new ArrayList<Request>(0);
    // pre-size by what can actually be returned: a caller passing a very large maxcount
    // must not force the allocation of an equally large backing array
    final List<Request> cel = new ArrayList<Request>(Math.min(maxcount, domainList.size()));
    for (int i = 0; i < maxcount; i++) {
        if (domainList.size() <= i) break; // end of the stack reached
        final byte[] urlhash = domainList.getOne(i);
        if (urlhash == null) continue;
        Row.Entry rowEntry;
        try {
            rowEntry = this.urlFileIndex.get(urlhash, true);
        } catch (IOException e) {
            // best effort: an unreadable row only drops this one entry from the listing
            continue;
        }
        if (rowEntry == null) continue;
        Request crawlEntry;
        try {
            crawlEntry = new Request(rowEntry);
        } catch (IOException e) {
            // best effort: a row that cannot be parsed is left out of the listing
            continue;
        }
        cel.add(crawlEntry);
    }
    return cel;
}
private void pushHashToDomainStacks(String host, final byte[] urlhash) throws RowSpaceExceededException {
// extend domain stack
if (host == null) host = localhost;
@ -417,11 +485,8 @@ public class Balancer {
return null;
}
// depending on the caching policy we need sleep time to avoid DoS-like situations
sleeptime = (
profileEntry.cacheStrategy() == CacheStrategy.CACHEONLY ||
(profileEntry.cacheStrategy() == CacheStrategy.IFEXIST && Cache.has(crawlEntry.url()))
) ? 0 : Latency.waitingRemaining(crawlEntry.url(), this.myAgentIDs, this.minimumLocalDelta, this.minimumGlobalDelta); // this uses the robots.txt database and may cause a loading of robots.txt from the server
sleeptime = getDomainSleepTime(cs, profileEntry, crawlEntry);
assert Base64Order.enhancedCoder.equal(nexthash, rowEntry.getPrimaryKeyBytes()) : "result = " + ASCII.String(nexthash) + ", rowEntry.getPrimaryKeyBytes() = " + ASCII.String(rowEntry.getPrimaryKeyBytes());
assert Base64Order.enhancedCoder.equal(nexthash, crawlEntry.url().hash()) : "result = " + ASCII.String(nexthash) + ", crawlEntry.url().hash() = " + ASCII.String(crawlEntry.url().hash());

@ -32,6 +32,7 @@ import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import net.yacy.kelondro.index.HandleSet;
@ -228,6 +229,50 @@ public class NoticedURL {
return removed;
}
/**
 * Get the hosts that are currently maintained as domain stacks in one of the crawl stacks.
 * The call is delegated to the balancer that belongs to the given stack type.
 * @param stackType selects the stack (CORE, LIMIT, REMOTE or NOLOAD) that is asked
 * @return a map of clear text strings of host names to the size of the domain stacks;
 *         null if the stack type is none of the four handled types
 */
public Map<String, Integer> getDomainStackHosts(final StackType stackType) {
switch (stackType) {
case CORE: return this.coreStack.getDomainStackHosts();
case LIMIT: return this.limitStack.getDomainStackHosts();
case REMOTE: return this.remoteStack.getDomainStackHosts();
case NOLOAD: return this.noloadStack.getDomainStackHosts();
default: return null;
}
}
/**
 * Compute the current sleep time for a given crawl entry in one of the crawl stacks.
 * The call is delegated to the balancer that belongs to the given stack type.
 * @param stackType selects the stack (CORE, LIMIT, REMOTE or NOLOAD) that is asked
 * @param cs the crawl switchboard, passed on to the selected stack
 * @param crawlEntry the crawl request, passed on to the selected stack
 * @return the sleep time reported by the selected stack;
 *         0 if the stack type is none of the four handled types
 */
public long getDomainSleepTime(final StackType stackType, final CrawlSwitchboard cs, Request crawlEntry) {
switch (stackType) {
case CORE: return this.coreStack.getDomainSleepTime(cs, crawlEntry);
case LIMIT: return this.limitStack.getDomainSleepTime(cs, crawlEntry);
case REMOTE: return this.remoteStack.getDomainSleepTime(cs, crawlEntry);
case NOLOAD: return this.noloadStack.getDomainSleepTime(cs, crawlEntry);
default: return 0;
}
}
/**
 * Get lists of crawl request entries for a specific host from one of the crawl stacks.
 * The call is delegated to the balancer that belongs to the given stack type.
 * @param stackType selects the stack (CORE, LIMIT, REMOTE or NOLOAD) that is asked
 * @param host the host name, passed on to the selected stack
 * @param maxcount the maximum number of entries, passed on to the selected stack
 * @return a list of crawl loader requests;
 *         null if the stack type is none of the four handled types
 */
public List<Request> getDomainStackReferences(final StackType stackType, String host, int maxcount) {
switch (stackType) {
case CORE: return this.coreStack.getDomainStackReferences(host, maxcount);
case LIMIT: return this.limitStack.getDomainStackReferences(host, maxcount);
case REMOTE: return this.remoteStack.getDomainStackReferences(host, maxcount);
case NOLOAD: return this.noloadStack.getDomainStackReferences(host, maxcount);
default: return null;
}
}
public List<Request> top(final StackType stackType, final int count) {
switch (stackType) {
case CORE: return top(this.coreStack, count);
@ -295,7 +340,7 @@ public class NoticedURL {
return null;
}
private List<Request> top(final Balancer balancer, int count) {
private static List<Request> top(final Balancer balancer, int count) {
// this is a filo - top
if (count > balancer.size()) count = balancer.size();
return balancer.top(count);

@ -153,6 +153,7 @@ public final class HTTPDemon implements serverHandler, Cloneable {
* it can be reused for further connections
* @see de.anomic.server.serverHandler#reset()
*/
@Override
public void reset() {
this.proxyAccounts_init = false;
@ -197,12 +198,14 @@ public final class HTTPDemon implements serverHandler, Cloneable {
return false;
}
@Override
public String greeting() { // OBLIGATORIC FUNCTION
// a response line upon connection is sent to the client
// if no response line is wanted, return "" or null
return null;
}
@Override
public String error(final Throwable e) { // OBLIGATORIC FUNCTION
// return string in case of any error that occurs during communication
// is always (but not only) called if an IO-dependent exception occurs.
@ -348,11 +351,13 @@ public final class HTTPDemon implements serverHandler, Cloneable {
return true;
}
@Override
public Boolean EMPTY(final String arg, final Session session) throws IOException {
//System.out.println("EMPTY " + arg);
return (++this.emptyRequestCount > 10) ? serverCore.TERMINATE_CONNECTION : serverCore.RESUME_CONNECTION;
}
@Override
public Boolean UNKNOWN(final String arg, final Session session) throws IOException {
//System.out.println("UNKNOWN " + arg);
@ -695,12 +700,23 @@ public final class HTTPDemon implements serverHandler, Cloneable {
while (argsString.length() > 0) {
eqp = argsString.indexOf('=');
if (eqp <= 0) break;
sep = argsString.indexOf("&amp;", eqp + 1);
if (sep > 0) {
// resulting equations are inserted into the property args with leading '&amp;'
args.put(parseArg(argsString.substring(0, eqp)), parseArg(argsString.substring(eqp + 1, sep)));
argsString = argsString.substring(sep + 5);
argc++;
continue;
}
sep = argsString.indexOf('&', eqp + 1);
if (sep <= 0) break;
// resulting equations are inserted into the property args with leading '&'
args.put(parseArg(argsString.substring(0, eqp)), parseArg(argsString.substring(eqp + 1, sep)));
argsString = argsString.substring(sep + 1);
argc++;
if (sep > 0) {
// resulting equations are inserted into the property args with leading '&'
args.put(parseArg(argsString.substring(0, eqp)), parseArg(argsString.substring(eqp + 1, sep)));
argsString = argsString.substring(sep + 1);
argc++;
continue;
}
break;
}
// we return the number of parsed arguments
return argc;

@ -178,16 +178,13 @@ public class Autotagging {
p = line.indexOf('\t');
}
if (p < 0) {
this.tag2print.put(line, line);
this.print2tag.put(line, line);
k = normalizeKey(line);
v = normalizeWord(line);
this.tag2print.put(v, k);
this.print2tag.put(k, v);
continue vocloop;
}
k = line.substring(0, p).trim();
k = k.replaceAll(" \\+", ", "); // remove symbols that are bad in a query attribute
k = k.replaceAll(" /", ", ");
k = k.replaceAll("\\+", ",");
k = k.replaceAll("/", ",");
k = k.replaceAll(" ", " ");
k = normalizeKey(line.substring(0, p));
v = line.substring(p + 1);
tags = v.split(",");
tagloop: for (String t: tags) {
@ -204,6 +201,16 @@ public class Autotagging {
}
}
/**
 * Normalize a vocabulary key: trim it and replace the symbols '+' and '/',
 * which are bad in a query attribute, by commas.
 * A '+' or '/' with a leading space becomes ", ", one without becomes ",".
 * @param k the raw key
 * @return the normalized key
 */
private final String normalizeKey(String k) {
    final String withoutSpacedSymbols = k.trim()
            .replace(" +", ", ")
            .replace(" /", ", ");
    final String withoutSymbols = withoutSpacedSymbols
            .replace("+", ",")
            .replace("/", ",");
    // NOTE(review): as written this replaces a single space by a single space and changes nothing;
    // presumably it is meant to collapse double spaces - confirm
    return withoutSymbols.replaceAll(" ", " ");
}
public Vocabulary(String name, Localization localization) {
this(name);
Set<String> locNames = localization.locationNames();

@ -7,12 +7,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -32,10 +32,10 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
public class SnippetExtractor {
String snippetString;
HandleSet remainingHashes;
public SnippetExtractor(final Collection<StringBuilder> sentences, final HandleSet queryhashes, int maxLength) throws UnsupportedOperationException {
if (sentences == null) throw new UnsupportedOperationException("sentence == null");
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
@ -47,7 +47,7 @@ public class SnippetExtractor {
int linenumber = 0;
int fullmatchcounter = 0;
lookup: for (final StringBuilder sentence: sentences) {
hs = WordTokenizer.hashSentence(sentence.toString(), null);
hs = WordTokenizer.hashSentence(sentence.toString(), null, 100);
positions = new TreeSet<Integer>();
for (final byte[] word: queryhashes) {
pos = hs.get(word);
@ -69,7 +69,7 @@ public class SnippetExtractor {
}
linenumber++;
}
StringBuilder sentence;
SnippetExtractor tsr;
while (!order.isEmpty()) {
@ -79,27 +79,27 @@ public class SnippetExtractor {
} catch (UnsupportedOperationException e) {
continue;
}
snippetString = tsr.snippetString;
if (snippetString != null && snippetString.length() > 0) {
remainingHashes = tsr.remainingHashes;
if (remainingHashes.isEmpty()) {
this.snippetString = tsr.snippetString;
if (this.snippetString != null && this.snippetString.length() > 0) {
this.remainingHashes = tsr.remainingHashes;
if (this.remainingHashes.isEmpty()) {
// we have found the snippet
return; // finished!
} else if (remainingHashes.size() < queryhashes.size()) {
} else if (this.remainingHashes.size() < queryhashes.size()) {
// the result has not all words in it.
// find another sentence that represents the missing other words
// and find recursively more sentences
maxLength = maxLength - snippetString.length();
maxLength = maxLength - this.snippetString.length();
if (maxLength < 20) maxLength = 20;
try {
tsr = new SnippetExtractor(order.values(), remainingHashes, maxLength);
tsr = new SnippetExtractor(order.values(), this.remainingHashes, maxLength);
} catch (UnsupportedOperationException e) {
throw e;
}
final String nextSnippet = tsr.snippetString;
if (nextSnippet == null) return;
snippetString = snippetString + (" / " + nextSnippet);
remainingHashes = tsr.remainingHashes;
this.snippetString = this.snippetString + (" / " + nextSnippet);
this.remainingHashes = tsr.remainingHashes;
return;
} else {
// error
@ -110,7 +110,7 @@ public class SnippetExtractor {
}
throw new UnsupportedOperationException("no snippet computed");
}
private static int linelengthKey(int givenlength, int maxlength) {
if (givenlength > maxlength) return 1;
if (givenlength >= maxlength / 2 && givenlength < maxlength) return 7;
@ -118,15 +118,15 @@ public class SnippetExtractor {
if (givenlength >= maxlength / 8 && givenlength < maxlength / 4) return 3;
return 0;
}
private SnippetExtractor(String sentence, final HandleSet queryhashes, final int maxLength) throws UnsupportedOperationException {
try {
if (sentence == null) throw new UnsupportedOperationException("no sentence given");
if (queryhashes == null || queryhashes.isEmpty()) throw new UnsupportedOperationException("queryhashes == null");
byte[] hash;
// find all hashes that appear in the sentence
final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
final Map<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
final Iterator<byte[]> j = queryhashes.iterator();
Integer pos;
int p, minpos = sentence.length(), maxpos = -1;
@ -189,11 +189,11 @@ public class SnippetExtractor {
throw new UnsupportedOperationException(e.getMessage());
}
}
/**
 * @return the snippet text that was stored in this extractor by its constructor
 */
public String getSnippet() {
return this.snippetString;
}
/**
 * @return the hash set stored as remainingHashes by the constructor
 *         (presumably the query word hashes that the snippet does not cover - confirm)
 */
public HandleSet getRemainingWords() {
return this.remainingHashes;
}

@ -68,10 +68,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return null;
}
@Override
public boolean hasMoreElements() {
return this.buffer != null;
}
@Override
public StringBuilder nextElement() {
final StringBuilder r = (this.buffer == null) ? null : this.buffer;
this.buffer = nextElement0();
@ -79,9 +81,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
if (this.meaningLib != null) WordCache.learn(r);
return r;
}
public void close() {
e.close();
this.e.close();
}
private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
@ -139,10 +141,12 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return r;
}
@Override
public boolean hasMoreElements() {
return this.buffer != null;
}
@Override
public StringBuilder nextElement() {
final StringBuilder r = this.buffer;
this.buffer = nextElement0();
@ -150,7 +154,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
}
public void close() {
e.close();
this.e.close();
}
}
@ -177,7 +181,7 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
* @param sentence the sentence to be tokenized
* @return an ordered map containing word hashes as keys and positions as values. The map is ordered by the hash ordering
*/
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) {
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib, int maxlength) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
try {
@ -185,16 +189,16 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
StringBuilder word;
byte[] hash;
Integer oldpos;
while (words.hasMoreElements()) {
while (words.hasMoreElements() && maxlength-- > 0) {
word = words.nextElement();
hash = Word.word2hash(word);
// don't overwrite old values, that leads to too far word distances
oldpos = map.put(hash, LargeNumberCache.valueOf(pos));
if (oldpos != null) {
map.put(hash, oldpos);
}
pos += word.length() + 1;
}
return map;

@ -125,32 +125,32 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null || docTitle.length() == 0) {
docTitle = MultiProtocolURI.unescape(location.getFileName());
}
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
byte[] contentBytes = new byte[0];
try {
// create a writer for output
final PDFTextStripper stripper = new PDFTextStripper();
final PDFTextStripper stripper = new PDFTextStripper();
stripper.setEndPage(3); // get first 3 pages (always)
writer.append(stripper.getText(pdfDoc));
contentBytes = writer.getBytes(); // remember text in case of interrupting thread
stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
stripper.setEndPage(Integer.MAX_VALUE); // set to default
// we start the pdf parsing in a separate thread to ensure that it can be terminated
final Thread t = new Thread() {
@Override
public void run() {
try {
writer.append(stripper.getText(pdfDoc));
try {
writer.append(stripper.getText(pdfDoc));
} catch (final Throwable e) {}
}
};
}
};
t.start();
t.join(3000);
if (t.isAlive()) t.interrupt();
if (t.isAlive()) t.interrupt();
pdfDoc.close();
contentBytes = writer.getBytes(); // get final text before closing writer
contentBytes = writer.getBytes(); // get final text before closing writer
writer.close();
} catch (final IOException e) {
// close the writer
@ -175,8 +175,7 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null) {
docTitle = docSubject;
}
// clear resources in pdfbox. they say that is resolved but it's not. see:
// https://issues.apache.org/jira/browse/PDFBOX-313
// https://issues.apache.org/jira/browse/PDFBOX-351

@ -376,6 +376,13 @@ public class URIMetadataRow implements URIMetadata {
return this.entry.getPrimaryKeyBytes();
}
private String hostHash = null;
/**
 * Get the host hash of this entry.
 * The value is derived from the primary key bytes of the entry on the first call
 * (ASCII.String with the arguments 6, 6 - presumably offset and length, confirm)
 * and kept in the hostHash field for all further calls.
 * @return the host hash string
 */
public String hosthash() {
    if (this.hostHash == null) {
        this.hostHash = ASCII.String(this.entry.getPrimaryKeyBytes(), 6, 6);
    }
    return this.hostHash;
}
public long ranking() {
return this.ranking;
}

@ -65,7 +65,7 @@ public class Digest {
md5Cache = new ConcurrentARC<String, byte[]>(1000, Math.max(8, 2 * Runtime.getRuntime().availableProcessors()));
}
}
public static String encodeHex(final long in, final int length) {
String s = Long.toHexString(in);
while (s.length() < length) s = "0" + s;
@ -119,7 +119,7 @@ public class Digest {
byte[] h = md5Cache.get(key);
if (h != null) return h;
MessageDigest digest = digestPool.poll();
if (digest == null) {
// if there are no digest objects left, create some on the fly
@ -129,12 +129,14 @@ public class Digest {
digest.reset();
} catch (final NoSuchAlgorithmException e) {
}
} else {
digest.reset(); // they should all be reseted but anyway; this is safe
}
byte[] keyBytes;
keyBytes = UTF8.getBytes(key);
digest.update(keyBytes);
final byte[] result = digest.digest();
digest.reset();
digest.reset(); // to be prepared for next
try {
digestPool.put(digest);
//System.out.println("Digest Pool size = " + digestPool.size());
@ -390,7 +392,7 @@ public class Digest {
}
System.out.println("time: " + (System.currentTimeMillis() - start) + " ms");
// without this this method would never end
Log.shutdown();
}

@ -436,7 +436,9 @@ public final class SwitchboardConstants {
public static final String UPNP_REMOTEHOST = "upnp.remoteHost";
public static final String SEARCH_ITEMS = "search.items";
public static final String SEARCH_TARGET = "search.target";
public static final String SEARCH_TARGET_DEFAULT = "search.target";
public static final String SEARCH_TARGET_SPECIAL = "search.target.special"; // exceptions to the search target
public static final String SEARCH_TARGET_SPECIAL_PATTERN = "search.target.special.pattern"; // ie 'own' addresses in topframe, 'other' in iframe
public static final String SEARCH_VERIFY = "search.verify";
public static final String SEARCH_VERIFY_DELETE = "search.verify.delete";

@ -516,12 +516,13 @@ public final class QueryParams {
context.append(this.modifier.s);
context.append(asterisk);
context.append(this.snippetCacheStrategy == null ? "null" : this.snippetCacheStrategy.name());
String result = context.toString();
if (anonymized) {
this.idCacheAnon = context.toString();
this.idCacheAnon = result;
} else {
this.idCache = context.toString();
this.idCache = result;
}
return context.toString();
return result;
}
/**

@ -307,15 +307,6 @@ public final class RWIProcess extends Thread
}
}
// check tld domain
/*
if ((DigestURI.domDomain(iEntry.metadataHash()) & this.query.zonecode) == 0) {
// filter out all tld that do not match with wanted tld domain
this.sortout++;
continue;
}
*/
// count domZones
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
@ -325,11 +316,6 @@ public final class RWIProcess extends Thread
if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) {
continue pollloop;
}
// no site constraint there; maybe collect host navigation information
if ( nav_hosts && this.query.urlMask_isCatchall ) {
this.hostNavigator.inc(hosthash);
this.hostResolver.put(hosthash, iEntry.urlhash());
}
} else {
if ( !hosthash.equals(this.query.sitehash) ) {
// filter out all domains that do not match with the site constraint
@ -337,6 +323,12 @@ public final class RWIProcess extends Thread
}
}
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
if (this.query.navigators.isEmpty() && (nav_hosts || this.query.urlMask_isCatchall)) {
this.hostNavigator.inc(hosthash);
this.hostResolver.put(hosthash, iEntry.urlhash());
}
// check protocol
if ( !this.query.urlMask_isCatchall ) {
final boolean httpFlagSet = DigestURI.flag4HTTPset(iEntry.urlHash);
@ -675,6 +667,15 @@ public final class RWIProcess extends Thread
continue;
}
// from here: collect navigation information
// collect host navigation information (even if we have only one; this is to provide a switch-off button)
if (!this.query.navigators.isEmpty() && (this.query.urlMask_isCatchall || this.query.navigators.equals("all") || this.query.navigators.indexOf("hosts", 0) >= 0)) {
final String hosthash = page.hosthash();
this.hostNavigator.inc(hosthash);
this.hostResolver.put(hosthash, page.hash());
}
// namespace navigation
String pagepath = page.url().getPath();
if ( (p = pagepath.indexOf(':')) >= 0 ) {
@ -795,9 +796,6 @@ public final class RWIProcess extends Thread
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("namespace", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
if ( this.namespaceNavigator.sizeSmaller(2) ) {
this.namespaceNavigator.clear(); // navigators with one entry are not useful
}
return this.namespaceNavigator;
}
@ -825,9 +823,6 @@ public final class RWIProcess extends Thread
}
}
}
if ( result.sizeSmaller(2) ) {
result.clear(); // navigators with one entry are not useful
}
return result;
}
@ -835,9 +830,6 @@ public final class RWIProcess extends Thread
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("protocol", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
if ( this.protocolNavigator.sizeSmaller(2) ) {
this.protocolNavigator.clear(); // navigators with one entry are not useful
}
return this.protocolNavigator;
}
@ -845,9 +837,6 @@ public final class RWIProcess extends Thread
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("filetype", 0) < 0 ) {
return new ClusteredScoreMap<String>();
}
if ( this.filetypeNavigator.sizeSmaller(2) ) {
this.filetypeNavigator.clear(); // navigators with one entry are not useful
}
return this.filetypeNavigator;
}
@ -945,9 +934,6 @@ public final class RWIProcess extends Thread
if ( !this.query.navigators.equals("all") && this.query.navigators.indexOf("authors", 0) < 0 ) {
return new ConcurrentScoreMap<String>();
}
if ( this.authorNavigator.sizeSmaller(2) ) {
this.authorNavigator.clear(); // navigators with one entry are not useful
}
return this.authorNavigator;
}

@ -190,8 +190,11 @@ public class RankingProfile {
return (coeff.get(attr)).intValue();
}
private String externalStringCache = null;
public String toExternalString() {
return toExternalMap("").toString();
if (this.externalStringCache != null) return this.externalStringCache;
this.externalStringCache = toExternalMap("").toString();
return this.externalStringCache;
}
public Map<String, String> toExternalMap(final String prefix) {
@ -202,43 +205,81 @@ public class RankingProfile {
public Map<String, String> preToExternalMap(final String prefix) {
final Map<String, String> ext = new LinkedHashMap<String, String>(40);
ext.put(prefix + APPEMPH, Integer.toString(this.coeff_appemph));
ext.put(prefix + APPURL, Integer.toString(this.coeff_appurl));
ext.put(prefix + APP_DC_CREATOR, Integer.toString(this.coeff_app_dc_creator));
ext.put(prefix + APP_DC_DESCRIPTION, Integer.toString(this.coeff_app_dc_description));
ext.put(prefix + APP_DC_SUBJECT, Integer.toString(this.coeff_app_dc_subject));
ext.put(prefix + APP_DC_TITLE, Integer.toString(this.coeff_app_dc_title));
ext.put(prefix + AUTHORITY, Integer.toString(this.coeff_authority));
ext.put(prefix + CATHASAPP, Integer.toString(this.coeff_cathasapp));
ext.put(prefix + CATHASAUDIO, Integer.toString(this.coeff_cathasaudio));
ext.put(prefix + CATHASIMAGE, Integer.toString(this.coeff_cathasimage));
ext.put(prefix + CATHASVIDEO, Integer.toString(this.coeff_cathasvideo));
ext.put(prefix + CATINDEXOF, Integer.toString(this.coeff_catindexof));
ext.put(prefix + DATE, Integer.toString(this.coeff_date));
ext.put(prefix + DOMLENGTH, Integer.toString(this.coeff_domlength));
ext.put(prefix + HITCOUNT, Integer.toString(this.coeff_hitcount));
ext.put(prefix + LANGUAGE, Integer.toString(this.coeff_language));
ext.put(prefix + LLOCAL, Integer.toString(this.coeff_llocal));
ext.put(prefix + LOTHER, Integer.toString(this.coeff_lother));
ext.put(prefix + PHRASESINTEXT, Integer.toString(this.coeff_phrasesintext));
ext.put(prefix + POSINPHRASE, Integer.toString(this.coeff_posinphrase));
ext.put(prefix + POSINTEXT, Integer.toString(this.coeff_posintext));
ext.put(prefix + POSOFPHRASE, Integer.toString(this.coeff_posofphrase));
ext.put(prefix + TERMFREQUENCY, Integer.toString(this.coeff_termfrequency));
ext.put(prefix + URLCOMPS, Integer.toString(this.coeff_urlcomps));
ext.put(prefix + URLLENGTH, Integer.toString(this.coeff_urllength));
ext.put(prefix + WORDDISTANCE, Integer.toString(this.coeff_worddistance));
ext.put(prefix + WORDSINTEXT, Integer.toString(this.coeff_wordsintext));
ext.put(prefix + WORDSINTITLE, Integer.toString(this.coeff_wordsintitle));
ext.put(prefix + YBR, Integer.toString(this.coeff_ybr));
if (prefix.length() == 0) {
ext.put(APPEMPH, Integer.toString(this.coeff_appemph));
ext.put(APPURL, Integer.toString(this.coeff_appurl));
ext.put(APP_DC_CREATOR, Integer.toString(this.coeff_app_dc_creator));
ext.put(APP_DC_DESCRIPTION, Integer.toString(this.coeff_app_dc_description));
ext.put(APP_DC_SUBJECT, Integer.toString(this.coeff_app_dc_subject));
ext.put(APP_DC_TITLE, Integer.toString(this.coeff_app_dc_title));
ext.put(AUTHORITY, Integer.toString(this.coeff_authority));
ext.put(CATHASAPP, Integer.toString(this.coeff_cathasapp));
ext.put(CATHASAUDIO, Integer.toString(this.coeff_cathasaudio));
ext.put(CATHASIMAGE, Integer.toString(this.coeff_cathasimage));
ext.put(CATHASVIDEO, Integer.toString(this.coeff_cathasvideo));
ext.put(CATINDEXOF, Integer.toString(this.coeff_catindexof));
ext.put(DATE, Integer.toString(this.coeff_date));
ext.put(DOMLENGTH, Integer.toString(this.coeff_domlength));
ext.put(HITCOUNT, Integer.toString(this.coeff_hitcount));
ext.put(LANGUAGE, Integer.toString(this.coeff_language));
ext.put(LLOCAL, Integer.toString(this.coeff_llocal));
ext.put(LOTHER, Integer.toString(this.coeff_lother));
ext.put(PHRASESINTEXT, Integer.toString(this.coeff_phrasesintext));
ext.put(POSINPHRASE, Integer.toString(this.coeff_posinphrase));
ext.put(POSINTEXT, Integer.toString(this.coeff_posintext));
ext.put(POSOFPHRASE, Integer.toString(this.coeff_posofphrase));
ext.put(TERMFREQUENCY, Integer.toString(this.coeff_termfrequency));
ext.put(URLCOMPS, Integer.toString(this.coeff_urlcomps));
ext.put(URLLENGTH, Integer.toString(this.coeff_urllength));
ext.put(WORDDISTANCE, Integer.toString(this.coeff_worddistance));
ext.put(WORDSINTEXT, Integer.toString(this.coeff_wordsintext));
ext.put(WORDSINTITLE, Integer.toString(this.coeff_wordsintitle));
ext.put(YBR, Integer.toString(this.coeff_ybr));
} else {
ext.put(prefix + APPEMPH, Integer.toString(this.coeff_appemph));
ext.put(prefix + APPURL, Integer.toString(this.coeff_appurl));
ext.put(prefix + APP_DC_CREATOR, Integer.toString(this.coeff_app_dc_creator));
ext.put(prefix + APP_DC_DESCRIPTION, Integer.toString(this.coeff_app_dc_description));
ext.put(prefix + APP_DC_SUBJECT, Integer.toString(this.coeff_app_dc_subject));
ext.put(prefix + APP_DC_TITLE, Integer.toString(this.coeff_app_dc_title));
ext.put(prefix + AUTHORITY, Integer.toString(this.coeff_authority));
ext.put(prefix + CATHASAPP, Integer.toString(this.coeff_cathasapp));
ext.put(prefix + CATHASAUDIO, Integer.toString(this.coeff_cathasaudio));
ext.put(prefix + CATHASIMAGE, Integer.toString(this.coeff_cathasimage));
ext.put(prefix + CATHASVIDEO, Integer.toString(this.coeff_cathasvideo));
ext.put(prefix + CATINDEXOF, Integer.toString(this.coeff_catindexof));
ext.put(prefix + DATE, Integer.toString(this.coeff_date));
ext.put(prefix + DOMLENGTH, Integer.toString(this.coeff_domlength));
ext.put(prefix + HITCOUNT, Integer.toString(this.coeff_hitcount));
ext.put(prefix + LANGUAGE, Integer.toString(this.coeff_language));
ext.put(prefix + LLOCAL, Integer.toString(this.coeff_llocal));
ext.put(prefix + LOTHER, Integer.toString(this.coeff_lother));
ext.put(prefix + PHRASESINTEXT, Integer.toString(this.coeff_phrasesintext));
ext.put(prefix + POSINPHRASE, Integer.toString(this.coeff_posinphrase));
ext.put(prefix + POSINTEXT, Integer.toString(this.coeff_posintext));
ext.put(prefix + POSOFPHRASE, Integer.toString(this.coeff_posofphrase));
ext.put(prefix + TERMFREQUENCY, Integer.toString(this.coeff_termfrequency));
ext.put(prefix + URLCOMPS, Integer.toString(this.coeff_urlcomps));
ext.put(prefix + URLLENGTH, Integer.toString(this.coeff_urllength));
ext.put(prefix + WORDDISTANCE, Integer.toString(this.coeff_worddistance));
ext.put(prefix + WORDSINTEXT, Integer.toString(this.coeff_wordsintext));
ext.put(prefix + WORDSINTITLE, Integer.toString(this.coeff_wordsintitle));
ext.put(prefix + YBR, Integer.toString(this.coeff_ybr));
}
return ext;
}
public Map<String, String> postToExternalMap(final String prefix) {
final Map<String, String> ext = new LinkedHashMap<String, String>();
ext.put(prefix + URLCOMPINTOPLIST, Integer.toString(this.coeff_urlcompintoplist));
ext.put(prefix + DESCRCOMPINTOPLIST, Integer.toString(this.coeff_descrcompintoplist));
ext.put(prefix + PREFER, Integer.toString(this.coeff_prefer));
if (prefix.length() == 0) {
ext.put(URLCOMPINTOPLIST, Integer.toString(this.coeff_urlcompintoplist));
ext.put(DESCRCOMPINTOPLIST, Integer.toString(this.coeff_descrcompintoplist));
ext.put(PREFER, Integer.toString(this.coeff_prefer));
} else {
ext.put(prefix + URLCOMPINTOPLIST, Integer.toString(this.coeff_urlcompintoplist));
ext.put(prefix + DESCRCOMPINTOPLIST, Integer.toString(this.coeff_descrcompintoplist));
ext.put(prefix + PREFER, Integer.toString(this.coeff_prefer));
}
return ext;
}

@ -27,7 +27,7 @@ package net.yacy.search.snippet;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.Date;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@ -51,8 +51,8 @@ import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.repository.Blacklist;
import net.yacy.search.Switchboard;
import de.anomic.crawler.retrieval.Request;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.crawler.retrieval.Request;
public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
@ -117,10 +117,12 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
return Base64Order.enhancedCoder.equal(this.href.hash(), other.href.hash());
}
/**
 * Orders this snippet relative to another one by comparing the hashes of
 * their {@code href} values with {@code Base64Order.enhancedCoder}.
 *
 * @param o the snippet to compare against; it is dereferenced without a
 *          null check, so passing {@code null} raises a NullPointerException
 * @return the result of {@code Base64Order.enhancedCoder.compare} for the two
 *         href hashes (presumably negative/zero/positive in the usual
 *         {@code Comparable} sense; verify against Base64Order)
 */
@Override
public int compareTo(final MediaSnippet o) {
return Base64Order.enhancedCoder.compare(this.href.hash(), o.href.hash());
}
/**
 * {@code Comparator} entry point; delegates to {@code o1.compareTo(o2)} so
 * that both orderings offered by this class always agree.
 *
 * @param o1 the left-hand snippet; it is dereferenced without a null check
 * @param o2 the right-hand snippet, passed on to {@code compareTo}
 * @return whatever {@code o1.compareTo(o2)} returns
 */
@Override
public int compare(final MediaSnippet o1, final MediaSnippet o2) {
return o1.compareTo(o2);
}
@ -217,7 +219,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
private static HandleSet removeAppearanceHashes(final String sentence, final HandleSet queryhashes) {
// remove all hashes that appear in the sentence
if (sentence == null) return queryhashes;
final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null);
final SortedMap<byte[], Integer> hs = WordTokenizer.hashSentence(sentence, null, 100);
final Iterator<byte[]> j = queryhashes.iterator();
byte[] hash;
Integer pos;

@ -497,8 +497,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
private static boolean containsAllHashes(
final String sentence, final HandleSet queryhashes) {
final SortedMap<byte[], Integer> m =
WordTokenizer.hashSentence(sentence, null);
final SortedMap<byte[], Integer> m = WordTokenizer.hashSentence(sentence, null, 100);
for (final byte[] b : queryhashes) {
if (!(m.containsKey(b))) {
return false;

@ -0,0 +1,27 @@
Eislaufen
Drachensteigen
Saunabaden=Sauna
Spazieren und Wandern=Spazieren,Wandern,Park,Ufer
Skaten=Inline,Inliner
Skateboarden=Skateboard
Museumsbesuch=Museum
Altstadtbesuch=Altstadt
Parkbesuch=Park,Zoo,Freilichtmuseum,Burg
Ausflug=Schiffstouren,Schifffahrt,Aussicht,Aussichtspunkt,Geocaching
Schwimmen=Schwimmbad,Freibad,Hallenbad,Schwimmbäder,Freibäder,Hallenbäder,Therme
Fastnacht
Eislaufen
Saunabaden=Sauna
Theater=Bühnen
Oper=Operette,Opernhaus
Spielplätze=Spielplatz
Museum
Camping
Picknick=Picknicken
Essen und Trinken=Essen,Trinken,Bar,Restaurant,Kneipe
Kochen
Singen=Gesang,Chor
Kegeln
Bowling
Kartbahn=Kart
Kino=Kinoprogramm,Cine

@ -0,0 +1,27 @@
Eislaufen
Drachensteigen
Saunabaden=Sauna
Spazieren und Wandern=Spazieren,Wandern,Park,Ufer
Skaten=Inline,Inliner
Skateboarden=Skateboard
Museumsbesuch=Museum
Altstadtbesuch=Altstadt
Parkbesuch=Park,Zoo,Freilichtmuseum,Burg
Ausflug=Schiffstouren,Schifffahrt,Aussicht,Aussichtspunkt,Geocaching
Schwimmen=Schwimmbad,Freibad,Hallenbad,Schwimmbäder,Freibäder,Hallenbäder,Therme
Fastnacht
Eislaufen
Saunabaden=Sauna
Theater=Bühnen
Oper=Operette,Opernhaus
Spielplätze=Spielplatz
Museum
Camping
Picknick=Picknicken
Essen und Trinken=Essen,Trinken,Bar,Restaurant,Kneipe
Kochen
Singen=Gesang,Chor
Kegeln
Bowling
Kartbahn=Kart
Kino=Kinoprogramm,Cine

@ -0,0 +1,19 @@
Veranstaltungen=Veranstaltung,Fest,Party
Stadtteile=Stadtteil,Altstadt,Bahnhofsviertel
Stellenmarkt=Arbeit,Arbeitsamt,Stellengesuche,Stellenangebote,Stellengesuch,Stellenangebot,Job,Jobs
Immobilien=Wohnen,Immobilie,Miete,Mieten,Mietwohnung,Eigentumswohnung
Umwelt=Umweltzone
Verkehr
Gastro=Kulinarisches,Kulinarische,Restaurant,Kneipe,Äpplewoi
Bürgerberatung=Bürgerdialog,Amt
Notdienste=Notdienst
Kinder und Jugendliche=Kinder,Jugendliche
Studierende=Studium,Uni,Universität,Fachhochschule,Student,Studenten,Studentin,Campus
Familie=Familien,Eltern
Neubürger=Neubürgerinnen
Umzug
Notfall
Soziales=Soziale
Gewerbe und Existenzgründung=Gewerbe,Existenzgründung,gmbh,Gründer,Gründung,Börse
Kultur=Museum,Museen
Freizeit

@ -0,0 +1,32 @@
A vocabulary is used to produce search navigation entities.
A search navigation is what you see in the right column
next to the search results, where it is possible to reduce the
set of result entries with given restrictions.
A vocabulary is a restriction where the search results are restricted to
entries which have a specific tag in the subject metadata that corresponds
to the vocabulary restriction. The restriction is expressed with a set of
synonyms for the tag in a property-like file. Such files are activated if
they are present in the folder DATA/DICTIONARIES/autotagging/ at start-up time
and the vocabulary files must be named with a '.vocabulary' extension.
Vocabulary files are similar to property files, with these rules:
- the key represents the vocabulary term (this is what you see in the navigation)
- a value is a list of synonyms for the vocabulary term
- a term is always self-referencing (the term is also a synonym for the term)
- a value may be omitted (a self-referencing-only vocabulary)
The format of a vocabulary file is:
each line has the format
<print-name>[=<synonym>{','<synonym>}*]
or the line starts with a '#' for comment lines
The subdirectories of this directory contain example vocabularies for
specific languages. Vocabularies work best if the vocabulary is expressed in
the same language as the documents that are indexed.
A vocabulary can be activated by doing:
- copy the vocabulary from the <lang>/ subdirectory to DATA/DICTIONARIES/autotagging/
- restart
- index the web pages. Vocabularies cannot be applied to already indexed
web pages because tags are only generated during the parsing process
Loading…
Cancel
Save