diff --git a/defaults/yacy.init b/defaults/yacy.init index dcb81f7de..d4172ce96 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -762,6 +762,9 @@ search.navigation=hosts,authors,namespace,topics,filetype,protocol all search results are valid without verification search.verify = iffresh +search.excludehosts= +search.excludehosth= + # in case that a link verification fails then the corresponding index reference can be # deleted to clean up the index. If this property is set then failed index verification in # the cases of nocache, iffresh and ifexist causes an index deletion diff --git a/htroot/ConfigPortal.html b/htroot/ConfigPortal.html index 369a19766..ba4b86885 100644 --- a/htroot/ConfigPortal.html +++ b/htroot/ConfigPortal.html @@ -109,12 +109,18 @@ +
Exclude Hosts
+
Comma-separated list of hosts that are excluded from search results by default; an excluded host can still be searched explicitly using the site:<host> operator:
+
+ #[search.excludehosth]# +
+
'About' Column
(shown in a column alongside
with the search result page)
(Headline)
(Content)
-
 
+
 
   diff --git a/htroot/ConfigPortal.java b/htroot/ConfigPortal.java index 7e6379833..fe61eb69d 100644 --- a/htroot/ConfigPortal.java +++ b/htroot/ConfigPortal.java @@ -26,6 +26,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import net.yacy.cora.protocol.RequestHeader; +import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import de.anomic.data.WorkTables; @@ -93,6 +94,10 @@ public class ConfigPortal { sb.setConfig("about.headline", post.get("about.headline", "")); sb.setConfig("about.body", post.get("about.body", "")); + String excludehosts = post.get("search.excludehosts", ""); + sb.setConfig("search.excludehosts", excludehosts); + sb.setConfig("search.excludehosth", DigestURI.hosthashes(excludehosts)); + // construct navigation String String nav = ""; if (post.getBoolean("search.navigation.hosts", false)) nav += "hosts,"; @@ -126,8 +131,10 @@ public class ConfigPortal { sb.setConfig("search.result.show.pictures", false); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY, "iffresh"); sb.setConfig(SwitchboardConstants.SEARCH_VERIFY_DELETE, "true"); - prop.put("about.headline", ""); - prop.put("about.body", ""); + sb.setConfig("about.headline", ""); + sb.setConfig("about.body", ""); + sb.setConfig("search.excludehosts", ""); + sb.setConfig("search.excludehosth", ""); } } @@ -167,6 +174,9 @@ public class ConfigPortal { prop.put("about.headline", sb.getConfig("about.headline", "")); prop.put("about.body", sb.getConfig("about.body", "")); + prop.put("search.excludehosts", sb.getConfig("search.excludehosts", "")); + prop.put("search.excludehosth", sb.getConfig("search.excludehosth", "")); + final String browserPopUpPage = sb.getConfig(SwitchboardConstants.BROWSER_POP_UP_PAGE, "ConfigBasic.html"); prop.put("popupFront", 0); prop.put("popupSearch", 0); diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index a5bfc0aee..e7db51239 100644 --- 
a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -244,6 +244,7 @@ public final class search { null, false, sitehash, + null, authorhash, DigestURI.TLD_any_zone_filter, client, @@ -305,6 +306,7 @@ public final class search { constraint, false, sitehash, + null, authorhash, DigestURI.TLD_any_zone_filter, client, diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 7ae89fa42..41cd4e58b 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -618,6 +618,7 @@ public class yacysearch { constraint, true, sitehash, + DigestURI.hosthashess(sb.getConfig("search.excludehosth", "")), authorhash, DigestURI.TLD_any_zone_filter, client, diff --git a/source/net/yacy/kelondro/data/meta/DigestURI.java b/source/net/yacy/kelondro/data/meta/DigestURI.java index a4713d551..f80501e49 100644 --- a/source/net/yacy/kelondro/data/meta/DigestURI.java +++ b/source/net/yacy/kelondro/data/meta/DigestURI.java @@ -30,6 +30,8 @@ package net.yacy.kelondro.data.meta; import java.io.File; import java.io.Serializable; import java.net.MalformedURLException; +import java.util.HashSet; +import java.util.Set; import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; @@ -71,6 +73,37 @@ public class DigestURI extends MultiProtocolURI implements Serializable { } return (url == null) ? 
null : ASCII.String(url.hash(), 6, 6); } + + /** + * from a given list of hosts make a list of host hashes + * the list is separated by comma + * @param hostlist + * @return list of host hashes without separation + */ + public static String hosthashes(final String hostlist) { + String[] hs = hostlist.split(","); + StringBuilder sb = new StringBuilder(hostlist.length()); + for (String h: hs) { + if (h == null) continue; + h = h.trim(); + if (h.length() == 0) continue; + h = hosthash(h); + if (h == null || h.length() != 6) continue; + sb.append(h); + } + return sb.toString(); + } + + public static Set hosthashess(String hosthashes) { + if (hosthashes == null || hosthashes.length() == 0) return null; + HashSet h = new HashSet(); + assert hosthashes.length() % 6 == 0; + for (int i = 0; i < hosthashes.length(); i = i + 6) { + h.add(hosthashes.substring(i, i + 6)); + } + return h; + } + /** * DigestURI from File diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index ed9203a9a..57483398f 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -29,8 +29,10 @@ package net.yacy.search.query; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.Set; import java.util.SortedSet; import java.util.TreeSet; import java.util.regex.Matcher; @@ -124,6 +126,7 @@ public final class QueryParams { private final Segment indexSegment; public final String host; // this is the client host that starts the query, not a site operator public final String sitehash; // this is a domain hash, 6 bytes long or null + public final Set siteexcludes; // set of domain hashes that are excluded if not included by sitehash public final String authorhash; public final String tenant; public final Modifier modifier; @@ -182,6 +185,7 @@ public final class 
QueryParams { this.snippetCacheStrategy = null; this.host = null; this.sitehash = null; + this.siteexcludes = null; this.authorhash = null; this.remotepeer = null; this.time = Long.valueOf(System.currentTimeMillis()); @@ -208,6 +212,7 @@ public final class QueryParams { final Searchdom domType, final int domMaxTargets, final Bitfield constraint, final boolean allofconstraint, final String site, + final Set siteexcludes, final String authorhash, final int domainzone, final String host, @@ -250,6 +255,7 @@ public final class QueryParams { this.constraint = constraint; this.allofconstraint = allofconstraint; this.sitehash = site; assert site == null || site.length() == 6; + this.siteexcludes = siteexcludes != null && siteexcludes.size() == 0 ? null: siteexcludes; this.authorhash = authorhash; assert authorhash == null || !authorhash.isEmpty(); this.snippetCacheStrategy = snippetCacheStrategy; this.host = host; @@ -491,6 +497,8 @@ public final class QueryParams { context.append(asterisk); context.append(this.sitehash); context.append(asterisk); + context.append(this.siteexcludes); + context.append(asterisk); context.append(this.authorhash); context.append(asterisk); context.append(this.targetlang); diff --git a/source/net/yacy/search/query/RWIProcess.java b/source/net/yacy/search/query/RWIProcess.java index e221959b9..06a6d856b 100644 --- a/source/net/yacy/search/query/RWIProcess.java +++ b/source/net/yacy/search/query/RWIProcess.java @@ -311,6 +311,9 @@ public final class RWIProcess extends Thread // check site constraints final String hosthash = iEntry.hosthash(); if ( this.query.sitehash == null ) { + if (this.query.siteexcludes != null && this.query.siteexcludes.contains(hosthash)) { + continue pollloop; + } // no site constraint there; maybe collect host navigation information if ( nav_hosts && this.query.urlMask_isCatchall ) { this.hostNavigator.inc(hosthash);