From 433143ba405f60c530ad6daf4d3f4fa820d48534 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 19 Dec 2012 12:45:40 +0100 Subject: [PATCH] removed protocol, tld, ext from the urlmask and created specific navigation field for these --- htroot/yacy/search.java | 4 +- htroot/yacysearch.java | 68 ++++++++----------- source/net/yacy/peers/Protocol.java | 7 +- source/net/yacy/search/query/QueryParams.java | 40 ++++++----- source/net/yacy/search/query/SearchEvent.java | 10 ++- 5 files changed, 66 insertions(+), 63 deletions(-) diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index af377daaa..730d85bcd 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -228,7 +228,7 @@ public final class search { null, // no snippet computation count, 0, - filter, + filter, null, null, null, QueryParams.Searchdom.LOCAL, -1, null, @@ -290,7 +290,7 @@ public final class search { null, // no snippet computation count, 0, - filter, + filter, null, null, null, QueryParams.Searchdom.LOCAL, -1, constraint, diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 402102c96..fdbbb0354 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -346,6 +346,9 @@ public class yacysearch { if ( !block && (post == null || post.get("cat", "href").equals("href")) ) { String urlmask = null; + String protocol = null; + String tld = null; + String ext = null; // check available memory and clean up if necessary if ( !MemoryControl.request(8000000L, false) ) { @@ -374,27 +377,27 @@ public class yacysearch { } if ( querystring.indexOf("/https", 0) >= 0 ) { querystring = querystring.replace("/https", ""); - urlmask = "https://.*"; + protocol = "https"; modifier.append("/https "); } else if ( querystring.indexOf("/http", 0) >= 0 ) { querystring = querystring.replace("/http", ""); - urlmask = "http://.*"; + protocol = "http"; modifier.append("/http "); } if ( querystring.indexOf("/ftp", 0) >= 0 ) { querystring = querystring.replace("/ftp", ""); - urlmask = "ftp://.*"; + protocol = "ftp"; modifier.append("/ftp "); } if ( querystring.indexOf("/smb", 0) >= 0 ) { querystring = querystring.replace("/smb", ""); - urlmask = "smb://.*"; + protocol = "smb"; modifier.append("/smb "); } if ( querystring.indexOf("/file", 0) >= 0 ) { querystring = querystring.replace("/file", ""); - urlmask = "file://.*"; + protocol = "file"; modifier.append("/file "); } @@ -438,19 +441,13 @@ public class yacysearch { if ( ftb == -1 ) { ftb = querystring.length(); } - String ft = querystring.substring(filetype + 9, ftb); - querystring = querystring.replace("filetype:" + ft, ""); - while ( !ft.isEmpty() && ft.charAt(0) == '.' ) { - ft = ft.substring(1); + ext = querystring.substring(filetype + 9, ftb); + querystring = querystring.replace("filetype:" + ext, ""); + while ( !ext.isEmpty() && ext.charAt(0) == '.' ) { + ext = ext.substring(1); } - if ( !ft.isEmpty() ) { - if ( urlmask == null ) { - urlmask = ".*\\." + ft + "(\\?.*)?"; - } else { - urlmask = urlmask + ".*\\." + ft + "(\\?.*)?"; - } - } - modifier.append("filetype:").append(ft).append(' '); + modifier.append("filetype:").append(ext).append(' '); + if (ext.isEmpty()) ext = null; } int voc = 0; @@ -537,9 +534,7 @@ public class yacysearch { final boolean quotes = (querystring.charAt(authori + 7) == '('); if ( quotes ) { int ftb = querystring.indexOf(')', authori + 8); - if ( ftb == -1 ) { - ftb = querystring.length() + 1; - } + if (ftb == -1) ftb = querystring.length() + 1; author = querystring.substring(authori + 8, ftb); querystring = querystring.replace("author:(" + author + ")", ""); modifier.append("author:(").append(author).append(") "); @@ -554,28 +549,19 @@ public class yacysearch { } } - final int tld = querystring.indexOf("tld:", 0); - if ( tld >= 0 ) { - int ftb = querystring.indexOf(' ', tld); - if ( ftb == -1 ) { - ftb = querystring.length(); - } - String domain = querystring.substring(tld + 4, ftb); - querystring = querystring.replace("tld:" + domain, ""); - modifier.append("tld:").append(domain).append(' '); - while ( domain.length() > 0 && domain.charAt(0) == '.' ) { - domain = domain.substring(1); - } - if ( domain.indexOf('.', 0) < 0 ) { - domain = "\\." + domain; - } // is tld - if ( domain.length() > 0 ) { - urlmask = "[a-zA-Z]*://[^/]*" + domain + "/.*" + ((urlmask != null) ? urlmask : ""); + final int tldp = querystring.indexOf("tld:", 0); + if (tldp >= 0) { + int ftb = querystring.indexOf(' ', tldp); + if (ftb == -1) ftb = querystring.length(); + tld = querystring.substring(tldp + 4, ftb); + querystring = querystring.replace("tld:" + tld, ""); + modifier.append("tld:").append(tld).append(' '); + while ( tld.length() > 0 && tld.charAt(0) == '.' ) { + tld = tld.substring(1); } + if (tld.length() == 0) tld = null; } - if ( urlmask == null || urlmask.isEmpty() ) { - urlmask = ".*"; - } //if no urlmask was given + if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given // read the language from the language-restrict option 'lr' // if no one is given, use the user agent or the system language as default @@ -726,7 +712,7 @@ public class yacysearch { snippetFetchStrategy, itemsPerPage, startRecord, - urlmask, + urlmask, protocol, tld, ext, clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted ? QueryParams.Searchdom.GLOBAL : QueryParams.Searchdom.LOCAL), diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 2f8707c2f..ada7e194d 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -931,6 +931,11 @@ public final class Protocol keyBody.writeTo(baos); key = baos.toString(); } + + String filter = event.query.urlMask.pattern().toString(); + if (event.query.tld != null) filter = ".*" + event.query.tld + ".*" + filter; + if (event.query.protocol != null) filter = ".*" + event.query.protocol + ".*" + filter; + if (event.query.ext != null) filter = filter + ".*" + event.query.ext + ".*"; parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key))); parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count)))); parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time)))); @@ -940,7 +945,7 @@ public final class Protocol parts.put("duetime", UTF8.StringBody("1000")); parts.put("urls", UTF8.StringBody(urlhashes)); parts.put("prefer", UTF8.StringBody(event.query.prefer.pattern())); - parts.put("filter", UTF8.StringBody(event.query.urlMask.pattern())); + parts.put("filter", UTF8.StringBody(filter)); parts.put("modifier", UTF8.StringBody(modifier)); parts.put("language", UTF8.StringBody(language)); parts.put("sitehash", UTF8.StringBody(sitehash)); diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java index a1c3c4470..0b3b20342 100644 --- a/source/net/yacy/search/query/QueryParams.java +++ b/source/net/yacy/search/query/QueryParams.java @@ -112,6 +112,7 @@ public final class QueryParams { public int itemsPerPage; public int offset; public final Pattern urlMask, prefer; + public final String protocol, tld, ext; final boolean urlMask_isCatchall; public final Classification.ContentDomain contentdom; public final String targetlang; @@ -166,6 +167,9 @@ public final class QueryParams { this.maxDistance = Integer.MAX_VALUE; this.urlMask = catchall_pattern; this.urlMask_isCatchall = true; + this.protocol = null; + this.tld = null; + this.ext = null; this.prefer = matchnothing_pattern; this.contentdom = ContentDomain.ALL; this.itemsPerPage = itemsPerPage; @@ -213,7 +217,8 @@ public final class QueryParams { final String language, final Collection metatags, final CacheStrategy snippetCacheStrategy, - final int itemsPerPage, final int offset, final String urlMask, + final int itemsPerPage, final int offset, + final String urlMask, final String protocol, final String tld, final String ext, final Searchdom domType, final int domMaxTargets, final Bitfield constraint, final boolean allofconstraint, final String nav_sitehash, @@ -241,6 +246,9 @@ public final class QueryParams { throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex); } this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString()); + this.protocol = protocol; + this.tld = tld; + this.ext = ext; try { this.prefer = Pattern.compile(prefer); } catch (final PatternSyntaxException ex) { @@ -438,26 +446,22 @@ public final class QueryParams { fq.append(" AND ").append(YaCySchema.author_s.getSolrFieldName()).append(":\"").append(this.author).append('\"'); } + if (this.protocol != null) { + fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append(this.protocol); + } + + if (this.tld != null) { + fq.append(" AND ").append(YaCySchema.host_dnc_s.getSolrFieldName()).append(":\"").append(this.tld).append('\"'); + } + + if (this.ext != null) { + fq.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(this.ext).append('\"'); + } + if (!this.urlMask_isCatchall) { + // add a filter query on urls String urlMaskPattern = this.urlMask.pattern(); - // translate filetype navigation - int extm = urlMaskPattern.indexOf(".*\\."); - if (extm >= 0) { - String ext = urlMaskPattern.substring(extm + 4); - int k = ext.indexOf('('); - if (k > 0) ext = ext.substring(0, k); - fq.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(ext).append('\"'); - } - - // translate protocol navigation - if (urlMaskPattern.startsWith("http://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("http"); - else if (urlMaskPattern.startsWith("https://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("https"); - else if (urlMaskPattern.startsWith("ftp://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("ftp"); - else if (urlMaskPattern.startsWith("smb://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("smb"); - else if (urlMaskPattern.startsWith("file://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("file"); - - // add a filter query on urls // solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?"; int p; while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1); diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 1334db377..3f3f8bdce 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -501,7 +501,15 @@ public final class SearchEvent { if (this.protocolNavigator != null) { fcts = facets.get(YaCySchema.url_protocol_s.getSolrFieldName()); - if (fcts != null) this.protocolNavigator.inc(fcts); + if (fcts != null) { + // remove all protocols that we don't know + Iterator i = fcts.iterator(); + while (i.hasNext()) { + String protocol = i.next(); + if ("http,https,smb,ftp,file".indexOf(protocol) < 0) i.remove(); + } + this.protocolNavigator.inc(fcts); + } } //fcts = facets.get(YaCySchema.author.getSolrFieldName()); //if (fcts != null) this.authorNavigator.inc(fcts);