removed protocol, tld, ext from the urlmask and created specific

navigation fields for these
pull/1/head
Michael Peter Christen 12 years ago
parent 84f82541e8
commit 433143ba40

@ -228,7 +228,7 @@ public final class search {
null, // no snippet computation null, // no snippet computation
count, count,
0, 0,
filter, filter, null, null, null,
QueryParams.Searchdom.LOCAL, QueryParams.Searchdom.LOCAL,
-1, -1,
null, null,
@ -290,7 +290,7 @@ public final class search {
null, // no snippet computation null, // no snippet computation
count, count,
0, 0,
filter, filter, null, null, null,
QueryParams.Searchdom.LOCAL, QueryParams.Searchdom.LOCAL,
-1, -1,
constraint, constraint,

@ -346,6 +346,9 @@ public class yacysearch {
if ( !block && (post == null || post.get("cat", "href").equals("href")) ) { if ( !block && (post == null || post.get("cat", "href").equals("href")) ) {
String urlmask = null; String urlmask = null;
String protocol = null;
String tld = null;
String ext = null;
// check available memory and clean up if necessary // check available memory and clean up if necessary
if ( !MemoryControl.request(8000000L, false) ) { if ( !MemoryControl.request(8000000L, false) ) {
@ -374,27 +377,27 @@ public class yacysearch {
} }
if ( querystring.indexOf("/https", 0) >= 0 ) { if ( querystring.indexOf("/https", 0) >= 0 ) {
querystring = querystring.replace("/https", ""); querystring = querystring.replace("/https", "");
urlmask = "https://.*"; protocol = "https";
modifier.append("/https "); modifier.append("/https ");
} else if ( querystring.indexOf("/http", 0) >= 0 ) { } else if ( querystring.indexOf("/http", 0) >= 0 ) {
querystring = querystring.replace("/http", ""); querystring = querystring.replace("/http", "");
urlmask = "http://.*"; protocol = "http";
modifier.append("/http "); modifier.append("/http ");
} }
if ( querystring.indexOf("/ftp", 0) >= 0 ) { if ( querystring.indexOf("/ftp", 0) >= 0 ) {
querystring = querystring.replace("/ftp", ""); querystring = querystring.replace("/ftp", "");
urlmask = "ftp://.*"; protocol = "ftp";
modifier.append("/ftp "); modifier.append("/ftp ");
} }
if ( querystring.indexOf("/smb", 0) >= 0 ) { if ( querystring.indexOf("/smb", 0) >= 0 ) {
querystring = querystring.replace("/smb", ""); querystring = querystring.replace("/smb", "");
urlmask = "smb://.*"; protocol = "smb";
modifier.append("/smb "); modifier.append("/smb ");
} }
if ( querystring.indexOf("/file", 0) >= 0 ) { if ( querystring.indexOf("/file", 0) >= 0 ) {
querystring = querystring.replace("/file", ""); querystring = querystring.replace("/file", "");
urlmask = "file://.*"; protocol = "file";
modifier.append("/file "); modifier.append("/file ");
} }
@ -438,19 +441,13 @@ public class yacysearch {
if ( ftb == -1 ) { if ( ftb == -1 ) {
ftb = querystring.length(); ftb = querystring.length();
} }
String ft = querystring.substring(filetype + 9, ftb); ext = querystring.substring(filetype + 9, ftb);
querystring = querystring.replace("filetype:" + ft, ""); querystring = querystring.replace("filetype:" + ext, "");
while ( !ft.isEmpty() && ft.charAt(0) == '.' ) { while ( !ext.isEmpty() && ext.charAt(0) == '.' ) {
ft = ft.substring(1); ext = ext.substring(1);
} }
if ( !ft.isEmpty() ) { modifier.append("filetype:").append(ext).append(' ');
if ( urlmask == null ) { if (ext.isEmpty()) ext = null;
urlmask = ".*\\." + ft + "(\\?.*)?";
} else {
urlmask = urlmask + ".*\\." + ft + "(\\?.*)?";
}
}
modifier.append("filetype:").append(ft).append(' ');
} }
int voc = 0; int voc = 0;
@ -537,9 +534,7 @@ public class yacysearch {
final boolean quotes = (querystring.charAt(authori + 7) == '('); final boolean quotes = (querystring.charAt(authori + 7) == '(');
if ( quotes ) { if ( quotes ) {
int ftb = querystring.indexOf(')', authori + 8); int ftb = querystring.indexOf(')', authori + 8);
if ( ftb == -1 ) { if (ftb == -1) ftb = querystring.length() + 1;
ftb = querystring.length() + 1;
}
author = querystring.substring(authori + 8, ftb); author = querystring.substring(authori + 8, ftb);
querystring = querystring.replace("author:(" + author + ")", ""); querystring = querystring.replace("author:(" + author + ")", "");
modifier.append("author:(").append(author).append(") "); modifier.append("author:(").append(author).append(") ");
@ -554,28 +549,19 @@ public class yacysearch {
} }
} }
final int tld = querystring.indexOf("tld:", 0); final int tldp = querystring.indexOf("tld:", 0);
if ( tld >= 0 ) { if (tldp >= 0) {
int ftb = querystring.indexOf(' ', tld); int ftb = querystring.indexOf(' ', tldp);
if ( ftb == -1 ) { if (ftb == -1) ftb = querystring.length();
ftb = querystring.length(); tld = querystring.substring(tldp + 4, ftb);
} querystring = querystring.replace("tld:" + tld, "");
String domain = querystring.substring(tld + 4, ftb); modifier.append("tld:").append(tld).append(' ');
querystring = querystring.replace("tld:" + domain, ""); while ( tld.length() > 0 && tld.charAt(0) == '.' ) {
modifier.append("tld:").append(domain).append(' '); tld = tld.substring(1);
while ( domain.length() > 0 && domain.charAt(0) == '.' ) {
domain = domain.substring(1);
} }
if ( domain.indexOf('.', 0) < 0 ) { if (tld.length() == 0) tld = null;
domain = "\\." + domain;
} // is tld
if ( domain.length() > 0 ) {
urlmask = "[a-zA-Z]*://[^/]*" + domain + "/.*" + ((urlmask != null) ? urlmask : "");
} }
} if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given
if ( urlmask == null || urlmask.isEmpty() ) {
urlmask = ".*";
} //if no urlmask was given
// read the language from the language-restrict option 'lr' // read the language from the language-restrict option 'lr'
// if no one is given, use the user agent or the system language as default // if no one is given, use the user agent or the system language as default
@ -726,7 +712,7 @@ public class yacysearch {
snippetFetchStrategy, snippetFetchStrategy,
itemsPerPage, itemsPerPage,
startRecord, startRecord,
urlmask, urlmask, protocol, tld, ext,
clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted
? QueryParams.Searchdom.GLOBAL ? QueryParams.Searchdom.GLOBAL
: QueryParams.Searchdom.LOCAL), : QueryParams.Searchdom.LOCAL),

@ -931,6 +931,11 @@ public final class Protocol
keyBody.writeTo(baos); keyBody.writeTo(baos);
key = baos.toString(); key = baos.toString();
} }
String filter = event.query.urlMask.pattern().toString();
if (event.query.tld != null) filter = ".*" + event.query.tld + ".*" + filter;
if (event.query.protocol != null) filter = ".*" + event.query.protocol + ".*" + filter;
if (event.query.ext != null) filter = filter + ".*" + event.query.ext + ".*";
parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key))); parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key)));
parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count)))); parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count))));
parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time)))); parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time))));
@ -940,7 +945,7 @@ public final class Protocol
parts.put("duetime", UTF8.StringBody("1000")); parts.put("duetime", UTF8.StringBody("1000"));
parts.put("urls", UTF8.StringBody(urlhashes)); parts.put("urls", UTF8.StringBody(urlhashes));
parts.put("prefer", UTF8.StringBody(event.query.prefer.pattern())); parts.put("prefer", UTF8.StringBody(event.query.prefer.pattern()));
parts.put("filter", UTF8.StringBody(event.query.urlMask.pattern())); parts.put("filter", UTF8.StringBody(filter));
parts.put("modifier", UTF8.StringBody(modifier)); parts.put("modifier", UTF8.StringBody(modifier));
parts.put("language", UTF8.StringBody(language)); parts.put("language", UTF8.StringBody(language));
parts.put("sitehash", UTF8.StringBody(sitehash)); parts.put("sitehash", UTF8.StringBody(sitehash));

@ -112,6 +112,7 @@ public final class QueryParams {
public int itemsPerPage; public int itemsPerPage;
public int offset; public int offset;
public final Pattern urlMask, prefer; public final Pattern urlMask, prefer;
public final String protocol, tld, ext;
final boolean urlMask_isCatchall; final boolean urlMask_isCatchall;
public final Classification.ContentDomain contentdom; public final Classification.ContentDomain contentdom;
public final String targetlang; public final String targetlang;
@ -166,6 +167,9 @@ public final class QueryParams {
this.maxDistance = Integer.MAX_VALUE; this.maxDistance = Integer.MAX_VALUE;
this.urlMask = catchall_pattern; this.urlMask = catchall_pattern;
this.urlMask_isCatchall = true; this.urlMask_isCatchall = true;
this.protocol = null;
this.tld = null;
this.ext = null;
this.prefer = matchnothing_pattern; this.prefer = matchnothing_pattern;
this.contentdom = ContentDomain.ALL; this.contentdom = ContentDomain.ALL;
this.itemsPerPage = itemsPerPage; this.itemsPerPage = itemsPerPage;
@ -213,7 +217,8 @@ public final class QueryParams {
final String language, final String language,
final Collection<Tagging.Metatag> metatags, final Collection<Tagging.Metatag> metatags,
final CacheStrategy snippetCacheStrategy, final CacheStrategy snippetCacheStrategy,
final int itemsPerPage, final int offset, final String urlMask, final int itemsPerPage, final int offset,
final String urlMask, final String protocol, final String tld, final String ext,
final Searchdom domType, final int domMaxTargets, final Searchdom domType, final int domMaxTargets,
final Bitfield constraint, final boolean allofconstraint, final Bitfield constraint, final boolean allofconstraint,
final String nav_sitehash, final String nav_sitehash,
@ -241,6 +246,9 @@ public final class QueryParams {
throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex); throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex);
} }
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString()); this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
this.protocol = protocol;
this.tld = tld;
this.ext = ext;
try { try {
this.prefer = Pattern.compile(prefer); this.prefer = Pattern.compile(prefer);
} catch (final PatternSyntaxException ex) { } catch (final PatternSyntaxException ex) {
@ -438,26 +446,22 @@ public final class QueryParams {
fq.append(" AND ").append(YaCySchema.author_s.getSolrFieldName()).append(":\"").append(this.author).append('\"'); fq.append(" AND ").append(YaCySchema.author_s.getSolrFieldName()).append(":\"").append(this.author).append('\"');
} }
if (!this.urlMask_isCatchall) { if (this.protocol != null) {
String urlMaskPattern = this.urlMask.pattern(); fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append(this.protocol);
}
// translate filetype navigation if (this.tld != null) {
int extm = urlMaskPattern.indexOf(".*\\."); fq.append(" AND ").append(YaCySchema.host_dnc_s.getSolrFieldName()).append(":\"").append(this.tld).append('\"');
if (extm >= 0) {
String ext = urlMaskPattern.substring(extm + 4);
int k = ext.indexOf('(');
if (k > 0) ext = ext.substring(0, k);
fq.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(ext).append('\"');
} }
// translate protocol navigation if (this.ext != null) {
if (urlMaskPattern.startsWith("http://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("http"); fq.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(this.ext).append('\"');
else if (urlMaskPattern.startsWith("https://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("https"); }
else if (urlMaskPattern.startsWith("ftp://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("ftp");
else if (urlMaskPattern.startsWith("smb://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("smb");
else if (urlMaskPattern.startsWith("file://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("file");
if (!this.urlMask_isCatchall) {
// add a filter query on urls // add a filter query on urls
String urlMaskPattern = this.urlMask.pattern();
// solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?"; // solr doesn't like slashes, backslashes or doublepoints; remove them // urlmask = ".*\\." + ft + "(\\?.*)?";
int p; int p;
while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1); while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);

@ -501,7 +501,15 @@ public final class SearchEvent {
if (this.protocolNavigator != null) { if (this.protocolNavigator != null) {
fcts = facets.get(YaCySchema.url_protocol_s.getSolrFieldName()); fcts = facets.get(YaCySchema.url_protocol_s.getSolrFieldName());
if (fcts != null) this.protocolNavigator.inc(fcts); if (fcts != null) {
// remove all protocols that we don't know
Iterator<String> i = fcts.iterator();
while (i.hasNext()) {
String protocol = i.next();
if ("http,https,smb,ftp,file".indexOf(protocol) < 0) i.remove();
}
this.protocolNavigator.inc(fcts);
}
} }
//fcts = facets.get(YaCySchema.author.getSolrFieldName()); //fcts = facets.get(YaCySchema.author.getSolrFieldName());
//if (fcts != null) this.authorNavigator.inc(fcts); //if (fcts != null) this.authorNavigator.inc(fcts);

Loading…
Cancel
Save