Removed protocol, tld and ext from the urlmask and created specific

navigation fields for these
pull/1/head
Michael Peter Christen 12 years ago
parent 84f82541e8
commit 433143ba40

@ -228,7 +228,7 @@ public final class search {
null, // no snippet computation
count,
0,
filter,
filter, null, null, null,
QueryParams.Searchdom.LOCAL,
-1,
null,
@ -290,7 +290,7 @@ public final class search {
null, // no snippet computation
count,
0,
filter,
filter, null, null, null,
QueryParams.Searchdom.LOCAL,
-1,
constraint,

@ -346,6 +346,9 @@ public class yacysearch {
if ( !block && (post == null || post.get("cat", "href").equals("href")) ) {
String urlmask = null;
String protocol = null;
String tld = null;
String ext = null;
// check available memory and clean up if necessary
if ( !MemoryControl.request(8000000L, false) ) {
@ -374,27 +377,27 @@ public class yacysearch {
}
if ( querystring.indexOf("/https", 0) >= 0 ) {
querystring = querystring.replace("/https", "");
urlmask = "https://.*";
protocol = "https";
modifier.append("/https ");
} else if ( querystring.indexOf("/http", 0) >= 0 ) {
querystring = querystring.replace("/http", "");
urlmask = "http://.*";
protocol = "http";
modifier.append("/http ");
}
if ( querystring.indexOf("/ftp", 0) >= 0 ) {
querystring = querystring.replace("/ftp", "");
urlmask = "ftp://.*";
protocol = "ftp";
modifier.append("/ftp ");
}
if ( querystring.indexOf("/smb", 0) >= 0 ) {
querystring = querystring.replace("/smb", "");
urlmask = "smb://.*";
protocol = "smb";
modifier.append("/smb ");
}
if ( querystring.indexOf("/file", 0) >= 0 ) {
querystring = querystring.replace("/file", "");
urlmask = "file://.*";
protocol = "file";
modifier.append("/file ");
}
@ -438,19 +441,13 @@ public class yacysearch {
if ( ftb == -1 ) {
ftb = querystring.length();
}
String ft = querystring.substring(filetype + 9, ftb);
querystring = querystring.replace("filetype:" + ft, "");
while ( !ft.isEmpty() && ft.charAt(0) == '.' ) {
ft = ft.substring(1);
ext = querystring.substring(filetype + 9, ftb);
querystring = querystring.replace("filetype:" + ext, "");
while ( !ext.isEmpty() && ext.charAt(0) == '.' ) {
ext = ext.substring(1);
}
if ( !ft.isEmpty() ) {
if ( urlmask == null ) {
urlmask = ".*\\." + ft + "(\\?.*)?";
} else {
urlmask = urlmask + ".*\\." + ft + "(\\?.*)?";
}
}
modifier.append("filetype:").append(ft).append(' ');
modifier.append("filetype:").append(ext).append(' ');
if (ext.isEmpty()) ext = null;
}
int voc = 0;
@ -537,9 +534,7 @@ public class yacysearch {
final boolean quotes = (querystring.charAt(authori + 7) == '(');
if ( quotes ) {
int ftb = querystring.indexOf(')', authori + 8);
if ( ftb == -1 ) {
ftb = querystring.length() + 1;
}
if (ftb == -1) ftb = querystring.length() + 1;
author = querystring.substring(authori + 8, ftb);
querystring = querystring.replace("author:(" + author + ")", "");
modifier.append("author:(").append(author).append(") ");
@ -554,28 +549,19 @@ public class yacysearch {
}
}
final int tld = querystring.indexOf("tld:", 0);
if ( tld >= 0 ) {
int ftb = querystring.indexOf(' ', tld);
if ( ftb == -1 ) {
ftb = querystring.length();
}
String domain = querystring.substring(tld + 4, ftb);
querystring = querystring.replace("tld:" + domain, "");
modifier.append("tld:").append(domain).append(' ');
while ( domain.length() > 0 && domain.charAt(0) == '.' ) {
domain = domain.substring(1);
}
if ( domain.indexOf('.', 0) < 0 ) {
domain = "\\." + domain;
} // is tld
if ( domain.length() > 0 ) {
urlmask = "[a-zA-Z]*://[^/]*" + domain + "/.*" + ((urlmask != null) ? urlmask : "");
final int tldp = querystring.indexOf("tld:", 0);
if (tldp >= 0) {
int ftb = querystring.indexOf(' ', tldp);
if (ftb == -1) ftb = querystring.length();
tld = querystring.substring(tldp + 4, ftb);
querystring = querystring.replace("tld:" + tld, "");
modifier.append("tld:").append(tld).append(' ');
while ( tld.length() > 0 && tld.charAt(0) == '.' ) {
tld = tld.substring(1);
}
if (tld.length() == 0) tld = null;
}
if ( urlmask == null || urlmask.isEmpty() ) {
urlmask = ".*";
} //if no urlmask was given
if (urlmask == null || urlmask.isEmpty()) urlmask = ".*"; //if no urlmask was given
// read the language from the language-restrict option 'lr'
// if no one is given, use the user agent or the system language as default
@ -726,7 +712,7 @@ public class yacysearch {
snippetFetchStrategy,
itemsPerPage,
startRecord,
urlmask,
urlmask, protocol, tld, ext,
clustersearch && global ? QueryParams.Searchdom.CLUSTER : (global && indexReceiveGranted
? QueryParams.Searchdom.GLOBAL
: QueryParams.Searchdom.LOCAL),

@ -931,6 +931,11 @@ public final class Protocol
keyBody.writeTo(baos);
key = baos.toString();
}
String filter = event.query.urlMask.pattern().toString();
if (event.query.tld != null) filter = ".*" + event.query.tld + ".*" + filter;
if (event.query.protocol != null) filter = ".*" + event.query.protocol + ".*" + filter;
if (event.query.ext != null) filter = filter + ".*" + event.query.ext + ".*";
parts.put("myseed", UTF8.StringBody((event.peers.mySeed() == null) ? "" : event.peers.mySeed().genSeedStr(key)));
parts.put("count", UTF8.StringBody(Integer.toString(Math.max(10, count))));
parts.put("time", UTF8.StringBody(Long.toString(Math.max(3000, time))));
@ -940,7 +945,7 @@ public final class Protocol
parts.put("duetime", UTF8.StringBody("1000"));
parts.put("urls", UTF8.StringBody(urlhashes));
parts.put("prefer", UTF8.StringBody(event.query.prefer.pattern()));
parts.put("filter", UTF8.StringBody(event.query.urlMask.pattern()));
parts.put("filter", UTF8.StringBody(filter));
parts.put("modifier", UTF8.StringBody(modifier));
parts.put("language", UTF8.StringBody(language));
parts.put("sitehash", UTF8.StringBody(sitehash));

@ -112,6 +112,7 @@ public final class QueryParams {
public int itemsPerPage;
public int offset;
public final Pattern urlMask, prefer;
public final String protocol, tld, ext;
final boolean urlMask_isCatchall;
public final Classification.ContentDomain contentdom;
public final String targetlang;
@ -166,6 +167,9 @@ public final class QueryParams {
this.maxDistance = Integer.MAX_VALUE;
this.urlMask = catchall_pattern;
this.urlMask_isCatchall = true;
this.protocol = null;
this.tld = null;
this.ext = null;
this.prefer = matchnothing_pattern;
this.contentdom = ContentDomain.ALL;
this.itemsPerPage = itemsPerPage;
@ -213,7 +217,8 @@ public final class QueryParams {
final String language,
final Collection<Tagging.Metatag> metatags,
final CacheStrategy snippetCacheStrategy,
final int itemsPerPage, final int offset, final String urlMask,
final int itemsPerPage, final int offset,
final String urlMask, final String protocol, final String tld, final String ext,
final Searchdom domType, final int domMaxTargets,
final Bitfield constraint, final boolean allofconstraint,
final String nav_sitehash,
@ -241,6 +246,9 @@ public final class QueryParams {
throw new IllegalArgumentException("Not a valid regular expression: " + urlMask, ex);
}
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
this.protocol = protocol;
this.tld = tld;
this.ext = ext;
try {
this.prefer = Pattern.compile(prefer);
} catch (final PatternSyntaxException ex) {
@ -438,26 +446,22 @@ public final class QueryParams {
fq.append(" AND ").append(YaCySchema.author_s.getSolrFieldName()).append(":\"").append(this.author).append('\"');
}
if (!this.urlMask_isCatchall) {
String urlMaskPattern = this.urlMask.pattern();
if (this.protocol != null) {
fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append(this.protocol);
}
// translate filetype navigation
int extm = urlMaskPattern.indexOf(".*\\.");
if (extm >= 0) {
String ext = urlMaskPattern.substring(extm + 4);
int k = ext.indexOf('(');
if (k > 0) ext = ext.substring(0, k);
fq.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(ext).append('\"');
}
if (this.tld != null) {
fq.append(" AND ").append(YaCySchema.host_dnc_s.getSolrFieldName()).append(":\"").append(this.tld).append('\"');
}
// translate protocol navigation
if (urlMaskPattern.startsWith("http://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("http");
else if (urlMaskPattern.startsWith("https://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("https");
else if (urlMaskPattern.startsWith("ftp://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("ftp");
else if (urlMaskPattern.startsWith("smb://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("smb");
else if (urlMaskPattern.startsWith("file://.*")) fq.append(" AND ").append(YaCySchema.url_protocol_s.getSolrFieldName()).append(':').append("file");
if (this.ext != null) {
fq.append(" AND ").append(YaCySchema.url_file_ext_s.getSolrFieldName()).append(":\"").append(this.ext).append('\"');
}
if (!this.urlMask_isCatchall) {
// add a filter query on urls
String urlMaskPattern = this.urlMask.pattern();
// solr doesn't like slashes, backslashes or colons; remove them // urlmask = ".*\\." + ft + "(\\?.*)?";
int p;
while ((p = urlMaskPattern.indexOf(':')) >= 0) urlMaskPattern = urlMaskPattern.substring(0, p) + "." + urlMaskPattern.substring(p + 1);

@ -501,7 +501,15 @@ public final class SearchEvent {
if (this.protocolNavigator != null) {
fcts = facets.get(YaCySchema.url_protocol_s.getSolrFieldName());
if (fcts != null) this.protocolNavigator.inc(fcts);
if (fcts != null) {
// remove all protocols that we don't know
Iterator<String> i = fcts.iterator();
while (i.hasNext()) {
String protocol = i.next();
if ("http,https,smb,ftp,file".indexOf(protocol) < 0) i.remove();
}
this.protocolNavigator.inc(fcts);
}
}
//fcts = facets.get(YaCySchema.author.getSolrFieldName());
//if (fcts != null) this.authorNavigator.inc(fcts);

Loading…
Cancel
Save