diff --git a/htroot/index.html b/htroot/index.html index 9a737d76b..c7425c49c 100644 --- a/htroot/index.html +++ b/htroot/index.html @@ -31,7 +31,6 @@ -
#(searchtext)#::  #(/searchtext)# @@ -40,11 +39,11 @@ #(searchvideo)#::  #(/searchvideo)# #(searchapp)#::#(/searchapp)#
+ #(searchoptions)# - diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 770a147bc..850e72849 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -219,7 +219,6 @@ public final class search { indexSegment, rankingProfile ); - theQuery.domType = QueryParams.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); final long timer = System.currentTimeMillis(); @@ -273,7 +272,6 @@ public final class search { sb.indexSegments.segment(Segments.Process.PUBLIC), rankingProfile ); - theQuery.domType = QueryParams.SEARCHDOM_LOCAL; yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links"); RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), "")); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index daa1ba5d5..be4c1da61 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -541,8 +541,8 @@ public class yacysearch { "&resource=" + ((theQuery.isLocal()) ? "local" : "global") + "&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") + "&nav=" + theQuery.navigators + - "&urlmaskfilter=" + originalUrlMask + - "&prefermaskfilter=" + theQuery.prefer + + "&urlmaskfilter=" + originalUrlMask.toString() + + "&prefermaskfilter=" + theQuery.prefer.toString() + "&cat=href&constraint=" + ((theQuery.constraint == null) ? 
"" : theQuery.constraint.exportB64()) + "&contentdom=" + theQuery.contentdom() + "&former=" + theQuery.queryString(true) + diff --git a/source/de/anomic/search/QueryParams.java b/source/de/anomic/search/QueryParams.java index 3f9abf680..116dfdde6 100644 --- a/source/de/anomic/search/QueryParams.java +++ b/source/de/anomic/search/QueryParams.java @@ -59,29 +59,33 @@ public final class QueryParams { public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA"); public static final Bitfield catchall_constraint = new Bitfield(4, "______"); + public static final Pattern catchall_pattern = Pattern.compile(".*"); + public static final Pattern matchnothing_pattern = Pattern.compile(""); - public String queryString; + public final String queryString; public TreeSet fullqueryHashes, queryHashes, excludeHashes; - public int itemsPerPage, offset; - public Pattern urlMask, prefer; - public ContentDomain contentdom; - public String targetlang; - public String navigators; - public int domType; - public int zonecode; - public int domMaxTargets; - public int maxDistance; - public Bitfield constraint; - public boolean allofconstraint; - public boolean onlineSnippetFetch; - public RankingProfile ranking; + public final int itemsPerPage; + public int offset; + public final Pattern urlMask, prefer; + public final boolean urlMask_isCatchall, prefer_isMatchnothing; + public final ContentDomain contentdom; + public final String targetlang; + public final String navigators; + public final int domType; + public final int zonecode; + public final int domMaxTargets; + public final int maxDistance; + public final Bitfield constraint; + public final boolean allofconstraint; + public final boolean onlineSnippetFetch; + public final RankingProfile ranking; private final Segment indexSegment; - public String host; // this is the client host that starts the query, not a site operator - public String sitehash; // this is a domain hash, 6 bytes long or null - public String authorhash; - 
public String tenant; + public final String host; // this is the client host that starts the query, not a site operator + public final String sitehash; // this is a domain hash, 6 bytes long or null + public final String authorhash; + public final String tenant; public yacySeed remotepeer; - public Long handle; + public final Long handle; // values that are set after a search: public int resultcount; // number of found results public long searchtime, urlretrievaltime, snippetcomputationtime; // time to perform the search, to get all the urls, and to compute the snippets @@ -107,11 +111,13 @@ public final class QueryParams { this.ranking = ranking; this.tenant = null; this.maxDistance = Integer.MAX_VALUE; - this.prefer = Pattern.compile(""); + this.urlMask = catchall_pattern; + this.urlMask_isCatchall = true; + this.prefer = matchnothing_pattern; + this.prefer_isMatchnothing = true; this.contentdom = ContentDomain.ALL; this.itemsPerPage = itemsPerPage; this.offset = 0; - this.urlMask = Pattern.compile(".*"); this.targetlang = "en"; this.domType = SEARCHDOM_LOCAL; this.zonecode = DigestURI.TLD_any_zone_filter; @@ -155,11 +161,13 @@ public final class QueryParams { this.tenant = (tenant != null && tenant.length() == 0) ? null : tenant; this.ranking = ranking; this.maxDistance = maxDistance; - this.prefer = Pattern.compile(prefer); this.contentdom = contentdom; this.itemsPerPage = Math.min((specialRights) ? 1000 : 50, itemsPerPage); this.offset = Math.min((specialRights) ? 
10000 : 100, offset); this.urlMask = Pattern.compile(urlMask); + this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString()); + this.prefer = Pattern.compile(prefer); + this.prefer_isMatchnothing = this.prefer.toString().equals(matchnothing_pattern.toString());; assert language != null; this.targetlang = language; this.navigators = navigators; diff --git a/source/de/anomic/search/RankingProcess.java b/source/de/anomic/search/RankingProcess.java index 5da2243e3..acff75be1 100644 --- a/source/de/anomic/search/RankingProcess.java +++ b/source/de/anomic/search/RankingProcess.java @@ -220,7 +220,7 @@ public final class RankingProcess extends Thread { //this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++; // get statistics for host navigator - if (nav_hosts) { + if (nav_hosts && query.urlMask_isCatchall) { domhash = iEntry.urlHash.substring(6); this.hostNavigator.inc(domhash, iEntry.urlHash); } @@ -374,6 +374,7 @@ public final class RankingProcess extends Thread { // returns from the current RWI list the best URL entry and removes this entry from the list long timeLimit = System.currentTimeMillis() + timeout; int p = -1; + String urlhash; while (System.currentTimeMillis() < timeLimit) { final SortStack.stackElement obrwi = takeRWI(skipDoubleDom); if (obrwi == null) { @@ -381,7 +382,8 @@ public final class RankingProcess extends Thread { try {Thread.sleep(50);} catch (final InterruptedException e1) {} continue; } - final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue()); + urlhash = obrwi.element.metadataHash(); + final URIMetadataRow page = this.query.getSegment().urlMetadata().load(urlhash, obrwi.element, obrwi.weight.longValue()); if (page == null) { misses.add(obrwi.element.metadataHash()); continue; @@ -395,12 +397,18 @@ public final class RankingProcess extends Thread { continue; // rare case where the url is corrupted } - // check url mask - if 
(!metadata.matches(query.urlMask)) { - continue; + if (!query.urlMask_isCatchall) { + // check url mask + if (!metadata.matches(query.urlMask)) { + continue; + } + + // in case that we do not have a catchall filter for urls + // we must also construct the domain navigator here + this.hostNavigator.inc(urlhash.substring(6), urlhash); } - // check for more errors + // check for more errors if (metadata.url() == null) { continue; // rare case where the url is corrupted } @@ -539,14 +547,10 @@ public final class RankingProcess extends Thread { int rc = Math.min(count, hsa.length); ArrayList result = new ArrayList(); for (int i = 0; i < rc; i++) result.add(hsa[i]); + if (result.size() < 2) result.clear(); // navigators with one entry are not useful return result; } - public List getHostNavigators(int count) { - if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList(0); - - return this.hostNavigator.entries(10); - } public List getHostNavigator(int count) { List result = new ArrayList(); if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result; @@ -569,6 +573,7 @@ public final class RankingProcess extends Thread { for (Navigator.Item entry: result) if (entry.name.equals(hostname)) continue loop; // check if one entry already exists result.add(new Navigator.Item(hostname, item.count)); } + if (result.size() < 2) result.clear(); // navigators with one entry are not useful return result; } @@ -589,8 +594,9 @@ public final class RankingProcess extends Thread { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics") < 0) return new ArrayList(0); - - return this.ref.entries(10); + List result = this.ref.entries(10); + if (result.size() < 2) result.clear(); // navigators with one entry are not useful + return result; } 
public void addTopic(final String[] words) { @@ -623,8 +629,9 @@ public final class RankingProcess extends Thread { // create a list of words that had been computed by statistics over all // words that appeared in the url or the description of all urls if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList(0); - - return this.authorNavigator.entries(count); + List result = this.authorNavigator.entries(count); + if (result.size() < 2) result.clear(); // navigators with one entry are not useful + return result; } public static void loadYBR(final File rankingPath, final int count) { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 6e69f38b3..025cdab51 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -55,7 +55,7 @@ import net.yacy.kelondro.util.FileUtils; public class Document { - private final DigestURI source; // the source url + private final DigestURI source; // the source url private final String mimeType; // mimeType as taken from http header private final String charset; // the charset of the document private final List keywords; // most resources provide a keyword field