- enhanced domain navigator computation

- fixed domain navigator content for the case where a mustmatch constraint is given

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6763 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent bb63c5d075
commit 6c093d6aed

@ -31,7 +31,6 @@
<input type="hidden" name="display" value="#[display]#" />
<input name="query" id="search" type="text" size="52" maxlength="80" value="#[former]#" />
<input type="submit" name="Enter" value="Search" />
<!-- <input type="hidden" name="former" value="#[former]#" /> uncommented to detect newsearch //-->
<input type="hidden" name="verify" value="true" />
<div class="yacysearch">
#(searchtext)#::<input type="radio" id="text" name="contentdom" value="text" #(check)#::checked="checked"#(/check)# /><label for="text">Text</label>&nbsp;&nbsp;#(/searchtext)#
@ -40,11 +39,11 @@
#(searchvideo)#::<input type="radio" id="video" name="contentdom" value="video" #(check)#::checked="checked"#(/check)# /><label for="video">Video</label>&nbsp;&nbsp;#(/searchvideo)#
#(searchapp)#::<input type="radio" id="app" name="contentdom" value="app" #(check)#::checked="checked"#(/check)# /><label for="app">Applications</label>#(/searchapp)#
</div>
<input type="hidden" name="nav" value="all" />
#(searchoptions)#
<input type="hidden" name="maximumRecords" value="#[maximumRecords]#" />
<input type="hidden" name="startRecord" value="0" />
<input type="hidden" name="resource" value="global" />
<input type="hidden" name="nav" value="all" />
<input type="hidden" name="urlmaskfilter" value=".*" />
<input type="hidden" name="prefermaskfilter" value="" />
<input type="hidden" name="indexof" value="off" />

@ -219,7 +219,6 @@ public final class search {
indexSegment,
rankingProfile
);
theQuery.domType = QueryParams.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
final long timer = System.currentTimeMillis();
@ -273,7 +272,6 @@ public final class search {
sb.indexSegments.segment(Segments.Process.PUBLIC),
rankingProfile
);
theQuery.domType = QueryParams.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + QueryParams.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.displayResults() + " links");
RSSFeed.channels(RSSFeed.REMOTESEARCH).addMessage(new RSSMessage("Remote Search Request from " + ((remoteSeed == null) ? "unknown" : remoteSeed.getName()), QueryParams.anonymizedQueryHashes(theQuery.queryHashes), ""));

@ -541,8 +541,8 @@ public class yacysearch {
"&resource=" + ((theQuery.isLocal()) ? "local" : "global") +
"&verify=" + ((theQuery.onlineSnippetFetch) ? "true" : "false") +
"&nav=" + theQuery.navigators +
"&urlmaskfilter=" + originalUrlMask +
"&prefermaskfilter=" + theQuery.prefer +
"&urlmaskfilter=" + originalUrlMask.toString() +
"&prefermaskfilter=" + theQuery.prefer.toString() +
"&cat=href&amp;constraint=" + ((theQuery.constraint == null) ? "" : theQuery.constraint.exportB64()) +
"&contentdom=" + theQuery.contentdom() +
"&former=" + theQuery.queryString(true) +

@ -59,29 +59,33 @@ public final class QueryParams {
public static final Bitfield empty_constraint = new Bitfield(4, "AAAAAA");
public static final Bitfield catchall_constraint = new Bitfield(4, "______");
public static final Pattern catchall_pattern = Pattern.compile(".*");
public static final Pattern matchnothing_pattern = Pattern.compile("");
public String queryString;
public final String queryString;
public TreeSet<byte[]> fullqueryHashes, queryHashes, excludeHashes;
public int itemsPerPage, offset;
public Pattern urlMask, prefer;
public ContentDomain contentdom;
public String targetlang;
public String navigators;
public int domType;
public int zonecode;
public int domMaxTargets;
public int maxDistance;
public Bitfield constraint;
public boolean allofconstraint;
public boolean onlineSnippetFetch;
public RankingProfile ranking;
public final int itemsPerPage;
public int offset;
public final Pattern urlMask, prefer;
public final boolean urlMask_isCatchall, prefer_isMatchnothing;
public final ContentDomain contentdom;
public final String targetlang;
public final String navigators;
public final int domType;
public final int zonecode;
public final int domMaxTargets;
public final int maxDistance;
public final Bitfield constraint;
public final boolean allofconstraint;
public final boolean onlineSnippetFetch;
public final RankingProfile ranking;
private final Segment indexSegment;
public String host; // this is the client host that starts the query, not a site operator
public String sitehash; // this is a domain hash, 6 bytes long or null
public String authorhash;
public String tenant;
public final String host; // this is the client host that starts the query, not a site operator
public final String sitehash; // this is a domain hash, 6 bytes long or null
public final String authorhash;
public final String tenant;
public yacySeed remotepeer;
public Long handle;
public final Long handle;
// values that are set after a search:
public int resultcount; // number of found results
public long searchtime, urlretrievaltime, snippetcomputationtime; // time to perform the search, to get all the urls, and to compute the snippets
@ -107,11 +111,13 @@ public final class QueryParams {
this.ranking = ranking;
this.tenant = null;
this.maxDistance = Integer.MAX_VALUE;
this.prefer = Pattern.compile("");
this.urlMask = catchall_pattern;
this.urlMask_isCatchall = true;
this.prefer = matchnothing_pattern;
this.prefer_isMatchnothing = true;
this.contentdom = ContentDomain.ALL;
this.itemsPerPage = itemsPerPage;
this.offset = 0;
this.urlMask = Pattern.compile(".*");
this.targetlang = "en";
this.domType = SEARCHDOM_LOCAL;
this.zonecode = DigestURI.TLD_any_zone_filter;
@ -155,11 +161,13 @@ public final class QueryParams {
this.tenant = (tenant != null && tenant.length() == 0) ? null : tenant;
this.ranking = ranking;
this.maxDistance = maxDistance;
this.prefer = Pattern.compile(prefer);
this.contentdom = contentdom;
this.itemsPerPage = Math.min((specialRights) ? 1000 : 50, itemsPerPage);
this.offset = Math.min((specialRights) ? 10000 : 100, offset);
this.urlMask = Pattern.compile(urlMask);
this.urlMask_isCatchall = this.urlMask.toString().equals(catchall_pattern.toString());
this.prefer = Pattern.compile(prefer);
this.prefer_isMatchnothing = this.prefer.toString().equals(matchnothing_pattern.toString());;
assert language != null;
this.targetlang = language;
this.navigators = navigators;

@ -220,7 +220,7 @@ public final class RankingProcess extends Thread {
//this.domZones[DigestURI.domDomain(iEntry.metadataHash())]++;
// get statistics for host navigator
if (nav_hosts) {
if (nav_hosts && query.urlMask_isCatchall) {
domhash = iEntry.urlHash.substring(6);
this.hostNavigator.inc(domhash, iEntry.urlHash);
}
@ -374,6 +374,7 @@ public final class RankingProcess extends Thread {
// returns from the current RWI list the best URL entry and removes this entry from the list
long timeLimit = System.currentTimeMillis() + timeout;
int p = -1;
String urlhash;
while (System.currentTimeMillis() < timeLimit) {
final SortStack<WordReferenceVars>.stackElement obrwi = takeRWI(skipDoubleDom);
if (obrwi == null) {
@ -381,7 +382,8 @@ public final class RankingProcess extends Thread {
try {Thread.sleep(50);} catch (final InterruptedException e1) {}
continue;
}
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(obrwi.element.metadataHash(), obrwi.element, obrwi.weight.longValue());
urlhash = obrwi.element.metadataHash();
final URIMetadataRow page = this.query.getSegment().urlMetadata().load(urlhash, obrwi.element, obrwi.weight.longValue());
if (page == null) {
misses.add(obrwi.element.metadataHash());
continue;
@ -395,12 +397,18 @@ public final class RankingProcess extends Thread {
continue; // rare case where the url is corrupted
}
// check url mask
if (!metadata.matches(query.urlMask)) {
continue;
if (!query.urlMask_isCatchall) {
// check url mask
if (!metadata.matches(query.urlMask)) {
continue;
}
// in case that we do not have a catchall filter for urls
// we must also construct the domain navigator here
this.hostNavigator.inc(urlhash.substring(6), urlhash);
}
// check for more errors
// check for more errors
if (metadata.url() == null) {
continue; // rare case where the url is corrupted
}
@ -539,14 +547,10 @@ public final class RankingProcess extends Thread {
int rc = Math.min(count, hsa.length);
ArrayList<Navigator.Item> result = new ArrayList<Navigator.Item>();
for (int i = 0; i < rc; i++) result.add(hsa[i]);
if (result.size() < 2) result.clear(); // navigators with one entry are not useful
return result;
}
/**
 * Returns the entries of the host navigator for the current query.
 * An empty list is returned unless the query's navigators setting equals "all"
 * or contains the substring "hosts".
 *
 * @param count not read by this method; the fixed value 10 is passed to hostNavigator.entries instead
 * @return the result of hostNavigator.entries(10), or an empty list if the host navigator was not requested
 */
public List<Navigator.Item> getHostNavigators(int count) {
// the navigator is only delivered when requested via "all" or an explicit "hosts" entry
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return new ArrayList<Navigator.Item>(0);
// NOTE(review): the count argument is ignored and a hard-coded 10 is used — confirm whether this is intended
return this.hostNavigator.entries(10);
}
public List<Navigator.Item> getHostNavigator(int count) {
List<Navigator.Item> result = new ArrayList<Navigator.Item>();
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("hosts") < 0) return result;
@ -569,6 +573,7 @@ public final class RankingProcess extends Thread {
for (Navigator.Item entry: result) if (entry.name.equals(hostname)) continue loop; // check if one entry already exists
result.add(new Navigator.Item(hostname, item.count));
}
if (result.size() < 2) result.clear(); // navigators with one entry are not useful
return result;
}
@ -589,8 +594,9 @@ public final class RankingProcess extends Thread {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("topics") < 0) return new ArrayList<Navigator.Item>(0);
return this.ref.entries(10);
List<Navigator.Item> result = this.ref.entries(10);
if (result.size() < 2) result.clear(); // navigators with one entry are not useful
return result;
}
public void addTopic(final String[] words) {
@ -623,8 +629,9 @@ public final class RankingProcess extends Thread {
// create a list of words that had been computed by statistics over all
// words that appeared in the url or the description of all urls
if (!this.query.navigators.equals("all") && this.query.navigators.indexOf("authors") < 0) return new ArrayList<Navigator.Item>(0);
return this.authorNavigator.entries(count);
List<Navigator.Item> result = this.authorNavigator.entries(count);
if (result.size() < 2) result.clear(); // navigators with one entry are not useful
return result;
}
public static void loadYBR(final File rankingPath, final int count) {

@ -55,7 +55,7 @@ import net.yacy.kelondro.util.FileUtils;
public class Document {
private final DigestURI source; // the source url
private final DigestURI source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field

Loading…
Cancel
Save