first attempt to add 'real' Navigation to yacy search results: host navigation

- after a search is started, the number of hits per site is analysed
- this can be done very efficiently, because the navigation information is contained in the url hash and can be computed very fast
- the search result shows a column on the right with the hosts and the hits per host
- after a click on a host, the search is modified using the efficient site: operator

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5976 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 54b9e99c01
commit f246928c20

@ -120,17 +120,18 @@ document.getElementById("Enter").value = "search again";
var progressbar = new Progressbar(#[results]#, document.getElementById("results"));
</script>
#(display)#
::
::
#(navigation)#
::
<div id="sidebar" style="float: right;">
<h3><a href="#">Sidebar-1</a></h3>
<p>Sidebar-1 TEXT TEXT</p>
<h3><a href="#">Sidebar-2</a></h3>
<p>Sidebar-2 TEXT TEXT</p>
<h3><a href="#">Navigation</a></h3>
<h4><a href="#">Domains</a></h4>
<ul>
#{domains}#
<li>#[domain]#</li>
#{/domains}#
</ul>
</div>
#(/display)#
#(/navigation)#
<!-- linklist begin -->
#(resultTable)#::<table width="100%"><tr class="TableHeader"><td width="30%">Media</td><td width="70%">URL</tr>#(/resultTable)#

@ -28,6 +28,7 @@
// if the shell's current path is HTROOT
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;
@ -47,6 +48,7 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
import de.anomic.server.serverCore;
import de.anomic.server.serverDomains;
import de.anomic.server.serverObjects;
@ -464,7 +466,7 @@ public class yacysearch {
resnav.append(navurla(thispage - 1, display, theQuery, originalUrlMask));
resnav.append("<strong>&lt;</strong></a>&nbsp;");
*/
resnav.append(navurla(thispage - 1, display, theQuery, originalUrlMask));
resnav.append(navurla(thispage - 1, display, theQuery, originalUrlMask, null));
resnav.append("<img src=\"env/grafics/navdl.gif\" width=\"16\" height=\"16\"></a>&nbsp;");
}
final int numberofpages = Math.min(10, Math.max(thispage + 2, totalcount / theQuery.displayResults()));
@ -484,7 +486,7 @@ public class yacysearch {
resnav.append(i + 1);
resnav.append("</a>&nbsp;");
*/
resnav.append(navurla(i, display, theQuery, originalUrlMask));
resnav.append(navurla(i, display, theQuery, originalUrlMask, null));
resnav.append("<img src=\"env/grafics/navd");
resnav.append(i + 1);
resnav.append(".gif\" width=\"16\" height=\"16\"></a>&nbsp;");
@ -498,12 +500,26 @@ public class yacysearch {
resnav.append(navurla(thispage + 1, display, theQuery, originalUrlMask));
resnav.append("<strong>&gt;</strong></a>");
*/
resnav.append(navurla(thispage + 1, display, theQuery, originalUrlMask));
resnav.append(navurla(thispage + 1, display, theQuery, originalUrlMask, null));
resnav.append("<img src=\"env/grafics/navdr.gif\" width=\"16\" height=\"16\"></a>");
}
prop.put("num-results_resnav", resnav.toString());
// compose search navigation
ArrayList<hostnaventry> hostNavigator = theSearch.getHostNavigator(10);
if (hostNavigator == null) {
prop.put("navigation", 0);
} else {
prop.put("navigation", 1);
hostnaventry entry;
for (int i = 0; i < hostNavigator.size(); i++) {
entry = hostNavigator.get(i);
prop.put("navigation_domains_" + i + "_domain", navurla(thispage, display, theQuery, originalUrlMask, "site:" + entry.host) + entry.host + " (" + entry.count + ")</a>");
}
prop.put("navigation_domains", hostNavigator.size());
}
// generate the search result lines; they will be produced by another servlet
// generate the search result lines; the content will be produced by another servlet
for (int i = 0; i < theQuery.displayResults(); i++) {
prop.put("results_" + i + "_item", offset + i);
prop.put("results_" + i + "_eventID", theQuery.id(false));
@ -573,10 +589,10 @@ public class yacysearch {
/**
* generates the page navigation bar
*/
private static String navurla(final int page, final int display, final plasmaSearchQuery theQuery, final String originalUrlMask) {
private static String navurla(final int page, final int display, final plasmaSearchQuery theQuery, final String originalUrlMask, String addToQuery) {
return
"<a href=\"yacysearch.html?display=" + display +
"&amp;search=" + theQuery.queryString(true) +
"&amp;search=" + theQuery.queryString(true) + ((addToQuery == null) ? "" : "+" + addToQuery) +
"&amp;maximumRecords="+ theQuery.displayResults() +
"&amp;startRecord=" + (page * theQuery.displayResults()) +
"&amp;resource=" + ((theQuery.isLocal()) ? "local" : "global") +

@ -52,6 +52,7 @@ import de.anomic.kelondro.util.SortStore;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.plasma.plasmaSearchRankingProcess.hostnaventry;
import de.anomic.plasma.plasmaSnippetCache.MediaSnippet;
import de.anomic.server.serverProfiling;
import de.anomic.yacy.yacySearch;
@ -94,7 +95,8 @@ public final class plasmaSearchEvent {
TreeSet<byte[]> snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets
long urlRetrievalAllTime;
long snippetComputationAllTime;
ResultURLs crawlResults;
public ResultURLs crawlResults;
public ArrayList<hostnaventry> hostNavigator;
@SuppressWarnings("unchecked")
private plasmaSearchEvent(final plasmaSearchQuery query,
@ -135,6 +137,7 @@ public final class plasmaSearchEvent {
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
// do a global search
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, max_results_preparation, 16);
this.hostNavigator = null;
final int fetchpeers = 12;
@ -171,6 +174,7 @@ public final class plasmaSearchEvent {
// do a local search
this.rankedCache = new plasmaSearchRankingProcess(wordIndex, query, max_results_preparation, 2);
this.rankedCache.execQuery();
this.hostNavigator = rankedCache.getHostNavigator(10);
//plasmaWordIndex.Finding finding = wordIndex.retrieveURLs(query, false, 2, ranking, process);
if (generateAbstracts) {
@ -230,6 +234,7 @@ public final class plasmaSearchEvent {
// so following sortings together with the global results will be fast
try {
rankedCache.execQuery();
hostNavigator = rankedCache.getHostNavigator(10);
} catch (final Exception e) {
e.printStackTrace();
}
@ -563,51 +568,15 @@ public final class plasmaSearchEvent {
Log.logInfo("search", "sorted out hash " + urlhash + " during search: " + reason);
}
/*
public ResultEntry oneResult(final int item) {
return oneResult(item, System.currentTimeMillis() + 100);
}
public ResultEntry oneResult(final int item, long timeout) {
// check if we already retrieved this item (happens if a search pages is accessed a second time)
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), "obtain one result entry - start", 0, 0));
if (this.result.sizeStore() > item) {
// we have the wanted result already in the result array .. return that
return this.result.element(item).element;
}
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
(query.domType == plasmaSearchQuery.SEARCHDOM_CLUSTERALL)) {
// this is a search using remote search threads. Also the local search thread is started as background process
while (
localSearchThread != null &&
localSearchThread.isAlive() &&
System.currentTimeMillis() < timeout) {
// in case that the local search takes longer than some other remote search requests,
// do some sleeps to give the local process a chance to contribute
try {Thread.sleep(10);} catch (final InterruptedException e) {}
}
// now wait until as many remote worker threads have finished, as we want to display results
while (
this.primarySearchThreads != null &&
anyWorkerAlive() &&
countWorkerFinished() <= item &&
System.currentTimeMillis() < timeout &&
(result.size() <= item || countFinishedRemoteSearch() <= item)) {
try {Thread.sleep(10);} catch (final InterruptedException e) {}
}
}
// finally wait until enough results are there produced from the snippet fetch process
while (anyWorkerAlive() && result.size() <= item) {
try {Thread.sleep(10);} catch (final InterruptedException e) {}
}
// finally, if there is something, return the result
if (this.result.size() <= item) return null;
return this.result.element(item).element;
/**
 * Returns the host navigation entries for this search event.
 *
 * A previously stored navigator is returned as-is. Otherwise, if the local
 * search thread is still alive, this method pauses once for 100 ms before it
 * asks the ranking process for the navigator. An empty navigator is never
 * stored or returned; null is returned in that case so that a later call
 * computes it again.
 *
 * @param maxentries maximum number of host entries requested from the ranking
 *                   process; has no effect when a stored navigator already exists
 * @return the host navigation entries, or null if none are available
 */
public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
    if (this.hostNavigator != null) return this.hostNavigator;
    if (localSearchThread != null && localSearchThread.isAlive()) {
        try {
            Thread.sleep(100L);
        } catch (final InterruptedException e) {
            // do not lose the interruption request; hand it on to the caller's thread state
            Thread.currentThread().interrupt();
        }
    }
    // honour the requested size instead of a hard-coded limit
    final ArrayList<hostnaventry> navigator = rankedCache.getHostNavigator(maxentries);
    // assign the field only once, so readers never observe a transient empty list
    this.hostNavigator = (navigator == null || navigator.size() == 0) ? null : navigator;
    return this.hostNavigator;
}
*/
public ResultEntry oneResult(final int item) {
// check if we already retrieved this item (happens if a search

@ -74,6 +74,7 @@ public final class plasmaSearchRankingProcess {
private final plasmaWordIndex wordIndex;
private HashMap<byte[], ReferenceContainer<WordReference>>[] localSearchContainerMaps;
private final int[] domZones;
private HashMap<String, hoststat> hostNavigator;
public plasmaSearchRankingProcess(
final plasmaWordIndex wordIndex,
@ -101,6 +102,7 @@ public final class plasmaSearchRankingProcess {
this.flagcount = new int[32];
for (int i = 0; i < 32; i++) {this.flagcount[i] = 0;}
this.domZones = new int[8];
this.hostNavigator = new HashMap<String, hoststat>();
for (int i = 0; i < 8; i++) {this.domZones[i] = 0;}
}
@ -158,6 +160,8 @@ public final class plasmaSearchRankingProcess {
final Iterator<WordReferenceVars> i = decodedEntries.iterator();
WordReferenceVars iEntry;
Long r;
hoststat hs;
String domhash;
while (i.hasNext()) {
iEntry = i.next();
assert (iEntry.metadataHash().length() == index.row().primaryKeyLength);
@ -196,13 +200,17 @@ public final class plasmaSearchRankingProcess {
}
// count domZones
/*
indexURLEntry uentry = wordIndex.loadedURL.load(iEntry.urlHash, iEntry, 0); // this eats up a lot of time!!!
yacyURL uurl = (uentry == null) ? null : uentry.comp().url();
System.out.println("DEBUG domDomain dom=" + ((uurl == null) ? "null" : uurl.getHost()) + ", zone=" + yacyURL.domDomain(iEntry.urlHash()));
*/
this.domZones[yacyURL.domDomain(iEntry.metadataHash())]++;
// get statistics for host navigator
domhash = iEntry.urlHash.substring(6);
hs = this.hostNavigator.get(domhash);
if (hs == null) {
this.hostNavigator.put(domhash, new hoststat(iEntry.urlHash));
} else {
hs.inc();
}
// insert
if ((maxentries < 0) || (stack.size() < maxentries)) {
// in case that we don't have enough yet, accept any new entry
@ -225,6 +233,51 @@ public final class plasmaSearchRankingProcess {
//if ((query.neededResults() > 0) && (container.size() > query.neededResults())) remove(true, true);
serverProfiling.update("SEARCH", new plasmaProfiling.searchEvent(query.id(true), plasmaSearchEvent.PRESORT, index.size(), System.currentTimeMillis() - timer), false);
}
/**
 * Hit counter for one host, filled while statistics for the host navigator
 * are collected. It keeps the number of counted entries together with the
 * url hash that was passed in when the counter was created.
 */
public class hoststat {

    /** number of entries counted so far; a fresh counter starts at 1 */
    public int count = 1;

    /** the url hash handed to the constructor */
    public String hashsample;

    /**
     * Creates a counter with an initial count of 1.
     *
     * @param urlhash url hash to remember as sample
     */
    public hoststat(String urlhash) {
        hashsample = urlhash;
    }

    /** Raises the count by one. */
    public void inc() {
        count += 1;
    }
}
/**
 * One line of the host navigation: a host name and the number of hits
 * that were counted for it.
 */
public class hostnaventry {

    /** number of hits counted for the host */
    public int count;

    /** the host name */
    public String host;

    /**
     * @param host  the host name
     * @param count number of hits counted for the host
     */
    public hostnaventry(String host, int count) {
        this.count = count;
        this.host = host;
    }
}
/**
 * Computes the host navigation list from the per-host statistics that were
 * collected in the hostNavigator map.
 *
 * The hosts are ranked by their hit count. For each host, starting with the
 * highest count, the metadata of the stored sample url hash is loaded to
 * obtain the host name. Hosts whose metadata or url cannot be loaded are
 * skipped and the next-ranked host is tried instead.
 *
 * @param maxentries maximum number of entries in the returned list
 * @return a list of at most maxentries host entries; never null, possibly empty
 */
public ArrayList<hostnaventry> getHostNavigator(int maxentries) {
    // rank all collected host hashes by their hit count
    ScoreCluster<String> score = new ScoreCluster<String>();
    for (Map.Entry<String, hoststat> hsentry: this.hostNavigator.entrySet()) {
        score.addScore(hsentry.getKey(), hsentry.getValue().count);
    }

    ArrayList<hostnaventry> result = new ArrayList<hostnaventry>();
    // upper bound for the loop: every ranked host is looked at once at most
    final int candidates = score.size();
    String hosthash;
    hoststat hs;
    URLMetadataRow mr;
    yacyURL url;
    int hits;
    for (int i = 0; i < candidates && result.size() < maxentries; i++) {
        hosthash = score.getMaxObject();
        hits = score.getScore(hosthash);
        // remove the host from the ranking before resolving it; otherwise a host
        // that cannot be resolved would stay the maximum and be picked again and again
        score.deleteScore(hosthash);
        hs = this.hostNavigator.get(hosthash);
        mr = wordIndex.metadata().load(hs.hashsample, null, 0);
        if (mr == null) continue;
        url = mr.metadata().url();
        if (url == null) continue;
        result.add(new hostnaventry(url.getHost(), hits));
    }
    return result;
}
private boolean testFlags(final WordReference ientry) {
if (query.constraint == null) return true;

@ -21,25 +21,6 @@
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
package de.anomic.yacy;

Loading…
Cancel
Save