more refactoring of search:

integrated a first version of the SSI-based (server-side include) search interface,
but the feature is currently disabled


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4063 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent f81ef40cc4
commit f9e6cf6a3d

@ -79,7 +79,7 @@ public class ViewImage {
if ((url == null) && (urlLicense.length() > 0)) {
url = sb.licensedURLs.releaseLicense(urlLicense);
urlString = url.toNormalform(true, true);
urlString = (url == null) ? null : url.toNormalform(true, true);
}
if (url == null) return null;

@ -10,5 +10,16 @@
<!--#include virtual="ssitestservlet.html?delay=1000" -->
<!--#include virtual="ssitestservlet.html?delay=2000" -->
<!--#include virtual="ssitestservlet.html?delay=1000" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
<!--#include virtual="ssitestservlet.html?delay=0" -->
</body>
</html>

@ -28,6 +28,7 @@
// javac -classpath .:../../Classes search.java
// if the shell's current path is htroot/yacy
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
@ -38,19 +39,16 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.index.indexContainer;
import de.anomic.net.natLib;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSearchResultAccumulator;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyDHTAction;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;
import de.anomic.tools.crypt;
@ -126,22 +124,19 @@ public final class search {
StringBuffer indexabstract = new StringBuffer();
int indexabstractContainercount = 0;
int joincount = 0;
plasmaSearchQuery squery = null;
//plasmaSearchEvent theSearch = null;
plasmaSearchResultAccumulator accu = null;
plasmaSearchQuery theQuery = null;
ArrayList accu = null;
if ((query.length() == 0) && (abstractSet != null)) {
// this is _not_ a normal search, only a request for index abstracts
squery = new plasmaSearchQuery(abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint);
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + squery.wantedResults + " links");
theQuery = new plasmaSearchQuery(abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.wantedResults + " links");
// prepare a search profile
//plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(contentdom) : new plasmaSearchRankingProfile("", profile);
plasmaSearchProcessing localTiming = new plasmaSearchProcessing(squery.maximumTime, squery.wantedResults);
//plasmaSearchProcessing remoteTiming = null;
plasmaSearchProcessing localTiming = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.wantedResults);
//theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, sb.wordIndex, null);
Map[] containers = localTiming.localSearchContainers(squery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls));
Map[] containers = localTiming.localSearchContainers(theQuery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls));
if (containers != null) {
Iterator ci = containers[0].entrySet().iterator();
Map.Entry entry;
@ -157,90 +152,68 @@ public final class search {
prop.putASIS("indexcount", "");
prop.put("joincount", 0);
prop.putASIS("references", "");
} else {
// retrieve index containers from search request
squery = new plasmaSearchQuery(queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, constraint);
squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + squery.wantedResults + " links");
theQuery = new plasmaSearchQuery(queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, constraint);
theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL;
yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.wantedResults + " links");
// prepare a search profile
plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile);
plasmaSearchProcessing localProcess = new plasmaSearchProcessing(squery.maximumTime, squery.wantedResults);
plasmaSearchProcessing localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.wantedResults);
//plasmaSearchProcessing remoteProcess = null;
//theSearch = new plasmaSearchEvent(squery, rankingProfile, localProcess, remoteProcess, true, sb.wordIndex, null);
Map[] containers = localProcess.localSearchContainers(squery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls));
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, null, sb.wordIndex, null, true, abstractSet);
//Map[] containers = localProcess.localSearchContainers(theQuery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls));
// set statistic details of search result and find best result index set
if (containers == null) {
if (theSearch.getLocalCount() == 0) {
prop.putASIS("indexcount", "");
prop.putASIS("joincount", "0");
} else {
Iterator ci = containers[0].entrySet().iterator();
// attach information about index abstracts
StringBuffer indexcount = new StringBuffer();
Map.Entry entry;
int maxcount = -1;
double mindhtdistance = 1.1, d;
String wordhash;
String maxcounthash = null, neardhthash = null;
while (ci.hasNext()) {
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
if (container.size() > maxcount) {
maxcounthash = wordhash;
maxcount = container.size();
}
d = yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash);
if (d < mindhtdistance) {
// calculate the word hash that is closest to our dht position
mindhtdistance = d;
neardhthash = wordhash;
}
indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString);
if ((abstractSet != null) && (abstractSet.contains(wordhash))) {
// if a specific index-abstract is demanded, attach it here
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + wordhash + "=").append(plasmaURL.compressIndex(container, null,1000).toString()).append(serverCore.crlfString);
Iterator i = theSearch.IACount.entrySet().iterator();
while (i.hasNext()) {
entry = (Map.Entry) i.next();
indexcount.append("indexcount.").append((String) entry.getKey()).append('=').append(((Integer) entry.getValue()).toString()).append(serverCore.crlfString);
}
if (abstractSet != null) {
// if a specific index-abstract is demanded, attach it here
i = abstractSet.iterator();
String wordhash;
while (i.hasNext()) {
wordhash = (String) i.next();
indexabstractContainercount += ((Integer) theSearch.IACount.get(wordhash)).intValue();
indexabstract.append("indexabstract." + wordhash + "=").append((String) theSearch.IAResults.get(wordhash)).append(serverCore.crlfString);
}
}
prop.putASIS("indexcount", new String(indexcount));
// join and order the result
indexContainer localResults =
(containers == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
localProcess.localSearchJoinExclude(
containers[0].values(),
containers[1].values(),
(squery.queryHashes.size() == 0) ?
0 :
localProcess.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * squery.queryHashes.size() / (squery.queryHashes.size() + squery.excludeHashes.size()),
squery.maxDistance);
if (localResults == null) {
if (theSearch.getLocalCount() == 0) {
joincount = 0;
prop.put("joincount", 0);
} else {
joincount = localResults.size();
joincount = theSearch.getLocalCount();
prop.putASIS("joincount", Integer.toString(joincount));
plasmaSearchPreOrder pre = new plasmaSearchPreOrder(squery, localProcess, rankingProfile, localResults);
accu = new plasmaSearchResultAccumulator(squery, localProcess, rankingProfile, pre.strippedContainer(200), sb.wordIndex, plasmaSwitchboard.blueList, false);
accu = theSearch.computeResults(plasmaSwitchboard.blueList, false);
}
// generate compressed index for maxcounthash
// this is not needed if the search is restricted to specific
// urls, because it is a re-search
if ((maxcounthash == null) || (urls.length() != 0) || (queryhashes.size() == 1) || (abstracts.length() == 0)) {
if ((theSearch.IAmaxcounthash == null) || (urls.length() != 0) || (queryhashes.size() == 1) || (abstracts.length() == 0)) {
prop.putASIS("indexabstract", "");
} else if (abstracts.equals("auto")) {
// automatically attach the index abstract for the index that has the most references. This should be our target dht position
indexContainer container = (indexContainer) containers[0].get(maxcounthash);
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + maxcounthash + "=").append(plasmaURL.compressIndex(container,localResults, 1000).toString()).append(serverCore.crlfString);
if ((neardhthash != null) && (!(neardhthash.equals(maxcounthash)))) {
indexabstractContainercount += ((Integer) theSearch.IACount.get(theSearch.IAmaxcounthash)).intValue();
indexabstract.append("indexabstract." + theSearch.IAmaxcounthash + "=").append((String) theSearch.IAResults.get(theSearch.IAmaxcounthash)).append(serverCore.crlfString);
if ((theSearch.IAneardhthash != null) && (!(theSearch.IAneardhthash.equals(theSearch.IAmaxcounthash)))) {
// in case that the neardhthash is different from the maxcounthash attach also the neardhthash-container
container = (indexContainer) containers[0].get(neardhthash);
indexabstractContainercount += container.size();
indexabstract.append("indexabstract." + neardhthash + "=").append(plasmaURL.compressIndex(container, localResults, 1000).toString()).append(serverCore.crlfString);
indexabstractContainercount += ((Integer) theSearch.IACount.get(theSearch.IAneardhthash)).intValue();
indexabstract.append("indexabstract." + theSearch.IAneardhthash + "=").append((String) theSearch.IAResults.get(theSearch.IAneardhthash)).append(serverCore.crlfString);
}
//System.out.println("DEBUG-ABSTRACTGENERATION: maxcounthash = " + maxcounthash);
//System.out.println("DEBUG-ABSTRACTGENERATION: neardhthash = "+ neardhthash);
@ -248,13 +221,20 @@ public final class search {
}
}
if (partitions > 0) sb.requestedQueries = sb.requestedQueries + 1d / (double) partitions; // increase query counter
// prepare reference hints
Object[] ws = theSearch.references();
StringBuffer refstr = new StringBuffer();
for (int j = 0; j < ws.length; j++)
refstr.append(",").append((String) ws[j]);
prop.putASIS("references", (refstr.length() > 0) ? refstr.substring(1) : new String(refstr));
}
prop.putASIS("indexabstract", new String(indexabstract));
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
String client = (String) header.get("CLIENTIP");
HashMap searchProfile = squery.resultProfile((accu == null) ? 0 : accu.resultCount(), System.currentTimeMillis() - timestamp);
HashMap searchProfile = theQuery.resultProfile((accu == null) ? 0 : accu.size(), System.currentTimeMillis() - timestamp);
searchProfile.put("host", client);
yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false);
searchProfile.put("peername", (remotepeer == null) ? "unknown" : remotepeer.getName());
@ -278,23 +258,16 @@ public final class search {
// result is a List of urlEntry elements
StringBuffer links = new StringBuffer();
String resource = null;
plasmaSearchResultAccumulator.Entry entry;
for (int i = 0; i < accu.resultCount(); i++) {
entry = accu.resultEntry(i);
plasmaSearchEvent.Entry entry;
for (int i = 0; i < accu.size(); i++) {
entry = (plasmaSearchEvent.Entry) accu.get(i);
resource = entry.resource();
if (resource != null) {
links.append("resource").append(i).append('=').append(resource).append(serverCore.crlfString);
}
}
prop.putASIS("links", new String(links));
prop.put("linkcount", accu.resultCount());
// prepare reference hints
Object[] ws = accu.references();
StringBuffer refstr = new StringBuffer();
for (int j = 0; j < ws.length; j++)
refstr.append(",").append((String) ws[j]);
prop.putASIS("references", (refstr.length() > 0) ? refstr.substring(1) : new String(refstr));
prop.put("linkcount", accu.size());
}
// add information about forward peers
@ -304,7 +277,7 @@ public final class search {
// log
yacyCore.log.logInfo("EXIT HASH SEARCH: " +
plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + joincount + " links found, " +
plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + joincount + " links found, " +
prop.get("linkcount", "?") + " links selected, " +
indexabstractContainercount + " index abstract references attached, " +
(System.currentTimeMillis() - timestamp) + " milliseconds");

@ -230,20 +230,29 @@ document.getElementById("Enter").value = "search again - catch up more links";
</tr>
#{/results}#
</table>
<!-- type 4: experimental new text search -->
#(combine)#
::
<p><strong>Refine your search with these topwords</strong>:</p>
<p>
#{words}#
<a href="yacysearch.html?search=#[newsearch]#&amp;Enter=Search&amp;count=#[count]#&amp;resource=#[resource]#&amp;time=#[time]#">#[word]#</a>
#{/words}#
</p>
#(/combine)#
<!-- linklist begin -->
<div id="hidden_results"></div>
#{results}#
<!-- link begin -->
<div class="searchresults" name="searchresults">
<!--#include virtual="yacysearchitem.html?item=#[item]#&eventID=#[eventID]#" -->
</div>
<!-- link end -->
#{/results}#
#(/type)#
#(display)#
<p class="info">
YaCy is a GPL'ed project with the target of implementing a P2P-based global search engine.<br />
Architecture (C) by Michael Peter Christen, <img src="/env/grafics/mcemailh.gif" alt="Mail-Adresse von Michael Peter Christen" />
</p>
#%env/templates/simplefooter.template%#
::
#%env/templates/footer.template%#
::
#%env/templates/embeddedfooter.template%#
#(/display)#
</body>
</html>

@ -50,13 +50,13 @@ import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.http.httpHeader;
import de.anomic.index.indexContainer;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
@ -70,7 +70,6 @@ import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSearchProcessing;
import de.anomic.plasma.plasmaSearchResultAccumulator;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
@ -298,12 +297,11 @@ public class yacysearch {
// create a new search event
String wrongregex = null;
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, ranking, localTiming, remoteTiming, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null);
indexContainer preorder = theSearch.search();
plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, ranking, localTiming, remoteTiming, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, null);
// fetch snippets
// generate result object
serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
plasmaSearchResultAccumulator accu = new plasmaSearchResultAccumulator(theQuery, localTiming, ranking, preorder, sb.wordIndex, plasmaSwitchboard.blueList, true);
ArrayList accu = theSearch.computeResults(plasmaSwitchboard.blueList, true);
serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// calc some more cross-reference
@ -316,13 +314,12 @@ public class yacysearch {
serverLog.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + theQuery.queryString + " - " +
(theSearch.getLocalCount() + theSearch.getGlobalCount()) + " links found, " +
theSearch.filteredCount() + " links filtered, " +
accu.resultCount() + " links ordered, " +
accu.size() + " links ordered, " +
((System.currentTimeMillis() - timestamp) / 1000) + " seconds");
// prepare search statistics
Long trackerHandle = new Long(System.currentTimeMillis());
HashMap searchProfile = theQuery.resultProfile(accu.resultCount(), System.currentTimeMillis() - timestamp);
HashMap searchProfile = theQuery.resultProfile(accu.size(), System.currentTimeMillis() - timestamp);
searchProfile.put("querystring", theQuery.queryString);
searchProfile.put("time", trackerHandle);
searchProfile.put("host", client);
@ -333,22 +330,30 @@ public class yacysearch {
if (handles == null) handles = new TreeSet();
handles.add(trackerHandle);
sb.localSearchTracker.put(client, handles);
//**
//prop=sb.searchFromLocal(thisSearch, ranking, localTiming, remoteTiming, true, (String) header.get("CLIENTIP"));
prop=new serverObjects();
prop = new serverObjects();
//prop.put("references", 0);
URL wordURL=null;
prop.put("num-results_totalcount", theSearch.getLocalCount() + theSearch.getGlobalCount());
prop.put("num-results_filteredcount", theSearch.filteredCount());
prop.put("num-results_orderedcount", accu.resultCount());
prop.put("num-results_orderedcount", accu.size());
prop.put("num-results_globalresults", (theSearch.getGlobalCount() == 0) ? 0 : 1);
prop.put("num-results_globalresults_globalcount", theSearch.getGlobalCount());
prop.put("num-results_linkcount", 0);
/*
for (int i = 0; i < theQuery.wantedResults; i++) {
prop.put("type_results_" + i + "_item", i);
prop.put("type_results_" + i + "_eventID", theQuery.id());
}
prop.put("type_results", theQuery.wantedResults);
*/
//------------------------
prop.put("type_results", 0);
for (int i = 0; i < accu.resultCount(); i++) {
plasmaSearchResultAccumulator.Entry result = accu.resultEntry(i);
URL wordURL=null;
for (int i = 0; i < accu.size(); i++) {
plasmaSearchEvent.Entry result = (plasmaSearchEvent.Entry) accu.get(i);
prop.put("type_results_" + i + "_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? 1 : 0);
prop.put("type_results_" + i + "_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
prop.put("type_results_" + i + "_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
@ -410,22 +415,24 @@ public class yacysearch {
}
prop.put("type_results_" + i + "_snippet", 1);
} else {
/* no snippet available (will be fetched later via ajax) */
// no snippet available (will be fetched later via ajax)
prop.put("type_results_" + i + "_snippet", 0);
prop.put("type_results_" + i + "_snippet_text", "");
}
}
prop.put("type_results", accu.resultCount());
prop.put("num-results_linkcount", Integer.toString(accu.resultCount()));
prop.put("type_results", accu.size());
prop.put("num-results_linkcount", Integer.toString(accu.size()));
}
//------------------------
// process result of search
if (filtered.size() > 0) {
prop.put("excluded", 1);
prop.put("excluded_stopwords", filtered.toString());
} else {
prop.put("excluded", 0);
}
// process result of search
if (filtered.size() > 0) {
prop.put("excluded", 1);
prop.put("excluded_stopwords", filtered.toString());
} else {
prop.put("excluded", 0);
}
if (prop == null || prop.size() == 0) {
if (post.get("search", "").length() < 3) {
@ -440,7 +447,6 @@ public class yacysearch {
prop.put("num-results", 5);
int hintcount = references.length;
if (hintcount > 0) {
prop.put("type_combine", 1);
// get the topwords
final TreeSet topwords = new TreeSet(kelondroNaturalOrder.naturalOrder);
@ -449,8 +455,6 @@ public class yacysearch {
tmp = (String) references[i];
if (tmp.matches("[a-z]+")) {
topwords.add(tmp);
// } else {
// topwords.add("(" + tmp + ")");
}
}
@ -460,13 +464,13 @@ public class yacysearch {
kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.badwords);
}
//avoid stopwords being topwords
// avoid stopwords being topwords
if (env.getConfig("filterOutStopwordsFromTopwords", "true").equals("true")) {
if ((plasmaSwitchboard.stopwords != null) && (plasmaSwitchboard.stopwords.size() > 0)) {
kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.stopwords);
}
if ((plasmaSwitchboard.stopwords != null) && (plasmaSwitchboard.stopwords.size() > 0)) {
kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.stopwords);
}
}
String word;
hintcount = 0;
final Iterator iter = topwords.iterator();
@ -489,11 +493,9 @@ public class yacysearch {
if (wrongregex != null) {
prop.put("num-results_wrong_regex", wrongregex);
prop.put("num-results", 4);
}
else if (totalcount == 0) {
} else if (totalcount == 0) {
prop.put("num-results", 3); // long
}
else {
} else {
prop.put("num-results", 5);
}
}
@ -545,9 +547,10 @@ public class yacysearch {
// if user is not authenticated, he may not vote for URLs
int linkcount = Integer.parseInt(prop.get("num-results_linkcount", "0"));
for (int i=0; i<linkcount; i++)
for (int i=0; i<linkcount; i++) {
prop.put("type_results_" + i + "_authorized", (authenticated) ? 1 : 0);
}
prop.put("searchagain", (global) ? 1 : 0);
prop.put("input", input);
prop.put("display", display);

@ -0,0 +1,18 @@
#(content)#::
#(authorized)#::
<div class="urlactions">
<a href="/Bookmarks.html?edit=#[urlhash]#" class="bookmarklink" title="bookmark"><img src="/env/grafics/empty.gif" title="bookmark" alt="bookmark" class="bookmarkIcon" /></a>
#(recommend)#
<img src="/env/grafics/empty.gif" title="" alt="" class="recommendIcon" />
<img src="/env/grafics/empty.gif" title="" alt="" class="deleteIcon" />
::
<a href="#[recommendlink]#" class="recommendlink" title="recommend"><img src="/env/grafics/empty.gif" title="recommend" alt="recommend" class="recommendIcon" /></a>
<a href="#[deletelink]#" title="delete" class="deletelink" ><img src="/env/grafics/empty.gif" title="delete" alt="delete" class="deleteIcon" /></a>
#(/recommend)#
</div>
#(/authorized)#
<h4 class="linktitle"><img src="ViewImage.png?width=16&amp;height=16&amp;code=#[faviconCode]#" id="f#[urlhash]#" class="favicon" width="16" height="16" alt="" /><a href="#[url]#" target="_parent">#[description]#</a></h4>
<p class="snippet iconindented"><span class="snippetLoaded" id="h#[urlhash]#">#[snippet]#</span></p>
<p class="url iconindented"><a href="#[url]#" id="url#[urlhash]#" target="_parent">#[urlname]#</a></p>
<p class="urlinfo iconindented">#[date]# | YBR-#[ybr]# | <a href="ViewFile.html?urlHash=#[urlhash]#&amp;words=#[words]#">Info</a> | <a href="yacysearch.html?cat=image&amp;url=#[url]#&amp;search=#[former]#">Pictures</a></p>
#(/content)#

@ -0,0 +1,145 @@
// yacysearchitem.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 28.08.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.TreeSet;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchPreOrder;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaSearchRankingProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.tools.crypt;
import de.anomic.tools.nxTools;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacySeed;
/**
 * Servlet behind <code>yacysearchitem.html</code>: fills the template
 * properties for one single result entry of an existing search event.
 * The entry is addressed by the request arguments <code>eventID</code>
 * and <code>item</code>.
 */
public class yacysearchitem {

    /**
     * Builds the template properties for one search result item.
     *
     * @param header the request header; used for the admin authentication check
     * @param post   the request arguments; reads <code>eventID</code> (default "")
     *               and <code>item</code> (default -1)
     * @param env    the switchboard, cast to {@link plasmaSwitchboard}
     * @return the properties for the template. If the request cannot be mapped
     *         to a result entry (no arguments, no search event for the given
     *         id, or <code>item</code> outside the computed result list),
     *         <code>content</code> is set to 0, which selects the empty
     *         alternative of the template, instead of throwing.
     */
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        final plasmaSwitchboard sb = (plasmaSwitchboard) env;
        final serverObjects prop = new serverObjects();

        // NOTE(review): presumably post is null when the servlet is requested
        // without any arguments - confirm against the servlet dispatcher
        if (post == null) {
            prop.put("content", 0); // switch off content
            return prop;
        }

        String eventID = post.get("eventID", "");
        int item = post.getInt("item", -1);
        boolean authenticated = sb.adminAuthenticated(header) >= 2;

        // find search event
        plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(eventID);
        if (theSearch == null) {
            // NOTE(review): assumes getEvent yields null for an unknown event id - confirm
            prop.put("content", 0); // switch off content
            return prop;
        }
        plasmaSearchQuery theQuery = theSearch.getQuery();
        plasmaSearchRankingProfile ranking = theSearch.getRanking();

        long startprofiling = System.currentTimeMillis();

        // generate result object
        ArrayList accu = theSearch.computeResults(plasmaSwitchboard.blueList, true);

        // the default item (-1) or an index beyond the result list would make
        // accu.get(item) throw an IndexOutOfBoundsException
        if ((accu == null) || (item < 0) || (item >= accu.size())) {
            prop.put("content", 0); // switch off content
            return prop;
        }
        plasmaSearchEvent.Entry result = (plasmaSearchEvent.Entry) accu.get(item);
        System.out.println("PROFILING_DEBUG: " + (System.currentTimeMillis() - startprofiling) + " millisekunden fuer item " + item);

        prop.put("content", 1); // switch on content
        prop.put("content_authorized", (authenticated) ? 1 : 0);
        prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? 1 : 0);
        prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*");
        prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*");
        prop.put("content_authorized_urlhash", result.hash());

        prop.put("content_description", result.title());
        prop.put("content_url", result.urlstring());

        // build the favicon location from protocol, host and (if given) port of the result url
        int port = result.url().getPort();
        URL faviconURL;
        try {
            faviconURL = new URL(result.url().getProtocol() + "://" + result.url().getHost() + ((port != -1) ? (":" + String.valueOf(port)) : "") + "/favicon.ico");
        } catch (MalformedURLException e1) {
            // NOTE(review): a null url is handed to aquireLicense below in this case - confirm it is accepted
            faviconURL = null;
        }

        prop.put("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading
        prop.put("content_urlhash", result.hash());
        prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(result.hash()));
        prop.put("content_urlname", nxTools.shortenURLString(result.urlname(), 120));
        prop.put("content_date", plasmaSwitchboard.dateString(result.modified()));
        prop.put("content_ybr", plasmaSearchPreOrder.ybr(result.hash()));
        prop.put("content_size", Long.toString(result.filesize()));

        TreeSet[] query = theQuery.queryWords();
        URL wordURL = null;
        try {
            prop.put("content_words", URLEncoder.encode(query[0].toString(), "UTF-8"));
        } catch (UnsupportedEncodingException e) {
            // UTF-8 is a charset every Java platform must support, so this
            // should not happen; fall back to an empty value instead of
            // leaving the property unset
            prop.put("content_words", "");
        }
        prop.put("content_former", theQuery.queryString);
        prop.put("content_rankingprops", result.word().toPropertyForm() + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(result.hash()) +
                ((plasmaURL.probablyRootURL(result.hash())) ? ", probablyRootURL" : "") +
                (((wordURL = plasmaURL.probablyWordURL(result.hash(), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : ""));

        // snippet computation is disabled for now; the code below is kept for its re-activation
        /*
        // adding snippet if available
        if (result.hasSnippet()) {
            prop.put("content_snippet", result.textSnippet().getLineMarked(theQuery.queryHashes));
        } else {
            // snippet fetch timeout
            int textsnippet_timeout = Integer.parseInt(env.getConfig("timeout_media", "10000"));

            // boolean line_end_with_punctuation
            boolean pre = post.get("pre", "false").equals("true");

            // if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed
            boolean remove = post.get("remove", "false").equals("true");

            plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet(
                    result.url(),
                    theQuery.queryHashes,
                    true,
                    pre,
                    260,
                    textsnippet_timeout
            );

            if (snippet.getErrorCode() < 11) {
                // no problems occurred
                //prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown");
                prop.putASIS("content_snippet", (snippet.exists()) ? snippet.getLineMarked(theQuery.queryHashes) : "unknown");
            } else {
                // problems with snippet fetch
                prop.put("content_snippet", (remove) ? plasmaSnippetCache.failConsequences(snippet, theQuery.id()) : snippet.getError());
            }
        }
        */
        prop.put("content_snippet","temporary no snippet computed");

        return prop;
    }

}

@ -64,11 +64,13 @@ public class URLLicense {
synchronized (permissions) {
url = (URL) permissions.remove(license);
}
/*
if (url == null) {
System.out.println("DEBUG-URLLICENSE: no URL license present for code=" + license);
} else {
System.out.println("DEBUG-URLLICENSE: granted download of " + url.toString());
}
*/
return url;
}

@ -900,7 +900,7 @@ public final class httpdFileHandler {
try {out.flush();}catch (Exception e) {}
if (((String)requestHeader.get(httpHeader.CONNECTION, "close")).indexOf("keep-alive") == -1) {
// wait a little time until everything closes so that clients can read from the streams/sockets
try {Thread.sleep(200);} catch (InterruptedException e) {}
//try {Thread.sleep(200);} catch (InterruptedException e) {} // FIXME: is this necessary?
}
}
}

@ -1,6 +1,6 @@
// indexCollectionRI.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 03.07.2006 on http://www.anomic.de
// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 03.07.2006 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//

@ -26,19 +26,28 @@
package de.anomic.plasma;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyDHTAction;
import de.anomic.yacy.yacySearch;
import de.anomic.yacy.yacySeed;
public final class plasmaSearchEvent {
@ -62,13 +71,18 @@ public final class plasmaSearchEvent {
private int lastglobal;
private int filteredCount;
private ArrayList display; // an array of url hashes of urls that had been displayed as search result after this search
private Object[] references;
public TreeMap IAResults, IACount;
public String IAmaxcounthash, IAneardhthash;
private plasmaSearchEvent(plasmaSearchQuery query,
plasmaSearchRankingProfile ranking,
plasmaSearchProcessing localTiming,
plasmaSearchProcessing remoteTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes) {
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
TreeSet abstractSet) {
this.eventTime = System.currentTimeMillis(); // for lifetime check
this.wordIndex = wordIndex;
this.query = query;
@ -86,6 +100,11 @@ public final class plasmaSearchEvent {
this.sortedResults = null;
this.lastglobal = 0;
this.display = new ArrayList();
this.references = new String[0];
this.IAResults = new TreeMap();
this.IACount = new TreeMap();
this.IAmaxcounthash = null;
this.IAneardhthash = null;
long start = System.currentTimeMillis();
if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ||
@ -208,6 +227,33 @@ public final class plasmaSearchEvent {
} else {
Map[] searchContainerMaps = profileLocal.localSearchContainers(query, wordIndex, null);
if (generateAbstracts) {
// compute index abstracts
Iterator ci = searchContainerMaps[0].entrySet().iterator();
Map.Entry entry;
int maxcount = -1;
double mindhtdistance = 1.1, d;
String wordhash;
while (ci.hasNext()) {
entry = (Map.Entry) ci.next();
wordhash = (String) entry.getKey();
indexContainer container = (indexContainer) entry.getValue();
assert (container.getWordHash().equals(wordhash));
if (container.size() > maxcount) {
IAmaxcounthash = wordhash;
maxcount = container.size();
}
d = yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash);
if (d < mindhtdistance) {
// calculate the word hash that is closest to our dht position
mindhtdistance = d;
IAneardhthash = wordhash;
}
IACount.put(wordhash, new Integer(container.size()));
IAResults.put(wordhash, plasmaURL.compressIndex(container, null, 1000).toString());
}
}
rcLocal =
(searchContainerMaps == null) ?
plasmaWordIndex.emptyContainer(null, 0) :
@ -274,10 +320,12 @@ public final class plasmaSearchEvent {
plasmaSearchProcessing localTiming,
plasmaSearchProcessing remoteTiming,
plasmaWordIndex wordIndex,
TreeMap preselectedPeerHashes) {
TreeMap preselectedPeerHashes,
boolean generateAbstracts,
TreeSet abstractSet) {
plasmaSearchEvent event = (plasmaSearchEvent) lastEvents.get(query.id());
if (event == null) {
event = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, wordIndex, preselectedPeerHashes);
event = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, wordIndex, preselectedPeerHashes, generateAbstracts, abstractSet);
} else {
//re-new the event time for this event, so it is not deleted next time too early
event.eventTime = System.currentTimeMillis();
@ -285,7 +333,7 @@ public final class plasmaSearchEvent {
return event;
}
public indexContainer search() {
private indexContainer search() {
// combine the local and global (if any) result and order
if ((rcGlobal != null) && (rcGlobal.size() > 0)) {
globalcount = rcGlobal.size();
@ -310,6 +358,136 @@ public final class plasmaSearchEvent {
return this.sortedResults;
}
/**
 * Builds the list of result entries for this search event.
 *
 * The method runs {@code search()} to obtain the pre-ordered RWI container, loads the
 * URL entry for each RWI entry, filters and post-orders the loaded pages, stores the
 * references computed by the post-order object in {@code this.references}, and finally
 * wraps at most {@code query.wantedResults} pages into {@link Entry} objects.
 *
 * @param blueList  not read by the current code; it is only mentioned in the
 *                  commented-out bluelist check further down
 * @param overfetch if true, up to four times the target count of pages is fetched
 *                  before the url-fetch loop stops
 * @return a list of {@link Entry} objects; every entry has its snippet set to null
 *         because {@code includeSnippets} is hard-wired to false
 */
public ArrayList computeResults(
TreeSet blueList,
boolean overfetch) {
indexContainer pre = search();
final ArrayList hits = new ArrayList();
// start url-fetch
// NOTE: time limit and target count are read from the PROCESS_POSTSORT slot,
// while the yield of this phase is recorded under PROCESS_URLFETCH below
final long postorderTime = this.profileLocal.getTargetTime(plasmaSearchProcessing.PROCESS_POSTSORT);
//System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime);
// a negative target time means "no time limit"
final long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime);
this.profileLocal.startTimer();
final plasmaSearchPostOrder acc = new plasmaSearchPostOrder(query, ranking);
indexRWIEntry rwientry;
indexURLEntry page;
indexURLEntry.Components comp;
String pagetitle, pageurl, pageauthor;
final int minEntries = this.profileLocal.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT);
try {
ordering: for (int i = 0; i < pre.size(); i++) {
// stop when the time limit is reached or enough pages have been fetched
if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= ((overfetch) ? 4 : 1) * minEntries)) break;
rwientry = new indexRWIEntry(pre.get(i));
// load only urls if there was not yet a root url of that hash
// find the url entry
page = wordIndex.loadedURL.load(rwientry.urlHash(), rwientry);
if (page != null) {
comp = page.comp();
pagetitle = comp.title().toLowerCase();
if (comp.url() == null) continue ordering; // rare case where the url is corrupted
pageurl = comp.url().toString().toLowerCase();
pageauthor = comp.author().toLowerCase();
// check exclusion: skip the page if title, url or author match the exclude hashes
if (plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageurl, query.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageauthor, query.excludeHashes)) continue ordering;
// check url mask (the lower-cased url must match the regular expression as a whole)
if (!(pageurl.matches(query.urlMask))) continue ordering;
// check constraints
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
(query.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(comp.title().startsWith("Index of")))) {
// an "index of" constraint was requested but the title does not start with "Index of"
serverLog.logFine("PLASMA", "filtered out " + comp.url().toString());
// filter out bad results: remove the reference to this page for every query word hash
final Iterator wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash());
} else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
// media search: accept the page only if its counter for the requested content domain is positive
if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page);
else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page);
} else {
acc.addPage(page);
}
}
}
} catch (final kelondroException ee) {
// the failure is logged only; processing continues with the pages fetched so far
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
}
this.profileLocal.setYieldTime(plasmaSearchProcessing.PROCESS_URLFETCH);
this.profileLocal.setYieldCount(plasmaSearchProcessing.PROCESS_URLFETCH, acc.sizeFetched());
// start postsorting
this.profileLocal.startTimer();
acc.sortPages(true);
this.profileLocal.setYieldTime(plasmaSearchProcessing.PROCESS_POSTSORT);
this.profileLocal.setYieldCount(plasmaSearchProcessing.PROCESS_POSTSORT, acc.sizeOrdered());
// apply filter
this.profileLocal.startTimer();
acc.removeRedundant();
this.profileLocal.setYieldTime(plasmaSearchProcessing.PROCESS_FILTER);
this.profileLocal.setYieldCount(plasmaSearchProcessing.PROCESS_FILTER, acc.sizeOrdered());
// generate references
this.references = acc.getReferences(16);
// generate Result.Entry objects and optionally fetch snippets
int i = 0;
Entry entry;
// snippet fetching is switched off here; the true-branch below is currently dead code
final boolean includeSnippets = false;
while ((acc.hasMoreElements()) && (i < query.wantedResults)) {
try {
entry = new Entry(acc.nextElement(), wordIndex);
} catch (final RuntimeException e) {
// the Entry constructor rejected this page; skip it without counting it as a result
continue;
}
// check bluelist again: filter out all links where any
// bluelisted word
// appear either in url, url's description or search word
// the search word was sorted out earlier
/*
* String s = descr.toLowerCase() + url.toString().toLowerCase();
* for (int c = 0; c < blueList.length; c++) { if
* (s.indexOf(blueList[c]) >= 0) return; }
*/
if (includeSnippets) {
entry.setSnippet(plasmaSnippetCache.retrieveTextSnippet(
entry.url(), query.queryHashes, false,
entry.flags().get(plasmaCondenser.flag_cat_indexof), 260,
1000));
// snippet =
// snippetCache.retrieveTextSnippet(comp.url(),
// query.queryHashes, false,
// urlentry.flags().get(plasmaCondenser.flag_cat_indexof),
// 260, 1000);
} else {
// snippet = null;
entry.setSnippet(null);
}
i++;
hits.add(entry);
}
/*
* while ((acc.hasMoreElements()) && (((time + timestamp) <
* System.currentTimeMillis()))) { urlentry = acc.nextElement();
* urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
* descr = urlentry.descr();
*
* addScoreForked(ref, gs, descr.split(" ")); addScoreForked(ref, gs,
* urlstring.split("/")); }
*/
return hits;
}
public int filteredCount() {
return this.filteredCount;
@ -418,4 +596,98 @@ public final class plasmaSearchEvent {
this.display.set(position, urlhash);
}
/**
 * Returns the references array currently held by this search event.
 *
 * @return the array stored in the {@code references} field (not copied)
 */
public Object[] references() {
    final Object[] currentReferences = this.references;
    return currentReferences;
}
/**
 * A single search hit: wraps a loaded URL entry together with its buffered components,
 * an optional text snippet and, for hosts ending in ".yacyh", alternative URL strings
 * built from the address and name of the peer that the host name refers to.
 */
public static class Entry {
    private indexURLEntry urlentry;
    private indexURLEntry.Components urlcomps; // buffer for components
    private String alternative_urlstring;
    private String alternative_urlname;
    private plasmaSnippetCache.TextSnippet snippet;

    /**
     * Creates a result entry for the given URL entry.
     *
     * If the host of the URL ends with ".yacyh", the peer hash encoded in the host name
     * is looked up among the connected seeds. When the seed or its public address is
     * unknown, the references to this URL are removed from the index and the
     * constructor fails.
     *
     * @param urlentry  the loaded URL entry to wrap
     * @param wordIndex the word index used to clean up references of void entries
     * @throws RuntimeException with message "index void" if the peer behind a ".yacyh"
     *         host cannot be resolved, or "parser failed: ..." if the word extraction
     *         reports an unsupported encoding
     */
    public Entry(indexURLEntry urlentry, plasmaWordIndex wordIndex) {
        this.urlentry = urlentry;
        this.urlcomps = urlentry.comp();
        this.alternative_urlstring = null;
        this.alternative_urlname = null;
        this.snippet = null;
        String host = urlcomps.url().getHost();
        if (host.endsWith(".yacyh")) {
            // translate host into current IP
            int p = host.indexOf(".");
            // the part between the first dot and the ".yacyh" suffix is the hex peer hash
            String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
            yacySeed seed = yacyCore.seedDB.getConnected(hash);
            String filename = urlcomps.url().getFile();
            String address = null;
            if ((seed == null) || ((address = seed.getPublicAddress()) == null)) {
                // seed is not known from here
                try {
                    // encode with the same charset that is announced to getWords();
                    // the platform default charset would corrupt non-ASCII characters
                    wordIndex.removeWordReferences(
                        plasmaCondenser.getWords(
                            ("yacyshare " +
                             filename.replace('?', ' ') +
                             " " +
                             urlcomps.title()).getBytes("UTF-8"), "UTF-8").keySet(),
                        urlentry.hash());
                    wordIndex.loadedURL.remove(urlentry.hash()); // clean up
                    throw new RuntimeException("index void");
                } catch (UnsupportedEncodingException e) {
                    // keep the original exception as cause so the stack trace is not lost
                    throw new RuntimeException("parser failed: " + e.getMessage(), e);
                }
            }
            alternative_urlstring = "http://" + address + "/" + host.substring(0, p) + filename;
            alternative_urlname = "http://share." + seed.getName() + ".yacy" + filename;
            // cut off the query part from the display name
            if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
        }
    }

    /** @return the hash of the wrapped URL entry */
    public String hash() {
        return urlentry.hash();
    }

    /** @return the URL taken from the buffered components */
    public URL url() {
        return urlcomps.url();
    }

    /** @return the flags of the wrapped URL entry */
    public kelondroBitfield flags() {
        return urlentry.flags();
    }

    /** @return the alternative URL string if one was computed, else the normal form of the URL */
    public String urlstring() {
        return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring;
    }

    /** @return the alternative URL name if one was computed, else the normal form of the URL */
    public String urlname() {
        return (alternative_urlname == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlname;
    }

    /** @return the title taken from the buffered components */
    public String title() {
        return urlcomps.title();
    }

    /** Stores the given snippet (may be null) for later use by {@link #snippet()} and {@link #resource()}. */
    public void setSnippet(plasmaSnippetCache.TextSnippet snippet) {
        this.snippet = snippet;
    }

    /** @return the snippet set by {@link #setSnippet}, or null if none was set */
    public plasmaSnippetCache.TextSnippet snippet() {
        return this.snippet;
    }

    /** @return the modification date of the wrapped URL entry */
    public Date modified() {
        return urlentry.moddate();
    }

    /** @return the size reported by the wrapped URL entry */
    public int filesize() {
        return urlentry.size();
    }

    /** @return the word (RWI) entry attached to the wrapped URL entry */
    public indexRWIEntry word() {
        return urlentry.word();
    }

    /** @return always false; this accessor does not consult the stored snippet */
    public boolean hasSnippet() {
        return false;
    }

    /** @return always null; this accessor does not consult the stored snippet */
    public plasmaSnippetCache.TextSnippet textSnippet() {
        return null;
    }

    /**
     * Generates the transport resource string of this entry.
     *
     * @return the string form of the URL entry, including the raw snippet line when a
     *         snippet is set and exists
     */
    public String resource() {
        // generate transport resource
        if ((snippet != null) && (snippet.exists())) {
            return urlentry.toString(snippet.getLineRaw());
        } else {
            return urlentry.toString();
        }
    }
}
}

@ -191,6 +191,10 @@ public final class plasmaSearchQuery {
return this.queryString;
}
/**
 * Returns the result of running {@code cleanQuery} on this query's query string.
 *
 * @return the array of sets produced by {@code cleanQuery} for the stored query string
 */
public TreeSet[] queryWords() {
    final TreeSet[] words = cleanQuery(this.queryString);
    return words;
}
public void filterOut(Set blueList) {
// filter out words that appear in this set
// this is applied to the queryHashes

@ -1,296 +0,0 @@
// plasmaSearchResultAccumulator.java
// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.08.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.TreeSet;
import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroException;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeed;
public class plasmaSearchResultAccumulator {
private ArrayList hits;
private Object[] references;
public plasmaSearchResultAccumulator(
plasmaSearchQuery theQuery,
plasmaSearchProcessing process,
plasmaSearchRankingProfile ranking,
indexContainer pre,
plasmaWordIndex wordIndex,
TreeSet blueList,
boolean overfetch) {
hits = new ArrayList();
// start url-fetch
long postorderTime = process.getTargetTime(plasmaSearchProcessing.PROCESS_POSTSORT);
//System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime);
long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime);
process.startTimer();
plasmaSearchPostOrder acc = new plasmaSearchPostOrder(theQuery, ranking);
indexRWIEntry rwientry;
indexURLEntry page;
indexURLEntry.Components comp;
String pagetitle, pageurl, pageauthor;
int minEntries = process.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT);
try {
ordering: for (int i = 0; i < pre.size(); i++) {
if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= ((overfetch) ? 4 : 1) * minEntries)) break;
rwientry = new indexRWIEntry(pre.get(i));
// load only urls if there was not yet a root url of that hash
// find the url entry
page = wordIndex.loadedURL.load(rwientry.urlHash(), rwientry);
if (page != null) {
comp = page.comp();
pagetitle = comp.title().toLowerCase();
if (comp.url() == null) continue ordering; // rare case where the url is corrupted
pageurl = comp.url().toString().toLowerCase();
pageauthor = comp.author().toLowerCase();
// check exclusion
if (plasmaSearchQuery.matches(pagetitle, theQuery.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageurl, theQuery.excludeHashes)) continue ordering;
if (plasmaSearchQuery.matches(pageauthor, theQuery.excludeHashes)) continue ordering;
// check url mask
if (!(pageurl.matches(theQuery.urlMask))) continue ordering;
// check constraints
if ((!(theQuery.constraint.equals(plasmaSearchQuery.catchall_constraint))) &&
(theQuery.constraint.get(plasmaCondenser.flag_cat_indexof)) &&
(!(comp.title().startsWith("Index of")))) {
serverLog.logFine("PLASMA", "filtered out " + comp.url().toString());
// filter out bad results
Iterator wi = theQuery.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash());
} else if (theQuery.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page);
else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page);
else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page);
else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page);
} else {
acc.addPage(page);
}
}
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
}
process.setYieldTime(plasmaSearchProcessing.PROCESS_URLFETCH);
process.setYieldCount(plasmaSearchProcessing.PROCESS_URLFETCH, acc.sizeFetched());
// start postsorting
process.startTimer();
acc.sortPages(true);
process.setYieldTime(plasmaSearchProcessing.PROCESS_POSTSORT);
process.setYieldCount(plasmaSearchProcessing.PROCESS_POSTSORT, acc.sizeOrdered());
// apply filter
process.startTimer();
acc.removeRedundant();
process.setYieldTime(plasmaSearchProcessing.PROCESS_FILTER);
process.setYieldCount(plasmaSearchProcessing.PROCESS_FILTER, acc.sizeOrdered());
// generate references
references = acc.getReferences(16);
// generate Result.Entry objects and optionally fetch snippets
int i = 0;
Entry entry;
boolean includeSnippets = false;
while ((acc.hasMoreElements()) && (i < theQuery.wantedResults)) {
try {
entry = new Entry(acc.nextElement(), wordIndex);
} catch (RuntimeException e) {
continue;
}
// check bluelist again: filter out all links where any
// bluelisted word
// appear either in url, url's description or search word
// the search word was sorted out earlier
/*
* String s = descr.toLowerCase() + url.toString().toLowerCase();
* for (int c = 0; c < blueList.length; c++) { if
* (s.indexOf(blueList[c]) >= 0) return; }
*/
if (includeSnippets) {
entry.setSnippet(plasmaSnippetCache.retrieveTextSnippet(
entry.url(), theQuery.queryHashes, false,
entry.flags().get(plasmaCondenser.flag_cat_indexof), 260,
1000));
// snippet =
// snippetCache.retrieveTextSnippet(comp.url(),
// query.queryHashes, false,
// urlentry.flags().get(plasmaCondenser.flag_cat_indexof),
// 260, 1000);
} else {
// snippet = null;
entry.setSnippet(null);
}
i++;
hits.add(entry);
}
/*
* while ((acc.hasMoreElements()) && (((time + timestamp) <
* System.currentTimeMillis()))) { urlentry = acc.nextElement();
* urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url());
* descr = urlentry.descr();
*
* addScoreForked(ref, gs, descr.split(" ")); addScoreForked(ref, gs,
* urlstring.split("/")); }
*/
}
// filter
public void applyFilter(
plasmaSearchPostOrder acc) {
}
public int resultCount() {
return hits.size();
}
public Entry resultEntry(int i) {
return (Entry) hits.get(i);
}
public Object[] references() {
return this.references;
}
public static class Entry {
private indexURLEntry urlentry;
private indexURLEntry.Components urlcomps; // buffer for components
private String alternative_urlstring;
private String alternative_urlname;
private plasmaSnippetCache.TextSnippet snippet;
public Entry(indexURLEntry urlentry, plasmaWordIndex wordIndex) {
this.urlentry = urlentry;
this.urlcomps = urlentry.comp();
this.alternative_urlstring = null;
this.alternative_urlname = null;
this.snippet = null;
String host = urlcomps.url().getHost();
if (host.endsWith(".yacyh")) {
// translate host into current IP
int p = host.indexOf(".");
String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6));
yacySeed seed = yacyCore.seedDB.getConnected(hash);
String filename = urlcomps.url().getFile();
String address = null;
if ((seed == null) || ((address = seed.getPublicAddress()) == null)) {
// seed is not known from here
try {
wordIndex.removeWordReferences(
plasmaCondenser.getWords(
("yacyshare " +
filename.replace('?', ' ') +
" " +
urlcomps.title()).getBytes(), "UTF-8").keySet(),
urlentry.hash());
wordIndex.loadedURL.remove(urlentry.hash()); // clean up
throw new RuntimeException("index void");
} catch (UnsupportedEncodingException e) {
throw new RuntimeException("parser failed: " + e.getMessage());
}
}
alternative_urlstring = "http://" + address + "/" + host.substring(0, p) + filename;
alternative_urlname = "http://share." + seed.getName() + ".yacy" + filename;
if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p);
}
}
public String hash() {
return urlentry.hash();
}
public URL url() {
return urlcomps.url();
}
public kelondroBitfield flags() {
return urlentry.flags();
}
public String urlstring() {
return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring;
}
public String urlname() {
return (alternative_urlname == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlname;
}
public String title() {
return urlcomps.title();
}
public void setSnippet(plasmaSnippetCache.TextSnippet snippet) {
this.snippet = snippet;
}
public plasmaSnippetCache.TextSnippet snippet() {
return this.snippet;
}
public Date modified() {
return urlentry.moddate();
}
public int filesize() {
return urlentry.size();
}
public indexRWIEntry word() {
return urlentry.word();
}
public boolean hasSnippet() {
return false;
}
public plasmaSnippetCache.TextSnippet textSnippet() {
return null;
}
public String resource() {
// generate transport resource
if ((snippet != null) && (snippet.exists())) {
return urlentry.toString(snippet.getLineRaw());
} else {
return urlentry.toString();
}
}
}
}
Loading…
Cancel
Save