diff --git a/htroot/ViewImage.java b/htroot/ViewImage.java index b827addcc..b0fee1001 100644 --- a/htroot/ViewImage.java +++ b/htroot/ViewImage.java @@ -79,7 +79,7 @@ public class ViewImage { if ((url == null) && (urlLicense.length() > 0)) { url = sb.licensedURLs.releaseLicense(urlLicense); - urlString = url.toNormalform(true, true); + urlString = (url == null) ? null : url.toNormalform(true, true); } if (url == null) return null; diff --git a/htroot/ssitest.html b/htroot/ssitest.html index 6446e2d22..adc032b70 100644 --- a/htroot/ssitest.html +++ b/htroot/ssitest.html @@ -10,5 +10,16 @@ + + + + + + + + + + + \ No newline at end of file diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 823e65d2b..d1b7ee6af 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -28,6 +28,7 @@ // javac -classpath .:../../Classes search.java // if the shell's current path is htroot/yacy +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.Map; @@ -38,19 +39,16 @@ import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.index.indexContainer; import de.anomic.net.natLib; +import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaURL; -import de.anomic.plasma.plasmaSearchPreOrder; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchProcessing; -import de.anomic.plasma.plasmaSearchResultAccumulator; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; -import de.anomic.yacy.yacyDHTAction; import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacySeed; import de.anomic.tools.crypt; @@ -126,22 +124,19 @@ public final class search { StringBuffer indexabstract = new StringBuffer(); int indexabstractContainercount = 0; int joincount = 0; - plasmaSearchQuery squery = null; - //plasmaSearchEvent theSearch = null; - plasmaSearchResultAccumulator accu = null; + plasmaSearchQuery theQuery = null; + ArrayList accu = null; if ((query.length() == 0) && (abstractSet != null)) { // this is _not_ a normal search, only a request for index abstracts - squery = new plasmaSearchQuery(abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint); - squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; - yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + squery.wantedResults + " links"); + theQuery = new plasmaSearchQuery(abstractSet, new TreeSet(kelondroBase64Order.enhancedCoder), maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, plasmaSearchQuery.catchall_constraint); + theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; + yacyCore.log.logInfo("INIT HASH SEARCH (abstracts only): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.wantedResults + " links"); // prepare a search profile - //plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(contentdom) : new plasmaSearchRankingProfile("", profile); - plasmaSearchProcessing localTiming = new plasmaSearchProcessing(squery.maximumTime, squery.wantedResults); - //plasmaSearchProcessing remoteTiming = null; - + plasmaSearchProcessing localTiming = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.wantedResults); + //theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, sb.wordIndex, null); - Map[] containers = localTiming.localSearchContainers(squery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls)); + Map[] containers = localTiming.localSearchContainers(theQuery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls)); if (containers != null) { Iterator ci = containers[0].entrySet().iterator(); Map.Entry entry; @@ -157,90 +152,68 @@ public final class search { prop.putASIS("indexcount", ""); prop.put("joincount", 0); + prop.putASIS("references", ""); + } else { // retrieve index containers from search request - squery = new plasmaSearchQuery(queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, constraint); - squery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; - yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + squery.wantedResults + " links"); + theQuery = new plasmaSearchQuery(queryhashes, excludehashes, maxdist, prefer, plasmaSearchQuery.contentdomParser(contentdom), count, duetime, filter, constraint); + theQuery.domType = plasmaSearchQuery.SEARCHDOM_LOCAL; + yacyCore.log.logInfo("INIT HASH SEARCH (query-" + abstracts + "): " + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + theQuery.wantedResults + " links"); // prepare a search profile plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? new plasmaSearchRankingProfile(plasmaSearchQuery.contentdomParser(contentdom)) : new plasmaSearchRankingProfile("", profile); - plasmaSearchProcessing localProcess = new plasmaSearchProcessing(squery.maximumTime, squery.wantedResults); + plasmaSearchProcessing localProcess = new plasmaSearchProcessing(theQuery.maximumTime, theQuery.wantedResults); //plasmaSearchProcessing remoteProcess = null; - //theSearch = new plasmaSearchEvent(squery, rankingProfile, localProcess, remoteProcess, true, sb.wordIndex, null); - Map[] containers = localProcess.localSearchContainers(squery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls)); + plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, rankingProfile, localProcess, null, sb.wordIndex, null, true, abstractSet); + //Map[] containers = localProcess.localSearchContainers(theQuery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls)); // set statistic details of search result and find best result index set - if (containers == null) { + if (theSearch.getLocalCount() == 0) { prop.putASIS("indexcount", ""); prop.putASIS("joincount", "0"); } else { - Iterator ci = containers[0].entrySet().iterator(); + // attach information about index abstracts StringBuffer indexcount = new StringBuffer(); Map.Entry entry; - int maxcount = -1; - double mindhtdistance = 1.1, d; - String wordhash; - String maxcounthash = null, neardhthash = null; - while (ci.hasNext()) { - entry = (Map.Entry) ci.next(); - wordhash = (String) entry.getKey(); - indexContainer container = (indexContainer) entry.getValue(); - if (container.size() > maxcount) { - maxcounthash = wordhash; - maxcount = container.size(); - } - d = yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash); - if (d < mindhtdistance) { - // calculate the word hash that is closest to our dht position - mindhtdistance = d; - neardhthash = wordhash; - } - indexcount.append("indexcount.").append(container.getWordHash()).append('=').append(Integer.toString(container.size())).append(serverCore.crlfString); - if ((abstractSet != null) && (abstractSet.contains(wordhash))) { - // if a specific index-abstract is demanded, attach it here - indexabstractContainercount += container.size(); - indexabstract.append("indexabstract." + wordhash + "=").append(plasmaURL.compressIndex(container, null,1000).toString()).append(serverCore.crlfString); + Iterator i = theSearch.IACount.entrySet().iterator(); + while (i.hasNext()) { + entry = (Map.Entry) i.next(); + indexcount.append("indexcount.").append((String) entry.getKey()).append('=').append(((Integer) entry.getValue()).toString()).append(serverCore.crlfString); + } + if (abstractSet != null) { + // if a specific index-abstract is demanded, attach it here + i = abstractSet.iterator(); + String wordhash; + while (i.hasNext()) { + wordhash = (String) i.next(); + indexabstractContainercount += ((Integer) theSearch.IACount.get(wordhash)).intValue(); + indexabstract.append("indexabstract." + wordhash + "=").append((String) theSearch.IAResults.get(wordhash)).append(serverCore.crlfString); } } prop.putASIS("indexcount", new String(indexcount)); - - // join and order the result - indexContainer localResults = - (containers == null) ? - plasmaWordIndex.emptyContainer(null, 0) : - localProcess.localSearchJoinExclude( - containers[0].values(), - containers[1].values(), - (squery.queryHashes.size() == 0) ? - 0 : - localProcess.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * squery.queryHashes.size() / (squery.queryHashes.size() + squery.excludeHashes.size()), - squery.maxDistance); - if (localResults == null) { + + if (theSearch.getLocalCount() == 0) { joincount = 0; prop.put("joincount", 0); } else { - joincount = localResults.size(); + joincount = theSearch.getLocalCount(); prop.putASIS("joincount", Integer.toString(joincount)); - plasmaSearchPreOrder pre = new plasmaSearchPreOrder(squery, localProcess, rankingProfile, localResults); - accu = new plasmaSearchResultAccumulator(squery, localProcess, rankingProfile, pre.strippedContainer(200), sb.wordIndex, plasmaSwitchboard.blueList, false); + accu = theSearch.computeResults(plasmaSwitchboard.blueList, false); } // generate compressed index for maxcounthash // this is not needed if the search is restricted to specific // urls, because it is a re-search - if ((maxcounthash == null) || (urls.length() != 0) || (queryhashes.size() == 1) || (abstracts.length() == 0)) { + if ((theSearch.IAmaxcounthash == null) || (urls.length() != 0) || (queryhashes.size() == 1) || (abstracts.length() == 0)) { prop.putASIS("indexabstract", ""); } else if (abstracts.equals("auto")) { // automatically attach the index abstract for the index that has the most references. This should be our target dht position - indexContainer container = (indexContainer) containers[0].get(maxcounthash); - indexabstractContainercount += container.size(); - indexabstract.append("indexabstract." + maxcounthash + "=").append(plasmaURL.compressIndex(container,localResults, 1000).toString()).append(serverCore.crlfString); - if ((neardhthash != null) && (!(neardhthash.equals(maxcounthash)))) { + indexabstractContainercount += ((Integer) theSearch.IACount.get(theSearch.IAmaxcounthash)).intValue(); + indexabstract.append("indexabstract." + theSearch.IAmaxcounthash + "=").append((String) theSearch.IAResults.get(theSearch.IAmaxcounthash)).append(serverCore.crlfString); + if ((theSearch.IAneardhthash != null) && (!(theSearch.IAneardhthash.equals(theSearch.IAmaxcounthash)))) { // in case that the neardhthash is different from the maxcounthash attach also the neardhthash-container - container = (indexContainer) containers[0].get(neardhthash); - indexabstractContainercount += container.size(); - indexabstract.append("indexabstract." + neardhthash + "=").append(plasmaURL.compressIndex(container, localResults, 1000).toString()).append(serverCore.crlfString); + indexabstractContainercount += ((Integer) theSearch.IACount.get(theSearch.IAneardhthash)).intValue(); + indexabstract.append("indexabstract." + theSearch.IAneardhthash + "=").append((String) theSearch.IAResults.get(theSearch.IAneardhthash)).append(serverCore.crlfString); } //System.out.println("DEBUG-ABSTRACTGENERATION: maxcounthash = " + maxcounthash); //System.out.println("DEBUG-ABSTRACTGENERATION: neardhthash = "+ neardhthash); @@ -248,13 +221,20 @@ public final class search { } } if (partitions > 0) sb.requestedQueries = sb.requestedQueries + 1d / (double) partitions; // increase query counter + + // prepare reference hints + Object[] ws = theSearch.references(); + StringBuffer refstr = new StringBuffer(); + for (int j = 0; j < ws.length; j++) + refstr.append(",").append((String) ws[j]); + prop.putASIS("references", (refstr.length() > 0) ? refstr.substring(1) : new String(refstr)); } prop.putASIS("indexabstract", new String(indexabstract)); // prepare search statistics Long trackerHandle = new Long(System.currentTimeMillis()); String client = (String) header.get("CLIENTIP"); - HashMap searchProfile = squery.resultProfile((accu == null) ? 0 : accu.resultCount(), System.currentTimeMillis() - timestamp); + HashMap searchProfile = theQuery.resultProfile((accu == null) ? 0 : accu.size(), System.currentTimeMillis() - timestamp); searchProfile.put("host", client); yacySeed remotepeer = yacyCore.seedDB.lookupByIP(natLib.getInetAddress(client), true, false, false); searchProfile.put("peername", (remotepeer == null) ? "unknown" : remotepeer.getName()); @@ -278,23 +258,16 @@ public final class search { // result is a List of urlEntry elements StringBuffer links = new StringBuffer(); String resource = null; - plasmaSearchResultAccumulator.Entry entry; - for (int i = 0; i < accu.resultCount(); i++) { - entry = accu.resultEntry(i); + plasmaSearchEvent.Entry entry; + for (int i = 0; i < accu.size(); i++) { + entry = (plasmaSearchEvent.Entry) accu.get(i); resource = entry.resource(); if (resource != null) { links.append("resource").append(i).append('=').append(resource).append(serverCore.crlfString); } } prop.putASIS("links", new String(links)); - prop.put("linkcount", accu.resultCount()); - - // prepare reference hints - Object[] ws = accu.references(); - StringBuffer refstr = new StringBuffer(); - for (int j = 0; j < ws.length; j++) - refstr.append(",").append((String) ws[j]); - prop.putASIS("references", (refstr.length() > 0) ? refstr.substring(1) : new String(refstr)); + prop.put("linkcount", accu.size()); } // add information about forward peers @@ -304,7 +277,7 @@ public final class search { // log yacyCore.log.logInfo("EXIT HASH SEARCH: " + - plasmaSearchQuery.anonymizedQueryHashes(squery.queryHashes) + " - " + joincount + " links found, " + + plasmaSearchQuery.anonymizedQueryHashes(theQuery.queryHashes) + " - " + joincount + " links found, " + prop.get("linkcount", "?") + " links selected, " + indexabstractContainercount + " index abstract references attached, " + (System.currentTimeMillis() - timestamp) + " milliseconds"); diff --git a/htroot/yacysearch.html b/htroot/yacysearch.html index dc32e5b7e..23a60c601 100644 --- a/htroot/yacysearch.html +++ b/htroot/yacysearch.html @@ -230,20 +230,29 @@ document.getElementById("Enter").value = "search again - catch up more links"; #{/results}# + + + #(combine)# + :: +

Refine your search with these topwords:

+

+ #{words}# + #[word]# + #{/words}# +

+ #(/combine)# + +
+ #{results}# + +
+ +
+ + #{/results}# + + #(/type)# - -#(display)# -

- YaCy is a GPL'ed project with the target of implementing a P2P-based global search engine.
- Architecture (C) by Michael Peter Christen, Mail-Adresse von Michael Peter Christen -

- #%env/templates/simplefooter.template%# - :: - #%env/templates/footer.template%# - :: - #%env/templates/embeddedfooter.template%# -#(/display)# - diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 5a27ad73f..76823fd20 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -50,13 +50,13 @@ import java.io.IOException; import java.io.UnsupportedEncodingException; import java.net.MalformedURLException; import java.net.URLEncoder; +import java.util.ArrayList; import java.util.HashMap; import java.util.Iterator; import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterImageEntry; import de.anomic.http.httpHeader; -import de.anomic.index.indexContainer; import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroMSetTools; @@ -70,7 +70,6 @@ import de.anomic.plasma.plasmaSearchPreOrder; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchProcessing; -import de.anomic.plasma.plasmaSearchResultAccumulator; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; @@ -298,12 +297,11 @@ public class yacysearch { // create a new search event String wrongregex = null; - plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, ranking, localTiming, remoteTiming, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null); - indexContainer preorder = theSearch.search(); + plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(theQuery, ranking, localTiming, remoteTiming, sb.wordIndex, (sb.isRobinsonMode()) ? sb.clusterhashes : null, false, null); - // fetch snippets + // generate result object serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER ORDERING OF SEARCH RESULTS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); - plasmaSearchResultAccumulator accu = new plasmaSearchResultAccumulator(theQuery, localTiming, ranking, preorder, sb.wordIndex, plasmaSwitchboard.blueList, true); + ArrayList accu = theSearch.computeResults(plasmaSwitchboard.blueList, true); serverLog.logFine("LOCAL_SEARCH", "SEARCH TIME AFTER RESULT PREPARATION: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); // calc some more cross-reference @@ -316,13 +314,12 @@ public class yacysearch { serverLog.logInfo("LOCAL_SEARCH", "EXIT WORD SEARCH: " + theQuery.queryString + " - " + (theSearch.getLocalCount() + theSearch.getGlobalCount()) + " links found, " + theSearch.filteredCount() + " links filtered, " + - accu.resultCount() + " links ordered, " + + accu.size() + " links ordered, " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); - - + // prepare search statistics Long trackerHandle = new Long(System.currentTimeMillis()); - HashMap searchProfile = theQuery.resultProfile(accu.resultCount(), System.currentTimeMillis() - timestamp); + HashMap searchProfile = theQuery.resultProfile(accu.size(), System.currentTimeMillis() - timestamp); searchProfile.put("querystring", theQuery.queryString); searchProfile.put("time", trackerHandle); searchProfile.put("host", client); @@ -333,22 +330,30 @@ public class yacysearch { if (handles == null) handles = new TreeSet(); handles.add(trackerHandle); sb.localSearchTracker.put(client, handles); - //** - + //prop=sb.searchFromLocal(thisSearch, ranking, localTiming, remoteTiming, true, (String) header.get("CLIENTIP")); - prop=new serverObjects(); + prop = new serverObjects(); //prop.put("references", 0); - URL wordURL=null; prop.put("num-results_totalcount", theSearch.getLocalCount() + theSearch.getGlobalCount()); prop.put("num-results_filteredcount", theSearch.filteredCount()); - prop.put("num-results_orderedcount", accu.resultCount()); + prop.put("num-results_orderedcount", accu.size()); prop.put("num-results_globalresults", (theSearch.getGlobalCount() == 0) ? 0 : 1); prop.put("num-results_globalresults_globalcount", theSearch.getGlobalCount()); prop.put("num-results_linkcount", 0); + + /* + for (int i = 0; i < theQuery.wantedResults; i++) { + prop.put("type_results_" + i + "_item", i); + prop.put("type_results_" + i + "_eventID", theQuery.id()); + } + prop.put("type_results", theQuery.wantedResults); + */ + //------------------------ + prop.put("type_results", 0); - - for (int i = 0; i < accu.resultCount(); i++) { - plasmaSearchResultAccumulator.Entry result = accu.resultEntry(i); + URL wordURL=null; + for (int i = 0; i < accu.size(); i++) { + plasmaSearchEvent.Entry result = (plasmaSearchEvent.Entry) accu.get(i); prop.put("type_results_" + i + "_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? 1 : 0); prop.put("type_results_" + i + "_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); prop.put("type_results_" + i + "_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); @@ -410,22 +415,24 @@ public class yacysearch { } prop.put("type_results_" + i + "_snippet", 1); } else { - /* no snippet available (will be fetched later via ajax) */ + // no snippet available (will be fetched later via ajax) prop.put("type_results_" + i + "_snippet", 0); prop.put("type_results_" + i + "_snippet_text", ""); } } - prop.put("type_results", accu.resultCount()); - prop.put("num-results_linkcount", Integer.toString(accu.resultCount())); + prop.put("type_results", accu.size()); + prop.put("num-results_linkcount", Integer.toString(accu.size())); + } + + //------------------------ + + // process result of search + if (filtered.size() > 0) { + prop.put("excluded", 1); + prop.put("excluded_stopwords", filtered.toString()); + } else { + prop.put("excluded", 0); } - - // process result of search - if (filtered.size() > 0) { - prop.put("excluded", 1); - prop.put("excluded_stopwords", filtered.toString()); - } else { - prop.put("excluded", 0); - } if (prop == null || prop.size() == 0) { if (post.get("search", "").length() < 3) { @@ -440,7 +447,6 @@ public class yacysearch { prop.put("num-results", 5); int hintcount = references.length; if (hintcount > 0) { - prop.put("type_combine", 1); // get the topwords final TreeSet topwords = new TreeSet(kelondroNaturalOrder.naturalOrder); @@ -449,8 +455,6 @@ public class yacysearch { tmp = (String) references[i]; if (tmp.matches("[a-z]+")) { topwords.add(tmp); - // } else { - // topwords.add("(" + tmp + ")"); } } @@ -460,13 +464,13 @@ public class yacysearch { kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.badwords); } - //avoid stopwords being topwords + // avoid stopwords being topwords if (env.getConfig("filterOutStopwordsFromTopwords", "true").equals("true")) { - if ((plasmaSwitchboard.stopwords != null) && (plasmaSwitchboard.stopwords.size() > 0)) { - kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.stopwords); - } + if ((plasmaSwitchboard.stopwords != null) && (plasmaSwitchboard.stopwords.size() > 0)) { + kelondroMSetTools.excludeDestructive(topwords, plasmaSwitchboard.stopwords); + } } - + String word; hintcount = 0; final Iterator iter = topwords.iterator(); @@ -489,11 +493,9 @@ public class yacysearch { if (wrongregex != null) { prop.put("num-results_wrong_regex", wrongregex); prop.put("num-results", 4); - } - else if (totalcount == 0) { + } else if (totalcount == 0) { prop.put("num-results", 3); // long - } - else { + } else { prop.put("num-results", 5); } } @@ -545,9 +547,10 @@ public class yacysearch { // if user is not authenticated, he may not vote for URLs int linkcount = Integer.parseInt(prop.get("num-results_linkcount", "0")); - for (int i=0; i + bookmark + #(recommend)# + + + :: + recommend + delete + #(/recommend)# + + #(/authorized)# +

#[description]#

+

#[snippet]#

+

#[urlname]#

+

#[date]# | YBR-#[ybr]# | Info | Pictures

+#(/content)# diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java new file mode 100644 index 000000000..13499209a --- /dev/null +++ b/htroot/yacysearchitem.java @@ -0,0 +1,145 @@ +// yacysearchitem.java +// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 28.08.2007 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import java.io.UnsupportedEncodingException; +import java.net.MalformedURLException; +import java.net.URLEncoder; +import java.util.ArrayList; +import java.util.TreeSet; + +import de.anomic.http.httpHeader; +import de.anomic.net.URL; +import de.anomic.plasma.plasmaSearchEvent; +import de.anomic.plasma.plasmaSearchPreOrder; +import de.anomic.plasma.plasmaSearchQuery; +import de.anomic.plasma.plasmaSearchRankingProfile; +import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaURL; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; +import de.anomic.tools.crypt; +import de.anomic.tools.nxTools; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacyNewsPool; +import de.anomic.yacy.yacySeed; + + +public class yacysearchitem { + + public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) { + final plasmaSwitchboard sb = (plasmaSwitchboard) env; + final serverObjects prop = new serverObjects(); + + String eventID = post.get("eventID", ""); + int item = post.getInt("item", -1); + boolean authenticated = sb.adminAuthenticated(header) >= 2; + + // find search event + plasmaSearchEvent theSearch = plasmaSearchEvent.getEvent(eventID); + plasmaSearchQuery theQuery = theSearch.getQuery(); + plasmaSearchRankingProfile ranking = theSearch.getRanking(); + + long startprofiling = System.currentTimeMillis(); + + // generate result object + ArrayList accu = theSearch.computeResults(plasmaSwitchboard.blueList, true); + + plasmaSearchEvent.Entry result = (plasmaSearchEvent.Entry) accu.get(item); + System.out.println("PROFILING_DEBUG: " + (System.currentTimeMillis() - startprofiling) + " millisekunden fuer item " + item); + + prop.put("content", 1); // switch on content + prop.put("content_authorized", (authenticated) ? 1 : 0); + prop.put("content_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, yacyNewsPool.CATEGORY_SURFTIPP_ADD, "url", result.urlstring()) == null) ? 1 : 0); + prop.put("content_authorized_recommend_deletelink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + result.hash() + "&urlmaskfilter=.*"); + prop.put("content_authorized_recommend_recommendlink", "/yacysearch.html?search=" + theQuery.queryString + "&Enter=Search&count=" + theQuery.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + result.hash() + "&urlmaskfilter=.*"); + prop.put("content_authorized_urlhash", result.hash()); + prop.put("content_description", result.title()); + prop.put("content_url", result.urlstring()); + + int port=result.url().getPort(); + URL faviconURL; + try { + faviconURL = new URL(result.url().getProtocol() + "://" + result.url().getHost() + ((port != -1) ? (":" + String.valueOf(port)) : "") + "/favicon.ico"); + } catch (MalformedURLException e1) { + faviconURL = null; + } + + prop.put("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading + prop.put("content_urlhash", result.hash()); + prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(result.hash())); + prop.put("content_urlname", nxTools.shortenURLString(result.urlname(), 120)); + prop.put("content_date", plasmaSwitchboard.dateString(result.modified())); + prop.put("content_ybr", plasmaSearchPreOrder.ybr(result.hash())); + prop.put("content_size", Long.toString(result.filesize())); + + TreeSet[] query = theQuery.queryWords(); + URL wordURL = null; + try { + prop.put("content_words", URLEncoder.encode(query[0].toString(),"UTF-8")); + } catch (UnsupportedEncodingException e) {} + prop.put("content_former", theQuery.queryString); + prop.put("content_rankingprops", result.word().toPropertyForm() + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(result.hash()) + + ((plasmaURL.probablyRootURL(result.hash())) ? ", probablyRootURL" : "") + + (((wordURL = plasmaURL.probablyWordURL(result.hash(), query[0])) != null) ? ", probablyWordURL=" + wordURL.toNormalform(false, true) : "")); + + /* + // adding snippet if available + if (result.hasSnippet()) { + prop.put("content_snippet", result.textSnippet().getLineMarked(theQuery.queryHashes)); + } else { + // snippet fetch timeout + int textsnippet_timeout = Integer.parseInt(env.getConfig("timeout_media", "10000")); + + // boolean line_end_with_punctuation + boolean pre = post.get("pre", "false").equals("true"); + + // if 'remove' is set to true, then RWI references to URLs that do not have the snippet are removed + boolean remove = post.get("remove", "false").equals("true"); + + plasmaSnippetCache.TextSnippet snippet = plasmaSnippetCache.retrieveTextSnippet( + result.url(), + theQuery.queryHashes, + true, + pre, + 260, + textsnippet_timeout + ); + + if (snippet.getErrorCode() < 11) { + // no problems occurred + //prop.put("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); + prop.putASIS("content_snippet", (snippet.exists()) ? snippet.getLineMarked(theQuery.queryHashes) : "unknown"); + } else { + // problems with snippet fetch + prop.put("content_snippet", (remove) ? plasmaSnippetCache.failConsequences(snippet, theQuery.id()) : snippet.getError()); + } + } + */ + prop.put("content_snippet","temporary no snippet computed"); + return prop; + } + +} diff --git a/source/de/anomic/data/URLLicense.java b/source/de/anomic/data/URLLicense.java index a0b8eeb72..b7ed0e132 100644 --- a/source/de/anomic/data/URLLicense.java +++ b/source/de/anomic/data/URLLicense.java @@ -64,11 +64,13 @@ public class URLLicense { synchronized (permissions) { url = (URL) permissions.remove(license); } + /* if (url == null) { System.out.println("DEBUG-URLLICENSE: no URL license present for code=" + license); } else { System.out.println("DEBUG-URLLICENSE: granted download of " + url.toString()); } + */ return url; } diff --git a/source/de/anomic/http/httpdFileHandler.java b/source/de/anomic/http/httpdFileHandler.java index 7f30abd69..680172bf3 100644 --- a/source/de/anomic/http/httpdFileHandler.java +++ b/source/de/anomic/http/httpdFileHandler.java @@ -900,7 +900,7 @@ public final class httpdFileHandler { try {out.flush();}catch (Exception e) {} if (((String)requestHeader.get(httpHeader.CONNECTION, "close")).indexOf("keep-alive") == -1) { // wait a little time until everything closes so that clients can read from the streams/sockets - try {Thread.sleep(200);} catch (InterruptedException e) {} + //try {Thread.sleep(200);} catch (InterruptedException e) {} // FIXME: is this necessary? } } } diff --git a/source/de/anomic/index/indexCollectionRI.java b/source/de/anomic/index/indexCollectionRI.java index c52c8aa35..072d6da15 100644 --- a/source/de/anomic/index/indexCollectionRI.java +++ b/source/de/anomic/index/indexCollectionRI.java @@ -1,6 +1,6 @@ // indexCollectionRI.java -// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany -// first published 03.07.2006 on http://www.anomic.de +// (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 03.07.2006 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 656671776..529e7be07 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -26,19 +26,28 @@ package de.anomic.plasma; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; +import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.TreeMap; +import java.util.TreeSet; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexURLEntry; +import de.anomic.kelondro.kelondroBitfield; +import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMSetTools; +import de.anomic.net.URL; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacyDHTAction; import de.anomic.yacy.yacySearch; +import de.anomic.yacy.yacySeed; public final class plasmaSearchEvent { @@ -62,13 +71,18 @@ public final class plasmaSearchEvent { private int lastglobal; private int filteredCount; private ArrayList display; // an array of url hashes of urls that had been displayed as search result after this search + private Object[] references; + public TreeMap IAResults, IACount; + public String IAmaxcounthash, IAneardhthash; private plasmaSearchEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, plasmaSearchProcessing localTiming, plasmaSearchProcessing remoteTiming, plasmaWordIndex wordIndex, - TreeMap preselectedPeerHashes) { + TreeMap preselectedPeerHashes, + boolean generateAbstracts, + TreeSet abstractSet) { this.eventTime = System.currentTimeMillis(); // for lifetime check this.wordIndex = wordIndex; this.query = query; @@ -86,6 +100,11 @@ public final class plasmaSearchEvent { this.sortedResults = null; this.lastglobal = 0; this.display = new ArrayList(); + this.references = new String[0]; + this.IAResults = new TreeMap(); + this.IACount = new TreeMap(); + this.IAmaxcounthash = null; + this.IAneardhthash = null; long start = System.currentTimeMillis(); if ((query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) || @@ -208,6 +227,33 @@ public final class plasmaSearchEvent { } else { Map[] searchContainerMaps = profileLocal.localSearchContainers(query, wordIndex, null); + if (generateAbstracts) { + // compute index abstracts + Iterator ci = searchContainerMaps[0].entrySet().iterator(); + Map.Entry entry; + int maxcount = -1; + double mindhtdistance = 1.1, d; + String wordhash; + while (ci.hasNext()) { + entry = (Map.Entry) ci.next(); + wordhash = (String) entry.getKey(); + indexContainer container = (indexContainer) entry.getValue(); + assert (container.getWordHash().equals(wordhash)); + if (container.size() > maxcount) { + IAmaxcounthash = wordhash; + maxcount = container.size(); + } + d = yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhash); + if (d < mindhtdistance) { + // calculate the word hash that is closest to our dht position + mindhtdistance = d; + IAneardhthash = wordhash; + } + IACount.put(wordhash, new Integer(container.size())); + IAResults.put(wordhash, plasmaURL.compressIndex(container, null, 1000).toString()); + } + } + rcLocal = (searchContainerMaps == null) ? plasmaWordIndex.emptyContainer(null, 0) : @@ -274,10 +320,12 @@ public final class plasmaSearchEvent { plasmaSearchProcessing localTiming, plasmaSearchProcessing remoteTiming, plasmaWordIndex wordIndex, - TreeMap preselectedPeerHashes) { + TreeMap preselectedPeerHashes, + boolean generateAbstracts, + TreeSet abstractSet) { plasmaSearchEvent event = (plasmaSearchEvent) lastEvents.get(query.id()); if (event == null) { - event = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, wordIndex, preselectedPeerHashes); + event = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, wordIndex, preselectedPeerHashes, generateAbstracts, abstractSet); } else { //re-new the event time for this event, so it is not deleted next time too early event.eventTime = System.currentTimeMillis(); @@ -285,7 +333,7 @@ public final class plasmaSearchEvent { return event; } - public indexContainer search() { + private indexContainer search() { // combine the local and global (if any) result and order if ((rcGlobal != null) && (rcGlobal.size() > 0)) { globalcount = rcGlobal.size(); @@ -310,6 +358,136 @@ public final class plasmaSearchEvent { return this.sortedResults; } + + public ArrayList computeResults( + TreeSet blueList, + boolean overfetch) { + + indexContainer pre = search(); + final ArrayList hits = new ArrayList(); + + // start url-fetch + final long postorderTime = this.profileLocal.getTargetTime(plasmaSearchProcessing.PROCESS_POSTSORT); + //System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime); + final long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime); + this.profileLocal.startTimer(); + final plasmaSearchPostOrder acc = new plasmaSearchPostOrder(query, ranking); + + indexRWIEntry rwientry; + indexURLEntry page; + indexURLEntry.Components comp; + String pagetitle, pageurl, pageauthor; + final int minEntries = this.profileLocal.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT); + try { + ordering: for (int i = 0; i < pre.size(); i++) { + if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= ((overfetch) ? 4 : 1) * minEntries)) break; + rwientry = new indexRWIEntry(pre.get(i)); + // load only urls if there was not yet a root url of that hash + // find the url entry + page = wordIndex.loadedURL.load(rwientry.urlHash(), rwientry); + if (page != null) { + comp = page.comp(); + pagetitle = comp.title().toLowerCase(); + if (comp.url() == null) continue ordering; // rare case where the url is corrupted + pageurl = comp.url().toString().toLowerCase(); + pageauthor = comp.author().toLowerCase(); + + // check exclusion + if (plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) continue ordering; + if (plasmaSearchQuery.matches(pageurl, query.excludeHashes)) continue ordering; + if (plasmaSearchQuery.matches(pageauthor, query.excludeHashes)) continue ordering; + + // check url mask + if (!(pageurl.matches(query.urlMask))) continue ordering; + + // check constraints + if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && + (query.constraint.get(plasmaCondenser.flag_cat_indexof)) && + (!(comp.title().startsWith("Index of")))) { + serverLog.logFine("PLASMA", "filtered out " + comp.url().toString()); + // filter out bad results + final Iterator wi = query.queryHashes.iterator(); + while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash()); + } else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page); + } else { + acc.addPage(page); + } + } + } + } catch (final kelondroException ee) { + serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); + } + this.profileLocal.setYieldTime(plasmaSearchProcessing.PROCESS_URLFETCH); + this.profileLocal.setYieldCount(plasmaSearchProcessing.PROCESS_URLFETCH, acc.sizeFetched()); + + // start postsorting + this.profileLocal.startTimer(); + acc.sortPages(true); + this.profileLocal.setYieldTime(plasmaSearchProcessing.PROCESS_POSTSORT); + this.profileLocal.setYieldCount(plasmaSearchProcessing.PROCESS_POSTSORT, acc.sizeOrdered()); + + + // apply filter + this.profileLocal.startTimer(); + acc.removeRedundant(); + this.profileLocal.setYieldTime(plasmaSearchProcessing.PROCESS_FILTER); + this.profileLocal.setYieldCount(plasmaSearchProcessing.PROCESS_FILTER, acc.sizeOrdered()); + + // generate references + this.references = acc.getReferences(16); + + // generate Result.Entry objects and optionally fetch snippets + int i = 0; + Entry entry; + final boolean includeSnippets = false; + while ((acc.hasMoreElements()) && (i < query.wantedResults)) { + try { + entry = new Entry(acc.nextElement(), wordIndex); + } catch (final RuntimeException e) { + continue; + } + // check bluelist again: filter out all links where any + // bluelisted word + // appear either in url, url's description or search word + // the search word was sorted out earlier + /* + * String s = descr.toLowerCase() + url.toString().toLowerCase(); + * for (int c = 0; c < blueList.length; c++) { if + * (s.indexOf(blueList[c]) >= 0) return; } + */ + if (includeSnippets) { + entry.setSnippet(plasmaSnippetCache.retrieveTextSnippet( + entry.url(), query.queryHashes, false, + entry.flags().get(plasmaCondenser.flag_cat_indexof), 260, + 1000)); + // snippet = + // snippetCache.retrieveTextSnippet(comp.url(), + // query.queryHashes, false, + // urlentry.flags().get(plasmaCondenser.flag_cat_indexof), + // 260, 1000); + } else { + // snippet = null; + entry.setSnippet(null); + } + i++; + hits.add(entry); + } + + /* + * while ((acc.hasMoreElements()) && (((time + timestamp) < + * System.currentTimeMillis()))) { urlentry = acc.nextElement(); + * urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); + * descr = urlentry.descr(); + * + * addScoreForked(ref, gs, descr.split(" ")); addScoreForked(ref, gs, + * urlstring.split("/")); } + */ + return hits; + } public int filteredCount() { return this.filteredCount; @@ -418,4 +596,98 @@ public final class plasmaSearchEvent { this.display.set(position, urlhash); } + public Object[] references() { + return this.references; + } + + public static class Entry { + private indexURLEntry urlentry; + private indexURLEntry.Components urlcomps; // buffer for components + private String alternative_urlstring; + private String alternative_urlname; + private plasmaSnippetCache.TextSnippet snippet; + + public Entry(indexURLEntry urlentry, plasmaWordIndex wordIndex) { + this.urlentry = urlentry; + this.urlcomps = urlentry.comp(); + this.alternative_urlstring = null; + this.alternative_urlname = null; + this.snippet = null; + String host = urlcomps.url().getHost(); + if (host.endsWith(".yacyh")) { + // translate host into current IP + int p = host.indexOf("."); + String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6)); + yacySeed seed = yacyCore.seedDB.getConnected(hash); + String filename = urlcomps.url().getFile(); + String address = null; + if ((seed == null) || ((address = seed.getPublicAddress()) == null)) { + // seed is not known from here + try { + wordIndex.removeWordReferences( + plasmaCondenser.getWords( + ("yacyshare " + + filename.replace('?', ' ') + + " " + + urlcomps.title()).getBytes(), "UTF-8").keySet(), + urlentry.hash()); + wordIndex.loadedURL.remove(urlentry.hash()); // clean up + throw new RuntimeException("index void"); + } catch (UnsupportedEncodingException e) { + throw new RuntimeException("parser failed: " + e.getMessage()); + } + } + alternative_urlstring = "http://" + address + "/" + host.substring(0, p) + filename; + alternative_urlname = "http://share." + seed.getName() + ".yacy" + filename; + if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p); + } + } + public String hash() { + return urlentry.hash(); + } + public URL url() { + return urlcomps.url(); + } + public kelondroBitfield flags() { + return urlentry.flags(); + } + public String urlstring() { + return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring; + } + public String urlname() { + return (alternative_urlname == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlname; + } + public String title() { + return urlcomps.title(); + } + public void setSnippet(plasmaSnippetCache.TextSnippet snippet) { + this.snippet = snippet; + } + public plasmaSnippetCache.TextSnippet snippet() { + return this.snippet; + } + public Date modified() { + return urlentry.moddate(); + } + public int filesize() { + return urlentry.size(); + } + public indexRWIEntry word() { + return urlentry.word(); + } + public boolean hasSnippet() { + return false; + } + public plasmaSnippetCache.TextSnippet textSnippet() { + return null; + } + public String resource() { + // generate transport resource + if ((snippet != null) && (snippet.exists())) { + return urlentry.toString(snippet.getLineRaw()); + } else { + return urlentry.toString(); + } + } + } } diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 7a810db5f..db7543773 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -191,6 +191,10 @@ public final class plasmaSearchQuery { return this.queryString; } + public TreeSet[] queryWords() { + return cleanQuery(this.queryString); + } + public void filterOut(Set blueList) { // filter out words that appear in this set // this is applied to the queryHashes diff --git a/source/de/anomic/plasma/plasmaSearchResultAccumulator.java b/source/de/anomic/plasma/plasmaSearchResultAccumulator.java deleted file mode 100644 index 4378a781f..000000000 --- a/source/de/anomic/plasma/plasmaSearchResultAccumulator.java +++ /dev/null @@ -1,296 +0,0 @@ -// plasmaSearchResultAccumulator.java -// (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany -// first published 15.08.2007 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ -// $LastChangedRevision: 1986 $ -// $LastChangedBy: orbiter $ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - - -package de.anomic.plasma; - -import java.io.UnsupportedEncodingException; -import java.util.ArrayList; -import java.util.Date; -import java.util.Iterator; -import java.util.TreeSet; - -import de.anomic.index.indexContainer; -import de.anomic.index.indexRWIEntry; -import de.anomic.index.indexURLEntry; -import de.anomic.kelondro.kelondroBitfield; -import de.anomic.kelondro.kelondroException; -import de.anomic.net.URL; -import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacyCore; -import de.anomic.yacy.yacySeed; - -public class plasmaSearchResultAccumulator { - - private ArrayList hits; - private Object[] references; - - public plasmaSearchResultAccumulator( - plasmaSearchQuery theQuery, - plasmaSearchProcessing process, - plasmaSearchRankingProfile ranking, - indexContainer pre, - plasmaWordIndex wordIndex, - TreeSet blueList, - boolean overfetch) { - - hits = new ArrayList(); - - // start url-fetch - long postorderTime = process.getTargetTime(plasmaSearchProcessing.PROCESS_POSTSORT); - //System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime); - long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime); - process.startTimer(); - plasmaSearchPostOrder acc = new plasmaSearchPostOrder(theQuery, ranking); - - indexRWIEntry rwientry; - indexURLEntry page; - indexURLEntry.Components comp; - String pagetitle, pageurl, pageauthor; - int minEntries = process.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT); - try { - ordering: for (int i = 0; i < pre.size(); i++) { - if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= ((overfetch) ? 4 : 1) * minEntries)) break; - rwientry = new indexRWIEntry(pre.get(i)); - // load only urls if there was not yet a root url of that hash - // find the url entry - page = wordIndex.loadedURL.load(rwientry.urlHash(), rwientry); - if (page != null) { - comp = page.comp(); - pagetitle = comp.title().toLowerCase(); - if (comp.url() == null) continue ordering; // rare case where the url is corrupted - pageurl = comp.url().toString().toLowerCase(); - pageauthor = comp.author().toLowerCase(); - - // check exclusion - if (plasmaSearchQuery.matches(pagetitle, theQuery.excludeHashes)) continue ordering; - if (plasmaSearchQuery.matches(pageurl, theQuery.excludeHashes)) continue ordering; - if (plasmaSearchQuery.matches(pageauthor, theQuery.excludeHashes)) continue ordering; - - // check url mask - if (!(pageurl.matches(theQuery.urlMask))) continue ordering; - - // check constraints - if ((!(theQuery.constraint.equals(plasmaSearchQuery.catchall_constraint))) && - (theQuery.constraint.get(plasmaCondenser.flag_cat_indexof)) && - (!(comp.title().startsWith("Index of")))) { - serverLog.logFine("PLASMA", "filtered out " + comp.url().toString()); - // filter out bad results - Iterator wi = theQuery.queryHashes.iterator(); - while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash()); - } else if (theQuery.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { - if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page); - else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page); - else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page); - else if ((theQuery.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page); - } else { - acc.addPage(page); - } - } - } - } catch (kelondroException ee) { - serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); - } - process.setYieldTime(plasmaSearchProcessing.PROCESS_URLFETCH); - process.setYieldCount(plasmaSearchProcessing.PROCESS_URLFETCH, acc.sizeFetched()); - - // start postsorting - process.startTimer(); - acc.sortPages(true); - process.setYieldTime(plasmaSearchProcessing.PROCESS_POSTSORT); - process.setYieldCount(plasmaSearchProcessing.PROCESS_POSTSORT, acc.sizeOrdered()); - - - // apply filter - process.startTimer(); - acc.removeRedundant(); - process.setYieldTime(plasmaSearchProcessing.PROCESS_FILTER); - process.setYieldCount(plasmaSearchProcessing.PROCESS_FILTER, acc.sizeOrdered()); - - // generate references - references = acc.getReferences(16); - - // generate Result.Entry objects and optionally fetch snippets - int i = 0; - Entry entry; - boolean includeSnippets = false; - while ((acc.hasMoreElements()) && (i < theQuery.wantedResults)) { - try { - entry = new Entry(acc.nextElement(), wordIndex); - } catch (RuntimeException e) { - continue; - } - // check bluelist again: filter out all links where any - // bluelisted word - // appear either in url, url's description or search word - // the search word was sorted out earlier - /* - * String s = descr.toLowerCase() + url.toString().toLowerCase(); - * for (int c = 0; c < blueList.length; c++) { if - * (s.indexOf(blueList[c]) >= 0) return; } - */ - if (includeSnippets) { - entry.setSnippet(plasmaSnippetCache.retrieveTextSnippet( - entry.url(), theQuery.queryHashes, false, - entry.flags().get(plasmaCondenser.flag_cat_indexof), 260, - 1000)); - // snippet = - // snippetCache.retrieveTextSnippet(comp.url(), - // query.queryHashes, false, - // urlentry.flags().get(plasmaCondenser.flag_cat_indexof), - // 260, 1000); - } else { - // snippet = null; - entry.setSnippet(null); - } - i++; - hits.add(entry); - } - - /* - * while ((acc.hasMoreElements()) && (((time + timestamp) < - * System.currentTimeMillis()))) { urlentry = acc.nextElement(); - * urlstring = htmlFilterContentScraper.urlNormalform(urlentry.url()); - * descr = urlentry.descr(); - * - * addScoreForked(ref, gs, descr.split(" ")); addScoreForked(ref, gs, - * urlstring.split("/")); } - */ - - } - - - // filter - public void applyFilter( - plasmaSearchPostOrder acc) { - - - } - - public int resultCount() { - return hits.size(); - } - - public Entry resultEntry(int i) { - return (Entry) hits.get(i); - } - - public Object[] references() { - return this.references; - } - - public static class Entry { - private indexURLEntry urlentry; - private indexURLEntry.Components urlcomps; // buffer for components - private String alternative_urlstring; - private String alternative_urlname; - private plasmaSnippetCache.TextSnippet snippet; - - public Entry(indexURLEntry urlentry, plasmaWordIndex wordIndex) { - this.urlentry = urlentry; - this.urlcomps = urlentry.comp(); - this.alternative_urlstring = null; - this.alternative_urlname = null; - this.snippet = null; - String host = urlcomps.url().getHost(); - if (host.endsWith(".yacyh")) { - // translate host into current IP - int p = host.indexOf("."); - String hash = yacySeed.hexHash2b64Hash(host.substring(p + 1, host.length() - 6)); - yacySeed seed = yacyCore.seedDB.getConnected(hash); - String filename = urlcomps.url().getFile(); - String address = null; - if ((seed == null) || ((address = seed.getPublicAddress()) == null)) { - // seed is not known from here - try { - wordIndex.removeWordReferences( - plasmaCondenser.getWords( - ("yacyshare " + - filename.replace('?', ' ') + - " " + - urlcomps.title()).getBytes(), "UTF-8").keySet(), - urlentry.hash()); - wordIndex.loadedURL.remove(urlentry.hash()); // clean up - throw new RuntimeException("index void"); - } catch (UnsupportedEncodingException e) { - throw new RuntimeException("parser failed: " + e.getMessage()); - } - } - alternative_urlstring = "http://" + address + "/" + host.substring(0, p) + filename; - alternative_urlname = "http://share." + seed.getName() + ".yacy" + filename; - if ((p = alternative_urlname.indexOf("?")) > 0) alternative_urlname = alternative_urlname.substring(0, p); - } - } - public String hash() { - return urlentry.hash(); - } - public URL url() { - return urlcomps.url(); - } - public kelondroBitfield flags() { - return urlentry.flags(); - } - public String urlstring() { - return (alternative_urlstring == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlstring; - } - public String urlname() { - return (alternative_urlname == null) ? urlcomps.url().toNormalform(false, true) : alternative_urlname; - } - public String title() { - return urlcomps.title(); - } - public void setSnippet(plasmaSnippetCache.TextSnippet snippet) { - this.snippet = snippet; - } - public plasmaSnippetCache.TextSnippet snippet() { - return this.snippet; - } - public Date modified() { - return urlentry.moddate(); - } - public int filesize() { - return urlentry.size(); - } - public indexRWIEntry word() { - return urlentry.word(); - } - public boolean hasSnippet() { - return false; - } - public plasmaSnippetCache.TextSnippet textSnippet() { - return null; - } - public String resource() { - // generate transport resource - if ((snippet != null) && (snippet.exists())) { - return urlentry.toString(snippet.getLineRaw()); - } else { - return urlentry.toString(); - } - } - } - -}