From 35039982daabef05c1cb9040cd242d05eac69e3a Mon Sep 17 00:00:00 2001 From: allo Date: Thu, 18 Jan 2007 07:41:15 +0000 Subject: [PATCH] refactoring of search process: store results in a searchResults structure. At the moment, its just stored in it, and read from it again. Next step: return searchResults instead of serverObjects, and parse the results in the servlets. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3241 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- ChangeLog | 2 + source/de/anomic/data/searchResults.java | 149 ++++++++++++++++++ .../de/anomic/plasma/plasmaSwitchboard.java | 86 ++++++---- 3 files changed, 206 insertions(+), 31 deletions(-) create mode 100644 source/de/anomic/data/searchResults.java diff --git a/ChangeLog b/ChangeLog index bd603e586..f7e74e4f6 100644 --- a/ChangeLog +++ b/ChangeLog @@ -5,6 +5,8 @@ version 0.50 * UPDATED: Result Ranking * UPDATED: Crawl Monitor * CHANGED: Migrated to the new Database Structure + * ADDED: XSS protection for all pages as default. + * ADDED: searchResults structure. version 0.49 * CHANGED: New Database Structure for Index and URL Storage diff --git a/source/de/anomic/data/searchResults.java b/source/de/anomic/data/searchResults.java new file mode 100644 index 000000000..7c95d9bc4 --- /dev/null +++ b/source/de/anomic/data/searchResults.java @@ -0,0 +1,149 @@ +//plasmaSearchResults.java - a container for searchresults. +//---------------------------------------------------------- +//part of YaCy +// +// (C) 2007 by Alexander Schier +// +// last change: $LastChangedDate: $ by $LastChangedBy: $ +// $LastChangedRevision: $ +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.data; +import java.util.ArrayList; +import java.util.Date; + +import de.anomic.index.indexURLEntry; +import de.anomic.plasma.plasmaSearchQuery; +import de.anomic.plasma.plasmaSearchRankingProfile; +import de.anomic.plasma.plasmaSnippetCache; + +public class searchResults { + private int totalcount=0; + private int filteredcount=0; + private int orderedcount=0; + private int linkcount=0; + private int globalresults=0; + private plasmaSearchRankingProfile ranking=null; + private String formerSearch=""; + private plasmaSearchQuery query=null; + private ArrayList results=null; + + public searchResults(){ + this.results=new ArrayList(); + } + public searchResults(int totalcount, int filteredcount, int orderedcount, int linkcount){ + this.results=new ArrayList(); + this.totalcount=totalcount; + this.filteredcount=filteredcount; + this.orderedcount=orderedcount; + this.linkcount=linkcount; + } + public void appendResult(searchResult result){ + results.add(result); + } + public void setTotalcount(int totalcount) { + this.totalcount = totalcount; + } + public int getTotalcount() { + return totalcount; + } + public void setFilteredcount(int filteredcount) { + this.filteredcount = filteredcount; + } + public int getFilteredcount() { + return filteredcount; + } + public void setOrderedcount(int orderedcount) { + this.orderedcount = orderedcount; + } + public int getOrderedcount() { + return orderedcount; + } + public void setLinkcount(int linkcount) { + this.linkcount = linkcount; + } + public int getLinkcount() { + return linkcount; + } + public void setGlobalresults(int globalresults) { + this.globalresults = globalresults; + } + public int getGlobalresults() { + return globalresults; + } + public void setRanking(plasmaSearchRankingProfile ranking) { + this.ranking = ranking; + } + public plasmaSearchRankingProfile getRanking() { + return ranking; + } + public searchResult createSearchResult(){ + return new searchResult(); + } + public void setFormerSearch(String formerSearch) { + this.formerSearch = formerSearch; + } + public String getFormerSearch() { + return formerSearch; + } + public void setQuery(plasmaSearchQuery query) { + this.query = query; + } + public plasmaSearchQuery getQuery() { + return query; + } + public class searchResult{ + private String url=""; + private String urlname=""; + private plasmaSnippetCache.TextSnippet snippet=null; + private indexURLEntry urlentry=null; + + public searchResult(){ + + } + + public void setUrl(String url) { + this.url = url; + } + public String getUrl() { + return url; + } + public void setUrlname(String urlname) { + this.urlname = urlname; + } + public String getUrlname() { + return urlname; + } + public void setSnippet(plasmaSnippetCache.TextSnippet snippet) { + this.snippet = snippet; + } + public plasmaSnippetCache.TextSnippet getSnippet() { + return snippet; + } + public void setUrlentry(indexURLEntry urlentry) { + this.urlentry = urlentry; + } + public indexURLEntry getUrlentry() { + return urlentry; + } + public String getUrlhash(){ + return urlentry.hash(); + } + public boolean hasSnippet(){ + return this.snippet!=null && this.snippet.exists(); + } + } +} diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index b33548d8b..a654f94ee 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -126,6 +126,7 @@ import de.anomic.data.blogBoard; import de.anomic.data.bookmarksDB; import de.anomic.data.listManager; import de.anomic.data.messageBoard; +import de.anomic.data.searchResults; import de.anomic.data.userDB; import de.anomic.data.wikiBoard; import de.anomic.htmlFilter.htmlFilterContentScraper; @@ -2090,10 +2091,15 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser intermissionAllThreads(2 * query.maximumTime); serverObjects prop = new serverObjects(); + searchResults results=new searchResults(); + results.setRanking(ranking); + results.setQuery(query); + results.setFormerSearch(""); try { // filter out words that appear in bluelist //log.logInfo("E"); query.filterOut(blueList); + results.setQuery(query); // log log.logInfo("INIT WORD SEARCH: " + query.queryWords + ":" + query.queryHashes + " - " + query.wantedResults + " links, " + (query.maximumTime / 1000) + " seconds"); @@ -2116,26 +2122,35 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // result is a List of urlEntry elements: prepare answer if (acc == null) { - prop.put("num-results_totalcount", 0); - prop.put("num-results_filteredcount", 0); - prop.put("num-results_orderedcount", 0); - prop.put("num-results_linkcount", 0); + results.setTotalcount(0); + results.setFilteredcount(0); + results.setOrderedcount(0); + results.setLinkcount(0); + prop.put("num-results_totalcount", results.getTotalcount()); + prop.put("num-results_filteredcount", results.getFilteredcount()); + prop.put("num-results_orderedcount", results.getOrderedcount()); + prop.put("num-results_linkcount", results.getLinkcount()); prop.put("references", 0); prop.put("type_results", 0); } else { - prop.put("num-results_totalcount", acc.globalContributions + acc.localContributions); - prop.put("num-results_filteredcount", acc.filteredResults); - prop.put("num-results_orderedcount", Integer.toString(acc.sizeOrdered())); - prop.put("num-results_globalresults", acc.globalContributions); + results.setTotalcount(acc.globalContributions + acc.localContributions); + results.setFilteredcount(acc.filteredResults); + results.setOrderedcount(acc.sizeOrdered()); + results.setGlobalresults(acc.globalContributions); + results.setRanking(ranking); + + prop.put("num-results_totalcount", results.getTotalcount()); + prop.put("num-results_filteredcount", results.getFilteredcount()); + prop.put("num-results_orderedcount", Integer.toString(results.getOrderedcount())); //why toString? + prop.put("num-results_globalresults", results.getGlobalresults()); int i = 0; int p; indexURLEntry urlentry; String urlstring, urlname, filename, urlhash; String host, hash, address; yacySeed seed; - plasmaSnippetCache.TextSnippet snippet; boolean includeSnippets = false; - String formerSearch = query.words(" "); + results.setFormerSearch(query.words(" ")); long targetTime = timestamp + query.maximumTime; if (targetTime < System.currentTimeMillis()) targetTime = System.currentTimeMillis() + 1000; while ((acc.hasMoreElements()) && (i < query.wantedResults) && (System.currentTimeMillis() < targetTime)) { @@ -2178,42 +2193,51 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //addScoreForked(ref, gs, descr.split(" ")); //addScoreForked(ref, gs, urlstring.split("/")); URL wordURL; - if (urlstring.matches(query.urlMask)) { //.* is default + searchResults.searchResult result=results.createSearchResult(); + result.setUrl(urlstring); + result.setUrlname(urlname); + result.setUrlentry(urlentry); + if (urlstring.matches(results.getQuery().urlMask)) { //.* is default if (includeSnippets) { - snippet = snippetCache.retrieveTextSnippet(comp.url(), query.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000); + result.setSnippet(snippetCache.retrieveTextSnippet(comp.url(), results.getQuery().queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000)); + //snippet = snippetCache.retrieveTextSnippet(comp.url(), query.queryHashes, false, urlentry.flags().get(plasmaCondenser.flag_cat_indexof), 260, 1000); } else { - snippet = null; + //snippet = null; + result.setSnippet(null); } /* if ((snippet != null) && (snippet.getSource() == plasmaSnippetCache.ERROR_NO_MATCH)) { // suppress line: there is no match in that resource } else {*/ - prop.put("type_results_" + i + "_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", urlstring) == null) ? 1 : 0); - prop.put("type_results_" + i + "_authorized_recommend_deletelink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&deleteref=" + urlhash + "&urlmaskfilter=.*"); - prop.put("type_results_" + i + "_authorized_recommend_recommendlink", "/yacysearch.html?search=" + formerSearch + "&Enter=Search&count=" + query.wantedResults + "&order=" + crypt.simpleEncode(ranking.toExternalString()) + "&resource=local&time=3&recommendref=" + urlhash + "&urlmaskfilter=.*"); - prop.put("type_results_" + i + "_authorized_urlhash", urlhash); - prop.put("type_results_" + i + "_description", comp.descr()); - prop.put("type_results_" + i + "_url", urlstring); - prop.put("type_results_" + i + "_urlhash", urlhash); - prop.put("type_results_" + i + "_urlhexhash", yacySeed.b64Hash2hexHash(urlhash)); - prop.put("type_results_" + i + "_urlname", nxTools.shortenURLString(urlname, 120)); - prop.put("type_results_" + i + "_date", dateString(urlentry.moddate())); - prop.put("type_results_" + i + "_ybr", plasmaSearchPreOrder.ybr(urlentry.hash())); - prop.put("type_results_" + i + "_size", Long.toString(urlentry.size())); - prop.put("type_results_" + i + "_words", URLEncoder.encode(query.queryWords.toString(),"UTF-8")); - prop.put("type_results_" + i + "_former", formerSearch); - prop.put("type_results_" + i + "_rankingprops", urlentry.word().toPropertyForm() + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(urlhash) + - ((plasmaURL.probablyRootURL(urlhash)) ? ", probablyRootURL" : "") + + + + prop.put("type_results_" + i + "_authorized_recommend", (yacyCore.newsPool.getSpecific(yacyNewsPool.OUTGOING_DB, "stippadd", "url", result.getUrl()) == null) ? 1 : 0); + prop.put("type_results_" + i + "_authorized_recommend_deletelink", "/yacysearch.html?search=" + results.getFormerSearch() + "&Enter=Search&count=" + results.getQuery().wantedResults + "&order=" + crypt.simpleEncode(results.getRanking().toExternalString()) + "&resource=local&time=3&deleteref=" + result.getUrlhash() + "&urlmaskfilter=.*"); + prop.put("type_results_" + i + "_authorized_recommend_recommendlink", "/yacysearch.html?search=" + results.getFormerSearch() + "&Enter=Search&count=" + results.getQuery().wantedResults + "&order=" + crypt.simpleEncode(results.getRanking().toExternalString()) + "&resource=local&time=3&recommendref=" + result.getUrlhash() + "&urlmaskfilter=.*"); + prop.put("type_results_" + i + "_authorized_urlhash", result.getUrlhash()); + prop.put("type_results_" + i + "_description", result.getUrlentry().comp().descr()); + prop.put("type_results_" + i + "_url", result.getUrl()); + prop.put("type_results_" + i + "_urlhash", result.getUrlhash()); + prop.put("type_results_" + i + "_urlhexhash", yacySeed.b64Hash2hexHash(result.getUrlhash())); + prop.put("type_results_" + i + "_urlname", nxTools.shortenURLString(result.getUrlname(), 120)); + prop.put("type_results_" + i + "_date", dateString(result.getUrlentry().moddate())); + prop.put("type_results_" + i + "_ybr", plasmaSearchPreOrder.ybr(result.getUrlentry().hash())); + prop.put("type_results_" + i + "_size", Long.toString(result.getUrlentry().size())); + prop.put("type_results_" + i + "_words", URLEncoder.encode(results.getQuery().queryWords.toString(),"UTF-8")); + prop.put("type_results_" + i + "_former", results.getFormerSearch()); + prop.put("type_results_" + i + "_rankingprops", result.getUrlentry().word().toPropertyForm() + ", domLengthEstimated=" + plasmaURL.domLengthEstimation(result.getUrlhash()) + + ((plasmaURL.probablyRootURL(result.getUrlhash())) ? ", probablyRootURL" : "") + (((wordURL = plasmaURL.probablyWordURL(urlhash, query.words(""))) != null) ? ", probablyWordURL=" + wordURL.toNormalform() : "")); // adding snippet if available - if ((snippet != null) && (snippet.exists())) { + if (result.hasSnippet()) { prop.put("type_results_" + i + "_snippet", 1); - prop.putASIS("type_results_" + i + "_snippet_text", snippet.getLineMarked(query.queryHashes));//FIXME: the ASIS should not be needed, if there is no html in .java + prop.putASIS("type_results_" + i + "_snippet_text", result.getSnippet().getLineMarked(query.queryHashes));//FIXME: the ASIS should not be needed, if there is no html in .java } else { prop.put("type_results_" + i + "_snippet", 0); prop.put("type_results_" + i + "_snippet_text", ""); } i++; + results.appendResult(result); //} } }