From 5605887571318ecd8b0164d31a08e7dbfc44d6ea Mon Sep 17 00:00:00 2001 From: orbiter Date: Sun, 5 Aug 2007 23:57:25 +0000 Subject: [PATCH] refactoring of search processes git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@4030 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/PerformanceSearch_p.java | 26 +- htroot/yacy/search.java | 35 +- htroot/yacysearch.java | 6 +- .../de/anomic/plasma/plasmaSearchEvent.java | 206 ++------- .../anomic/plasma/plasmaSearchProcessing.java | 435 ++++++++++++++++++ .../plasma/plasmaSearchTimingProfile.java | 282 ------------ .../de/anomic/plasma/plasmaSwitchboard.java | 8 +- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- source/de/anomic/xml/crawlHandler.java | 3 +- source/de/anomic/yacy/yacyClient.java | 8 +- source/de/anomic/yacy/yacySearch.java | 14 +- 11 files changed, 529 insertions(+), 496 deletions(-) create mode 100644 source/de/anomic/plasma/plasmaSearchProcessing.java delete mode 100644 source/de/anomic/plasma/plasmaSearchTimingProfile.java diff --git a/htroot/PerformanceSearch_p.java b/htroot/PerformanceSearch_p.java index 4c1fa75d7..c57667c40 100644 --- a/htroot/PerformanceSearch_p.java +++ b/htroot/PerformanceSearch_p.java @@ -48,7 +48,7 @@ import java.util.Map; import de.anomic.http.httpHeader; import de.anomic.plasma.plasmaSearchEvent; -import de.anomic.plasma.plasmaSearchTimingProfile; +import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverFileUtils; import de.anomic.server.serverObjects; @@ -70,14 +70,14 @@ public class PerformanceSearch_p { if (post.containsKey("submitlocalprofilecustom")) { // first count percentages int c = 0; - for (int i = 0; i < plasmaSearchTimingProfile.sequence.length; i++) { - c += post.getInt("searchProcessLocalTime_" + plasmaSearchTimingProfile.sequence[i], 0); + for (int i = 0; i < plasmaSearchProcessing.sequence.length; i++) { + c += post.getInt("searchProcessLocalTime_" + plasmaSearchProcessing.sequence[i], 
0); } // if check is ok set new values if (c == 100) { - for (int i = 0; i < plasmaSearchTimingProfile.sequence.length; i++) { - sb.setConfig("searchProcessLocalTime_" + plasmaSearchTimingProfile.sequence[i], post.get("searchProcessLocalTime_" + plasmaSearchTimingProfile.sequence[i], "")); - sb.setConfig("searchProcessLocalCount_" + plasmaSearchTimingProfile.sequence[i], post.get("searchProcessLocalCount_" + plasmaSearchTimingProfile.sequence[i], "")); + for (int i = 0; i < plasmaSearchProcessing.sequence.length; i++) { + sb.setConfig("searchProcessLocalTime_" + plasmaSearchProcessing.sequence[i], post.get("searchProcessLocalTime_" + plasmaSearchProcessing.sequence[i], "")); + sb.setConfig("searchProcessLocalCount_" + plasmaSearchProcessing.sequence[i], post.get("searchProcessLocalCount_" + plasmaSearchProcessing.sequence[i], "")); } prop.put("submitlocalrespond", 1); } else { @@ -85,9 +85,9 @@ public class PerformanceSearch_p { } } if (post.containsKey("submitlocalprofiledefault")) { - for (int i = 0; i < plasmaSearchTimingProfile.sequence.length; i++) { - sb.setConfig("searchProcessLocalTime_" + plasmaSearchTimingProfile.sequence[i], (String) defaultSettings.get("searchProcessLocalTime_" + plasmaSearchTimingProfile.sequence[i])); - sb.setConfig("searchProcessLocalCount_" + plasmaSearchTimingProfile.sequence[i], (String) defaultSettings.get("searchProcessLocalCount_" + plasmaSearchTimingProfile.sequence[i])); + for (int i = 0; i < plasmaSearchProcessing.sequence.length; i++) { + sb.setConfig("searchProcessLocalTime_" + plasmaSearchProcessing.sequence[i], (String) defaultSettings.get("searchProcessLocalTime_" + plasmaSearchProcessing.sequence[i])); + sb.setConfig("searchProcessLocalCount_" + plasmaSearchProcessing.sequence[i], (String) defaultSettings.get("searchProcessLocalCount_" + plasmaSearchProcessing.sequence[i])); } prop.put("submitlocalrespond", 2); } @@ -100,12 +100,12 @@ public class PerformanceSearch_p { long t; int c; char sequence; - if (se != null) 
for (int i = 0; i < plasmaSearchTimingProfile.sequence.length; i++) { - t = se.getLocalTiming().getYieldTime(plasmaSearchTimingProfile.sequence[i]); + if (se != null) for (int i = 0; i < plasmaSearchProcessing.sequence.length; i++) { + t = se.getLocalTiming().getYieldTime(plasmaSearchProcessing.sequence[i]); if (t > 0) time += t; } - for (int i = 0; i < plasmaSearchTimingProfile.sequence.length; i++) { - sequence = plasmaSearchTimingProfile.sequence[i]; + for (int i = 0; i < plasmaSearchProcessing.sequence.length; i++) { + sequence = plasmaSearchProcessing.sequence[i]; prop.put("searchProcessLocalTime_" + sequence, sb.getConfig("searchProcessLocalTime_" + sequence, "")); prop.put("searchProcessLocalCount_" + sequence, sb.getConfig("searchProcessLocalCount_" + sequence, "")); if (se == null) { diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 2b7ea961c..1f22c0f28 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -64,9 +64,10 @@ import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchPostOrder; -import de.anomic.plasma.plasmaSearchTimingProfile; +import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -158,11 +159,11 @@ public final class search { // prepare a search profile plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? 
new plasmaSearchRankingProfile(contentdom) : new plasmaSearchRankingProfile("", profile); - plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults); - plasmaSearchTimingProfile remoteTiming = null; + plasmaSearchProcessing localTiming = new plasmaSearchProcessing(squery.maximumTime, squery.wantedResults); + plasmaSearchProcessing remoteTiming = null; - theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL, sb.snippetCache, null); - Map[] containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls)); + theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, yacyCore.log, sb.wordIndex, sb.snippetCache, null); + Map[] containers = localTiming.localSearchContainers(squery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls)); if (containers != null) { Iterator ci = containers[0].entrySet().iterator(); Map.Entry entry; @@ -186,15 +187,14 @@ public final class search { // prepare a search profile plasmaSearchRankingProfile rankingProfile = (profile.length() == 0) ? 
new plasmaSearchRankingProfile(contentdom) : new plasmaSearchRankingProfile("", profile); - plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(squery.maximumTime, squery.wantedResults); - plasmaSearchTimingProfile remoteTiming = null; + plasmaSearchProcessing localTiming = new plasmaSearchProcessing(squery.maximumTime, squery.wantedResults); + plasmaSearchProcessing remoteTiming = null; theSearch = new plasmaSearchEvent(squery, rankingProfile, localTiming, remoteTiming, true, - yacyCore.log, sb.wordIndex, sb.wordIndex.loadedURL, + yacyCore.log, sb.wordIndex, sb.snippetCache, null); - Map[] containers = theSearch.localSearchContainers(plasmaSearchQuery.hashes2Set(urls)); - + Map[] containers = localTiming.localSearchContainers(squery, sb.wordIndex, plasmaSearchQuery.hashes2Set(urls)); // set statistic details of search result and find best result index set if (containers == null) { prop.putASIS("indexcount", ""); @@ -231,7 +231,16 @@ public final class search { prop.putASIS("indexcount", new String(indexcount)); // join and order the result - indexContainer localResults = theSearch.localSearchJoinExclude(containers[0].values(), containers[1].values()); + indexContainer localResults = + (containers == null) ? + plasmaWordIndex.emptyContainer(null) : + localTiming.localSearchJoinExclude( + containers[0].values(), + containers[1].values(), + (squery.queryHashes.size() == 0) ? 
+ 0 : + localTiming.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * squery.queryHashes.size() / (squery.queryHashes.size() + squery.excludeHashes.size()), + squery.maxDistance); if (localResults == null) { joincount = 0; prop.put("joincount", 0); @@ -239,7 +248,9 @@ public final class search { } else { joincount = localResults.size(); prop.putASIS("joincount", Integer.toString(joincount)); - acc = theSearch.orderFinal(localResults); + acc = localTiming.orderFinal(squery, rankingProfile, sb.wordIndex, true, localResults); + + } // generate compressed index for maxcounthash // this is not needed if the search is restricted to specific diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index b1db973ee..f6d834d8a 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -68,7 +68,7 @@ import de.anomic.plasma.plasmaSearchImages; import de.anomic.plasma.plasmaSearchPreOrder; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; -import de.anomic.plasma.plasmaSearchTimingProfile; +import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; @@ -285,8 +285,8 @@ public class yacysearch { 20, constraint); plasmaSearchRankingProfile ranking = (sb.getConfig("rankingProfile", "").length() == 0) ? 
new plasmaSearchRankingProfile(contentdomString) : new plasmaSearchRankingProfile("", crypt.simpleDecode(sb.getConfig("rankingProfile", ""), null)); - plasmaSearchTimingProfile localTiming = new plasmaSearchTimingProfile(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults); - plasmaSearchTimingProfile remoteTiming = new plasmaSearchTimingProfile(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults); + plasmaSearchProcessing localTiming = new plasmaSearchProcessing(4 * thisSearch.maximumTime / 10, thisSearch.wantedResults); + plasmaSearchProcessing remoteTiming = new plasmaSearchProcessing(6 * thisSearch.maximumTime / 10, thisSearch.wantedResults); plasmaSearchResults results = new plasmaSearchResults(); String wrongregex = null; diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index 9d7e1e841..f081e70db 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -41,17 +41,14 @@ package de.anomic.plasma; -import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; -import java.util.Set; import java.util.TreeMap; import de.anomic.index.indexContainer; import de.anomic.index.indexRWIEntry; -import de.anomic.index.indexURLEntry; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.server.logging.serverLog; @@ -73,7 +70,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { private indexContainer rcContainers; // cache for results private int rcContainerFlushCount; private Map rcAbstracts; // cache for index abstracts; word:TreeMap mapping where the embedded TreeMap is a urlhash:peerlist relation - private plasmaSearchTimingProfile profileLocal, profileGlobal; + private plasmaSearchProcessing profileLocal, profileGlobal; private boolean postsort; private yacySearch[] primarySearchThreads, 
secondarySearchThreads; private long searchtime; @@ -82,21 +79,20 @@ public final class plasmaSearchEvent extends Thread implements Runnable { public plasmaSearchEvent(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, - plasmaSearchTimingProfile localTiming, - plasmaSearchTimingProfile remoteTiming, + plasmaSearchProcessing localTiming, + plasmaSearchProcessing remoteTiming, boolean postsort, serverLog log, plasmaWordIndex wordIndex, - plasmaCrawlLURL urlStore, plasmaSnippetCache snippetCache, TreeMap preselectedPeerHashes) { this.log = log; this.wordIndex = wordIndex; this.query = query; this.ranking = ranking; - this.urlStore = urlStore; + this.urlStore = wordIndex.loadedURL; this.snippetCache = snippetCache; - this.rcContainers = wordIndex.emptyContainer(null); + this.rcContainers = plasmaWordIndex.emptyContainer(null); this.rcContainerFlushCount = 0; this.rcAbstracts = (query.queryHashes.size() > 1) ? new TreeMap() : null; // generate abstracts only for combined searches this.profileLocal = localTiming; @@ -113,7 +109,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { return query; } - public plasmaSearchTimingProfile getLocalTiming() { + public plasmaSearchProcessing getLocalTiming() { return profileLocal; } @@ -152,7 +148,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { // do a global search // the result of the fetch is then in the rcGlobal - log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); + log.logFine("STARTING " + fetchpeers + " THREADS TO CATCH EACH " + profileGlobal.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT) + " URLs WITHIN " + (profileGlobal.duetime() / 1000) + " SECONDS"); long secondaryTimeout = System.currentTimeMillis() + profileGlobal.duetime() / 3 * 2; long primaryTimeout = System.currentTimeMillis() + 
profileGlobal.duetime(); primarySearchThreads = yacySearch.primaryRemoteSearches(plasmaSearchQuery.hashSet2hashString(query.queryHashes), plasmaSearchQuery.hashSet2hashString(query.excludeHashes), "", @@ -161,7 +157,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { (query.domType == plasmaSearchQuery.SEARCHDOM_GLOBALDHT) ? null : preselectedPeerHashes); // meanwhile do a local search - Map[] searchContainerMaps = localSearchContainers(null); + Map[] searchContainerMaps = profileLocal.localSearchContainers(query, wordIndex, null); // use the search containers to fill up rcAbstracts locally /* @@ -189,7 +185,16 @@ public final class plasmaSearchEvent extends Thread implements Runnable { */ // try to pre-fetch some LURLs if there is enough time - indexContainer rcLocal = localSearchJoinExclude(searchContainerMaps[0].values(), searchContainerMaps[1].values()); + indexContainer rcLocal = + (searchContainerMaps == null) ? + plasmaWordIndex.emptyContainer(null) : + profileLocal.localSearchJoinExclude( + searchContainerMaps[0].values(), + searchContainerMaps[1].values(), + (query.queryHashes.size() == 0) ? 
+ 0 : + profileLocal.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size()), + query.maxDistance); prefetchLocal(rcLocal, secondaryTimeout); // this is temporary debugging code to learn that the index abstracts are fetched correctly @@ -214,7 +219,13 @@ public final class plasmaSearchEvent extends Thread implements Runnable { log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + primarySearchThreads.length + " PEERS: " + ((System.currentTimeMillis() - start) / 1000) + " seconds"); // combine the result and order - result = orderFinal(rcLocal); + indexContainer searchResult = plasmaWordIndex.emptyContainer(null); + searchResult.addAllUnique(rcLocal); + searchResult.addAllUnique(rcContainers); + searchResult.sort(); + searchResult.uniq(1000); + result = profileLocal.orderFinal(query, ranking, wordIndex, postsort, searchResult); + if (result != null) { result.globalContributions = globalContributions; @@ -222,9 +233,19 @@ public final class plasmaSearchEvent extends Thread implements Runnable { this.start(); // start to flush results } } else { - Map[] searchContainerMaps = localSearchContainers(null); - indexContainer rcLocal = (searchContainerMaps == null) ? wordIndex.emptyContainer(null) : localSearchJoinExclude(searchContainerMaps[0].values(), searchContainerMaps[1].values()); - result = orderFinal(rcLocal); + Map[] searchContainerMaps = profileLocal.localSearchContainers(query, wordIndex, null); + + indexContainer rcLocal = + (searchContainerMaps == null) ? + plasmaWordIndex.emptyContainer(null) : + profileLocal.localSearchJoinExclude( + searchContainerMaps[0].values(), + searchContainerMaps[1].values(), + (query.queryHashes.size() == 0) ? 
+ 0 : + profileLocal.getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size()), + query.maxDistance); + result = profileLocal.orderFinal(query, ranking, wordIndex, postsort, rcLocal); result.globalContributions = 0; } @@ -333,157 +354,6 @@ public final class plasmaSearchEvent extends Thread implements Runnable { return wordlist; } - public Map[] localSearchContainers(Set urlselection) { - // search for the set of hashes and return a map of of wordhash:indexContainer containing the seach result - - // retrieve entities that belong to the hashes - profileLocal.startTimer(); - long start = System.currentTimeMillis(); - Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers( - query.queryHashes, - urlselection, - true, - true, - profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_COLLECTION) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size())); - if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned - long remaintime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_COLLECTION) - System.currentTimeMillis() + start; - Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0) || (remaintime <= 0)) ? 
new HashMap() : wordIndex.getContainers( - query.excludeHashes, - urlselection, - true, - true, - remaintime); - profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_COLLECTION); - profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_COLLECTION, inclusionContainers.size()); - - return new Map[]{inclusionContainers, exclusionContainers}; - } - - public indexContainer localSearchJoinExclude(Collection includeContainers, Collection excludeContainers) { - // join a search result and return the joincount (number of pages after join) - - // since this is a conjunction we return an empty entity if any word is not known - if (includeContainers == null) return wordIndex.emptyContainer(null); - - // join the result - profileLocal.startTimer(); - long start = System.currentTimeMillis(); - indexContainer rcLocal = indexContainer.joinContainers(includeContainers, - (query.queryHashes.size() == 0) ? 0 : - profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size()), - query.maxDistance); - long remaining = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_JOIN) - System.currentTimeMillis() + start; - if ((rcLocal != null) && (remaining > 0)) { - indexContainer.excludeContainers(rcLocal, excludeContainers, remaining); - } - if (rcLocal == null) rcLocal = wordIndex.emptyContainer(null); - profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_JOIN); - profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_JOIN, rcLocal.size()); - - return rcLocal; - } - - public plasmaSearchPostOrder orderFinal(indexContainer rcLocal) { - // we collect the urlhashes and construct a list with urlEntry objects - // attention: if minEntries is too high, this method will not terminate within the maxTime - - assert (rcLocal != null); - - indexContainer searchResult = wordIndex.emptyContainer(null); - long preorderTime = 
profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_PRESORT); - - profileLocal.startTimer(); - long pst = System.currentTimeMillis(); - searchResult.addAllUnique(rcLocal); - searchResult.addAllUnique(rcContainers); - searchResult.sort(); - searchResult.uniq(1000); - preorderTime = preorderTime - (System.currentTimeMillis() - pst); - if (preorderTime < 0) preorderTime = 200; - plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, searchResult, preorderTime); - if (searchResult.size() > query.wantedResults) preorder.remove(true, true); - profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_PRESORT); - profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_PRESORT, rcLocal.size()); - - // start url-fetch - long postorderTime = profileLocal.getTargetTime(plasmaSearchTimingProfile.PROCESS_POSTSORT); - //System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime); - long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime); - profileLocal.startTimer(); - plasmaSearchPostOrder acc = new plasmaSearchPostOrder(query, ranking); - - indexRWIEntry entry; - indexURLEntry page; - Long preranking; - Object[] preorderEntry; - indexURLEntry.Components comp; - String pagetitle, pageurl, pageauthor; - int minEntries = profileLocal.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT); - try { - ordering: while (preorder.hasNext()) { - if ((System.currentTimeMillis() >= postorderLimitTime) && (acc.sizeFetched() >= minEntries)) break; - preorderEntry = preorder.next(); - entry = (indexRWIEntry) preorderEntry[0]; - // load only urls if there was not yet a root url of that hash - preranking = (Long) preorderEntry[1]; - // find the url entry - page = urlStore.load(entry.urlHash(), entry); - if (page != null) { - comp = page.comp(); - pagetitle = comp.title().toLowerCase(); - if (comp.url() == null) continue ordering; // rare case where the url is corrupted - pageurl = 
comp.url().toString().toLowerCase(); - pageauthor = comp.author().toLowerCase(); - - // check exclusion - if (plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) continue ordering; - if (plasmaSearchQuery.matches(pageurl, query.excludeHashes)) continue ordering; - if (plasmaSearchQuery.matches(pageauthor, query.excludeHashes)) continue ordering; - - // check url mask - if (!(pageurl.matches(query.urlMask))) continue ordering; - - // check constraints - if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && - (query.constraint.get(plasmaCondenser.flag_cat_indexof)) && - (!(comp.title().startsWith("Index of")))) { - log.logFine("filtered out " + comp.url().toString()); - // filter out bad results - Iterator wi = query.queryHashes.iterator(); - while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash()); - } else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { - if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page, preranking); - else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page, preranking); - else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page, preranking); - else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page, preranking); - } else { - acc.addPage(page, preranking); - } - } - } - } catch (kelondroException ee) { - serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); - } - profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_URLFETCH); - profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_URLFETCH, acc.sizeFetched()); - - // start postsorting - profileLocal.startTimer(); - acc.sortPages(postsort); - profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_POSTSORT); - profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_POSTSORT, 
acc.sizeOrdered()); - - // apply filter - profileLocal.startTimer(); - acc.removeRedundant(); - profileLocal.setYieldTime(plasmaSearchTimingProfile.PROCESS_FILTER); - profileLocal.setYieldCount(plasmaSearchTimingProfile.PROCESS_FILTER, acc.sizeOrdered()); - - acc.localContributions = (rcLocal == null) ? 0 : rcLocal.size(); - acc.filteredResults = preorder.filteredCount(); - return acc; - } - private void prefetchLocal(indexContainer rcLocal, long timeout) { // pre-fetch some urls to fill LURL ram cache diff --git a/source/de/anomic/plasma/plasmaSearchProcessing.java b/source/de/anomic/plasma/plasmaSearchProcessing.java new file mode 100644 index 000000000..c55111195 --- /dev/null +++ b/source/de/anomic/plasma/plasmaSearchProcessing.java @@ -0,0 +1,435 @@ +// plasmaSearchProcess.java +// (C) 2005 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany +// first published 17.10.2005 on http://yacy.net +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.plasma; + +import java.util.Collection; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; + +import de.anomic.index.indexContainer; +import de.anomic.index.indexRWIEntry; +import de.anomic.index.indexURLEntry; +import de.anomic.kelondro.kelondroException; +import de.anomic.server.logging.serverLog; + +/** + * + * This class provides search processes and keeps a timing record of the processes + * It shall be used to initiate a search and also to evaluate + * the real obtained timings after a search is performed + */ + +public class plasmaSearchProcessing implements Cloneable { + + // collection: + // time = time to get a RWI out of RAM cache, assortments and WORDS files + // count = maximum number of RWI-entries that shall be collected + + // join + // time = time to perform the join between all collected RWIs + // count = maximum number of entries that shall be joined + + // presort: + // time = time to do a sort of the joined URL-records + // count = maximum number of entries that shall be pre-sorted + + // urlfetch: + // time = time to fetch the real URLs from the LURL database + // count = maximum number of urls that shall be fetched + + // postsort: + // time = time for final sort of URLs + // count = maximum number oof URLs that shall be retrieved during sort + + // snippetfetch: + // time = time to fetch snippets for selected URLs + // count = maximum number of snipptes to be fetched + + public static final char PROCESS_COLLECTION = 'c'; + public static final char PROCESS_JOIN = 'j'; + public static final char PROCESS_PRESORT = 'r'; + public static final char PROCESS_URLFETCH = 'u'; + public static final char PROCESS_POSTSORT = 'o'; + public static final char PROCESS_FILTER = 
'f'; + public static final char PROCESS_SNIPPETFETCH = 's'; + + private static final long minimumTargetTime = 100; + + public static char[] sequence = new char[]{ + PROCESS_COLLECTION, + PROCESS_JOIN, + PROCESS_PRESORT, + PROCESS_URLFETCH, + PROCESS_POSTSORT, + PROCESS_FILTER, + PROCESS_SNIPPETFETCH + }; + + private HashMap targetTime; + private HashMap targetCount; + private HashMap yieldTime; + private HashMap yieldCount; + private long timer; + + private plasmaSearchProcessing() { + targetTime = new HashMap(); + targetCount = new HashMap(); + yieldTime = new HashMap(); + yieldCount = new HashMap(); + timer = 0; + } + + public plasmaSearchProcessing(long time, int count) { + this( + 3 * time / 12, 10 * count, + 1 * time / 12, 10 * count, + 1 * time / 12, 10 * count, + 2 * time / 12, 5 * count, + 3 * time / 12, count, + 1 * time / 12, count, + 1 * time / 12, 1 + ); + } + + public plasmaSearchProcessing( + long time_collection, int count_collection, + long time_join, int count_join, + long time_presort, int count_presort, + long time_urlfetch, int count_urlfetch, + long time_postsort, int count_postsort, + long time_filter, int count_filter, + long time_snippetfetch, int count_snippetfetch) { + this(); + + targetTime.put(new Character(PROCESS_COLLECTION), new Long(time_collection)); + targetTime.put(new Character(PROCESS_JOIN), new Long(time_join)); + targetTime.put(new Character(PROCESS_PRESORT), new Long(time_presort)); + targetTime.put(new Character(PROCESS_URLFETCH), new Long(time_urlfetch)); + targetTime.put(new Character(PROCESS_POSTSORT), new Long(time_postsort)); + targetTime.put(new Character(PROCESS_FILTER), new Long(time_filter)); + targetTime.put(new Character(PROCESS_SNIPPETFETCH), new Long(time_snippetfetch)); + targetCount.put(new Character(PROCESS_COLLECTION), new Integer(count_collection)); + targetCount.put(new Character(PROCESS_JOIN), new Integer(count_join)); + targetCount.put(new Character(PROCESS_PRESORT), new Integer(count_presort)); + 
targetCount.put(new Character(PROCESS_URLFETCH), new Integer(count_urlfetch)); + targetCount.put(new Character(PROCESS_POSTSORT), new Integer(count_postsort)); + targetCount.put(new Character(PROCESS_FILTER), new Integer(count_filter)); + targetCount.put(new Character(PROCESS_SNIPPETFETCH), new Integer(count_snippetfetch)); + + } + + public Object clone() { + plasmaSearchProcessing p = new plasmaSearchProcessing(); + p.targetTime = (HashMap) this.targetTime.clone(); + p.targetCount = (HashMap) this.targetCount.clone(); + p.yieldTime = (HashMap) this.yieldTime.clone(); + p.yieldCount = (HashMap) this.yieldCount.clone(); + return p; + } + + public plasmaSearchProcessing(String s) { + targetTime = new HashMap(); + targetCount = new HashMap(); + yieldTime = new HashMap(); + yieldCount = new HashMap(); + + intoMap(s, targetTime, targetCount); + } + + public long duetime() { + // returns the old duetime value as sum of all waiting times + long d = 0; + for (int i = 0; i < sequence.length; i++) { + d += ((Long) targetTime.get(new Character(sequence[i]))).longValue(); + } + return d; + } + + public void putYield(String s) { + intoMap(s, yieldTime, yieldCount); + } + + public String yieldToString() { + return toString(yieldTime, yieldCount); + } + + public String targetToString() { + return toString(targetTime, targetCount); + } + + public long getTargetTime(char type) { + // sum up all time that was demanded and subtract all that had been wasted + long sum = 0; + Long t; + Character element; + for (int i = 0; i < sequence.length; i++) { + element = new Character(sequence[i]); + t = (Long) targetTime.get(element); + if (t != null) sum += t.longValue(); + if (type == sequence[i]) return (sum < 0) ? 
minimumTargetTime : sum; + t = (Long) yieldTime.get(element); + if (t != null) sum -= t.longValue(); + } + return minimumTargetTime; + } + + public int getTargetCount(char type) { + Integer i = (Integer) targetCount.get(new Character(type)); + if (i == null) return -1; else return i.intValue(); + } + + public long getYieldTime(char type) { + Long l = (Long) yieldTime.get(new Character(type)); + if (l == null) return -1; else return l.longValue(); + } + + public int getYieldCount(char type) { + Integer i = (Integer) yieldCount.get(new Character(type)); + if (i == null) return -1; else return i.intValue(); + } + + public void startTimer() { + this.timer = System.currentTimeMillis(); + } + + public void setYieldTime(char type) { + // sets a time that is computed using the timer + long t = System.currentTimeMillis() - this.timer; + yieldTime.put(new Character(type), new Long(t)); + } + + public void setYieldCount(char type, int count) { + yieldCount.put(new Character(type), new Integer(count)); + } + + public String reportToString() { + return "target=" + toString(targetTime, targetCount) + "; yield=" + toString(yieldTime, yieldCount); + } + + public static String toString(HashMap time, HashMap count) { + // put this into a format in such a way that it can be send in a http header or post argument + // that means that no '=' or spaces are allowed + StringBuffer sb = new StringBuffer(sequence.length * 10); + Character element; + Integer xi; + Long xl; + for (int i = 0; i < sequence.length; i++) { + element = new Character(sequence[i]); + sb.append("t"); + sb.append(element); + xl = (Long) time.get(element); + sb.append((xl == null) ? "0" : xl.toString()); + sb.append("|"); + sb.append("c"); + sb.append(element); + xi = (Integer) count.get(element); + sb.append((xi == null) ? 
"0" : xi.toString()); + sb.append("|"); + } + return sb.toString(); + } + + public static void intoMap(String s, HashMap time, HashMap count) { + // reverse method to toString: parses the "t<type><time>|c<type><count>|" segments of s into the two maps; entries are keyed by Character because the getters and toString look them up with Character keys + int p = 0; + char ct; + Character elt; + int p1; + while ((p < s.length()) && ((p1 = s.indexOf('|', p)) > 0)) { + ct = s.charAt(p); + elt = new Character(s.charAt(p + 1)); + String v = s.substring(p + 2, p1); + p = p1 + 1; // continue behind the separator; without this the loop would never terminate + if (ct == 't') { + time.put(elt, new Long(Long.parseLong(v))); + } else { + count.put(elt, new Integer(Integer.parseInt(v))); + } + } + } + + // the processes + + public Map[] localSearchContainers( + plasmaSearchQuery query, + plasmaWordIndex wordIndex, + Set urlselection) { + // search for the set of hashes and return a map of wordhash:indexContainer containing the search result + + // retrieve entities that belong to the hashes + startTimer(); + long start = System.currentTimeMillis(); + Map inclusionContainers = (query.queryHashes.size() == 0) ? new HashMap() : wordIndex.getContainers( + query.queryHashes, + urlselection, + true, + true, + getTargetTime(plasmaSearchProcessing.PROCESS_COLLECTION) * query.queryHashes.size() / (query.queryHashes.size() + query.excludeHashes.size())); + if ((inclusionContainers.size() != 0) && (inclusionContainers.size() < query.queryHashes.size())) inclusionContainers = new HashMap(); // prevent that only a subset is returned + long remaintime = getTargetTime(plasmaSearchProcessing.PROCESS_COLLECTION) - System.currentTimeMillis() + start; + Map exclusionContainers = ((inclusionContainers == null) || (inclusionContainers.size() == 0) || (remaintime <= 0)) ?
new HashMap() : wordIndex.getContainers( + query.excludeHashes, + urlselection, + true, + true, + remaintime); + setYieldTime(plasmaSearchProcessing.PROCESS_COLLECTION); + setYieldCount(plasmaSearchProcessing.PROCESS_COLLECTION, inclusionContainers.size()); + + return new Map[]{inclusionContainers, exclusionContainers}; + } + + public indexContainer localSearchJoinExclude( + Collection includeContainers, + Collection excludeContainers, + long time, int maxDistance) { + // join a search result and return the joincount (number of pages after join) + + // since this is a conjunction we return an empty entity if any word is not known + if (includeContainers == null) return plasmaWordIndex.emptyContainer(null); + + // join the result + startTimer(); + long start = System.currentTimeMillis(); + indexContainer rcLocal = indexContainer.joinContainers(includeContainers, time, maxDistance); + long remaining = getTargetTime(plasmaSearchProcessing.PROCESS_JOIN) - System.currentTimeMillis() + start; + if ((rcLocal != null) && (remaining > 0)) { + indexContainer.excludeContainers(rcLocal, excludeContainers, remaining); + } + if (rcLocal == null) rcLocal = plasmaWordIndex.emptyContainer(null); + setYieldTime(plasmaSearchProcessing.PROCESS_JOIN); + setYieldCount(plasmaSearchProcessing.PROCESS_JOIN, rcLocal.size()); + + return rcLocal; + } + + public plasmaSearchPostOrder orderFinal( + plasmaSearchQuery query, + plasmaSearchRankingProfile ranking, + plasmaWordIndex wordIndex, + boolean postsort, + indexContainer resultIndex) { + // we collect the urlhashes and construct a list with urlEntry objects + // attention: if minEntries is too high, this method will not terminate within the maxTime + + assert (resultIndex != null); + + long preorderTime = getTargetTime(plasmaSearchProcessing.PROCESS_PRESORT); + + startTimer(); + long pst = System.currentTimeMillis(); + resultIndex.sort(); + resultIndex.uniq(1000); + preorderTime = preorderTime - (System.currentTimeMillis() - pst); + if 
(preorderTime < 0) preorderTime = 200; + plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, resultIndex, preorderTime); + if (resultIndex.size() > query.wantedResults) preorder.remove(true, true); + setYieldTime(plasmaSearchProcessing.PROCESS_PRESORT); + setYieldCount(plasmaSearchProcessing.PROCESS_PRESORT, resultIndex.size()); + + // start url-fetch + long postorderTime = getTargetTime(plasmaSearchProcessing.PROCESS_POSTSORT); + //System.out.println("DEBUG: postorder-final (urlfetch) maxtime = " + postorderTime); + long postorderLimitTime = (postorderTime < 0) ? Long.MAX_VALUE : (System.currentTimeMillis() + postorderTime); + startTimer(); + plasmaSearchPostOrder acc = new plasmaSearchPostOrder(query, ranking); + + indexRWIEntry entry; + indexURLEntry page; + Long preranking; + Object[] preorderEntry; + indexURLEntry.Components comp; + String pagetitle, pageurl, pageauthor; + int minEntries = getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT); + try { + ordering: while (preorder.hasNext()) { + if ((System.currentTimeMillis() >= postorderLimitTime) || (acc.sizeFetched() >= minEntries)) break; + preorderEntry = preorder.next(); + entry = (indexRWIEntry) preorderEntry[0]; + // load only urls if there was not yet a root url of that hash + preranking = (Long) preorderEntry[1]; + // find the url entry + page = wordIndex.loadedURL.load(entry.urlHash(), entry); + if (page != null) { + comp = page.comp(); + pagetitle = comp.title().toLowerCase(); + if (comp.url() == null) continue ordering; // rare case where the url is corrupted + pageurl = comp.url().toString().toLowerCase(); + pageauthor = comp.author().toLowerCase(); + + // check exclusion + if (plasmaSearchQuery.matches(pagetitle, query.excludeHashes)) continue ordering; + if (plasmaSearchQuery.matches(pageurl, query.excludeHashes)) continue ordering; + if (plasmaSearchQuery.matches(pageauthor, query.excludeHashes)) continue ordering; + + // check url mask + if 
(!(pageurl.matches(query.urlMask))) continue ordering; + + // check constraints + if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && + (query.constraint.get(plasmaCondenser.flag_cat_indexof)) && + (!(comp.title().startsWith("Index of")))) { + serverLog.logFine("PLASMA", "filtered out " + comp.url().toString()); + // filter out bad results + Iterator wi = query.queryHashes.iterator(); + while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash()); + } else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) { + if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addPage(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addPage(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addPage(page, preranking); + else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addPage(page, preranking); + } else { + acc.addPage(page, preranking); + } + } + } + } catch (kelondroException ee) { + serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); + } + setYieldTime(plasmaSearchProcessing.PROCESS_URLFETCH); + setYieldCount(plasmaSearchProcessing.PROCESS_URLFETCH, acc.sizeFetched()); + + // start postsorting + startTimer(); + acc.sortPages(postsort); + setYieldTime(plasmaSearchProcessing.PROCESS_POSTSORT); + setYieldCount(plasmaSearchProcessing.PROCESS_POSTSORT, acc.sizeOrdered()); + + // apply filter + startTimer(); + acc.removeRedundant(); + setYieldTime(plasmaSearchProcessing.PROCESS_FILTER); + setYieldCount(plasmaSearchProcessing.PROCESS_FILTER, acc.sizeOrdered()); + + acc.localContributions = (resultIndex == null) ? 
0 : resultIndex.size(); + acc.filteredResults = preorder.filteredCount(); + return acc; + } + +} diff --git a/source/de/anomic/plasma/plasmaSearchTimingProfile.java b/source/de/anomic/plasma/plasmaSearchTimingProfile.java deleted file mode 100644 index f347b9f39..000000000 --- a/source/de/anomic/plasma/plasmaSearchTimingProfile.java +++ /dev/null @@ -1,282 +0,0 @@ -// plasmaSearchProfile.java -// ----------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// Created: 17.10.2005 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. 
The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -package de.anomic.plasma; - -import java.util.HashMap; - -/** - * - * This class provides timing properties for search processes - * It shall be used to initiate a search and also to evaluate - * the real obtained timings after a search is performed - */ - -public class plasmaSearchTimingProfile implements Cloneable { - - // collection: - // time = time to get a RWI out of RAM cache, assortments and WORDS files - // count = maximum number of RWI-entries that shall be collected - - // join - // time = time to perform the join between all collected RWIs - // count = maximum number of entries that shall be joined - - // presort: - // time = time to do a sort of the joined URL-records - // count = maximum number of entries that shall be pre-sorted - - // urlfetch: - // time = time to fetch the real URLs from the LURL database - // count = maximum number of urls that shall be fetched - - // postsort: - // time = time for final sort of URLs - // count = maximum number oof URLs that shall be retrieved during sort - - // snippetfetch: - // time = time to fetch snippets for selected URLs - // count = maximum 
number of snipptes to be fetched - - public static final char PROCESS_COLLECTION = 'c'; - public static final char PROCESS_JOIN = 'j'; - public static final char PROCESS_PRESORT = 'r'; - public static final char PROCESS_URLFETCH = 'u'; - public static final char PROCESS_POSTSORT = 'o'; - public static final char PROCESS_FILTER = 'f'; - public static final char PROCESS_SNIPPETFETCH = 's'; - - private static final long minimumTargetTime = 100; - - public static char[] sequence = new char[]{ - PROCESS_COLLECTION, - PROCESS_JOIN, - PROCESS_PRESORT, - PROCESS_URLFETCH, - PROCESS_POSTSORT, - PROCESS_FILTER, - PROCESS_SNIPPETFETCH - }; - - private HashMap targetTime; - private HashMap targetCount; - private HashMap yieldTime; - private HashMap yieldCount; - private long timer; - - private plasmaSearchTimingProfile() { - targetTime = new HashMap(); - targetCount = new HashMap(); - yieldTime = new HashMap(); - yieldCount = new HashMap(); - timer = 0; - } - - public plasmaSearchTimingProfile(long time, int count) { - this( - 3 * time / 12, 10 * count, - 1 * time / 12, 10 * count, - 1 * time / 12, 10 * count, - 2 * time / 12, 5 * count, - 3 * time / 12, count, - 1 * time / 12, count, - 1 * time / 12, 1 - ); - } - - public plasmaSearchTimingProfile( - long time_collection, int count_collection, - long time_join, int count_join, - long time_presort, int count_presort, - long time_urlfetch, int count_urlfetch, - long time_postsort, int count_postsort, - long time_filter, int count_filter, - long time_snippetfetch, int count_snippetfetch) { - this(); - - targetTime.put(new Character(PROCESS_COLLECTION), new Long(time_collection)); - targetTime.put(new Character(PROCESS_JOIN), new Long(time_join)); - targetTime.put(new Character(PROCESS_PRESORT), new Long(time_presort)); - targetTime.put(new Character(PROCESS_URLFETCH), new Long(time_urlfetch)); - targetTime.put(new Character(PROCESS_POSTSORT), new Long(time_postsort)); - targetTime.put(new Character(PROCESS_FILTER), new 
Long(time_filter)); - targetTime.put(new Character(PROCESS_SNIPPETFETCH), new Long(time_snippetfetch)); - targetCount.put(new Character(PROCESS_COLLECTION), new Integer(count_collection)); - targetCount.put(new Character(PROCESS_JOIN), new Integer(count_join)); - targetCount.put(new Character(PROCESS_PRESORT), new Integer(count_presort)); - targetCount.put(new Character(PROCESS_URLFETCH), new Integer(count_urlfetch)); - targetCount.put(new Character(PROCESS_POSTSORT), new Integer(count_postsort)); - targetCount.put(new Character(PROCESS_FILTER), new Integer(count_filter)); - targetCount.put(new Character(PROCESS_SNIPPETFETCH), new Integer(count_snippetfetch)); - - } - - public Object clone() { - plasmaSearchTimingProfile p = new plasmaSearchTimingProfile(); - p.targetTime = (HashMap) this.targetTime.clone(); - p.targetCount = (HashMap) this.targetCount.clone(); - p.yieldTime = (HashMap) this.yieldTime.clone(); - p.yieldCount = (HashMap) this.yieldCount.clone(); - return p; - } - - public plasmaSearchTimingProfile(String s) { - targetTime = new HashMap(); - targetCount = new HashMap(); - yieldTime = new HashMap(); - yieldCount = new HashMap(); - - intoMap(s, targetTime, targetCount); - } - - public long duetime() { - // returns the old duetime value as sum of all waiting times - long d = 0; - for (int i = 0; i < sequence.length; i++) { - d += ((Long) targetTime.get(new Character(sequence[i]))).longValue(); - } - return d; - } - - public void putYield(String s) { - intoMap(s, yieldTime, yieldCount); - } - - public String yieldToString() { - return toString(yieldTime, yieldCount); - } - - public String targetToString() { - return toString(targetTime, targetCount); - } - - public long getTargetTime(char type) { - // sum up all time that was demanded and subtract all that had been wasted - long sum = 0; - Long t; - Character element; - for (int i = 0; i < sequence.length; i++) { - element = new Character(sequence[i]); - t = (Long) targetTime.get(element); - if (t != 
null) sum += t.longValue(); - if (type == sequence[i]) return (sum < 0) ? minimumTargetTime : sum; - t = (Long) yieldTime.get(element); - if (t != null) sum -= t.longValue(); - } - return minimumTargetTime; - } - - public int getTargetCount(char type) { - Integer i = (Integer) targetCount.get(new Character(type)); - if (i == null) return -1; else return i.intValue(); - } - - public long getYieldTime(char type) { - Long l = (Long) yieldTime.get(new Character(type)); - if (l == null) return -1; else return l.longValue(); - } - - public int getYieldCount(char type) { - Integer i = (Integer) yieldCount.get(new Character(type)); - if (i == null) return -1; else return i.intValue(); - } - - public void startTimer() { - this.timer = System.currentTimeMillis(); - } - - public void setYieldTime(char type) { - // sets a time that is computed using the timer - long t = System.currentTimeMillis() - this.timer; - yieldTime.put(new Character(type), new Long(t)); - } - - public void setYieldCount(char type, int count) { - yieldCount.put(new Character(type), new Integer(count)); - } - - public String reportToString() { - return "target=" + toString(targetTime, targetCount) + "; yield=" + toString(yieldTime, yieldCount); - } - - public static String toString(HashMap time, HashMap count) { - // put this into a format in such a way that it can be send in a http header or post argument - // that means that no '=' or spaces are allowed - StringBuffer sb = new StringBuffer(sequence.length * 10); - Character element; - Integer xi; - Long xl; - for (int i = 0; i < sequence.length; i++) { - element = new Character(sequence[i]); - sb.append("t"); - sb.append(element); - xl = (Long) time.get(element); - sb.append((xl == null) ? "0" : xl.toString()); - sb.append("|"); - sb.append("c"); - sb.append(element); - xi = (Integer) count.get(element); - sb.append((xi == null) ? 
"0" : xi.toString()); - sb.append("|"); - } - return sb.toString(); - } - - public static void intoMap(String s, HashMap time, HashMap count) { - // this is the reverse method to toString - int p = 0; - char ct; - String elt; - String v; - int p1; - while ((p < s.length()) && ((p1 = s.indexOf('|', p)) > 0)) { - ct = s.charAt(p); - elt = s.substring(p + 1, p + 2); - v = s.substring(p + 2, p1); - if (ct == 't') { - time.put(elt, new Long(Long.parseLong(v))); - } else { - count.put(elt, new Integer(Integer.parseInt(v))); - } - } - } - -} diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 7d817aa08..ee4754653 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2628,7 +2628,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser ioLinks[1].intValue(), condenser.RESULT_FLAGS ); - indexContainer wordIdxContainer = wordIndex.emptyContainer(wordHash); + indexContainer wordIdxContainer = plasmaWordIndex.emptyContainer(wordHash); wordIdxContainer.add(wordIdxEntry); tmpContainers.add(wordIdxContainer); } @@ -2894,8 +2894,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public plasmaSearchResults searchFromLocal(plasmaSearchQuery query, plasmaSearchRankingProfile ranking, - plasmaSearchTimingProfile localTiming, - plasmaSearchTimingProfile remoteTiming, + plasmaSearchProcessing localTiming, + plasmaSearchProcessing remoteTiming, boolean postsort, String client) { @@ -2924,7 +2924,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser //} // create a new search event - plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, wordIndex.loadedURL, snippetCache, (isRobinsonMode()) ? 
this.clusterhashes : null); + plasmaSearchEvent theSearch = new plasmaSearchEvent(query, ranking, localTiming, remoteTiming, postsort, log, wordIndex, snippetCache, (isRobinsonMode()) ? this.clusterhashes : null); plasmaSearchPostOrder acc = theSearch.search(); // fetch snippets diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index d52b4ec7a..40fa7e42d 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -162,7 +162,7 @@ public final class plasmaWordIndex implements indexRI { return entries.updated(); } - public indexContainer emptyContainer(String wordHash) { + public static indexContainer emptyContainer(String wordHash) { return new indexContainer(wordHash, indexRWIEntry.urlEntryRow); } diff --git a/source/de/anomic/xml/crawlHandler.java b/source/de/anomic/xml/crawlHandler.java index fd6aec6cb..8a4666ede 100644 --- a/source/de/anomic/xml/crawlHandler.java +++ b/source/de/anomic/xml/crawlHandler.java @@ -34,7 +34,6 @@ import java.util.HashSet; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; public class crawlHandler extends DefaultHandler { @@ -110,7 +109,7 @@ public class crawlHandler extends DefaultHandler { } } - public void startElement(String uri, String name, String tag, Attributes atts) throws SAXException { + public void startElement(String uri, String name, String tag, Attributes atts) { if ("channel".equals(tag)) { channel = new Startpoint(); parsingAttributes = true; diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 2f970cdfc..d520ef903 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -63,7 +63,7 @@ import de.anomic.net.URL; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCrawlLURL; import 
de.anomic.plasma.plasmaSearchRankingProfile; -import de.anomic.plasma.plasmaSearchTimingProfile; +import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; @@ -350,7 +350,7 @@ public final class yacyClient { Map abstractCache, plasmaURLPattern blacklist, plasmaSnippetCache snippets, - plasmaSearchTimingProfile timingProfile, + plasmaSearchProcessing timingProfile, plasmaSearchRankingProfile rankingProfile, kelondroBitfield constraint ) { @@ -375,7 +375,7 @@ public final class yacyClient { final String salt = yacyNetwork.enrichRequestPost(post, plasmaSwitchboard.getSwitchboard(), target.hash); long duetime = timingProfile.duetime(); post.putASIS("myseed", yacyCore.seedDB.mySeed.genSeedStr(salt)); - post.put("count", timingProfile.getTargetCount(plasmaSearchTimingProfile.PROCESS_POSTSORT)); + post.put("count", timingProfile.getTargetCount(plasmaSearchProcessing.PROCESS_POSTSORT)); post.putASIS("resource", ((global) ? 
"global" : "local")); post.put("partitions", partitions); post.putASIS("query", wordhashes); @@ -452,7 +452,7 @@ public final class yacyClient { final int words = wordhashes.length() / yacySeedDB.commonHashLength; indexContainer[] container = new indexContainer[words]; for (int i = 0; i < words; i++) { - container[i] = wordIndex.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength)); + container[i] = plasmaWordIndex.emptyContainer(wordhashes.substring(i * yacySeedDB.commonHashLength, (i + 1) * yacySeedDB.commonHashLength)); } // insert results to containers diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 0f4ba60c6..188e47fc6 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -57,7 +57,7 @@ import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchRankingProfile; -import de.anomic.plasma.plasmaSearchTimingProfile; +import de.anomic.plasma.plasmaSearchProcessing; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.urlPattern.plasmaURLPattern; @@ -77,7 +77,7 @@ public class yacySearch extends Thread { final private yacySeed targetPeer; private String[] urls; private int maxDistance; - final private plasmaSearchTimingProfile timingProfile; + final private plasmaSearchProcessing timingProfile; final private plasmaSearchRankingProfile rankingProfile; final private String prefer, filter; final private kelondroBitfield constraint; @@ -86,7 +86,7 @@ public class yacySearch extends Thread { boolean global, int partitions, yacySeed targetPeer, plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, indexContainer containerCache, Map abstractCache, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, - plasmaSearchTimingProfile timingProfile, 
plasmaSearchRankingProfile rankingProfile, + plasmaSearchProcessing timingProfile, plasmaSearchRankingProfile rankingProfile, kelondroBitfield constraint) { super("yacySearch_" + targetPeer.getName()); //System.out.println("DEBUG - yacySearch thread " + this.getName() + " initialized " + ((urlhashes.length() == 0) ? "(primary)" : "(secondary)")); @@ -106,7 +106,7 @@ public class yacySearch extends Thread { this.targetPeer = targetPeer; this.urls = null; this.maxDistance = maxDistance; - this.timingProfile = (plasmaSearchTimingProfile) timingProfile.clone(); + this.timingProfile = (plasmaSearchProcessing) timingProfile.clone(); this.rankingProfile = rankingProfile; this.constraint = constraint; } @@ -138,7 +138,7 @@ public class yacySearch extends Thread { return this.urls.length; } - public plasmaSearchTimingProfile timingProfile() { + public plasmaSearchProcessing timingProfile() { return this.timingProfile; } @@ -253,7 +253,7 @@ public class yacySearch extends Thread { plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, indexContainer containerCache, Map abstractCache, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, - plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile, + plasmaSearchProcessing timingProfile, plasmaSearchRankingProfile rankingProfile, kelondroBitfield constraint, TreeMap clusterselection) { // check own peer status if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getPublicAddress() == null) { return null; } @@ -277,7 +277,7 @@ public class yacySearch extends Thread { plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, indexContainer containerCache, String targethash, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, - plasmaSearchTimingProfile timingProfile, plasmaSearchRankingProfile rankingProfile, + plasmaSearchProcessing timingProfile, plasmaSearchRankingProfile rankingProfile, kelondroBitfield constraint, TreeMap clusterselection) { // check own peer status 
if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getPublicAddress() == null) { return null; }