From bc56a88cc881066e3263b13bc881d4f0a2b5c1d1 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 13 Oct 2005 00:05:30 +0000 Subject: [PATCH] further refactoring of search git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@925 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/htdocsdefault/dir.java | 2 +- source/de/anomic/plasma/plasmaSearch.java | 131 ------------------ .../de/anomic/plasma/plasmaSwitchboard.java | 10 +- source/de/anomic/plasma/plasmaWordIndex.java | 45 ++++++ source/de/anomic/yacy/yacyClient.java | 8 +- source/de/anomic/yacy/yacySearch.java | 14 +- 6 files changed, 61 insertions(+), 149 deletions(-) delete mode 100644 source/de/anomic/plasma/plasmaSearch.java diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index c862d7949..e64261bc8 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -466,7 +466,7 @@ public class dir { ); final String urlHash = newEntry.hash(); - final int words = switchboard.searchManager.addPageIndex(url, urlHash, new Date(), condenser, "**", plasmaWordIndexEntry.DT_SHARE); + final int words = switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), condenser, "**", plasmaWordIndexEntry.DT_SHARE); } catch (IOException e) {} } diff --git a/source/de/anomic/plasma/plasmaSearch.java b/source/de/anomic/plasma/plasmaSearch.java deleted file mode 100644 index a7d753f92..000000000 --- a/source/de/anomic/plasma/plasmaSearch.java +++ /dev/null @@ -1,131 +0,0 @@ -// plasmaSearch.java -// ----------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// last major change: 11.06.2004 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - - -package de.anomic.plasma; - -import java.io.IOException; -import java.net.URL; -import java.util.ArrayList; -import java.util.Date; -import java.util.Enumeration; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Set; -import java.util.TreeMap; - -import de.anomic.kelondro.kelondroException; -import de.anomic.kelondro.kelondroMScoreCluster; -import de.anomic.server.serverCodings; -import de.anomic.server.logging.serverLog; - -public final class plasmaSearch { - - private final plasmaCrawlLURL urlStore; - private final plasmaWordIndex wordIndex; - - public plasmaSearch(plasmaCrawlLURL urlStore, plasmaWordIndex wordIndex) { - this.urlStore = urlStore; - this.wordIndex = wordIndex; - } - - public static int calcVirtualAge(Date modified) { - // this calculates a virtual age from a given date - // the purpose is to have an age in days of a given modified date - // from a fixed standpoint in the past - //if (modified == null) return 0; - // this is milliseconds. we need days - // one day has 60*60*24 seconds = 86400 seconds - // we take mod 64**3 = 262144, this is the mask of the storage - return (int) ((modified.getTime() / 86400000) % 262144); - } - - public void addWords(plasmaWordIndexEntryContainer container) { - wordIndex.addEntries(container, true); - } - - public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, - String language, char doctype) { - // this is called by the switchboard to put in a new page into the index - // use all the words in one condenser object to simultanous create index entries - int age = calcVirtualAge(urlModified); - int quality = 0; - try { - quality = Integer.parseInt(condenser.getAnalysis().getProperty("INFORMATION_VALUE","0"), 16); - } catch (NumberFormatException e) { - System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString()); - } - - // iterate over all words - Iterator i = condenser.getWords().iterator(); - String word; - int count; - plasmaWordIndexEntry entry; - String wordHash; - int p = 0; - while (i.hasNext()) { - word = (String) i.next(); - count = condenser.wordCount(word); - //if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ": " + c); - wordHash = plasmaWordIndexEntry.word2hash(word); - entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0, - age, quality, language, doctype, true); - this.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false); - } - //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries"); - return condenser.getWords().size(); - } - /* - public plasmaWordIndexEntity searchWords(Set words, long time) throws IOException { - - } - */ - /* - public plasmaWordIndexEntity searchHashes(Set hashes, long time) throws IOException { - - } - */ - /* - public plasmaSearchResult order(plasmaWordIndexEntity searchResult, Set searchhashes, Set stopwords, char[] priority, long maxTime, int minEntries) throws IOException { - - } - */ -} diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index d29c7824b..763407a9a 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -161,7 +161,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public File listsPath; public plasmaURLPool urlPool; public plasmaWordIndex wordIndex; - public plasmaSearch searchManager; public plasmaHTCache cacheManager; public plasmaSnippetCache snippetCache; public plasmaCrawlLoader cacheLoader; @@ -309,7 +308,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser int wordCacheMaxLow = (int) getConfigLong("wordCacheMaxLow", 8000); int wordCacheMaxHigh = (int) getConfigLong("wordCacheMaxHigh", 10000); wordIndex.setMaxWords(wordCacheMaxLow, wordCacheMaxHigh); - searchManager = new plasmaSearch(urlPool.loadedURL, wordIndex); // start a cache manager log.logConfig("Starting HT Cache Manager"); @@ -1175,14 +1173,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (((storagePeerHash = getConfig("storagePeerHash",null))== null) || (storagePeerHash.trim().length() == 0) || ((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){ - words = searchManager.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); + words = wordIndex.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); } else { HashMap urlCache = new HashMap(1); urlCache.put(newEntry.hash(),newEntry); ArrayList tmpEntities = new ArrayList(condenser.getWords().size()); - int age = plasmaSearch.calcVirtualAge(lastModified); + int age = plasmaWordIndex.calcVirtualAge(lastModified); int quality = 0; try { quality = Integer.parseInt(condenser.getAnalysis().getProperty("INFORMATION_VALUE","0"), 16); @@ -1214,7 +1212,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String error = yacyClient.transferIndex(seed,(plasmaWordIndexEntity[])tmpEntities.toArray(new plasmaWordIndexEntity[tmpEntities.size()]),urlCache,true,120000); if (error != null) { - words = searchManager.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); + words = wordIndex.addPageIndex(entry.url(), urlHash, loadDate, condenser, plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType())); } // cleanup @@ -1468,7 +1466,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser long fetchtime = query.maximumTime * 6 / 10; // time to waste if (fetchpeers < 10) fetchpeers = 10; if (fetchcount > query.wantedResults * 10) fetchcount = query.wantedResults * 10; - globalresults = yacySearch.searchHashes(query.queryHashes, urlPool.loadedURL, searchManager, fetchcount, fetchpeers, urlBlacklist, snippetCache, fetchtime); + globalresults = yacySearch.searchHashes(query.queryHashes, urlPool.loadedURL, wordIndex, fetchcount, fetchpeers, urlBlacklist, snippetCache, fetchtime); log.logFine("SEARCH TIME AFTER GLOBAL-TRIGGER TO " + fetchpeers + " PEERS: " + ((System.currentTimeMillis() - timestamp) / 1000) + " seconds"); } prop.put("globalresults", globalresults); // the result are written to the local DB diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 8ec91c388..82adbdc26 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -55,6 +55,8 @@ import java.util.Iterator; import java.util.TreeSet; import java.util.HashSet; import java.util.Set; +import java.util.Date; +import java.net.URL; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.server.logging.serverLog; @@ -103,6 +105,49 @@ public final class plasmaWordIndex { return ramCache.addEntries(entries, System.currentTimeMillis(), highPriority); } + public static int calcVirtualAge(Date modified) { + // this calculates a virtual age from a given date + // the purpose is to have an age in days of a given modified date + // from a fixed standpoint in the past + //if (modified == null) return 0; + // this is milliseconds. we need days + // one day has 60*60*24 seconds = 86400 seconds + // we take mod 64**3 = 262144, this is the mask of the storage + return (int) ((modified.getTime() / 86400000) % 262144); + } + + public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, + String language, char doctype) { + // this is called by the switchboard to put in a new page into the index + // use all the words in one condenser object to simultanous create index entries + int age = calcVirtualAge(urlModified); + int quality = 0; + try { + quality = Integer.parseInt(condenser.getAnalysis().getProperty("INFORMATION_VALUE","0"), 16); + } catch (NumberFormatException e) { + System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString()); + } + + // iterate over all words + Iterator i = condenser.getWords().iterator(); + String word; + int count; + plasmaWordIndexEntry entry; + String wordHash; + int p = 0; + while (i.hasNext()) { + word = (String) i.next(); + count = condenser.wordCount(word); + //if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ": " + c); + wordHash = plasmaWordIndexEntry.word2hash(word); + entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0, + age, quality, language, doctype, true); + addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false); + } + //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries"); + return condenser.getWords().size(); + } + public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty) { return ramCache.getIndex(wordHash, deleteIfEmpty); } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index ad51525bf..54b71a07e 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -51,13 +51,13 @@ import java.util.Enumeration; import java.util.HashMap; import de.anomic.http.httpc; import de.anomic.plasma.plasmaCrawlLURL; -import de.anomic.plasma.plasmaSearch; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.plasma.plasmaURLPattern; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.tools.crypt; @@ -272,7 +272,7 @@ public final class yacyClient { public static int search(String wordhashes, int count, boolean global, yacySeed targetPeer, plasmaCrawlLURL urlManager, - plasmaSearch searchManager, plasmaURLPattern blacklist, + plasmaWordIndex wordIndex, plasmaURLPattern blacklist, plasmaSnippetCache snippets, long duetime) { // send a search request to peer with remote Hash @@ -359,7 +359,7 @@ public final class yacyClient { link = urlManager.addEntry(lEntry, yacyCore.seedDB.mySeed.hash, targetPeer.hash, 2); // save the url entry final plasmaWordIndexEntry entry = new plasmaWordIndexEntry(link.hash(), link.wordCount(), 0, 0, 0, - plasmaSearch.calcVirtualAge(link.moddate()), link.quality(), + plasmaWordIndex.calcVirtualAge(link.moddate()), link.quality(), link.language(), link.doctype(), false); if (link.snippet() != null) { // we don't store the snippets along the url entry, because they are search-specific. @@ -374,7 +374,7 @@ public final class yacyClient { } // finally insert the containers to the index - for (int m = 0; m < words; m++) { searchManager.addWords(container[m]); } + for (int m = 0; m < words; m++) { wordIndex.addEntries(container[m], true); } // generate statistics long searchtime; diff --git a/source/de/anomic/yacy/yacySearch.java b/source/de/anomic/yacy/yacySearch.java index 293405352..64ea54baf 100644 --- a/source/de/anomic/yacy/yacySearch.java +++ b/source/de/anomic/yacy/yacySearch.java @@ -51,7 +51,7 @@ import java.util.HashMap; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaURLPattern; -import de.anomic.plasma.plasmaSearch; +import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.server.logging.serverLog; @@ -61,7 +61,7 @@ public class yacySearch extends Thread { final private int count; final private boolean global; final private plasmaCrawlLURL urlManager; - final private plasmaSearch searchManager; + final private plasmaWordIndex wordIndex; final private plasmaURLPattern blacklist; final private plasmaSnippetCache snippetCache; final private yacySeed targetPeer; @@ -69,13 +69,13 @@ public class yacySearch extends Thread { final private long duetime; public yacySearch(Set wordhashes, int count, boolean global, yacySeed targetPeer, - plasmaCrawlLURL urlManager, plasmaSearch searchManager, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, long duetime) { + plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, long duetime) { super("yacySearch_" + targetPeer.getName()); this.wordhashes = wordhashes; this.count = count; this.global = global; this.urlManager = urlManager; - this.searchManager = searchManager; + this.wordIndex = wordIndex; this.blacklist = blacklist; this.snippetCache = snippetCache; this.targetPeer = targetPeer; @@ -84,7 +84,7 @@ public class yacySearch extends Thread { } public void run() { - this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, searchManager, blacklist, snippetCache, duetime); + this.links = yacyClient.search(set2string(wordhashes), count, global, targetPeer, urlManager, wordIndex, blacklist, snippetCache, duetime); if (links != 0) { //yacyCore.log.logInfo("REMOTE SEARCH - remote peer " + targetPeer.hash + ":" + targetPeer.getName() + " contributed " + links + " links for word hash " + wordhashes); yacyCore.seedDB.mySeed.incRI(links); @@ -165,7 +165,7 @@ public class yacySearch extends Thread { return result; } - public static int searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaSearch searchManager, + public static int searchHashes(Set wordhashes, plasmaCrawlLURL urlManager, plasmaWordIndex wordIndex, int count, int targets, plasmaURLPattern blacklist, plasmaSnippetCache snippetCache, long waitingtime) { // check own peer status if (yacyCore.seedDB.mySeed == null || yacyCore.seedDB.mySeed.getAddress() == null) { return 0; } @@ -186,7 +186,7 @@ public class yacySearch extends Thread { yacySearch[] searchThreads = new yacySearch[targets]; for (int i = 0; i < targets; i++) { searchThreads[i]= new yacySearch(wordhashes, count, true, targetPeers[i], - urlManager, searchManager, blacklist, snippetCache, duetime); + urlManager, wordIndex, blacklist, snippetCache, duetime); searchThreads[i].start(); try {Thread.currentThread().sleep(20);} catch (InterruptedException e) {} if ((System.currentTimeMillis() - start) > waitingtime) {