diff --git a/doc/Download.html b/doc/Download.html
index 40ce65816..016d34cfa 100644
--- a/doc/Download.html
+++ b/doc/Download.html
@@ -36,7 +36,8 @@ You can download the Java
Latest Release:
The latest YaCy-release is 0.37.
-Download YaCy 0.37 here.
+Download the generic release of YaCy 0.37 (for all platforms with J2SE 1.4: Linux, Mac OS X, Windows, Solaris) here.
+If you want to install YaCy on Windows, you can use the convenient Windows installer version of YaCy 0.37.
diff --git a/doc/Material.html b/doc/Material.html
new file mode 100644
index 000000000..fb1703202
--- /dev/null
+++ b/doc/Material.html
@@ -0,0 +1,41 @@
+
+
+
+Press Material, Publications, Presentations
+
+Here you can find links to documents that have been published about YaCy by the YaCy authors
+
+Deutsche Dokumentation / German-only documents
+
+
+
+
+
+
+
+
+
diff --git a/doc/navigation.js b/doc/navigation.js
index 8d35015ef..6af85eaf4 100644
--- a/doc/navigation.js
+++ b/doc/navigation.js
@@ -1,7 +1,7 @@
 var appname = "YACY: a Java Freeware P2P-Based Search Engine with Caching HTTP Proxy";
 var thismenu = new Array(
     "index","FAQ","Details","Technology","Platforms","News","Demo","License","Download",
-    "Installation","Volunteers","Deutsches Forum@http://www.yacy-forum.de","English Forum@http://sourceforge.net/forum/?group_id=116142","Links","Contact","","Impressum");
+    "Installation","Volunteers","Deutsches Forum@http://www.yacy-forum.de","English Forum@http://sourceforge.net/forum/?group_id=116142","Material","Links","Contact","","Impressum");
 var mainmenu = new Array(
     "YACY Home@http://www.yacy.net/index.html",
     "Products@http://www.yacy.net/Products/index.html",
diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 8c178305a..c03c06f86 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -76,7 +76,7 @@ public class IndexControl_p {
         prop.put("urlstring", "");
         prop.put("urlhash", "");
         prop.put("result", "");
-        prop.put("wcount", "" + switchboard.wordIndex.sizeMin());
+        prop.put("wcount", "" + switchboard.wordIndex.size());
         prop.put("ucount", "" + switchboard.loadedURL.size());
         prop.put("otherHosts", "");
         prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
@@ -116,25 +116,23 @@ public class IndexControl_p {
         }
 
         if (post.containsKey("keyhashdeleteall")) {
-            try {
-                if ((delurl) || (delurlref)) {
-                    // generate an urlx array
-                    try {
-                        HashSet keyhashes = new HashSet();
-                        keyhashes.add(keyhash);
-                        plasmaWordIndexEntity index = switchboard.searchManager.searchHashes(keyhashes, 10000);
-                        Enumeration en = index.elements(true);
-                        int i = 0;
-                        urlx = new String[index.size()];
-                        while (en.hasMoreElements()) urlx[i++] = ((plasmaWordIndexEntry) en.nextElement()).getUrlHash();
-                    } catch (IOException e) {
-                        urlx = new String[0];
-                    }
+            if ((delurl) || (delurlref)) {
+                // generate an urlx array
+                try {
+                    HashSet keyhashes = new HashSet();
+                    keyhashes.add(keyhash);
+                    plasmaWordIndexEntity index = switchboard.searchManager.searchHashes(keyhashes, 10000);
+                    Enumeration en = index.elements(true);
+                    int i = 0;
+                    urlx = new String[index.size()];
+                    while (en.hasMoreElements()) urlx[i++] = ((plasmaWordIndexEntry) en.nextElement()).getUrlHash();
+                } catch (IOException e) {
+                    urlx = new String[0];
                 }
-                if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
-                if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
-                switchboard.wordIndex.deleteComplete(keyhash);
-            } catch (IOException e) {}
+            }
+            if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
+            if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
+            switchboard.wordIndex.deleteIndex(keyhash);
             post.remove("keyhashdeleteall");
             if ((keystring.length() > 0) &&
                 (plasmaWordIndexEntry.word2hash(keystring).equals(keyhash))) post.put("keystringsearch", "generated");
@@ -143,11 +141,9 @@ public class IndexControl_p {
         }
 
         if (post.containsKey("keyhashdelete")) {
-            try {
-                if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
-                if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
-                switchboard.wordIndex.removeEntries(keyhash, urlx, true);
-            } catch (IOException e) {}
+            if (delurlref) for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
+            if ((delurl) || (delurlref)) for (int i = 0; i < urlx.length; i++) switchboard.loadedURL.remove(urlx[i]);
+            switchboard.wordIndex.removeEntries(keyhash, urlx, true);
             // this shall lead to a presentation of the list; so handle that the remaining program
             // thinks that it was called for a list presentation
             post.remove("keyhashdelete");
@@ -200,16 +196,13 @@ public class IndexControl_p {
         plasmaWordIndexEntity[] indexes = new plasmaWordIndexEntity[1];
         String result;
         long starttime = System.currentTimeMillis();
-        try {indexes[0] = switchboard.wordIndex.getEntity(keyhash, true);
-            result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.loadedURL);
-        } catch (IOException e) {
-            result = "IOException: " + e.getMessage();
-        }
+        indexes[0] = switchboard.wordIndex.getEntity(keyhash, true);
+        result = yacyClient.transferIndex(yacyCore.seedDB.getConnected(post.get("hostHash", "")), indexes, switchboard.loadedURL);
         prop.put("result", (result == null) ? ("Successfully transferred " + indexes[0].size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
     }
 
     if (post.containsKey("keyhashsimilar")) {
-        Iterator hashIt = switchboard.wordIndex.hashIterator(keyhash, true, true, true);
+        Iterator hashIt = switchboard.wordIndex.wordHashes(keyhash, true, true);
         String result = "Sequential List of Word-Hashes: ";
         String hash;
         int i = 0;
@@ -294,7 +287,7 @@ public class IndexControl_p {
     }
 
     // insert constants
-    prop.put("wcount", "" + switchboard.wordIndex.sizeMin());
+    prop.put("wcount", "" + switchboard.wordIndex.size());
     prop.put("ucount", "" + switchboard.loadedURL.size());
     prop.put("indexDistributeChecked", (switchboard.getConfig("allowDistributeIndex", "true").equals("true")) ? "checked" : "");
     prop.put("indexReceiveChecked", (switchboard.getConfig("allowReceiveIndex", "true").equals("true")) ? "checked" : "");
diff --git a/htroot/IndexShare_p.java b/htroot/IndexShare_p.java
index 961c71910..4749dfcdb 100644
--- a/htroot/IndexShare_p.java
+++ b/htroot/IndexShare_p.java
@@ -65,7 +65,7 @@ public class IndexShare_p {
         prop.put("wordfreq", switchboard.getConfig("defaultWordReceiveFrequency","10"));
         prop.put("dtable", "");
         prop.put("rtable", "");
-        prop.put("wcount", "" + switchboard.wordIndex.sizeMin());
+        prop.put("wcount", "" + switchboard.wordIndex.size());
         prop.put("ucount", "" + switchboard.loadedURL.size());
         return prop; // be save
     }
@@ -78,7 +78,7 @@ public class IndexShare_p {
     }
 
     // insert constants
-    prop.put("wcount", "" + switchboard.wordIndex.sizeMin());
+    prop.put("wcount", "" + switchboard.wordIndex.size());
     prop.put("ucount", "" + switchboard.loadedURL.size());
     // return rewrite properties
     return prop;
diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java
index 244ce3d84..8bb66cd2b 100644
--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@@ -51,6 +51,7 @@ import java.util.Vector;
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaWordIndexEntry;
+import de.anomic.plasma.plasmaWordIndexEntryContainer;
 import de.anomic.server.serverObjects;
 import de.anomic.server.serverSwitch;
 import de.anomic.yacy.yacyCore;
@@ -106,17 +107,13 @@ public class transferRWI {
                 wordHash = estring.substring(0, p);
                 wordhashes[i] = wordHash;
                 entry = new plasmaWordIndexEntry(estring.substring(p));
-                try {
-                    switchboard.wordIndex.addEntry(wordHash, entry);
-                    urlHash = entry.getUrlHash();
-                    if ((!(unknownURL.contains(urlHash))) &&
-                        (!(switchboard.loadedURL.exists(urlHash)))) {
-                        unknownURL.add(urlHash);
-                    }
-                    received++;
-                } catch (IOException ee) {
-                    ee.printStackTrace();
+                switchboard.wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, entry));
+                urlHash = entry.getUrlHash();
+                if ((!(unknownURL.contains(urlHash))) &&
+                    (!(switchboard.loadedURL.exists(urlHash)))) {
+                    unknownURL.add(urlHash);
                 }
+                received++;
             }
         }
         yacyCore.seedDB.mySeed.incRI(received);
diff --git a/makerelease.sh b/makerelease.sh
index 45c5d7e9e..64a80e9a5 100755
--- a/makerelease.sh
+++ b/makerelease.sh
@@ -45,10 +45,10 @@
 # Contributions and changes to the program code must be marked as such.
 
 # define variables
-version='0.37'
+version='0.371'
 datestr=`date +%Y%m%d`
-release='yacy_v'$version'_'$datestr
-#release='yacy_dev_v'$version'_'$datestr
+#release='yacy_v'$version'_'$datestr
+release='yacy_dev_v'$version'_'$datestr
 extralibs='yacy_libx'
 target='RELEASE'
 classes='classes'
@@ -58,7 +58,7 @@ source='source'
 doc='doc'
 data='DATA'
 mainclass='yacy.java'
-classpath='$classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar'
+classpath='$classes:lib/commons-collections.jar:lib/commons-pool-1.2.jar:libx/PDFBox-0.7.1.jar:libx/log4j-1.2.9.jar:libx/tm-extractors-0.4.jar:libx/informa-0.6.0.jar:libx/jdom.jar'
 
 mkdir $release
 mkdir $extralibs
diff --git a/source/de/anomic/kelondro/kelondroMScoreCluster.java b/source/de/anomic/kelondro/kelondroMScoreCluster.java
index 21059748e..21a787372 100644
--- a/source/de/anomic/kelondro/kelondroMScoreCluster.java
+++ b/source/de/anomic/kelondro/kelondroMScoreCluster.java
@@ -301,16 +301,34 @@ public class kelondroMScoreCluster {
 
     public static void main(String[] args) {
         System.out.println("Test for Score: start");
-        long time = System.currentTimeMillis();
         kelondroMScoreCluster s = new kelondroMScoreCluster();
-        for (int i = 0; i < 10000; i++) s.addScore("score#" + i + "xxx" + i + "xxx" + i + "xxx" + i + "xxx", i/10);
+        int c = 0;
+
+        // create cluster
+        long time = System.currentTimeMillis();
+        for (int i = 0; i < 10000; i++) {
+            s.addScore("score#" + i + "xxx" + i + "xxx" + i + "xxx" + i + "xxx", i/10);
+            c += i/10;
+        }
+        /*
         System.out.println("result:");
         Object[] result;
         result = s.getScores(s.size(), true);
         for (int i = 0; i < s.size(); i++) System.out.println("up: " + result[i]);
         result = s.getScores(s.size(), false);
         for (int i = 0; i < s.size(); i++) System.out.println("down: " + result[i]);
-        System.out.println("Test for Score: finish. time = " + (System.currentTimeMillis() - time));
-        System.out.println("total=" + s.totalCount() + ", elements=" + s.size());
+        */
+        System.out.println("finished create. time = " + (System.currentTimeMillis() - time));
+        System.out.println("total=" + s.totalCount() + ", elements=" + s.size() + ", redundant count=" + c);
+
+        // delete cluster
+        time = System.currentTimeMillis();
+        for (int i = 0; i < 10000; i++) {
+            s.deleteScore("score#" + i + "xxx" + i + "xxx" + i + "xxx" + i + "xxx");
+            c -= i/10;
+        }
+        System.out.println("finished delete. 
time = " + (System.currentTimeMillis() - time)); + System.out.println("total=" + s.totalCount() + ", elements=" + s.size() + ", redundant count=" + c); + } } diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index c4f895f36..7cff5dac9 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -147,7 +147,7 @@ public class plasmaParserDocument { private synchronized void resortLinks() { Iterator i; String url; - int extpos; + int extpos, qpos; String ext = null; i = anchors.entrySet().iterator(); hyperlinks = new HashMap(); @@ -163,10 +163,10 @@ public class plasmaParserDocument { extpos = url.lastIndexOf("."); String normal; if (extpos > 0) { - if (url.indexOf("?") != -1) { - ext = url.substring(extpos,url.indexOf("?")).toLowerCase(); + if (((qpos = url.indexOf("?")) >= 0) && (qpos > extpos)) { + ext = url.substring(extpos, qpos).toLowerCase(); } else { - ext = url.substring(extpos).toLowerCase(); + ext = url.substring(extpos).toLowerCase(); } normal = plasmaParser.urlNormalform(url); if (normal != null) { diff --git a/source/de/anomic/plasma/plasmaSearch.java b/source/de/anomic/plasma/plasmaSearch.java index 98dcbf308..0121f0dd2 100644 --- a/source/de/anomic/plasma/plasmaSearch.java +++ b/source/de/anomic/plasma/plasmaSearch.java @@ -81,19 +81,11 @@ public class plasmaSearch { // we take mod 64**3 = 262144, this is the mask of the storage return (int) ((modified.getTime() / 86400000) % 262144); } - - public void addWordIndex(URL url, String urlHash, Date urlModified, int quality, String wordHash, int wordCount, int posintext, int posinphrase, int posofphraseint, String language, char doctype, boolean local) { - // this is called by the remote search procedure when a new index arrives from remote - plasmaWordIndexEntry entry = new plasmaWordIndexEntry(urlHash, wordCount, - posintext, posinphrase, posofphraseint, - calcVirtualAge(urlModified), quality, - language, doctype, local); - try { - wordIndex.addEntry(wordHash, entry); - } catch (IOException e) {} - // System.out.println("* received one index entry for URL: " + url); // debug - } + public void addWords(plasmaWordIndexEntryContainer container) { + wordIndex.addEntries(container); + } + public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, String language, char doctype) { // this is called by the switchboard to put in a new page into the index @@ -112,8 +104,7 @@ public class plasmaSearch { int count; plasmaWordIndexEntry entry; String wordHash; - int c = 0; - int p = 0; + int p = 0; while (i.hasNext()) { word = (String) i.next(); count = condenser.wordCount(word); @@ -121,9 +112,7 @@ public class plasmaSearch { wordHash = plasmaWordIndexEntry.word2hash(word); entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0, age, quality, language, doctype, true); - try { - c += wordIndex.addEntry(wordHash, entry); - } catch (IOException e) {} + wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, entry)); } //System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries"); return condenser.getWords().size(); diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 28d08d7fc..656150ba5 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -250,8 +250,9 @@ public class 
plasmaSwitchboard extends serverAbstractSwitch implements serverSwi loadedURL = new plasmaCrawlLURL(new File(plasmaPath, "urlHash.db"), ramLURL); noticeURL = new plasmaCrawlNURL(plasmaPath, ramNURL); errorURL = new plasmaCrawlEURL(new File(plasmaPath, "urlErr0.db"), ramEURL); - wordIndex = new plasmaWordIndex(plasmaPath, ramRWI); - wordIndex.setMaxWords(10000); + wordIndex = new plasmaWordIndex(plasmaPath, ramRWI, log); + int wordCacheMax = Integer.parseInt((String) getConfig("wordCacheMax", "10000")); + wordIndex.setMaxWords(wordCacheMax); searchManager = new plasmaSearch(loadedURL, wordIndex); // start a cache manager @@ -430,7 +431,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi } public int cacheSizeMin() { - return wordIndex.sizeMin(); + return wordIndex.size(); } public void enQueue(Object job) { @@ -1195,9 +1196,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi while (it.hasNext()) { word = (String) it.next(); // delete the URL reference in this word index - try { - count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true); - } catch (IOException e) {} + count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true); } return count; } @@ -1266,7 +1265,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi (yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.isVirgin()) || (loadedURL.size() < 10) || - (wordIndex.sizeMin() < 100) || + (wordIndex.size() < 100) || (!(yacyCore.seedDB.mySeed.isJunior()))) return false; int transferred; @@ -1369,7 +1368,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi Vector tmpEntities = new Vector(); String nexthash = ""; try { - Iterator wordHashIterator = wordIndex.hashIterator(hash, true, true, true); + Iterator wordHashIterator = wordIndex.wordHashes(hash, true, true); plasmaWordIndexEntity indexEntity, tmpEntity; Enumeration urlEnum; plasmaWordIndexEntry indexEntry; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index bab8bea2e..0d9bb1aea 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -55,16 +55,17 @@ import java.util.TreeSet; import de.anomic.kelondro.kelondroMSetTools; import de.anomic.yacy.yacySeedDB; +import de.anomic.server.serverLog; public class plasmaWordIndex { File databaseRoot; - plasmaWordIndexRAMCache ramCache; + plasmaWordIndexCache ramCache; - public plasmaWordIndex(File databaseRoot, int bufferkb) throws IOException { + public plasmaWordIndex(File databaseRoot, int bufferkb, serverLog log) throws IOException { this.databaseRoot = databaseRoot; - this.ramCache = new plasmaWordIndexRAMCache(databaseRoot, bufferkb); - ramCache.start(); + plasmaWordIndexClassicDB fileDB = new plasmaWordIndexClassicDB(databaseRoot, log); + this.ramCache = new plasmaWordIndexCache(databaseRoot, fileDB, log); } public int maxURLinWordCache() { @@ -79,107 +80,35 @@ public class plasmaWordIndex { ramCache.setMaxWords(maxWords); } - public int addEntry(String wordHash, plasmaWordIndexEntry entry) throws IOException { - return ramCache.addEntryToIndexMem(wordHash, entry); - } + public int addEntries(plasmaWordIndexEntryContainer entries) { + return ramCache.addEntries(entries, System.currentTimeMillis()); + } - public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty) throws IOException { - return 
ramCache.getIndexMem(wordHash, deleteIfEmpty); + public plasmaWordIndexEntity getEntity(String wordHash, boolean deleteIfEmpty) { + return ramCache.getIndex(wordHash, deleteIfEmpty); } - public int sizeMin() { - return ramCache.sizeMin(); + public int size() { + return ramCache.size(); } - public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) throws IOException { - return ramCache.removeEntriesMem(wordHash, urlHashes, deleteComplete); + public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { + return ramCache.removeEntries(wordHash, urlHashes, deleteComplete); } public void close(int waitingBoundSeconds) { ramCache.close(waitingBoundSeconds); } - public synchronized void deleteComplete(String wordHash) throws IOException { - ramCache.deleteComplete(wordHash); - } - - public synchronized Iterator hashIterator(String startHash, boolean up, boolean rot, boolean deleteEmpty) { - Iterator i = new iterateCombined(startHash, up, deleteEmpty); - if ((rot) && (!(i.hasNext())) && (startHash != null)) { - return new iterateCombined(null, up, deleteEmpty); - } else { - return i; - } + public void deleteIndex(String wordHash) { + ramCache.deleteIndex(wordHash); } - public class iterateCombined implements Iterator { - - Comparator comp; - Iterator filei; - Iterator cachei; - String nextf, nextc; - - public iterateCombined(String startHash, boolean up, boolean deleteEmpty) { - this.comp = kelondroMSetTools.fastStringComparator(up); - filei = fileIterator(startHash, up, deleteEmpty); - try { - cachei = ramCache.wordHashesMem(startHash, 100); - } catch (IOException e) { - cachei = new HashSet().iterator(); - } - nextFile(); - nextCache(); - } - - private void nextFile() { - if (filei.hasNext()) nextf = (String) filei.next(); else nextf = null; - } - private void nextCache() { - if (cachei.hasNext()) nextc = new String(((byte[][]) cachei.next())[0]); else nextc = null; - } - - public boolean hasNext() { - return (nextf != null) || (nextc != null); - } - - public Object next() { - String s; - if (nextc == null) { - s = nextf; - //System.out.println("Iterate Hash: take " + s + " from file, cache is empty"); - nextFile(); - return s;} - if (nextf == null) { - s = nextc; - //System.out.println("Iterate Hash: take " + s + " from cache, file is empty"); - nextCache(); - return s;} - // compare the strings - int c = comp.compare(nextf, nextc); - if (c == 0) { - s = nextf; - //System.out.println("Iterate Hash: take " + s + " from file&cache"); - nextFile(); - nextCache(); - return s; - } else if (c < 0) { - s = nextf; - //System.out.println("Iterate Hash: take " + s + " from file"); - nextFile(); - return s; - } else { - s = nextc; - //System.out.println("Iterate Hash: take " + s + " from cache"); - nextCache(); - return s; - } - } - - public void remove() { - - } + public Iterator wordHashes(String startHash, boolean up, boolean rot) { + return ramCache.wordHashes(startHash, up); } - + + public Iterator fileIterator(String startHash, boolean up, boolean deleteEmpty) { return new iterateFiles(startHash, up, deleteEmpty); } @@ -295,8 +224,8 @@ public class plasmaWordIndex { public static void main(String[] args) { //System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y")); try { - plasmaWordIndex index = new plasmaWordIndex(new File("D:\\dev\\proxy\\DATA\\PLASMADB"), 555); - Iterator i = index.hashIterator("5A8yhZMh_Kmv", true, true, true); + plasmaWordIndex index = new plasmaWordIndex(new 
File("D:\\dev\\proxy\\DATA\\PLASMADB"), 555, new serverLog("TESTAPP")); + Iterator i = index.wordHashes("5A8yhZMh_Kmv", true, true); while (i.hasNext()) { System.out.println("File: " + (String) i.next()); } diff --git a/source/de/anomic/plasma/plasmaWordIndexCache.java b/source/de/anomic/plasma/plasmaWordIndexCache.java new file mode 100644 index 000000000..37010ae7e --- /dev/null +++ b/source/de/anomic/plasma/plasmaWordIndexCache.java @@ -0,0 +1,380 @@ +// plasmaWordIndexCache.java +// ------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 6.5.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
+ + +package de.anomic.plasma; + +import java.io.*; +import java.util.*; +import java.lang.RuntimeException; +import de.anomic.kelondro.*; +import de.anomic.server.serverLog; +import de.anomic.yacy.yacySeedDB; + +public class plasmaWordIndexCache implements plasmaWordIndexInterface { + + private static final String indexDumpFileName = "indexDump.stack"; + + static String minKey, maxKey; + + // class variables + private File databaseRoot; + private plasmaWordIndexInterface backend; + private TreeMap cache; + private kelondroMScoreCluster hashScore; + private HashMap hashDate; + private int maxWords; + private serverLog log; + + static { + maxKey = ""; + for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += 'z'; + minKey = ""; + for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; + } + + public plasmaWordIndexCache(File databaseRoot, plasmaWordIndexInterface backend, serverLog log) { + this.databaseRoot = databaseRoot; + this.cache = new TreeMap(); + this.hashScore = new kelondroMScoreCluster(); + this.hashDate = new HashMap(); + this.maxWords = 10000; + this.backend = backend; + this.log = log; + try { + restore(); + } catch (IOException e){ + log.logError("unable to restore cache dump: " + e.getMessage()); + e.printStackTrace(); + } + } + + private void dump(int waitingSeconds) throws IOException { + log.logSystem("creating dump for index cache, " + cache.size() + " words (and much more urls)"); + File indexDumpFile = new File(databaseRoot, indexDumpFileName); + if (indexDumpFile.exists()) indexDumpFile.delete(); + kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0, new int[]{plasmaWordIndexEntry.wordHashLength, 4, 8, plasmaWordIndexEntry.attrSpaceLong}); + long startTime = System.currentTimeMillis(); + long messageTime = System.currentTimeMillis() + 5000; + long wordsPerSecond = 0, wordcount = 0, urlcount = 0; + synchronized (cache) { + Iterator i = cache.entrySet().iterator(); + Map.Entry entry; + String wordHash; + plasmaWordIndexEntryContainer container; + long creationTime; + plasmaWordIndexEntry wordEntry; + byte[][] row = new byte[4][]; + while (i.hasNext()) { + // get entries + entry = (Map.Entry) i.next(); + wordHash = (String) entry.getKey(); + creationTime = getCreationTime(wordHash); + container = (plasmaWordIndexEntryContainer) entry.getValue(); + + // put entries on stack + if (container != null) { + Iterator ci = container.entries(); + while (ci.hasNext()) { + wordEntry = (plasmaWordIndexEntry) ci.next(); + row[0] = wordHash.getBytes(); + row[1] = kelondroRecords.long2bytes(container.size(), 4); + row[2] = kelondroRecords.long2bytes(creationTime, 8); + row[3] = wordEntry.toEncodedForm(true).getBytes(); + dumpStack.push(row); + urlcount++; + } + } + wordcount++; + + // write a log + if (System.currentTimeMillis() > messageTime) { + wordsPerSecond = wordcount * 1000 / (1 + System.currentTimeMillis() - startTime); + log.logInfo("dumping status: " + wordcount + " words done, " + ((cache.size() - wordcount) / wordsPerSecond) + " seconds remaining"); + messageTime = System.currentTimeMillis() + 5000; + } + } + } + log.logSystem("dumped " + urlcount + " word/url relations in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); + } + + private long restore() throws IOException { + File indexDumpFile = new File(databaseRoot, indexDumpFileName); + if (!(indexDumpFile.exists())) return 0; + kelondroStack dumpStack = new kelondroStack(indexDumpFile, 0); + log.logSystem("restore dump of index cache, " + dumpStack.size() + " word/url 
relations"); + long startTime = System.currentTimeMillis(); + long messageTime = System.currentTimeMillis() + 5000; + long urlCount = 0, urlsPerSecond = 0; + synchronized (cache) { + Iterator i = dumpStack.iterator(); + kelondroRecords.Node node; + String wordHash; + plasmaWordIndexEntryContainer container; + long creationTime; + plasmaWordIndexEntry wordEntry; + byte[][] row = new byte[4][]; + while (i.hasNext()) { + // get out one entry + node = (kelondroRecords.Node) i.next(); + row = node.getValues(); + wordHash = new String(row[0]); + creationTime = kelondroRecords.bytes2long(row[2]); + wordEntry = new plasmaWordIndexEntry(wordHash, new String(row[3])); + + // store to cache + addEntry(wordHash, wordEntry, creationTime); + urlCount++; + + // write a log + if (System.currentTimeMillis() > messageTime) { + urlsPerSecond = urlCount * 1000 / (1 + System.currentTimeMillis() - startTime); + log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpStack.size() - urlCount) / urlsPerSecond) + " seconds remaining"); + messageTime = System.currentTimeMillis() + 5000; + } + } + } + log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds"); + return urlCount; + } + + public int maxURLinWordCache() { + return hashScore.getScore(hashScore.getMaxObject()); + } + + public int wordCacheRAMSize() { + return cache.size(); + } + + public void setMaxWords(int maxWords) { + this.maxWords = maxWords; + } + + public int size() { + if (backend.size() > cache.size()) return backend.size(); else return cache.size(); + } + + public Iterator wordHashes(String startWordHash, boolean up) { + if (!(up)) throw new RuntimeException("plasmaWordIndexCache.wordHashes can only count up"); + return new iterateCombined(cache.keySet().iterator(), backend.wordHashes(startWordHash, true), true); + } + + public class iterateCombined implements Iterator { + + Comparator comp; + Iterator a, b; + String na, nb; + boolean up; + + public iterateCombined(Iterator a, Iterator b, boolean up) { + this.a = a; + this.b = b; + this.up = up; + this.comp = kelondroMSetTools.fastStringComparator(up); + nexta(); + nextb(); + } + + private void nexta() { + if (a.hasNext()) na = (String) a.next(); else na = null; + } + private void nextb() { + if (b.hasNext()) nb = (String) b.next(); else nb = null; + } + + public boolean hasNext() { + return (na != null) || (nb != null); + } + + public Object next() { + String s; + if (na == null) { + s = nb; + nextb(); + return s; + } + if (nb == null) { + s = na; + nexta(); + return s; + } + // compare the strings + int c = comp.compare(na, nb); + if (c == 0) { + s = na; + //System.out.println("Iterate Hash: take " + s + " from file&cache"); + nexta(); + nextb(); + return s; + } else if ((up) && (c < 0)) { + s = na; + nexta(); + return s; + } else { + s = nb; + nextb(); + return s; + } + } + + public void remove() { + + } + } + + private int flushKey(String key) { + plasmaWordIndexEntryContainer container = null; + long time; + synchronized (cache) { + container = (plasmaWordIndexEntryContainer) cache.get(key); + if (container == null) return 0; // flushing of nonexisting key + time = getCreationTime(key); + cache.remove(key); + hashScore.deleteScore(key); + hashDate.remove(key); + } + return backend.addEntries(container, time); + } + + private int flushToLimit() { + if ((hashScore.size() == 0) && (cache.size() == 0)) { + serverLog.logDebug("PLASMA INDEXING", "flushToLimit: called but cache is empty"); + return 0; + } + if 
((hashScore.size() == 0) && (cache.size() != 0)) { + serverLog.logError("PLASMA INDEXING", "flushToLimit: hashScore.size=0 but cache.size=" + cache.size()); + return 0; + } + if ((hashScore.size() != 0) && (cache.size() == 0)) { + serverLog.logError("PLASMA INDEXING", "flushToLimit: hashScore.size=" + hashScore.size() + " but cache.size=0"); + return 0; + } + + //serverLog.logDebug("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + ", cache.size=" + cache.size()); + int total = 0; + synchronized (hashScore) { + String key; + int count; + Long createTime; + while (hashScore.size() >= maxWords) { + key = (String) hashScore.getMaxObject(); + createTime = (Long) hashDate.get(key); + count = hashScore.getScore(key); + if ((createTime != null) && ((System.currentTimeMillis() - createTime.longValue()) < 9000)) { + log.logDebug("key " + key + " is too fresh, abandon flush (count=" + count + ", cachesize=" + cache.size() + ")"); + break; + } + if (count < 5) log.logWarning("flushing of key " + key + " not appropriate (too less entries, count=" + count + "): increase cache size"); + log.logDebug("flushing key " + key + ", count=" + count + ", cachesize=" + cache.size()); + total += flushKey(key); + if (total > 100) break; + } + } + return total; + } + + public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { + flushKey(wordHash); + return backend.getIndex(wordHash, deleteIfEmpty); + } + + public long getCreationTime(String wordHash) { + Long time = (Long) hashDate.get(wordHash); + if (time == null) return 0; + return time.longValue(); + } + + public void deleteIndex(String wordHash) { + synchronized (cache) { + cache.remove(wordHash); + hashScore.deleteScore(wordHash); + hashDate.remove(wordHash); + } + backend.deleteIndex(wordHash); + } + + public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { + flushKey(wordHash); + return backend.removeEntries(wordHash, urlHashes, deleteComplete); + } + + public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { + //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); + flushToLimit(); + //if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); + + // put new words into cache + int added = 0; + synchronized (cache) { + String wordHash = container.wordHash(); + plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); // null pointer exception? wordhash != null! 
must be cache==null + if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash); + added = entries.add(container); + if (added > 0) { + cache.put(wordHash, entries); + hashScore.addScore(wordHash, added); + hashDate.put(wordHash, new Long(creationTime)); + } + } + //System.out.println("DEBUG: cache = " + cache.toString()); + return added; + } + + private void addEntry(String wordHash, plasmaWordIndexEntry newEntry, long creationTime) { + plasmaWordIndexEntryContainer entries = (plasmaWordIndexEntryContainer) cache.get(wordHash); + if (entries == null) entries = new plasmaWordIndexEntryContainer(wordHash); + if (entries.add(newEntry)) { + cache.put(wordHash, entries); + hashScore.incScore(wordHash); + hashDate.put(wordHash, new Long(creationTime)); + } + } + + public void close(int waitingSeconds) { + try { + dump(waitingSeconds); + } catch (IOException e){ + log.logError("unable to dump cache: " + e.getMessage()); + e.printStackTrace(); + } + } + +} diff --git a/source/de/anomic/plasma/plasmaWordIndexClassicDB.java b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java new file mode 100644 index 000000000..14782905f --- /dev/null +++ b/source/de/anomic/plasma/plasmaWordIndexClassicDB.java @@ -0,0 +1,255 @@ +// plasmaWordIndexClassicDB.java +// ----------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 6.5.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
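The file-based backend that follows implements the same plasmaWordIndexInterface as the cache above, so callers only ever see plasmaWordIndex. A small, hedged sketch of the read path through that facade, modeled on the plasmaWordIndex.main() test shown earlier; the path, buffer size and start hash are placeholders taken from that test, and the loop is capped only to keep the sketch finite.

    import java.io.File;
    import java.io.IOException;
    import java.util.Enumeration;
    import java.util.Iterator;
    import de.anomic.plasma.plasmaWordIndex;
    import de.anomic.plasma.plasmaWordIndexEntity;
    import de.anomic.plasma.plasmaWordIndexEntry;
    import de.anomic.server.serverLog;

    public class WordIndexReadSketch {
        public static void main(String[] args) throws IOException {
            plasmaWordIndex index = new plasmaWordIndex(new File("DATA/PLASMADB"), 555, new serverLog("SKETCH"));
            // walk the stored word hashes (upwards, with rotation), as in plasmaWordIndex.main()
            Iterator i = index.wordHashes("5A8yhZMh_Kmv", true, true);
            int n = 0;
            while ((i.hasNext()) && (n++ < 20)) {
                String wordHash = (String) i.next();
                // fetch the entity for this word hash and list the url hashes it references
                plasmaWordIndexEntity entity = index.getEntity(wordHash, true);
                if (entity == null) continue;
                Enumeration urls = entity.elements(true);
                while (urls.hasMoreElements()) {
                    System.out.println(wordHash + " -> " + ((plasmaWordIndexEntry) urls.nextElement()).getUrlHash());
                }
            }
            index.close(60);
        }
    }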
+ + +package de.anomic.plasma; + +import java.io.File; +import java.io.IOException; +import java.util.*; + +import de.anomic.kelondro.*; +import de.anomic.server.serverLog; +import de.anomic.yacy.yacySeedDB; + +public class plasmaWordIndexClassicDB implements plasmaWordIndexInterface { + + + // class variables + private File databaseRoot; + private serverLog log; + private int size; + + public plasmaWordIndexClassicDB(File databaseRoot, serverLog log) throws IOException { + this.databaseRoot = databaseRoot; + this.log = log; + this.size = 0; + } + + public int size() { + return size; + } + + public Iterator wordHashes(String startHash, boolean up) { + return new iterateFiles(startHash, up); + } + + public class iterateFiles implements Iterator { + + private ArrayList hierarchy; // contains TreeSet elements, earch TreeSet contains File Entries + private Comparator comp; // for string-compare + private String buffer; // the prefetch-buffer + + public iterateFiles(String startHash, boolean up) { + this.hierarchy = new ArrayList(); + this.comp = kelondroMSetTools.fastStringComparator(up); + + // the we initially fill the hierarchy with the content of the root folder + String path = "WORDS"; + TreeSet list = list(new File(databaseRoot, path)); + + // if we have a start hash then we find the appropriate subdirectory to start + if ((startHash != null) && (startHash.length() == yacySeedDB.commonHashLength)) { + delete(startHash.substring(0, 1), list); + if (list.size() > 0) { + hierarchy.add(list); + String[] paths = new String[]{startHash.substring(0, 1), startHash.substring(1, 2), startHash.substring(2, 4), startHash.substring(4, 6)}; + int pathc = 0; + while ((pathc < paths.length) && + (comp.compare((String) list.first(), paths[pathc]) == 0)) { + path = path + "/" + paths[pathc]; + list = list(new File(databaseRoot, path)); + delete(paths[pathc], list); + if (list.size() == 0) break; + hierarchy.add(list); + pathc++; + } + } + while (((buffer = next0()) != null) && (comp.compare(buffer, startHash) < 0)) {}; + } else { + hierarchy.add(list); + buffer = next0(); + } + } + + private synchronized void delete(String pattern, TreeSet names) { + String name; + while ((names.size() > 0) && (comp.compare((new File(name = (String) names.first())).getName(), pattern) < 0)) names.remove(name); + } + + private TreeSet list(File path) { + //System.out.println("PATH: " + path); + TreeSet t = new TreeSet(comp); + String[] l = path.list(); + if (l != null) for (int i = 0; i < l.length; i++) t.add(path + "/" + l[i]); + //else System.out.println("DEBUG: wrong path " + path); + //System.out.println(t); + return t; + } + + private synchronized String next0() { + // the object is a File pointing to the corresponding file + File f; + String n; + TreeSet t; + do { + t = null; + while ((t == null) && (hierarchy.size() > 0)) { + t = (TreeSet) hierarchy.get(hierarchy.size() - 1); + if (t.size() == 0) { + hierarchy.remove(hierarchy.size() - 1); // we step up one hierarchy + t = null; + } + } + if ((hierarchy.size() == 0) || (t.size() == 0)) return null; // this is the end + // fetch value + f = new File(n = (String) t.first()); + t.remove(n); + // if the value represents another folder, we step into the next hierarchy + if (f.isDirectory()) { + t = list(f); + if (t.size() == 0) { + // the folder is empty, delete it + f.delete(); + } else { + hierarchy.add(t); + } + f = null; + } + } while (f == null); + // thats it + if ((f == null) || ((n = f.getName()) == null) || (n.length() < yacySeedDB.commonHashLength)) { + return 
null; + } else { + return n.substring(0, yacySeedDB.commonHashLength); + } + } + + public boolean hasNext() { + return buffer != null; + } + + public Object next() { + String r = buffer; + while (((buffer = next0()) != null) && (comp.compare(buffer, r) < 0)) {}; + return r; + } + + public void remove() { + + } + } + + public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { + try { + return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty); + } catch (IOException e) { + log.logError("plasmaWordIndexClassic.getIndex: " + e.getMessage()); + return null; + } + } + + public long getCreationTime(String wordHash) { + File f = plasmaWordIndexEntity.wordHash2path(databaseRoot, wordHash); + if (f.exists()) return f.lastModified(); else return -1; + } + + + public void deleteIndex(String wordHash) { + try { + plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash); + } catch (IOException e) { + log.logError("plasmaWordIndexClassic.deleteIndex: " + e.getMessage()); + return; + } + } + + public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { + // removes all given url hashes from a single word index. Returns number of deletions. + plasmaWordIndexEntity pi = getIndex(wordHash, true); + int count = 0; + try { + for (int i = 0; i < urlHashes.length; i++) + if (pi.removeEntry(urlHashes[i], deleteComplete)) count++; + int size = pi.size(); + pi.close(); pi = null; + // check if we can remove the index completely + if ((deleteComplete) && (size == 0)) deleteIndex(wordHash); + return count; + } catch (IOException e) { + log.logError("plasmaWordIndexClassic.removeEntries: " + e.getMessage()); + return count; + } + } + + public int addEntries(plasmaWordIndexEntryContainer container, long creationTime) { + //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug + // fetch the index cache + if (container.size() == 0) return 0; + + // open file + try { + plasmaWordIndexEntity pi = new plasmaWordIndexEntity(databaseRoot, container.wordHash(), false); + int count = 0; + + // write from vector + if (container != null) { + Iterator i = container.entries(); + while (i.hasNext()) { + if (pi.addEntry((plasmaWordIndexEntry) i.next())) count++; + } + } + + // close and return + pi.close(); + pi = null; + return count; + } catch (IOException e) { + log.logError("plasmaWordIndexClassic.addEntries: " + e.getMessage()); + return 0; + } + } + + public void close(int waitingSeconds) { + + } + +} diff --git a/source/de/anomic/plasma/plasmaWordIndexEntity.java b/source/de/anomic/plasma/plasmaWordIndexEntity.java index 8d9adf656..4c9388e33 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntity.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntity.java @@ -1,5 +1,5 @@ -// plasmaIndex.java -// ----------------------- +// plasmaWordIndexEntity.java +// -------------------------- // part of YACY // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de @@ -154,12 +154,11 @@ public class plasmaWordIndexEntity { if (theTmpMap == null) return (theIndex.get(entry.getUrlHash().getBytes()) != null); else return (theTmpMap.containsKey(entry.getUrlHash())); } - public void addEntry(plasmaWordIndexEntry entry) throws IOException { + public boolean addEntry(plasmaWordIndexEntry entry) throws IOException { if (theTmpMap == null) { - theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm(false).getBytes()); - //System.out.println(theIndex.toString()); // debug + 
return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm(false).getBytes()) == null); } else { - theTmpMap.put(entry.getUrlHash(), entry); + return (theTmpMap.put(entry.getUrlHash(), entry) == null); } } diff --git a/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java new file mode 100644 index 000000000..13c252e9a --- /dev/null +++ b/source/de/anomic/plasma/plasmaWordIndexEntryContainer.java @@ -0,0 +1,97 @@ +// plasmaIndexEntryContainer.java +// ------------------------------ +// part of YaCy +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 07.05.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// +// Using this software in any meaning (reading, learning, copying, compiling, +// running) means that you agree that the Author(s) is (are) not responsible +// for cost, loss of data or any harm that may be caused directly or indirectly +// by usage of this softare or this documentation. The usage of this software +// is on your own risk. The installation and usage (starting/running) of this +// software may allow other people or application to access your computer and +// any attached devices and is highly dependent on the configuration of the +// software which must be done by the user of the software; the author(s) is +// (are) also not responsible for proper configuration and usage of the +// software, even if provoked by documentation provided together with +// the software. +// +// Any changes to this file according to the GPL as documented in the file +// gpl.txt aside this file in the shipment you received can be done to the +// lines that follows this copyright notice here, but changes must not be +// done inside the copyright notive above. A re-distribution must contain +// the intact and unchanged copyright notice. +// Contributions and changes to the program code must be marked as such. 
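The container class defined next is the unit in which index entries now travel through the system (see transferRWI and plasmaSearch above). A short sketch of its intended semantics follows; the url hash, language and doctype values are placeholders, and the entry constructor arguments follow the pattern used in plasmaSearch.addPageIndex, so treat this as an assumption rather than a reference.

    import de.anomic.plasma.plasmaWordIndexEntry;
    import de.anomic.plasma.plasmaWordIndexEntryContainer;

    public class ContainerSketch {
        public static void main(String[] args) {
            String wordHash = plasmaWordIndexEntry.word2hash("yacy"); // hash of the indexed word
            String urlHash  = "AAAAAAAAAAAA";                         // placeholder 12-character url hash
            // urlHash, word count, pos-in-text, pos-in-phrase, pos-of-phrase, virtual age,
            // quality, language, doctype, local flag - pattern as in plasmaSearch.addPageIndex
            plasmaWordIndexEntry entry = new plasmaWordIndexEntry(urlHash, 1, 0, 0, 0, 0, 0, "en", 't', true);
            plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash);
            boolean first  = c.add(entry);  // true: url hash not yet in this container
            boolean second = c.add(entry);  // false: duplicate url hashes are ignored
            // shortcut used by transferRWI and plasmaSearch for a single entry:
            plasmaWordIndexEntryContainer single = plasmaWordIndexEntryContainer.instantContainer(wordHash, entry);
            int merged = c.add(single);     // 0 here: the url hash is already present
            System.out.println(c + ", first=" + first + ", second=" + second + ", merged=" + merged);
            // a word index would now take the whole container: wordIndex.addEntries(c)
        }
    }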
+ +package de.anomic.plasma; + +import java.util.*; + +public class plasmaWordIndexEntryContainer { + + private String wordHash; + private HashMap container; + + public plasmaWordIndexEntryContainer(String wordHash) { + this.wordHash = wordHash; + container = new HashMap(); // a urlhash/plasmaWordIndexEntry - relation + } + + public int size() { + return container.size(); + } + + public String wordHash() { + return wordHash; + } + + public boolean add(plasmaWordIndexEntry entry) { + // returns true if the new entry was added, false if it already existet + String urlHash = entry.getUrlHash(); + if (container.containsKey(urlHash)) return false; + container.put(urlHash, entry); + return true; + } + + public int add(plasmaWordIndexEntryContainer c) { + // returns the number of new elements + Iterator i = c.entries(); + int x = 0; + while (i.hasNext()) { + if (add((plasmaWordIndexEntry) i.next())) x++; + } + return x; + } + + public Iterator entries() { + // returns an iterator of plasmaWordIndexEntry objects + return container.values().iterator(); + } + + public static plasmaWordIndexEntryContainer instantContainer(String wordHash, plasmaWordIndexEntry entry) { + plasmaWordIndexEntryContainer c = new plasmaWordIndexEntryContainer(wordHash); + c.add(entry); + return c; + } + + public String toString() { + return "C[" + wordHash + "] has " + container.size() + " entries"; + } + +} diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCache.java b/source/de/anomic/plasma/plasmaWordIndexFileCache.java deleted file mode 100644 index e6477a8e7..000000000 --- a/source/de/anomic/plasma/plasmaWordIndexFileCache.java +++ /dev/null @@ -1,275 +0,0 @@ -// plasmaWordIndexFileCache.java -// ----------------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// last major change: 22.01.2004 -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. 
-// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - - -/* - The plasmaIndexCache manages a database table with a list of - indexEntries in it. This is done in a completely different fashion - as organized by the plasmaIndex tables. The entries are not - sorted and just stored in a buffer. - Whenever during a seach an index is retrieved, first it's buffer - is flushed into the corresponding index table, so that it can be - sorted into the remaining index entry elements. - The cache database consist of - - the word hash as primary key - - one column with a one-byte counter - - a number of more columns with indexEntry elements -*/ - - -// compile with -// javac -classpath classes -sourcepath source -d classes -g source/de/anomic/plasma/*.java - -package de.anomic.plasma; - -import java.io.File; -import java.io.IOException; -import java.util.HashSet; -import java.util.Iterator; -import java.util.Vector; - -import de.anomic.kelondro.kelondroException; -import de.anomic.kelondro.kelondroTree; -import de.anomic.server.serverLog; - -public class plasmaWordIndexFileCache { - - private static final String indexCacheFileName = "indexCache.db"; - private static final int buffers = 50; // number of buffered entries per word - - // class variables - private File databaseRoot; - private kelondroTree indexCache; - private int bufferkb; - - public plasmaWordIndexFileCache(File databaseRoot, int bufferkb) throws IOException { - this.databaseRoot = databaseRoot; - this.bufferkb = bufferkb; - File indexCacheFile = new File(databaseRoot, indexCacheFileName); - if (indexCacheFile.exists()) { - // simply open the file - indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400); - } else { - createCacheFile(indexCacheFile); - } - } - - private void resetCacheFile() { - // this has to be used in emergencies only - // it can happen that there is a serious db inconsistency; in that case we re-create the indexCache - try { indexCache.close(); } catch (IOException e) {} - File indexCacheFile = new File(databaseRoot, indexCacheFileName); - if (indexCacheFile.exists()) indexCacheFile.delete(); - try { - createCacheFile(indexCacheFile); - } catch (IOException e) { - de.anomic.server.serverLog.logError("PLASMA", "plasmaWordIndexFileCache.resetCacheFile: serious failure creating the cache file: " + e.getMessage()); - indexCache = null; - } - } - - private void createCacheFile(File indexCacheFile) throws IOException { - // create a new file - int[] columns = new int[buffers + 2]; - columns[0] = plasmaWordIndexEntry.wordHashLength; - columns[1] = 1; - for (int i = 0; i < buffers; i++) columns[i + 2] = plasmaCrawlLURL.urlHashLength + plasmaWordIndexEntry.attrSpaceShort; - indexCache = new kelondroTree(indexCacheFile, bufferkb * 0x400, columns); - } - - protected void close() throws IOException { - indexCache.close(); - indexCache = null; - } - - private byte[][] getCache(String wordHash) throws IOException { - // read one line from the cache; if none exists: construct one - byte[][] row; - try { - row = indexCache.get(wordHash.getBytes()); - } catch (Exception e) { - // we had some negativeSeekOffsetExceptions here, and also loops may cause this - // 
in that case the indexCache is corrupt - System.out.println("Error in plasmaWordINdexFileCache.getCache: index for hash " + wordHash + " is corrupt:" + e.toString()); - //e.printStackTrace(); - row = null; - } - if (row == null) { - row = new byte[indexCache.columns()][]; - row[0] = wordHash.getBytes(); - row[1] = new byte[1]; - row[1][0] = (byte) 0; - } - return row; - } - - - protected Iterator wordHashes(String wordHash, boolean up) throws IOException { - try { - return indexCache.rows(up, false, (wordHash == null) ? null : wordHash.getBytes()); - } catch (kelondroException e) { - de.anomic.server.serverLog.logError("PLASMA", "kelondro error in plasmaWordIndexFileCache: " + e.getMessage() + "; deleting index for " + wordHash); - deleteComplete(wordHash); - return new HashSet().iterator(); - } - } - - protected plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) throws IOException { - // first flush the index cache, if there is any for that word hash - byte[][] row = indexCache.get(wordHash.getBytes()); - if (row != null) { - int entries = (int) row[1][0]; - if (entries != 0) flushCache(row, null); // if the cache has entries, flush it - indexCache.remove(wordHash.getBytes()); // delete the cache index row; suppose to be empty now - } - // then return the index from the uncached file (with new entries) - return new plasmaWordIndexEntity(databaseRoot, wordHash, deleteIfEmpty); - } - - protected void addEntriesToIndex(String wordHash, Vector /* of plasmaIndexEntry */ newEntries) throws IOException { - //System.out.println("* adding " + newEntries.size() + " cached word index entries for word " + wordHash); // debug - // fetch the index cache - if (newEntries.size() == 0) return; - byte[][] row = getCache(wordHash); - int entries = (int) row[1][0]; - // check if the index cache is full - if (entries + 2 + newEntries.size() >= indexCache.columns()) { - flushCache(row, newEntries); // and put in new values - entries = 0; - row[1][0] = (byte) 0; // set number of entries to zero - } else { - // put in the new values - String newEntry; - for (int i = 0; i < newEntries.size(); i++) { - newEntry = ((plasmaWordIndexEntry) newEntries.elementAt(i)).getUrlHash() + ((plasmaWordIndexEntry) newEntries.elementAt(i)).toEncodedForm(false); - row[entries + 2] = newEntry.getBytes(); - entries++; - } - row[1][0] = (byte) entries; - try { - indexCache.put(row); - } catch (kelondroException e) { - // this is a very bad case; a database inconsistency occurred - serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted"); - resetCacheFile(); - } catch (IOException e) { - // this is a very bad case; a database inconsistency occurred - serverLog.logError("PLASMA", "fatal error in plasmaWordIndexFileCache.addEntriesToIndex: write of " + wordHash + " to index cache failed - " + e.getMessage() + " - indexCache.db deleted"); - resetCacheFile(); - } - } - // finished! - } - - protected void deleteComplete(String wordHash) throws IOException { - plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash); - indexCache.remove(wordHash.getBytes()); - } - - protected int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) throws IOException { - // removes all given url hashes from a single word index. Returns number of deletions. 
- plasmaWordIndexEntity pi = getIndex(wordHash, true); - int count = 0; - for (int i = 0; i < urlHashes.length; i++) if (pi.removeEntry(urlHashes[i], deleteComplete)) count++; - int size = pi.size(); - pi.close(); pi = null; - // check if we can remove the index completely - if ((deleteComplete) && (size == 0)) { - // remove index - if (!(plasmaWordIndexEntity.removePlasmaIndex(databaseRoot, wordHash))) - System.out.println("DEBUG: cannot remove index file for word hash " + wordHash); - // remove cache - indexCache.remove(wordHash.getBytes()); - } - return count; - } - - private synchronized void flushCache(byte[][] row, Vector indexEntries) throws IOException { - String wordHash = new String(row[0]); - int entries = (int) row[1][0]; - if ((entries == 0) && ((indexEntries == null) || (indexEntries.size() == 0))) return; - - // open file - plasmaWordIndexEntity pi = new plasmaWordIndexEntity(databaseRoot, wordHash, false); - - // write from array - plasmaWordIndexEntry entry; - for (int i = 0; i < entries; i++) { - entry = new plasmaWordIndexEntry(new String(row[i + 2], 0, plasmaCrawlLURL.urlHashLength), - new String(row[i + 2], plasmaCrawlLURL.urlHashLength, plasmaWordIndexEntry.attrSpaceShort)); - pi.addEntry(entry); - } - - // write from vector - if (indexEntries != null) { - for (int i = 0; i < indexEntries.size(); i++) - pi.addEntry((plasmaWordIndexEntry) indexEntries.elementAt(i)); - } - - // close and return - pi.close(); - pi = null; - } - - private int size(String wordHash) throws IOException { - // return number of entries in specific cache - byte[][] row = indexCache.get(wordHash.getBytes()); - if (row == null) return 0; - return (int) row[1][0]; - } - - protected int size() { - if (indexCache == null) return 0; else return indexCache.size(); - } - - /* - private plasmaIndex getIndexF(String wordHash) throws IOException { - return new plasmaIndex(databaseRoot, wordHash); - } - - private void addEntryToIndexF(String wordHash, plasmaIndexEntry entry) throws IOException { - plasmaIndex pi = new plasmaIndex(databaseRoot, wordHash); - pi.addEntry(entry); - pi.close(); - } - */ - -} diff --git a/source/de/anomic/plasma/plasmaWordIndexInterface.java b/source/de/anomic/plasma/plasmaWordIndexInterface.java new file mode 100644 index 000000000..218c7b58e --- /dev/null +++ b/source/de/anomic/plasma/plasmaWordIndexInterface.java @@ -0,0 +1,62 @@ +// plasmaWordIndexInterface.java +// ----------------------------- +// part of YACY +// (C) by Michael Peter Christen; mc@anomic.de +// first published on http://www.anomic.de +// Frankfurt, Germany, 2005 +// last major change: 6.5.2005 +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
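
The file opened above introduces plasmaWordIndexInterface; its declaration follows the license header below. A bare skeleton of an implementing class, with an invented class name and placeholder bodies, shows what the contract asks for:

package de.anomic.plasma;

import java.util.Iterator;
import java.util.TreeSet;

// Skeleton only: the method set mirrors the interface declared below; bodies are placeholders.
public class plasmaWordIndexSketch implements plasmaWordIndexInterface {

    private final TreeSet hashes = new TreeSet(); // sorted word hashes, nothing else

    public int size() { return hashes.size(); }

    public Iterator wordHashes(String startWordHash, boolean up) {
        // only the ascending case is sketched; up == false would need a descending view
        return (startWordHash == null) ? hashes.iterator()
                                       : hashes.tailSet(startWordHash).iterator();
    }

    public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty) { return null; }
    public long getCreationTime(String wordHash) { return System.currentTimeMillis(); }
    public void deleteIndex(String wordHash) { hashes.remove(wordHash); }

    public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete) { return 0; }
    public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime) { return 0; }

    public void close(int waitingSeconds) { hashes.clear(); }
}

Size, ordered word-hash traversal, entry addition and removal: roughly the surface the deleted cache classes exposed informally, now pinned down in one contract.
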
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+//
+// Using this software in any meaning (reading, learning, copying, compiling,
+// running) means that you agree that the Author(s) is (are) not responsible
+// for cost, loss of data or any harm that may be caused directly or indirectly
+// by usage of this softare or this documentation. The usage of this software
+// is on your own risk. The installation and usage (starting/running) of this
+// software may allow other people or application to access your computer and
+// any attached devices and is highly dependent on the configuration of the
+// software which must be done by the user of the software; the author(s) is
+// (are) also not responsible for proper configuration and usage of the
+// software, even if provoked by documentation provided together with
+// the software.
+//
+// Any changes to this file according to the GPL as documented in the file
+// gpl.txt aside this file in the shipment you received can be done to the
+// lines that follows this copyright notice here, but changes must not be
+// done inside the copyright notive above. A re-distribution must contain
+// the intact and unchanged copyright notice.
+// Contributions and changes to the program code must be marked as such.
+
+
+package de.anomic.plasma;
+
+import java.util.*;
+
+public interface plasmaWordIndexInterface {
+
+    public int size();
+
+    public Iterator wordHashes(String startWordHash, boolean up);
+
+    public plasmaWordIndexEntity getIndex(String wordHash, boolean deleteIfEmpty);
+    public long getCreationTime(String wordHash);
+    public void deleteIndex(String wordHash);
+
+    public int removeEntries(String wordHash, String[] urlHashes, boolean deleteComplete);
+    public int addEntries(plasmaWordIndexEntryContainer newEntries, long creationTime);
+
+    public void close(int waitingSeconds);
+
+}
diff --git a/source/de/anomic/plasma/plasmaWordIndexRAMCache.java b/source/de/anomic/plasma/plasmaWordIndexRAMCache.java
deleted file mode 100644
index 31cbf1099..000000000
--- a/source/de/anomic/plasma/plasmaWordIndexRAMCache.java
+++ /dev/null
@@ -1,253 +0,0 @@
-// plasmaIndexRAMCache.java
-// -----------------------
-// part of YACY
-// (C) by Michael Peter Christen; mc@anomic.de
-// first published on http://www.anomic.de
-// Frankfurt, Germany, 2004
-// last major change: 22.12.2004
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-//
-// Using this software in any meaning (reading, learning, copying, compiling,
-// running) means that you agree that the Author(s) is (are) not responsible
-// for cost, loss of data or any harm that may be caused directly or indirectly
-// by usage of this softare or this documentation.
The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -// compile with -// javac -classpath classes -sourcepath source -d classes -g source/de/anomic/plasma/*.java - - -package de.anomic.plasma; - -import java.io.File; -import java.io.IOException; -import java.util.Iterator; -import java.util.TreeMap; -import java.util.Vector; - -import de.anomic.kelondro.kelondroMScoreCluster; -import de.anomic.server.serverLog; -import de.anomic.yacy.yacySeedDB; - -public class plasmaWordIndexRAMCache extends Thread { - - static String minKey, maxKey; - - // class variables - TreeMap cache; - kelondroMScoreCluster hashScore; - plasmaWordIndexFileCache pic; - boolean terminate; - long terminateUntil; - int maxWords; - - static { - maxKey = ""; - for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += 'z'; - minKey = ""; - for (int i = 0; i < yacySeedDB.commonHashLength; i++) maxKey += '-'; - } - - public plasmaWordIndexRAMCache(File databaseRoot, int bufferkb) throws IOException { - this.pic = new plasmaWordIndexFileCache(databaseRoot, bufferkb); - this.cache = new TreeMap(); - this.hashScore = new kelondroMScoreCluster(); - this.maxWords = 1000; - this.terminate = false; - } - - public int maxURLinWordCache() { - return hashScore.getScore(hashScore.getMaxObject()); - } - - public int wordCacheRAMSize() { - return cache.size(); - } - - public void setMaxWords(int maxWords) { - this.maxWords = maxWords; - } - - public void run() { - serverLog.logSystem("PLASMA INDEXING", "started word cache management"); - int check; - // permanently flush cache elements - while (!(terminate)) { - if (hashScore.size() < 100) try {Thread.currentThread().sleep(10000);} catch (InterruptedException e) {} - while ((!(terminate)) && (cache != null) && (hashScore.size() > 0)) try { - check = hashScore.size(); - flushSpecific(false); - //serverLog.logDebug("PLASMA INDEXING", "single flush. bevore=" + check + "; after=" + hashScore.size()); - try {Thread.currentThread().sleep(10 + ((maxWords / 10) / (1 + hashScore.size())));} catch (InterruptedException e) {} - } catch (IOException e) { - serverLog.logError("PLASMA INDEXING", "PANIK! 
exception in main cache loop: " + e.getMessage()); - e.printStackTrace(); - terminate = true; - cache = null; - } - } - - serverLog.logSystem("PLASMA INDEXING", "CATCHED TERMINATION SIGNAL: start final flush"); - - // close all; - try { - // first flush everything - while ((hashScore.size() > 0) && (System.currentTimeMillis() < terminateUntil)) { - flushSpecific(false); - } - - // then close file cache: - pic.close(); - } catch (IOException e) { - serverLog.logDebug("PLASMA INDEXING", "interrupted final flush: " + e.toString()); - } - // report - if (hashScore.size() == 0) - serverLog.logSystem("PLASMA INDEXING", "finished final flush; flushed all words"); - else - serverLog.logError("PLASMA INDEXING", "terminated final flush; " + hashScore.size() + " words lost"); - - // delete data - cache = null; - hashScore = null; - - } - - public void close(int waitingBoundSeconds) { - terminate = true; - // wait until terination is done - // we can do at least 6 flushes/second - int waitingtime = 10 + (((cache == null) ? 0 : cache.size()) / 5); // seconds - if (waitingtime > waitingBoundSeconds) waitingtime = waitingBoundSeconds; // upper bound - this.terminateUntil = System.currentTimeMillis() + (waitingtime * 1000); - terminate = true; - while ((cache != null) && (waitingtime > 0)) { - serverLog.logDebug("PLASMA INDEXING", "final word flush; cache.size=" + cache.size() + "; time-out in " + waitingtime + " seconds"); - try {Thread.currentThread().sleep(5000);} catch (InterruptedException e) {} - waitingtime -= 5; - } - } - - private int flushSpecific(boolean greatest) throws IOException { - //System.out.println("DEBUG: plasmaIndexRAMCache.flushSpecific(" + ((greatest) ? "greatest" : "smallest") + "); cache.size() = " + cache.size()); - if ((hashScore.size() == 0) && (cache.size() == 0)) { - serverLog.logDebug("PLASMA INDEXING", "flushSpecific: called but cache is empty"); - return 0; - } - if ((hashScore.size() == 0) && (cache.size() != 0)) { - serverLog.logError("PLASMA INDEXING", "flushSpecific: hashScore.size=0 but cache.size=" + cache.size()); - return 0; - } - if ((hashScore.size() != 0) && (cache.size() == 0)) { - serverLog.logError("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + " but cache.size=0"); - return 0; - } - - //serverLog.logDebug("PLASMA INDEXING", "flushSpecific: hashScore.size=" + hashScore.size() + ", cache.size=" + cache.size()); - - String key = (String) ((greatest) ? hashScore.getMaxObject() : hashScore.getMinObject()); - return flushKey(key, "flushSpecific"); - } - - private int flushKey(String key, String caller) throws IOException { - Vector v = null; - v = (Vector) cache.get(key); - if (v == null) return 0; // flushing of nonexisting key - synchronized (cache) { - cache.remove(key); - hashScore.deleteScore(key); - } - pic.addEntriesToIndex(key, v); - return v.size(); - } - - public synchronized Iterator wordHashesMem(String wordHash, int count) throws IOException { - // returns a list of hashes from a specific start point - // we need to flush some of the elements in the cache first - // maybe we flush too much, but this is not easy to find out and it does not matter - TreeMap subMap = new TreeMap(cache.subMap((wordHash == null) ? 
minKey : wordHash, maxKey)); - int flushcount = subMap.size(); - if (flushcount > count) flushcount = count; - String key; - for (int i = 0; i < flushcount ; i++) { - key = (String) subMap.firstKey(); - flushKey(key, "getSequentialWordHashesMem"); - subMap.remove(key); - } - // finally return the result from the underlying hash list: - return pic.wordHashes(wordHash, true); - } - - public plasmaWordIndexEntity getIndexMem(String wordHash, boolean deleteIfEmpty) throws IOException { - flushKey(wordHash, "getIndexMem"); - return pic.getIndex(wordHash, deleteIfEmpty); - } - - public int addEntryToIndexMem(String wordHash, plasmaWordIndexEntry entry) throws IOException { - // make space for new words - int flushc = 0; - //serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem: cache.size=" + cache.size() + "; hashScore.size=" + hashScore.size()); - synchronized (hashScore) { - while (hashScore.size() > maxWords) flushc += flushSpecific(true); - } - //if (flushc > 0) serverLog.logDebug("PLASMA INDEXING", "addEntryToIndexMem - flushed " + flushc + " entries"); - - // put new words into cache - synchronized (cache) { - Vector v = (Vector) cache.get(wordHash); // null pointer exception? wordhash != null! must be cache==null - if (v == null) v = new Vector(); - v.add(entry); - cache.put(wordHash, v); - hashScore.incScore(wordHash); - } - return flushc; - } - - public synchronized void deleteComplete(String wordHash) throws IOException { - cache.remove(wordHash); - hashScore.deleteScore(wordHash); - pic.deleteComplete(wordHash); - } - - public int removeEntriesMem(String wordHash, String[] urlHashes, boolean deleteComplete) throws IOException { - flushKey(wordHash, "removeEntriesMem"); - return pic.removeEntries(wordHash, urlHashes, deleteComplete); - } - - public int sizeMin() { - // it is not easy to find out the correct size of the cache - // to make the result correct, it would be necessary to flush the complete ram cache - // instead, we return the minimum size of the cache, which is the maximun of either the - // ram or table cache - if ((hashScore == null) || (pic == null)) return 0; - return (hashScore.size() < pic.size()) ? 
pic.size() : hashScore.size(); - } - - -} diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 2bac836d5..c7fcb6a94 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -1,4 +1,4 @@ -// yacyClient.java +// yacyClient.java // ------------------------------------- // (C) by Michael Peter Christen; mc@anomic.de // first published on http://www.anomic.de @@ -55,115 +55,116 @@ import de.anomic.plasma.plasmaSearch; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndexEntity; import de.anomic.plasma.plasmaWordIndexEntry; +import de.anomic.plasma.plasmaWordIndexEntryContainer; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.tools.crypt; import de.anomic.tools.nxTools; public class yacyClient { - + public static int publishMySeed(String address, String otherHash) { - // this is called to enrich the seed information by - // - own address (if peer is behind a nat/router) - // - check peer type (virgin/junior/senior/principal) - // to do this, we send a 'Hello' to another peer - // this carries the following information: - // 'iam' - own hash - // 'youare' - remote hash, to verify that we are correct - // 'key' - a session key that the remote peer may use to answer - // and the own seed string - // we expect the following information to be send back: - // - 'yourip' the ip of the connection peer (we) - // - 'yourtype' the type of this peer that the other peer checked by asking for a specific word - // and the remote seed string - // the number of new seeds are returned - // one exceptional failure case is when we know the other's peers hash, the other peers responds correctly - // but they appear to be another peer by comparisment of the other peer's hash - // this works of course only if we know the other peer's hash. 
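
The handshake described in the comment block above reduces to one POST of a few form fields and a key/value reply from which new seeds are read. A compressed sketch of that round trip, leaving out the crypt salt and the seed-consistency check; PeerTransport, postToPeer and integrateSeed are invented stand-ins for the real httpc and yacyCore calls:

import java.util.HashMap;
import java.util.Map;

// Sketch of the /yacy/hello.html round trip; transport and seed parsing are abstracted away.
class HelloSketch {

    interface PeerTransport {
        Map postToPeer(String url, Map formFields);                // POST and parse the key/value reply
        boolean integrateSeed(String encodedSeed, boolean direct); // merge one seed into the local DB
    }

    static int publish(String address, String myHash, String myEncodedSeed, PeerTransport net) {
        Map request = new HashMap();
        request.put("iam", myHash);          // own seed hash
        request.put("pattern", "");
        request.put("count", "20");          // how many seeds we would like back
        request.put("seed", myEncodedSeed);  // own seed string, encoded with the session key
        Map response = net.postToPeer("http://" + address + "/yacy/hello.html", request);
        if ((response == null) || (response.size() < 3)) return -1; // transport or peer failure

        // seed0 is the responding peer itself; every further seedN enriches the local seed DB
        int i = 0, accepted = 0;
        String seedStr;
        while ((seedStr = (String) response.get("seed" + i++)) != null) {
            if (net.integrateSeed(seedStr, i == 1)) accepted++;
        }
        return accepted;
    }
}

As in the method below, a reply with fewer than three fields counts as failure, and the first returned seed is the responding peer itself.
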
- - String key = crypt.randomSalt(); - HashMap result = null; - try { - /* - URL url = new URL("http://" + address + "/yacy/hello.html?iam=" + - yacyCore.seedCache.mySeed.hash + - "&pattern=&count=20" + - "&key=" + key + "&seed=" + yacyCore.seedCache.mySeed.genSeedStr(key)); - yacyCore.log.logDebug("HELLO to URL " + url.toString()); - result = nxTools.table(httpc.wget(url, - 10000, null, null, yacyCore.seedCache.sb.remoteProxyHost, yacyCore.seedCache.sb.remoteProxyPort)); - */ - - URL url = new URL("http://" + address + "/yacy/hello.html"); - serverObjects obj = new serverObjects(); - obj.put("iam", yacyCore.seedDB.mySeed.hash); - obj.put("pattern", ""); - obj.put("count", "20"); - obj.put("key", key); + // this is called to enrich the seed information by + // - own address (if peer is behind a nat/router) + // - check peer type (virgin/junior/senior/principal) + // to do this, we send a 'Hello' to another peer + // this carries the following information: + // 'iam' - own hash + // 'youare' - remote hash, to verify that we are correct + // 'key' - a session key that the remote peer may use to answer + // and the own seed string + // we expect the following information to be send back: + // - 'yourip' the ip of the connection peer (we) + // - 'yourtype' the type of this peer that the other peer checked by asking for a specific word + // and the remote seed string + // the number of new seeds are returned + // one exceptional failure case is when we know the other's peers hash, the other peers responds correctly + // but they appear to be another peer by comparisment of the other peer's hash + // this works of course only if we know the other peer's hash. + + String key = crypt.randomSalt(); + HashMap result = null; + try { + /* + URL url = new URL("http://" + address + "/yacy/hello.html?iam=" + + yacyCore.seedCache.mySeed.hash + + "&pattern=&count=20" + + "&key=" + key + "&seed=" + yacyCore.seedCache.mySeed.genSeedStr(key)); + yacyCore.log.logDebug("HELLO to URL " + url.toString()); + result = nxTools.table(httpc.wget(url, + 10000, null, null, yacyCore.seedCache.sb.remoteProxyHost, yacyCore.seedCache.sb.remoteProxyPort)); + */ + + URL url = new URL("http://" + address + "/yacy/hello.html"); + serverObjects obj = new serverObjects(); + obj.put("iam", yacyCore.seedDB.mySeed.hash); + obj.put("pattern", ""); + obj.put("count", "20"); + obj.put("key", key); obj.put("mytime", yacyCore.universalDateShortString()); - obj.put("seed", yacyCore.seedDB.mySeed.genSeedStr(key)); + obj.put("seed", yacyCore.seedDB.mySeed.genSeedStr(key)); result = nxTools.table(httpc.wput(url, - 105000, null, null, - yacyCore.seedDB.sb.remoteProxyHost, - yacyCore.seedDB.sb.remoteProxyPort, - obj)); - } catch (Exception e) { - yacyCore.log.logDebug("yacyClient.publishMySeed exception:" + e.getMessage()); - return -1; - } - if ((result == null) || (result.size() < 3)) { - yacyCore.log.logDebug("yacyClient.publishMySeed result error: " + - ((result == null) ? "result null" : ("result=" + result.toString()))); - return -1; - } - + 105000, null, null, + yacyCore.seedDB.sb.remoteProxyHost, + yacyCore.seedDB.sb.remoteProxyPort, + obj)); + } catch (Exception e) { + yacyCore.log.logDebug("yacyClient.publishMySeed exception:" + e.getMessage()); + return -1; + } + if ((result == null) || (result.size() < 3)) { + yacyCore.log.logDebug("yacyClient.publishMySeed result error: " + + ((result == null) ? 
"result null" : ("result=" + result.toString()))); + return -1; + } + Date remoteTime = yacyCore.parseUniversalDate((String) result.get("mytime")); // read remote time - // check consistency with expectation - if ((otherHash != null ) && (otherHash.length() > 0)) { - yacySeed otherPeer = yacySeed.genRemoteSeed((String) result.get("seed0"), key, remoteTime); - if ((otherPeer == null) || (!(otherPeer.hash.equals(otherHash)))) { - yacyCore.log.logDebug("yacyClient.publishMySeed consistency error: other peer wrong"); - return -1; // no success - } - } - - // set my own seed according to new information - yacySeed mySeedBkp = (yacySeed) yacyCore.seedDB.mySeed.clone(); - yacyCore.seedDB.mySeed.put("IP", (String) result.get("yourip")); - String mytype = (String) result.get("yourtype"); - if (mytype == null) mytype = "junior"; - if ((yacyCore.seedDB.mySeed.get("PeerType", "junior").equals("principal")) && (mytype.equals("senior"))) mytype = "principal"; - yacyCore.seedDB.mySeed.put("PeerType", mytype); - - if (!(yacyCore.seedDB.mySeed.isProper())) { - yacyCore.seedDB.mySeed = mySeedBkp; - yacyCore.log.logDebug("yacyClient.publishMySeed mySeed error: not proper"); - return -1; - } - - // read the seeds that the peer returned and integrate them into own database + // check consistency with expectation + if ((otherHash != null ) && (otherHash.length() > 0)) { + yacySeed otherPeer = yacySeed.genRemoteSeed((String) result.get("seed0"), key, remoteTime); + if ((otherPeer == null) || (!(otherPeer.hash.equals(otherHash)))) { + yacyCore.log.logDebug("yacyClient.publishMySeed consistency error: other peer wrong"); + return -1; // no success + } + } + + // set my own seed according to new information + yacySeed mySeedBkp = (yacySeed) yacyCore.seedDB.mySeed.clone(); + yacyCore.seedDB.mySeed.put("IP", (String) result.get("yourip")); + String mytype = (String) result.get("yourtype"); + if (mytype == null) mytype = "junior"; + if ((yacyCore.seedDB.mySeed.get("PeerType", "junior").equals("principal")) && (mytype.equals("senior"))) mytype = "principal"; + yacyCore.seedDB.mySeed.put("PeerType", mytype); + + if (!(yacyCore.seedDB.mySeed.isProper())) { + yacyCore.seedDB.mySeed = mySeedBkp; + yacyCore.log.logDebug("yacyClient.publishMySeed mySeed error: not proper"); + return -1; + } + + // read the seeds that the peer returned and integrate them into own database int i = 0; - String seedStr; - int count = 0; - while ((seedStr = (String) result.get("seed" + i++)) != null) { - // integrate new seed into own database - // the first seed, "seed0" is the seed of the responding peer - if (yacyCore.peerActions.peerArrival(yacySeed.genRemoteSeed(seedStr, key, remoteTime), (i == 1))) count++; - } - return count; + String seedStr; + int count = 0; + while ((seedStr = (String) result.get("seed" + i++)) != null) { + // integrate new seed into own database + // the first seed, "seed0" is the seed of the responding peer + if (yacyCore.peerActions.peerArrival(yacySeed.genRemoteSeed(seedStr, key, remoteTime), (i == 1))) count++; + } + return count; } - - + + public static yacySeed querySeed(yacySeed target, String seedHash) { - String key = crypt.randomSalt(); + String key = crypt.randomSalt(); try { HashMap result = nxTools.table(httpc.wget( - new URL("http://" + target.getAddress() + - "/yacy/query.html?iam=" + yacyCore.seedDB.mySeed.hash + - "&youare=" + target.hash + "&key=" + key + - "&object=seed&env=" + seedHash), - 10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); + new 
URL("http://" + target.getAddress() + + "/yacy/query.html?iam=" + yacyCore.seedDB.mySeed.hash + + "&youare=" + target.hash + "&key=" + key + + "&object=seed&env=" + seedHash), + 10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); if ((result == null) || (result.size() == 0)) return null; Date remoteTime = yacyCore.parseUniversalDate((String) result.get("mytime")); // read remote time return yacySeed.genRemoteSeed((String) result.get("response"), key, remoteTime); @@ -172,126 +173,138 @@ public class yacyClient { return null; } } - + public static int queryRWICount(yacySeed target, String wordHash) { try { HashMap result = nxTools.table(httpc.wget( - new URL("http://" + target.getAddress() + - "/yacy/query.html?iam=" + yacyCore.seedDB.mySeed.hash + - "&youare=" + target.hash + "&key=" + - "&object=rwicount&env=" + wordHash + - "&ttl=0"), - 10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); + new URL("http://" + target.getAddress() + + "/yacy/query.html?iam=" + yacyCore.seedDB.mySeed.hash + + "&youare=" + target.hash + "&key=" + + "&object=rwicount&env=" + wordHash + + "&ttl=0"), + 10000, null, null, yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); if ((result == null) || (result.size() == 0)) return -1; - return Integer.parseInt((String) result.get("response")); - } catch (Exception e) { + return Integer.parseInt((String) result.get("response")); + } catch (Exception e) { yacyCore.log.logError("yacyClient.queryRWICount error:" + e.getMessage()); - return -1; - } + return -1; + } } - + public static int queryUrlCount(yacySeed target) { - if (target == null) return -1; - if (yacyCore.seedDB.mySeed == null) return -1; - String querystr = - "http://" + target.getAddress() + - "/yacy/query.html?iam=" + yacyCore.seedDB.mySeed.hash + - "&youare=" + target.hash + - "&key=" + - "&object=lurlcount&env=&ttl=0"; - try { - HashMap result = nxTools.table(httpc.wget( - new URL(querystr), 5000, null, null, - yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); - //yacyCore.log("DEBUG QUERY: query=" + querystr + "; result = " + result.toString()); - if ((result == null) || (result.size() == 0)) return -1; + if (target == null) return -1; + if (yacyCore.seedDB.mySeed == null) return -1; + String querystr = + "http://" + target.getAddress() + + "/yacy/query.html?iam=" + yacyCore.seedDB.mySeed.hash + + "&youare=" + target.hash + + "&key=" + + "&object=lurlcount&env=&ttl=0"; + try { + HashMap result = nxTools.table(httpc.wget( + new URL(querystr), 6000, null, null, + yacyCore.seedDB.sb.remoteProxyHost, yacyCore.seedDB.sb.remoteProxyPort)); + //yacyCore.log("DEBUG QUERY: query=" + querystr + "; result = " + result.toString()); + if ((result == null) || (result.size() == 0)) return -1; String resp = (String) result.get("response"); if (resp == null) return -1; else return Integer.parseInt(resp); - } catch (Exception e) { + } catch (Exception e) { yacyCore.log.logError("yacyClient.queryUrlCount error asking peer '" + target.getName() + "':" + e.toString()); - return -1; - } + return -1; + } } - + public static int search(String wordhashes, int count, boolean global, - yacySeed targetPeer, plasmaCrawlLURL urlManager, plasmaSearch searchManager, - long duetime) { - // send a search request to peer with remote Hash - // this mainly converts the words into word hashes - - // INPUT: - // iam : complete seed of the requesting peer - // youare : seed hash of the target peer, used for testing 
network stability - // key : transmission key for response - // search : a list of search words - // hsearch : a string of word hashes - // fwdep : forward depth. if "0" then peer may NOT ask another peer for more results - // fwden : forward deny, a list of seed hashes. They may NOT be target of forward hopping - // count : maximum number of wanted results - // global : if "true", then result may consist of answers from other peers + yacySeed targetPeer, plasmaCrawlLURL urlManager, plasmaSearch searchManager, + long duetime) { + // send a search request to peer with remote Hash + // this mainly converts the words into word hashes + + // INPUT: + // iam : complete seed of the requesting peer + // youare : seed hash of the target peer, used for testing network stability + // key : transmission key for response + // search : a list of search words + // hsearch : a string of word hashes + // fwdep : forward depth. if "0" then peer may NOT ask another peer for more results + // fwden : forward deny, a list of seed hashes. They may NOT be target of forward hopping + // count : maximum number of wanted results + // global : if "true", then result may consist of answers from other peers // duetime : maximum time that a peer should spent to create a result - - // request result - String key = crypt.randomSalt(); - try { - String url = "http://" + targetPeer.getAddress() + "/yacy/search.html"; - /* - String url = "http://" + targetPeer.getAddress() + - "/yacy/search.html?myseed=" + yacyCore.seedCache.mySeed.genSeedStr(key) + - "&youare=" + targetPeer.hash + "&key=" + key + - "&myseed=" + yacyCore.seedCache.mySeed.genSeedStr(key) + - "&count=" + count + "&resource=" + ((global) ? "global" : "local") + - "&query=" + wordhashes; - */ - serverObjects obj = new serverObjects(); - obj.put("myseed", yacyCore.seedDB.mySeed.genSeedStr(key)); - obj.put("youare", targetPeer.hash); - obj.put("key", key); - obj.put("count", count); - obj.put("resource", ((global) ? "global" : "local")); - obj.put("query", wordhashes); + + // request result + String key = crypt.randomSalt(); + try { + String url = "http://" + targetPeer.getAddress() + "/yacy/search.html"; + /* + String url = "http://" + targetPeer.getAddress() + + "/yacy/search.html?myseed=" + yacyCore.seedCache.mySeed.genSeedStr(key) + + "&youare=" + targetPeer.hash + "&key=" + key + + "&myseed=" + yacyCore.seedCache.mySeed.genSeedStr(key) + + "&count=" + count + "&resource=" + ((global) ? "global" : "local") + + "&query=" + wordhashes; + */ + serverObjects obj = new serverObjects(); + obj.put("myseed", yacyCore.seedDB.mySeed.genSeedStr(key)); + obj.put("youare", targetPeer.hash); + obj.put("key", key); + obj.put("count", count); + obj.put("resource", ((global) ? 
"global" : "local")); + obj.put("query", wordhashes); obj.put("ttl", "0"); obj.put("duetime", "" + duetime); - obj.put("mytime", yacyCore.universalDateShortString()); - //yacyCore.log.logDebug("yacyClient.search url=" + url); + obj.put("mytime", yacyCore.universalDateShortString()); + //yacyCore.log.logDebug("yacyClient.search url=" + url); long timestamp = System.currentTimeMillis(); - HashMap result = nxTools.table(httpc.wput(new URL(url), - 300000, null, null, - yacyCore.seedDB.sb.remoteProxyHost, - yacyCore.seedDB.sb.remoteProxyPort, - obj)); + HashMap result = nxTools.table(httpc.wput(new URL(url), + 300000, null, null, + yacyCore.seedDB.sb.remoteProxyHost, + yacyCore.seedDB.sb.remoteProxyPort, + obj)); long totalrequesttime = System.currentTimeMillis() - timestamp; - /* - HashMap result = nxTools.table(httpc.wget(new URL(url), + /* + HashMap result = nxTools.table(httpc.wget(new URL(url), 300000, null, null, yacyCore.seedCache.remoteProxyHost, yacyCore.seedCache.remoteProxyPort)); - */ - // OUTPUT: - // version : application version of responder - // uptime : uptime in seconds of responder - // total : number of total available LURL's for this search - // count : number of returned LURL's for this search - // resource |