diff --git a/source/de/anomic/index/indexCachedRI.java b/source/de/anomic/index/indexCachedRI.java new file mode 100644 index 000000000..6dd921a56 --- /dev/null +++ b/source/de/anomic/index/indexCachedRI.java @@ -0,0 +1,356 @@ +// indexCachedRI.java +// ----------------------------- +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 7.11.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.index; + +import java.util.Date; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; + +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroMergeIterator; +import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.kelondro.kelondroOrder; +import de.anomic.kelondro.kelondroRow; +import de.anomic.server.logging.serverLog; + +public class indexCachedRI implements indexRI { + + private kelondroRow payloadrow; + private kelondroOrder indexOrder = new kelondroNaturalOrder(true); + private indexRAMRI dhtOutCache, dhtInCache; + private indexRI backend; + public boolean busyCacheFlush; // shows if a cache flush is currently performed + private int idleDivisor, busyDivisor; + + public indexCachedRI(indexRAMRI dhtOutCache, indexRAMRI dhtInCache, indexRI backend, kelondroRow payloadrow, serverLog log) { + this.dhtOutCache = dhtOutCache; + this.dhtInCache = dhtInCache; + this.backend = backend; + this.payloadrow = payloadrow; + this.busyCacheFlush = false; + this.busyDivisor = 5000; + this.idleDivisor = 420; + } + + public kelondroRow payloadrow() { + return payloadrow; + } + + public void setWordFlushDivisor(int idleDivisor, int busyDivisor) { + this.idleDivisor = idleDivisor; + this.busyDivisor = busyDivisor; + } + + public void flushControl() { + // check for forced flush + synchronized (this) { + if (dhtOutCache.size() > dhtOutCache.getMaxWordCount()) { + flushCache(dhtOutCache, dhtOutCache.size() + 500 - dhtOutCache.getMaxWordCount()); + } + if (dhtInCache.size() > dhtInCache.getMaxWordCount()) { + flushCache(dhtInCache, dhtInCache.size() + 500 - dhtInCache.getMaxWordCount()); + } + } + } + + public long getUpdateTime(String wordHash) { + indexContainer entries = getContainer(wordHash, null, false, -1); + if (entries == null) return 0; + return entries.updated(); + } + + public indexContainer emptyContainer(String wordHash) { + return new indexContainer(wordHash, payloadrow); + } + + public indexContainer addEntry(String wordHash, indexEntry entry, long updateTime, boolean dhtInCase) { + // add the entry + if (dhtInCase) { + dhtInCache.addEntry(wordHash, entry, updateTime, true); + } else { + dhtOutCache.addEntry(wordHash, entry, updateTime, false); + flushControl(); + } + return null; + } + + public indexContainer addEntries(indexContainer entries, long updateTime, boolean dhtInCase) { + // add the entry + if (dhtInCase) { + dhtInCache.addEntries(entries, updateTime, true); + } else { + dhtOutCache.addEntries(entries, updateTime, false); + flushControl(); + } + return null; + } + + public void flushCacheSome(boolean busy) { + flushCacheSome(dhtOutCache, busy); + flushCacheSome(dhtInCache, busy); + } + + private void flushCacheSome(indexRAMRI ram, boolean busy) { + int flushCount; + if (ram.size() > ram.getMaxWordCount()) { + flushCount = ram.size() + 100 - ram.getMaxWordCount(); + } else { + flushCount = (busy) ? ram.size() / busyDivisor : ram.size() / idleDivisor; + if (flushCount > 100) flushCount = 100; + if (flushCount < 1) flushCount = Math.min(1, ram.size()); + } + flushCache(ram, flushCount); + } + + private void flushCache(indexRAMRI ram, int count) { + if (count <= 0) return; + busyCacheFlush = true; + String wordHash; + for (int i = 0; i < count; i++) { // possible position of outOfMemoryError ? + if (ram.size() == 0) break; + synchronized (this) { + wordHash = ram.bestFlushWordHash(); + + // flush the wordHash + indexContainer c = ram.deleteContainer(wordHash); + if (c != null) { + indexContainer feedback = backend.addEntries(c, c.updated(), false); + if (feedback != null) { + throw new RuntimeException("indexCollectionRI shall not return feedback entries; feedback = " + feedback.toString()); + } + } + + // pause to next loop to give other processes a chance to use IO + try {this.wait(8);} catch (InterruptedException e) {} + } + } + busyCacheFlush = false; + } + + private static final int hour = 3600000; + private static final int day = 86400000; + + public static int microDateDays(Date modified) { + return microDateDays(modified.getTime()); + } + + public static int microDateDays(long modified) { + // this calculates a virtual age from a given date + // the purpose is to have an age in days of a given modified date + // from a fixed standpoint in the past + // one day has 60*60*24 seconds = 86400 seconds + // we take mod 64**3 = 262144, this is the mask of the storage + return (int) ((modified / day) % 262144); + } + + public static String microDateHoursStr(long time) { + return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3); + } + + public static int microDateHoursInt(long time) { + return (int) ((time / hour) % 262144); + } + + public static int microDateHoursAge(String mdhs) { + return microDateHoursInt(System.currentTimeMillis()) - (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs); + } + + public static long reverseMicroDateDays(int microDateDays) { + return ((long) microDateDays) * ((long) day); + } + + public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxTime) { + // get from cache + indexContainer container = dhtOutCache.getContainer(wordHash, urlselection, true, maxTime); + if (container == null) { + container = dhtInCache.getContainer(wordHash, urlselection, true, maxTime); + } else { + container.add(dhtInCache.getContainer(wordHash, urlselection, true, maxTime), maxTime); + } + + // get from collection index + if (container == null) { + container = backend.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime); + } else { + container.add(backend.getContainer(wordHash, urlselection, true, (maxTime < 0) ? -1 : maxTime), maxTime); + } + return container; + } + + public Map getContainers(Set wordHashes, Set urlselection, boolean deleteIfEmpty, boolean interruptIfEmpty, long maxTime) { + // return map of wordhash:indexContainer + + // retrieve entities that belong to the hashes + HashMap containers = new HashMap(); + String singleHash; + indexContainer singleContainer; + Iterator i = wordHashes.iterator(); + long start = System.currentTimeMillis(); + long remaining; + while (i.hasNext()) { + // check time + remaining = maxTime - (System.currentTimeMillis() - start); + //if ((maxTime > 0) && (remaining <= 0)) break; + if ((maxTime >= 0) && (remaining <= 0)) remaining = 100; + + // get next word hash: + singleHash = (String) i.next(); + + // retrieve index + singleContainer = getContainer(singleHash, urlselection, deleteIfEmpty, (maxTime < 0) ? -1 : remaining / (wordHashes.size() - containers.size())); + + // check result + if (((singleContainer == null) || (singleContainer.size() == 0)) && (interruptIfEmpty)) return new HashMap(); + + containers.put(singleHash, singleContainer); + } + return containers; + } + + public int size() { + return java.lang.Math.max(backend.size(), java.lang.Math.max(dhtInCache.size(), dhtOutCache.size())); + } + + public int indexSize(String wordHash) { + int size = backend.indexSize(wordHash); + size += dhtInCache.indexSize(wordHash); + size += dhtOutCache.indexSize(wordHash); + return size; + } + + public void close(int waitingBoundSeconds) { + synchronized (this) { + dhtInCache.close(waitingBoundSeconds); + dhtOutCache.close(waitingBoundSeconds); + backend.close(-1); + } + } + + public indexContainer deleteContainer(String wordHash) { + indexContainer c = new indexContainer(wordHash, payloadrow); + c.add(dhtInCache.deleteContainer(wordHash), -1); + c.add(dhtOutCache.deleteContainer(wordHash), -1); + c.add(backend.deleteContainer(wordHash), -1); + return c; + } + + public boolean removeEntry(String wordHash, String urlHash, boolean deleteComplete) { + boolean removed = false; + removed = removed | (dhtInCache.removeEntry(wordHash, urlHash, deleteComplete)); + removed = removed | (dhtOutCache.removeEntry(wordHash, urlHash, deleteComplete)); + removed = removed | (backend.removeEntry(wordHash, urlHash, deleteComplete)); + return removed; + } + + public int removeEntries(String wordHash, Set urlHashes, boolean deleteComplete) { + int removed = 0; + removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete); + removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete); + removed += backend.removeEntries(wordHash, urlHashes, deleteComplete); + return removed; + } + + public String removeEntriesExpl(String wordHash, Set urlHashes, boolean deleteComplete) { + String removed = ""; + removed += dhtInCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; + removed += dhtOutCache.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; + removed += backend.removeEntries(wordHash, urlHashes, deleteComplete) + ", "; + return removed; + } + + public TreeSet indexContainerSet(String startHash, boolean ramOnly, boolean rot, int count) { + // creates a set of indexContainers + // this does not use the dhtInCache + kelondroOrder containerOrder = new indexContainerOrder((kelondroOrder) indexOrder.clone()); + containerOrder.rotate(startHash.getBytes()); + TreeSet containers = new TreeSet(containerOrder); + Iterator i = wordContainers(startHash, ramOnly, rot); + if (ramOnly) count = Math.min(dhtOutCache.size(), count); + indexContainer container; + while ((count > 0) && (i.hasNext())) { + container = (indexContainer) i.next(); + if ((container != null) && (container.size() > 0)) { + containers.add(container); + count--; + } + } + return containers; + } + + public Iterator wordContainers(String startHash, boolean rot) { + // returns an iteration of indexContainers + return wordContainers(startHash, false, rot); + } + + public Iterator wordContainers(String startHash, boolean ramOnly, boolean rot) { + if (rot) return new rotatingContainerIterator(startHash, ramOnly); + if (ramOnly) { + return dhtOutCache.wordContainers(startHash, false); + } + return new kelondroMergeIterator( + dhtOutCache.wordContainers(startHash, false), + backend.wordContainers(startHash, false), + new indexContainerOrder(kelondroNaturalOrder.naturalOrder), + indexContainer.containerMergeMethod, + true); + } + + private class rotatingContainerIterator implements Iterator { + Iterator i; + boolean ramOnly; + + public rotatingContainerIterator(String startWordHash, boolean ramOnly) { + this.ramOnly = ramOnly; + i = wordContainers(startWordHash, ramOnly); + } + + public void finalize() { + i = null; + } + + public boolean hasNext() { + if (i.hasNext()) return true; + else { + i = wordContainers("------------", ramOnly); + return i.hasNext(); + } + } + + public Object next() { + return i.next(); + } + + public void remove() { + throw new java.lang.UnsupportedOperationException("rotatingWordIterator does not support remove"); + } + } // class rotatingContainerIterator + +} diff --git a/source/de/anomic/index/indexRAMRI.java b/source/de/anomic/index/indexRAMRI.java index 0356ca5c4..0858ef8ba 100644 --- a/source/de/anomic/index/indexRAMRI.java +++ b/source/de/anomic/index/indexRAMRI.java @@ -1,4 +1,4 @@ -// indexRAMCacheRI.java +// indexRAMRI.java // (C) 2005, 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany // first published 2005 on http://www.anomic.de // diff --git a/source/de/anomic/index/indexRI.java b/source/de/anomic/index/indexRI.java index 2cadc1352..43187cb02 100644 --- a/source/de/anomic/index/indexRI.java +++ b/source/de/anomic/index/indexRI.java @@ -1,11 +1,16 @@ // indexRI.java // ----------------------------- -// part of YACY -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2005 -// last major change: 6.5.2005 +// (C) 2005 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 6.5.2005 on http://www.anomic.de // +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -19,25 +24,6 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. package de.anomic.index; @@ -52,7 +38,7 @@ public interface indexRI { public Iterator wordContainers(String startWordHash, boolean rot); // method to replace wordHashes public long getUpdateTime(String wordHash); - + public int indexSize(String wordHash); public indexContainer getContainer(String wordHash, Set urlselection, boolean deleteIfEmpty, long maxtime); public indexContainer deleteContainer(String wordHash); diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java index d476445aa..113d87b33 100644 --- a/source/de/anomic/kelondro/kelondroRecords.java +++ b/source/de/anomic/kelondro/kelondroRecords.java @@ -700,6 +700,10 @@ public class kelondroRecords { if (value == null) { while (valuewidth-- > 0) targetarray[targetoffset++] = 0; } else { + assert ((valueoffset >= 0) && (valueoffset < value.length)) : "valueoffset = " + valueoffset; + assert ((valueoffset + valuewidth <= value.length)) : "valueoffset = " + valueoffset + ", valuewidth = " + valuewidth + ", value.length = " + value.length; + assert ((targetoffset >= 0) && (targetoffset < targetarray.length)) : "targetoffset = " + targetoffset; + assert ((targetoffset + valuewidth <= targetarray.length)) : "targetoffset = " + targetoffset + ", valuewidth = " + valuewidth + ", targetarray.length = " + targetarray.length; System.arraycopy(value, valueoffset, targetarray, targetoffset, Math.min(value.length, valuewidth)); // error? while (valuewidth-- > value.length) targetarray[targetoffset + valuewidth] = 0; } diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 8bd8c540f..650425095 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -513,7 +513,12 @@ public final class plasmaCrawlStacker { this.anchors = (int) entry.getColLong(8); this.forkfactor = (int) entry.getColLong(9); this.flags = new bitfield(entry.getColBytes(10)); - this.handle = Integer.parseInt(new String(entry.getColBytes(11), "UTF-8")); + try { + this.handle = Integer.parseInt(new String(entry.getColBytes(11), "UTF-8")); + } catch (NumberFormatException ee) { + System.out.println("BUG in stackCrawlMessage. entry = " + entry.toString()); + throw new RuntimeException(ee.getMessage()); + } } catch (Exception e) { e.printStackTrace(); throw new IllegalStateException(e.toString()); diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java index d032d7b9e..48b4adfee 100644 --- a/source/de/anomic/plasma/plasmaParserDocument.java +++ b/source/de/anomic/plasma/plasmaParserDocument.java @@ -66,7 +66,7 @@ public class plasmaParserDocument { String charset; // the charset of the document String[] keywords; // most resources provide a keyword field String shortTitle; // a shortTitle mostly appears in the window header (border) - String longTitle; // the real title of the document, commonly h1-tags + private String longTitle; // the real title of the document, commonly h1-tags String[] sections; // if present: more titles/headlines appearing in the document String abstrct; // an abstract, if present: short content description private Object text; // the clear text, all that is visible diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index dc0b34be9..457fff15c 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -1677,7 +1677,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser urlHash, urlLength, urlComps, wordStat.count, - document.longTitle.length(), + document.getMainLongTitle().length(), condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_SENTENCES, wordStat.posInText, diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 2000a8ebe..084e4263c 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -64,7 +64,7 @@ public final class plasmaWordIndex implements indexRI { private final File oldDatabaseRoot; private final kelondroOrder indexOrder = new kelondroNaturalOrder(true); - private final indexRAMRI dhtOutCache, dhtInCache; + private final indexRAMRI dhtOutCache, dhtInCache; private final indexCollectionRI collections; // new database structure to replace AssortmentCluster and FileCluster private int assortmentBufferSize; // kb private final plasmaWordIndexAssortmentCluster assortmentCluster; // old database structure, to be replaced by CollectionRI @@ -334,7 +334,7 @@ public final class plasmaWordIndex implements indexRI { // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); wordHash = indexEntryAttribute.word2hash(word); ientry = new indexURLEntry(urlHash, - urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(), + urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(), wprop.count, condenser.RESULT_SIMI_WORDS, condenser.RESULT_SIMI_SENTENCES, @@ -630,7 +630,7 @@ public final class plasmaWordIndex implements indexRI { return null; } - private class rotatingContainerIterator implements Iterator { + public class rotatingContainerIterator implements Iterator { Iterator i; int resourceLevel; diff --git a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java index 3cfd4be0b..961033617 100644 --- a/source/de/anomic/plasma/plasmaWordIndexFileCluster.java +++ b/source/de/anomic/plasma/plasmaWordIndexFileCluster.java @@ -334,4 +334,8 @@ public class plasmaWordIndexFileCluster implements indexRI { } + public int indexSize(String wordHash) { + throw new UnsupportedOperationException(); + } + } diff --git a/source/de/anomic/server/logging/GuiHandler.java b/source/de/anomic/server/logging/GuiHandler.java index 76292bf8e..cc77ce99b 100644 --- a/source/de/anomic/server/logging/GuiHandler.java +++ b/source/de/anomic/server/logging/GuiHandler.java @@ -45,7 +45,6 @@ package de.anomic.server.logging; import java.util.ArrayList; -import java.util.Date; import java.util.logging.ErrorManager; import java.util.logging.Filter; import java.util.logging.Formatter; diff --git a/source/de/anomic/yacy/yacySeedDB.java b/source/de/anomic/yacy/yacySeedDB.java index 416515223..e14124d0a 100644 --- a/source/de/anomic/yacy/yacySeedDB.java +++ b/source/de/anomic/yacy/yacySeedDB.java @@ -205,7 +205,13 @@ public final class yacySeedDB { private synchronized kelondroMap openSeedTable(File seedDBFile) { new File(seedDBFile.getParent()).mkdirs(); - return new kelondroMap(kelondroDyn.open(seedDBFile, (seedDBBufferKB * 0x400) / 3, preloadTime / 3, commonHashLength, 480, '#'), sortFields, accFields); + try { + return new kelondroMap(kelondroDyn.open(seedDBFile, (seedDBBufferKB * 0x400) / 3, preloadTime / 3, commonHashLength, 480, '#'), sortFields, accFields); + } catch (Exception e) { + seedDBFile.delete(); + // try again + return new kelondroMap(kelondroDyn.open(seedDBFile, (seedDBBufferKB * 0x400) / 3, preloadTime / 3, commonHashLength, 480, '#'), sortFields, accFields); + } } protected synchronized kelondroMap resetSeedTable(kelondroMap seedDB, File seedDBFile) {