From 93ea0a47892b6933765f99c1e645503288efa20e Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 20 Apr 2010 13:45:22 +0000 Subject: [PATCH] enhanced remove operation in search consequences (which are triggered when the snippet fetch proves that the word has disappeared from the page that was stored in the index) - no direct deletion of references during search (shifted to time after search) - bundling of all deletions for the references of a single word into one remove operation - enhanced remove operation by caring that the collection is stored sorted (experimental) - more String -> byte[] transition for search word lists - clean up of unused code - enhanced memory allocation of RowSet Objects (will use a little bit less memory which was wasted before) git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6823 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/search/ResultFetcher.java | 30 +++++++++---------- source/de/anomic/search/SearchEvent.java | 5 ++-- source/de/anomic/search/SearchEventCache.java | 2 +- source/de/anomic/search/TextSnippet.java | 25 ---------------- source/net/yacy/kelondro/blob/ArrayStack.java | 1 + source/net/yacy/kelondro/index/HandleSet.java | 4 +++ .../yacy/kelondro/index/RowCollection.java | 13 ++++++-- source/net/yacy/kelondro/index/RowSet.java | 2 +- source/net/yacy/kelondro/rwi/IndexCell.java | 17 +---------- .../yacy/kelondro/rwi/ReferenceContainer.java | 7 ----- .../kelondro/rwi/ReferenceContainerCache.java | 22 +------------- 11 files changed, 36 insertions(+), 92 deletions(-) diff --git a/source/de/anomic/search/ResultFetcher.java b/source/de/anomic/search/ResultFetcher.java index 4ec99b0e3..9302ff26a 100644 --- a/source/de/anomic/search/ResultFetcher.java +++ b/source/de/anomic/search/ResultFetcher.java @@ -61,7 +61,7 @@ public class ResultFetcher { protected Worker[] workerThreads; protected final SortStore result; protected final SortStore images; // container to sort images by size - protected final HashMap 
failedURLs; // a mapping from a urlhash to a fail reason string + protected final HandleSet failedURLs; // a set of urlhashes that could not been verified during search protected final HandleSet snippetFetchWordHashes; // a set of word hashes that are used to match with the snippets long urlRetrievalAllTime; long snippetComputationAllTime; @@ -84,7 +84,7 @@ public class ResultFetcher { this.snippetComputationAllTime = 0; this.result = new SortStore(-1, true); // this is the result, enriched with snippets, ranked and ordered by ranking this.images = new SortStore(-1, true); - this.failedURLs = new HashMap(); // a map of urls to reason strings where a worker thread tried to work on, but failed. + this.failedURLs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); // a set of url hashes where a worker thread tried to work on, but failed. // snippets do not need to match with the complete query hashes, // only with the query minus the stopwords which had not been used for the search @@ -167,7 +167,7 @@ public class ResultFetcher { // get next entry page = rankedCache.takeURL(true, taketimeout); if (page == null) break; - if (failedURLs.get(new String(page.hash())) != null) continue; + if (failedURLs.has(page.hash())) continue; final ResultEntry resultEntry = fetchSnippet(page, snippetMode); // does not fetch snippets if snippetMode == 0 @@ -230,7 +230,7 @@ public class ResultFetcher { (snippetMode == 2) ? Integer.MAX_VALUE : 30000, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; - Log.logInfo("SEARCH_EVENT", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? "snippet found" : ("no snippet found (" + snippet.getError() + ")"))); + Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + ((snippet.getErrorCode() < 11) ? 
"snippet found" : ("no snippet found (" + snippet.getError() + ")"))); if (snippet.getErrorCode() < 11) { // we loaded the file and found the snippet @@ -241,13 +241,7 @@ public class ResultFetcher { return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet } else { // problems with snippet fetch - registerFailure(new String(page.hash()), "no text snippet for URL " + metadata.url()); - if (!peers.mySeed().isVirgin()) - try { - TextSnippet.failConsequences(query.getSegment(), page.word(), snippet, query.id(false)); - } catch (IOException e) { - Log.logException(e); - } + registerFailure(page.hash(), "no text snippet for URL " + metadata.url()); return null; } } else { @@ -255,7 +249,7 @@ public class ResultFetcher { startTime = System.currentTimeMillis(); final ArrayList mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, (snippetMode == 2), 6000, query.isGlobal()); final long snippetComputationTime = System.currentTimeMillis() - startTime; - Log.logInfo("SEARCH_EVENT", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime); + Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime); if (mediaSnippets != null && !mediaSnippets.isEmpty()) { // found media snippets, return entry @@ -264,16 +258,20 @@ public class ResultFetcher { return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); } else { // problems with snippet fetch - registerFailure(new String(page.hash()), "no media snippet for URL " + metadata.url()); + registerFailure(page.hash(), "no media snippet for URL " + metadata.url()); return null; } } // finished, no more actions possible here } - private void registerFailure(final String urlhash, final String reason) { - this.failedURLs.put(urlhash, reason); - Log.logInfo("search", "sorted out hash " + 
urlhash + " during search: " + reason); + private void registerFailure(final byte[] urlhash, final String reason) { + try { + this.failedURLs.put(urlhash); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } + Log.logInfo("SEARCH", "sorted out urlhash " + new String(urlhash) + " during search: " + reason); } public int resultCount() { diff --git a/source/de/anomic/search/SearchEvent.java b/source/de/anomic/search/SearchEvent.java index 16c2f82ae..c59f1449e 100644 --- a/source/de/anomic/search/SearchEvent.java +++ b/source/de/anomic/search/SearchEvent.java @@ -223,6 +223,7 @@ public final class SearchEvent { // execute deletion of failed words int rw = this.results.failedURLs.size(); if (rw > 0) { + long start = System.currentTimeMillis(); final HandleSet removeWords = query.queryHashes; try { removeWords.putAll(query.excludeHashes); @@ -233,12 +234,12 @@ public final class SearchEvent { final Iterator j = removeWords.iterator(); // remove the same url hashes for multiple words while (j.hasNext()) { - this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs.keySet()); + this.query.getSegment().termIndex().remove(j.next(), this.results.failedURLs); } } catch (IOException e) { Log.logException(e); } - Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words"); + Log.logInfo("SearchEvents", "cleaning up event " + query.id(true) + ", removed " + rw + " URL references on " + removeWords.size() + " words in " + (System.currentTimeMillis() - start) + " milliseconds"); } } diff --git a/source/de/anomic/search/SearchEventCache.java b/source/de/anomic/search/SearchEventCache.java index 3f5cbe951..dbd51edda 100644 --- a/source/de/anomic/search/SearchEventCache.java +++ b/source/de/anomic/search/SearchEventCache.java @@ -53,7 +53,7 @@ public class SearchEventCache { SearchEvent event; while (i.hasNext()) { event = i.next(); - if ((all) || 
(event.getEventTime() + eventLifetime < System.currentTimeMillis())) { + if (all || event.getEventTime() + eventLifetime < System.currentTimeMillis()) { event.cleanup(); // remove the event diff --git a/source/de/anomic/search/TextSnippet.java b/source/de/anomic/search/TextSnippet.java index 048994aa7..bef83f455 100644 --- a/source/de/anomic/search/TextSnippet.java +++ b/source/de/anomic/search/TextSnippet.java @@ -583,29 +583,4 @@ public class TextSnippet implements Comparable, Comparator, Cloneable { index = null; } + public final String toString() { + return this.index.toString(); + } + // set tools public HandleSet joinConstructive(final HandleSet other) throws RowSpaceExceededException { diff --git a/source/net/yacy/kelondro/index/RowCollection.java b/source/net/yacy/kelondro/index/RowCollection.java index 20370095e..45d4ceb91 100644 --- a/source/net/yacy/kelondro/index/RowCollection.java +++ b/source/net/yacy/kelondro/index/RowCollection.java @@ -199,6 +199,8 @@ public class RowCollection implements Iterable, Cloneable { public synchronized byte[] exportCollection() { // returns null if the collection is empty trim(false); + sort(); // experimental; supervise CPU load + assert this.sortBound == this.chunkcount; // on case the collection is sorted assert this.size() * this.rowdef.objectsize == this.chunkcache.length : "this.size() = " + this.size() + ", objectsize = " + this.rowdef.objectsize + ", chunkcache.length = " + this.chunkcache.length; final Row row = exportRow(chunkcache.length); final Row.Entry entry = row.newEntry(); @@ -227,9 +229,11 @@ public class RowCollection implements Iterable, Cloneable { if (chunkcache.length >= needed) return 0; assert needed > 0 : "needed = " + needed; long allocram = needed * growfactorLarge100 / 100L; + allocram -= allocram % rowdef.objectsize; assert allocram > 0 : "elements = " + elements + ", new = " + allocram; if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, false)) return allocram; allocram = 
needed * growfactorSmall100 / 100L; + allocram -= allocram % rowdef.objectsize; assert allocram > 0 : "elements = " + elements + ", new = " + allocram; if (allocram <= Integer.MAX_VALUE && MemoryControl.request(allocram, forcegc)) return allocram; return needed; @@ -239,7 +243,8 @@ public class RowCollection implements Iterable, Cloneable { if (elements == 0) return; final long allocram = neededSpaceForEnsuredSize(elements, true); if (allocram == 0) return; - assert allocram > chunkcache.length : "wrong alloc computation: allocram = " + allocram + ", chunkcache.length = " + chunkcache.length; + assert chunkcache.length < elements * rowdef.objectsize : "wrong alloc computation (1): elements * rowdef.objectsize = " + (elements * rowdef.objectsize) + ", chunkcache.length = " + chunkcache.length; + assert allocram > chunkcache.length : "wrong alloc computation (2): allocram = " + allocram + ", chunkcache.length = " + chunkcache.length; if (allocram > Integer.MAX_VALUE || !MemoryControl.request(allocram, true)) throw new RowSpaceExceededException(allocram, "RowCollection grow"); try { @@ -564,7 +569,7 @@ public class RowCollection implements Iterable, Cloneable { } - protected synchronized final void sort() { + public synchronized final void sort() { assert (this.rowdef.objectOrder != null); if (this.sortBound == this.chunkcount) return; // this is already sorted if (this.chunkcount < isortlimit) { @@ -609,6 +614,7 @@ public class RowCollection implements Iterable, Cloneable { //assert this.isSorted(); } + /* public synchronized final void sort2() { assert (this.rowdef.objectOrder != null); if (this.sortBound == this.chunkcount) return; // this is already sorted @@ -643,7 +649,8 @@ public class RowCollection implements Iterable, Cloneable { this.sortBound = this.chunkcount; //assert this.isSorted(); } - + */ + private static class qsortthread implements Callable { private RowCollection rc; int L, R, S; diff --git a/source/net/yacy/kelondro/index/RowSet.java 
b/source/net/yacy/kelondro/index/RowSet.java index 3100513a3..2af40a400 100644 --- a/source/net/yacy/kelondro/index/RowSet.java +++ b/source/net/yacy/kelondro/index/RowSet.java @@ -75,7 +75,7 @@ public class RowSet extends RowCollection implements ObjectIndex, Iterable= 0 : "orderbound = " + orderbound; - if (orderbound < 0) return new RowSet(rowdef); + if (orderbound < 0) return new RowSet(rowdef); // error final byte[] chunkcache = new byte[size * rowdef.objectsize]; //assert b.length - exportOverheadSize == size * rowdef.objectsize : "b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize; if (b.length - exportOverheadSize != size * rowdef.objectsize) { diff --git a/source/net/yacy/kelondro/rwi/IndexCell.java b/source/net/yacy/kelondro/rwi/IndexCell.java index f797eaefa..b98c320ba 100644 --- a/source/net/yacy/kelondro/rwi/IndexCell.java +++ b/source/net/yacy/kelondro/rwi/IndexCell.java @@ -256,12 +256,6 @@ public final class IndexCell extends AbstractBu return removed + (reduced / this.array.rowdef().objectsize); } - public int remove(byte[] termHash, Set urlHashes) throws IOException { - int removed = this.ram.remove(termHash, urlHashes); - int reduced = this.array.replace(termHash, new RemoveRewriter(urlHashes)); - return removed + (reduced / this.array.rowdef().objectsize); - } - public boolean remove(byte[] termHash, byte[] urlHashBytes) throws IOException { boolean removed = this.ram.remove(termHash, urlHashBytes); int reduced = this.array.replace(termHash, new RemoveRewriter(urlHashBytes)); @@ -276,16 +270,6 @@ public final class IndexCell extends AbstractBu this.urlHashes = urlHashes; } - public RemoveRewriter(Set urlHashes) { - this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); - for (String s: urlHashes) - try { - this.urlHashes.put(s.getBytes()); - } catch (RowSpaceExceededException e) { - Log.logException(e); - } - } - public RemoveRewriter(byte[] urlHashBytes) { 
this.urlHashes = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0); try { @@ -296,6 +280,7 @@ public final class IndexCell extends AbstractBu } public ReferenceContainer rewrite(ReferenceContainer container) { + container.sort(); container.removeEntries(urlHashes); return container; } diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainer.java b/source/net/yacy/kelondro/rwi/ReferenceContainer.java index 63612974f..456600053 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainer.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainer.java @@ -188,13 +188,6 @@ public class ReferenceContainer extends RowSet return count; } - public int removeEntries(final Set urlHashes) { - int count = 0; - final Iterator i = urlHashes.iterator(); - while (i.hasNext()) count += (remove(i.next().getBytes()) == null) ? 0 : 1; - return count; - } - public Iterator entries() { // returns an iterator of indexRWIEntry objects return new entryIterator(); diff --git a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java index 07024acf0..e2de222f5 100644 --- a/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java +++ b/source/net/yacy/kelondro/rwi/ReferenceContainerCache.java @@ -366,27 +366,7 @@ public final class ReferenceContainerCache exte } return 0; } - - public int remove(final byte[] termHash, final Set urlHashes) { - assert this.cache != null; - if (urlHashes.isEmpty()) return 0; - ByteArray tha = new ByteArray(termHash); - int count; - synchronized (cache) { - final ReferenceContainer c = cache.get(tha); - if ((c != null) && ((count = c.removeEntries(urlHashes)) > 0)) { - // removal successful - if (c.isEmpty()) { - delete(termHash); - } else { - cache.put(tha, c); - } - return count; - } - } - return 0; - } - + public void add(final ReferenceContainer container) throws RowSpaceExceededException { // this puts the entries into the cache assert this.cache != 
null;