From fdaeac374a0b804684e9aa9b4f50020b363dbf87 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 28 Feb 2014 14:01:09 +0100 Subject: [PATCH] - enhanced postprocessing speed and memory footprint (by using HashMaps instead of TreeMaps) - enhanced memory footprint of database indexes (by introduction of optimize calls) - optimize calls shrink the amount of used memory for index sets if they are not changed afterwards any more --- htroot/HostBrowser.java | 2 +- htroot/api/webstructure.java | 2 +- .../federate/solr/SchemaConfiguration.java | 2 +- source/net/yacy/cora/storage/HandleMap.java | 2 +- source/net/yacy/crawler/Balancer.java | 2 +- source/net/yacy/crawler/data/NoticedURL.java | 3 +- source/net/yacy/kelondro/blob/ArrayStack.java | 6 +-- source/net/yacy/kelondro/blob/BLOB.java | 2 +- source/net/yacy/kelondro/blob/Compressor.java | 4 +- source/net/yacy/kelondro/blob/HeapReader.java | 4 +- source/net/yacy/kelondro/data/word/Word.java | 2 + .../kelondro/index/BufferedObjectIndex.java | 7 +++ source/net/yacy/kelondro/index/Cache.java | 7 +++ source/net/yacy/kelondro/index/Index.java | 1 + source/net/yacy/kelondro/index/RAMIndex.java | 7 +-- .../yacy/kelondro/index/RAMIndexCluster.java | 4 +- .../yacy/kelondro/index/RowCollection.java | 5 ++ .../net/yacy/kelondro/index/RowHandleMap.java | 6 +-- .../net/yacy/kelondro/index/RowHandleSet.java | 4 ++ source/net/yacy/kelondro/table/SQLTable.java | 4 ++ .../net/yacy/kelondro/table/SplitTable.java | 5 ++ source/net/yacy/kelondro/table/Table.java | 9 +++- source/net/yacy/search/index/Segment.java | 48 ++++++++++--------- .../schema/CollectionConfiguration.java | 2 +- .../net/yacy/search/snippet/MediaSnippet.java | 1 + 25 files changed, 94 insertions(+), 47 deletions(-) diff --git a/htroot/HostBrowser.java b/htroot/HostBrowser.java index e904fd9c0..0f730cbfc 100644 --- a/htroot/HostBrowser.java +++ b/htroot/HostBrowser.java @@ -559,7 +559,7 @@ public class HostBrowser { if (fetchReferences) { // get the references from the citation index try { - ReferenceReport rr = rrCache.getReferenceReport(ASCII.getBytes(urlhash), false); + ReferenceReport rr = rrCache.getReferenceReport(urlhash, false); List internalIDs = new ArrayList(); List externalIDs = new ArrayList(); HandleSet iids = rr.getInternallIDs(); diff --git a/htroot/api/webstructure.java b/htroot/api/webstructure.java index 458993d4f..47287ef1e 100644 --- a/htroot/api/webstructure.java +++ b/htroot/api/webstructure.java @@ -141,7 +141,7 @@ public class webstructure { prop.put("citations", 1); ReferenceReportCache rrc = sb.index.getReferenceReportCache(); ReferenceReport rr = null; - try {rr = rrc.getReferenceReport(urlhash, true);} catch (IOException e) {} + try {rr = rrc.getReferenceReport(ASCII.String(urlhash), true);} catch (IOException e) {} if (rr != null && rr.getInternalCount() > 0 && rr.getExternalCount() > 0) { prop.put("citations_count", 1); prop.put("citations_documents", 1); diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index da1a07438..47a5f63c2 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -204,7 +204,7 @@ public class SchemaConfiguration extends Configuration implements Serializable { Integer exthosts_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); Integer hostextc_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName()); try { - ReferenceReport rr = rrCache.getReferenceReport(url.hash(), false); + ReferenceReport rr = rrCache.getReferenceReport(ASCII.String(url.hash()), false); List internalIDs = new ArrayList(); HandleSet iids = rr.getInternallIDs(); for (byte[] b: iids) internalIDs.add(ASCII.String(b)); diff --git a/source/net/yacy/cora/storage/HandleMap.java b/source/net/yacy/cora/storage/HandleMap.java index 2a968a8b7..5c4b7e8bc 100644 --- a/source/net/yacy/cora/storage/HandleMap.java +++ b/source/net/yacy/cora/storage/HandleMap.java @@ -32,7 +32,7 @@ public interface HandleMap extends Iterable> { public long mem(); - public void trim(); + public void optimize(); /** * write a dump of the index to a file. All entries are written in order diff --git a/source/net/yacy/crawler/Balancer.java b/source/net/yacy/crawler/Balancer.java index 86cdac873..d8d9689c8 100644 --- a/source/net/yacy/crawler/Balancer.java +++ b/source/net/yacy/crawler/Balancer.java @@ -331,7 +331,7 @@ public class Balancer { HostHandles hh = this.domainStacks.get(host); if (hh == null) { // create new list - HandleSet domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1); + HandleSet domainList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1); domainList.put(urlhash); this.domainStacks.put(host, new HostHandles(hosthash, domainList)); } else { diff --git a/source/net/yacy/crawler/data/NoticedURL.java b/source/net/yacy/crawler/data/NoticedURL.java index ed929eb23..7343fb70b 100644 --- a/source/net/yacy/crawler/data/NoticedURL.java +++ b/source/net/yacy/crawler/data/NoticedURL.java @@ -42,6 +42,7 @@ import net.yacy.crawler.Balancer; import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.robots.RobotsTxt; +import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowHandleSet; public class NoticedURL { @@ -184,7 +185,7 @@ public class NoticedURL { */ public boolean removeByURLHash(final byte[] urlhashBytes) { try { - final HandleSet urlHashes = new RowHandleSet(12, Base64Order.enhancedCoder, 1); + final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1); urlHashes.put(urlhashBytes); boolean ret = false; try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {} diff --git a/source/net/yacy/kelondro/blob/ArrayStack.java b/source/net/yacy/kelondro/blob/ArrayStack.java index 7f91a8a3e..e31964d99 100644 --- a/source/net/yacy/kelondro/blob/ArrayStack.java +++ b/source/net/yacy/kelondro/blob/ArrayStack.java @@ -206,7 +206,7 @@ public class ArrayStack implements BLOB { oneBlob = new Heap(f, keylength, ordering, buffersize); } else { oneBlob = new HeapModifier(f, keylength, ordering); - oneBlob.trim(); // no writings here, can be used with minimum memory + oneBlob.optimize(); // no writings here, can be used with minimum memory } sortedItems.put(Long.valueOf(time), new blobItem(d, f, oneBlob)); } catch (final IOException e) { @@ -236,7 +236,7 @@ public class ArrayStack implements BLOB { } @Override - public void trim() { + public void optimize() { // trim shall not be called for ArrayStacks because the characteristics of an ArrayStack is that the 'topmost' BLOB on the stack // is used for write operations and all other shall be trimmed automatically since they are not used for writing. And the // topmost BLOB must not be trimmed to support fast writings. @@ -261,7 +261,7 @@ public class ArrayStack implements BLOB { oneBlob = new Heap(location, this.keylength, this.ordering, this.buffersize); } else { oneBlob = new HeapModifier(location, this.keylength, this.ordering); - oneBlob.trim(); + oneBlob.optimize(); } this.blobs.add(new blobItem(d, location, oneBlob)); } diff --git a/source/net/yacy/kelondro/blob/BLOB.java b/source/net/yacy/kelondro/blob/BLOB.java index 1cc3f2a5b..aee7b19a5 100644 --- a/source/net/yacy/kelondro/blob/BLOB.java +++ b/source/net/yacy/kelondro/blob/BLOB.java @@ -63,7 +63,7 @@ public interface BLOB { * trim the index of the database: this releases memory not currently used * @throws IOException */ - public void trim(); + public void optimize(); /** * calculate the memory in RAM that the BLOB occupies diff --git a/source/net/yacy/kelondro/blob/Compressor.java b/source/net/yacy/kelondro/blob/Compressor.java index c042148be..8da99e5a9 100644 --- a/source/net/yacy/kelondro/blob/Compressor.java +++ b/source/net/yacy/kelondro/blob/Compressor.java @@ -68,8 +68,8 @@ public class Compressor implements BLOB, Iterable { } @Override - public void trim() { - this.backend.trim(); + public void optimize() { + this.backend.optimize(); } @Override diff --git a/source/net/yacy/kelondro/blob/HeapReader.java b/source/net/yacy/kelondro/blob/HeapReader.java index 6e441d34e..b3aa9d190 100644 --- a/source/net/yacy/kelondro/blob/HeapReader.java +++ b/source/net/yacy/kelondro/blob/HeapReader.java @@ -131,8 +131,8 @@ public class HeapReader { return this.index.mem(); // don't add the memory for free here since then the asserts for memory management don't work } - public void trim() { - this.index.trim(); + public void optimize() { + this.index.optimize(); } protected byte[] normalizeKey(byte[] key) { diff --git a/source/net/yacy/kelondro/data/word/Word.java b/source/net/yacy/kelondro/data/word/Word.java index 6c207cfff..2b28c33a1 100644 --- a/source/net/yacy/kelondro/data/word/Word.java +++ b/source/net/yacy/kelondro/data/word/Word.java @@ -151,6 +151,7 @@ public class Word { ConcurrentLog.logException(e); return hashes; } + hashes.optimize(); return hashes; } @@ -163,6 +164,7 @@ public class Word { ConcurrentLog.logException(e); return hashes; } + hashes.optimize(); return hashes; } } diff --git a/source/net/yacy/kelondro/index/BufferedObjectIndex.java b/source/net/yacy/kelondro/index/BufferedObjectIndex.java index 29d11774c..1be2db47f 100644 --- a/source/net/yacy/kelondro/index/BufferedObjectIndex.java +++ b/source/net/yacy/kelondro/index/BufferedObjectIndex.java @@ -81,6 +81,12 @@ public class BufferedObjectIndex implements Index, Iterable { } } + @Override + public void optimize() { + this.backend.optimize(); + this.buffer.optimize(); + } + @Override public long mem() { return this.backend.mem() + this.buffer.mem(); @@ -356,6 +362,7 @@ public class BufferedObjectIndex implements Index, Iterable { break; } } + handles.optimize(); return handles; } } diff --git a/source/net/yacy/kelondro/index/Cache.java b/source/net/yacy/kelondro/index/Cache.java index 1f551bd91..226d9b459 100644 --- a/source/net/yacy/kelondro/index/Cache.java +++ b/source/net/yacy/kelondro/index/Cache.java @@ -101,6 +101,13 @@ public final class Cache implements Index, Iterable { return this.index.mem() + this.readHitCache.mem() + this.readMissCache.mem(); } + @Override + public void optimize() { + this.index.optimize(); + this.readHitCache.optimize(); + this.readMissCache.optimize(); + } + public final int writeBufferSize() { return 0; } diff --git a/source/net/yacy/kelondro/index/Index.java b/source/net/yacy/kelondro/index/Index.java index a0e265210..9e17c0e06 100644 --- a/source/net/yacy/kelondro/index/Index.java +++ b/source/net/yacy/kelondro/index/Index.java @@ -38,6 +38,7 @@ public interface Index extends Iterable { public String filename(); // returns a unique identified for this index; can be a real or artificial file name public int size(); + public void optimize(); public long mem(); public boolean isEmpty(); public Row row(); diff --git a/source/net/yacy/kelondro/index/RAMIndex.java b/source/net/yacy/kelondro/index/RAMIndex.java index 7435c8c1a..fea44fad5 100644 --- a/source/net/yacy/kelondro/index/RAMIndex.java +++ b/source/net/yacy/kelondro/index/RAMIndex.java @@ -82,9 +82,10 @@ public final class RAMIndex implements Index, Iterable { reset(); } - public void trim() { - if (this.index0 != null) this.index0.trim(); - if (this.index1 != null) this.index1.trim(); + @Override + public void optimize() { + if (this.index0 != null) this.index0.optimize(); + if (this.index1 != null) this.index1.optimize(); } public final synchronized void reset() { diff --git a/source/net/yacy/kelondro/index/RAMIndexCluster.java b/source/net/yacy/kelondro/index/RAMIndexCluster.java index f64b61b10..cdca0c3d8 100644 --- a/source/net/yacy/kelondro/index/RAMIndexCluster.java +++ b/source/net/yacy/kelondro/index/RAMIndexCluster.java @@ -63,8 +63,8 @@ public final class RAMIndexCluster implements Index, Iterable, Clonea this.rowdef = rowdef; } - public void trim() { - for (final RAMIndex i: this.cluster) if (i != null) i.trim(); + public void optimize() { + for (final RAMIndex i: this.cluster) if (i != null) i.optimize(); } @Override diff --git a/source/net/yacy/kelondro/index/RowCollection.java b/source/net/yacy/kelondro/index/RowCollection.java index 47c3c9645..5ca1dc37a 100644 --- a/source/net/yacy/kelondro/index/RowCollection.java +++ b/source/net/yacy/kelondro/index/RowCollection.java @@ -676,6 +676,11 @@ public class RowCollection implements Sortable, Iterable, } + public void optimize() { + sort(); + trim(); + } + public final void sort() { if (this.sortBound == this.chunkcount) return; // this is sorted synchronized (this) { diff --git a/source/net/yacy/kelondro/index/RowHandleMap.java b/source/net/yacy/kelondro/index/RowHandleMap.java index 6a362a958..f75bd5992 100644 --- a/source/net/yacy/kelondro/index/RowHandleMap.java +++ b/source/net/yacy/kelondro/index/RowHandleMap.java @@ -104,12 +104,12 @@ public final class RowHandleMap implements HandleMap, Iterable, Cloneabl @Override public RowHandleSet clone() { + optimize(); return new RowHandleSet(this.rowdef, this.index.clone()); } @@ -92,6 +93,7 @@ public final class RowHandleSet implements HandleSet, Iterable, Cloneabl @Override public void optimize() { this.index.sort(); + this.index.trim(); } /** @@ -305,6 +307,7 @@ public final class RowHandleSet implements HandleSet, Iterable, Cloneabl o = mi.next(); if (large.has(o)) result.put(o); } + result.optimize(); return result; } @@ -331,6 +334,7 @@ public final class RowHandleSet implements HandleSet, Iterable, Cloneabl } } } + result.optimize(); return result; } diff --git a/source/net/yacy/kelondro/table/SQLTable.java b/source/net/yacy/kelondro/table/SQLTable.java index df8b9a5b6..5705225d2 100644 --- a/source/net/yacy/kelondro/table/SQLTable.java +++ b/source/net/yacy/kelondro/table/SQLTable.java @@ -103,6 +103,10 @@ public class SQLTable implements Index, Iterable { } } + + @Override + public void optimize() { + } @Override public long mem() { diff --git a/source/net/yacy/kelondro/table/SplitTable.java b/source/net/yacy/kelondro/table/SplitTable.java index eff5f6215..614700661 100644 --- a/source/net/yacy/kelondro/table/SplitTable.java +++ b/source/net/yacy/kelondro/table/SplitTable.java @@ -107,6 +107,11 @@ public class SplitTable implements Index, Iterable { init(); } + @Override + public void optimize() { + for (Index table: tables.values()) table.optimize(); + } + @Override public long mem() { long m = 0; diff --git a/source/net/yacy/kelondro/table/Table.java b/source/net/yacy/kelondro/table/Table.java index fb2d9ef7d..222a9cb58 100644 --- a/source/net/yacy/kelondro/table/Table.java +++ b/source/net/yacy/kelondro/table/Table.java @@ -196,7 +196,7 @@ public class Table implements Index, Iterable { this.table = null; } } - this.index.trim(); + optimize(); // open the file this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize); @@ -270,6 +270,13 @@ public class Table implements Index, Iterable { } catch (final IOException e) { ConcurrentLog.severe("Table", "", e); } + optimize(); + } + + @Override + public void optimize() { + this.index.optimize(); + if (this.table != null) this.table.optimize(); } @Override diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index e397a14f9..d1dfe3ad7 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -30,12 +30,12 @@ import java.io.File; import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.TreeMap; -import java.util.TreeSet; import java.util.concurrent.BlockingQueue; import java.util.regex.Pattern; @@ -52,7 +52,6 @@ import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.ByteOrder; -import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.storage.HandleSet; @@ -215,9 +214,9 @@ public class Segment { final byte[] searchhash = url.hash(); RowHandleSet rootCandidates = getPossibleRootHashes(url); - Set ignore = new TreeSet(NaturalOrder.naturalOrder); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops - Set levelhashes = new TreeSet(NaturalOrder.naturalOrder); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry - levelhashes.add(searchhash); + Set ignore = new HashSet(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops + Set levelhashes = new HashSet(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry + levelhashes.add(ASCII.String(searchhash)); int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call final byte[] hosthash = new byte[6]; // the host of the url to be checked System.arraycopy(searchhash, 6, hosthash, 0, 6); @@ -225,13 +224,13 @@ public class Segment { long timeout = System.currentTimeMillis() + maxtime; mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) { - Set checknext = new TreeSet(NaturalOrder.naturalOrder); + Set checknext = new HashSet(); // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0 - checkloop: for (byte[] urlhash: levelhashes) { + checkloop: for (String urlhashs: levelhashes) { // get all the citations for this url and iterate - ReferenceReport rr = rrc.getReferenceReport(urlhash, false); + ReferenceReport rr = rrc.getReferenceReport(urlhashs, false); //ReferenceContainer references = this.urlCitationIndex.get(urlhash, null); if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know Iterator i = rr.getInternallIDs().iterator(); @@ -241,17 +240,17 @@ public class Segment { // check if this is from the same host assert (ByteBuffer.equals(u, 6, hosthash, 0, 6)); - + String us = ASCII.String(u); // check ignore - if (ignore.contains(u)) continue nextloop; + if (ignore.contains(us)) continue nextloop; // check if the url is a root url if (rootCandidates.has(u)) { return leveldepth + 1; } - checknext.add(u); - ignore.add(u); + checknext.add(us); + ignore.add(us); } if (System.currentTimeMillis() > timeout) break mainloop; } @@ -286,16 +285,16 @@ public class Segment { } public class ReferenceReportCache { - Map cache; + private final Map cache; public ReferenceReportCache() { - this.cache = new TreeMap(Base64Order.enhancedCoder); + this.cache = new HashMap(); } - public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException { + public ReferenceReport getReferenceReport(final String id, final boolean acceptSelfReference) throws IOException { ReferenceReport rr = cache.get(id); if (MemoryControl.shortStatus()) cache.clear(); if (rr != null) return rr; try { - rr = new ReferenceReport(id, acceptSelfReference); + rr = new ReferenceReport(ASCII.getBytes(id), acceptSelfReference); cache.put(id, rr); return rr; } catch (final SpaceExceededException e) { @@ -311,13 +310,13 @@ public class Segment { public class ClickdepthCache { ReferenceReportCache rrc; - Map cache; + Map cache; public ClickdepthCache(ReferenceReportCache rrc) { this.rrc = rrc; - this.cache = new TreeMap(Base64Order.enhancedCoder); + this.cache = new HashMap(); } public int getClickdepth(final DigestURL url, int maxtime) throws IOException { - Integer clickdepth = cache.get(url.hash()); + Integer clickdepth = cache.get(ASCII.String(url.hash())); if (MemoryControl.shortStatus()) cache.clear(); if (clickdepth != null) { //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT"); @@ -325,7 +324,7 @@ public class Segment { } clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime); //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth); - this.cache.put(url.hash(), clickdepth); + this.cache.put(ASCII.String(url.hash()), clickdepth); return clickdepth.intValue(); } } @@ -343,8 +342,8 @@ public class Segment { this.internal = 0; this.external = 0; this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0); - this.internalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); - this.externalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); + this.internalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0); + this.externalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0); try { if (connectedCitation()) { // read the references from the citation index @@ -397,6 +396,9 @@ public class Segment { ConcurrentLog.logException(e); } } + this.externalHosts.optimize(); + this.internalIDs.optimize(); + this.externalIDs.optimize(); } public int getInternalCount() { return this.internal; diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 447be376e..733df2487 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1322,7 +1322,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri try { for (Map.Entry entry: this.crt.entrySet()) { String id = entry.getKey(); - ReferenceReport rr = this.rrCache.getReferenceReport(ASCII.getBytes(id), false); + ReferenceReport rr = this.rrCache.getReferenceReport(id, false); // sum up the cr of the internal links HandleSet iids = rr.getInternallIDs(); double ncr = 0.0d; diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java index cf4dd21a8..f33ceb099 100644 --- a/source/net/yacy/search/snippet/MediaSnippet.java +++ b/source/net/yacy/search/snippet/MediaSnippet.java @@ -244,6 +244,7 @@ public class MediaSnippet implements Comparable, Comparator