- enhanced postprocessing speed and memory footprint (by using HashMaps instead of TreeMaps; see the first sketch below)
- enhanced memory footprint of database indexes (by introduction of optimize calls; see the second sketch below)
- optimize calls shrink the amount of used memory for index sets if they are not changed afterwards any more
pull/1/head
Michael Peter Christen 11 years ago
parent 1245cfeb43
commit fdaeac374a
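
Note on the TreeMap-to-HashMap change: the caches touched below were keyed by byte[] url hashes. That works in a TreeMap, which orders keys through an explicit Comparator (Base64Order.enhancedCoder), but not in a HashMap, because Java arrays inherit identity-based equals()/hashCode() from Object. This is why the keys become Strings throughout the diff. A minimal standalone sketch of the pitfall (HashKeyDemo and its literals are illustrative, not YaCy code):

import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

public class HashKeyDemo {
    public static void main(String[] args) {
        // byte[] keys silently break in a HashMap: arrays hash and compare by identity
        Map<byte[], Integer> broken = new HashMap<byte[], Integer>();
        broken.put("0123456789AB".getBytes(StandardCharsets.US_ASCII), 1);
        System.out.println(broken.get("0123456789AB".getBytes(StandardCharsets.US_ASCII))); // null

        // String keys hash by content, so an equal key always hits;
        // hence the cache keys change from byte[] to ASCII.String(hash)
        Map<String, Integer> works = new HashMap<String, Integer>();
        works.put("0123456789AB", 1);
        System.out.println(works.get("0123456789AB")); // 1
    }
}

Besides correctness, content-hashed String keys give O(1) lookups instead of O(log n) comparator calls in a red-black tree, which is where the postprocessing speedup comes from.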

@@ -559,7 +559,7 @@ public class HostBrowser {
 if (fetchReferences) {
 // get the references from the citation index
 try {
-ReferenceReport rr = rrCache.getReferenceReport(ASCII.getBytes(urlhash), false);
+ReferenceReport rr = rrCache.getReferenceReport(urlhash, false);
 List<String> internalIDs = new ArrayList<String>();
 List<String> externalIDs = new ArrayList<String>();
 HandleSet iids = rr.getInternallIDs();

@@ -141,7 +141,7 @@ public class webstructure {
 prop.put("citations", 1);
 ReferenceReportCache rrc = sb.index.getReferenceReportCache();
 ReferenceReport rr = null;
-try {rr = rrc.getReferenceReport(urlhash, true);} catch (IOException e) {}
+try {rr = rrc.getReferenceReport(ASCII.String(urlhash), true);} catch (IOException e) {}
 if (rr != null && rr.getInternalCount() > 0 && rr.getExternalCount() > 0) {
 prop.put("citations_count", 1);
 prop.put("citations_documents", 1);

@@ -204,7 +204,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
 Integer exthosts_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
 Integer hostextc_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName());
 try {
-ReferenceReport rr = rrCache.getReferenceReport(url.hash(), false);
+ReferenceReport rr = rrCache.getReferenceReport(ASCII.String(url.hash()), false);
 List<String> internalIDs = new ArrayList<String>();
 HandleSet iids = rr.getInternallIDs();
 for (byte[] b: iids) internalIDs.add(ASCII.String(b));

@@ -32,7 +32,7 @@ public interface HandleMap extends Iterable<Map.Entry<byte[], Long>> {
 public long mem();
-public void trim();
+public void optimize();
 /**
 * write a dump of the index to a file. All entries are written in order

@@ -331,7 +331,7 @@ public class Balancer {
 HostHandles hh = this.domainStacks.get(host);
 if (hh == null) {
 // create new list
-HandleSet domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
+HandleSet domainList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1);
 domainList.put(urlhash);
 this.domainStacks.put(host, new HostHandles(hosthash, domainList));
 } else {

@@ -42,6 +42,7 @@ import net.yacy.crawler.Balancer;
 import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.robots.RobotsTxt;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 public class NoticedURL {
@@ -184,7 +185,7 @@ public class NoticedURL {
 */
 public boolean removeByURLHash(final byte[] urlhashBytes) {
 try {
-final HandleSet urlHashes = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
+final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1);
 urlHashes.put(urlhashBytes);
 boolean ret = false;
 try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}

@@ -206,7 +206,7 @@ public class ArrayStack implements BLOB {
 oneBlob = new Heap(f, keylength, ordering, buffersize);
 } else {
 oneBlob = new HeapModifier(f, keylength, ordering);
-oneBlob.trim(); // no writings here, can be used with minimum memory
+oneBlob.optimize(); // no writings here, can be used with minimum memory
 }
 sortedItems.put(Long.valueOf(time), new blobItem(d, f, oneBlob));
 } catch (final IOException e) {
@@ -236,7 +236,7 @@ public class ArrayStack implements BLOB {
 }
 @Override
-public void trim() {
+public void optimize() {
 // trim shall not be called for ArrayStacks because the characteristics of an ArrayStack is that the 'topmost' BLOB on the stack
 // is used for write operations and all other shall be trimmed automatically since they are not used for writing. And the
 // topmost BLOB must not be trimmed to support fast writings.
@@ -261,7 +261,7 @@ public class ArrayStack implements BLOB {
 oneBlob = new Heap(location, this.keylength, this.ordering, this.buffersize);
 } else {
 oneBlob = new HeapModifier(location, this.keylength, this.ordering);
-oneBlob.trim();
+oneBlob.optimize();
 }
 this.blobs.add(new blobItem(d, location, oneBlob));
 }

@@ -63,7 +63,7 @@ public interface BLOB {
 * trim the index of the database: this releases memory not currently used
 * @throws IOException
 */
-public void trim();
+public void optimize();
 /**
 * calculate the memory in RAM that the BLOB occupies

@@ -68,8 +68,8 @@ public class Compressor implements BLOB, Iterable<byte[]> {
 }
 @Override
-public void trim() {
-this.backend.trim();
+public void optimize() {
+this.backend.optimize();
 }
 @Override

@@ -131,8 +131,8 @@ public class HeapReader {
 return this.index.mem(); // don't add the memory for free here since then the asserts for memory management don't work
 }
-public void trim() {
-this.index.trim();
+public void optimize() {
+this.index.optimize();
 }
 protected byte[] normalizeKey(byte[] key) {

@@ -151,6 +151,7 @@ public class Word {
 ConcurrentLog.logException(e);
 return hashes;
 }
+hashes.optimize();
 return hashes;
 }
@@ -163,6 +164,7 @@
 ConcurrentLog.logException(e);
 return hashes;
 }
+hashes.optimize();
 return hashes;
 }
 }

@@ -81,6 +81,12 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
 }
 }
+@Override
+public void optimize() {
+this.backend.optimize();
+this.buffer.optimize();
+}
 @Override
 public long mem() {
 return this.backend.mem() + this.buffer.mem();
@@ -356,6 +362,7 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
 break;
 }
 }
+handles.optimize();
 return handles;
 }
 }

@@ -101,6 +101,13 @@ public final class Cache implements Index, Iterable<Row.Entry> {
 return this.index.mem() + this.readHitCache.mem() + this.readMissCache.mem();
 }
+@Override
+public void optimize() {
+this.index.optimize();
+this.readHitCache.optimize();
+this.readMissCache.optimize();
+}
 public final int writeBufferSize() {
 return 0;
 }

@@ -38,6 +38,7 @@ public interface Index extends Iterable<Row.Entry> {
 public String filename(); // returns a unique identified for this index; can be a real or artificial file name
 public int size();
+public void optimize();
 public long mem();
 public boolean isEmpty();
 public Row row();

@@ -82,9 +82,10 @@ public final class RAMIndex implements Index, Iterable<Row.Entry> {
 reset();
 }
-public void trim() {
-if (this.index0 != null) this.index0.trim();
-if (this.index1 != null) this.index1.trim();
+@Override
+public void optimize() {
+if (this.index0 != null) this.index0.optimize();
+if (this.index1 != null) this.index1.optimize();
 }
 public final synchronized void reset() {

@@ -63,8 +63,8 @@ public final class RAMIndexCluster implements Index, Iterable<Row.Entry>, Clonea
 this.rowdef = rowdef;
 }
-public void trim() {
-for (final RAMIndex i: this.cluster) if (i != null) i.trim();
+public void optimize() {
+for (final RAMIndex i: this.cluster) if (i != null) i.optimize();
 }
 @Override

@@ -676,6 +676,11 @@ public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>,
 }
+public void optimize() {
+sort();
+trim();
+}
 public final void sort() {
 if (this.sortBound == this.chunkcount) return; // this is sorted
 synchronized (this) {
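
The optimize() introduced above is the core of the memory saving: sort once, then trim the backing array. A rough standalone analogue, assuming nothing from YaCy (OptimizeDemo is illustrative; ArrayList.trimToSize() stands in for RowCollection.trim()):

import java.util.ArrayList;
import java.util.Collections;

public class OptimizeDemo {
    private final ArrayList<Long> handles = new ArrayList<Long>(1024); // over-allocated while filling

    public void put(long h) {
        handles.add(h);
    }

    // same idiom as RowCollection.optimize(): sort(); trim();
    public void optimize() {
        Collections.sort(handles); // later reads can use binary search
        handles.trimToSize();      // release unused backing-array capacity
    }

    public boolean has(long h) {
        return Collections.binarySearch(handles, h) >= 0; // valid after optimize()
    }

    public static void main(String[] args) {
        OptimizeDemo d = new OptimizeDemo();
        d.put(42L);
        d.put(7L);
        d.optimize(); // the set is not changed afterwards any more
        System.out.println(d.has(7L)); // true
    }
}

This is also why the diff places optimize() calls only where a set is known to be complete (for example after the ReferenceReport sets are filled): the saving holds only "if they are not changed afterwards any more", as the commit message puts it.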

@@ -104,12 +104,12 @@ public final class RowHandleMap implements HandleMap, Iterable<Map.Entry<byte[],
 is.close();
 is = null;
 assert this.index.size() == file.length() / (keylength + idxbytes);
-trim();
+optimize();
 }
 @Override
-public void trim() {
-this.index.trim();
+public void optimize() {
+this.index.optimize();
 }
 public long mem() {

@@ -81,6 +81,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 @Override
 public RowHandleSet clone() {
+optimize();
 return new RowHandleSet(this.rowdef, this.index.clone());
 }
@@ -92,6 +93,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 @Override
 public void optimize() {
 this.index.sort();
+this.index.trim();
 }
 /**
@@ -305,6 +307,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 o = mi.next();
 if (large.has(o)) result.put(o);
 }
+result.optimize();
 return result;
 }
@@ -331,6 +334,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 }
 }
 }
+result.optimize();
 return result;
 }

@@ -103,6 +103,10 @@ public class SQLTable implements Index, Iterable<Row.Entry> {
 }
 }
+@Override
+public void optimize() {
+}
 @Override
 public long mem() {

@@ -107,6 +107,11 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
 init();
 }
+@Override
+public void optimize() {
+for (Index table: tables.values()) table.optimize();
+}
 @Override
 public long mem() {
 long m = 0;

@@ -196,7 +196,7 @@ public class Table implements Index, Iterable<Row.Entry> {
 this.table = null;
 }
 }
-this.index.trim();
+optimize();
 // open the file
 this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize);
@@ -270,6 +270,13 @@
 } catch (final IOException e) {
 ConcurrentLog.severe("Table", "", e);
 }
+optimize();
 }
+@Override
+public void optimize() {
+this.index.optimize();
+if (this.table != null) this.table.optimize();
+}
 @Override

@@ -30,12 +30,12 @@ import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
 import java.util.regex.Pattern;
@@ -52,7 +52,6 @@ import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.ByteOrder;
-import net.yacy.cora.order.NaturalOrder;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.storage.HandleSet;
@@ -215,9 +214,9 @@ public class Segment {
 final byte[] searchhash = url.hash();
 RowHandleSet rootCandidates = getPossibleRootHashes(url);
-Set<byte[]> ignore = new TreeSet<byte[]>(NaturalOrder.naturalOrder); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
-Set<byte[]> levelhashes = new TreeSet<byte[]>(NaturalOrder.naturalOrder); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
-levelhashes.add(searchhash);
+Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
+Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
+levelhashes.add(ASCII.String(searchhash));
 int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
 final byte[] hosthash = new byte[6]; // the host of the url to be checked
 System.arraycopy(searchhash, 6, hosthash, 0, 6);
@@ -225,13 +224,13 @@
 long timeout = System.currentTimeMillis() + maxtime;
 mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) {
-Set<byte[]> checknext = new TreeSet<byte[]>(NaturalOrder.naturalOrder);
+Set<String> checknext = new HashSet<String>();
 // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
-checkloop: for (byte[] urlhash: levelhashes) {
+checkloop: for (String urlhashs: levelhashes) {
 // get all the citations for this url and iterate
-ReferenceReport rr = rrc.getReferenceReport(urlhash, false);
+ReferenceReport rr = rrc.getReferenceReport(urlhashs, false);
 //ReferenceContainer<CitationReference> references = this.urlCitationIndex.get(urlhash, null);
 if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know
 Iterator<byte[]> i = rr.getInternallIDs().iterator();
@@ -241,17 +240,17 @@
 // check if this is from the same host
 assert (ByteBuffer.equals(u, 6, hosthash, 0, 6));
+String us = ASCII.String(u);
 // check ignore
-if (ignore.contains(u)) continue nextloop;
+if (ignore.contains(us)) continue nextloop;
 // check if the url is a root url
 if (rootCandidates.has(u)) {
 return leveldepth + 1;
 }
-checknext.add(u);
-ignore.add(u);
+checknext.add(us);
+ignore.add(us);
 }
 if (System.currentTimeMillis() > timeout) break mainloop;
 }
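
For orientation, the loop patched above is a breadth-first search over incoming citations: start at the target url hash, expand one citation level per iteration, and return as soon as a root url appears among the referrers. A compressed sketch of that control flow, with hypothetical helpers getReferrers() and isRoot() standing in for the ReferenceReport and rootCandidates machinery:

import java.util.HashSet;
import java.util.Set;

public class ClickDepthSketch {
    // hypothetical: hashes of documents that link to 'hash'
    static Set<String> getReferrers(String hash) { return new HashSet<String>(); }
    // hypothetical: stands in for rootCandidates.has(...)
    static boolean isRoot(String hash) { return hash.isEmpty(); }

    static int clickDepth(String targethash, int maxdepth) {
        Set<String> ignore = new HashSet<String>(); // seen hashes, prevents endless loops
        Set<String> level = new HashSet<String>();  // current BFS frontier
        level.add(targethash);
        for (int depth = 0; depth < maxdepth; depth++) {
            Set<String> next = new HashSet<String>();
            for (String h : level) {
                for (String r : getReferrers(h)) {
                    if (ignore.contains(r)) continue;
                    if (isRoot(r)) return depth + 1; // shortest path to a root found
                    next.add(r);
                    ignore.add(r);
                }
            }
            level = next;
        }
        return -1; // not resolvable within maxdepth
    }
}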
@@ -286,16 +285,16 @@
 }
 public class ReferenceReportCache {
-Map<byte[], ReferenceReport> cache;
+private final Map<String, ReferenceReport> cache;
 public ReferenceReportCache() {
-this.cache = new TreeMap<byte[], ReferenceReport>(Base64Order.enhancedCoder);
+this.cache = new HashMap<String, ReferenceReport>();
 }
-public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException {
+public ReferenceReport getReferenceReport(final String id, final boolean acceptSelfReference) throws IOException {
 ReferenceReport rr = cache.get(id);
 if (MemoryControl.shortStatus()) cache.clear();
 if (rr != null) return rr;
 try {
-rr = new ReferenceReport(id, acceptSelfReference);
+rr = new ReferenceReport(ASCII.getBytes(id), acceptSelfReference);
 cache.put(id, rr);
 return rr;
 } catch (final SpaceExceededException e) {
@@ -311,13 +310,13 @@
 public class ClickdepthCache {
 ReferenceReportCache rrc;
-Map<byte[], Integer> cache;
+Map<String, Integer> cache;
 public ClickdepthCache(ReferenceReportCache rrc) {
 this.rrc = rrc;
-this.cache = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
+this.cache = new HashMap<String, Integer>();
 }
 public int getClickdepth(final DigestURL url, int maxtime) throws IOException {
-Integer clickdepth = cache.get(url.hash());
+Integer clickdepth = cache.get(ASCII.String(url.hash()));
 if (MemoryControl.shortStatus()) cache.clear();
 if (clickdepth != null) {
 //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
@@ -325,7 +324,7 @@
 }
 clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime);
 //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
-this.cache.put(url.hash(), clickdepth);
+this.cache.put(ASCII.String(url.hash()), clickdepth);
 return clickdepth.intValue();
 }
 }
@@ -343,8 +342,8 @@
 this.internal = 0;
 this.external = 0;
 this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
-this.internalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0);
-this.externalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0);
+this.internalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
+this.externalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
 try {
 if (connectedCitation()) {
 // read the references from the citation index
@@ -397,6 +396,9 @@
 ConcurrentLog.logException(e);
 }
 }
+this.externalHosts.optimize();
+this.internalIDs.optimize();
+this.externalIDs.optimize();
 }
 public int getInternalCount() {
 return this.internal;

@@ -1322,7 +1322,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
 try {
 for (Map.Entry<String, double[]> entry: this.crt.entrySet()) {
 String id = entry.getKey();
-ReferenceReport rr = this.rrCache.getReferenceReport(ASCII.getBytes(id), false);
+ReferenceReport rr = this.rrCache.getReferenceReport(id, false);
 // sum up the cr of the internal links
 HandleSet iids = rr.getInternallIDs();
 double ncr = 0.0d;

@@ -244,6 +244,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
 }
 }
 }
+remaininghashes.optimize();
 return remaininghashes;
 }
