- enhanced postprocessing speed and memory footprint (by using HashMaps instead of TreeMaps)
- enhanced memory footprint of database indexes (by introduction of optimize calls)
- optimize calls shrink the amount of memory used by index sets if they are not modified afterwards
pull/1/head
Michael Peter Christen 11 years ago
parent 1245cfeb43
commit fdaeac374a

@ -559,7 +559,7 @@ public class HostBrowser {
if (fetchReferences) { if (fetchReferences) {
// get the references from the citation index // get the references from the citation index
try { try {
ReferenceReport rr = rrCache.getReferenceReport(ASCII.getBytes(urlhash), false); ReferenceReport rr = rrCache.getReferenceReport(urlhash, false);
List<String> internalIDs = new ArrayList<String>(); List<String> internalIDs = new ArrayList<String>();
List<String> externalIDs = new ArrayList<String>(); List<String> externalIDs = new ArrayList<String>();
HandleSet iids = rr.getInternallIDs(); HandleSet iids = rr.getInternallIDs();

@ -141,7 +141,7 @@ public class webstructure {
prop.put("citations", 1); prop.put("citations", 1);
ReferenceReportCache rrc = sb.index.getReferenceReportCache(); ReferenceReportCache rrc = sb.index.getReferenceReportCache();
ReferenceReport rr = null; ReferenceReport rr = null;
try {rr = rrc.getReferenceReport(urlhash, true);} catch (IOException e) {} try {rr = rrc.getReferenceReport(ASCII.String(urlhash), true);} catch (IOException e) {}
if (rr != null && rr.getInternalCount() > 0 && rr.getExternalCount() > 0) { if (rr != null && rr.getInternalCount() > 0 && rr.getExternalCount() > 0) {
prop.put("citations_count", 1); prop.put("citations_count", 1);
prop.put("citations_documents", 1); prop.put("citations_documents", 1);

@ -204,7 +204,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
Integer exthosts_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName()); Integer exthosts_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
Integer hostextc_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName()); Integer hostextc_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName());
try { try {
ReferenceReport rr = rrCache.getReferenceReport(url.hash(), false); ReferenceReport rr = rrCache.getReferenceReport(ASCII.String(url.hash()), false);
List<String> internalIDs = new ArrayList<String>(); List<String> internalIDs = new ArrayList<String>();
HandleSet iids = rr.getInternallIDs(); HandleSet iids = rr.getInternallIDs();
for (byte[] b: iids) internalIDs.add(ASCII.String(b)); for (byte[] b: iids) internalIDs.add(ASCII.String(b));

@ -32,7 +32,7 @@ public interface HandleMap extends Iterable<Map.Entry<byte[], Long>> {
public long mem(); public long mem();
public void trim(); public void optimize();
/** /**
* write a dump of the index to a file. All entries are written in order * write a dump of the index to a file. All entries are written in order

@ -331,7 +331,7 @@ public class Balancer {
HostHandles hh = this.domainStacks.get(host); HostHandles hh = this.domainStacks.get(host);
if (hh == null) { if (hh == null) {
// create new list // create new list
HandleSet domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1); HandleSet domainList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1);
domainList.put(urlhash); domainList.put(urlhash);
this.domainStacks.put(host, new HostHandles(hosthash, domainList)); this.domainStacks.put(host, new HostHandles(hosthash, domainList));
} else { } else {

@ -42,6 +42,7 @@ import net.yacy.crawler.Balancer;
import net.yacy.crawler.CrawlSwitchboard; import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.robots.RobotsTxt; import net.yacy.crawler.robots.RobotsTxt;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.index.RowHandleSet;
public class NoticedURL { public class NoticedURL {
@ -184,7 +185,7 @@ public class NoticedURL {
*/ */
public boolean removeByURLHash(final byte[] urlhashBytes) { public boolean removeByURLHash(final byte[] urlhashBytes) {
try { try {
final HandleSet urlHashes = new RowHandleSet(12, Base64Order.enhancedCoder, 1); final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1);
urlHashes.put(urlhashBytes); urlHashes.put(urlhashBytes);
boolean ret = false; boolean ret = false;
try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {} try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}

@ -206,7 +206,7 @@ public class ArrayStack implements BLOB {
oneBlob = new Heap(f, keylength, ordering, buffersize); oneBlob = new Heap(f, keylength, ordering, buffersize);
} else { } else {
oneBlob = new HeapModifier(f, keylength, ordering); oneBlob = new HeapModifier(f, keylength, ordering);
oneBlob.trim(); // no writings here, can be used with minimum memory oneBlob.optimize(); // no writings here, can be used with minimum memory
} }
sortedItems.put(Long.valueOf(time), new blobItem(d, f, oneBlob)); sortedItems.put(Long.valueOf(time), new blobItem(d, f, oneBlob));
} catch (final IOException e) { } catch (final IOException e) {
@ -236,7 +236,7 @@ public class ArrayStack implements BLOB {
} }
@Override @Override
public void trim() { public void optimize() {
// trim shall not be called for ArrayStacks because the characteristics of an ArrayStack is that the 'topmost' BLOB on the stack // trim shall not be called for ArrayStacks because the characteristics of an ArrayStack is that the 'topmost' BLOB on the stack
// is used for write operations and all other shall be trimmed automatically since they are not used for writing. And the // is used for write operations and all other shall be trimmed automatically since they are not used for writing. And the
// topmost BLOB must not be trimmed to support fast writings. // topmost BLOB must not be trimmed to support fast writings.
@ -261,7 +261,7 @@ public class ArrayStack implements BLOB {
oneBlob = new Heap(location, this.keylength, this.ordering, this.buffersize); oneBlob = new Heap(location, this.keylength, this.ordering, this.buffersize);
} else { } else {
oneBlob = new HeapModifier(location, this.keylength, this.ordering); oneBlob = new HeapModifier(location, this.keylength, this.ordering);
oneBlob.trim(); oneBlob.optimize();
} }
this.blobs.add(new blobItem(d, location, oneBlob)); this.blobs.add(new blobItem(d, location, oneBlob));
} }

@ -63,7 +63,7 @@ public interface BLOB {
* trim the index of the database: this releases memory not currently used * trim the index of the database: this releases memory not currently used
* @throws IOException * @throws IOException
*/ */
public void trim(); public void optimize();
/** /**
* calculate the memory in RAM that the BLOB occupies * calculate the memory in RAM that the BLOB occupies

@ -68,8 +68,8 @@ public class Compressor implements BLOB, Iterable<byte[]> {
} }
@Override @Override
public void trim() { public void optimize() {
this.backend.trim(); this.backend.optimize();
} }
@Override @Override

@ -131,8 +131,8 @@ public class HeapReader {
return this.index.mem(); // don't add the memory for free here since then the asserts for memory management don't work return this.index.mem(); // don't add the memory for free here since then the asserts for memory management don't work
} }
public void trim() { public void optimize() {
this.index.trim(); this.index.optimize();
} }
protected byte[] normalizeKey(byte[] key) { protected byte[] normalizeKey(byte[] key) {

@ -151,6 +151,7 @@ public class Word {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
return hashes; return hashes;
} }
hashes.optimize();
return hashes; return hashes;
} }
@ -163,6 +164,7 @@ public class Word {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
return hashes; return hashes;
} }
hashes.optimize();
return hashes; return hashes;
} }
} }

@ -81,6 +81,12 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
} }
} }
@Override
public void optimize() {
this.backend.optimize();
this.buffer.optimize();
}
@Override @Override
public long mem() { public long mem() {
return this.backend.mem() + this.buffer.mem(); return this.backend.mem() + this.buffer.mem();
@ -356,6 +362,7 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
break; break;
} }
} }
handles.optimize();
return handles; return handles;
} }
} }

@ -101,6 +101,13 @@ public final class Cache implements Index, Iterable<Row.Entry> {
return this.index.mem() + this.readHitCache.mem() + this.readMissCache.mem(); return this.index.mem() + this.readHitCache.mem() + this.readMissCache.mem();
} }
@Override
public void optimize() {
this.index.optimize();
this.readHitCache.optimize();
this.readMissCache.optimize();
}
public final int writeBufferSize() { public final int writeBufferSize() {
return 0; return 0;
} }

@ -38,6 +38,7 @@ public interface Index extends Iterable<Row.Entry> {
public String filename(); // returns a unique identified for this index; can be a real or artificial file name public String filename(); // returns a unique identified for this index; can be a real or artificial file name
public int size(); public int size();
public void optimize();
public long mem(); public long mem();
public boolean isEmpty(); public boolean isEmpty();
public Row row(); public Row row();

@ -82,9 +82,10 @@ public final class RAMIndex implements Index, Iterable<Row.Entry> {
reset(); reset();
} }
public void trim() { @Override
if (this.index0 != null) this.index0.trim(); public void optimize() {
if (this.index1 != null) this.index1.trim(); if (this.index0 != null) this.index0.optimize();
if (this.index1 != null) this.index1.optimize();
} }
public final synchronized void reset() { public final synchronized void reset() {

@ -63,8 +63,8 @@ public final class RAMIndexCluster implements Index, Iterable<Row.Entry>, Clonea
this.rowdef = rowdef; this.rowdef = rowdef;
} }
public void trim() { public void optimize() {
for (final RAMIndex i: this.cluster) if (i != null) i.trim(); for (final RAMIndex i: this.cluster) if (i != null) i.optimize();
} }
@Override @Override

@ -676,6 +676,11 @@ public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>,
} }
public void optimize() {
sort();
trim();
}
public final void sort() { public final void sort() {
if (this.sortBound == this.chunkcount) return; // this is sorted if (this.sortBound == this.chunkcount) return; // this is sorted
synchronized (this) { synchronized (this) {

@ -104,12 +104,12 @@ public final class RowHandleMap implements HandleMap, Iterable<Map.Entry<byte[],
is.close(); is.close();
is = null; is = null;
assert this.index.size() == file.length() / (keylength + idxbytes); assert this.index.size() == file.length() / (keylength + idxbytes);
trim(); optimize();
} }
@Override @Override
public void trim() { public void optimize() {
this.index.trim(); this.index.optimize();
} }
public long mem() { public long mem() {

@ -81,6 +81,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
@Override @Override
public RowHandleSet clone() { public RowHandleSet clone() {
optimize();
return new RowHandleSet(this.rowdef, this.index.clone()); return new RowHandleSet(this.rowdef, this.index.clone());
} }
@ -92,6 +93,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
@Override @Override
public void optimize() { public void optimize() {
this.index.sort(); this.index.sort();
this.index.trim();
} }
/** /**
@ -305,6 +307,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
o = mi.next(); o = mi.next();
if (large.has(o)) result.put(o); if (large.has(o)) result.put(o);
} }
result.optimize();
return result; return result;
} }
@ -331,6 +334,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
} }
} }
} }
result.optimize();
return result; return result;
} }

@ -104,6 +104,10 @@ public class SQLTable implements Index, Iterable<Row.Entry> {
} }
@Override
public void optimize() {
}
@Override @Override
public long mem() { public long mem() {
return 0; return 0;

@ -107,6 +107,11 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
init(); init();
} }
@Override
public void optimize() {
for (Index table: tables.values()) table.optimize();
}
@Override @Override
public long mem() { public long mem() {
long m = 0; long m = 0;

@ -196,7 +196,7 @@ public class Table implements Index, Iterable<Row.Entry> {
this.table = null; this.table = null;
} }
} }
this.index.trim(); optimize();
// open the file // open the file
this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize); this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize);
@ -270,6 +270,13 @@ public class Table implements Index, Iterable<Row.Entry> {
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.severe("Table", "", e); ConcurrentLog.severe("Table", "", e);
} }
optimize();
}
@Override
public void optimize() {
this.index.optimize();
if (this.table != null) this.table.optimize();
} }
@Override @Override

@ -30,12 +30,12 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Set; import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -52,7 +52,6 @@ import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.ByteOrder;
import net.yacy.cora.order.NaturalOrder;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.HandleSet; import net.yacy.cora.storage.HandleSet;
@ -215,9 +214,9 @@ public class Segment {
final byte[] searchhash = url.hash(); final byte[] searchhash = url.hash();
RowHandleSet rootCandidates = getPossibleRootHashes(url); RowHandleSet rootCandidates = getPossibleRootHashes(url);
Set<byte[]> ignore = new TreeSet<byte[]>(NaturalOrder.naturalOrder); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
Set<byte[]> levelhashes = new TreeSet<byte[]>(NaturalOrder.naturalOrder); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
levelhashes.add(searchhash); levelhashes.add(ASCII.String(searchhash));
int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
final byte[] hosthash = new byte[6]; // the host of the url to be checked final byte[] hosthash = new byte[6]; // the host of the url to be checked
System.arraycopy(searchhash, 6, hosthash, 0, 6); System.arraycopy(searchhash, 6, hosthash, 0, 6);
@ -225,13 +224,13 @@ public class Segment {
long timeout = System.currentTimeMillis() + maxtime; long timeout = System.currentTimeMillis() + maxtime;
mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) { mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) {
Set<byte[]> checknext = new TreeSet<byte[]>(NaturalOrder.naturalOrder); Set<String> checknext = new HashSet<String>();
// loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0 // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
checkloop: for (byte[] urlhash: levelhashes) { checkloop: for (String urlhashs: levelhashes) {
// get all the citations for this url and iterate // get all the citations for this url and iterate
ReferenceReport rr = rrc.getReferenceReport(urlhash, false); ReferenceReport rr = rrc.getReferenceReport(urlhashs, false);
//ReferenceContainer<CitationReference> references = this.urlCitationIndex.get(urlhash, null); //ReferenceContainer<CitationReference> references = this.urlCitationIndex.get(urlhash, null);
if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know
Iterator<byte[]> i = rr.getInternallIDs().iterator(); Iterator<byte[]> i = rr.getInternallIDs().iterator();
@ -241,17 +240,17 @@ public class Segment {
// check if this is from the same host // check if this is from the same host
assert (ByteBuffer.equals(u, 6, hosthash, 0, 6)); assert (ByteBuffer.equals(u, 6, hosthash, 0, 6));
String us = ASCII.String(u);
// check ignore // check ignore
if (ignore.contains(u)) continue nextloop; if (ignore.contains(us)) continue nextloop;
// check if the url is a root url // check if the url is a root url
if (rootCandidates.has(u)) { if (rootCandidates.has(u)) {
return leveldepth + 1; return leveldepth + 1;
} }
checknext.add(u); checknext.add(us);
ignore.add(u); ignore.add(us);
} }
if (System.currentTimeMillis() > timeout) break mainloop; if (System.currentTimeMillis() > timeout) break mainloop;
} }
@ -286,16 +285,16 @@ public class Segment {
} }
public class ReferenceReportCache { public class ReferenceReportCache {
Map<byte[], ReferenceReport> cache; private final Map<String, ReferenceReport> cache;
public ReferenceReportCache() { public ReferenceReportCache() {
this.cache = new TreeMap<byte[], ReferenceReport>(Base64Order.enhancedCoder); this.cache = new HashMap<String, ReferenceReport>();
} }
public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException { public ReferenceReport getReferenceReport(final String id, final boolean acceptSelfReference) throws IOException {
ReferenceReport rr = cache.get(id); ReferenceReport rr = cache.get(id);
if (MemoryControl.shortStatus()) cache.clear(); if (MemoryControl.shortStatus()) cache.clear();
if (rr != null) return rr; if (rr != null) return rr;
try { try {
rr = new ReferenceReport(id, acceptSelfReference); rr = new ReferenceReport(ASCII.getBytes(id), acceptSelfReference);
cache.put(id, rr); cache.put(id, rr);
return rr; return rr;
} catch (final SpaceExceededException e) { } catch (final SpaceExceededException e) {
@ -311,13 +310,13 @@ public class Segment {
public class ClickdepthCache { public class ClickdepthCache {
ReferenceReportCache rrc; ReferenceReportCache rrc;
Map<byte[], Integer> cache; Map<String, Integer> cache;
public ClickdepthCache(ReferenceReportCache rrc) { public ClickdepthCache(ReferenceReportCache rrc) {
this.rrc = rrc; this.rrc = rrc;
this.cache = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder); this.cache = new HashMap<String, Integer>();
} }
public int getClickdepth(final DigestURL url, int maxtime) throws IOException { public int getClickdepth(final DigestURL url, int maxtime) throws IOException {
Integer clickdepth = cache.get(url.hash()); Integer clickdepth = cache.get(ASCII.String(url.hash()));
if (MemoryControl.shortStatus()) cache.clear(); if (MemoryControl.shortStatus()) cache.clear();
if (clickdepth != null) { if (clickdepth != null) {
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT"); //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
@ -325,7 +324,7 @@ public class Segment {
} }
clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime); clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime);
//ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth); //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
this.cache.put(url.hash(), clickdepth); this.cache.put(ASCII.String(url.hash()), clickdepth);
return clickdepth.intValue(); return clickdepth.intValue();
} }
} }
@ -343,8 +342,8 @@ public class Segment {
this.internal = 0; this.internal = 0;
this.external = 0; this.external = 0;
this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0); this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
this.internalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); this.internalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
this.externalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0); this.externalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
try { try {
if (connectedCitation()) { if (connectedCitation()) {
// read the references from the citation index // read the references from the citation index
@ -397,6 +396,9 @@ public class Segment {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
} }
this.externalHosts.optimize();
this.internalIDs.optimize();
this.externalIDs.optimize();
} }
public int getInternalCount() { public int getInternalCount() {
return this.internal; return this.internal;

@ -1322,7 +1322,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
try { try {
for (Map.Entry<String, double[]> entry: this.crt.entrySet()) { for (Map.Entry<String, double[]> entry: this.crt.entrySet()) {
String id = entry.getKey(); String id = entry.getKey();
ReferenceReport rr = this.rrCache.getReferenceReport(ASCII.getBytes(id), false); ReferenceReport rr = this.rrCache.getReferenceReport(id, false);
// sum up the cr of the internal links // sum up the cr of the internal links
HandleSet iids = rr.getInternallIDs(); HandleSet iids = rr.getInternallIDs();
double ncr = 0.0d; double ncr = 0.0d;

@ -244,6 +244,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
} }
} }
} }
remaininghashes.optimize();
return remaininghashes; return remaininghashes;
} }

Loading…
Cancel
Save