- enhanced postprocessing speed and memory footprint (by using HashMaps instead of TreeMaps; see the first sketch below)
- enhanced memory footprint of database indexes (by introduction of optimize calls; see the second sketch below)
- optimize calls shrink the amount of used memory for index sets if they are not changed afterwards any more
pull/1/head
Michael Peter Christen 11 years ago
parent 1245cfeb43
commit fdaeac374a
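
Note on the TreeMap-to-HashMap change: the caches touched below were keyed by byte[] url hashes. That works in a TreeMap, which orders keys through an explicit Comparator (Base64Order.enhancedCoder), but not in a HashMap, because Java arrays inherit identity-based equals()/hashCode() from Object. This is why the keys become Strings throughout the diff. A minimal standalone sketch of the pitfall (HashKeyDemo and its literals are illustrative, not YaCy code):

import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;

public class HashKeyDemo {
    public static void main(String[] args) {
        // byte[] keys silently break in a HashMap: arrays hash and compare by identity
        Map<byte[], Integer> broken = new HashMap<byte[], Integer>();
        broken.put("0123456789AB".getBytes(StandardCharsets.US_ASCII), 1);
        System.out.println(broken.get("0123456789AB".getBytes(StandardCharsets.US_ASCII))); // null

        // String keys hash by content, so an equal key always hits;
        // hence the cache keys change from byte[] to ASCII.String(hash)
        Map<String, Integer> works = new HashMap<String, Integer>();
        works.put("0123456789AB", 1);
        System.out.println(works.get("0123456789AB")); // 1
    }
}

Besides correctness, content-hashed String keys give O(1) lookups instead of O(log n) comparator calls in a red-black tree, which is where the postprocessing speedup comes from.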

@@ -559,7 +559,7 @@ public class HostBrowser {
 if (fetchReferences) {
 // get the references from the citation index
 try {
-ReferenceReport rr = rrCache.getReferenceReport(ASCII.getBytes(urlhash), false);
+ReferenceReport rr = rrCache.getReferenceReport(urlhash, false);
 List<String> internalIDs = new ArrayList<String>();
 List<String> externalIDs = new ArrayList<String>();
 HandleSet iids = rr.getInternallIDs();

@@ -141,7 +141,7 @@ public class webstructure {
 prop.put("citations", 1);
 ReferenceReportCache rrc = sb.index.getReferenceReportCache();
 ReferenceReport rr = null;
-try {rr = rrc.getReferenceReport(urlhash, true);} catch (IOException e) {}
+try {rr = rrc.getReferenceReport(ASCII.String(urlhash), true);} catch (IOException e) {}
 if (rr != null && rr.getInternalCount() > 0 && rr.getExternalCount() > 0) {
 prop.put("citations_count", 1);
 prop.put("citations_documents", 1);

@@ -204,7 +204,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
 Integer exthosts_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.references_exthosts_i.getSolrFieldName());
 Integer hostextc_old = sid == null ? null : (Integer) sid.getFieldValue(CollectionSchema.host_extent_i.getSolrFieldName());
 try {
-ReferenceReport rr = rrCache.getReferenceReport(url.hash(), false);
+ReferenceReport rr = rrCache.getReferenceReport(ASCII.String(url.hash()), false);
 List<String> internalIDs = new ArrayList<String>();
 HandleSet iids = rr.getInternallIDs();
 for (byte[] b: iids) internalIDs.add(ASCII.String(b));

@@ -32,7 +32,7 @@ public interface HandleMap extends Iterable<Map.Entry<byte[], Long>> {
 public long mem();
-public void trim();
+public void optimize();
 /**
 * write a dump of the index to a file. All entries are written in order

@@ -331,7 +331,7 @@ public class Balancer {
 HostHandles hh = this.domainStacks.get(host);
 if (hh == null) {
 // create new list
-HandleSet domainList = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
+HandleSet domainList = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1);
 domainList.put(urlhash);
 this.domainStacks.put(host, new HostHandles(hosthash, domainList));
 } else {

@@ -42,6 +42,7 @@ import net.yacy.crawler.Balancer;
 import net.yacy.crawler.CrawlSwitchboard;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.crawler.robots.RobotsTxt;
+import net.yacy.kelondro.data.word.Word;
 import net.yacy.kelondro.index.RowHandleSet;
 public class NoticedURL {
@@ -184,7 +185,7 @@ public class NoticedURL {
 */
 public boolean removeByURLHash(final byte[] urlhashBytes) {
 try {
-final HandleSet urlHashes = new RowHandleSet(12, Base64Order.enhancedCoder, 1);
+final HandleSet urlHashes = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 1);
 urlHashes.put(urlhashBytes);
 boolean ret = false;
 try {ret |= this.noloadStack.remove(urlHashes) > 0;} catch (final IOException e) {}

@@ -206,7 +206,7 @@ public class ArrayStack implements BLOB {
 oneBlob = new Heap(f, keylength, ordering, buffersize);
 } else {
 oneBlob = new HeapModifier(f, keylength, ordering);
-oneBlob.trim(); // no writings here, can be used with minimum memory
+oneBlob.optimize(); // no writings here, can be used with minimum memory
 }
 sortedItems.put(Long.valueOf(time), new blobItem(d, f, oneBlob));
 } catch (final IOException e) {
@@ -236,7 +236,7 @@ public class ArrayStack implements BLOB {
 }
 @Override
-public void trim() {
+public void optimize() {
 // trim shall not be called for ArrayStacks because the characteristics of an ArrayStack is that the 'topmost' BLOB on the stack
 // is used for write operations and all other shall be trimmed automatically since they are not used for writing. And the
 // topmost BLOB must not be trimmed to support fast writings.
@@ -261,7 +261,7 @@ public class ArrayStack implements BLOB {
 oneBlob = new Heap(location, this.keylength, this.ordering, this.buffersize);
 } else {
 oneBlob = new HeapModifier(location, this.keylength, this.ordering);
-oneBlob.trim();
+oneBlob.optimize();
 }
 this.blobs.add(new blobItem(d, location, oneBlob));
 }

@@ -63,7 +63,7 @@ public interface BLOB {
 * trim the index of the database: this releases memory not currently used
 * @throws IOException
 */
-public void trim();
+public void optimize();
 /**
 * calculate the memory in RAM that the BLOB occupies

@@ -68,8 +68,8 @@ public class Compressor implements BLOB, Iterable<byte[]> {
 }
 @Override
-public void trim() {
-this.backend.trim();
+public void optimize() {
+this.backend.optimize();
 }
 @Override

@@ -131,8 +131,8 @@ public class HeapReader {
 return this.index.mem(); // don't add the memory for free here since then the asserts for memory management don't work
 }
-public void trim() {
-this.index.trim();
+public void optimize() {
+this.index.optimize();
 }
 protected byte[] normalizeKey(byte[] key) {

@@ -151,6 +151,7 @@ public class Word {
 ConcurrentLog.logException(e);
 return hashes;
 }
+hashes.optimize();
 return hashes;
 }
@@ -163,6 +164,7 @@
 ConcurrentLog.logException(e);
 return hashes;
 }
+hashes.optimize();
 return hashes;
 }
 }

@@ -81,6 +81,12 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
 }
 }
+@Override
+public void optimize() {
+this.backend.optimize();
+this.buffer.optimize();
+}
 @Override
 public long mem() {
 return this.backend.mem() + this.buffer.mem();
@@ -356,6 +362,7 @@ public class BufferedObjectIndex implements Index, Iterable<Row.Entry> {
 break;
 }
 }
+handles.optimize();
 return handles;
 }
 }

@@ -101,6 +101,13 @@ public final class Cache implements Index, Iterable<Row.Entry> {
 return this.index.mem() + this.readHitCache.mem() + this.readMissCache.mem();
 }
+@Override
+public void optimize() {
+this.index.optimize();
+this.readHitCache.optimize();
+this.readMissCache.optimize();
+}
 public final int writeBufferSize() {
 return 0;
 }

@@ -38,6 +38,7 @@ public interface Index extends Iterable<Row.Entry> {
 public String filename(); // returns a unique identified for this index; can be a real or artificial file name
 public int size();
+public void optimize();
 public long mem();
 public boolean isEmpty();
 public Row row();

@@ -82,9 +82,10 @@ public final class RAMIndex implements Index, Iterable<Row.Entry> {
 reset();
 }
-public void trim() {
-if (this.index0 != null) this.index0.trim();
-if (this.index1 != null) this.index1.trim();
+@Override
+public void optimize() {
+if (this.index0 != null) this.index0.optimize();
+if (this.index1 != null) this.index1.optimize();
 }
 public final synchronized void reset() {

@@ -63,8 +63,8 @@ public final class RAMIndexCluster implements Index, Iterable<Row.Entry>, Clonea
 this.rowdef = rowdef;
 }
-public void trim() {
-for (final RAMIndex i: this.cluster) if (i != null) i.trim();
+public void optimize() {
+for (final RAMIndex i: this.cluster) if (i != null) i.optimize();
 }
 @Override

@@ -676,6 +676,11 @@ public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>,
 }
+public void optimize() {
+sort();
+trim();
+}
 public final void sort() {
 if (this.sortBound == this.chunkcount) return; // this is sorted
 synchronized (this) {
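
The optimize() introduced above is the core of the memory saving: sort once, then trim the backing array. A rough standalone analogue, assuming nothing from YaCy (OptimizeDemo is illustrative; ArrayList.trimToSize() stands in for RowCollection.trim()):

import java.util.ArrayList;
import java.util.Collections;

public class OptimizeDemo {
    private final ArrayList<Long> handles = new ArrayList<Long>(1024); // over-allocated while filling

    public void put(long h) {
        handles.add(h);
    }

    // same idiom as RowCollection.optimize(): sort(); trim();
    public void optimize() {
        Collections.sort(handles); // later reads can use binary search
        handles.trimToSize();      // release unused backing-array capacity
    }

    public boolean has(long h) {
        return Collections.binarySearch(handles, h) >= 0; // valid after optimize()
    }

    public static void main(String[] args) {
        OptimizeDemo d = new OptimizeDemo();
        d.put(42L);
        d.put(7L);
        d.optimize(); // the set is not changed afterwards any more
        System.out.println(d.has(7L)); // true
    }
}

This is also why the diff places optimize() calls only where a set is known to be complete (for example after the ReferenceReport sets are filled): the saving holds only "if they are not changed afterwards any more", as the commit message puts it.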

@@ -104,12 +104,12 @@ public final class RowHandleMap implements HandleMap, Iterable<Map.Entry<byte[],
 is.close();
 is = null;
 assert this.index.size() == file.length() / (keylength + idxbytes);
-trim();
+optimize();
 }
 @Override
-public void trim() {
-this.index.trim();
+public void optimize() {
+this.index.optimize();
 }
 public long mem() {

@@ -81,6 +81,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 @Override
 public RowHandleSet clone() {
+optimize();
 return new RowHandleSet(this.rowdef, this.index.clone());
 }
@@ -92,6 +93,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 @Override
 public void optimize() {
 this.index.sort();
+this.index.trim();
 }
 /**
@@ -305,6 +307,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 o = mi.next();
 if (large.has(o)) result.put(o);
 }
+result.optimize();
 return result;
 }
@@ -331,6 +334,7 @@ public final class RowHandleSet implements HandleSet, Iterable<byte[]>, Cloneabl
 }
 }
 }
+result.optimize();
 return result;
 }

@@ -103,6 +103,10 @@ public class SQLTable implements Index, Iterable<Row.Entry> {
 }
 }
+@Override
+public void optimize() {
+}
 @Override
 public long mem() {

@@ -107,6 +107,11 @@ public class SplitTable implements Index, Iterable<Row.Entry> {
 init();
 }
+@Override
+public void optimize() {
+for (Index table: tables.values()) table.optimize();
+}
 @Override
 public long mem() {
 long m = 0;

@@ -196,7 +196,7 @@ public class Table implements Index, Iterable<Row.Entry> {
 this.table = null;
 }
 }
-this.index.trim();
+optimize();
 // open the file
 this.file = new BufferedRecords(new Records(tablefile, rowdef.objectsize), this.buffersize);
@@ -270,6 +270,13 @@
 } catch (final IOException e) {
 ConcurrentLog.severe("Table", "", e);
 }
+optimize();
 }
+@Override
+public void optimize() {
+this.index.optimize();
+if (this.table != null) this.table.optimize();
+}
 @Override

@@ -30,12 +30,12 @@ import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeMap;
-import java.util.TreeSet;
 import java.util.concurrent.BlockingQueue;
 import java.util.regex.Pattern;
@@ -52,7 +52,6 @@ import net.yacy.cora.federate.solr.connector.SolrConnector;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.order.Base64Order;
 import net.yacy.cora.order.ByteOrder;
-import net.yacy.cora.order.NaturalOrder;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.protocol.ResponseHeader;
 import net.yacy.cora.storage.HandleSet;
@@ -215,9 +214,9 @@ public class Segment {
 final byte[] searchhash = url.hash();
 RowHandleSet rootCandidates = getPossibleRootHashes(url);
-Set<byte[]> ignore = new TreeSet<byte[]>(NaturalOrder.naturalOrder); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
-Set<byte[]> levelhashes = new TreeSet<byte[]>(NaturalOrder.naturalOrder); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
-levelhashes.add(searchhash);
+Set<String> ignore = new HashSet<String>(); // a set of urlhashes to be ignored. This is generated from all hashes that are seen during recursion to prevent enless loops
+Set<String> levelhashes = new HashSet<String>(); // all hashes of a clickdepth. The first call contains the target hash only and therefore just one entry
+levelhashes.add(ASCII.String(searchhash));
 int leveldepth = 0; // the recursion depth and therefore the result depth-1. Shall be 0 for the first call
 final byte[] hosthash = new byte[6]; // the host of the url to be checked
 System.arraycopy(searchhash, 6, hosthash, 0, 6);
@@ -225,13 +224,13 @@
 long timeout = System.currentTimeMillis() + maxtime;
 mainloop: for (int maxdepth = 0; maxdepth < 6 && System.currentTimeMillis() < timeout; maxdepth++) {
-Set<byte[]> checknext = new TreeSet<byte[]>(NaturalOrder.naturalOrder);
+Set<String> checknext = new HashSet<String>();
 // loop over all hashes at this clickdepth; the first call to this loop should contain only one hash and a leveldepth = 0
-checkloop: for (byte[] urlhash: levelhashes) {
+checkloop: for (String urlhashs: levelhashes) {
 // get all the citations for this url and iterate
-ReferenceReport rr = rrc.getReferenceReport(urlhash, false);
+ReferenceReport rr = rrc.getReferenceReport(urlhashs, false);
 //ReferenceContainer<CitationReference> references = this.urlCitationIndex.get(urlhash, null);
 if (rr == null || rr.getInternalCount() == 0) continue checkloop; // don't know
 Iterator<byte[]> i = rr.getInternallIDs().iterator();
@@ -241,17 +240,17 @@
 // check if this is from the same host
 assert (ByteBuffer.equals(u, 6, hosthash, 0, 6));
+String us = ASCII.String(u);
 // check ignore
-if (ignore.contains(u)) continue nextloop;
+if (ignore.contains(us)) continue nextloop;
 // check if the url is a root url
 if (rootCandidates.has(u)) {
 return leveldepth + 1;
 }
-checknext.add(u);
-ignore.add(u);
+checknext.add(us);
+ignore.add(us);
 }
 if (System.currentTimeMillis() > timeout) break mainloop;
 }
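
For orientation, the loop patched above is a breadth-first search over incoming citations: start at the target url hash, expand one citation level per iteration, and return as soon as a root url appears among the referrers. A compressed sketch of that control flow, with hypothetical helpers getReferrers() and isRoot() standing in for the ReferenceReport and rootCandidates machinery:

import java.util.HashSet;
import java.util.Set;

public class ClickDepthSketch {
    // hypothetical: hashes of documents that link to 'hash'
    static Set<String> getReferrers(String hash) { return new HashSet<String>(); }
    // hypothetical: stands in for rootCandidates.has(...)
    static boolean isRoot(String hash) { return hash.isEmpty(); }

    static int clickDepth(String targethash, int maxdepth) {
        Set<String> ignore = new HashSet<String>(); // seen hashes, prevents endless loops
        Set<String> level = new HashSet<String>();  // current BFS frontier
        level.add(targethash);
        for (int depth = 0; depth < maxdepth; depth++) {
            Set<String> next = new HashSet<String>();
            for (String h : level) {
                for (String r : getReferrers(h)) {
                    if (ignore.contains(r)) continue;
                    if (isRoot(r)) return depth + 1; // shortest path to a root found
                    next.add(r);
                    ignore.add(r);
                }
            }
            level = next;
        }
        return -1; // not resolvable within maxdepth
    }
}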
@@ -286,16 +285,16 @@
 }
 public class ReferenceReportCache {
-Map<byte[], ReferenceReport> cache;
+private final Map<String, ReferenceReport> cache;
 public ReferenceReportCache() {
-this.cache = new TreeMap<byte[], ReferenceReport>(Base64Order.enhancedCoder);
+this.cache = new HashMap<String, ReferenceReport>();
 }
-public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException {
+public ReferenceReport getReferenceReport(final String id, final boolean acceptSelfReference) throws IOException {
 ReferenceReport rr = cache.get(id);
 if (MemoryControl.shortStatus()) cache.clear();
 if (rr != null) return rr;
 try {
-rr = new ReferenceReport(id, acceptSelfReference);
+rr = new ReferenceReport(ASCII.getBytes(id), acceptSelfReference);
 cache.put(id, rr);
 return rr;
 } catch (final SpaceExceededException e) {
@@ -311,13 +310,13 @@
 public class ClickdepthCache {
 ReferenceReportCache rrc;
-Map<byte[], Integer> cache;
+Map<String, Integer> cache;
 public ClickdepthCache(ReferenceReportCache rrc) {
 this.rrc = rrc;
-this.cache = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
+this.cache = new HashMap<String, Integer>();
 }
 public int getClickdepth(final DigestURL url, int maxtime) throws IOException {
-Integer clickdepth = cache.get(url.hash());
+Integer clickdepth = cache.get(ASCII.String(url.hash()));
 if (MemoryControl.shortStatus()) cache.clear();
 if (clickdepth != null) {
 //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
@@ -325,7 +324,7 @@
 }
 clickdepth = Segment.this.getClickDepth(this.rrc, url, maxtime);
 //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth);
-this.cache.put(url.hash(), clickdepth);
+this.cache.put(ASCII.String(url.hash()), clickdepth);
 return clickdepth.intValue();
 }
 }
@@ -343,8 +342,8 @@
 this.internal = 0;
 this.external = 0;
 this.externalHosts = new RowHandleSet(6, Base64Order.enhancedCoder, 0);
-this.internalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0);
-this.externalIDs = new RowHandleSet(12, Base64Order.enhancedCoder, 0);
+this.internalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
+this.externalIDs = new RowHandleSet(Word.commonHashLength, Base64Order.enhancedCoder, 0);
 try {
 if (connectedCitation()) {
 // read the references from the citation index
@@ -397,6 +396,9 @@
 ConcurrentLog.logException(e);
 }
 }
+this.externalHosts.optimize();
+this.internalIDs.optimize();
+this.externalIDs.optimize();
 }
 public int getInternalCount() {
 return this.internal;

@@ -1322,7 +1322,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
 try {
 for (Map.Entry<String, double[]> entry: this.crt.entrySet()) {
 String id = entry.getKey();
-ReferenceReport rr = this.rrCache.getReferenceReport(ASCII.getBytes(id), false);
+ReferenceReport rr = this.rrCache.getReferenceReport(id, false);
 // sum up the cr of the internal links
 HandleSet iids = rr.getInternallIDs();
 double ncr = 0.0d;

@@ -244,6 +244,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
 }
 }
 }
+remaininghashes.optimize();
 return remaininghashes;
 }
