From 7b17cdf6dd67ada4567fa0fbb5ba35f833c6ebc2 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 7 Nov 2013 03:11:03 +0100 Subject: [PATCH 1/7] add content_type:image/* to image search - see numerous idx entries with content_type image without url_file_ext_s (for various reasons) which should be included in result - try it yourself with the following sample query /solr/select?q=content_type:image/* AND -url_file_ext_s:[* TO *]&defType=edismax&fl=sku,url_file_ext_s,content_type this also addresses URLs with a missing or deviating extension. --- source/net/yacy/search/query/QueryGoal.java | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/search/query/QueryGoal.java b/source/net/yacy/search/query/QueryGoal.java index 745bbb2ac..50861de59 100644 --- a/source/net/yacy/search/query/QueryGoal.java +++ b/source/net/yacy/search/query/QueryGoal.java @@ -242,7 +242,8 @@ public class QueryGoal { // add filter to prevent that results come from failed urls q.append(CollectionSchema.httpstatus_i.getSolrFieldName()).append(":200").append(" AND ("); q.append(CollectionSchema.images_urlstub_sxt.getSolrFieldName()).append(":[* TO *] OR "); - q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif))"); + q.append(CollectionSchema.url_file_ext_s.getSolrFieldName()).append(":(jpg OR png OR gif) OR"); + q.append(CollectionSchema.content_type.getSolrFieldName()).append(":(image/*))"); // parse special requests if (isCatchall()) return q; From 81bb50118e047e927c67f69c2631ac435066f1cc Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 7 Nov 2013 10:01:44 +0100 Subject: [PATCH 2/7] found and fixed a huge memory leak in solr caching (inside Solr). 
The not-flushed Solr cache is now handled in this way: - it is smaller by default - a Solr-internal process is started to flush the cache periodically (this does NOT clean the cache, just removes old objects) - a Solr-external process (the standard YaCy cleanup-process) now has direct access to the Solr-internal caches and flushes them completely. The time frame for such a flush is defined by the cleanup-process frequency, by default 10 minutes. --- defaults/solr/solrconfig.xml | 39 +++++++++++-------- htroot/ContentAnalysis_p.java | 2 +- htroot/RankingSolr_p.java | 2 +- htroot/yacysearch.java | 2 +- .../solr/connector/CachedSolrConnector.java | 10 ++--- .../ConcurrentUpdateSolrConnector.java | 6 +++ .../solr/connector/EmbeddedSolrConnector.java | 21 ++++++++++ .../solr/connector/MirrorSolrConnector.java | 6 +++ .../solr/connector/RemoteSolrConnector.java | 6 +++ .../solr/connector/SolrConnector.java | 7 +++- .../solr/connector/SolrServerConnector.java | 2 +- .../solr/instance/InstanceMirror.java | 5 +-- source/net/yacy/search/ResourceObserver.java | 2 +- source/net/yacy/search/Switchboard.java | 2 +- source/net/yacy/search/index/Fulltext.java | 14 +++---- source/net/yacy/search/index/Segment.java | 4 +- 16 files changed, 89 insertions(+), 41 deletions(-) diff --git a/defaults/solr/solrconfig.xml b/defaults/solr/solrconfig.xml index 1234dd0d3..d8044f969 100644 --- a/defaults/solr/solrconfig.xml +++ b/defaults/solr/solrconfig.xml @@ -461,19 +461,21 @@ and old cache. 
--> + size="64" + initialSize="64" + autowarmCount="4" + cleanupThread="true"/> - + - + diff --git a/htroot/ContentAnalysis_p.java b/htroot/ContentAnalysis_p.java index 2ba573ab0..eed8455e7 100644 --- a/htroot/ContentAnalysis_p.java +++ b/htroot/ContentAnalysis_p.java @@ -34,7 +34,7 @@ public class ContentAnalysis_p { // clean up all search events SearchEventCache.cleanupEvents(true); - sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings + sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings if (post != null && post.containsKey("EnterDoublecheck")) { Ranking.setMinTokenLen(post.getInt("minTokenLen", 3)); diff --git a/htroot/RankingSolr_p.java b/htroot/RankingSolr_p.java index 04784f938..91e543a11 100644 --- a/htroot/RankingSolr_p.java +++ b/htroot/RankingSolr_p.java @@ -38,7 +38,7 @@ public class RankingSolr_p { // clean up all search events SearchEventCache.cleanupEvents(true); - sb.index.clearCache(); // every time the ranking is changed we need to remove old orderings + sb.index.clearCaches(); // every time the ranking is changed we need to remove old orderings int profileNr = 0; if (post != null) profileNr = post.getInt("profileNr", profileNr); diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index 284a0b15e..b79c8061b 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -360,7 +360,7 @@ public class yacysearch { // check available memory and clean up if necessary if ( !MemoryControl.request(8000000L, false) ) { - indexSegment.clearCache(); + indexSegment.clearCaches(); SearchEventCache.cleanupEvents(false); } diff --git a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java index c96fe2d33..eaf93603c 100644 --- a/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/CachedSolrConnector.java @@ -61,7 
+61,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo this.missCache = new ConcurrentARC(missCacheMax, partitions); } - public void clearCache() { + public void clearCaches() { this.hitCache.clear(); this.missCache.clear(); this.documentCache.clear(); @@ -70,9 +70,9 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo @Override public synchronized void close() { + this.clearCaches(); if (this.solr != null) this.solr.close(); this.solr = null; - this.clearCache(); } /** @@ -81,7 +81,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo */ @Override public void clear() throws IOException { - this.clearCache(); + this.clearCaches(); if (this.solr != null) this.solr.clear(); } @@ -119,7 +119,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo @Override public void deleteByQuery(final String querystring) throws IOException { - this.clearCache(); + this.clearCaches(); this.solr.deleteByQuery(querystring); } @@ -261,7 +261,7 @@ public class CachedSolrConnector extends AbstractSolrConnector implements SolrCo } private void addToCache(SolrDocumentList list, boolean doccache) { - if (MemoryControl.shortStatus()) clearCache(); + if (MemoryControl.shortStatus()) clearCaches(); for (final SolrDocument solrdoc: list) { addToCache(solrdoc, doccache); } diff --git a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java index 792d921ad..ddbf550ec 100644 --- a/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/ConcurrentUpdateSolrConnector.java @@ -118,6 +118,12 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector { ensureAliveUpdateHandler(); } + @Override + public void clearCaches() { + this.connector.clearCaches(); + this.idCache.clear(); + } + 
/** * used for debugging */ diff --git a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java index 533ecb080..10d36a9c9 100644 --- a/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/EmbeddedSolrConnector.java @@ -34,6 +34,7 @@ import net.yacy.search.schema.CollectionSchema; import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.search.Query; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrServerException; import org.apache.solr.client.solrj.response.QueryResponse; @@ -47,10 +48,14 @@ import org.apache.solr.core.SolrCore; import org.apache.solr.handler.component.SearchHandler; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.request.SolrQueryRequestBase; +import org.apache.solr.request.UnInvertedField; import org.apache.solr.response.ResultContext; import org.apache.solr.response.SolrQueryResponse; import org.apache.solr.search.DocIterator; import org.apache.solr.search.DocList; +import org.apache.solr.search.DocSet; +import org.apache.solr.search.QueryResultKey; +import org.apache.solr.search.SolrCache; import org.apache.solr.search.SolrIndexSearcher; import org.apache.solr.util.RefCounted; @@ -88,6 +93,22 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo super.init(this.instance.getServer(coreName)); } + public void clearCaches() { + SolrConfig solrConfig = this.core.getSolrConfig(); + @SuppressWarnings("unchecked") + SolrCache fieldValueCache = solrConfig.fieldValueCacheConfig == null ? null : solrConfig.fieldValueCacheConfig.newInstance(); + if (fieldValueCache != null) fieldValueCache.clear(); + @SuppressWarnings("unchecked") + SolrCache filterCache= solrConfig.filterCacheConfig == null ? 
null : solrConfig.filterCacheConfig.newInstance(); + if (filterCache != null) filterCache.clear(); + @SuppressWarnings("unchecked") + SolrCache queryResultCache = solrConfig.queryResultCacheConfig == null ? null : solrConfig.queryResultCacheConfig.newInstance(); + if (queryResultCache != null) queryResultCache.clear(); + @SuppressWarnings("unchecked") + SolrCache documentCache = solrConfig.documentCacheConfig == null ? null : solrConfig.documentCacheConfig.newInstance(); + if (documentCache != null) documentCache.clear(); + } + public SolrInstance getInstance() { return this.instance; } diff --git a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java index c6d51e8ec..19fa604c5 100644 --- a/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/MirrorSolrConnector.java @@ -53,6 +53,12 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo this.solr0 = solr0; this.solr1 = solr1; } + + @Override + public void clearCaches() { + if (this.solr0 != null) this.solr0.clearCaches(); + if (this.solr1 != null) this.solr1.clearCaches(); + } public boolean isConnected0() { return this.solr0 != null; diff --git a/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java b/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java index 4e2a9369f..0ab5f8b31 100644 --- a/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/RemoteSolrConnector.java @@ -71,6 +71,11 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn super.close(); } + @Override + public void clearCaches() { + // we do not have a direct access to the caches here, thus we simply do nothing. 
+ } + @Override public QueryResponse getResponseByParams(ModifiableSolrParams params) throws IOException { // during the solr query we set the thread name to the query string to get more debugging info in thread dumps @@ -134,4 +139,5 @@ public class RemoteSolrConnector extends SolrServerConnector implements SolrConn } System.exit(0); } + } diff --git a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java index 8fb31c531..f28d26f09 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrConnector.java @@ -36,7 +36,12 @@ import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.params.ModifiableSolrParams; public interface SolrConnector extends Iterable /* Iterable of document IDs */ { - + + /** + * clear all caches: inside solr and ouside solr within the implementations of this interface + */ + public void clearCaches(); + /** * get the size of the index * @return number of results if solr is queries with a catch-all pattern diff --git a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java index f12d43950..aec6352f0 100644 --- a/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java +++ b/source/net/yacy/cora/federate/solr/connector/SolrServerConnector.java @@ -64,7 +64,7 @@ public abstract class SolrServerConnector extends AbstractSolrConnector implemen public SolrServer getServer() { return this.server; } - + @Override public void commit(final boolean softCommit) { synchronized (this.server) { diff --git a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java index 6b9b7a939..1d49fd537 100644 --- a/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java +++ 
b/source/net/yacy/cora/federate/solr/instance/InstanceMirror.java @@ -24,7 +24,6 @@ import java.util.Collection; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; -import net.yacy.cora.federate.solr.connector.CachedSolrConnector; import net.yacy.cora.federate.solr.connector.ConcurrentUpdateSolrConnector; import net.yacy.cora.federate.solr.connector.EmbeddedSolrConnector; import net.yacy.cora.federate.solr.connector.MirrorSolrConnector; @@ -161,9 +160,9 @@ public class InstanceMirror { return msc; } - public void clearCache() { + public void clearCaches() { for (SolrConnector csc: this.connectorCache.values()) { - if (csc instanceof CachedSolrConnector) ((CachedSolrConnector) csc).clearCache(); + csc.clearCaches(); } for (EmbeddedSolrConnector ssc: this.embeddedCache.values()) ssc.commit(true); } diff --git a/source/net/yacy/search/ResourceObserver.java b/source/net/yacy/search/ResourceObserver.java index 9cc6a58e7..32e8d2396 100644 --- a/source/net/yacy/search/ResourceObserver.java +++ b/source/net/yacy/search/ResourceObserver.java @@ -129,7 +129,7 @@ public class ResourceObserver { if(MemoryControl.properState()) return Space.HIGH; // clear some caches - @all: are there more of these, we could clear here? 
- this.sb.index.clearCache(); + this.sb.index.clearCaches(); SearchEventCache.cleanupEvents(true); this.sb.trail.clear(); Switchboard.urlBlacklist.clearblacklistCache(); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 1cfe10298..0307b7e01 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2031,7 +2031,7 @@ public final class Switchboard extends serverSwitch { // clear caches if necessary if ( !MemoryControl.request(128000000L, false) ) { - this.index.clearCache(); + this.index.clearCaches(); SearchEventCache.cleanupEvents(false); this.trail.clear(); GuiHandler.clear(); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 39ed4d89a..ea8a2bac5 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -225,10 +225,10 @@ public final class Fulltext { } } - public void clearCache() { + public void clearCaches() { if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) ((Cache) this.urlIndexFile).clearCache(); if (this.statsDump != null) this.statsDump.clear(); - this.solrInstances.clearCache(); + this.solrInstances.clearCaches(); this.statsDump = null; } @@ -250,7 +250,7 @@ public final class Fulltext { for (String name: instance.getCoreNames()) new EmbeddedSolrConnector(instance, name).clear(); } this.commit(false); - this.solrInstances.clearCache(); + this.solrInstances.clearCaches(); } } @@ -260,7 +260,7 @@ public final class Fulltext { if (instance != null) { for (String name: instance.getCoreNames()) new RemoteSolrConnector(instance, name).clear(); } - this.solrInstances.clearCache(); + this.solrInstances.clearCaches(); } } @@ -400,7 +400,7 @@ public final class Fulltext { throw new IOException(e.getMessage(), e); } this.statsDump = null; - if (MemoryControl.shortStatus()) clearCache(); + if (MemoryControl.shortStatus()) clearCaches(); } public 
void putEdges(final Collection edges) throws IOException { @@ -412,7 +412,7 @@ public final class Fulltext { throw new IOException(e.getMessage(), e); } this.statsDump = null; - if (MemoryControl.shortStatus()) clearCache(); + if (MemoryControl.shortStatus()) clearCaches(); } /** @@ -432,7 +432,7 @@ public final class Fulltext { throw new IOException(e.getMessage(), e); } this.statsDump = null; - if (MemoryControl.shortStatus()) clearCache(); + if (MemoryControl.shortStatus()) clearCaches(); } /** diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index df479736b..617d5269c 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -503,10 +503,10 @@ public class Segment { } } - public void clearCache() { + public void clearCaches() { if (this.urlCitationIndex != null) this.urlCitationIndex.clearCache(); if (this.termIndex != null) this.termIndex.clearCache(); - this.fulltext.clearCache(); + this.fulltext.clearCaches(); } public File getLocation() { From c152d996e6404cd67e1d80ad363aca1780149f3d Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 7 Nov 2013 10:55:02 +0100 Subject: [PATCH 3/7] reduced footprint of BookmarksDB which can take quite a lot of memory if the number of bookmarks is high (i.e. 
> 2000 URLs) --- source/net/yacy/data/BookmarksDB.java | 150 ++++++++++---------------- 1 file changed, 54 insertions(+), 96 deletions(-) diff --git a/source/net/yacy/data/BookmarksDB.java b/source/net/yacy/data/BookmarksDB.java index 1c11b4b15..d9c0140a6 100644 --- a/source/net/yacy/data/BookmarksDB.java +++ b/source/net/yacy/data/BookmarksDB.java @@ -41,7 +41,10 @@ import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.order.NaturalOrder; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.blob.MapHeap; +import net.yacy.kelondro.data.meta.URIMetadataRow; +import net.yacy.kelondro.index.RowHandleSet; public class BookmarksDB { @@ -147,11 +150,6 @@ public class BookmarksDB { ConcurrentLog.logException(e); } } - public String addBookmark(final Bookmark bookmark){ - saveBookmark(bookmark); - return bookmark.getUrlHash(); - - } public Bookmark getBookmark(final String urlHash) throws IOException { try { @@ -214,18 +212,13 @@ public class BookmarksDB { final TreeSet set=new TreeSet(new bookmarkComparator(true)); final String tagHash=BookmarkHelper.tagHash(tagName); final Tag tag=getTag(tagHash); - Set hashes=new HashSet(); - if (tag != null) { - hashes=getTag(tagHash).getUrlHashes(); - } + RowHandleSet hashes = tag == null ? 
new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10) : tag.getUrlHashes(); if (priv) { - set.addAll(hashes); + for (byte[] hash: hashes) set.add(ASCII.String(hash)); } else { - final Iterator it=hashes.iterator(); - Bookmark bm; - while(it.hasNext()){ + for (byte[] hash: hashes) { try { - bm = getBookmark(it.next()); + Bookmark bm = getBookmark(ASCII.String(hash)); if (bm != null && bm.getPublic()) { set.add(bm.getUrlHash()); } @@ -249,7 +242,7 @@ public class BookmarksDB { * retrieve an object of type Tag from the the tagCache, if object is not cached return loadTag(hash) * @param hash an object of type String, containing a tagHash */ - public Tag getTag(final String hash){ + private Tag getTag(final String hash){ return this.tags.get(hash); //null if it does not exists } @@ -257,7 +250,7 @@ public class BookmarksDB { * store a Tag in tagsTable or remove an empty tag * @param tag an object of type Tag to be stored/removed */ - public void putTag(final Tag tag){ + private void putTag(final Tag tag){ if (tag == null) return; if (tag.isEmpty()) { this.tags.remove(tag.getTagHash()); @@ -266,7 +259,7 @@ public class BookmarksDB { } } - public void removeTag(final String hash) { + private void removeTag(final String hash) { this.tags.remove(hash); } @@ -301,7 +294,7 @@ public class BookmarksDB { return set.iterator(); } - public Iterator getTagIterator(final String tagName, final boolean priv, final int comp) { + private Iterator getTagIterator(final String tagName, final boolean priv, final int comp) { final TreeSet set=new TreeSet((comp == SORT_SIZE) ? 
tagSizeComparator : tagComparator); Iterator it=null; final Iterator bit=getBookmarksIterator(tagName, priv); @@ -347,14 +340,14 @@ public class BookmarksDB { final Tag oldTag=getTag(BookmarkHelper.tagHash(oldName)); if (oldTag != null) { - final Set urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag + final RowHandleSet urlHashes = oldTag.getUrlHashes(); // preserve urlHashes of oldTag removeTag(BookmarkHelper.tagHash(oldName)); // remove oldHash from TagsDB Bookmark bookmark; Set tagSet = new TreeSet(String.CASE_INSENSITIVE_ORDER); - for (final String urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName + for (final byte[] urlHash : urlHashes) { // looping through all bookmarks which were tagged with oldName try { - bookmark = getBookmark(urlHash); + bookmark = getBookmark(ASCII.String(urlHash)); tagSet = bookmark.getTags(); tagSet.remove(oldName); bookmark.setTags(tagSet, true); // might not be needed, but doesn't hurt @@ -371,9 +364,9 @@ public class BookmarksDB { public void addTag(final String selectTag, final String newTag) { Bookmark bookmark; - for (final String urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag + for (final byte[] urlHash : getTag(BookmarkHelper.tagHash(selectTag)).getUrlHashes()) { // looping through all bookmarks which were tagged with selectTag try { - bookmark = getBookmark(urlHash); + bookmark = getBookmark(ASCII.String(urlHash)); bookmark.addTag(newTag); saveBookmark(bookmark); } catch (final IOException e) { @@ -389,51 +382,24 @@ public class BookmarksDB { * Subclass of bookmarksDB, which provides the Tag object-type */ public class Tag { - public static final String URL_HASHES = "urlHashes"; - public static final String TAG_NAME = "tagName"; private final String tagHash; - private final Map mem; - private Set urlHashes; - - public Tag(final String hash, final Map map){ - this.tagHash = hash; - 
this.mem = map; - if (this.mem.containsKey(URL_HASHES)) { - this.urlHashes = ListManager.string2set(this.mem.get(URL_HASHES)); - } else { - this.urlHashes = new HashSet(); - } - } + private final String tagName; + private RowHandleSet urlHashes; - public Tag(final String name, final HashSet entries){ + private Tag(final String name) { this.tagHash = BookmarkHelper.tagHash(name); - this.mem = new HashMap(); - //mem.put(URL_HASHES, listManager.arraylist2string(entries)); - this.urlHashes = entries; - this.mem.put(TAG_NAME, name); - } - - public Tag(final String name){ - this(name, new HashSet()); - } - - public Map getMap(){ - this.mem.put(URL_HASHES, ListManager.collection2string(this.urlHashes)); - return this.mem; + this.tagName = name; + this.urlHashes = new RowHandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 10); } /** * get the lowercase Tagname */ public String getTagName(){ - /*if(this.mem.containsKey(TAG_NAME)){ - return (String) this.mem.get(TAG_NAME); - } - return "";*/ return getFriendlyName().toLowerCase(); } - public String getTagHash(){ + private String getTagHash(){ return this.tagHash; } @@ -441,37 +407,33 @@ public class BookmarksDB { * @return the tag name, with all uppercase chars */ public String getFriendlyName(){ - /*if(this.mem.containsKey(TAG_FRIENDLY_NAME)){ - return (String) this.mem.get(TAG_FRIENDLY_NAME); - } - return getTagName();*/ - if(this.mem.containsKey(TAG_NAME)){ - return this.mem.get(TAG_NAME); - } - return "notagname"; + return this.tagName; } - public Set getUrlHashes(){ + private RowHandleSet getUrlHashes(){ return this.urlHashes; } - public boolean hasPublicItems(){ + private boolean hasPublicItems(){ return getBookmarksIterator(getTagName(), false).hasNext(); } - public void addUrl(final String urlHash){ - this.urlHashes.add(urlHash); + private void addUrl(final String urlHash){ + try { + this.urlHashes.put(ASCII.getBytes(urlHash)); + } catch (SpaceExceededException e) { + } } - public 
void delete(final String urlHash){ - this.urlHashes.remove(urlHash); + private void delete(final String urlHash){ + this.urlHashes.remove(ASCII.getBytes(urlHash)); } public int size(){ return this.urlHashes.size(); } - public boolean isEmpty() { + private boolean isEmpty() { return this.urlHashes.isEmpty(); } } @@ -481,27 +443,19 @@ public class BookmarksDB { */ public class Bookmark { - public static final String BOOKMARK_URL = "bookmarkUrl"; + private static final String BOOKMARK_URL = "bookmarkUrl"; public static final String BOOKMARK_TITLE = "bookmarkTitle"; public static final String BOOKMARK_DESCRIPTION = "bookmarkDesc"; - public static final String BOOKMARK_TAGS = "bookmarkTags"; - public static final String BOOKMARK_PUBLIC = "bookmarkPublic"; - public static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp"; - public static final String BOOKMARK_OWNER = "bookmarkOwner"; - public static final String BOOKMARK_IS_FEED = "bookmarkIsFeed"; + private static final String BOOKMARK_TAGS = "bookmarkTags"; + private static final String BOOKMARK_PUBLIC = "bookmarkPublic"; + private static final String BOOKMARK_TIMESTAMP = "bookmarkTimestamp"; + private static final String BOOKMARK_OWNER = "bookmarkOwner"; + private static final String BOOKMARK_IS_FEED = "bookmarkIsFeed"; private final String urlHash; private Set tagNames; private long timestamp; private final Map entry; - public Bookmark(final String urlHash, final Map map) { - this.entry = map; - this.urlHash = urlHash; - this.tagNames = new TreeSet(String.CASE_INSENSITIVE_ORDER); - if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS))); - loadTimestamp(); - } - public Bookmark(final DigestURL url) { this.entry = new HashMap(); this.urlHash = ASCII.String(url.hash()); @@ -529,11 +483,15 @@ public class BookmarksDB { this(new DigestURL((url.indexOf("://") < 0) ? 
"http://" + url : url)); } - public Bookmark(final Map map) throws MalformedURLException { - this(ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()), map); + private Bookmark(final Map map) throws MalformedURLException { + this.entry = map; + this.urlHash = ASCII.String((new DigestURL(map.get(BOOKMARK_URL))).hash()); + this.tagNames = new TreeSet(String.CASE_INSENSITIVE_ORDER); + if (map.containsKey(BOOKMARK_TAGS)) this.tagNames.addAll(ListManager.string2set(map.get(BOOKMARK_TAGS))); + loadTimestamp(); } - Map toMap() { + private Map toMap() { this.entry.put(BOOKMARK_TAGS, ListManager.collection2string(this.tagNames)); this.entry.put(BOOKMARK_TIMESTAMP, String.valueOf(this.timestamp)); return this.entry; @@ -688,11 +646,11 @@ public class BookmarksDB { /** * Subclass of bookmarksDB, which provides the bookmarkIterator object-type */ - public class bookmarkIterator implements Iterator { + private class bookmarkIterator implements Iterator { Iterator bookmarkIter; - public bookmarkIterator(final boolean up) throws IOException { + private bookmarkIterator(final boolean up) throws IOException { //flushBookmarkCache(); //XXX: this will cost performance this.bookmarkIter = BookmarksDB.this.bookmarks.keys(up, false); //this.nextEntry = null; @@ -722,14 +680,14 @@ public class BookmarksDB { /** * Comparator to sort objects of type Bookmark according to their timestamps */ - public class bookmarkComparator implements Comparator { + private class bookmarkComparator implements Comparator { private final boolean newestFirst; /** * @param newestFirst newest first, or oldest first? 
*/ - public bookmarkComparator(final boolean newestFirst){ + private bookmarkComparator(final boolean newestFirst){ this.newestFirst = newestFirst; } @@ -752,13 +710,13 @@ public class BookmarksDB { } } - public static final TagComparator tagComparator = new TagComparator(); - public static final TagSizeComparator tagSizeComparator = new TagSizeComparator(); + private static final TagComparator tagComparator = new TagComparator(); + private static final TagSizeComparator tagSizeComparator = new TagSizeComparator(); /** * Comparator to sort objects of type Tag according to their names */ - public static class TagComparator implements Comparator, Serializable { + private static class TagComparator implements Comparator, Serializable { /** * generated serial @@ -772,7 +730,7 @@ public class BookmarksDB { } - public static class TagSizeComparator implements Comparator, Serializable { + private static class TagSizeComparator implements Comparator, Serializable { /** * generated serial From 81d9e2353217182669a623e1e0503ccc31fbb159 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 7 Nov 2013 11:57:01 +0100 Subject: [PATCH 4/7] fixed another memory leak in the PDF parser: the class org.apache.pdfbox.pdmodel.font.PDFont occupies 8MB of space which cannot be cleaned if PDFont.clearResources is called. The attempt to clean the class cache therefore causes the class to be loaded and this cache to be initialized with some rubbish. I tried to prevent the instantiation of this class by usage of a hacked findLoadedClass call to the SystemClassLoader (which is protected ...). Now, without using the PDF parser at all, 8MB of RAM space is not occupied, however, when the first PDF arrives this space will be taken and never given back to GC. WAKE UP YOU LAZY PDFBOX HACKER AND FIX THIS SHIT! 
--- .../net/yacy/document/parser/pdfParser.java | 73 ++++++++++++------- 1 file changed, 45 insertions(+), 28 deletions(-) diff --git a/source/net/yacy/document/parser/pdfParser.java b/source/net/yacy/document/parser/pdfParser.java index 72181ca7a..d74114180 100644 --- a/source/net/yacy/document/parser/pdfParser.java +++ b/source/net/yacy/document/parser/pdfParser.java @@ -32,27 +32,15 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; +import java.lang.reflect.Method; import java.util.Date; -import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.exceptions.CryptographyException; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; import org.apache.pdfbox.pdmodel.encryption.AccessPermission; import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException; import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial; -import org.apache.pdfbox.pdmodel.font.PDCIDFont; -import org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font; -import org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDMMType1Font; -import org.apache.pdfbox.pdmodel.font.PDSimpleFont; -import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; -import org.apache.pdfbox.pdmodel.font.PDType0Font; -import org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont; -import org.apache.pdfbox.pdmodel.font.PDType1CFont; -import org.apache.pdfbox.pdmodel.font.PDType1Font; -import org.apache.pdfbox.pdmodel.font.PDType3Font; import org.apache.pdfbox.util.PDFTextStripper; import net.yacy.cora.document.id.AnchorURL; @@ -222,25 +210,54 @@ public class pdfParser extends AbstractParser implements Parser { false, docDate)}; } - - @SuppressWarnings("static-access") + public static void clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes() { // thank you very much, PDFParser 
hackers, this font cache will occupy >80MB RAM for a single pdf and then stays forever // AND I DO NOT EVEN NEED A FONT HERE TO PARSE THE TEXT! // Don't be so ignorant, just google once "PDFParser OutOfMemoryError" to feel the pain. - PDFont.clearResources(); - COSName.clearResources(); - PDType1Font.clearResources(); - PDTrueTypeFont.clearResources(); - PDType0Font.clearResources(); - PDType1AfmPfbFont.clearResources(); - PDType3Font.clearResources(); - PDType1CFont.clearResources(); - PDCIDFont.clearResources(); - PDCIDFontType0Font.clearResources(); - PDCIDFontType2Font.clearResources(); - PDMMType1Font.clearResources(); - PDSimpleFont.clearResources(); + ResourceCleaner cl = new ResourceCleaner(); + cl.clearClassResources("org.apache.pdfbox.cos.COSName"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDTrueTypeFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType0Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1AfmPfbFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType3Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDType1CFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFont"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType0Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDCIDFontType2Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDMMType1Font"); + cl.clearClassResources("org.apache.pdfbox.pdmodel.font.PDSimpleFont"); + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + private static class ResourceCleaner { + Method findLoadedClass; + private ClassLoader sys; + public ResourceCleaner() { + try { + this.findLoadedClass = ClassLoader.class.getDeclaredMethod("findLoadedClass", new Class[] { String.class }); + this.findLoadedClass.setAccessible(true); + 
this.sys = ClassLoader.getSystemClassLoader(); + } catch (Throwable e) { + e.printStackTrace(); + this.findLoadedClass = null; + this.sys = null; + } + } + public void clearClassResources(String name) { + if (this.findLoadedClass == null) return; + try { + Object pdfparserpainclass = this.findLoadedClass.invoke(this.sys, name); + if (pdfparserpainclass != null) { + Method clearResources = ((Class) pdfparserpainclass).getDeclaredMethod("clearResources", new Class[] {}); + if (clearResources != null) clearResources.invoke(null); + } + } catch (Throwable e) { + e.printStackTrace(); + } + } } /** From 87a956e8813a1ed8688c80b119a8b4177f20ebca Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Thu, 7 Nov 2013 12:13:12 +0100 Subject: [PATCH 5/7] calculating and showing the number of files and the average size of a file in the HTCACHE in ConfigHTCache_p.html --- htroot/ConfigHTCache_p.html | 2 +- htroot/ConfigHTCache_p.java | 4 +++- source/net/yacy/crawler/data/Cache.java | 8 ++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/htroot/ConfigHTCache_p.html b/htroot/ConfigHTCache_p.html index ae8c851ce..890ae9845 100644 --- a/htroot/ConfigHTCache_p.html +++ b/htroot/ConfigHTCache_p.html @@ -19,7 +19,7 @@
-
#[actualCacheSize]# MB
+
#[actualCacheSize]# MB for #[actualCacheDocCount]# files, #[docSizeAverage]# KB / file on average
MB
 
diff --git a/htroot/ConfigHTCache_p.java b/htroot/ConfigHTCache_p.java index 73141e65a..48d4df623 100644 --- a/htroot/ConfigHTCache_p.java +++ b/htroot/ConfigHTCache_p.java @@ -77,7 +77,9 @@ public class ConfigHTCache_p { } prop.put("HTCachePath", env.getConfig(SwitchboardConstants.HTCACHE_PATH, SwitchboardConstants.HTCACHE_PATH_DEFAULT)); - prop.put("actualCacheSize", (Cache.getActualCacheSize() / 1024 / 1024)); + prop.put("actualCacheSize", Cache.getActualCacheSize() / 1024 / 1024); + prop.put("actualCacheDocCount", Cache.getActualCacheDocCount()); + prop.put("docSizeAverage", Cache.getActualCacheSize() / Math.max(1L, Cache.getActualCacheDocCount()) / 1024); /* Math.max guards against division by zero when the cache is empty */ prop.put("maxCacheSize", env.getConfigLong(SwitchboardConstants.PROXY_CACHE_SIZE, 64)); // return rewrite properties return prop; diff --git a/source/net/yacy/crawler/data/Cache.java b/source/net/yacy/crawler/data/Cache.java index f1d72354f..9973f08a0 100644 --- a/source/net/yacy/crawler/data/Cache.java +++ b/source/net/yacy/crawler/data/Cache.java @@ -182,6 +182,14 @@ public final class Cache { public static long getActualCacheSize() { return fileDBunbuffered.length(); } + + /** + * get the number of documents (entries of the cache file database) currently held in the cache + * @return the entry count as reported by fileDBunbuffered.size() + */ + public static long getActualCacheDocCount() { + return fileDBunbuffered.size(); + } /** * close the databases From a5c1249ee2a59a2a16bb458c3518309662e92f9f Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sat, 9 Nov 2013 01:43:44 +0100 Subject: [PATCH 6/7] reverted autowarming setting in solrconfig --- defaults/solr/solrconfig.xml | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/defaults/solr/solrconfig.xml b/defaults/solr/solrconfig.xml index d8044f969..e0f895152 100644 --- a/defaults/solr/solrconfig.xml +++ b/defaults/solr/solrconfig.xml @@ -463,7 +463,7 @@ -