From 3d474a843e2048915f0a651d8a97401fabef4fca Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Sun, 9 Feb 2014 12:36:56 +0100 Subject: [PATCH] added memory protection for postprocessing --- source/net/yacy/search/Switchboard.java | 37 +++++++++++-------- source/net/yacy/search/index/Segment.java | 4 ++ .../schema/CollectionConfiguration.java | 2 + 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2d3c8f4e9..09a6abb30 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2015,6 +2015,25 @@ public final class Switchboard extends serverSwitch { // do nothing } + public static void clearCaches() { + // flush caches in used libraries + pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu + + // clear caches + if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords(); + Word.clearCache(); + // Domains.clear(); + FieldCache.DEFAULT.purgeAllCaches(); + + // clean up image stack + ResultImages.clearQueues(); + + // flush the document compressor cache + Cache.commit(); + Digest.cleanup(); // don't let caches become permanent memory leaks + + } + public int cleanupJobSize() { int c = 1; // "es gibt immer was zu tun" if ( (this.crawlQueues.delegatedURL.size() > 1000) ) { @@ -2040,21 +2059,7 @@ public final class Switchboard extends serverSwitch { ConcurrentLog.ensureWorkerIsRunning(); try { - // flush caches in used libraries - pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu - - // clear caches - if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords(); - Word.clearCache(); - // Domains.clear(); - FieldCache.DEFAULT.purgeAllCaches(); - - // clean up image stack - ResultImages.clearQueues(); - - // flush the document compressor cache - Cache.commit(); - Digest.cleanup(); // don't let caches become permanent memory leaks + clearCaches(); // clear caches if necessary if ( !MemoryControl.request(128000000L, false) ) { @@ -2304,7 +2309,7 @@ public final class Switchboard extends serverSwitch { Fulltext fulltext = index.fulltext(); CollectionConfiguration collection1Configuration = fulltext.getDefaultConfiguration(); WebgraphConfiguration webgraphConfiguration = fulltext.getWebgraphConfiguration(); - if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) { + if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) && MemoryControl.request(256000000L, false) && Memory.load() < 1.0f) { // we optimize first because that is useful for postprocessing int proccount = 0; diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 3ad3f427f..e3a5ac687 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -76,6 +76,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.ISO639; +import net.yacy.kelondro.util.MemoryControl; import net.yacy.kelondro.workflow.WorkflowProcessor; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.StorageQueueEntry; @@ -304,6 +305,7 @@ public class Segment { } public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException { ReferenceReport rr = cache.get(id); + if (MemoryControl.shortStatus()) cache.clear(); if (rr != null) return rr; try { rr = new ReferenceReport(id, acceptSelfReference); @@ -329,6 +331,7 @@ public class Segment { } public int getClickdepth(final DigestURL url, int maxtime) throws IOException { Integer clickdepth = cache.get(url.hash()); + if (MemoryControl.shortStatus()) cache.clear(); if (clickdepth != null) { //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT"); return clickdepth.intValue(); @@ -386,6 +389,7 @@ public class Segment { SolrDocument doc; try { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { + if (MemoryControl.shortStatus()) break; String refid = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName()); if (refid == null) continue; byte[] refidh = ASCII.getBytes(refid); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index c6eef588c..eb1080e4a 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -1059,6 +1059,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Set uniqueURLs = new HashSet(); try { long count = collectionConnector.getCountByQuery(query); + long start = System.currentTimeMillis(); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey); BlockingQueue docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100); int countcheck = 0; @@ -1117,6 +1118,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri collectionConnector.add(sid); proccount++; + if (proccount % 100 == 0) ConcurrentLog.info("CollectionConfiguration", "postprocessed " + proccount + " from " + count + " documents; " + (proccount * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " + ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining"); } catch (final Throwable e1) { ConcurrentLog.logException(e1); }