added memory protection for postprocessing

pull/1/head
Michael Peter Christen 11 years ago
parent 412d55523c
commit 3d474a843e

@@ -2015,6 +2015,25 @@ public final class Switchboard extends serverSwitch {
         // do nothing
     }
+    public static void clearCaches() {
+        // flush caches in used libraries
+        pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu
+        // clear caches
+        if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
+        Word.clearCache();
+        // Domains.clear();
+        FieldCache.DEFAULT.purgeAllCaches();
+        // clean up image stack
+        ResultImages.clearQueues();
+        // flush the document compressor cache
+        Cache.commit();
+        Digest.cleanup(); // don't let caches become permanent memory leaks
+    }
     public int cleanupJobSize() {
         int c = 1; // "es gibt immer was zu tun" (German: "there is always something to do")
         if ( (this.crawlQueues.delegatedURL.size() > 1000) ) {
@@ -2040,21 +2059,7 @@ public final class Switchboard extends serverSwitch {
         ConcurrentLog.ensureWorkerIsRunning();
         try {
-            // flush caches in used libraries
-            pdfParser.clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes(); // eats up megabytes, see http://markmail.org/thread/quk5odee4hbsauhu
-            // clear caches
-            if (WordCache.sizeCommonWords() > 1000) WordCache.clearCommonWords();
-            Word.clearCache();
-            // Domains.clear();
-            FieldCache.DEFAULT.purgeAllCaches();
-            // clean up image stack
-            ResultImages.clearQueues();
-            // flush the document compressor cache
-            Cache.commit();
-            Digest.cleanup(); // don't let caches become permanent memory leaks
+            clearCaches();
             // clear caches if necessary
             if ( !MemoryControl.request(128000000L, false) ) {
@@ -2304,7 +2309,7 @@ public final class Switchboard extends serverSwitch {
         Fulltext fulltext = index.fulltext();
         CollectionConfiguration collection1Configuration = fulltext.getDefaultConfiguration();
         WebgraphConfiguration webgraphConfiguration = fulltext.getWebgraphConfiguration();
-        if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL)) {
+        if (!this.crawlJobIsPaused(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL) && MemoryControl.request(256000000L, false) && Memory.load() < 1.0f) {
             // we optimize first because that is useful for postprocessing
             int proccount = 0;
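
The changed condition only lets the postprocessing pass start when roughly 256 MB of heap can still be requested and the short-term system load is below 1.0. A minimal standalone sketch of the same guard pattern using only standard JDK calls; the class name, the headroom heuristic, and the thresholds are illustrative stand-ins for YaCy's MemoryControl.request and Memory.load, not YaCy API:

    import java.lang.management.ManagementFactory;

    public class MemoryGuard {
        // Rough stand-in for MemoryControl.request(bytes, force): true if
        // approximately 'bytes' of heap headroom is still available.
        static boolean request(long bytes) {
            Runtime rt = Runtime.getRuntime();
            long available = rt.maxMemory() - rt.totalMemory() + rt.freeMemory();
            return available >= bytes;
        }

        // Rough stand-in for Memory.load(): the one-minute system load
        // average (returns -1.0 on platforms where it is unavailable).
        static double load() {
            return ManagementFactory.getOperatingSystemMXBean().getSystemLoadAverage();
        }

        public static void main(String[] args) {
            if (request(256000000L) && load() < 1.0) {
                System.out.println("enough headroom: run the postprocessing pass");
            } else {
                System.out.println("low memory or high load: skip this cycle");
            }
        }
    }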

@@ -76,6 +76,7 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
 import net.yacy.kelondro.rwi.ReferenceFactory;
 import net.yacy.kelondro.util.Bitfield;
 import net.yacy.kelondro.util.ISO639;
+import net.yacy.kelondro.util.MemoryControl;
 import net.yacy.kelondro.workflow.WorkflowProcessor;
 import net.yacy.repository.LoaderDispatcher;
 import net.yacy.search.StorageQueueEntry;
@@ -304,6 +305,7 @@ public class Segment {
     }
     public ReferenceReport getReferenceReport(final byte[] id, final boolean acceptSelfReference) throws IOException {
         ReferenceReport rr = cache.get(id);
+        if (MemoryControl.shortStatus()) cache.clear();
         if (rr != null) return rr;
         try {
             rr = new ReferenceReport(id, acceptSelfReference);
@@ -329,6 +331,7 @@ public class Segment {
     }
     public int getClickdepth(final DigestURL url, int maxtime) throws IOException {
         Integer clickdepth = cache.get(url.hash());
+        if (MemoryControl.shortStatus()) cache.clear();
         if (clickdepth != null) {
             //ConcurrentLog.info("Segment", "get clickdepth of url " + url.toNormalform(true) + ": " + clickdepth + " CACHE HIT");
             return clickdepth.intValue();
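
Both Segment lookups above now drop their whole cache as soon as MemoryControl.shortStatus() reports memory pressure, trading recomputation for immediate headroom; the cached value is read before the clear, so a hit that was already fetched is still returned. A generic sketch of that pattern, assuming a map-backed cache and a hypothetical shortStatus() headroom heuristic in place of YaCy's MemoryControl:

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.function.Function;

    public class PressureSensitiveCache<K, V> {
        private final Map<K, V> cache = new ConcurrentHashMap<>();

        // Hypothetical stand-in for MemoryControl.shortStatus():
        // true when less than ~10% of the maximum heap is still available.
        private static boolean shortStatus() {
            Runtime rt = Runtime.getRuntime();
            long available = rt.maxMemory() - rt.totalMemory() + rt.freeMemory();
            return available < rt.maxMemory() / 10;
        }

        public V get(K key, Function<K, V> compute) {
            V v = cache.get(key);
            // same ordering as the patch: read first, then clear under pressure
            if (shortStatus()) cache.clear();
            if (v != null) return v;
            v = compute.apply(key);
            if (v != null) cache.put(key, v); // ConcurrentHashMap rejects null values
            return v;
        }
    }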
@@ -386,6 +389,7 @@ public class Segment {
         SolrDocument doc;
         try {
             while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {
+                if (MemoryControl.shortStatus()) break;
                 String refid = (String) doc.getFieldValue(WebgraphSchema.source_id_s.getSolrFieldName());
                 if (refid == null) continue;
                 byte[] refidh = ASCII.getBytes(refid);
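
The loop change above abandons the remaining queued documents under memory pressure instead of draining the queue to its end. A compact sketch of the same bail-out pattern for a poison-terminated BlockingQueue; the POISON sentinel and shortStatus() are illustrative stand-ins for AbstractSolrConnector.POISON_DOCUMENT and MemoryControl.shortStatus():

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    public class DrainUntilPressure {
        // end-of-stream marker, compared by reference like POISON_DOCUMENT
        static final String POISON = new String("POISON");

        static boolean shortStatus() { // hypothetical pressure heuristic
            Runtime rt = Runtime.getRuntime();
            return rt.maxMemory() - rt.totalMemory() + rt.freeMemory() < rt.maxMemory() / 10;
        }

        static void drain(BlockingQueue<String> docs) throws InterruptedException {
            String doc;
            while ((doc = docs.take()) != POISON) {
                if (shortStatus()) break; // give up on the remaining entries
                System.out.println("processed " + doc);
            }
        }

        public static void main(String[] args) throws InterruptedException {
            BlockingQueue<String> docs = new LinkedBlockingQueue<>();
            docs.add("doc-1");
            docs.add("doc-2");
            docs.add(POISON);
            drain(docs);
        }
    }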

@@ -1059,6 +1059,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
         Set<String> uniqueURLs = new HashSet<String>();
         try {
             long count = collectionConnector.getCountByQuery(query);
+            long start = System.currentTimeMillis();
             ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the collection for harvestkey " + harvestkey);
             BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(query, 0, 10000000, 1800000, 100);
             int countcheck = 0;
@@ -1117,6 +1118,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
                     collectionConnector.add(sid);
                     proccount++;
+                    if (proccount % 100 == 0) ConcurrentLog.info("CollectionConfiguration", "postprocessed " + proccount + " from " + count + " documents; " + (proccount * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " + ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining");
                 } catch (final Throwable e1) {
                     ConcurrentLog.logException(e1);
                 }
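
The added progress line computes throughput as proccount * 1000 / elapsed milliseconds and estimates the time left by scaling the elapsed time per processed document to the documents still outstanding (integer division, so both values truncate). A worked example with fixed illustrative numbers:

    public class EtaExample {
        public static void main(String[] args) {
            long elapsedMs = 40000L;  // time since 'start'
            long count = 10000L;      // documents selected for postprocessing
            long proccount = 2000L;   // documents already postprocessed
            long docsPerSecond = proccount * 1000L / elapsedMs;                           // 2000*1000/40000 = 50
            long minutesRemaining = elapsedMs * (count - proccount) / proccount / 60000L; // 40000*8000/2000/60000 = 2
            System.out.println(docsPerSecond + " docs/second; " + minutesRemaining + " minutes remaining");
        }
    }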
