fixed possible memory leak in htmlScraper: be aware that links can now get lost; further work necessary

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@288 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent 38747857c2
commit a25b5b4986

@@ -360,6 +360,29 @@
<!-- run YaCy (needed for NetBeans4) -->
<target name="run" description="run YaCy">
<!-- debug options:
java -Xrunhprof:help
Hprof usage: -Xrunhprof[:help]|[:<option>=<value>, ...]
Option Name and Value Description Default
_____________________ ___________ _______
heap=dump|sites|all heap profiling all
cpu=samples|times|old CPU usage off
monitor=y|n monitor contention n
format=a|b ascii or binary output a
file=<file> write data to file java.hprof(.txt for ascii)
net=<host>:<port> send data over a socket write to file
depth=<size> stack trace depth 4
cutoff=<value> output cutoff point 0.0001
lineno=y|n line number in traces? y
thread=y|n thread in traces? n
doe=y|n dump on exit? y
gc_okay=y|n GC okay during sampling y
Example: java -Xrunhprof:cpu=samples,file=log.txt,depth=3 FooClass
Note: format=b cannot be used with cpu=old|times
-->
<java classname="yacy" fork="yes">
<classpath>
<pathelement location="${build}"/>
@@ -369,8 +392,9 @@
<pathelement location="${libx}" />
<fileset dir="${libx}" includes="**/*.jar" />
</classpath>
<!--<arg value="-Xrunhprof"/>-->
<arg line="-start"/>
<!-- <arg line="-migratewords"/> -->
<!-- <arg line="-migratewords"/>-->
<!-- <arg line="-start ${user.dir}"/>-->
</java>
</target>

@@ -104,11 +104,11 @@ public class IndexCreate_p {
boolean crawlOrder = ((String) post.get("crawlOrder", "")).equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = ((String) post.get("xsstopw", "")).equals("on");
env.setConfig("xsstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
boolean xdstopw = ((String) post.get("xdstopw", "")).equals("on");
env.setConfig("xdstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
env.setConfig("xpstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
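
The three fixes above correct a copy-paste slip: every stopword flag was written to the config from crawlOrder instead of its own variable. A minimal sketch of a helper that reads the checkbox and mirrors it into the config under the same key, so form field and config key cannot diverge (plain java.util maps stand in for YaCy's serverObjects/serverSwitch types):

import java.util.HashMap;
import java.util.Map;

// Sketch only: Map stands in for YaCy's serverObjects/serverSwitch.
final class CheckboxFlag {
    // read an "on"-valued checkbox and mirror it into the config under the same key
    static boolean setFlag(Map<String, String> post, Map<String, String> config, String key) {
        boolean value = "on".equals(post.getOrDefault(key, ""));
        config.put(key, value ? "true" : "false");
        return value;
    }

    public static void main(String[] args) {
        Map<String, String> post = new HashMap<>();
        post.put("xsstopw", "on");
        Map<String, String> config = new HashMap<>();
        boolean xsstopw = setFlag(post, config, "xsstopw"); // true
        boolean xdstopw = setFlag(post, config, "xdstopw"); // false; cannot inherit xsstopw's value
        System.out.println(config); // {xsstopw=true, xdstopw=false}
    }
}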

@@ -46,11 +46,11 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
public boolean isTag0(String tag) {
return tags0.contains(tag);
return (tags0 != null) && (tags0.contains(tag));
}
public boolean isTag1(String tag) {
return tags1.contains(tag);
return (tags1 != null) && (tags1.contains(tag));
}
//the 'missing' method that shall be implemented:
@@ -405,4 +405,14 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
return convertUmlaute(transscriptAll(stripAllTags(bb)));
}
public void close() {
// free resources
tags0 = null;
tags1 = null;
}
public void finalize() {
close();
}
}
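
The null guards and the close()/finalize() pair above belong together: close() releases tags0/tags1, finalize() merely delegates, and isTag0/isTag1 must therefore return false rather than throw a NullPointerException if a call arrives after close(). The whole idiom as a self-contained sketch (names hypothetical):

import java.util.Set;

// Sketch of the release-and-guard idiom: fields are nulled in close(),
// every accessor guards against the nulled state, and finalize() only
// delegates so an unreferenced scraper still frees its sets eventually.
abstract class TagScraperSketch {
    private Set<String> tags0;   // singleton tags, e.g. <img>
    private Set<String> tags1;   // paired tags, e.g. <a>...</a>

    TagScraperSketch(Set<String> tags0, Set<String> tags1) {
        this.tags0 = tags0;
        this.tags1 = tags1;
    }

    boolean isTag0(String tag) {
        return tags0 != null && tags0.contains(tag);  // false after close()
    }

    boolean isTag1(String tag) {
        return tags1 != null && tags1.contains(tag);
    }

    void close() {
        // free resources; the guards above keep late callers safe
        tags0 = null;
        tags1 = null;
    }

    protected void finalize() {
        close();  // GC-triggered fallback only; timing is not guaranteed
    }
}

finalize() runs only if and when the garbage collector reclaims the object, so it is a fallback; the deterministic path is the explicit close() that this commit also adds to the htmlFilterScraper and htmlFilterTransformer interfaces.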

@@ -78,4 +78,14 @@ public abstract class htmlFilterAbstractTransformer implements htmlFilterTransformer {
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}
public void close() {
// free resources
tags0 = null;
tags1 = null;
}
public void finalize() {
close();
}
}

@@ -122,10 +122,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")),
new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("a")) && (text.length < 2048)) {
byte[] a = super.stripAll(new serverByteBuffer(text)).getBytes();
anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(a).trim().toString());
}
if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
}
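
These size caps (2048 bytes of anchor text, 512 for headline and title) are what the commit message warns about: a tag whose text exceeds the limit is now skipped entirely, so its link is lost. One possible follow-up, sketched here with hypothetical names, would clip the text and keep the anchor:

// Hypothetical variant: clip oversized anchor text instead of dropping the anchor.
final class AnchorTextCap {
    static final int MAX_ANCHOR_TEXT = 2048; // mirrors the limit introduced above

    static byte[] truncate(byte[] text, int max) {
        if (text.length <= max) return text; // small tags pass through untouched
        byte[] head = new byte[max];
        System.arraycopy(text, 0, head, 0, max);
        return head; // bounded copy; the href itself is still registered
    }
}

Applied inside scrapeTag1 this would still register the href while bounding the retained text; a complete version would also cut at a character boundary to avoid splitting multi-byte characters.
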
@@ -161,6 +163,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
return images;
}
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}
public void print() {
System.out.println("TITLE :" + title);
System.out.println("HEADLINE:" + headline);

@@ -128,4 +128,11 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}
}

@@ -434,13 +434,15 @@ public final class htmlFilterOutputStream extends OutputStream {
if (out != null) out.flush();
// if you want to flush all, call close() at end of writing;
}
private byte[] finalized = null;
public void finalize() throws IOException {
// if we are forced to close, we of course flush the buffer first,
// then close the connection
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
close();
}
public void close() throws IOException {
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
if (buffer != null) {
if (buffer.length() > 0) {
byte[] filtered = filterSentence(buffer.getBytes(), quotechar);
@@ -448,18 +450,19 @@ public final class htmlFilterOutputStream extends OutputStream {
}
buffer = null;
}
finalized = filterFinalize(quotechar);
}
public void close() throws IOException {
finalize();
byte[] finalized = filterFinalize(quotechar);
if (out != null) {
if (finalized != null) out.write(finalized);
out.flush();
out.close();
}
filterTag = null;
filterOpts = null;
filterCont = null;
//if (scraper != null) {scraper.close(); scraper = null;}
//if (transformer != null) {transformer.close(); transformer = null;}
}
private static boolean binaryHint(byte b) {
if (b < 0) return false;
if (b > 31) return false;
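
This hunk inverts the earlier, backwards relationship in which close() called finalize() and the flush logic lived in finalize(). The corrected shape, with all work in close() and finalize() only delegating, as a self-contained sketch with hypothetical names:

import java.io.IOException;
import java.io.OutputStream;

// Sketch: close() owns flushing and release; finalize() is only a safety net.
final class FilterStreamSketch extends OutputStream {
    private OutputStream out;
    private StringBuilder buffer = new StringBuilder();

    FilterStreamSketch(OutputStream out) { this.out = out; }

    @Override
    public void write(int b) throws IOException {
        buffer.append((char) b); // stand-in for the real tag-filtering logic
    }

    @Override
    public void close() throws IOException {
        if (buffer != null && buffer.length() > 0 && out != null) {
            out.write(buffer.toString().getBytes()); // flush pending data first
        }
        buffer = null; // free resources; safe to call close() twice
        if (out != null) {
            out.flush();
            out.close();
            out = null;
        }
    }

    @Override
    protected void finalize() throws IOException {
        close(); // delegation only, never the primary cleanup path
    }
}

Since Java 7, try-with-resources makes the explicit close() automatic; finalize() survives in such designs only as a last-resort safety net.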

@@ -53,5 +53,7 @@ public interface htmlFilterScraper {
public void scrapeTag0(String tagname, Properties tagopts);
public void scrapeTag1(String tagname, Properties tagopts, byte[] text);
public void close();
}

@@ -73,4 +73,5 @@ public interface htmlFilterTransformer {
// method that is called when a body-containing text occurs
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar);
public void close();
}

@@ -284,35 +284,35 @@ public final class plasmaParser {
if (mimeTypeSet != null) {
Iterator mimeTypes = mimeTypeSet.iterator();
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
Parser theParser = null;
try {
// getting the parser
theParser = (Parser) plasmaParser.theParserPool.borrowObject(availableParserList.get(mimeType));
// getting a list of mimeTypes that the parser supports
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
if (parserSupportsMimeTypes != null) {
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
newSupportedFileExt.addAll(Arrays.asList(extArray));
}
}
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));
} catch (Exception e) {
} catch (Exception e) {
e.printStackTrace();
} finally {
if (theParser != null)
if (theParser != null)
try { plasmaParser.theParserPool.returnObject(mimeType,theParser); } catch (Exception e) {}
}
}
}
}
}
}
synchronized (enabledParserList) {
@@ -392,7 +392,7 @@ public final class plasmaParser {
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
if (!(theParser instanceof Parser)) continue;
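
For context, this is plasmaParser's plugin mechanism: the class name is derived from the directory layout, loaded reflectively, and kept only if it implements Parser. The same pattern as a self-contained sketch (using the modern reflection API; the original calls the since-deprecated Class.newInstance()):

// Sketch of the reflective parser loading: resolve by name, instantiate,
// and type-check against the plugin interface before use.
interface Parser { /* plugin contract, see plasmaParser */ }

final class ParserLoader {
    static Parser load(String fullClassName) {
        try {
            Class<?> parserClass = Class.forName(fullClassName);
            Object candidate = parserClass.getDeclaredConstructor().newInstance();
            return (candidate instanceof Parser) ? (Parser) candidate : null;
        } catch (ReflectiveOperationException e) {
            return null; // unknown class or failed construction: skip this entry
        }
    }
}
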
@@ -458,11 +458,13 @@ public final class plasmaParser {
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
hfos.write(source);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
@@ -487,14 +489,14 @@ public final class plasmaParser {
// ...otherwise we make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(sourceFile, hfos);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
// e.printStackTrace();
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
@@ -505,11 +507,14 @@ public final class plasmaParser {
public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {
return new plasmaParserDocument(new URL(urlNormalform(location)),
plasmaParserDocument ppd = new plasmaParserDocument(new URL(urlNormalform(location)),
mimeType, null, null, scraper.getHeadline(),
null, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
//scraper.close();
return ppd;
} catch (MalformedURLException e) {
//e.printStackTrace();
return null;
}
}
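
The still-commented scraper.close() marks the open problem here: releasing the scraper too early could strip data the freshly built plasmaParserDocument still references. A conservative shape, sketched with hypothetical names, is copy-then-close:

import java.util.HashMap;
import java.util.Map;

// Sketch: copy scraped data into the document before releasing the scraper.
final class ScrapedDoc {
    final Map<String, String> anchors;

    ScrapedDoc(Map<String, String> scrapedAnchors) {
        this.anchors = new HashMap<>(scrapedAnchors); // defensive copy: document owns its data
    }
}

// usage sketch:
//   ScrapedDoc doc = new ScrapedDoc(scraper.getAnchors());
//   scraper.close(); // safe now: doc shares no state with the scraper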

@@ -445,7 +445,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
wordIndex.close(waitingBoundSeconds);
log.logSystem("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager");
try {
cacheLoader.close();
cacheLoader.close();
wikiDB.close();
messageDB.close();
facilityDB.close();
@@ -468,10 +468,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
//return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}
/*
public int lUrlSize() {
return urlPool.loadedURL.size();
}
*/
public int cacheSizeMin() {
return wordIndex.size();
}
@@ -765,7 +767,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
log.logDebug("processResourceStack processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG
// parse content
plasmaParserDocument document;
plasmaParserDocument document = null;
if (plasmaParser.supportedMimeTypesContains(entry.responseHeader.mime())) {
if (entry.scraper != null) {

@@ -184,36 +184,43 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
long startTime = System.currentTimeMillis();
long messageTime = System.currentTimeMillis() + 5000;
long urlCount = 0, urlsPerSecond = 0;
synchronized (cache) {
Iterator i = dumpStack.iterator();
kelondroRecords.Node node;
String wordHash;
plasmaWordIndexEntryContainer container;
long creationTime;
plasmaWordIndexEntry wordEntry;
byte[][] row;
while (i.hasNext()) {
// get out one entry
node = (kelondroRecords.Node) i.next();
row = node.getValues();
wordHash = new String(row[0]);
creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4]));
// store to cache
addEntry(wordHash, wordEntry, creationTime);
urlCount++;
// write a log
if (System.currentTimeMillis() > messageTime) {
urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime);
log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpStack.size() - urlCount) / urlsPerSecond) + " seconds remaining");
messageTime = System.currentTimeMillis() + 5000;
try {
synchronized (cache) {
Iterator i = dumpStack.iterator();
kelondroRecords.Node node;
String wordHash;
plasmaWordIndexEntryContainer container;
long creationTime;
plasmaWordIndexEntry wordEntry;
byte[][] row;
while (i.hasNext()) {
// get out one entry
node = (kelondroRecords.Node) i.next();
row = node.getValues();
wordHash = new String(row[0]);
creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4]));
// store to cache
addEntry(wordHash, wordEntry, creationTime);
urlCount++;
// write a log
if (System.currentTimeMillis() > messageTime) {
urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime);
log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpStack.size() - urlCount) / urlsPerSecond) + " seconds remaining");
messageTime = System.currentTimeMillis() + 5000;
}
}
}
dumpStack.close();
log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
} catch (kelondroException e) {
// restore failed
log.logError("restore of indexCache dump failed: " + e.getMessage());
e.printStackTrace();
}
dumpStack.close();
log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
return urlCount;
}
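
Wrapping the replay in try/catch turns a corrupt dump from a startup crash into a partial restore: everything added before the kelondroException stays in the cache, the failure is logged, and the count of restored entries is still returned. The pattern reduced to a self-contained sketch:

import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;

// Sketch of the partial-restore pattern used above.
final class RestoreSketch {
    static long restore(Iterator<String> records, List<String> cache, Logger log) {
        long count = 0;
        try {
            while (records.hasNext()) {
                cache.add(records.next()); // stand-in for addEntry(wordHash, wordEntry, ...)
                count++;
            }
        } catch (RuntimeException e) { // stand-in for kelondroException
            log.warning("restore failed after " + count + " entries: " + e.getMessage());
        }
        return count; // partial restores still report what survived
    }
}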

@@ -74,7 +74,7 @@ public final class serverLog {
public serverLog(String appName) {
this.theLogger = Logger.getLogger(appName);
this.theLogger.setLevel(Level.FINEST); // set a default level
//this.theLogger.setLevel(Level.FINEST); // set a default level
}
public void setLevel(Level newLevel) {

@@ -383,16 +383,17 @@ public final class serverCore extends serverAbstractThread implements serverThread {
// wait for new connection
announceThreadBlockApply();
Socket controlSocket = this.socket.accept();
announceThreadBlockRelease();
String cIP = clientAddress(controlSocket);
//System.out.println("server bfHosts=" + bfHost.toString());
if (bfHost.get(cIP) != null) {
this.log.logInfo("SLOWING DOWN ACCESS FOR BRUTE-FORCE PREVENTION FROM " + cIP);
// add a delay to make brute-force harder
announceThreadBlockApply();
try {Thread.currentThread().sleep(3000);} catch (InterruptedException e) {}
announceThreadBlockRelease();
}
if ((this.denyHost == null) || (this.denyHost.get(cIP) == null)) {
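
The announceThreadBlockApply()/announceThreadBlockRelease() calls now also bracket the punitive sleep, so the thread bookkeeping can account that pause as deliberate blocking rather than work. The throttle itself as a self-contained sketch:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Sketch of the brute-force throttle: known offenders pay a fixed delay
// before their connection is processed further.
final class BruteForceThrottle {
    private final Map<String, Long> offenders = new ConcurrentHashMap<>();

    void markOffender(String ip) {
        offenders.put(ip, System.currentTimeMillis());
    }

    // called once per accepted connection, before the request is handled
    void delayIfSuspicious(String ip) {
        if (offenders.containsKey(ip)) {
            try {
                Thread.sleep(3000); // make password guessing expensive
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // preserve interrupt status
            }
        }
    }
}

A side note: the original calls Thread.currentThread().sleep(3000), which invokes the static Thread.sleep through an instance; the sketch calls it statically.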

@@ -359,7 +359,7 @@ public class yacyCore {
if (i == 0) Thread.currentThread().sleep(2000); // after the first time wait some seconds
Thread.currentThread().sleep(1000 + 500 * v.size()); // wait a while
} catch (InterruptedException e) {}
// check all threads
for (int j = 0; j < v.size(); j++) {
t = (publishThread) v.elementAt(j);

@@ -102,7 +102,8 @@ public class yacyPeerActions {
seedDB.mySeed.put("ISpeed", "unknown"); // the speed of indexing (words/minute) of the peer
long uptime = ((yacyCore.universalTime() - Long.parseLong(sb.getConfig("startupTime", "0"))) / 1000) / 60;
seedDB.mySeed.put("Uptime", "" + uptime); // the number of minutes that the peer is up in minutes/day (moving average MA30)
seedDB.mySeed.put("LCount", "" + sb.lUrlSize()); // the number of links that the peer has stored (LURL's)
seedDB.mySeed.put("LCount", "" + sb.urlPool.loadedURL.size()); // the number of links that the peer has stored (LURL's)
seedDB.mySeed.put("NCount", "" + sb.urlPool.noticeURL.stackSize()); // the number of links that the peer has noticed, but not loaded (NURL's)
seedDB.mySeed.put("ICount", "" + sb.cacheSizeMin()); // the minimum number of words that the peer has indexed (as it says)
seedDB.mySeed.put("SCount", "" + seedDB.sizeConnected()); // the number of seeds that the peer has stored
seedDB.mySeed.put("CCount", "" + (((int) ((seedDB.sizeConnected() + seedDB.sizeDisconnected() + seedDB.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)

@@ -102,6 +102,7 @@ public class yacySeed {
dna.put("ISpeed", "0"); // the speed of indexing (words/minute) of the peer
dna.put("Uptime", "0"); // the number of minutes that the peer is up in minutes/day (moving average MA30)
dna.put("LCount", "0"); // the number of links that the peer has stored (LURL's)
dna.put("NCount", "0"); // the number of links that the peer has noticed, but not loaded (NURL's)
dna.put("ICount", "0"); // the number of words that the peer has indexed (as it says)
dna.put("SCount", "0"); // the number of seeds that the peer has stored
dna.put("CCount", "0"); // the number of clients that the peer connects (as connects/hour)

@@ -628,8 +628,8 @@ public final class yacy {
// application wrapper
public static void main(String args[]) {
String applicationRoot = System.getProperty("user.dir");
//System.out.println("args.length=" + args.length);
//System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
System.out.println("args.length=" + args.length);
System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
if ((args.length >= 1) && ((args[0].equals("-startup")) || (args[0].equals("-start")))) {
// normal start-up of yacy
if (args.length == 2) applicationRoot= args[1];
