fixed possible memory leak in htmlScraper: be aware that links can now get lost; further work necessary

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@288 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 20 years ago
parent 38747857c2
commit a25b5b4986

@@ -360,6 +360,29 @@
<!-- run YaCy (needed for NetBeans4) -->
<target name="run" description="run YaCy">
<!-- debug options:
java -Xrunhprof:help
Hprof usage: -Xrunhprof[:help]|[:<option>=<value>, ...]
Option Name and Value Description Default
_____________________ ___________ _______
heap=dump|sites|all heap profiling all
cpu=samples|times|old CPU usage off
monitor=y|n monitor contention n
format=a|b ascii or binary output a
file=<file> write data to file java.hprof(.txt for ascii)
net=<host>:<port> send data over a socket write to file
depth=<size> stack trace depth 4
cutoff=<value> output cutoff point 0.0001
lineno=y|n line number in traces? y
thread=y|n thread in traces? n
doe=y|n dump on exit? y
gc_okay=y|n GC okay during sampling y
Example: java -Xrunhprof:cpu=samples,file=log.txt,depth=3 FooClass
Note: format=b cannot be used with cpu=old|times
-->
<java classname="yacy" fork="yes">
<classpath>
<pathelement location="${build}"/>
@@ -369,8 +392,9 @@
<pathelement location="${libx}" />
<fileset dir="${libx}" includes="**/*.jar" />
</classpath>
<!--<arg value="-Xrunhprof"/>-->
<arg line="-start"/>
<!-- <arg line="-migratewords"/> -->
<!-- <arg line="-migratewords"/>-->
<!-- <arg line="-start ${user.dir}"/>-->
</java>
</target>

@@ -104,11 +104,11 @@ public class IndexCreate_p {
boolean crawlOrder = ((String) post.get("crawlOrder", "")).equals("on");
env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
boolean xsstopw = ((String) post.get("xsstopw", "")).equals("on");
env.setConfig("xsstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
boolean xdstopw = ((String) post.get("xdstopw", "")).equals("on");
env.setConfig("xdstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
boolean xpstopw = ((String) post.get("xpstopw", "")).equals("on");
env.setConfig("xpstopw", (crawlOrder) ? "true" : "false");
env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
String crawlingStart = (String) post.get("crawlingURL");
if (!(crawlingStart.startsWith("http"))) crawlingStart = "http://" + crawlingStart;
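
The three fixes above correct a copy-paste slip: every stopword flag was written to the config from crawlOrder instead of its own variable. A minimal sketch of a helper that reads the checkbox and mirrors it into the config under the same key, so form field and config key cannot diverge (plain java.util maps stand in for YaCy's serverObjects/serverSwitch types):

import java.util.HashMap;
import java.util.Map;

// Sketch only: Map stands in for YaCy's serverObjects/serverSwitch.
final class CheckboxFlag {
    // read an "on"-valued checkbox and mirror it into the config under the same key
    static boolean setFlag(Map<String, String> post, Map<String, String> config, String key) {
        boolean value = "on".equals(post.getOrDefault(key, ""));
        config.put(key, value ? "true" : "false");
        return value;
    }

    public static void main(String[] args) {
        Map<String, String> post = new HashMap<>();
        post.put("xsstopw", "on");
        Map<String, String> config = new HashMap<>();
        boolean xsstopw = setFlag(post, config, "xsstopw"); // true
        boolean xdstopw = setFlag(post, config, "xdstopw"); // false; cannot inherit xsstopw's value
        System.out.println(config); // {xsstopw=true, xdstopw=false}
    }
}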

@@ -46,11 +46,11 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
}
public boolean isTag0(String tag) {
return tags0.contains(tag);
return (tags0 != null) && (tags0.contains(tag));
}
public boolean isTag1(String tag) {
return tags1.contains(tag);
return (tags1 != null) && (tags1.contains(tag));
}
//the 'missing' method that shall be implemented:
@@ -405,4 +405,14 @@ public abstract class htmlFilterAbstractScraper implements htmlFilterScraper {
return convertUmlaute(transscriptAll(stripAllTags(bb)));
}
public void close() {
// free resources
tags0 = null;
tags1 = null;
}
public void finalize() {
close();
}
}
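
The null guards and the close()/finalize() pair above belong together: close() releases tags0/tags1, finalize() merely delegates, and isTag0/isTag1 must therefore return false rather than throw a NullPointerException if a call arrives after close(). The whole idiom as a self-contained sketch (names hypothetical):

import java.util.Set;

// Sketch of the release-and-guard idiom: fields are nulled in close(),
// every accessor guards against the nulled state, and finalize() only
// delegates so an unreferenced scraper still frees its sets eventually.
abstract class TagScraperSketch {
    private Set<String> tags0;   // singleton tags, e.g. <img>
    private Set<String> tags1;   // paired tags, e.g. <a>...</a>

    TagScraperSketch(Set<String> tags0, Set<String> tags1) {
        this.tags0 = tags0;
        this.tags1 = tags1;
    }

    boolean isTag0(String tag) {
        return tags0 != null && tags0.contains(tag);  // false after close()
    }

    boolean isTag1(String tag) {
        return tags1 != null && tags1.contains(tag);
    }

    void close() {
        // free resources; the guards above keep late callers safe
        tags0 = null;
        tags1 = null;
    }

    protected void finalize() {
        close();  // GC-triggered fallback only; timing is not guaranteed
    }
}

finalize() runs only if and when the garbage collector reclaims the object, so it is a fallback; the deterministic path is the explicit close() that this commit also adds to the htmlFilterScraper and htmlFilterTransformer interfaces.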

@@ -78,4 +78,14 @@ public abstract class htmlFilterAbstractTransformer implements htmlFilterTransformer {
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}
public void close() {
// free resources
tags0 = null;
tags1 = null;
}
public void finalize() {
close();
}
}

@@ -122,10 +122,12 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
//System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + new String(text));
if (tagname.equals("a")) anchors.put(absolutePath(tagopts.getProperty("href", "")),
new serverByteBuffer(super.stripAll(new serverByteBuffer(text)).getBytes()).trim().toString());
if (tagname.equals("h1")) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if (tagname.equals("title")) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("a")) && (text.length < 2048)) {
byte[] a = super.stripAll(new serverByteBuffer(text)).getBytes();
anchors.put(absolutePath(tagopts.getProperty("href", "")), new serverByteBuffer(a).trim().toString());
}
if ((tagname.equals("h1")) && (text.length < 512)) headline = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
if ((tagname.equals("title")) && (text.length < 512)) title = new String(super.stripAll(new serverByteBuffer(text)).getBytes());
}
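
These size caps (2048 bytes of anchor text, 512 for headline and title) are what the commit message warns about: a tag whose text exceeds the limit is now skipped entirely, so its link is lost. One possible follow-up, sketched here with hypothetical names, would clip the text and keep the anchor:

// Hypothetical variant: clip oversized anchor text instead of dropping the anchor.
final class AnchorTextCap {
    static final int MAX_ANCHOR_TEXT = 2048; // mirrors the limit introduced above

    static byte[] truncate(byte[] text, int max) {
        if (text.length <= max) return text; // small tags pass through untouched
        byte[] head = new byte[max];
        System.arraycopy(text, 0, head, 0, max);
        return head; // bounded copy; the href itself is still registered
    }
}

Applied inside scrapeTag1 this would still register the href while bounding the retained text; a complete version would also cut at a character boundary to avoid splitting multi-byte characters.
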
@@ -161,6 +163,13 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implements htmlFilterScraper {
return images;
}
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}
public void print() {
System.out.println("TITLE :" + title);
System.out.println("HEADLINE:" + headline);

@@ -128,4 +128,11 @@ public class htmlFilterContentTransformer extends htmlFilterAbstractTransformer implements htmlFilterTransformer {
return htmlFilterOutputStream.genTag1(tagname, tagopts, text, quotechar);
}
public void close() {
// free resources
super.close();
linkTags0 = null;
linkTags1 = null;
}
}

@@ -434,13 +434,15 @@ public final class htmlFilterOutputStream extends OutputStream {
if (out != null) out.flush();
// if you want to flush all, call close() at end of writing;
}
private byte[] finalized = null;
public void finalize() throws IOException {
// if we are forced to close, we of course flush the buffer first,
// then close the connection
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
close();
}
public void close() throws IOException {
byte quotechar = (inSingleQuote) ? singlequote : doublequote;
if (buffer != null) {
if (buffer.length() > 0) {
byte[] filtered = filterSentence(buffer.getBytes(), quotechar);
@@ -448,18 +450,19 @@ public final class htmlFilterOutputStream extends OutputStream {
}
buffer = null;
}
finalized = filterFinalize(quotechar);
}
public void close() throws IOException {
finalize();
byte[] finalized = filterFinalize(quotechar);
if (out != null) {
if (finalized != null) out.write(finalized);
out.flush();
out.close();
}
filterTag = null;
filterOpts = null;
filterCont = null;
//if (scraper != null) {scraper.close(); scraper = null;}
//if (transformer != null) {transformer.close(); transformer = null;}
}
private static boolean binaryHint(byte b) {
if (b < 0) return false;
if (b > 31) return false;
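
This hunk inverts the earlier, backwards relationship in which close() called finalize() and the flush logic lived in finalize(). The corrected shape, with all work in close() and finalize() only delegating, as a self-contained sketch with hypothetical names:

import java.io.IOException;
import java.io.OutputStream;

// Sketch: close() owns flushing and release; finalize() is only a safety net.
final class FilterStreamSketch extends OutputStream {
    private OutputStream out;
    private StringBuilder buffer = new StringBuilder();

    FilterStreamSketch(OutputStream out) { this.out = out; }

    @Override
    public void write(int b) throws IOException {
        buffer.append((char) b); // stand-in for the real tag-filtering logic
    }

    @Override
    public void close() throws IOException {
        if (buffer != null && buffer.length() > 0 && out != null) {
            out.write(buffer.toString().getBytes()); // flush pending data first
        }
        buffer = null; // free resources; safe to call close() twice
        if (out != null) {
            out.flush();
            out.close();
            out = null;
        }
    }

    @Override
    protected void finalize() throws IOException {
        close(); // delegation only, never the primary cleanup path
    }
}

Since Java 7, try-with-resources makes the explicit close() automatic; finalize() survives in such designs only as a last-resort safety net.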

@@ -53,5 +53,7 @@ public interface htmlFilterScraper {
public void scrapeTag0(String tagname, Properties tagopts);
public void scrapeTag1(String tagname, Properties tagopts, byte[] text);
public void close();
}

@@ -73,4 +73,5 @@ public interface htmlFilterTransformer {
// method that is called when a body-containing text occurs
public byte[] transformTag1(String tagname, Properties tagopts, byte[] text, byte quotechar);
public void close();
}

@@ -284,35 +284,35 @@ public final class plasmaParser {
if (mimeTypeSet != null) {
Iterator mimeTypes = mimeTypeSet.iterator();
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
while (mimeTypes.hasNext()) {
String mimeType = (String) mimeTypes.next();
if (availableParserList.containsKey(mimeType)) {
Parser theParser = null;
try {
// getting the parser
theParser = (Parser) plasmaParser.theParserPool.borrowObject(availableParserList.get(mimeType));
// getting a list of mimeTypes that the parser supports
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
if (parserSupportsMimeTypes != null) {
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
if ((supportedExtensions != null) &&
(supportedExtensions instanceof String) &&
(((String)supportedExtensions).length() > 0)) {
String[] extArray = ((String)supportedExtensions).split(",");
newSupportedFileExt.addAll(Arrays.asList(extArray));
}
}
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));
newEnabledParsers.put(mimeType,availableParserList.get(mimeType));
} catch (Exception e) {
} catch (Exception e) {
e.printStackTrace();
} finally {
if (theParser != null)
if (theParser != null)
try { plasmaParser.theParserPool.returnObject(mimeType,theParser); } catch (Exception e) {}
}
}
}
}
}
}
synchronized (enabledParserList) {
@@ -392,7 +392,7 @@ public final class plasmaParser {
String fullClassName = plasmaParserPkgName + "." + currentDir.getName() + "." + className;
try {
// trying to load the parser class by its name
Class parserClass = Class.forName(fullClassName);
Class parserClass = Class.forName(fullClassName);
Object theParser = parserClass.newInstance();
if (!(theParser instanceof Parser)) continue;
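
For context, this is plasmaParser's plugin mechanism: the class name is derived from the directory layout, loaded reflectively, and kept only if it implements Parser. The same pattern as a self-contained sketch (using the modern reflection API; the original calls the since-deprecated Class.newInstance()):

// Sketch of the reflective parser loading: resolve by name, instantiate,
// and type-check against the plugin interface before use.
interface Parser { /* plugin contract, see plasmaParser */ }

final class ParserLoader {
    static Parser load(String fullClassName) {
        try {
            Class<?> parserClass = Class.forName(fullClassName);
            Object candidate = parserClass.getDeclaredConstructor().newInstance();
            return (candidate instanceof Parser) ? (Parser) candidate : null;
        } catch (ReflectiveOperationException e) {
            return null; // unknown class or failed construction: skip this entry
        }
    }
}
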
@@ -458,11 +458,13 @@ public final class plasmaParser {
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
hfos.write(source);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
@@ -487,14 +489,14 @@ public final class plasmaParser {
// ...otherwise we make a scraper and transformer
htmlFilterContentScraper scraper = new htmlFilterContentScraper(location);
OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
serverFileUtils.copy(sourceFile, hfos);
hfos.close();
return transformScraper(location, mimeType, scraper);
} else {
return null;
}
} catch (Exception e) {
// e.printStackTrace();
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
@@ -505,11 +507,14 @@ public final class plasmaParser {
public plasmaParserDocument transformScraper(URL location, String mimeType, htmlFilterContentScraper scraper) {
try {
return new plasmaParserDocument(new URL(urlNormalform(location)),
plasmaParserDocument ppd = new plasmaParserDocument(new URL(urlNormalform(location)),
mimeType, null, null, scraper.getHeadline(),
null, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
//scraper.close();
return ppd;
} catch (MalformedURLException e) {
//e.printStackTrace();
return null;
}
}
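
The still-commented scraper.close() marks the open problem here: releasing the scraper too early could strip data the freshly built plasmaParserDocument still references. A conservative shape, sketched with hypothetical names, is copy-then-close:

import java.util.HashMap;
import java.util.Map;

// Sketch: copy scraped data into the document before releasing the scraper.
final class ScrapedDoc {
    final Map<String, String> anchors;

    ScrapedDoc(Map<String, String> scrapedAnchors) {
        this.anchors = new HashMap<>(scrapedAnchors); // defensive copy: document owns its data
    }
}

// usage sketch:
//   ScrapedDoc doc = new ScrapedDoc(scraper.getAnchors());
//   scraper.close(); // safe now: doc shares no state with the scraper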

@@ -445,7 +445,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
wordIndex.close(waitingBoundSeconds);
log.logSystem("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager");
try {
cacheLoader.close();
cacheLoader.close();
wikiDB.close();
messageDB.close();
facilityDB.close();
@@ -468,10 +468,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
//return processStack.size() + cacheLoader.size() + noticeURL.stackSize();
}
/*
public int lUrlSize() {
return urlPool.loadedURL.size();
}
*/
public int cacheSizeMin() {
return wordIndex.size();
}
@@ -765,7 +767,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements serverSwitch {
log.logDebug("processResourceStack processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG
// parse content
plasmaParserDocument document;
plasmaParserDocument document = null;
if (plasmaParser.supportedMimeTypesContains(entry.responseHeader.mime())) {
if (entry.scraper != null) {

@@ -184,36 +184,43 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
long startTime = System.currentTimeMillis();
long messageTime = System.currentTimeMillis() + 5000;
long urlCount = 0, urlsPerSecond = 0;
synchronized (cache) {
Iterator i = dumpStack.iterator();
kelondroRecords.Node node;
String wordHash;
plasmaWordIndexEntryContainer container;
long creationTime;
plasmaWordIndexEntry wordEntry;
byte[][] row;
while (i.hasNext()) {
// get out one entry
node = (kelondroRecords.Node) i.next();
row = node.getValues();
wordHash = new String(row[0]);
creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4]));
// store to cache
addEntry(wordHash, wordEntry, creationTime);
urlCount++;
// write a log
if (System.currentTimeMillis() > messageTime) {
urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime);
log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpStack.size() - urlCount) / urlsPerSecond) + " seconds remaining");
messageTime = System.currentTimeMillis() + 5000;
try {
synchronized (cache) {
Iterator i = dumpStack.iterator();
kelondroRecords.Node node;
String wordHash;
plasmaWordIndexEntryContainer container;
long creationTime;
plasmaWordIndexEntry wordEntry;
byte[][] row;
while (i.hasNext()) {
// get out one entry
node = (kelondroRecords.Node) i.next();
row = node.getValues();
wordHash = new String(row[0]);
creationTime = kelondroRecords.bytes2long(row[2]);
wordEntry = new plasmaWordIndexEntry(new String(row[3]), new String(row[4]));
// store to cache
addEntry(wordHash, wordEntry, creationTime);
urlCount++;
// write a log
if (System.currentTimeMillis() > messageTime) {
urlsPerSecond = 1 + urlCount * 1000 / (1 + System.currentTimeMillis() - startTime);
log.logInfo("restoring status: " + urlCount + " urls done, " + ((dumpStack.size() - urlCount) / urlsPerSecond) + " seconds remaining");
messageTime = System.currentTimeMillis() + 5000;
}
}
}
dumpStack.close();
log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
} catch (kelondroException e) {
// restore failed
log.logError("restore of indexCache dump failed: " + e.getMessage());
e.printStackTrace();
}
dumpStack.close();
log.logSystem("restored " + cache.size() + " words in " + ((System.currentTimeMillis() - startTime) / 1000) + " seconds");
return urlCount;
}
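
Wrapping the replay in try/catch turns a corrupt dump from a startup crash into a partial restore: everything added before the kelondroException stays in the cache, the failure is logged, and the count of restored entries is still returned. The pattern reduced to a self-contained sketch:

import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;

// Sketch of the partial-restore pattern used above.
final class RestoreSketch {
    static long restore(Iterator<String> records, List<String> cache, Logger log) {
        long count = 0;
        try {
            while (records.hasNext()) {
                cache.add(records.next()); // stand-in for addEntry(wordHash, wordEntry, ...)
                count++;
            }
        } catch (RuntimeException e) { // stand-in for kelondroException
            log.warning("restore failed after " + count + " entries: " + e.getMessage());
        }
        return count; // partial restores still report what survived
    }
}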

@@ -74,7 +74,7 @@ public final class serverLog {
public serverLog(String appName) {
this.theLogger = Logger.getLogger(appName);
this.theLogger.setLevel(Level.FINEST); // set a default level
//this.theLogger.setLevel(Level.FINEST); // set a default level
}
public void setLevel(Level newLevel) {

@@ -383,16 +383,17 @@ public final class serverCore extends serverAbstractThread implements serverThread {
// wait for new connection
announceThreadBlockApply();
Socket controlSocket = this.socket.accept();
announceThreadBlockRelease();
String cIP = clientAddress(controlSocket);
//System.out.println("server bfHosts=" + bfHost.toString());
if (bfHost.get(cIP) != null) {
this.log.logInfo("SLOWING DOWN ACCESS FOR BRUTE-FORCE PREVENTION FROM " + cIP);
// add a delay to make brute-force harder
announceThreadBlockApply();
try {Thread.currentThread().sleep(3000);} catch (InterruptedException e) {}
announceThreadBlockRelease();
}
if ((this.denyHost == null) || (this.denyHost.get(cIP) == null)) {
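
The announceThreadBlockApply()/announceThreadBlockRelease() calls now also bracket the punitive sleep, so the thread bookkeeping can account that pause as deliberate blocking rather than work. The throttle itself as a self-contained sketch:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Sketch of the brute-force throttle: known offenders pay a fixed delay
// before their connection is processed further.
final class BruteForceThrottle {
    private final Map<String, Long> offenders = new ConcurrentHashMap<>();

    void markOffender(String ip) {
        offenders.put(ip, System.currentTimeMillis());
    }

    // called once per accepted connection, before the request is handled
    void delayIfSuspicious(String ip) {
        if (offenders.containsKey(ip)) {
            try {
                Thread.sleep(3000); // make password guessing expensive
            } catch (InterruptedException e) {
                Thread.currentThread().interrupt(); // preserve interrupt status
            }
        }
    }
}

A side note: the original calls Thread.currentThread().sleep(3000), which invokes the static Thread.sleep through an instance; the sketch calls it statically.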

@@ -359,7 +359,7 @@ public class yacyCore {
if (i == 0) Thread.currentThread().sleep(2000); // after the first time wait some seconds
Thread.currentThread().sleep(1000 + 500 * v.size()); // wait a while
} catch (InterruptedException e) {}
// check all threads
for (int j = 0; j < v.size(); j++) {
t = (publishThread) v.elementAt(j);

@@ -102,7 +102,8 @@ public class yacyPeerActions {
seedDB.mySeed.put("ISpeed", "unknown"); // the speed of indexing (words/minute) of the peer
long uptime = ((yacyCore.universalTime() - Long.parseLong(sb.getConfig("startupTime", "0"))) / 1000) / 60;
seedDB.mySeed.put("Uptime", "" + uptime); // the number of minutes that the peer is up in minutes/day (moving average MA30)
seedDB.mySeed.put("LCount", "" + sb.lUrlSize()); // the number of links that the peer has stored (LURL's)
seedDB.mySeed.put("LCount", "" + sb.urlPool.loadedURL.size()); // the number of links that the peer has stored (LURL's)
seedDB.mySeed.put("NCount", "" + sb.urlPool.noticeURL.stackSize()); // the number of links that the peer has noticed, but not loaded (NURL's)
seedDB.mySeed.put("ICount", "" + sb.cacheSizeMin()); // the minimum number of words that the peer has indexed (as it says)
seedDB.mySeed.put("SCount", "" + seedDB.sizeConnected()); // the number of seeds that the peer has stored
seedDB.mySeed.put("CCount", "" + (((int) ((seedDB.sizeConnected() + seedDB.sizeDisconnected() + seedDB.sizePotential()) * 60.0 / (uptime + 1.01)) * 100) / 100.0)); // the number of clients that the peer connects (as connects/hour)

@@ -102,6 +102,7 @@ public class yacySeed {
dna.put("ISpeed", "0"); // the speed of indexing (words/minute) of the peer
dna.put("Uptime", "0"); // the number of minutes that the peer is up in minutes/day (moving average MA30)
dna.put("LCount", "0"); // the number of links that the peer has stored (LURL's)
dna.put("NCount", "0"); // the number of links that the peer has noticed, but not loaded (NURL's)
dna.put("ICount", "0"); // the number of words that the peer has indexed (as it says)
dna.put("SCount", "0"); // the number of seeds that the peer has stored
dna.put("CCount", "0"); // the number of clients that the peer connects (as connects/hour)

@@ -628,8 +628,8 @@ public final class yacy {
// application wrapper
public static void main(String args[]) {
String applicationRoot = System.getProperty("user.dir");
//System.out.println("args.length=" + args.length);
//System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
System.out.println("args.length=" + args.length);
System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]");
if ((args.length >= 1) && ((args[0].equals("-startup")) || (args[0].equals("-start")))) {
// normal start-up of yacy
if (args.length == 2) applicationRoot= args[1];
