diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html
index 9c7c88f46..67401c397 100644
--- a/htroot/IndexControlURLs_p.html
+++ b/htroot/IndexControlURLs_p.html
@@ -77,10 +77,10 @@
Export Format
- Only Domain (superfast):
+ Only Domain:
Plain Text List (domains only)
HTML (domains as URLs, no title)
- Full URL List (high IO) :
+ Full URL List:
Plain Text List (URLs only)
HTML (URLs with title)
XML (RSS)
diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java
index 017c1e8c9..4cb182952 100644
--- a/source/de/anomic/data/URLAnalysis.java
+++ b/source/de/anomic/data/URLAnalysis.java
@@ -50,6 +50,8 @@ public class URLAnalysis {
/**
* processes to analyse URL lists
*/
+
+ private static final long cleanuplimit = 50 * 1024 * 1024;
public static yacyURL poison = null;
static {
@@ -77,7 +79,6 @@ public class URLAnalysis {
try {
url = in.take();
if (url == poison) break;
- //System.out.println(url);
update(url.getHost().replaceAll("-", "\\.").split("\\."));
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (InterruptedException e) {
@@ -96,6 +97,30 @@ public class URLAnalysis {
}
}
+    /**
+     * Prunes a counting statistic in place to free memory.
+     * <p>
+     * Works in two passes over the map:
+     * <ol>
+     * <li>every entry whose count is exactly 1 is removed; while doing so the
+     *     smallest count among the entries that are kept is recorded,</li>
+     * <li>every entry whose count equals that smallest value is removed.</li>
+     * </ol>
+     * Afterwards a garbage collection run is requested from the JVM.
+     * <p>
+     * The method is generic in the key and value type because it only reads
+     * {@code intValue()} of the values and never looks at the keys.
+     *
+     * @param stat the statistic (key mapped to occurrence count) to prune;
+     *             modified in place
+     */
+    public static <K, V extends Number> void cleanup(ConcurrentHashMap<K, V> stat) {
+        int low = Integer.MAX_VALUE;
+
+        // pass 1: drop all singletons, remember the lowest remaining count
+        Iterator<Map.Entry<K, V>> i = stat.entrySet().iterator();
+        while (i.hasNext()) {
+            final int c = i.next().getValue().intValue();
+            if (c == 1) {
+                i.remove();
+            } else if (c < low) {
+                low = c;
+            }
+        }
+
+        // pass 2: drop everything that sits at the lowest remaining count
+        i = stat.entrySet().iterator();
+        while (i.hasNext()) {
+            if (i.next().getValue().intValue() == low) {
+                i.remove();
+            }
+        }
+
+        // ask the JVM to reclaim the memory of the removed entries now
+        Runtime.getRuntime().gc();
+    }
+
public static void main(String[] args) {
String filename = args[0];
String analysis = filename + ".stats";
@@ -133,6 +158,11 @@ public class URLAnalysis {
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
+ if (MemoryControl.available() < cleanuplimit) {
+ System.out.println("starting cleanup, " + out.size() + " entries in statistic");
+ cleanup(out);
+ System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
+ }
}
}
reader.close();