diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index 9c7c88f46..67401c397 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -77,10 +77,10 @@
Export Format
-
Only Domain (superfast): +
Only Domain: Plain Text List (domains only)   HTML (domains as URLs, no title)
- Full URL List (high IO)    : + Full URL List: Plain Text List (URLs only)        HTML (URLs with title)       XML (RSS) diff --git a/source/de/anomic/data/URLAnalysis.java b/source/de/anomic/data/URLAnalysis.java index 017c1e8c9..4cb182952 100644 --- a/source/de/anomic/data/URLAnalysis.java +++ b/source/de/anomic/data/URLAnalysis.java @@ -50,6 +50,8 @@ public class URLAnalysis { /** * processes to analyse URL lists */ + + private static final long cleanuplimit = 50 * 1024 * 1024; public static yacyURL poison = null; static { @@ -77,7 +79,6 @@ public class URLAnalysis { try { url = in.take(); if (url == poison) break; - //System.out.println(url); update(url.getHost().replaceAll("-", "\\.").split("\\.")); update(p.matcher(url.getPath()).replaceAll("/").split("/")); } catch (InterruptedException e) { @@ -96,6 +97,30 @@ public class URLAnalysis { } } + public static void cleanup(ConcurrentHashMap stat) { + Map.Entry entry; + int c, low = Integer.MAX_VALUE; + Iterator> i = stat.entrySet().iterator(); + while (i.hasNext()) { + entry = i.next(); + c = entry.getValue().intValue(); + if (c == 1) { + i.remove(); + } else { + if (c < low) low = c; + } + } + i = stat.entrySet().iterator(); + while (i.hasNext()) { + entry = i.next(); + c = entry.getValue().intValue(); + if (c == low) { + i.remove(); + } + } + Runtime.getRuntime().gc(); + } + public static void main(String[] args) { String filename = args[0]; String analysis = filename + ".stats"; @@ -133,6 +158,11 @@ public class URLAnalysis { if (System.currentTimeMillis() - time > 1000) { time = System.currentTimeMillis(); System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); + if (MemoryControl.available() < cleanuplimit) { + System.out.println("starting cleanup, " + out.size() + " entries in statistic"); + cleanup(out); + System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left"); + } } } reader.close();