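Memory protection for URLAnalysis: when available memory drops below the 50 MB cleanup limit, prune the lowest-count entries from the statistics map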

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5649 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 0f6fa804ff
commit 89d8e824ed

@ -77,10 +77,10 @@
<dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" /> <dd><input type="text" name="exportfilter" value=".*.*" size="20" maxlength="250" />
</dd> </dd>
<dt class="TableCellDark">Export Format</dt> <dt class="TableCellDark">Export Format</dt>
<dd>Only Domain <i>(superfast)</i>: <dd>Only Domain:
<input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp; <input type="radio" name="format" value="dom-text" />Plain Text List (domains only)&nbsp;&nbsp;
<input type="radio" name="format" value="dom-html" checked="checked" />HTML (domains as URLs, no title)<br /> <input type="radio" name="format" value="dom-html" checked="checked" />HTML (domains as URLs, no title)<br />
Full URL List <i>(high IO)&nbsp;&nbsp;&nbsp;&nbsp;</i>: Full URL List:
<input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <input type="radio" name="format" value="url-text" />Plain Text List (URLs only)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <input type="radio" name="format" value="url-html" />HTML (URLs with title)&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;
<input type="radio" name="format" value="url-rss" />XML (RSS) <input type="radio" name="format" value="url-rss" />XML (RSS)

@ -51,6 +51,8 @@ public class URLAnalysis {
* processes to analyse URL lists * processes to analyse URL lists
*/ */
private static final long cleanuplimit = 50 * 1024 * 1024;
public static yacyURL poison = null; public static yacyURL poison = null;
static { static {
try { try {
@ -77,7 +79,6 @@ public class URLAnalysis {
try { try {
url = in.take(); url = in.take();
if (url == poison) break; if (url == poison) break;
//System.out.println(url);
update(url.getHost().replaceAll("-", "\\.").split("\\.")); update(url.getHost().replaceAll("-", "\\.").split("\\."));
update(p.matcher(url.getPath()).replaceAll("/").split("/")); update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (InterruptedException e) { } catch (InterruptedException e) {
@ -96,6 +97,30 @@ public class URLAnalysis {
} }
} }
/**
 * Shrinks the given statistics map by discarding its least frequent entries.
 * <p>
 * The first pass removes every entry whose count is 1 and records the lowest
 * count found among the other entries. The second pass removes every entry
 * whose count equals that lowest value. Afterwards a garbage collection run
 * is requested from the runtime.
 * <p>
 * Entries are removed with the conditional {@code remove(key, value)} so that
 * an entry whose count was changed by another thread between reading it and
 * removing it is kept instead of being dropped with a stale count.
 *
 * @param stat map from a token to its occurrence count; modified in place
 */
public static void cleanup(ConcurrentHashMap<String, Integer> stat) {
    int low = Integer.MAX_VALUE;

    // pass 1: drop singletons, remember the smallest count of everything else
    for (Map.Entry<String, Integer> entry : stat.entrySet()) {
        final Integer value = entry.getValue();
        final int c = value.intValue();
        if (c == 1) {
            // only removes the entry if it is still mapped to the count we saw
            stat.remove(entry.getKey(), value);
        } else if (c < low) {
            low = c;
        }
    }

    // pass 2: drop all entries that carry the smallest remaining count
    for (Map.Entry<String, Integer> entry : stat.entrySet()) {
        final Integer value = entry.getValue();
        if (value.intValue() == low) {
            stat.remove(entry.getKey(), value);
        }
    }

    // ask the VM to reclaim the memory released by the removed entries
    Runtime.getRuntime().gc();
}
public static void main(String[] args) { public static void main(String[] args) {
String filename = args[0]; String filename = args[0];
String analysis = filename + ".stats"; String analysis = filename + ".stats";
@ -133,6 +158,11 @@ public class URLAnalysis {
if (System.currentTimeMillis() - time > 1000) { if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis(); time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second"); System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
if (MemoryControl.available() < cleanuplimit) {
System.out.println("starting cleanup, " + out.size() + " entries in statistic");
cleanup(out);
System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
} }
} }
reader.close(); reader.close();

Loading…
Cancel
Save