or will be superfluous or subject to a complete redesign after the migration to Solr. Removing these things now will make the transition to Solr simpler.
parent 6f1ddb2519
commit 3bcd9d622b
@@ -1,108 +0,0 @@
//-----------------------
//part of the AnomicHTTPD caching proxy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//This file is contributed by Matthias Soehnholz
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.index.MetadataRepository;
import net.yacy.search.index.Segment;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;

public class IndexCleaner_p {
    private static MetadataRepository.BlacklistCleaner urldbCleanerThread = null;
    private static Segment.ReferenceCleaner indexCleanerThread = null;

    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final serverObjects prop = new serverObjects();
        final Switchboard sb = (Switchboard) env;
        prop.put("title", "DbCleanup_p");

        // get segment
        Segment indexSegment = sb.index;

        if (post!=null) {
            if (post.get("action").equals("ustart")) {
                if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) {
                    urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist, sb.crawlStacker);
                    urldbCleanerThread.start();
                }
                else {
                    urldbCleanerThread.endPause();
                }
            }
            else if (post.get("action").equals("ustop") && (urldbCleanerThread!=null)) {
                urldbCleanerThread.abort();
            }
            else if (post.get("action").equals("upause") && (urldbCleanerThread!=null)) {
                urldbCleanerThread.pause();
            }
            else if (post.get("action").equals("rstart")) {
                if (indexCleanerThread==null || !indexCleanerThread.isAlive()) {
                    indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes());
                    indexCleanerThread.start();
                }
                else {
                    indexCleanerThread.endPause();
                }
            }
            else if (post.get("action").equals("rstop") && (indexCleanerThread!=null)) {
                indexCleanerThread.abort();
            }
            else if (post.get("action").equals("rpause") && (indexCleanerThread!=null)) {
                indexCleanerThread.pause();
            }
            prop.put("LOCATION","");
            return prop;
        }
        if (urldbCleanerThread!=null) {
            prop.put("urldb", "1");
            prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100);
            prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
            prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls);
            prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
            prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash);
            prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl);
            prop.put("urldb_lastHash", urldbCleanerThread.lastHash);
            prop.put("urldb_threadAlive", Boolean.toString(urldbCleanerThread.isAlive()));
            prop.put("urldb_threadToString", urldbCleanerThread.toString());
            final double percent = ((double)urldbCleanerThread.blacklistedUrls/urldbCleanerThread.totalSearchedUrls)*100;
            prop.putNum("urldb_percent", percent);
        }
        if (indexCleanerThread!=null) {
            prop.put("rwidb", "1");
            prop.put("rwidb_threadAlive", Boolean.toString(indexCleanerThread.isAlive()));
            prop.put("rwidb_threadToString", indexCleanerThread.toString());
            prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
            prop.putNum("rwidb_RWIcountnow", indexCleanerThread.rwisize());
            prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : ASCII.String(indexCleanerThread.wordHashNow));
            prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : ASCII.String(indexCleanerThread.lastWordHash));
            prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);
        }
        return prop;
    }
}
@@ -1,108 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
  <title>YaCy '#[clientname]#': Index Cleaner</title>
  #%env/templates/metas.template%#
</head>
<body id="IndexControlCleaner">
  <div id="fullcontent">
    #(inline)##%env/templates/header.template%#

    <h2>Steering of API Actions</h2>
    <p>This table shows search results that had been sorted out from the search result display because their content had not been verified.
    This means that the searched word does not appear on the search page.
    </p>::#(/inline)#
    #(showtable)#::
    <form action="IndexControlCleaner_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" id="cleanerlist">
      <fieldset>
        <legend><label for="apilist">Recorded Actions</label></legend>
        <p><span id="resCounter" style="display: inline;">
          #(navigation)#
          ::
          #(left)#<img src="env/grafics/navdl.gif" alt="no previous page" />::<a href="Table_API_p.html?startRecord=#[startRecord]#&maximumRecords=#[maximumRecords]#&inline=#(inline)#false::true#(/inline)#&filter=#[filter]#" target="_self"><img src="env/grafics/navsl.gif" alt="previous page" /></a>#(/left)#
          #[startRecord]#-#[to]# of #[of]#
          #(right)#<img src="env/grafics/navdr.gif" alt="no next page" />::<a href="Table_API_p.html?startRecord=#[startRecord]#&maximumRecords=#[maximumRecords]#&inline=#(inline)#false::true#(/inline)#&filter=#[filter]#" target="_self"><img src="env/grafics/navsr.gif" alt="next page" /></a>#(/right)#
          <img src="env/grafics/nave.gif" alt="" />
          #(/navigation)#
          <div>
            <input type="hidden" name="startRecord" value="#[startRecord]#" />
            <input type="hidden" name="maximumRecords" value="#[maximumRecords]#" />
            <input type="hidden" name="inline" value="#(inline)#false::true#(/inline)#" />
            <input type="hidden" name="filter" value="#[filter]#" />
            <input type="text" name="query" value="#[query]#" style="font-size:16px;float:left;border:0px;height:20px;background-image:url('env/grafics/find.gif');background-repeat:no-repeat;background-position:right top;" />
          </div>
        </span><br/></p>
        <p style="clear:both;">
        <table class="sortable" border="0" cellpadding="2" cellspacing="1">
          <tr class="TableHeader" valign="bottom">
            <th class="sorttable_nosort"><input type="checkbox" id="allswitch" onclick="checkAll(this.form.id, this.checked);" /></th>
            <th>Type</th>
            <th width="100">Comment</th>
            <th>Call<br/>Count</th>
            <th>Recording<br/>Date</th>
            <th>Last Exec<br/>Date</th>
            <th>Next Exec<br/>Date</th>
            <th class="sorttable_nosort">Scheduler</th>
            #(inline)#<th class="sorttable_nosort">URL</th>::#(/inline)#
          </tr>
          #{list}#
          <tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
            <td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
            <td>#[type]#</td>
            <td>#[comment]#</td>
            <td>#[callcount]#</td>
            <td>#[dateRecording]#</td>
            <td>#[dateLastExec]#</td>
            <td>#[dateNextExec]#</td>
            <td>
              #(scheduler)#
              <form action="Table_API_p.html" method="post" enctype="multipart/form-data" id="modify_repeat" accept-charset="UTF-8">
                <select name="repeat_select" onchange='this.form.submit()'>
                  <option value="off" selected="selected">no repetition</option>
                  <option value="on">activate scheduler</option>
                </select>
                <input type="hidden" name="pk" value="#[pk]#" />
                <input type="hidden" name="inline" value="#[inline]#" />
                <input type="hidden" name="filter" value="#[filter]#" />
              </form>
              ::
              <form action="Table_API_p.html" method="post" enctype="multipart/form-data" id="modify_repeat">
                <table><tr><td>
                  <select name="repeat_time" onchange='this.form.submit()'>
                    #{scale}#
                    <option value="#[time]#" #(selected)#::selected="selected"#(/selected)#>#[time]#</option>
                    #{/scale}#
                  </select>
                </td><td>
                  <select name="repeat_unit" onchange='this.form.submit()'>
                    <option value="selminutes" #(selectedMinutes)#::selected="selected"#(/selectedMinutes)#>minutes</option>
                    <option value="selhours" #(selectedHours)#::selected="selected"#(/selectedHours)#>hours</option>
                    <option value="seldays" #(selectedDays)#::selected="selected"#(/selectedDays)#>days</option>
                  </select>
                </td></tr></table>
                <input type="hidden" name="pk" value="#[pk]#" />
                <input type="hidden" name="inline" value="#[inline]#" />
                <input type="hidden" name="filter" value="#[filter]#" />
                <noscript><input type="submit" value="Submit" /></noscript>
              </form>
              #(/scheduler)#
            </td>
            #(inline)#<td>#[url]#</td>::#(/inline)#
          </tr>
          #{/list}#
      </fieldset>
        </table>
        </p>
        <p>
          <input type="hidden" name="num" value="#[num]#" />
          <input type="submit" name="deletefromindex" value="Delete selected references from the search index" />
          <input type="submit" name="deleterows" value="Delete selected entries from the list" />
        </p>

    </form>
    #(/showtable)#
  </div>
  #%env/templates/footer.template%#
</body>
</html>
@@ -1,564 +0,0 @@
// URLAnalysis.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.02.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA

package de.anomic.data;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.HandleMap;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainerArray;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.MetadataRepository;
import net.yacy.search.index.Segment;
import net.yacy.search.index.MetadataRepository.Export;

public class URLAnalysis {

    private static final Pattern patternMinus = Pattern.compile("-");

    /**
     * processes to analyse URL lists
     */

    private static DigestURI poison = null;
    static {
        try {
            poison = new DigestURI("http://poison.org/poison");
        } catch (final MalformedURLException e) {
            poison = null;
        }
    }

    public static class splitter extends Thread {

        private final ArrayBlockingQueue<DigestURI> in;
        private final ConcurrentHashMap<String, Integer> out;

        public splitter(final ArrayBlockingQueue<DigestURI> in, final ConcurrentHashMap<String, Integer> out) {
            this.in = in;
            this.out = out;
        }

        @Override
        public void run() {
            try {
                DigestURI url;
                final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_");
                while (true) {
                    try {
                        url = this.in.take();
                        if (url == poison) break;
                        update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\."));
                        update(p.matcher(url.getPath()).replaceAll("/").split("/"));
                    } catch (final InterruptedException e) {
                        Log.logException(e);
                    }
                }
            } catch (final Exception e) {
                Log.logException(e);
            }
        }

        private void update(final String[] s) {
            Integer c;
            for (final String t: s) {
                if (t.isEmpty()) continue;
                c = this.out.get(t);
                this.out.put(t, (c == null) ? 1 : c.intValue() + 1);
            }
        }
    }

    public static void cleanup(final ConcurrentHashMap<String, Integer> stat) {
        Map.Entry<String, Integer> entry;
        int c, low = Integer.MAX_VALUE;
        Iterator<Map.Entry<String, Integer>> i = stat.entrySet().iterator();
        while (i.hasNext()) {
            entry = i.next();
            c = entry.getValue().intValue();
            if (c == 1) {
                i.remove();
            } else {
                if (c < low) low = c;
            }
        }
        i = stat.entrySet().iterator();
        while (i.hasNext()) {
            entry = i.next();
            c = entry.getValue().intValue();
            if (c == low) {
                i.remove();
            }
        }
        Runtime.getRuntime().gc();
    }

    public static void genstat(final String urlfile) {

        final boolean gz = urlfile.endsWith(".gz");
        final String analysis = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".stats.gz" : urlfile + ".stats";
        final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);

        // start threads
        final ArrayBlockingQueue<DigestURI> in = new ArrayBlockingQueue<DigestURI>(1000);
        final ConcurrentHashMap<String, Integer> out = new ConcurrentHashMap<String, Integer>();
        for (int i = 0, available = Runtime.getRuntime().availableProcessors(); i < available; i++) new splitter(in, out).start();
        final splitter spl = new splitter(in, out);
        spl.start();

        // put urls in queue
        final File infile = new File(urlfile);
        final File outfile = new File(analysis);
        BufferedReader reader = null;
        long time = System.currentTimeMillis();
        final long start = time;
        int count = 0;

        System.out.println("start processing");
        try {
            InputStream is = new BufferedInputStream(new FileInputStream(infile));
            if (gz) is = new GZIPInputStream(is);
            reader = new BufferedReader(new InputStreamReader(is));
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() > 0) {
                    try {
                        final DigestURI url = new DigestURI(line);
                        in.put(url);
                    } catch (final InterruptedException e) {
                        Log.logException(e);
                    } catch (final MalformedURLException e) {
                        continue;
                    }
                }
                count++;
                if (System.currentTimeMillis() - time > 1000) {
                    time = System.currentTimeMillis();
                    System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
                    if (MemoryControl.available() < cleanuplimit) {
                        System.out.println("starting cleanup, " + out.size() + " entries in statistic");
                        cleanup(out);
                        System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
                    }
                }
            }
            reader.close();
        } catch (final IOException e) {
            Log.logException(e);
        } finally {
            if (reader != null) try { reader.close(); } catch (final Exception e) {}
        }

        // stop threads
        System.out.println("stopping threads");
        for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try {
            in.put(poison);
        } catch (final InterruptedException e) {
            Log.logException(e);
        }
        try {
            spl.join();
        } catch (final InterruptedException e1) {
            Log.logException(e1);
        }

        // generate statistics
        System.out.println("start processing results");
        final TreeMap<String, Integer> results = new TreeMap<String, Integer>();
        count = 0;
        Map.Entry<String, Integer> entry;
        final Iterator<Map.Entry<String, Integer>> i = out.entrySet().iterator();
        while (i.hasNext()) {
            entry = i.next();
            results.put(num(entry.getValue().intValue() * (entry.getKey().length() - 1)) + " - " + entry.getKey(), entry.getValue());
            count++;
            i.remove(); // free memory
            if (System.currentTimeMillis() - time > 10000) {
                time = System.currentTimeMillis();
                System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
            }
        }

        // write statistics
        System.out.println("start writing results");
        try {
            OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
            if (gz) os = new GZIPOutputStream(os);
            count = 0;
            for (final Map.Entry<String, Integer> e: results.entrySet()) {
                os.write(UTF8.getBytes(e.getKey()));
                os.write(new byte[]{'\t'});
                os.write(UTF8.getBytes(Integer.toString(e.getValue())));
                os.write(new byte[]{'\n'});
                count++;
                if (System.currentTimeMillis() - time > 10000) {
                    time = System.currentTimeMillis();
                    System.out.println("wrote " + count + " lines.");
                }
            }
            os.close();
        } catch (final IOException e) {
            Log.logException(e);
        }

        System.out.println("finished");
    }

    public static void genhost(final String urlfile) {

        final boolean gz = urlfile.endsWith(".gz");
        final String trunk = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".host" : urlfile + ".host";
        final HashSet<String> hosts = new HashSet<String>();
        final File infile = new File(urlfile);
        BufferedReader reader = null;
        long time = System.currentTimeMillis();
        final long start = time;
        int count = 0;

        System.out.println("start processing");
        try {
            InputStream is = new BufferedInputStream(new FileInputStream(infile));
            if (gz) is = new GZIPInputStream(is);
            reader = new BufferedReader(new InputStreamReader(is));
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() > 0) {
                    try {
                        final DigestURI url = new DigestURI(line);
                        hosts.add(url.getHost());
                    } catch (final MalformedURLException e) {
                        continue;
                    }
                }
                count++;
                if (System.currentTimeMillis() - time > 1000) {
                    time = System.currentTimeMillis();
                    System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
                }
            }
            reader.close();
        } catch (final IOException e) {
            Log.logException(e);
        } finally {
            if (reader != null) try { reader.close(); } catch (final Exception e) {}
        }

        // copy everything into a TreeSet to order it
        System.out.println("start processing results");
        final TreeSet<String> results = new TreeSet<String>();
        count = 0;
        final Iterator<String> i = hosts.iterator();
        while (i.hasNext()) {
            results.add(i.next());
            count++;
            i.remove(); // free memory
            if (System.currentTimeMillis() - time > 10000) {
                time = System.currentTimeMillis();
                System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
            }
        }

        // write hosts
        writeSet(trunk, gz, results);

        System.out.println("finished");
    }

    private static void writeSet(final String trunk, final boolean gz, final Set<String> set) {

        // write hosts
        System.out.println("start writing results");
        final File outfile = new File(trunk + ((gz) ? ".gz" : ""));
        long time = System.currentTimeMillis();
        try {
            OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
            if (gz) os = new GZIPOutputStream(os);
            int count = 0;
            for (final String h: set) {
                os.write(UTF8.getBytes(h));
                os.write(new byte[]{'\n'});
                count++;
                if (System.currentTimeMillis() - time > 10000) {
                    time = System.currentTimeMillis();
                    System.out.println("wrote " + count + " lines.");
                }
            }
            os.close();
        } catch (final IOException e) {
            Log.logException(e);
        }

        System.out.println("finished writing results");
    }

    public static void sortsplit(final String urlfile) {

        final boolean gz = urlfile.endsWith(".gz");
        final String trunk = ((gz) ? urlfile.substring(0, urlfile.length() - 3) : urlfile) + ".sort";
        final File infile = new File(urlfile);
        final TreeSet<String> urls = new TreeSet<String>();
        BufferedReader reader = null;
        long time = System.currentTimeMillis();
        final long start = time;
        int count = 0;
        int filecount = 0;
        final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);

        System.out.println("start processing");
        try {
            InputStream is = new BufferedInputStream(new FileInputStream(infile));
            if (gz) is = new GZIPInputStream(is);
            reader = new BufferedReader(new InputStreamReader(is));
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.trim();
                if (line.length() > 0) {
                    try {
                        final DigestURI url = new DigestURI(line);
                        urls.add(url.toNormalform(true, true));
                    } catch (final MalformedURLException e) {
                        continue;
                    }
                }
                count++;
                if (System.currentTimeMillis() - time > 1000) {
                    time = System.currentTimeMillis();
                    System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
                }
                if (MemoryControl.available() < cleanuplimit) {
                    writeSet(trunk + "." + filecount, gz, urls);
                    filecount++;
                    urls.clear();
                    Runtime.getRuntime().gc();
                }
            }
            reader.close();
        } catch (final IOException e) {
            Log.logException(e);
        } finally {
            if (reader != null) try { reader.close(); } catch (final Exception e) {}
        }

        // write hosts
        writeSet(trunk + "." + filecount, gz, urls);

        System.out.println("finished");
    }

    public static void incell(final File cellPath, final String statisticPath) {
        try {
            final HandleMap idx = ReferenceContainerArray.referenceHashes(
                    cellPath,
                    Segment.wordReferenceFactory,
                    Base64Order.enhancedCoder,
                    WordReferenceRow.urlEntryRow);
            System.out.println("INDEX REFERENCE COLLECTION starting dump of statistics");
            idx.dump(new File(statisticPath));
            System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
            idx.close();
        } catch (final Exception e) {
            Log.logException(e);
        }
    }

    public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
        System.out.println("INDEX DIFF URL-COL startup");
        final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
        final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
        mr.connectUrlDb(Segment.UrlDbName, false, false);
        final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
        System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
        final long start = System.currentTimeMillis();
        long update = start - 7000;
        int count = 0;
        for (final byte[] refhash: mr) {
            if (idx.get(refhash) == -1) {
                // the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
                hs.put(refhash);
            }
            count++;
            if (System.currentTimeMillis() - update > 10000) {
                System.out.println("INDEX DIFF URL-COL running, checked " + count + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - count) / count) / 60000) + " minutes remaining");
                update = System.currentTimeMillis();
            }
        }
        idx.close();
        mr.close();
        System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
        count = hs.dump(new File(diffFile));
        System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump");
        return count;
    }

    public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
        // format: 0=text, 1=html, 2=rss/xml
        System.out.println("URL EXPORT startup");
        final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
        mr.connectUrlDb(Segment.UrlDbName, false, false);
        final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
        System.out.println("URL EXPORT loaded dump, starting export");
        final Export e = mr.export(new File(export), ".*", hs, format, false);
        try {
            e.join();
        } catch (final InterruptedException e1) {
            Log.logException(e1);
        }
        System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
    }

    public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
        System.out.println("URL DELETE startup");
        final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
        mr.connectUrlDb(Segment.UrlDbName, false, false);
        final int mrSize = mr.size();
        final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
        System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
        for (final byte[] refhash: hs) {
            mr.remove(refhash);
        }
        System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
    }

    public static void main(final String[] args) {
        if (args[0].equals("-stat") && args.length >= 2) {
            // generate a statistics about common words in file, store to <file>.stat
            // example:
            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
            for (int i = 1; i < args.length; i++) genstat(args[i]);
        } else if (args[0].equals("-host") && args.length >= 2) {
            // generate a file <file>.host containing only the hosts of the urls
            for (int i = 1; i < args.length; i++) genhost(args[i]);
        } else if (args[0].equals("-sort") && args.length >= 2) {
            // generate file <file>.x.sort with sorted lists and split the file in smaller pieces
            for (int i = 1; i < args.length; i++) sortsplit(args[i]);
        } else if (args[0].equals("-incell") && args.length >= 2) {
            // generate a dump of all referenced URL hashes from a given RICELL
            // example:
            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incell DATA/INDEX/freeworld/TEXT/RICELL used.dump
            incell(new File(args[1]), args[2]);
        } else if (args[0].equals("-diffurlcol") && args.length >= 3) {
            // make a diff-file that contains hashes from the url database that do not occur in the collection reference dump
            // example:
            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump
            try {
                diffurlcol(args[1], args[2], args[3]);
            } catch (final Exception e) {
                Log.logException(e);
            }
        } else if (args[0].equals("-export") && args.length >= 4) {
            // export a url-list file
            // example:
            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump
            // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
            final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
            try {
                export(args[1], format, args[3], (args.length >= 5) ? args[4] : null);
            } catch (final Exception e) {
                Log.logException(e);
            }
        } else if (args[0].equals("-delete") && args.length >= 3) {
            // delete from URLs as given by urlreference diff dump
            // example:
            // java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump
            // instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
            try {
                delete(args[1], args[2]);
            } catch (final Exception e) {
                Log.logException(e);
            }
        } else {
            System.out.println("usage:");
            System.out.println();
            System.out.println("-stat <file> ");
            System.out.println(" generate a statistics about common words in file, store to <file>.stat");
            System.out.println();
            System.out.println("-host <file>");
            System.out.println(" generate a file <file>.host containing only the hosts of the urls");
            System.out.println();
            System.out.println("-sort <file>");
            System.out.println(" generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
            System.out.println();
            System.out.println("-incollection <path-to-RICOLLECTION> <file>");
            System.out.println(" generate a dump of all referenced URL hashes");
            System.out.println();
            System.out.println("-diffurlcol <path-to-URL-DB> <dump-from-incollection> <diff-dump>");
            System.out.println(" find URLs that occur in url-db but not in collections");
            System.out.println();
            System.out.println("-export <path-to-URL-DB> <format text|html|xml> <export-file> <diff-dump>");
            System.out.println(" export urls to file. the last argument can be omitted, then all urls are exported");
            System.out.println();
            System.out.println("-delete <path-to-URL-DB> <diff-dump>");
            System.out.println(" delete all urls that are listed in the diff-dump from the url-db");
            System.out.println();
            System.out.println("to do a complete clean-up of the url database, start the following:");
            System.out.println();
            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump");
            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump");
            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump");
            System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump");
            System.out.println();
        }
        System.exit(0); // kill remaining threads
    }

    private static final String num(final int i) {
        final StringBuilder s = new StringBuilder(Integer.toString(i));
        while (s.length() < 9) s.insert(0, "0");
        return s.toString();
    }
}