cleaned up classes and methods which are either superfluous at this time
or will be superfluous (or subject to a complete redesign) after the
migration to Solr. Removing these things now will make the transition to
Solr simpler.
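
Note for callers: MetadataRepository.load() no longer takes a
WeakPriorityBlockingQueue.Element; the WordReferenceVars reference and the
ranking weight are now passed directly. A minimal sketch of the adapted call,
as applied to RWIProcess in this commit (only the local name "segment" is a
placeholder here; obrwi, getElement() and getWeight() are taken from the diff below):

    final URIMetadata page = segment.urlMetadata().load(obrwi.getElement(), obrwi.getWeight());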
pull/1/head
Michael Peter Christen 13 years ago
parent 6f1ddb2519
commit 3bcd9d622b

@@ -1,108 +0,0 @@
//-----------------------
//part of the AnomicHTTPD caching proxy
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
//Frankfurt, Germany, 2005
//
//This file is contributed by Matthias Soehnholz
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
//This program is free software; you can redistribute it and/or modify
//it under the terms of the GNU General Public License as published by
//the Free Software Foundation; either version 2 of the License, or
//(at your option) any later version.
//
//This program is distributed in the hope that it will be useful,
//but WITHOUT ANY WARRANTY; without even the implied warranty of
//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//GNU General Public License for more details.
//
//You should have received a copy of the GNU General Public License
//along with this program; if not, write to the Free Software
//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.index.MetadataRepository;
import net.yacy.search.index.Segment;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
public class IndexCleaner_p {
private static MetadataRepository.BlacklistCleaner urldbCleanerThread = null;
private static Segment.ReferenceCleaner indexCleanerThread = null;
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard) env;
prop.put("title", "DbCleanup_p");
// get segment
Segment indexSegment = sb.index;
if (post!=null) {
if (post.get("action").equals("ustart")) {
if (urldbCleanerThread==null || !urldbCleanerThread.isAlive()) {
urldbCleanerThread = indexSegment.urlMetadata().getBlacklistCleaner(Switchboard.urlBlacklist, sb.crawlStacker);
urldbCleanerThread.start();
}
else {
urldbCleanerThread.endPause();
}
}
else if (post.get("action").equals("ustop") && (urldbCleanerThread!=null)) {
urldbCleanerThread.abort();
}
else if (post.get("action").equals("upause") && (urldbCleanerThread!=null)) {
urldbCleanerThread.pause();
}
else if (post.get("action").equals("rstart")) {
if (indexCleanerThread==null || !indexCleanerThread.isAlive()) {
indexCleanerThread = indexSegment.getReferenceCleaner(post.get("wordHash","AAAAAAAAAAAA").getBytes());
indexCleanerThread.start();
}
else {
indexCleanerThread.endPause();
}
}
else if (post.get("action").equals("rstop") && (indexCleanerThread!=null)) {
indexCleanerThread.abort();
}
else if (post.get("action").equals("rpause") && (indexCleanerThread!=null)) {
indexCleanerThread.pause();
}
prop.put("LOCATION","");
return prop;
}
if (urldbCleanerThread!=null) {
prop.put("urldb", "1");
prop.putNum("urldb_percentUrls", ((double)urldbCleanerThread.totalSearchedUrls/indexSegment.urlMetadata().size())*100);
prop.putNum("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
prop.putNum("urldb_total", urldbCleanerThread.totalSearchedUrls);
prop.putHTML("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash);
prop.putHTML("urldb_lastUrl", urldbCleanerThread.lastUrl);
prop.put("urldb_lastHash", urldbCleanerThread.lastHash);
prop.put("urldb_threadAlive", Boolean.toString(urldbCleanerThread.isAlive()));
prop.put("urldb_threadToString", urldbCleanerThread.toString());
final double percent = ((double)urldbCleanerThread.blacklistedUrls/urldbCleanerThread.totalSearchedUrls)*100;
prop.putNum("urldb_percent", percent);
}
if (indexCleanerThread!=null) {
prop.put("rwidb", "1");
prop.put("rwidb_threadAlive", Boolean.toString(indexCleanerThread.isAlive()));
prop.put("rwidb_threadToString", indexCleanerThread.toString());
prop.putNum("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
prop.putNum("rwidb_RWIcountnow", indexCleanerThread.rwisize());
prop.put("rwidb_wordHashNow", (indexCleanerThread.wordHashNow == null) ? "NULL" : ASCII.String(indexCleanerThread.wordHashNow));
prop.put("rwidb_lastWordHash", (indexCleanerThread.lastWordHash == null) ? "null" : ASCII.String(indexCleanerThread.lastWordHash));
prop.putNum("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);
}
return prop;
}
}

@@ -1,108 +0,0 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>YaCy '#[clientname]#': Index Cleaner</title>
#%env/templates/metas.template%#
</head>
<body id="IndexControlCleaner">
<div id="fullcontent">
#(inline)##%env/templates/header.template%#
<h2>Steering of API Actions</h2>
<p>This table shows search results that had been sorted out from the search result display because their content had not been verified.
This means that the searched word does not appear on the search page.
</p>::#(/inline)#
#(showtable)#::
<form action="IndexControlCleaner_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8" id="cleanerlist">
<fieldset>
<legend><label for="apilist">Recorded Actions</label></legend>
<p><span id="resCounter" style="display: inline;">
#(navigation)#
::
#(left)#<img src="env/grafics/navdl.gif" alt="no previous page" />::<a href="Table_API_p.html?startRecord=#[startRecord]#&amp;maximumRecords=#[maximumRecords]#&amp;inline=#(inline)#false::true#(/inline)#&amp;filter=#[filter]#" target="_self"><img src="env/grafics/navsl.gif" alt="previous page" /></a>#(/left)#
#[startRecord]#-#[to]# of #[of]#
#(right)#<img src="env/grafics/navdr.gif" alt="no next page" />::<a href="Table_API_p.html?startRecord=#[startRecord]#&amp;maximumRecords=#[maximumRecords]#&amp;inline=#(inline)#false::true#(/inline)#&amp;filter=#[filter]#" target="_self"><img src="env/grafics/navsr.gif" alt="next page" /></a>#(/right)#
<img src="env/grafics/nave.gif" alt="" />
#(/navigation)#
<div>
<input type="hidden" name="startRecord" value="#[startRecord]#" />
<input type="hidden" name="maximumRecords" value="#[maximumRecords]#" />
<input type="hidden" name="inline" value="#(inline)#false::true#(/inline)#" />
<input type="hidden" name="filter" value="#[filter]#" />
<input type="text" name="query" value="#[query]#" style="font-size:16px;float:left;border:0px;height:20px;background-image:url('env/grafics/find.gif');background-repeat:no-repeat;background-position:right top;" />
</div>
</span><br/></p>
<p style="clear:both;">
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<th class="sorttable_nosort"><input type="checkbox" id="allswitch" onclick="checkAll(this.form.id, this.checked);" /></th>
<th>Type</th>
<th width="100">Comment</th>
<th>Call<br/>Count</th>
<th>Recording<br/>Date</th>
<th>Last&nbsp;Exec<br/>Date</th>
<th>Next&nbsp;Exec<br/>Date</th>
<th class="sorttable_nosort">Scheduler</th>
#(inline)#<th class="sorttable_nosort">URL</th>::#(/inline)#
</tr>
#{list}#
<tr class="TableCell#(dark)#Light::Dark::Summary#(/dark)#">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td>#[type]#</td>
<td>#[comment]#</td>
<td>#[callcount]#</td>
<td>#[dateRecording]#</td>
<td>#[dateLastExec]#</td>
<td>#[dateNextExec]#</td>
<td>
#(scheduler)#
<form action="Table_API_p.html" method="post" enctype="multipart/form-data" id="modify_repeat" accept-charset="UTF-8">
<select name="repeat_select" onchange='this.form.submit()'>
<option value="off" selected="selected">no repetition</option>
<option value="on">activate scheduler</option>
</select>
<input type="hidden" name="pk" value="#[pk]#" />
<input type="hidden" name="inline" value="#[inline]#" />
<input type="hidden" name="filter" value="#[filter]#" />
</form>
::
<form action="Table_API_p.html" method="post" enctype="multipart/form-data" id="modify_repeat">
<table><tr><td>
<select name="repeat_time" onchange='this.form.submit()'>
#{scale}#
<option value="#[time]#" #(selected)#::selected="selected"#(/selected)#>#[time]#</option>
#{/scale}#
</select>
</td><td>
<select name="repeat_unit" onchange='this.form.submit()'>
<option value="selminutes" #(selectedMinutes)#::selected="selected"#(/selectedMinutes)#>minutes</option>
<option value="selhours" #(selectedHours)#::selected="selected"#(/selectedHours)#>hours</option>
<option value="seldays" #(selectedDays)#::selected="selected"#(/selectedDays)#>days</option>
</select>
</td></tr></table>
<input type="hidden" name="pk" value="#[pk]#" />
<input type="hidden" name="inline" value="#[inline]#" />
<input type="hidden" name="filter" value="#[filter]#" />
<noscript><input type="submit" value="Submit" /></noscript>
</form>
#(/scheduler)#
</td>
#(inline)#<td>#[url]#</td>::#(/inline)#
</tr>
#{/list}#
</fieldset>
</table>
</p>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="deletefromindex" value="Delete selected references from the search index" />
<input type="submit" name="deleterows" value="Delete selected entries from the list" />
</p>
</form>
#(/showtable)#
</div>
#%env/templates/footer.template%#
</body>
</html>

@@ -72,14 +72,6 @@
<td>RAM Cache</td>
<td>Description</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>URLs in RAM buffer:</td>
<td align="center">#[urlCacheSize]#</td>
<td>
This is the size of the URL write buffer. Its purpose is to buffer incoming URLs
in case of search result transmission and during DHT transfer.
</td>
</tr>
<tr valign="top" class="TableCellDark">
<td>Words in RAM cache:<br />(Size in KBytes)</td>
<td>#[wordCacheSize]#<br />(#[wordCacheSizeKBytes]# KB)</td>

@@ -299,7 +299,6 @@ public class PerformanceQueues_p {
prop.put("minimumGlobalDelta", sb.crawlQueues.noticeURL.getMinimumGlobalDelta());
// table cache settings
prop.putNum("urlCacheSize", indexSegment.urlMetadata().writeCacheSize());
prop.putNum("wordCacheSize", indexSegment.termIndex().getBufferSize());
prop.putNum("wordCacheSizeKBytes", indexSegment.termIndex().getBufferSizeBytes()/1024);
prop.putNum("maxURLinCache", indexSegment.termIndex().getBufferMaxReferences());

@@ -24,7 +24,6 @@
</Task>
#{/table}#</Tasks>
<Cache>
<urlCacheSize>#[urlCacheSize]#</urlCacheSize>
<wordCacheSize>#[wordCacheSize]#</wordCacheSize>
<maxURLinCache>#[maxURLinCache]#</maxURLinCache>
<maxAgeOfCache>#[maxAgeOfCache]#</maxAgeOfCache>

@@ -5,6 +5,5 @@
<li><a href="/BlacklistCleaner_p.html" class="MenuItemLink lock">Blacklist Cleaner</a></li>
<li><a href="/BlacklistTest_p.html" class="MenuItemLink lock">Blacklist Test</a></li>
<li><a href="/BlacklistImpExp_p.html" class="MenuItemLink lock">Import/Export</a></li>
<li><a href="/IndexCleaner_p.html" class="MenuItemLink lock">Index Cleaner</a></li>
</ul>
</div>

@@ -1,564 +0,0 @@
// URLAnalysis.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 24.02.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.data;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.index.HandleMap;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.ReferenceContainerArray;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.index.MetadataRepository;
import net.yacy.search.index.Segment;
import net.yacy.search.index.MetadataRepository.Export;
public class URLAnalysis {
private static final Pattern patternMinus = Pattern.compile("-");
/**
* processes to analyse URL lists
*/
private static DigestURI poison = null;
static {
try {
poison = new DigestURI("http://poison.org/poison");
} catch (final MalformedURLException e) {
poison = null;
}
}
public static class splitter extends Thread {
private final ArrayBlockingQueue<DigestURI> in;
private final ConcurrentHashMap<String, Integer> out;
public splitter(final ArrayBlockingQueue<DigestURI> in, final ConcurrentHashMap<String, Integer> out) {
this.in = in;
this.out = out;
}
@Override
public void run() {
try {
DigestURI url;
final Pattern p = Pattern.compile("~|\\(|\\)|\\+|-|@|:|%|\\.|;|_");
while (true) {
try {
url = this.in.take();
if (url == poison) break;
update(patternMinus.matcher(url.getHost()).replaceAll("\\.").split("\\."));
update(p.matcher(url.getPath()).replaceAll("/").split("/"));
} catch (final InterruptedException e) {
Log.logException(e);
}
}
} catch (final Exception e) {
Log.logException(e);
}
}
private void update(final String[] s) {
Integer c;
for (final String t: s) {
if (t.isEmpty()) continue;
c = this.out.get(t);
this.out.put(t, (c == null) ? 1 : c.intValue() + 1);
}
}
}
public static void cleanup(final ConcurrentHashMap<String, Integer> stat) {
Map.Entry<String, Integer> entry;
int c, low = Integer.MAX_VALUE;
Iterator<Map.Entry<String, Integer>> i = stat.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
c = entry.getValue().intValue();
if (c == 1) {
i.remove();
} else {
if (c < low) low = c;
}
}
i = stat.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
c = entry.getValue().intValue();
if (c == low) {
i.remove();
}
}
Runtime.getRuntime().gc();
}
public static void genstat(final String urlfile) {
final boolean gz = urlfile.endsWith(".gz");
final String analysis = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".stats.gz" : urlfile + ".stats";
final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);
// start threads
final ArrayBlockingQueue<DigestURI> in = new ArrayBlockingQueue<DigestURI>(1000);
final ConcurrentHashMap<String, Integer> out = new ConcurrentHashMap<String, Integer>();
for (int i = 0, available = Runtime.getRuntime().availableProcessors(); i < available; i++) new splitter(in, out).start();
final splitter spl = new splitter(in, out);
spl.start();
// put urls in queue
final File infile = new File(urlfile);
final File outfile = new File(analysis);
BufferedReader reader = null;
long time = System.currentTimeMillis();
final long start = time;
int count = 0;
System.out.println("start processing");
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
if (gz) is = new GZIPInputStream(is);
reader = new BufferedReader(new InputStreamReader(is));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
try {
final DigestURI url = new DigestURI(line);
in.put(url);
} catch (final InterruptedException e) {
Log.logException(e);
} catch (final MalformedURLException e) {
continue;
}
}
count++;
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
if (MemoryControl.available() < cleanuplimit) {
System.out.println("starting cleanup, " + out.size() + " entries in statistic");
cleanup(out);
System.out.println("finished cleanup, " + out.size() + " entries in statistic left, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
}
reader.close();
} catch (final IOException e) {
Log.logException(e);
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// stop threads
System.out.println("stopping threads");
for (int i = 0, available = Runtime.getRuntime().availableProcessors() + 1; i < available; i++) try {
in.put(poison);
} catch (final InterruptedException e) {
Log.logException(e);
}
try {
spl.join();
} catch (final InterruptedException e1) {
Log.logException(e1);
}
// generate statistics
System.out.println("start processing results");
final TreeMap<String, Integer> results = new TreeMap<String, Integer>();
count = 0;
Map.Entry<String, Integer> entry;
final Iterator<Map.Entry<String, Integer>> i = out.entrySet().iterator();
while (i.hasNext()) {
entry = i.next();
results.put(num(entry.getValue().intValue() * (entry.getKey().length() - 1)) + " - " + entry.getKey(), entry.getValue());
count++;
i.remove(); // free memory
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
// write statistics
System.out.println("start writing results");
try {
OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
if (gz) os = new GZIPOutputStream(os);
count = 0;
for (final Map.Entry<String, Integer> e: results.entrySet()) {
os.write(UTF8.getBytes(e.getKey()));
os.write(new byte[]{'\t'});
os.write(UTF8.getBytes(Integer.toString(e.getValue())));
os.write(new byte[]{'\n'});
count++;
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("wrote " + count + " lines.");
}
}
os.close();
} catch (final IOException e) {
Log.logException(e);
}
System.out.println("finished");
}
public static void genhost(final String urlfile) {
final boolean gz = urlfile.endsWith(".gz");
final String trunk = (gz) ? urlfile.substring(0, urlfile.length() - 3) + ".host" : urlfile + ".host";
final HashSet<String> hosts = new HashSet<String>();
final File infile = new File(urlfile);
BufferedReader reader = null;
long time = System.currentTimeMillis();
final long start = time;
int count = 0;
System.out.println("start processing");
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
if (gz) is = new GZIPInputStream(is);
reader = new BufferedReader(new InputStreamReader(is));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
try {
final DigestURI url = new DigestURI(line);
hosts.add(url.getHost());
} catch (final MalformedURLException e) {
continue;
}
}
count++;
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
}
}
reader.close();
} catch (final IOException e) {
Log.logException(e);
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// copy everything into a TreeSet to order it
System.out.println("start processing results");
final TreeSet<String> results = new TreeSet<String>();
count = 0;
final Iterator<String> i = hosts.iterator();
while (i.hasNext()) {
results.add(i.next());
count++;
i.remove(); // free memory
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " results, " + (MemoryControl.available() / 1024 / 1024) + " mb left");
}
}
// write hosts
writeSet(trunk, gz, results);
System.out.println("finished");
}
private static void writeSet(final String trunk, final boolean gz, final Set<String> set) {
// write hosts
System.out.println("start writing results");
final File outfile = new File(trunk + ((gz) ? ".gz" : ""));
long time = System.currentTimeMillis();
try {
OutputStream os = new BufferedOutputStream(new FileOutputStream(outfile));
if (gz) os = new GZIPOutputStream(os);
int count = 0;
for (final String h: set) {
os.write(UTF8.getBytes(h));
os.write(new byte[]{'\n'});
count++;
if (System.currentTimeMillis() - time > 10000) {
time = System.currentTimeMillis();
System.out.println("wrote " + count + " lines.");
}
}
os.close();
} catch (final IOException e) {
Log.logException(e);
}
System.out.println("finished writing results");
}
public static void sortsplit(final String urlfile) {
final boolean gz = urlfile.endsWith(".gz");
final String trunk = ((gz) ? urlfile.substring(0, urlfile.length() - 3) : urlfile) + ".sort";
final File infile = new File(urlfile);
final TreeSet<String> urls = new TreeSet<String>();
BufferedReader reader = null;
long time = System.currentTimeMillis();
final long start = time;
int count = 0;
int filecount = 0;
final long cleanuplimit = Math.max(50 * 1024 * 1024, MemoryControl.available() / 8);
System.out.println("start processing");
try {
InputStream is = new BufferedInputStream(new FileInputStream(infile));
if (gz) is = new GZIPInputStream(is);
reader = new BufferedReader(new InputStreamReader(is));
String line;
while ((line = reader.readLine()) != null) {
line = line.trim();
if (line.length() > 0) {
try {
final DigestURI url = new DigestURI(line);
urls.add(url.toNormalform(true, true));
} catch (final MalformedURLException e) {
continue;
}
}
count++;
if (System.currentTimeMillis() - time > 1000) {
time = System.currentTimeMillis();
System.out.println("processed " + count + " urls, " + (MemoryControl.available() / 1024 / 1024) + " mb left, " + count * 1000L / (time - start) + " url/second");
}
if (MemoryControl.available() < cleanuplimit) {
writeSet(trunk + "." + filecount, gz, urls);
filecount++;
urls.clear();
Runtime.getRuntime().gc();
}
}
reader.close();
} catch (final IOException e) {
Log.logException(e);
} finally {
if (reader != null) try { reader.close(); } catch (final Exception e) {}
}
// write hosts
writeSet(trunk + "." + filecount, gz, urls);
System.out.println("finished");
}
public static void incell(final File cellPath, final String statisticPath) {
try {
final HandleMap idx = ReferenceContainerArray.referenceHashes(
cellPath,
Segment.wordReferenceFactory,
Base64Order.enhancedCoder,
WordReferenceRow.urlEntryRow);
System.out.println("INDEX REFERENCE COLLECTION starting dump of statistics");
idx.dump(new File(statisticPath));
System.out.println("INDEX REFERENCE COLLECTION finished dump, wrote " + idx.size() + " entries to " + statisticPath);
idx.close();
} catch (final Exception e) {
Log.logException(e);
}
}
public static int diffurlcol(final String metadataPath, final String statisticFile, final String diffFile) throws IOException, RowSpaceExceededException {
System.out.println("INDEX DIFF URL-COL startup");
final HandleMap idx = new HandleMap(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 4, new File(statisticFile));
final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 1000000);
System.out.println("INDEX DIFF URL-COL loaded dump, starting diff");
final long start = System.currentTimeMillis();
long update = start - 7000;
int count = 0;
for (final byte[] refhash: mr) {
if (idx.get(refhash) == -1) {
// the key exists as urlhash in the URL database, but not in the collection as referenced urlhash
hs.put(refhash);
}
count++;
if (System.currentTimeMillis() - update > 10000) {
System.out.println("INDEX DIFF URL-COL running, checked " + count + ", found " + hs.size() + " missing references so far, " + (((System.currentTimeMillis() - start) * (mr.size() - count) / count) / 60000) + " minutes remaining");
update = System.currentTimeMillis();
}
}
idx.close();
mr.close();
System.out.println("INDEX DIFF URL-COL finished diff, starting dump to " + diffFile);
count = hs.dump(new File(diffFile));
System.out.println("INDEX DIFF URL-COL finished dump, wrote " + count + " references that occur in the URL-DB, but not in the collection-dump");
return count;
}
public static void export(final String metadataPath, final int format, final String export, final String diffFile) throws IOException, RowSpaceExceededException {
// format: 0=text, 1=html, 2=rss/xml
System.out.println("URL EXPORT startup");
final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final HandleSet hs = (diffFile == null) ? null : new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
System.out.println("URL EXPORT loaded dump, starting export");
final Export e = mr.export(new File(export), ".*", hs, format, false);
try {
e.join();
} catch (final InterruptedException e1) {
Log.logException(e1);
}
System.out.println("URL EXPORT finished export, wrote " + ((hs == null) ? mr.size() : hs.size()) + " entries");
}
public static void delete(final String metadataPath, final String diffFile) throws IOException, RowSpaceExceededException {
System.out.println("URL DELETE startup");
final MetadataRepository mr = new MetadataRepository(new File(metadataPath));
mr.connectUrlDb(Segment.UrlDbName, false, false);
final int mrSize = mr.size();
final HandleSet hs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, new File(diffFile));
System.out.println("URL DELETE loaded dump, starting deletion of " + hs.size() + " entries from " + mrSize);
for (final byte[] refhash: hs) {
mr.remove(refhash);
}
System.out.println("URL DELETE finished deletions, " + mr.size() + " entries left in URL database");
}
public static void main(final String[] args) {
if (args[0].equals("-stat") && args.length >= 2) {
// generate a statistics about common words in file, store to <file>.stat
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -stat DATA/EXPORT/urls1.txt.gz
for (int i = 1; i < args.length; i++) genstat(args[i]);
} else if (args[0].equals("-host") && args.length >= 2) {
// generate a file <file>.host containing only the hosts of the urls
for (int i = 1; i < args.length; i++) genhost(args[i]);
} else if (args[0].equals("-sort") && args.length >= 2) {
// generate file <file>.x.sort with sorted lists and split the file in smaller pieces
for (int i = 1; i < args.length; i++) sortsplit(args[i]);
} else if (args[0].equals("-incell") && args.length >= 2) {
// generate a dump of all referenced URL hashes from a given RICELL
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incell DATA/INDEX/freeworld/TEXT/RICELL used.dump
incell(new File(args[1]), args[2]);
} else if (args[0].equals("-diffurlcol") && args.length >= 3) {
// make a diff-file that contains hashes from the url database that do not occur in the collection reference dump
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT/METADATA used.dump diffurlcol.dump
try {
diffurlcol(args[1], args[2], args[3]);
} catch (final Exception e) {
Log.logException(e);
}
} else if (args[0].equals("-export") && args.length >= 4) {
// export a url-list file
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
final int format = (args[2].equals("xml")) ? 2 : (args[2].equals("html")) ? 1 : 0;
try {
export(args[1], format, args[3], (args.length >= 5) ? args[4] : null);
} catch (final Exception e) {
Log.logException(e);
}
} else if (args[0].equals("-delete") && args.length >= 3) {
// delete from URLs as given by urlreference diff dump
// example:
// java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump
// instead of 'xml' (which is in fact a rss), the format can also be 'text' and 'html'
try {
delete(args[1], args[2]);
} catch (final Exception e) {
Log.logException(e);
}
} else {
System.out.println("usage:");
System.out.println();
System.out.println("-stat <file> ");
System.out.println(" generate a statistics about common words in file, store to <file>.stat");
System.out.println();
System.out.println("-host <file>");
System.out.println(" generate a file <file>.host containing only the hosts of the urls");
System.out.println();
System.out.println("-sort <file>");
System.out.println(" generate file <file>.x.sort with sorted lists and split the file in smaller pieces");
System.out.println();
System.out.println("-incollection <path-to-RICOLLECTION> <file>");
System.out.println(" generate a dump of all referenced URL hashes");
System.out.println();
System.out.println("-diffurlcol <path-to-URL-DB> <dump-from-incollection> <diff-dump>");
System.out.println(" find URLs that occur in url-db but not in collections");
System.out.println();
System.out.println("-export <path-to-URL-DB> <format text|html|xml> <export-file> <diff-dump>");
System.out.println(" export urls to file. the last argument can be omitted, then all urls are exported");
System.out.println();
System.out.println("-delete <path-to-URL-DB> <diff-dump>");
System.out.println(" delete all urls that are listed in the diff-dump from the url-db");
System.out.println();
System.out.println("to do a complete clean-up of the url database, start the following:");
System.out.println();
System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -incollection DATA/INDEX/freeworld/TEXT/RICOLLECTION used.dump");
System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -diffurlcol DATA/INDEX/freeworld/TEXT used.dump diffurlcol.dump");
System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -export DATA/INDEX/freeworld/TEXT xml urls.xml diffurlcol.dump");
System.out.println("java -Xmx1000m -cp classes de.anomic.data.URLAnalysis -delete DATA/INDEX/freeworld/TEXT diffurlcol.dump");
System.out.println();
}
System.exit(0); // kill remaining threads
}
private static final String num(final int i) {
final StringBuilder s = new StringBuilder(Integer.toString(i));
while (s.length() < 9) s.insert(0, "0");
return s.toString();
}
}

@@ -33,21 +33,17 @@ import java.io.IOException;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.services.federated.solr.DoubleSolrConnector;
import net.yacy.cora.services.federated.solr.SolrConnector;
import net.yacy.cora.sorting.ConcurrentScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.document.parser.html.CharacterCoding;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadata;
@@ -61,15 +57,11 @@ import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.table.SplitTable;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.Blacklist;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.solr.EmbeddedSolrConnector;
import org.apache.lucene.util.Version;
import de.anomic.crawler.CrawlStacker;
public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
// class objects
@@ -186,26 +178,20 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
this.solr.close();
}
public int writeCacheSize() {
if (this.urlIndexFile != null && this.urlIndexFile instanceof SplitTable) return ((SplitTable) this.urlIndexFile).writeBufferSize();
if (this.urlIndexFile != null && this.urlIndexFile instanceof Cache) return ((Cache) this.urlIndexFile).writeBufferSize();
return 0;
}
/**
* generates an plasmaLURLEntry using the url hash
* if the url cannot be found, this returns null
* @param obrwi
* @return
*/
public URIMetadata load(final WeakPriorityBlockingQueue.Element<WordReferenceVars> obrwi) {
if (obrwi == null) return null; // all time was already wasted in takeRWI to get another element
final byte[] urlHash = obrwi.getElement().urlhash();
public URIMetadata load(WordReferenceVars wre, long weight) {
if (wre == null) return null; // all time was already wasted in takeRWI to get another element
final byte[] urlHash = wre.urlhash();
if (urlHash == null) return null;
if (this.urlIndexFile != null) try {
final Row.Entry entry = this.urlIndexFile.get(urlHash, false);
if (entry == null) return null;
return new URIMetadataRow(entry, obrwi.getElement(), obrwi.getWeight());
return new URIMetadataRow(entry, wre, weight);
} catch (final IOException e) {
Log.logException(e);
}
@@ -280,29 +266,25 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
public boolean exists(final byte[] urlHash) {
if (urlHash == null) return false;
if (this.urlIndexFile != null && this.urlIndexFile.has(urlHash)) return true;
try {
if (this.solr.exists(ASCII.String(urlHash))) return true;
} catch (final Throwable e) {
Log.logException(e);
}
if (this.urlIndexFile == null) return false; // case may happen during shutdown
return this.urlIndexFile.has(urlHash);
return false;
}
public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
@Override
public Iterator<byte[]> iterator() {
try {
return this.urlIndexFile.keys(up, firstKey);
return this.urlIndexFile.keys(true, null);
} catch (final IOException e) {
Log.logException(e);
return null;
}
}
@Override
public Iterator<byte[]> iterator() {
return keys(true, null);
}
public CloneableIterator<URIMetadata> entries() throws IOException {
// enumerates entry elements
return new kiter();
@@ -367,186 +349,6 @@ public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]>
}
}
/**
* Uses an Iteration over urlHash.db to detect malformed URL-Entries.
* Damaged URL-Entries will be marked in a HashSet and removed at the end of the function.
*
* @param proxyConfig
*/
public void deadlinkCleaner() {
final Log log = new Log("URLDBCLEANUP");
final HashSet<String> damagedURLS = new HashSet<String>();
try {
final Iterator<URIMetadata> eiter = entries(true, null);
int iteratorCount = 0;
while (eiter.hasNext()) try {
eiter.next();
iteratorCount++;
} catch (final RuntimeException e) {
if(e.getMessage() != null) {
final String m = e.getMessage();
damagedURLS.add(m.substring(m.length() - 12));
} else {
log.logSevere("RuntimeException:", e);
}
}
log.logInfo("URLs vorher: " + this.urlIndexFile.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
final HTTPClient client = new HTTPClient();
final Iterator<String> eiter2 = damagedURLS.iterator();
byte[] urlHashBytes;
while (eiter2.hasNext()) {
urlHashBytes = ASCII.getBytes(eiter2.next());
// trying to fix the invalid URL
String oldUrlStr = null;
try {
// getting the url data as byte array
final Row.Entry entry = this.urlIndexFile.get(urlHashBytes, true);
// getting the wrong url string
oldUrlStr = entry.getColUTF8(1).trim();
int pos = -1;
if ((pos = oldUrlStr.indexOf("://",0)) != -1) {
// trying to correct the url
final String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
final DigestURI newUrl = new DigestURI(newUrlStr);
if (client.HEADResponse(newUrl.toString()) != null
&& client.getHttpResponse().getStatusLine().getStatusCode() == 200) {
entry.setCol(1, UTF8.getBytes(newUrl.toString()));
this.urlIndexFile.put(entry);
if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
remove(urlHashBytes);
if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + (client.getHttpResponse() == null ? "null" : client.getHttpResponse().getStatusLine()));
}
}
} catch (final Exception e) {
remove(urlHashBytes);
if (log.isInfo()) log.logInfo("UrlDB-Entry with urlHash '" + ASCII.String(urlHashBytes) + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
}
}
log.logInfo("URLs nachher: " + size() + " kaputte URLs: " + damagedURLS.size());
} catch (final IOException e) {
log.logSevere("IOException", e);
}
}
public BlacklistCleaner getBlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) {
return new BlacklistCleaner(blacklist, crawlStacker);
}
public class BlacklistCleaner extends Thread {
private boolean run = true;
private boolean pause;
public int blacklistedUrls = 0;
public int totalSearchedUrls = 1;
public String lastBlacklistedUrl = "";
public String lastBlacklistedHash = "";
public String lastUrl = "";
public String lastHash = "";
private final Blacklist blacklist;
private final CrawlStacker crawlStacker;
public BlacklistCleaner(final Blacklist blacklist, final CrawlStacker crawlStacker) {
this.blacklist = blacklist;
this.crawlStacker = crawlStacker;
}
@Override
public void run() {
try {
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
final Iterator<URIMetadata> eiter = entries(true, null);
while (eiter.hasNext() && this.run) {
synchronized (this) {
if (this.pause) {
try {
this.wait();
} catch (final InterruptedException e) {
Log.logWarning("URLDBCLEANER", "InterruptedException", e);
this.run = false;
return;
}
}
}
final URIMetadata entry = eiter.next();
if (entry == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", "entry == null");
} else if (entry.hash() == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + "hash == null");
} else {
this.totalSearchedUrls++;
if (entry.url() == null) {
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + "URL == null");
remove(entry.hash());
continue;
}
if (this.blacklist.isListed(BlacklistType.CRAWLER, entry) ||
this.blacklist.isListed(BlacklistType.DHT, entry) ||
(this.crawlStacker.urlInAcceptedDomain(entry.url()) != null)) {
this.lastBlacklistedUrl = entry.url().toNormalform(true, true);
this.lastBlacklistedHash = ASCII.String(entry.hash());
if (Log.isFine("URLDBCLEANER")) Log.logFine("URLDBCLEANER", ++this.blacklistedUrls + " blacklisted (" + ((double) this.blacklistedUrls / this.totalSearchedUrls) * 100 + "%): " + ASCII.String(entry.hash()) + " " + entry.url().toNormalform(false, true));
remove(entry.hash());
if (this.blacklistedUrls % 100 == 0) {
Log.logInfo("URLDBCLEANER", "Deleted " + this.blacklistedUrls + " URLs until now. Last deleted URL-Hash: " + this.lastBlacklistedUrl);
}
}
this.lastUrl = entry.url().toNormalform(true, true);
this.lastHash = ASCII.String(entry.hash());
}
}
} catch (final RuntimeException e) {
if (e.getMessage() != null && e.getMessage().indexOf("not found in LURL",0) != -1) {
Log.logWarning("URLDBCLEANER", "urlHash not found in LURL", e);
}
else {
Log.logWarning("URLDBCLEANER", "RuntimeException", e);
this.run = false;
}
} catch (final IOException e) {
Log.logException(e);
this.run = false;
} catch (final Exception e) {
Log.logException(e);
this.run = false;
}
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread stopped");
}
public void abort() {
synchronized(this) {
this.run = false;
notifyAll();
}
}
public void pause() {
synchronized(this) {
if (!this.pause) {
this.pause = true;
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread paused");
}
}
}
public void endPause() {
synchronized(this) {
if (this.pause) {
this.pause = false;
notifyAll();
Log.logInfo("URLDBCLEANER", "UrldbCleaner-Thread resumed");
}
}
}
}
// export methods
public Export export(final File f, final String filter, final HandleSet set, final int format, final boolean dom) {
if ((this.exportthread != null) && (this.exportthread.isAlive())) {

@@ -33,7 +33,6 @@ import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeSet;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@@ -55,7 +54,6 @@ import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceFactory;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.data.word.WordReferenceVars;
import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
@@ -66,7 +64,6 @@ import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.LookAheadIterator;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.search.query.RWIProcess;
@@ -252,7 +249,7 @@ public class Segment {
* @param host
* @return an iterator for all url hashes that belong to a specific host
*/
public Iterator<byte[]> hostSelector(String host) {
private Iterator<byte[]> hostSelector(String host) {
String hh = DigestURI.hosthash(host);
final HandleSet ref = new HandleSet(12, Base64Order.enhancedCoder, 100);
for (byte[] b: this.urlMetadata) {
@@ -551,12 +548,6 @@ public class Segment {
return newEntry;
}
// method for index deletion
public int removeAllUrlReferences(final DigestURI url, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {
return removeAllUrlReferences(url.hash(), loader, cacheStrategy);
}
public void removeAllUrlReferences(final HandleSet urls, final LoaderDispatcher loader, final CacheStrategy cacheStrategy) {
for (final byte[] urlhash: urls) removeAllUrlReferences(urlhash, loader, cacheStrategy);
}
@@ -604,129 +595,4 @@ public class Segment {
}
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
public synchronized ReferenceCleaner getReferenceCleaner(final byte[] startHash) {
return new ReferenceCleaner(startHash);
}
public class ReferenceCleaner extends Thread {
private final byte[] startHash;
private boolean run = true;
private boolean pause = false;
public int rwiCountAtStart = 0;
public byte[] wordHashNow = null;
public byte[] lastWordHash = null;
public int lastDeletionCounter = 0;
public ReferenceCleaner(final byte[] startHash) {
this.startHash = startHash;
this.rwiCountAtStart = termIndex().sizesMax();
}
@Override
public void run() {
Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
ReferenceContainer<WordReference> container = null;
WordReferenceVars entry = null;
DigestURI url = null;
final HandleSet urlHashs = new HandleSet(URIMetadataRow.rowdef.primaryKeyLength, URIMetadataRow.rowdef.objectOrder, 0);
try {
Iterator<ReferenceContainer<WordReference>> indexContainerIterator = Segment.this.termIndex.referenceContainer(this.startHash, false, false, 100, false).iterator();
while (indexContainerIterator.hasNext() && this.run) {
waiter();
container = indexContainerIterator.next();
final Iterator<WordReference> containerIterator = container.entries();
this.wordHashNow = container.getTermHash();
while (containerIterator.hasNext() && this.run) {
waiter();
entry = new WordReferenceVars(containerIterator.next());
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
final URIMetadata ue = Segment.this.urlMetadata.load(entry.urlhash());
if (ue == null) {
urlHashs.put(entry.urlhash());
} else {
url = ue.url();
if (url == null || Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, url)) {
urlHashs.put(entry.urlhash());
}
}
}
if (!urlHashs.isEmpty()) try {
final int removed = Segment.this.termIndex.remove(container.getTermHash(), urlHashs);
Log.logFine("INDEXCLEANER", ASCII.String(container.getTermHash()) + ": " + removed + " of " + container.size() + " URL-entries deleted");
this.lastWordHash = container.getTermHash();
this.lastDeletionCounter = urlHashs.size();
urlHashs.clear();
} catch (final IOException e) {
Log.logException(e);
}
if (!containerIterator.hasNext()) {
// We may not be finished yet, try to get the next chunk of wordHashes
final TreeSet<ReferenceContainer<WordReference>> containers = Segment.this.termIndex.referenceContainer(container.getTermHash(), false, false, 100, false);
indexContainerIterator = containers.iterator();
// Make sure we don't get the same wordhash twice, but don't skip a word
if ((indexContainerIterator.hasNext()) && (!container.getTermHash().equals(indexContainerIterator.next().getTermHash()))) {
indexContainerIterator = containers.iterator();
}
}
}
} catch (final IOException e) {
Log.logException(e);
} catch (final Exception e) {
Log.logException(e);
}
Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
}
public void abort() {
synchronized(this) {
this.run = false;
notifyAll();
}
}
public void pause() {
synchronized (this) {
if (!this.pause) {
this.pause = true;
Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused");
}
}
}
public void endPause() {
synchronized (this) {
if (this.pause) {
this.pause = false;
notifyAll();
Log.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed");
}
}
}
public void waiter() {
synchronized (this) {
if (this.pause) {
try {
this.wait();
} catch (final InterruptedException e) {
this.run = false;
return;
}
}
}
}
public int rwisize() {
return termIndex().sizesMax();
}
public int urlsize() {
return urlMetadata().size();
}
}
}

@@ -628,7 +628,7 @@ public final class RWIProcess extends Thread
if ( obrwi == null ) {
return null; // all time was already wasted in takeRWI to get another element
}
final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi);
final URIMetadata page = this.query.getSegment().urlMetadata().load(obrwi.getElement(), obrwi.getWeight());
if ( page == null ) {
try {
this.misses.putUnique(obrwi.getElement().urlhash());

@@ -24,7 +24,6 @@
package net.yacy;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
@@ -38,54 +37,33 @@ import java.io.PrintWriter;
import java.io.RandomAccessFile;
import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.Semaphore;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.sorting.Array;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.gui.YaCyApp;
import net.yacy.gui.framework.Browser;
import net.yacy.kelondro.blob.MapDataMining;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.OS;
import net.yacy.peers.SeedDB;
import net.yacy.peers.operation.yacyBuildProperties;
import net.yacy.peers.operation.yacyRelease;
import net.yacy.peers.operation.yacyVersion;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.MetadataRepository;
import net.yacy.search.index.Segment;
import com.google.common.io.Files;
import de.anomic.data.Translator;
import de.anomic.http.server.HTTPDemon;
import de.anomic.server.serverCore;
import de.anomic.tools.enumerateFiles;
/**
* This is the main class of YaCy. Several threads are started from here:
@@ -595,346 +573,6 @@ public final class yacy {
Log.logConfig("COMMAND-STEERING", "SUCCESSFULLY FINISHED COMMAND: " + processdescription);
}
/**
* This method gets all found words and outputs a statistic about the score
* of the words. The output of this method can be used to create stop-word
* lists. This method will be called if you start yacy with the argument
* -genwordstat.
* FIXME: How can stop-word list be created from this output? What type of
* score is output?
*
* @param homePath Root-Path where all the information is to be found.
*/
private static void genWordstat(final File homePath) {
// start up
System.out.println(copyright);
System.out.println(hline);
// load words
Log.logInfo("GEN-WORDSTAT", "loading words...");
final TreeMap<byte[], String> words = loadWordMap(new File(homePath, "yacy.words"));
// find all hashes
Log.logInfo("GEN-WORDSTAT", "searching all word-hash databases...");
final File dbRoot = new File(homePath, "DATA/INDEX/freeworld/");
final enumerateFiles ef = new enumerateFiles(new File(dbRoot, "WORDS"), true, false, true, true);
File f;
byte[] h;
final ScoreMap<byte[]> hs = new OrderedScoreMap<byte[]>(Base64Order.standardCoder);
while (ef.hasMoreElements()) {
f = ef.nextElement();
h = f.getName().substring(0, Word.commonHashLength).getBytes();
hs.inc(h, (int) f.length());
}
// list the hashes in reverse order
Log.logInfo("GEN-WORDSTAT", "listing words in reverse size order...");
String w;
final Iterator<byte[]> i = hs.keys(false);
while (i.hasNext()) {
h = i.next();
w = words.get(h);
if (w == null) System.out.print("# " + h); else System.out.print(w);
System.out.println(" - " + hs.get(h));
}
// finished
Log.logConfig("GEN-WORDSTAT", "FINISHED");
}
/**
* @param homePath path to the YaCy directory
* @param networkName
*/
public static void minimizeUrlDB(final File dataHome, final File appHome, final String networkName) {
// run with "java -classpath classes yacy -minimizeUrlDB"
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
final File indexRoot2 = new File(dataHome, "DATA/INDEX2");
final Log log = new Log("URL-CLEANUP");
try {
log.logInfo("STARTING URL CLEANUP");
// db containing all currently loades urls
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexPrimaryRoot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
// db used to hold all neede urls
final MetadataRepository minimizedUrlDB = new MetadataRepository(new File(new File(indexRoot2, networkName), "TEXT"));
minimizedUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
final int cacheMem = (int)(MemoryControl.maxMemory() - MemoryControl.total());
if (cacheMem < 2048000) throw new OutOfMemoryError("Not enough memory available to start clean up.");
final Segment wordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
wordIndex.connectRWI(10000, Integer.MAX_VALUE);
wordIndex.connectUrlDb(false, false);
final Iterator<ReferenceContainer<WordReference>> indexContainerIterator = wordIndex.termIndex().referenceContainerIterator("AAAAAAAAAAAA".getBytes(), false, false);
long urlCounter = 0, wordCounter = 0;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = 0;
String wordChunkStartHash = "AAAAAAAAAAAA", wordChunkEndHash;
while (indexContainerIterator.hasNext()) {
ReferenceContainer<WordReference> wordIdxContainer = null;
try {
wordCounter++;
wordIdxContainer = indexContainerIterator.next();
// the combined container will fit, read the container
final Iterator<WordReference> wordIdxEntries = wordIdxContainer.entries();
Reference iEntry;
while (wordIdxEntries.hasNext()) {
iEntry = wordIdxEntries.next();
final byte[] urlHash = iEntry.urlhash();
if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
final URIMetadata urlEntry = currentUrlDB.load(urlHash);
urlCounter++;
minimizedUrlDB.store(urlEntry);
if (urlCounter % 500 == 0) {
log.logInfo(urlCounter + " URLs found so far.");
}
} catch (final IOException e) {}
}
if (wordCounter%500 == 0) {
wordChunkEndHash = ASCII.String(wordIdxContainer.getTermHash());
wordChunkEnd = System.currentTimeMillis();
final long duration = wordChunkEnd - wordChunkStart;
log.logInfo(wordCounter + " words scanned " +
"[" + wordChunkStartHash + " .. " + wordChunkEndHash + "]\n" +
"Duration: "+ 500*1000/duration + " words/s" +
" | Free memory: " + MemoryControl.free() +
" | Total memory: " + MemoryControl.total());
wordChunkStart = wordChunkEnd;
wordChunkStartHash = wordChunkEndHash;
}
// we have read all elements, now we can close it
wordIdxContainer = null;
} catch (final Exception e) {
log.logSevere("Exception", e);
} finally {
if (wordIdxContainer != null) try { wordIdxContainer = null; } catch (final Exception e) {}
}
}
log.logInfo("current LURL DB contains " + currentUrlDB.size() + " entries.");
log.logInfo("mimimized LURL DB contains " + minimizedUrlDB.size() + " entries.");
currentUrlDB.close();
minimizedUrlDB.close();
wordIndex.close();
// TODO: rename the mimimized UrlDB to the name of the previous UrlDB
log.logInfo("FINISHED URL CLEANUP, WAIT FOR DUMP");
log.logInfo("You can now backup your old URL DB and rename minimized/urlHash.db to urlHash.db");
log.logInfo("TERMINATED URL CLEANUP");
} catch (final Exception e) {
log.logSevere("Exception: " + e.getMessage(), e);
} catch (final Error e) {
log.logSevere("Error: " + e.getMessage(), e);
}
}
/**
* Reads all words from the given file and creates a treemap, where key is
* the plasma word hash and value is the word itself.
*
* @param wordlist File where the words are stored.
* @return HashMap with the hash-word - relation.
*/
private static TreeMap<byte[], String> loadWordMap(final File wordlist) {
// returns a hash-word - Relation
final TreeMap<byte[], String> wordmap = new TreeMap<byte[], String>(Base64Order.enhancedCoder);
try {
String word;
final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
while ((word = br.readLine()) != null) wordmap.put(Word.word2hash(word), word);
br.close();
} catch (final IOException e) {}
return wordmap;
}
/**
* Cleans a wordlist in a file according to the length of the words. The
* file with the given filename is read and then only the words in the given
* length-range are written back to the file.
*
* @param wordlist Name of the file the words are stored in.
* @param minlength Minimal needed length for each word to be stored.
* @param maxlength Maximal allowed length for each word to be stored.
*/
private static void cleanwordlist(final String wordlist, final int minlength, final int maxlength) {
// start up
System.out.println(copyright);
System.out.println(hline);
Log.logConfig("CLEAN-WORDLIST", "START");
String word;
final TreeSet<String> wordset = new TreeSet<String>();
int count = 0;
try {
final BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
final String seps = "' .,:/-&";
while ((word = br.readLine()) != null) {
word = word.toLowerCase().trim();
for (int i = 0; i < seps.length(); i++) {
if (word.indexOf(seps.charAt(i)) >= 0) word = word.substring(0, word.indexOf(seps.charAt(i)));
}
if ((word.length() >= minlength) && (word.length() <= maxlength)) wordset.add(word);
count++;
}
br.close();
if (wordset.size() != count) {
count = count - wordset.size();
final BufferedWriter bw = new BufferedWriter(new PrintWriter(new FileWriter(wordlist)));
while (!wordset.isEmpty()) {
word = wordset.first();
bw.write(word + "\n");
wordset.remove(word);
}
bw.close();
Log.logInfo("CLEAN-WORDLIST", "shrinked wordlist by " + count + " words.");
} else {
Log.logInfo("CLEAN-WORDLIST", "not necessary to change wordlist");
}
} catch (final IOException e) {
Log.logSevere("CLEAN-WORDLIST", "ERROR: " + e.getMessage());
System.exit(-1);
}
// finished
Log.logConfig("CLEAN-WORDLIST", "FINISHED");
}
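/**
 * Returns a copy of the given argument array with <code>count</code> elements
 * removed, starting at index <code>pos</code>.
 *
 * @param args original command-line arguments
 * @param pos index of the first element to remove
 * @param count number of elements to remove
 * @return a new array without the removed elements
 */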
private static String[] shift(final String[] args, final int pos, final int count) {
final String[] newargs = new String[args.length - count];
System.arraycopy(args, 0, newargs, 0, pos);
System.arraycopy(args, pos + count, newargs, pos, args.length - pos - count);
return newargs;
}
/**
 * Iterates over urlHash.db to detect damaged URL entries and removes them
 * (dead-link cleanup).
 *
 * @param dataHome data root where the index is located
 * @param appHome application root, used for logging configuration
 * @param networkName name of the network whose index is cleaned (e.g. "freeworld")
*/
private static void urldbcleanup(final File dataHome, final File appHome, final String networkName) {
final File root = dataHome;
final File indexroot = new File(root, "DATA/INDEX");
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
final MetadataRepository currentUrlDB = new MetadataRepository(new File(new File(indexroot, networkName), "TEXT"));
currentUrlDB.connectUrlDb(Segment.UrlDbName, false, false);
currentUrlDB.deadlinkCleaner();
currentUrlDB.close();
}
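/**
 * Writes all term hashes of the local reverse word index (RWI) to a file,
 * one hash per line.
 *
 * @param dataHome data root where DATA/INDEX is located
 * @param appHome application root, used for logging configuration
 * @param targetName base name of the output file (".txt" or ".zip" is appended)
 * @param resource index selection; only "all" is handled here
 * @param format "zip" for a zipped list, any other value for a plain text file
 */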
private static void RWIHashList(final File dataHome, final File appHome, final String targetName, final String resource, final String format) {
Segment WordIndex = null;
final Log log = new Log("HASHLIST");
final File indexPrimaryRoot = new File(dataHome, "DATA/INDEX");
final String wordChunkStartHash = "AAAAAAAAAAAA";
try {Log.configureLogging(dataHome, appHome, new File(dataHome, "DATA/LOG/yacy.logging"));} catch (final Exception e) {}
log.logInfo("STARTING CREATION OF RWI-HASHLIST");
final File root = dataHome;
try {
Iterator<ReferenceContainer<WordReference>> indexContainerIterator = null;
if (resource.equals("all")) {
WordIndex = new Segment(log, new File(new File(indexPrimaryRoot, "freeworld"), "TEXT"), null);
WordIndex.connectRWI(10000, Integer.MAX_VALUE);
WordIndex.connectUrlDb(false, false);
indexContainerIterator = WordIndex.termIndex().referenceContainerIterator(wordChunkStartHash.getBytes(), false, false);
}
int counter = 0;
ReferenceContainer<WordReference> container = null;
if (format.equals("zip")) {
log.logInfo("Writing Hashlist to ZIP-file: " + targetName + ".zip");
final ZipEntry zipEntry = new ZipEntry(targetName + ".txt");
final File file = new File(root, targetName + ".zip");
final ZipOutputStream bos = new ZipOutputStream(new FileOutputStream(file));
bos.putNextEntry(zipEntry);
if(indexContainerIterator != null) {
while (indexContainerIterator.hasNext()) {
counter++;
container = indexContainerIterator.next();
bos.write(container.getTermHash());
bos.write(serverCore.CRLF);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
}
}
}
bos.flush();
bos.close();
} else {
log.logInfo("Writing Hashlist to TXT-file: " + targetName + ".txt");
final File file = new File(root, targetName + ".txt");
final BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file));
if(indexContainerIterator != null) {
while (indexContainerIterator.hasNext()) {
counter++;
container = indexContainerIterator.next();
bos.write(container.getTermHash());
bos.write(serverCore.CRLF);
if (counter % 500 == 0) {
log.logInfo("Found " + counter + " Hashs until now. Last found Hash: " + ASCII.String(container.getTermHash()));
}
}
}
bos.flush();
bos.close();
}
log.logInfo("Total number of Hashs: " + counter + ". Last found Hash: " + (container == null ? "null" : ASCII.String(container.getTermHash())));
} catch (final IOException e) {
log.logSevere("IOException", e);
}
if (WordIndex != null) {
WordIndex.close();
WordIndex = null;
}
}
/**
 * Scans the peer seed databases (seed.new.db, seed.old.db, seed.pot.db) for
 * entries with truncated (invalid) peer hashes and prints the affected peers
 * to stderr.
 * @param homePath data root containing DATA/INDEX/freeworld/NETWORK
*/
public static void testPeerDB(final File homePath) {
try {
final File yacyDBPath = new File(homePath, "DATA/INDEX/freeworld/NETWORK");
final String[] dbFileNames = {"seed.new.db","seed.old.db","seed.pot.db"};
for (final String dbFileName : dbFileNames) {
final File dbFile = new File(yacyDBPath,dbFileName);
final MapDataMining db = new MapDataMining(dbFile, Word.commonHashLength, Base64Order.enhancedCoder, 1024 * 512, 500, SeedDB.sortFields, SeedDB.longaccFields, SeedDB.doubleaccFields);
Iterator<Map.Entry<byte[], Map<String, String>>> it;
it = db.entries(true, false);
while (it.hasNext()) {
final Map.Entry<byte[], Map<String, String>> dna = it.next();
String peerHash = UTF8.String(dna.getKey());
if (peerHash.length() < Word.commonHashLength) {
final String peerName = dna.getValue().get("Name");
final String peerIP = dna.getValue().get("IP");
final String peerPort = dna.getValue().get("Port");
while (peerHash.length() < Word.commonHashLength) { peerHash = peerHash + "_"; }
System.err.println("Invalid Peer-Hash found in '" + dbFileName + "': " + peerName + ":" + peerHash + ", http://" + peerIP + ":" + peerPort);
}
}
db.close();
}
} catch (final Exception e) {
Log.logException(e);
}
}
/**
 * Main method, started by the JVM. Evaluates special command-line arguments
 * or starts up the application.
@ -993,46 +631,6 @@ public final class yacy {
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-version"))) {
// show yacy version
System.out.println(copyright);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-minimizeurldb"))) {
// create a minimized copy of the URL DB that contains only URL entries still referenced by the word index
// attention: this may run long and should not be interrupted!
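// start this with "java -classpath classes yacy -minimizeurldb [<rootdir>]"
// (a leading "-cache <value>" option is accepted for compatibility but ignored)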
if (args.length >= 3 && args[1].toLowerCase().equals("-cache")) {
args = shift(args, 1, 2);
}
if (args.length == 2) applicationRoot= new File(args[1]);
minimizeUrlDB(dataRoot, applicationRoot, "freeworld");
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-testpeerdb"))) {
if (args.length == 2) {
applicationRoot = new File(args[1]);
} else if (args.length > 2) {
System.err.println("Usage: -testPeerDB [homeDbRoot]");
}
testPeerDB(applicationRoot);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-genwordstat"))) {
// this can help to create a stop-word list
// to use this, you need a 'yacy.words' file in the root path
// start this with "java -classpath classes yacy -genwordstat [<rootdir>]"
if (args.length == 2) applicationRoot= new File(args[1]);
genWordstat(applicationRoot);
} else if ((args.length == 4) && (args[0].toLowerCase().equals("-cleanwordlist"))) {
// this can be used to organize and clean a word-list
// start this with "java -classpath classes yacy -cleanwordlist <word-file> <minlength> <maxlength>"
final int minlength = Integer.parseInt(args[2]);
final int maxlength = Integer.parseInt(args[3]);
cleanwordlist(args[1], minlength, maxlength);
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-urldbcleanup"))) {
// clean up the URL DB: detect and remove damaged (dead-link) entries
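// start this with "java -classpath classes yacy -urldbcleanup [<rootdir>]"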
if (args.length == 2) applicationRoot= new File(args[1]);
urldbcleanup(dataRoot, applicationRoot, "freeworld");
} else if ((args.length >= 1) && (args[0].toLowerCase().equals("-rwihashlist"))) {
// generate a list of all RWI term hashes and save it in a file
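// start this with "java -classpath classes yacy -rwihashlist [<domain> [<format> [<rootdir>]]]"
// domain defaults to "all", format to "txt" (use "zip" for a zipped list)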
String domain = "all";
String format = "txt";
if (args.length >= 2) domain= args[1];
if (args.length >= 3) format= args[2];
if (args.length == 4) applicationRoot= new File(args[3]);
final String outfile = "rwihashlist_" + System.currentTimeMillis();
RWIHashList(dataRoot, applicationRoot, outfile, domain, format);
} else {
if (args.length == 1) applicationRoot= new File(args[0]);
startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false);
