added DBCleaner from Hydrox

see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
The servlet is now named IndexCleaner_p.
See http://localhost:8080/IndexCleaner_p.html
The servlet was adapted to fit into the overall architecture.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1863 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 74ffb9b842
commit 0ec28b8f8e

@ -0,0 +1,53 @@
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html>
<head>
<title>YaCy '#[clientname]#': Index Control</title>
#%env/templates/metas.template%#
<meta http-equiv="REFRESH" content="10">
</head>
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<br>
<h2>Index Cleaner</h2>
#(urldb)#
::
<p>
ThreadAlive: #[threadAlive]#<br>
ThreadToString: #[threadToString]#<br>
Total URLs searched: #[total]# (#[percentUrls]#%)<br>
Blacklisted URLs found: #[blacklisted]#<br>
Percentage blacklisted: #[percent]#%<br>
last searched URL: #[lastUrl]# (#[lastHash]#)<br>
last blacklisted URL found: #[lastBlacklistedUrl]# (#[lastBlacklistedHash]#)<br>
</p>
#(/urldb)#
#(rwidb)#
::
<p>
ThreadAlive: #[threadAlive]#<br>
ThreadToString: #[threadToString]#<br>
RWIs at Start: #[RWIcountstart]#<br>
RWIs now: #[RWIcountnow]#<br>
wordHash in Progress: #[wordHashNow]#<br>
last wordHash with deleted URLs: #[lastWordHash]#<br>
Number of deleted URLs on this Hash: #[lastDeletionCounter]#<br>
</p>
#(/rwidb)#
<p>
UrldbCleaner - Clean up the database by deletion of blacklisted urls:<br>
<a href="IndexCleaner_p.html?action=ustart">Start/Resume</a>
<a href="IndexCleaner_p.html?action=ustop">Stop</a>
<a href="IndexCleaner_p.html?action=upause">Pause</a>
</p>
<p>
RWIDbCleaner - Clean up the database by deletion of words with reference to blacklisted urls:<br>
<a href="IndexCleaner_p.html?action=rstart">Start/Resume</a>
<a href="IndexCleaner_p.html?action=rstop">Stop</a>
<a href="IndexCleaner_p.html?action=rpause">Pause</a>
</p>
#%env/templates/footer.template%#
</body>
</html>

@ -0,0 +1,84 @@
// This file was provided by Hydrox
// see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet behind IndexCleaner_p.html.
 *
 * Starts, pauses, resumes and stops the two cleaner threads (URL database
 * cleaner and RWI/word index cleaner) and publishes their progress as
 * template properties.
 */
public class IndexCleaner_p {

    // Static, so every call to respond() sees the same cleaner threads.
    private static plasmaCrawlLURL.Cleaner urldbCleanerThread;
    private static plasmaWordIndex.Cleaner indexCleanerThread;

    /**
     * Handles one request for the Index Cleaner page.
     *
     * @param header the request header (not used here)
     * @param post   the request arguments, or null if there are none
     * @param env    the switchboard; must be a plasmaSwitchboard
     * @return the properties for the template
     */
    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        serverObjects prop = new serverObjects();
        plasmaSwitchboard sb = (plasmaSwitchboard) env;
        prop.put("title", "DbCleanup_p");

        if (post != null) {
            prop.put("bla", "post!=null");
            // A missing "action" argument is treated like an unknown action
            // instead of raising a NullPointerException.
            handleAction(post.get("action", ""), post, sb);
            // NOTE(review): an empty LOCATION presumably redirects to the plain
            // page so the action is not repeated by the meta refresh -- confirm.
            prop.put("LOCATION", "");
            return prop;
        }
        prop.put("bla", "post==null");

        putUrldbStatus(prop, sb);
        putRwidbStatus(prop, sb);
        return prop;
    }

    /** Executes a single start/stop/pause command; unknown actions are ignored. */
    private static void handleAction(String action, serverObjects post, plasmaSwitchboard sb) {
        if (action.equals("ustart")) {
            if (urldbCleanerThread == null || !urldbCleanerThread.isAlive()) {
                urldbCleanerThread = sb.urlPool.loadedURL.makeCleaner();
                urldbCleanerThread.start();
            } else {
                urldbCleanerThread.endPause();
            }
        } else if (action.equals("ustop")) {
            // the thread may never have been started
            if (urldbCleanerThread != null) {
                urldbCleanerThread.abort();
            }
        } else if (action.equals("upause")) {
            if (urldbCleanerThread != null) {
                urldbCleanerThread.pause();
            }
        } else if (action.equals("rstart")) {
            if (indexCleanerThread == null || !indexCleanerThread.isAlive()) {
                indexCleanerThread = sb.wordIndex.makeCleaner(sb.urlPool.loadedURL, post.get("wordHash", "--------"));
                indexCleanerThread.start();
            } else {
                indexCleanerThread.endPause();
            }
        } else if (action.equals("rstop")) {
            if (indexCleanerThread != null) {
                indexCleanerThread.abort();
            }
        } else if (action.equals("rpause")) {
            if (indexCleanerThread != null) {
                indexCleanerThread.pause();
            }
        }
    }

    /** Publishes the progress of the URL database cleaner, if one was ever created. */
    private static void putUrldbStatus(serverObjects prop, plasmaSwitchboard sb) {
        if (urldbCleanerThread == null) {
            return;
        }
        prop.put("urldb", 1);
        prop.put("urldb_percentUrls", ((double) urldbCleanerThread.totalSearchedUrls / sb.urlPool.loadedURL.size()) * 100 + "");
        prop.put("urldb_blacklisted", urldbCleanerThread.blacklistedUrls);
        prop.put("urldb_total", urldbCleanerThread.totalSearchedUrls);
        prop.put("urldb_lastBlacklistedUrl", urldbCleanerThread.lastBlacklistedUrl);
        prop.put("urldb_lastBlacklistedHash", urldbCleanerThread.lastBlacklistedHash);
        prop.put("urldb_lastUrl", urldbCleanerThread.lastUrl);
        prop.put("urldb_lastHash", urldbCleanerThread.lastHash);
        prop.put("urldb_threadAlive", urldbCleanerThread.isAlive() + "");
        prop.put("urldb_threadToString", urldbCleanerThread.toString());
        double percent = ((double) urldbCleanerThread.blacklistedUrls / urldbCleanerThread.totalSearchedUrls) * 100;
        prop.put("urldb_percent", percent + "");
    }

    /** Publishes the progress of the word index cleaner, if one was ever created. */
    private static void putRwidbStatus(serverObjects prop, plasmaSwitchboard sb) {
        if (indexCleanerThread == null) {
            return;
        }
        prop.put("rwidb", 1);
        prop.put("rwidb_threadAlive", indexCleanerThread.isAlive() + "");
        prop.put("rwidb_threadToString", indexCleanerThread.toString());
        prop.put("rwidb_RWIcountstart", indexCleanerThread.rwiCountAtStart);
        prop.put("rwidb_RWIcountnow", sb.wordIndex.size());
        prop.put("rwidb_wordHashNow", indexCleanerThread.wordHashNow);
        prop.put("rwidb_lastWordHash", indexCleanerThread.lastWordHash);
        prop.put("rwidb_lastDeletionCounter", indexCleanerThread.lastDeletionCounter);
    }
}

@ -7,7 +7,7 @@
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<br><br>
<br>
<h2>Index Administration</h2>
<form action="IndexControl_p.html" method="post" enctype="multipart/form-data">

@ -8,7 +8,7 @@
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<br><br>
<br>
<h2>Index DB Import</h2>
<p>The local index currently consists of (at least) #[wcount]# reverse word indexes and #[ucount]# URL references.</p>
<hr>

@ -8,7 +8,7 @@
<body marginheight="0" marginwidth="0" leftmargin="0" topmargin="0">
#%env/templates/header.template%#
#%env/templates/submenuIndexControl.template%#
<br><br>
<br>
<h2>Index Transfer</h2>
<form action="IndexTransfer_p.html" method="post" enctype="multipart/form-data">

@ -1,14 +1,17 @@
<table width="100%" border="0" cellpadding="0" cellspacing="0" class="SubMenu">
<tr height="10"><td colspan="17" class="MenuHeader">&nbsp;Index Control Menu</td></tr>
<tr height="10"><td colspan="7" class="MenuHeader">&nbsp;Index Control Menu</td></tr>
<tr height="2"><td colspan="17"></td></tr>
<tr class="TableHeader">
<td width="33%" class="MenuSubItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;
<td width="25%" class="MenuSubItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;
<a href="/IndexControl_p.html" class="MenuItemLink">Index Administration</a>&nbsp;</td>
<td class="MenuSubSpacer"></td>
<td width="33%" class="MenuSubItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;
<td width="25%" class="MenuSubItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;
<a href="/IndexImport_p.html" class="MenuItemLink">Index Import</a>&nbsp;</td>
<td class="MenuSubSpacer"></td>
<td width="33%" class="MenuSubItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;
<td width="25%" class="MenuSubItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;
<a href="/IndexTransfer_p.html" class="MenuItemLink">Index Transfer</a>&nbsp;</td>
<td class="MenuSubSpacer"></td>
<td width="25%" class="MenuSubItem">&nbsp;<img border="0" src="/env/grafics/lock.gif" align="top">&nbsp;
<a href="/IndexCleaner_p.html" class="MenuItemLink">Index Cleaner</a>&nbsp;</td>
</tr>
</table>

@ -729,6 +729,96 @@ public final class plasmaCrawlLURL extends plasmaURL {
return new kiter(up, rotating);
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
// see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
/**
 * Creates a new Cleaner thread bound to this plasmaCrawlLURL instance.
 * The thread is not started here; the caller has to call start() on it.
 *
 * @return the freshly created, not yet running cleaner thread
 */
public Cleaner makeCleaner() {
    final Cleaner cleaner = new Cleaner();
    return cleaner;
}
/**
 * Background thread that walks over all entries of the enclosing URL
 * database and removes those whose host/path is on the URL blacklist.
 *
 * The thread can be paused, resumed and aborted from another thread.
 * The public counters and "last..." fields are read by the
 * IndexCleaner_p servlet for display only.
 */
public class Cleaner extends Thread {

    // volatile: the worker loop reads this flag outside of any synchronized block
    private volatile boolean run = true;
    // only accessed inside synchronized(this)
    private boolean pause = false;

    public int blacklistedUrls = 0;
    // starts at 1, which keeps the percentage calculations from dividing by zero
    public int totalSearchedUrls = 1;
    public String lastBlacklistedUrl = "";
    public String lastBlacklistedHash = "";
    public String lastUrl = "";
    public String lastHash = "";

    public Cleaner() {
    }

    /**
     * Iterates over the URL entries until the end is reached or abort()
     * is called, blocking while the thread is paused.
     */
    public void run() {
        try {
            serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread startet");
            Iterator eiter = entries(true, false);
            while (eiter.hasNext() && run) {
                synchronized (this) {
                    // Loop instead of a single 'if': wait() may return spuriously.
                    // 'run' is part of the condition so abort() also ends a pause.
                    while (this.pause && this.run) {
                        try {
                            this.wait();
                        } catch (InterruptedException e) {
                            this.run = false;
                        }
                    }
                }
                if (!run) {
                    // aborted or interrupted while paused: do not touch another entry
                    break;
                }

                plasmaCrawlLURL.Entry entry = (plasmaCrawlLURL.Entry) eiter.next();
                totalSearchedUrls++;
                if (plasmaSwitchboard.urlBlacklist.isListed(entry.url().getHost().toLowerCase(), entry.url().getPath())) {
                    lastBlacklistedUrl = entry.url().toString();
                    lastBlacklistedHash = entry.hash();
                    serverLog.logFine("URLDBCLEANER", ++blacklistedUrls + " blacklisted (" + ((double) blacklistedUrls / totalSearchedUrls) * 100 + "%): " + entry.hash() + " " + entry.url());
                    remove(entry.hash());
                }
                lastUrl = entry.url().toString();
                lastHash = entry.hash();
            }
        } catch (RuntimeException e) {
            e.printStackTrace();
            // getMessage() may be null; only the known "not found in LURL"
            // case leaves the run flag untouched.
            String message = e.getMessage();
            if (message == null || message.indexOf("not found in LURL") == -1) {
                run = false;
            }
        } catch (IOException e) {
            e.printStackTrace();
            run = false;
        }
        serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread stopped");
    }

    /** Asks the thread to stop and wakes it up if it is currently paused. */
    public void abort() {
        synchronized (this) {
            run = false;
            this.notifyAll();
        }
    }

    /** Requests a pause; the worker blocks before it handles the next entry. */
    public void pause() {
        synchronized (this) {
            if (!pause) {
                pause = true;
                serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread paused");
            }
        }
    }

    /** Ends a pause and wakes up the waiting worker. Does nothing if not paused. */
    public void endPause() {
        synchronized (this) {
            if (pause) {
                pause = false;
                this.notifyAll();
                serverLog.logInfo("URLDBCLEANER", "UrldbCleaner-Thread resumed");
            }
        }
    }
}
public static void main(String[] args) {
// test-generation of url hashes for debugging
// one argument requires, will be treated as url

@ -530,6 +530,110 @@ public final class plasmaWordIndex {
}
}
// The Cleaner class was provided as "UrldbCleaner" by Hydrox
// see http://www.yacy-forum.de/viewtopic.php?p=18093#18093
/**
 * Creates a new Cleaner thread bound to this plasmaWordIndex instance.
 * The thread is not started here; the caller has to call start() on it.
 *
 * @param lurl      the URL database used to resolve URL hashes
 * @param startHash the word hash at which the cleaner begins iterating
 * @return the freshly created, not yet running cleaner thread
 */
public Cleaner makeCleaner(plasmaCrawlLURL lurl, String startHash) {
    final Cleaner cleaner = new Cleaner(lurl, startHash);
    return cleaner;
}
/**
 * Background thread that walks over the word hashes of the enclosing word
 * index and removes, per word, all URL references whose URL cannot be
 * loaded from the URL database, is null, or is on the URL blacklist.
 *
 * The thread can be paused, resumed and aborted from another thread.
 * The public fields are read by the IndexCleaner_p servlet for display only.
 */
public class Cleaner extends Thread {

    private String startHash;
    // volatile: the worker loops read this flag outside of any synchronized block
    private volatile boolean run = true;
    // only accessed inside synchronized(this)
    private boolean pause = false;

    public int rwiCountAtStart = 0;
    public String wordHashNow = "";
    public String lastWordHash = "";
    public int lastDeletionCounter = 0;

    private plasmaCrawlLURL lurl;

    /**
     * @param lurl      the URL database used to resolve URL hashes
     * @param startHash the word hash at which iteration begins
     */
    public Cleaner(plasmaCrawlLURL lurl, String startHash) {
        this.lurl = lurl;
        this.startHash = startHash;
        // size of the enclosing word index at creation time
        this.rwiCountAtStart = size();
    }

    /**
     * Iterates over the word hashes until the end is reached or abort()
     * is called, blocking in waiter() while the thread is paused.
     */
    public void run() {
        // the original logged "stopped" here although the thread is just starting
        serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread started");
        String wordHash = "";
        plasmaWordIndexEntryContainer wordContainer = null;
        plasmaWordIndexEntry entry = null;
        URL url = null;
        HashSet urlHashs = new HashSet();
        Iterator wordHashIterator = wordHashes(startHash, plasmaWordIndex.RL_WORDFILES, false);
        while (wordHashIterator.hasNext() && run) {
            waiter();
            if (!run) {
                // aborted or interrupted while paused: do not start another word
                break;
            }
            wordHash = (String) wordHashIterator.next();
            wordContainer = getContainer(wordHash, true, -1);
            Iterator containerIterator = wordContainer.entries();
            wordHashNow = wordHash;
            while (containerIterator.hasNext() && run) {
                waiter();
                if (!run) {
                    // leave the entry loop; hashes collected so far are still removed below
                    break;
                }
                entry = (plasmaWordIndexEntry) containerIterator.next();
                try {
                    url = lurl.getEntry(entry.getUrlHash(), null).url();
                    if ((url == null)
                            || plasmaSwitchboard.urlBlacklist.isListed(url.getHost().toLowerCase(), url.getPath())) {
                        urlHashs.add(entry.getUrlHash());
                    }
                } catch (IOException e) {
                    // the URL entry could not be loaded: treat the reference as removable
                    urlHashs.add(entry.getUrlHash());
                }
            }
            if (urlHashs.size() > 0) {
                String[] urlArray = (String[]) urlHashs.toArray(new String[0]);
                int removed = removeEntries(wordHash, urlArray, true);
                serverLog.logFine("INDEXCLEANER", wordHash + ": " + removed + " of " + wordContainer.size() + " URL-entries deleted");
                lastWordHash = wordHash;
                lastDeletionCounter = urlHashs.size();
                urlHashs.clear();
            }
        }
        serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread stopped");
    }

    /** Asks the thread to stop and wakes it up if it is currently paused. */
    public void abort() {
        synchronized (this) {
            run = false;
            this.notifyAll();
        }
    }

    /** Requests a pause; the worker blocks at its next call to waiter(). */
    public void pause() {
        synchronized (this) {
            if (!pause) {
                pause = true;
                serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread paused");
            }
        }
    }

    /** Ends a pause and wakes up the waiting worker. Does nothing if not paused. */
    public void endPause() {
        synchronized (this) {
            if (pause) {
                pause = false;
                this.notifyAll();
                serverLog.logInfo("INDEXCLEANER", "IndexCleaner-Thread resumed");
            }
        }
    }

    /**
     * Blocks the calling thread while a pause is requested. Returns when the
     * pause ends, when abort() is called, or when the thread is interrupted;
     * an interrupt also clears the run flag.
     */
    public void waiter() {
        synchronized (this) {
            // Loop instead of a single 'if': wait() may return spuriously.
            // 'run' is part of the condition so abort() also ends a pause.
            while (this.pause && this.run) {
                try {
                    this.wait();
                } catch (InterruptedException e) {
                    this.run = false;
                    return;
                }
            }
        }
    }
}
public static void main(String[] args) {
// System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
// System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));

Loading…
Cancel
Save