*)added function to remove malformed URLs from urlHash.db

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1182 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
hydrox 19 years ago
parent 6b1a49ea23
commit 96930f0d2b

@ -57,14 +57,18 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroTree;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
import de.anomic.server.logging.serverLog;
@ -85,6 +89,8 @@ public final class plasmaCrawlLURL extends plasmaURL {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
public plasmaCrawlLURL(File cachePath, int bufferkb) throws IOException {
super();
int[] ce = {
@ -478,6 +484,10 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.snippet = null;
return;
}
} catch (MalformedURLException e) {
plasmaCrawlLURL.damagedURLS.add(this.urlHash);
System.out.println("DEBUG: Marked damaged Entry for removal (malformedURL). UrlHash: " + this.urlHash);
//serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
}
@ -774,4 +784,4 @@ public final class plasmaCrawlLURL extends plasmaURL {
}
}
}
}

@ -1170,6 +1170,43 @@ public final class yacy {
return newargs;
}
/**
 * Iterates over urlHash.db to detect malformed URL entries and removes them.
 *
 * The method walks every entry of the database once. Damaged entries are
 * expected to be collected in {@link plasmaCrawlLURL#damagedURLS} as a side
 * effect of loading them (the entry loader adds the urlHash when it catches a
 * MalformedURLException). After the walk, every collected hash is removed from
 * the database and the collection is emptied. Progress is reported on
 * System.out.
 *
 * The database is closed even if the walk or the removal fails. An
 * IOException is not propagated; its stack trace is printed instead.
 *
 * @param homePath Root-Path where all information is to be found; the
 *                 database is opened at DATA/PLASMADB/urlHash.db below it.
 */
private static void urldbcleanup(String homePath) {
    File root = new File(homePath);
    File dbroot = new File(root, "DATA/PLASMADB");
    try {
        plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304);
        try {
            // Pass 1: touch every entry so that damaged ones get registered in damagedURLS.
            Iterator eiter = currentUrlDB.entries(true, false);
            int iteratorCount = 0;
            while (eiter.hasNext()) {
                eiter.next();
                iteratorCount++;
            }

            // NOTE(review): the reason for this pause is not evident from the code here;
            // presumably it lets pending output settle before the summary -- confirm.
            try {
                Thread.sleep(1000);
            } catch (InterruptedException e) {
                // Do not lose the interruption request; hand it on to whoever checks next.
                Thread.currentThread().interrupt();
            }
            System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size());

            // Pass 2: remove the collected entries. Iterating a synchronizedSet requires
            // holding its lock manually; clearing under the same lock guarantees that no
            // hash added in between is discarded without having been removed.
            synchronized (plasmaCrawlLURL.damagedURLS) {
                Iterator eiter2 = plasmaCrawlLURL.damagedURLS.iterator();
                String urlHash;
                while (eiter2.hasNext()) {
                    urlHash = (String) eiter2.next();
                    currentUrlDB.remove(urlHash);
                    System.out.println("Removed UrlDB-Entry for urlHash: " + urlHash);
                }
                plasmaCrawlLURL.damagedURLS.clear();
            }
            System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size());
        } finally {
            // Always release the database file, also when the walk or the removal failed.
            currentUrlDB.close();
        }
    } catch (IOException e) {
        e.printStackTrace();
    }
}
/**
* Main-method which is started by java. Checks for special arguments or
* starts up the application.
@ -1260,6 +1297,10 @@ public final class yacy {
if (args.length == 2) applicationRoot= args[1];
String outfile = "urllist_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt");
urllist(applicationRoot, html, outfile);
} else if ((args.length >= 1) && (args[0].equals("-urldbcleanup"))) {
// detect malformed URL entries in urlHash.db and remove them
if (args.length == 2) applicationRoot= args[1];
urldbcleanup(applicationRoot);
} else {
if (args.length == 1) applicationRoot= args[0];
startup(applicationRoot, startupMemFree, startupMemTotal);

Loading…
Cancel
Save