From 96930f0d2b9ade4a02bfc329db267ede6f7f6b9a Mon Sep 17 00:00:00 2001 From: hydrox Date: Wed, 7 Dec 2005 11:10:08 +0000 Subject: [PATCH] *)added function to remove malformed URLs from urlHash.db git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1182 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- source/de/anomic/plasma/plasmaCrawlLURL.java | 14 ++++++- source/yacy.java | 41 ++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index fe153bbbc..b45aa8eca 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -57,14 +57,18 @@ import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.text.SimpleDateFormat; +import java.util.Collections; import java.util.Date; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.Locale; import java.util.Properties; +import java.util.Set; + import de.anomic.http.httpc; -import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroException; +import de.anomic.kelondro.kelondroTree; import de.anomic.server.serverCodings; import de.anomic.server.serverObjects; import de.anomic.server.logging.serverLog; @@ -85,6 +89,8 @@ public final class plasmaCrawlLURL extends plasmaURL { private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling private final LinkedList gcrawlResultStack; // 6 - local index: triggered external + public static Set damagedURLS = Collections.synchronizedSet(new HashSet()); + public plasmaCrawlLURL(File cachePath, int bufferkb) throws IOException { super(); int[] ce = { @@ -478,6 +484,10 @@ public final class plasmaCrawlLURL extends plasmaURL { this.snippet = null; return; } + } catch (MalformedURLException e) { + plasmaCrawlLURL.damagedURLS.add(this.urlHash); + System.out.println("DEBUG: Marked damaged Entry for 
removal (malformedURL). UrlHash: " + this.urlHash); + //serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e); } catch (Exception e) { serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e); } @@ -774,4 +784,4 @@ public final class plasmaCrawlLURL extends plasmaURL { } } -} \ No newline at end of file +} diff --git a/source/yacy.java b/source/yacy.java index ec192b1ff..426eddc9a 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -1170,6 +1170,43 @@ public final class yacy { return newargs; } + /** + * Removes malformed URL entries from DATA/PLASMADB/urlHash.db below homePath. Every entry is iterated once; the hashes found in plasmaCrawlLURL.damagedURLS afterwards are removed from the database and the set is cleared. + * Entry counts before and after the cleanup and every removed hash are printed to System.out; an IOException is reported via printStackTrace(). The database is closed even if iteration or removal fails. + * + * @param homePath application root directory that contains DATA/PLASMADB + */ + private static void urldbcleanup(String homePath) { + File dbroot = new File(new File(homePath), "DATA/PLASMADB"); + try { + plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304); + try { + Iterator eiter = currentUrlDB.entries(true, false); + int iteratorCount = 0; + while (eiter.hasNext()) { + eiter.next(); /* presumably loading an entry is what records a malformed URL's hash in damagedURLS (confirm) */ + iteratorCount++; + } + try { Thread.sleep(1000); } catch (InterruptedException e) { Thread.currentThread().interrupt(); /* do not swallow the interrupt */ } + System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size()); + synchronized (plasmaCrawlLURL.damagedURLS) { /* a synchronizedSet must be locked manually while it is iterated */ + Iterator eiter2 = plasmaCrawlLURL.damagedURLS.iterator(); + while (eiter2.hasNext()) { + String urlHash = (String) eiter2.next(); + currentUrlDB.remove(urlHash); + System.out.println("Removed UrlDB-Entry for urlHash: " + urlHash); + } + } + plasmaCrawlLURL.damagedURLS.clear(); + System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size()); + } finally { + currentUrlDB.close(); /* release the database even when iteration or removal throws */ + } + } catch (IOException e) { + e.printStackTrace(); + } + } + /** * 
Main-method which is started by java. Checks for special arguments or * starts up the application. @@ -1260,6 +1297,10 @@ public final class yacy { if (args.length == 2) applicationRoot= args[1]; String outfile = "urllist_" + System.currentTimeMillis() + ((html) ? ".html" : ".txt"); urllist(applicationRoot, html, outfile); + } else if ((args.length >= 1) && (args[0].equals("-urldbcleanup"))) { + // remove malformed URL entries from urlHash.db + if (args.length == 2) applicationRoot= args[1]; + urldbcleanup(applicationRoot); } else { if (args.length == 1) applicationRoot= args[0]; startup(applicationRoot, startupMemFree, startupMemTotal);