@@ -59,12 +59,14 @@ import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Properties;

import de.anomic.http.httpc;
import de.anomic.http.httpc.response;
import de.anomic.index.indexURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroTree;

@@ -132,28 +134,6 @@ public final class plasmaCrawlLURL extends indexURL {
        lcrawlResultStack = new LinkedList();
        gcrawlResultStack = new LinkedList();
    }

    /*
    public synchronized Entry addEntry(URL url, String descr, Date moddate, Date loaddate,
                                       String initiatorHash, String executorHash,
                                       String referrerHash, int copyCount, boolean localNeed,
                                       int quality, String language, char doctype,
                                       int size, int wordCount, int stackType) {
        Entry e = new Entry(url, descr, moddate, loaddate, referrerHash, copyCount, localNeed, quality, language, doctype, size, wordCount);
        if (initiatorHash == null) { initiatorHash = dummyHash; }
        if (executorHash == null) { executorHash = dummyHash; }
        switch (stackType) {
            case 0: break;
            case 1: externResultStack.add(e.urlHash + initiatorHash + executorHash); break;
            case 2: searchResultStack.add(e.urlHash + initiatorHash + executorHash); break;
            case 3: transfResultStack.add(e.urlHash + initiatorHash + executorHash); break;
            case 4: proxyResultStack.add(e.urlHash + initiatorHash + executorHash); break;
            case 5: lcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
            case 6: gcrawlResultStack.add(e.urlHash + initiatorHash + executorHash); break;
        }
        return e;
    }
    */

    public synchronized void stackEntry(Entry e, String initiatorHash, String executorHash, int stackType) {
        if (e == null) { return; }
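        // The stackType codes map to the result stacks as in the removed addEntry switch above:
        //   0 = none, 1 = extern, 2 = search, 3 = transfer, 4 = proxy,
        //   5 = local crawl, 6 = global crawl.
        // A minimal call sketch, assuming a plasmaCrawlLURL instance "loadedURL" and an
        // Entry "e" (both hypothetical names, not from this file):
        //
        //     loadedURL.stackEntry(e, initiatorHash, executorHash, 5); // push a local crawl result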

@@ -794,6 +774,79 @@
        // enumerates entry elements
        return new kiter(up, rotating);
    }

    /**
     * Iterates over urlHash.db to detect malformed URL entries.
     * Damaged entries are collected in a HashSet and removed at the end of the method.
     */
    public void urldbcleanup() {
        serverLog log = new serverLog("URLDBCLEANUP");
        HashSet damagedURLS = new HashSet();
        try {
            Iterator eiter = entries(true, false);
            int iteratorCount = 0;
            while (eiter.hasNext()) try {
                eiter.next();
                iteratorCount++;
            } catch (RuntimeException e) {
                // the last 12 characters of the message are expected to be the damaged urlHash;
                // guard against null or too-short messages
                String m = e.getMessage();
                if (m != null && m.length() >= 12) damagedURLS.add(m.substring(m.length() - 12));
            }
            try { Thread.sleep(1000); } catch (InterruptedException e) { }
log.logInfo("URLs vorher: " + size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());

            Iterator eiter2 = damagedURLS.iterator();
            String urlHash;
            while (eiter2.hasNext()) {
                urlHash = (String) eiter2.next();

                // trying to fix the invalid URL
                httpc theHttpc = null;
                String oldUrlStr = null;
                try {
                    // getting the url data as byte array
                    byte[][] entry = urlHashCache.get(urlHash.getBytes());

                    // getting the wrong url string
                    oldUrlStr = new String(entry[1]).trim();

                    int pos = -1;
                    if ((pos = oldUrlStr.indexOf("://")) != -1) {
                        // trying to correct the url
                        String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
                        URL newUrl = new URL(newUrlStr);

                        // doing a http head request to test if the url is correct
                        theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getHost(), newUrl.getPort(), 30000, false);
                        response res = theHttpc.HEAD(newUrl.getPath(), null);

                        if (res.statusCode == 200) {
                            entry[1] = newUrl.toString().getBytes();
                            urlHashCache.put(entry);
                            log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
                        } else {
                            remove(urlHash);
                            log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + res.status);
                        }
                    }
                } catch (Exception e) {
                    remove(urlHash);
                    log.logInfo("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tException: " + e.getMessage());
                } finally {
                    if (theHttpc != null) try {
                        theHttpc.close();
                        httpc.returnInstance(theHttpc);
                    } catch (Exception e) { }
                }
            }

            log.logInfo("URLs after: " + size() + " damaged URLs: " + damagedURLS.size());
        } catch (IOException e) {
            log.logSevere("IOException", e);
        }
    }
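
    // A minimal usage sketch, assuming a plasmaCrawlLURL instance is available
    // (the name "loadedURL" and the thread wiring are hypothetical, not from this file):
    //
    //     public static void startCleanup(final plasmaCrawlLURL loadedURL) {
    //         new Thread("URLDBCLEANUP") {
    //             public void run() {
    //                 loadedURL.urldbcleanup(); // scans urlHash.db, repairs or removes damaged entries
    //             }
    //         }.start();
    //     }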

    // The Cleaner class was provided as "UrldbCleaner" by Hydrox
    // see http://www.yacy-forum.de/viewtopic.php?p=18093#18093