fixed some bugs concerning url entry retrieval and the IndexControl interface

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1212 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter
parent 83a34b838d
commit 4500506735
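Most of the hunks below follow one pattern: plasmaCrawlLURL.Entry lookups now throw an IOException when a url-hash cannot be resolved, instead of silently constructing an entry with null fields, and every caller is converted to try/catch (or, like removeAllUrlReferences, absorbs the failure itself). A minimal, self-contained sketch of that contract change; UrlStore and all names in it are illustrative stand-ins, not YaCy code:

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Hypothetical stand-in for plasmaCrawlLURL: getEntry() signals a missing
// hash with an IOException instead of returning an entry with null fields.
class UrlStore {
    private final Map entries = new HashMap();
    void put(String hash, String url) { entries.put(hash, url); }
    String getEntry(String urlHash) throws IOException {
        Object url = entries.get(urlHash);
        if (url == null) throw new IOException("url hash " + urlHash + " not found in LURL");
        return (String) url;
    }
}

public class GetEntryDemo {
    public static void main(String[] args) throws IOException {
        UrlStore store = new UrlStore();
        store.put("abc123", "http://example.org/");
        System.out.println(store.getEntry("abc123")); // http://example.org/
        try {
            store.getEntry("missing");
        } catch (IOException e) {
            // the caller pattern this commit introduces everywhere:
            // treat a failed lookup as "url unknown" and continue
            System.out.println("unknown url-hash");
        }
    }
}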

@@ -97,7 +97,7 @@ public class CacheAdmin_p {
         // generate sorted dir/file listing
         final String[] list = dir.list();
-        final StringBuffer tree = new StringBuffer((list.length + 2) * 256);
+        final StringBuffer tree = new StringBuffer((list == null) ? 70 : (list.length + 2) * 256);
         tree.append("Directory of<br>").append((pathString.length() == 0) ? "domain list" : linkPathString(pathString)).append("<br><br>");
         if (list == null) {
             tree.append("[empty]");

@@ -47,6 +47,7 @@
 // if the shell's current path is HTROOT
 import java.io.IOException;
+import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Enumeration;
 import java.util.HashSet;
@@ -91,10 +92,10 @@ public class IndexControl_p {
         }
         // default values
-        String keystring = ((String) post.get("keystring")).trim();
-        String keyhash = ((String) post.get("keyhash")).trim();
-        String urlstring = ((String) post.get("urlstring")).trim();
-        String urlhash = ((String) post.get("urlhash")).trim();
+        String keystring = ((String) post.get("keystring", "")).trim();
+        String keyhash = ((String) post.get("keyhash", "")).trim();
+        String urlstring = ((String) post.get("urlstring", "")).trim();
+        String urlhash = ((String) post.get("urlhash", "")).trim();
         if (!urlstring.startsWith("http://") &&
             !urlstring.startsWith("https://")) { urlstring = "http://" + urlstring; }
@@ -166,9 +167,7 @@ public class IndexControl_p {
             }
         }
         if (delurlref) {
-            for (int i = 0; i < urlx.length; i++) try {
-                switchboard.removeAllUrlReferences(urlx[i], true);
-            } catch (IOException e) {}
+            for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
         }
         if (delurl || delurlref) {
             for (int i = 0; i < urlx.length; i++) {
@@ -188,9 +187,7 @@ public class IndexControl_p {
         // delete selected URLs
         if (post.containsKey("keyhashdelete")) {
             if (delurlref) {
-                for (int i = 0; i < urlx.length; i++) try {
-                    switchboard.removeAllUrlReferences(urlx[i], true);
-                } catch (IOException e) {}
+                for (int i = 0; i < urlx.length; i++) switchboard.removeAllUrlReferences(urlx[i], true);
             }
             if (delurl || delurlref) {
                 for (int i = 0; i < urlx.length; i++) {
@@ -211,12 +208,12 @@ public class IndexControl_p {
         }
         if (post.containsKey("urlhashdeleteall")) {
-            try {
+            //try {
                 int i = switchboard.removeAllUrlReferences(urlhash, true);
                 prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
-            } catch (IOException e) {
-                prop.put("result", "Deleted nothing because the url-hash could not be resolved");
-            }
+            //} catch (IOException e) {
+            //    prop.put("result", "Deleted nothing because the url-hash could not be resolved");
+            //}
         }
         if (post.containsKey("urlhashdelete")) {
@@ -311,11 +308,7 @@ public class IndexControl_p {
             while (hashIt.hasNext() && i < 256) {
                 hash = (String) hashIt.next();
                 result.append("<a href=\"/IndexControl_p.html?")
-                      .append("keystring=")
-                      .append("&keyhash=").append(hash)
-                      .append("&urlhash=")
-                      .append("&urlstring=")
-                      .append("&urlhashsearch=")
+                      .append("keyhash=").append(hash).append("&keyhashsearch=")
                       .append("\" class=\"tt\">").append(hash).append("</a> ")
                       .append(((i + 1) % 8 == 0) ? "<br>" : "");
                 i++;
@@ -330,8 +323,11 @@ public class IndexControl_p {
                 prop.put("urlhash", urlhash);
                 plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
                 prop.put("result", genUrlProfile(switchboard, entry, urlhash));
-            } catch (Exception e) {
-                prop.put("urlstring", "wrong url: " + urlstring);
+            } catch (MalformedURLException e) {
+                prop.put("urlstring", "bad url: " + urlstring);
                 prop.put("urlhash", "");
+            } catch (IOException e) {
+                prop.put("urlstring", "unknown url: " + urlstring);
+                prop.put("urlhash", "");
             }
         }
@@ -356,7 +352,10 @@ public class IndexControl_p {
             int i = 0;
             while (hashIt.hasNext() && i < 256) {
                 hash = (String) hashIt.next();
-                result.append("<a href=\"/IndexControl_p.html?").append("keystring=").append("&keyhash=").append("&urlhash=").append(hash).append("&urlstring=").append("&urlhashsearch=").append("\" class=\"tt\">").append(hash).append("</a> ").append(((i + 1) % 8 == 0) ? "<br>" : "");
+                result.append("<a href=\"/IndexControl_p.html?")
+                      .append("urlhash=").append(hash).append("&urlhashsearch=")
+                      .append("\" class=\"tt\">").append(hash).append("</a> ")
+                      .append(((i + 1) % 8 == 0) ? "<br>" : "");
                 i++;
             }
             prop.put("result", result.toString());
@@ -449,10 +448,10 @@ public class IndexControl_p {
         final TreeMap tm = new TreeMap();
         while (en.hasNext()) {
             uh = ((plasmaWordIndexEntry)en.next()).getUrlHash();
-            if (switchboard.urlPool.loadedURL.exists(uh)) {
+            try {
                 us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString();
                 tm.put(us, uh);
-            } else {
+            } catch (IOException e) {
                 tm.put("", uh);
             }
         }

@@ -468,6 +468,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
             // if the url cannot be found, this returns null
             this.urlHash = urlHash;
             byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
+            if (entry == null) throw new IOException("url hash " + urlHash + " not found in LURL");
             try {
                 if (entry != null) {
                     this.url = new URL(new String(entry[1]).trim());

@@ -1,6 +1,7 @@
 package de.anomic.plasma;

 import java.io.File;
+import java.io.IOException;
 import java.util.Iterator;
 import java.util.Vector;
@@ -247,17 +248,16 @@ public class plasmaDbImporter extends Thread {
                 entryCounter++;
                 importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
                 String urlHash = importWordIdxEntry.getUrlHash();
-                if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) {
-                    urlCounter++;
+                if ((this.importUrlDB.exists(urlHash)) && (!this.homeUrlDB.exists(urlHash))) try {
                     // importing the new url
                     plasmaCrawlLURL.Entry urlEntry = this.importUrlDB.getEntry(urlHash);
+                    urlCounter++;
                     this.homeUrlDB.newEntry(urlEntry);
                     if (urlCounter % 500 == 0) {
                         this.log.logFine(urlCounter + " URLs processed so far.");
                     }
-                }
+                } catch (IOException e) {}
                 // adding word index entity to container
                 newContainer.add(importWordIdxEntry,System.currentTimeMillis());

@@ -231,9 +231,13 @@ public final class plasmaSearchEvent {
                 if ((acc.sizeFetched() >= minEntries) && (System.currentTimeMillis() >= postorderLimitTime)) break;
                 entry = preorder.next();
                 // find the url entry
+                try {
                     page = urlStore.getEntry(entry.getUrlHash());
                     // add a result
                     acc.addResult(entry, page);
+                } catch (IOException e) {
+                    // result was not found
+                }
             }
         } catch (kelondroException ee) {
             serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);

@@ -1221,12 +1221,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // create index
         String descr = document.getMainLongTitle();
         String referrerHash;
-        try {
             URL referrerURL = entry.referrerURL();
             referrerHash = plasmaURL.urlHash(referrerURL);
-        } catch (IOException e) {
-            referrerHash = plasmaURL.dummyHash;
-        }
+        if (referrerHash == null) referrerHash = plasmaURL.dummyHash;
         String noIndexReason = "unspecified";
         if (processCase == 4) {
             // proxy-load
@@ -1825,26 +1823,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }

     // method for index deletion
-    public int removeAllUrlReferences(URL url, boolean fetchOnline) throws IOException {
+    public int removeAllUrlReferences(URL url, boolean fetchOnline) {
         return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline);
     }

-    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) throws IOException {
+    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) {
         // find all the words in a specific resource and remove the url reference from every word index
         // finally, delete the url entry
         // determine the url string
+        try {
             plasmaCrawlLURL.Entry entry = urlPool.loadedURL.getEntry(urlhash);
             URL url = entry.url();
-            if (url == null) return 0;
+            if (url == null)
+                return 0;
             // get set of words
-            //Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
+            // Set words = plasmaCondenser.getWords(getText(getResource(url,
+            // fetchOnline)));
             Set words = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline)).getText());
             // delete all word references
             int count = removeReferences(urlhash, words);
             // finally delete the url entry itself
             urlPool.loadedURL.remove(urlhash);
             return count;
+        } catch (IOException e) {
+            return 0;
+        }
     }

     public int removeReferences(URL url, Set words) {
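Since removeAllUrlReferences now absorbs the lookup failure itself and returns 0 instead of throwing, callers such as IndexControl_p can drop their try/catch wrappers, as the hunks above show. A hedged usage sketch, assuming a plasmaSwitchboard instance named switchboard and a serverObjects prop as in those hunks:

// no IOException handling needed any more; an unresolvable url-hash yields 0
int count = switchboard.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + count + " references from " + count + " word indexes.");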

@@ -286,10 +286,15 @@ public class plasmaSwitchboardQueue {
             return responseHeader;
         }

-        public URL referrerURL() throws IOException {
+        public URL referrerURL() {
             if (referrerURL == null) {
                 if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null;
+                try {
                     referrerURL = lurls.getEntry(referrerHash).url();
+                } catch (IOException e) {
+                    referrerURL = null;
+                    return null;
+                }
             }
             return referrerURL;
         }

@@ -473,6 +473,7 @@ public class plasmaURL {
     }

     public static final String urlHash(String url) {
+        if ((url == null) || (url.length() == 0)) return null;
         try {
             return urlHash(new URL(url));
         } catch (MalformedURLException e) {
@@ -542,7 +543,7 @@ public class plasmaURL {
     }

     public Iterator urlHashes(String urlHash, boolean up) {
-        return urlHashCache.rows(up, false, urlHash.getBytes());
+        return urlHashCache.keys(up, false, urlHash.getBytes());
     }
 }

@@ -75,8 +75,10 @@ public class plasmaURLPool {
         if (urlhash.equals(plasmaURL.dummyHash)) return null;
         plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
         if (ne != null) return ne.url();
+        try {
             plasmaCrawlLURL.Entry le = loadedURL.getEntry(urlhash);
             if (le != null) return le.url();
+        } catch (IOException e) {}
         plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
         if (ee != null) return ee.url();
         return null;

@@ -331,12 +331,16 @@ public final class plasmaWordIndexDistribution {
                 unknownURLEntries.clear();
                 while (urlIter.hasNext()) {
                     indexEntry = (plasmaWordIndexEntry) urlIter.next();
+                    try {
                         lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
                         if ((lurl == null) || (lurl.url() == null)) {
                             unknownURLEntries.add(indexEntry.getUrlHash());
                         } else {
                             knownURLs.put(indexEntry.getUrlHash(), lurl);
                         }
+                    } catch (IOException e) {
+                        unknownURLEntries.add(indexEntry.getUrlHash());
+                    }
                 }
                 // now delete all entries that have no url entry
                 hashIter = unknownURLEntries.iterator();
@@ -367,6 +371,7 @@ public final class plasmaWordIndexDistribution {
                 unknownURLEntries.clear();
                 while ((urlIter.hasNext()) && (count > 0)) {
                     indexEntry = (plasmaWordIndexEntry) urlIter.next();
+                    try {
                         lurl = this.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
                         if ((lurl == null) || (lurl.url()==null)) {
                             unknownURLEntries.add(indexEntry.getUrlHash());
@@ -375,6 +380,9 @@ public final class plasmaWordIndexDistribution {
                             tmpEntity.addEntry(indexEntry);
                             count--;
                         }
+                    } catch (IOException e) {
+                        unknownURLEntries.add(indexEntry.getUrlHash());
+                    }
                 }
                 // now delete all entries that have no url entry
                 hashIter = unknownURLEntries.iterator();

@@ -166,6 +166,7 @@ public final class serverDate {
         if (utime < utimeyearsacc[years]) years--; // the correction
         long remain = utime - utimeyearsacc[years];
         months = (int) (remain / (29 * dayMillis)); // a guess
+        if (months > 11) months = 11;
         if ((years & 3) == 0) {
             if (remain < dimleapacc[months]) months--; // correction
             remain = remain - dimleapacc[months];
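The guess divides the offset into the year by a 29-day month, so near the end of a 365- or 366-day year it can reach 12, one past the last valid month index; the added clamp keeps the dimleapacc lookup in bounds. A quick self-contained check of that arithmetic, assuming dayMillis is one day in milliseconds as the surrounding code suggests:

public class MonthGuessCheck {
    public static void main(String[] args) {
        long dayMillis = 24L * 60 * 60 * 1000;          // one day in milliseconds
        long remain = 364L * dayMillis;                 // Dec 31 of a non-leap year
        int months = (int) (remain / (29 * dayMillis)); // the "guess": 364 / 29 = 12
        System.out.println(months);                     // prints 12, past index 11
        if (months > 11) months = 11;                   // the clamp added above
        System.out.println(months);                     // prints 11
    }
}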

@@ -797,17 +797,16 @@ public final class yacy {
                 entryCounter++;
                 importWordIdxEntry = (plasmaWordIndexEntry) importWordIdxEntries.next();
                 String urlHash = importWordIdxEntry.getUrlHash();
-                if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) {
-                    urlCounter++;
+                if ((importUrlDB.exists(urlHash)) && (!homeUrlDB.exists(urlHash))) try {
                     // importing the new url
                     plasmaCrawlLURL.Entry urlEntry = importUrlDB.getEntry(urlHash);
+                    urlCounter++;
                     homeUrlDB.newEntry(urlEntry);
                     if (urlCounter % 500 == 0) {
                         log.logFine(urlCounter + " URLs processed so far.");
                     }
-                }
+                } catch (IOException e) {}
                 // adding word index entity to container
                 newContainer.add(importWordIdxEntry,System.currentTimeMillis());
@@ -906,14 +905,14 @@ public final class yacy {
                 while (wordIdxEntries.hasNext()) {
                     wordIdxEntry = (plasmaWordIndexEntry) wordIdxEntries.next();
                     String urlHash = wordIdxEntry.getUrlHash();
-                    if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) {
-                        urlCounter++;
+                    if ((currentUrlDB.exists(urlHash)) && (!minimizedUrlDB.exists(urlHash))) try {
                         plasmaCrawlLURL.Entry urlEntry = currentUrlDB.getEntry(urlHash);
+                        urlCounter++;
                         /*plasmaCrawlLURL.Entry newEntry =*/ minimizedUrlDB.newEntry(urlEntry);
                         if (urlCounter % 500 == 0) {
                             log.logInfo(urlCounter + " URLs found so far.");
                         }
-                    }
+                    } catch (IOException e) {}
                 }
                 // we have read all elements, now we can close it
                 wordIdxEntity.close(); wordIdxEntity = null;
