Fixed all known NullPointerException problems for LURLs

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2513 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent a5ed86105b
commit 9340dbb501

@ -46,7 +46,6 @@
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Iterator;
@ -135,8 +134,8 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
try {
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
if (urlentry != null) {
prop.put("mode_edit", 0); // create mode
if (urlentry != null) {
prop.put("mode_title", urlentry.descr());
@ -145,8 +144,6 @@ public class Bookmarks {
}
prop.put("mode_tags", "");
prop.put("mode_public", 0);
} catch (IOException e) {
e.printStackTrace();
}
} else {
// get from the bookmark database

@ -55,7 +55,6 @@ import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;

@ -211,8 +211,10 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
if (entry != null) {
URL url = entry.url();
urlstring = url.toNormalform();
@ -222,8 +224,6 @@ public class IndexControl_p {
} else {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
}
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
}
}
@ -265,16 +265,12 @@ public class IndexControl_p {
plasmaCrawlLURL.Entry lurl;
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
if ((lurl == null)||(lurl.toString() == null)) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
} else {
knownURLs.put(iEntry.urlHash(), lurl);
}
} catch (IOException e) {
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
if (lurl.toString() == null) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
} else {
knownURLs.put(iEntry.urlHash(), lurl);
}
}
// use whats remaining
@ -313,22 +309,26 @@ public class IndexControl_p {
if (post.containsKey("urlstringsearch")) {
try {
URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
} else {
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
}
} catch (MalformedURLException e) {
prop.put("urlstring", "bad url: " + urlstring);
prop.put("urlhash", "");
} catch (IOException e) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
}
}
if (post.containsKey("urlhashsearch")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
if (entry != null) {
URL url = entry.url();
urlstring = url.toString();
@ -337,8 +337,6 @@ public class IndexControl_p {
} else {
prop.put("result", "No Entry for URL hash " + urlhash);
}
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash);
}
}
@ -394,15 +392,11 @@ public class IndexControl_p {
if (entry == null) { return "No entry found for URL-hash " + urlhash; }
URL url = entry.url();
String referrer = null;
try {
plasmaCrawlLURL.Entry referrerEntry = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (referrerEntry != null) {
referrer = referrerEntry.url().toString();
} else {
referrer = "<unknown>";
}
} catch (IOException e) {
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "<unknown>";
} else {
referrer = le.url().toString();
}
if (url == null) { return "No entry found for URL-hash " + urlhash; }
String result = "<table>" +
@ -456,16 +450,13 @@ public class IndexControl_p {
while (en.hasNext()) {
xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(uh[0], null);
if (entry != null) {
us = entry.url().toString();
tm.put(us, uh);
} else {
tm.put(uh[0], uh);
}
} catch (IOException e) {
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {
us = le.url().toString();
tm.put(us, uh);
}
}

@ -106,9 +106,8 @@ public class ViewFile {
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
try {
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
} catch (IOException e) {
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;

@ -45,7 +45,6 @@
// You must compile this file with
// javac -classpath .:../classes crawlOrder.java
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.http.httpHeader;
@ -249,8 +248,11 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";
} else {
if (entry != null) {
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
@ -259,9 +261,6 @@ public final class crawlOrder {
response = "rejected";
lurl = "";
}
} catch (IOException e) {
response = "rejected";
lurl = "";
}
} else {
response = "rejected";

@ -92,7 +92,6 @@ import java.util.zip.GZIPOutputStream;
import de.anomic.htmlFilter.htmlFilterContentTransformer;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.htmlFilter.htmlFilterTransformer;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;

@ -351,6 +351,11 @@ public class kelondroRow {
case kelondroColumn.encoder_none:
throw new kelondroException("ROW", "getColLong has celltype none, no encoder given");
case kelondroColumn.encoder_b64e:
// start - fix for badly stored parameters
boolean maxvalue = true;
for (int i = 0; i < length; i++) if (rowinstance[offset + i] != '_') {maxvalue = false; break;}
if (maxvalue) return 0;
// stop - fix for badly stored parameters
return kelondroBase64Order.enhancedCoder.decodeLong(rowinstance, offset, length);
case kelondroColumn.encoder_b256:
return kelondroNaturalOrder.decodeLong(rowinstance, offset, length);

@ -160,7 +160,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public Entry load(String urlHash, indexEntry searchedWord) throws IOException {
public Entry load(String urlHash, indexEntry searchedWord) {
// generates a plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -169,9 +169,13 @@ public final class plasmaCrawlLURL extends indexURL {
// - look into the filed properties
// if the url cannot be found, this returns null
kelondroRow.Entry entry = urlIndexCache.get(urlHash.getBytes());
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new Entry(entry, searchedWord);
try {
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new Entry(entry, searchedWord);
} catch (IOException e) {
return null;
}
}
public void store(Entry entry, boolean cached) throws IOException {

@ -379,14 +379,10 @@ public final class plasmaCrawlStacker {
String nexturlhash = indexURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURL.Entry oldEntry = null;
if (dbocc != null) try {
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
} catch (IOException e) {}
boolean recrawl = (oldEntry != null) &&
(((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;

@ -227,22 +227,16 @@ public class plasmaDHTChunk {
// iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount)) {
iEntry = (indexEntry) urlIter.next();
try {
lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
} else {
urlCache.put(iEntry.urlHash(), lurl);
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
refcount++;
}
} catch (IOException e) {
lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
} else {
urlCache.put(iEntry.urlHash(), lurl);
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
refcount++;
}
}

@ -45,7 +45,6 @@ package de.anomic.plasma;
import java.util.Iterator;
import java.util.Set;
import java.util.HashSet;
import java.io.IOException;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
@ -242,13 +241,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (System.currentTimeMillis() >= postorderLimitTime) break;
entry = preorder.next();
// find the url entry
try {
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
} catch (IOException e) {
// result was not found
}
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
@ -298,13 +293,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (System.currentTimeMillis() >= postorderLimitTime) break;
entry = preorder.next();
// find the url entry
try {
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
} catch (IOException e) {
// result was not found
}
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);

@ -45,7 +45,6 @@ package de.anomic.plasma;
import java.io.IOException;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.http.CrawlWorker;
import java.util.Enumeration;
import java.util.HashMap;
@ -53,7 +52,6 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySearch;

@ -2157,25 +2157,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
try {
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null)
return 0;
URL url = entry.url();
if (url == null)
return 0;
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url,
// fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
} catch (IOException e) {
return 0;
}
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
URL url = entry.url();
if (url == null) return 0;
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
}
public int removeReferences(URL url, Set words) {

@ -328,12 +328,8 @@ public class plasmaSwitchboardQueue {
public URL referrerURL() {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
try {
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url();
} catch (IOException e) {
referrerURL = null;
}
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url();
}
return referrerURL;
}

@ -81,10 +81,8 @@ public class plasmaURLPool {
if (urlhash.equals(indexURL.dummyHash)) return null;
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
try {
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
} catch (IOException e) {}
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;

@ -689,20 +689,15 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
while (containerIterator.hasNext() && run) {
waiter();
entry = (indexEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
try {
plasmaCrawlLURL.Entry lurlEntry = lurl.load(entry.urlHash(), null);
if (lurlEntry != null) {
url = lurlEntry.url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
} else {
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
url = ue.url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
} catch (IOException e) {
urlHashs.add(entry.urlHash());
}
}
if (urlHashs.size() > 0) {

Loading…
Cancel
Save