From 9340dbb5015f661bb4cf68f6396b28933464f7a0 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 7 Sep 2006 18:24:39 +0000 Subject: [PATCH] fixed all possible problems with nullpointer exception for LURLs git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2513 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Bookmarks.java | 7 +- htroot/CacheAdmin_p.java | 1 - htroot/IndexControl_p.java | 71 ++++++++----------- htroot/ViewFile.java | 5 +- htroot/yacy/crawlOrder.java | 11 ++- source/de/anomic/http/httpdProxyHandler.java | 1 - source/de/anomic/kelondro/kelondroRow.java | 5 ++ source/de/anomic/plasma/plasmaCrawlLURL.java | 12 ++-- .../de/anomic/plasma/plasmaCrawlStacker.java | 8 +-- source/de/anomic/plasma/plasmaDHTChunk.java | 20 ++---- .../de/anomic/plasma/plasmaSearchEvent.java | 21 ++---- .../de/anomic/plasma/plasmaSnippetCache.java | 2 - .../de/anomic/plasma/plasmaSwitchboard.java | 31 ++++---- .../anomic/plasma/plasmaSwitchboardQueue.java | 8 +-- source/de/anomic/plasma/plasmaURLPool.java | 6 +- source/de/anomic/plasma/plasmaWordIndex.java | 19 ++--- 16 files changed, 91 insertions(+), 137 deletions(-) diff --git a/htroot/Bookmarks.java b/htroot/Bookmarks.java index 5c7f58dd6..5e1eafab6 100644 --- a/htroot/Bookmarks.java +++ b/htroot/Bookmarks.java @@ -46,7 +46,6 @@ // if the shell's current path is HTROOT import java.io.File; -import java.io.IOException; import java.net.MalformedURLException; import java.util.HashSet; import java.util.Iterator; @@ -135,8 +134,8 @@ public class Bookmarks { bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash); if (bookmark == null) { // try to get the bookmark from the LURL database - try { - plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); + plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null); + if (urlentry != null) { prop.put("mode_edit", 0); // create mode if (urlentry != null) { prop.put("mode_title", urlentry.descr()); @@ -145,8 +144,6 @@ public class Bookmarks { } prop.put("mode_tags", ""); prop.put("mode_public", 0); - } catch (IOException e) { - e.printStackTrace(); } } else { // get from the bookmark database diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java index 84d7f69d7..cb9c41a39 100644 --- a/htroot/CacheAdmin_p.java +++ b/htroot/CacheAdmin_p.java @@ -55,7 +55,6 @@ import java.util.TreeSet; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.http.httpHeader; -import de.anomic.index.indexURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParserDocument; import de.anomic.plasma.plasmaSwitchboard; diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index a426554e6..cb69d1146 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -211,8 +211,10 @@ public class IndexControl_p { } if (post.containsKey("urlhashdelete")) { - try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + if (entry == null) { + prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); + } else { if (entry != null) { URL url = entry.url(); urlstring = url.toNormalform(); @@ -222,8 +224,6 @@ public class IndexControl_p { } else { prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } - } catch (IOException e) { - prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted."); } } @@ -265,16 +265,12 @@ public class IndexControl_p { plasmaCrawlLURL.Entry lurl; while (urlIter.hasNext()) { iEntry = (indexEntry) urlIter.next(); - try { - lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); - if ((lurl == null)||(lurl.toString() == null)) { - unknownURLEntries.add(iEntry.urlHash()); - urlIter.remove(); - } else { - knownURLs.put(iEntry.urlHash(), lurl); - } - } catch (IOException e) { + lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null); + if (lurl.toString() == null) { unknownURLEntries.add(iEntry.urlHash()); + urlIter.remove(); + } else { + knownURLs.put(iEntry.urlHash(), lurl); } } // use whats remaining @@ -313,22 +309,26 @@ public class IndexControl_p { if (post.containsKey("urlstringsearch")) { try { URL url = new URL(urlstring); - urlhash = indexURL.urlHash(url); - prop.put("urlhash", urlhash); + urlhash = indexURL.urlHash(url); + prop.put("urlhash", urlhash); plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); - prop.put("result", genUrlProfile(switchboard, entry, urlhash)); + if (entry == null) { + prop.put("urlstring", "unknown url: " + urlstring); + prop.put("urlhash", ""); + } else { + prop.put("result", genUrlProfile(switchboard, entry, urlhash)); + } } catch (MalformedURLException e) { prop.put("urlstring", "bad url: " + urlstring); prop.put("urlhash", ""); - } catch (IOException e) { - prop.put("urlstring", "unknown url: " + urlstring); - prop.put("urlhash", ""); } } if (post.containsKey("urlhashsearch")) { - try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null); + if (entry == null) { + prop.put("result", "No Entry for URL hash " + urlhash); + } else { if (entry != null) { URL url = entry.url(); urlstring = url.toString(); @@ -337,8 +337,6 @@ public class IndexControl_p { } else { prop.put("result", "No Entry for URL hash " + urlhash); } - } catch (IOException e) { - prop.put("result", "No Entry for URL hash " + urlhash); } } @@ -394,15 +392,11 @@ public class IndexControl_p { if (entry == null) { return "No entry found for URL-hash " + urlhash; } URL url = entry.url(); String referrer = null; - try { - plasmaCrawlLURL.Entry referrerEntry = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); - if (referrerEntry != null) { - referrer = referrerEntry.url().toString(); - } else { - referrer = ""; - } - } catch (IOException e) { + plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null); + if (le == null) { referrer = ""; + } else { + referrer = le.url().toString(); } if (url == null) { return "No entry found for URL-hash " + urlhash; } String result = "" + @@ -456,16 +450,13 @@ public class IndexControl_p { while (en.hasNext()) { xi = (indexEntry) en.next(); uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())}; - try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(uh[0], null); - if (entry != null) { - us = entry.url().toString(); - tm.put(us, uh); - } else { - tm.put(uh[0], uh); - } - } catch (IOException e) { + plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null); + if (le == null) { tm.put(uh[0], uh); + } else { + us = le.url().toString(); + tm.put(us, uh); + } } diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index bcfecfcc6..d28daa23a 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -106,9 +106,8 @@ public class ViewFile { // getting the urlEntry that belongs to the url hash Entry urlEntry = null; - try { - urlEntry = sb.urlPool.loadedURL.load(urlHash, null); - } catch (IOException e) { + urlEntry = sb.urlPool.loadedURL.load(urlHash, null); + if (urlEntry == null) { prop.put("error",2); prop.put("viewMode",VIEW_MODE_NO_TEXT); return prop; diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java index c52de8975..a1604c8d4 100644 --- a/htroot/yacy/crawlOrder.java +++ b/htroot/yacy/crawlOrder.java @@ -45,7 +45,6 @@ // You must compile this file with // javac -classpath .:../classes crawlOrder.java -import java.io.IOException; import java.util.ArrayList; import java.util.Date; import de.anomic.http.httpHeader; @@ -249,8 +248,11 @@ public final class crawlOrder { // case where we have already the url loaded; reason = reasonString; // send lurl-Entry as response - try { - plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); + plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null); + if (entry == null) { + response = "rejected"; + lurl = ""; + } else { if (entry != null) { response = "double"; switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare); @@ -259,9 +261,6 @@ public final class crawlOrder { response = "rejected"; lurl = ""; } - } catch (IOException e) { - response = "rejected"; - lurl = ""; } } else { response = "rejected"; diff --git a/source/de/anomic/http/httpdProxyHandler.java b/source/de/anomic/http/httpdProxyHandler.java index 7f81554bb..4de724b49 100644 --- a/source/de/anomic/http/httpdProxyHandler.java +++ b/source/de/anomic/http/httpdProxyHandler.java @@ -92,7 +92,6 @@ import java.util.zip.GZIPOutputStream; import de.anomic.htmlFilter.htmlFilterContentTransformer; import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.htmlFilter.htmlFilterTransformer; -import de.anomic.index.indexURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaParser; import de.anomic.plasma.plasmaSwitchboard; diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index 91c12c4de..81897ab39 100644 --- a/source/de/anomic/kelondro/kelondroRow.java +++ b/source/de/anomic/kelondro/kelondroRow.java @@ -351,6 +351,11 @@ public class kelondroRow { case kelondroColumn.encoder_none: throw new kelondroException("ROW", "getColLong has celltype none, no encoder given"); case kelondroColumn.encoder_b64e: + // start - fix for badly stored parameters + boolean maxvalue = true; + for (int i = 0; i < length; i++) if (rowinstance[offset + i] != '_') {maxvalue = false; break;} + if (maxvalue) return 0; + // stop - fix for badly stored parameters return kelondroBase64Order.enhancedCoder.decodeLong(rowinstance, offset, length); case kelondroColumn.encoder_b256: return kelondroNaturalOrder.decodeLong(rowinstance, offset, length); diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java index 0856c82a2..fadf08d52 100644 --- a/source/de/anomic/plasma/plasmaCrawlLURL.java +++ b/source/de/anomic/plasma/plasmaCrawlLURL.java @@ -160,7 +160,7 @@ public final class plasmaCrawlLURL extends indexURL { gcrawlResultStack.add(urlHash + initiatorHash + executorHash); } - public Entry load(String urlHash, indexEntry searchedWord) throws IOException { + public Entry load(String urlHash, indexEntry searchedWord) { // generates an plasmaLURLEntry using the url hash // to speed up the access, the url-hashes are buffered // in the hash cache. @@ -169,9 +169,13 @@ public final class plasmaCrawlLURL extends indexURL { // - look into the filed properties // if the url cannot be found, this returns null kelondroRow.Entry entry = urlIndexCache.get(urlHash.getBytes()); - if (entry == null) entry = urlIndexFile.get(urlHash.getBytes()); - if (entry == null) return null; - return new Entry(entry, searchedWord); + try { + if (entry == null) entry = urlIndexFile.get(urlHash.getBytes()); + if (entry == null) return null; + return new Entry(entry, searchedWord); + } catch (IOException e) { + return null; + } } public void store(Entry entry, boolean cached) throws IOException { diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 7e5b58062..1b65bcaad 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -379,14 +379,10 @@ public final class plasmaCrawlStacker { String nexturlhash = indexURL.urlHash(nexturl); String dbocc = this.sb.urlPool.exists(nexturlhash); plasmaCrawlLURL.Entry oldEntry = null; - if (dbocc != null) try { - oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); - } catch (IOException e) {} - boolean recrawl = (oldEntry != null) && - (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); + oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null); + boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder()); if ((dbocc != null) && (!(recrawl))) { reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")"; - this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " + "Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms"); return reason; diff --git a/source/de/anomic/plasma/plasmaDHTChunk.java b/source/de/anomic/plasma/plasmaDHTChunk.java index bd44b7f8a..fe35dc81f 100644 --- a/source/de/anomic/plasma/plasmaDHTChunk.java +++ b/source/de/anomic/plasma/plasmaDHTChunk.java @@ -227,22 +227,16 @@ public class plasmaDHTChunk { // iterate over indexes to fetch url entries and store them in the urlCache while ((urlIter.hasNext()) && (maxcount > refcount)) { iEntry = (indexEntry) urlIter.next(); - try { - lurl = lurls.load(iEntry.urlHash(), iEntry); - if ((lurl == null) || (lurl.url() == null)) { - //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); - notBoundCounter++; - urlIter.remove(); - wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true); - } else { - urlCache.put(iEntry.urlHash(), lurl); - //yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash()); - refcount++; - } - } catch (IOException e) { + lurl = lurls.load(iEntry.urlHash(), iEntry); + if ((lurl == null) || (lurl.url() == null)) { + //yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash()); notBoundCounter++; urlIter.remove(); wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true); + } else { + urlCache.put(iEntry.urlHash(), lurl); + //yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash()); + refcount++; } } diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index da1fa3b91..daa98089a 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -45,7 +45,6 @@ package de.anomic.plasma; import java.util.Iterator; import java.util.Set; import java.util.HashSet; -import java.io.IOException; import de.anomic.kelondro.kelondroException; import de.anomic.server.logging.serverLog; @@ -242,13 +241,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { if (System.currentTimeMillis() >= postorderLimitTime) break; entry = preorder.next(); // find the url entry - try { - page = urlStore.load(entry.urlHash(), entry); - // add a result - if (page != null) acc.addResult(entry, page); - } catch (IOException e) { - // result was not found - } + page = urlStore.load(entry.urlHash(), entry); + // add a result + if (page != null) acc.addResult(entry, page); } } catch (kelondroException ee) { serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); @@ -298,13 +293,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable { if (System.currentTimeMillis() >= postorderLimitTime) break; entry = preorder.next(); // find the url entry - try { - page = urlStore.load(entry.urlHash(), entry); - // add a result - if (page != null) acc.addResult(entry, page); - } catch (IOException e) { - // result was not found - } + page = urlStore.load(entry.urlHash(), entry); + // add a result + if (page != null) acc.addResult(entry, page); } } catch (kelondroException ee) { serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee); diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index f6c0fe2b3..dfc87e157 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -45,7 +45,6 @@ package de.anomic.plasma; import java.io.IOException; import de.anomic.net.URL; import de.anomic.plasma.cache.IResourceInfo; -import de.anomic.plasma.crawler.http.CrawlWorker; import java.util.Enumeration; import java.util.HashMap; @@ -53,7 +52,6 @@ import java.util.HashSet; import java.util.Iterator; import java.util.Set; -import de.anomic.http.httpHeader; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySearch; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 3a3ac2cb7..e083b8425 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -2157,25 +2157,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // finally, delete the url entry // determine the url string - try { - plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null); - if (entry == null) - return 0; - URL url = entry.url(); - if (url == null) - return 0; - // get set of words - // Set words = plasmaCondenser.getWords(getText(getResource(url, - // fetchOnline))); - Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText()); - // delete all word references - int count = removeReferences(urlhash, witer); - // finally delete the url entry itself - urlPool.loadedURL.remove(urlhash); - return count; - } catch (IOException e) { - return 0; - } + plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null); + if (entry == null) return 0; + URL url = entry.url(); + if (url == null) return 0; + // get set of words + // Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline))); + Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText()); + // delete all word references + int count = removeReferences(urlhash, witer); + // finally delete the url entry itself + urlPool.loadedURL.remove(urlhash); + return count; } public int removeReferences(URL url, Set words) { diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index b664f745d..e0b115a87 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -328,12 +328,8 @@ public class plasmaSwitchboardQueue { public URL referrerURL() { if (referrerURL == null) { if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null; - try { - plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null); - if (entry == null) referrerURL = null; else referrerURL = entry.url(); - } catch (IOException e) { - referrerURL = null; - } + plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null); + if (entry == null) referrerURL = null; else referrerURL = entry.url(); } return referrerURL; } diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java index 8f41adbd7..157f058df 100644 --- a/source/de/anomic/plasma/plasmaURLPool.java +++ b/source/de/anomic/plasma/plasmaURLPool.java @@ -81,10 +81,8 @@ public class plasmaURLPool { if (urlhash.equals(indexURL.dummyHash)) return null; plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); if (ne != null) return ne.url(); - try { - plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null); - if (le != null) return le.url(); - } catch (IOException e) {} + plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null); + if (le != null) return le.url(); plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); if (ee != null) return ee.url(); return null; diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index f0818f5e0..71f456475 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -689,20 +689,15 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI { while (containerIterator.hasNext() && run) { waiter(); entry = (indexEntry) containerIterator.next(); - // System.out.println("Wordhash: "+wordHash+" UrlHash: - // "+entry.getUrlHash()); - try { - plasmaCrawlLURL.Entry lurlEntry = lurl.load(entry.urlHash(), null); - if (lurlEntry != null) { - url = lurlEntry.url(); - if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) { - urlHashs.add(entry.urlHash()); - } - } else { + // System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash()); + plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null); + if (ue == null) { + urlHashs.add(entry.urlHash()); + } else { + url = ue.url(); + if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) { urlHashs.add(entry.urlHash()); } - } catch (IOException e) { - urlHashs.add(entry.urlHash()); } } if (urlHashs.size() > 0) {