Fixed all known NullPointerException problems for LURLs

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2513 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent a5ed86105b
commit 9340dbb501

@ -46,7 +46,6 @@
// if the shell's current path is HTROOT
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.HashSet;
import java.util.Iterator;
@ -135,8 +134,8 @@ public class Bookmarks {
bookmarksDB.Bookmark bookmark = switchboard.bookmarksDB.getBookmark(urlHash);
if (bookmark == null) {
// try to get the bookmark from the LURL database
try {
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
plasmaCrawlLURL.Entry urlentry = switchboard.urlPool.loadedURL.load(urlHash, null);
if (urlentry != null) {
prop.put("mode_edit", 0); // create mode
if (urlentry != null) {
prop.put("mode_title", urlentry.descr());
@ -145,8 +144,6 @@ public class Bookmarks {
}
prop.put("mode_tags", "");
prop.put("mode_public", 0);
} catch (IOException e) {
e.printStackTrace();
}
} else {
// get from the bookmark database

@ -55,7 +55,6 @@ import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.plasma.plasmaSwitchboard;

@ -211,8 +211,10 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdelete")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
if (entry != null) {
URL url = entry.url();
urlstring = url.toNormalform();
@ -222,8 +224,6 @@ public class IndexControl_p {
} else {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
}
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
}
}
@ -265,16 +265,12 @@ public class IndexControl_p {
plasmaCrawlLURL.Entry lurl;
while (urlIter.hasNext()) {
iEntry = (indexEntry) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
if ((lurl == null)||(lurl.toString() == null)) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
} else {
knownURLs.put(iEntry.urlHash(), lurl);
}
} catch (IOException e) {
lurl = switchboard.urlPool.loadedURL.load(iEntry.urlHash(), null);
if (lurl.toString() == null) {
unknownURLEntries.add(iEntry.urlHash());
urlIter.remove();
} else {
knownURLs.put(iEntry.urlHash(), lurl);
}
}
// use whats remaining
@ -313,22 +309,26 @@ public class IndexControl_p {
if (post.containsKey("urlstringsearch")) {
try {
URL url = new URL(urlstring);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
urlhash = indexURL.urlHash(url);
prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
if (entry == null) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
} else {
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
}
} catch (MalformedURLException e) {
prop.put("urlstring", "bad url: " + urlstring);
prop.put("urlhash", "");
} catch (IOException e) {
prop.put("urlstring", "unknown url: " + urlstring);
prop.put("urlhash", "");
}
}
if (post.containsKey("urlhashsearch")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(urlhash, null);
if (entry == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
if (entry != null) {
URL url = entry.url();
urlstring = url.toString();
@ -337,8 +337,6 @@ public class IndexControl_p {
} else {
prop.put("result", "No Entry for URL hash " + urlhash);
}
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash);
}
}
@ -394,15 +392,11 @@ public class IndexControl_p {
if (entry == null) { return "No entry found for URL-hash " + urlhash; }
URL url = entry.url();
String referrer = null;
try {
plasmaCrawlLURL.Entry referrerEntry = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (referrerEntry != null) {
referrer = referrerEntry.url().toString();
} else {
referrer = "<unknown>";
}
} catch (IOException e) {
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(entry.referrerHash(), null);
if (le == null) {
referrer = "<unknown>";
} else {
referrer = le.url().toString();
}
if (url == null) { return "No entry found for URL-hash " + urlhash; }
String result = "<table>" +
@ -456,16 +450,13 @@ public class IndexControl_p {
while (en.hasNext()) {
xi = (indexEntry) en.next();
uh = new String[]{xi.urlHash(), Integer.toString(xi.posintext())};
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(uh[0], null);
if (entry != null) {
us = entry.url().toString();
tm.put(us, uh);
} else {
tm.put(uh[0], uh);
}
} catch (IOException e) {
plasmaCrawlLURL.Entry le = switchboard.urlPool.loadedURL.load(uh[0], null);
if (le == null) {
tm.put(uh[0], uh);
} else {
us = le.url().toString();
tm.put(us, uh);
}
}

@ -106,9 +106,8 @@ public class ViewFile {
// getting the urlEntry that belongs to the url hash
Entry urlEntry = null;
try {
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
} catch (IOException e) {
urlEntry = sb.urlPool.loadedURL.load(urlHash, null);
if (urlEntry == null) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;

@ -45,7 +45,6 @@
// You must compile this file with
// javac -classpath .:../classes crawlOrder.java
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.http.httpHeader;
@ -249,8 +248,11 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.load(indexURL.urlHash(url), null);
if (entry == null) {
response = "rejected";
lurl = "";
} else {
if (entry != null) {
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
@ -259,9 +261,6 @@ public final class crawlOrder {
response = "rejected";
lurl = "";
}
} catch (IOException e) {
response = "rejected";
lurl = "";
}
} else {
response = "rejected";

@ -92,7 +92,6 @@ import java.util.zip.GZIPOutputStream;
import de.anomic.htmlFilter.htmlFilterContentTransformer;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.htmlFilter.htmlFilterTransformer;
import de.anomic.index.indexURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;

@ -351,6 +351,11 @@ public class kelondroRow {
case kelondroColumn.encoder_none:
throw new kelondroException("ROW", "getColLong has celltype none, no encoder given");
case kelondroColumn.encoder_b64e:
// start - fix for badly stored parameters
boolean maxvalue = true;
for (int i = 0; i < length; i++) if (rowinstance[offset + i] != '_') {maxvalue = false; break;}
if (maxvalue) return 0;
// stop - fix for badly stored parameters
return kelondroBase64Order.enhancedCoder.decodeLong(rowinstance, offset, length);
case kelondroColumn.encoder_b256:
return kelondroNaturalOrder.decodeLong(rowinstance, offset, length);

@ -160,7 +160,7 @@ public final class plasmaCrawlLURL extends indexURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public Entry load(String urlHash, indexEntry searchedWord) throws IOException {
public Entry load(String urlHash, indexEntry searchedWord) {
// generates a plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -169,9 +169,13 @@ public final class plasmaCrawlLURL extends indexURL {
// - look into the filed properties
// if the url cannot be found, this returns null
kelondroRow.Entry entry = urlIndexCache.get(urlHash.getBytes());
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new Entry(entry, searchedWord);
try {
if (entry == null) entry = urlIndexFile.get(urlHash.getBytes());
if (entry == null) return null;
return new Entry(entry, searchedWord);
} catch (IOException e) {
return null;
}
}
public void store(Entry entry, boolean cached) throws IOException {

@ -379,14 +379,10 @@ public final class plasmaCrawlStacker {
String nexturlhash = indexURL.urlHash(nexturl);
String dbocc = this.sb.urlPool.exists(nexturlhash);
plasmaCrawlLURL.Entry oldEntry = null;
if (dbocc != null) try {
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
} catch (IOException e) {}
boolean recrawl = (oldEntry != null) &&
(((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
oldEntry = this.sb.urlPool.loadedURL.load(nexturlhash, null);
boolean recrawl = (oldEntry != null) && (((System.currentTimeMillis() - oldEntry.loaddate().getTime()) / 60000) > profile.recrawlIfOlder());
if ((dbocc != null) && (!(recrawl))) {
reason = plasmaCrawlEURL.DOUBLE_REGISTERED + dbocc + ")";
this.log.logFine("URL '" + nexturlString + "' is double registered in '" + dbocc + "'. " +
"Stack processing time: " + (System.currentTimeMillis()-startTime) + "ms");
return reason;

@ -227,22 +227,16 @@ public class plasmaDHTChunk {
// iterate over indexes to fetch url entries and store them in the urlCache
while ((urlIter.hasNext()) && (maxcount > refcount)) {
iEntry = (indexEntry) urlIter.next();
try {
lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
} else {
urlCache.put(iEntry.urlHash(), lurl);
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
refcount++;
}
} catch (IOException e) {
lurl = lurls.load(iEntry.urlHash(), iEntry);
if ((lurl == null) || (lurl.url() == null)) {
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: not-bound url hash '" + iEntry.urlHash() + "' for word hash " + container.getWordHash());
notBoundCounter++;
urlIter.remove();
wordIndex.removeEntry(container.getWordHash(), iEntry.urlHash(), true);
} else {
urlCache.put(iEntry.urlHash(), lurl);
//yacyCore.log.logFine("DEBUG selectTransferContainersResource: added url hash '" + iEntry.urlHash() + "' to urlCache for word hash " + container.getWordHash());
refcount++;
}
}

@ -45,7 +45,6 @@ package de.anomic.plasma;
import java.util.Iterator;
import java.util.Set;
import java.util.HashSet;
import java.io.IOException;
import de.anomic.kelondro.kelondroException;
import de.anomic.server.logging.serverLog;
@ -242,13 +241,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (System.currentTimeMillis() >= postorderLimitTime) break;
entry = preorder.next();
// find the url entry
try {
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
} catch (IOException e) {
// result was not found
}
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);
@ -298,13 +293,9 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (System.currentTimeMillis() >= postorderLimitTime) break;
entry = preorder.next();
// find the url entry
try {
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
} catch (IOException e) {
// result was not found
}
page = urlStore.load(entry.urlHash(), entry);
// add a result
if (page != null) acc.addResult(entry, page);
}
} catch (kelondroException ee) {
serverLog.logSevere("PLASMA", "Database Failure during plasmaSearch.order: " + ee.getMessage(), ee);

@ -45,7 +45,6 @@ package de.anomic.plasma;
import java.io.IOException;
import de.anomic.net.URL;
import de.anomic.plasma.cache.IResourceInfo;
import de.anomic.plasma.crawler.http.CrawlWorker;
import java.util.Enumeration;
import java.util.HashMap;
@ -53,7 +52,6 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySearch;

@ -2157,25 +2157,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// finally, delete the url entry
// determine the url string
try {
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null)
return 0;
URL url = entry.url();
if (url == null)
return 0;
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url,
// fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
} catch (IOException e) {
return 0;
}
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.load(urlhash, null);
if (entry == null) return 0;
URL url = entry.url();
if (url == null) return 0;
// get set of words
// Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
Iterator witer = plasmaCondenser.getWords(snippetCache.parseDocument(url, snippetCache.getResource(url, fetchOnline, 10000)).getText());
// delete all word references
int count = removeReferences(urlhash, witer);
// finally delete the url entry itself
urlPool.loadedURL.remove(urlhash);
return count;
}
public int removeReferences(URL url, Set words) {

@ -328,12 +328,8 @@ public class plasmaSwitchboardQueue {
public URL referrerURL() {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(indexURL.dummyHash))) return null;
try {
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url();
} catch (IOException e) {
referrerURL = null;
}
plasmaCrawlLURL.Entry entry = lurls.load(referrerHash, null);
if (entry == null) referrerURL = null; else referrerURL = entry.url();
}
return referrerURL;
}

@ -81,10 +81,8 @@ public class plasmaURLPool {
if (urlhash.equals(indexURL.dummyHash)) return null;
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
try {
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
} catch (IOException e) {}
plasmaCrawlLURL.Entry le = loadedURL.load(urlhash, null);
if (le != null) return le.url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;

@ -689,20 +689,15 @@ public final class plasmaWordIndex extends indexAbstractRI implements indexRI {
while (containerIterator.hasNext() && run) {
waiter();
entry = (indexEntry) containerIterator.next();
// System.out.println("Wordhash: "+wordHash+" UrlHash:
// "+entry.getUrlHash());
try {
plasmaCrawlLURL.Entry lurlEntry = lurl.load(entry.urlHash(), null);
if (lurlEntry != null) {
url = lurlEntry.url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
} else {
// System.out.println("Wordhash: "+wordHash+" UrlHash: "+entry.getUrlHash());
plasmaCrawlLURL.Entry ue = lurl.load(entry.urlHash(), null);
if (ue == null) {
urlHashs.add(entry.urlHash());
} else {
url = ue.url();
if ((url == null) || (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_CRAWLER, url) == true)) {
urlHashs.add(entry.urlHash());
}
} catch (IOException e) {
urlHashs.add(entry.urlHash());
}
}
if (urlHashs.size() > 0) {

Loading…
Cancel
Save