From bb79fb5d917ce33f6fb5ef3c3b020a92420eeba0 Mon Sep 17 00:00:00 2001
From: orbiter
Date: Sun, 11 Dec 2005 00:25:02 +0000
Subject: [PATCH] - changed handling of error cases when retrieving urls from
 the database (NULL values are no longer returned; an IOException is thrown
 instead)
- removed the ugly damagedURLS implementation from plasmaCrawlLURL.java
  (it inserted a static value into the object, which is not really good style)
- re-coded the damagedURLS collection in yacy.java by catching the exception
  and evaluating the exception message

to do:
- the urldbcleanup feature must be re-tested

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1200 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/IndexControl_p.java                    |  55 ++++----
 htroot/IndexCreateWWWLocalQueue_p.java        |  10 +-
 htroot/ViewFile.java                          |   6 +-
 htroot/yacy/crawlOrder.java                   |   7 +-
 htroot/yacy/crawlReceipt.java                 |   8 +-
 source/de/anomic/plasma/plasmaCrawlEURL.java  |  68 +++++-----
 source/de/anomic/plasma/plasmaCrawlLURL.java  |  90 ++++++-------
 source/de/anomic/plasma/plasmaCrawlNURL.java  |  42 +++---
 .../de/anomic/plasma/plasmaSwitchboard.java   | 124 ++++++++++--------
 .../anomic/plasma/plasmaSwitchboardQueue.java |   2 +-
 source/de/anomic/plasma/plasmaURLPool.java    |   2 +-
 source/yacy.java                              | 124 +++++++++---------
 12 files changed, 285 insertions(+), 253 deletions(-)

diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index 18ec40e95..2fb4389f4 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -47,7 +47,6 @@
 // if the shell's current path is HTROOT
 
 import java.io.IOException;
-import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.Enumeration;
 import java.util.HashSet;
@@ -167,9 +166,9 @@ public class IndexControl_p {
             }
         }
         if (delurlref) {
-            for (int i = 0; i < urlx.length; i++) {
+            for (int i = 0; i < urlx.length; i++) try {
                 switchboard.removeAllUrlReferences(urlx[i], true);
-            }
+            } catch (IOException e) {}
         }
         if (delurl || delurlref) {
             for (int i = 0; i < urlx.length; i++) {
@@ -189,9 +188,9 @@ public class IndexControl_p {
         // delete selected URLs
         if (post.containsKey("keyhashdelete")) {
             if (delurlref) {
-                for (int i = 0; i < urlx.length; i++) {
+                for (int i = 0; i < urlx.length; i++) try {
                     switchboard.removeAllUrlReferences(urlx[i], true);
-                }
+                } catch (IOException e) {}
             }
             if (delurl || delurlref) {
                 for (int i = 0; i < urlx.length; i++) {
@@ -212,20 +211,24 @@ public class IndexControl_p {
         }
 
         if (post.containsKey("urlhashdeleteall")) {
-            int i = switchboard.removeAllUrlReferences(urlhash, true);
-            prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
+            try {
+                int i = switchboard.removeAllUrlReferences(urlhash, true);
+                prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
+            } catch (IOException e) {
+                prop.put("result", "Deleted nothing because the url-hash could not be resolved");
+            }
         }
 
         if (post.containsKey("urlhashdelete")) {
-            plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
-            URL url = entry.url();
-            if (url == null) {
-                prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
-            } else {
+            try {
+                plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
+                URL url = entry.url();
                 urlstring = htmlFilterContentScraper.urlNormalform(url);
                 prop.put("urlstring", "");
                 switchboard.urlPool.loadedURL.remove(urlhash);
                 prop.put("result", "Removed URL " + urlstring);
+            } catch (IOException e) {
+                prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
             }
         }
 
@@ -267,16 +270,16 @@ public class IndexControl_p {
             plasmaCrawlLURL.Entry lurl;
             while (urlIter.hasNext()) {
                 indexEntry = (plasmaWordIndexEntry) urlIter.next();
-                lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
-                if (lurl == null) {
-                    unknownURLEntries.add(indexEntry.getUrlHash());
-                } else {
+                try {
+                    lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
                     if (lurl.toString() == null) {
                         switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
                         unknownURLEntries.add(indexEntry.getUrlHash());
                     } else {
                         knownURLs.put(indexEntry.getUrlHash(), lurl);
                     }
+                } catch (IOException e) {
+                    unknownURLEntries.add(indexEntry.getUrlHash());
                 }
             }
             // now delete all entries that have no url entry
@@ -327,21 +330,21 @@ public class IndexControl_p {
                 prop.put("urlhash", urlhash);
                 plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
                 prop.put("result", genUrlProfile(switchboard, entry, urlhash));
-            } catch (MalformedURLException e) {
+            } catch (Exception e) {
                 prop.put("urlstring", "wrong url: " + urlstring);
                 prop.put("urlhash", "");
             }
         }
 
         if (post.containsKey("urlhashsearch")) {
-            plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
-            URL url = entry.url();
-            if (url == null) {
-                prop.put("result", "No Entry for URL hash " + urlhash);
-            } else {
+            try {
+                plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
+                URL url = entry.url();
                 urlstring = url.toString();
                 prop.put("urlstring", urlstring);
                 prop.put("result", genUrlProfile(switchboard, entry, urlhash));
+            } catch (IOException e) {
+                prop.put("result", "No Entry for URL hash " + urlhash);
             }
         }
 
@@ -391,6 +394,12 @@ public class IndexControl_p {
     public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) {
         if (entry == null) { return "No entry found for URL-hash " + urlhash; }
         URL url = entry.url();
+        String referrer = null;
+        try {
+            referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url().toString();
+        } catch (IOException e) {
+            referrer = "";
+        }
         if (url == null) { return "No entry found for URL-hash " + urlhash; }
         String result = "<table>" +
             "<tr><td>URL String</td><td>" + htmlFilterContentScraper.urlNormalform(url) + "</td></tr>" +
@@ -398,7 +407,7 @@ public class IndexControl_p {
             "<tr><td>Description</td><td>" + entry.descr() + "</td></tr>" +
             "<tr><td>Modified-Date</td><td>" + entry.moddate() + "</td></tr>" +
             "<tr><td>Loaded-Date</td><td>" + entry.loaddate() + "</td></tr>" +
-            "<tr><td>Referrer</td><td>" + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + "</td></tr>" +
+            "<tr><td>Referrer</td><td>" + referrer + "</td></tr>" +
             "<tr><td>Doctype</td><td>" + entry.doctype() + "</td></tr>" +
             "<tr><td>Copy-Count</td><td>" + entry.copyCount() + "</td></tr>" +
            "<tr><td>Local-Flag</td><td>" + entry.local() + "</td></tr>" +
diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java
index 6fda9b75b..d4d7b166c 100644
--- a/htroot/IndexCreateWWWLocalQueue_p.java
+++ b/htroot/IndexCreateWWWLocalQueue_p.java
@@ -43,6 +43,7 @@
 // javac -classpath .:../classes IndexCreate_p.java
 // if the shell's current path is HTROOT
 
+import java.io.IOException;
 import java.text.SimpleDateFormat;
 import java.util.Date;
 import java.util.Iterator;
@@ -96,9 +97,12 @@ public class IndexCreateWWWLocalQueue_p {
                 while (iter.hasNext()) {
                     String value = null;
                     String nextHash = new String((byte[]) iter.next());
-                    Entry entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
-                    if (entry == null) continue;
-
+                    Entry entry = null;
+                    try {
+                        entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
+                    } catch (IOException e) {
+                        continue;
+                    }
                     if ((option.equals("URL")&&(entry.url() != null))) {
                         value = entry.url().toString();
                     } else if ((option.equals("AnchorName"))) {
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 1bace063d..048ff9cfe 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -102,8 +102,10 @@ public class ViewFile {
         String viewMode = post.get("viewMode","sentences");
 
         // getting the urlEntry that belongs to the url hash
-        Entry urlEntry = sb.urlPool.loadedURL.getEntry(urlHash);
-        if (urlEntry == null) {
+        Entry urlEntry = null;
+        try {
+            urlEntry = sb.urlPool.loadedURL.getEntry(urlHash);
+        } catch (IOException e) {
             prop.put("error",2);
             prop.put("viewMode",VIEW_MODE_NO_TEXT);
             return prop;
diff --git a/htroot/yacy/crawlOrder.java b/htroot/yacy/crawlOrder.java
index 8ab682781..093c46398 100644
--- a/htroot/yacy/crawlOrder.java
+++ b/htroot/yacy/crawlOrder.java
@@ -45,6 +45,7 @@
 // You must compile this file with
 // javac -classpath .:../classes crawlOrder.java
 
+import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Date;
 
 import de.anomic.htmlFilter.htmlFilterContentScraper;
@@ -243,12 +244,12 @@ public final class crawlOrder {
                     // case where we have already the url loaded;
                     reason = reasonString;
                     // send lurl-Entry as response
-                    plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url));
-                    if (entry != null) {
+                    try {
+                        plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url));
                         response = "double";
                         switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
                         lurl = crypt.simpleEncode(entry.toString());
-                    } else {
+                    } catch (IOException e) {
                         response = "rejected";
                         lurl = "";
                     }
diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java
index 14a0c2a3b..cec996983 100644
--- a/htroot/yacy/crawlReceipt.java
+++ b/htroot/yacy/crawlReceipt.java
@@ -43,6 +43,8 @@
 // javac -classpath .:../classes crawlOrder.java
 
+import java.io.IOException;
+
 import de.anomic.http.httpHeader;
 import de.anomic.plasma.plasmaCrawlNURL;
 import de.anomic.plasma.plasmaCrawlLURL;
@@ -135,10 +137,12 @@ public final class crawlReceipt {
             // ready for more
             prop.put("delay", "10");
         } else {
-            plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
-            if (en != null) {
+            try {
+                plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
                 switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false);
                 switchboard.urlPool.noticeURL.remove(receivedUrlhash);
+            } catch (IOException e) {
             }
             prop.put("delay", "100"); // what shall we do with that???
         }
diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java
index 69940e15b..f9fb5c78d 100644
--- a/source/de/anomic/plasma/plasmaCrawlEURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlEURL.java
@@ -109,7 +109,7 @@ public class plasmaCrawlEURL extends plasmaURL {
         return e;
     }
 
-    public synchronized Entry getEntry(String hash) {
+    public synchronized Entry getEntry(String hash) throws IOException {
         return new Entry(hash);
     }
 
@@ -157,32 +157,30 @@ public class plasmaCrawlEURL extends plasmaURL {
         }
 
-        public Entry(String hash) {
-            // generates an plasmaEURLEntry using the url hash
-            // to speed up the access, the url-hashes are buffered
-            // in the hash cache.
-            // we have two options to find the url:
-            // - look into the hash cache
-            // - look into the filed properties
-            // if the url cannot be found, this returns null
-            this.hash = hash;
-            try {
-                byte[][] entry = urlHashCache.get(hash.getBytes());
-                if (entry != null) {
-                    this.referrer = new String(entry[1]);
-                    this.initiator = new String(entry[2]);
-                    this.executor = new String(entry[3]);
-                    this.url = new URL(new String(entry[4]).trim());
-                    this.name = new String(entry[5]).trim();
-                    this.initdate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6])));
-                    this.trydate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7])));
-                    this.trycount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
-                    this.failreason = new String(entry[9]);
-                    this.flags = new bitfield(entry[10]);
-                    return;
-                }
-            } catch (Exception e) {}
-        }
+        public Entry(String hash) throws IOException {
+            // generates an plasmaEURLEntry using the url hash
+            // to speed up the access, the url-hashes are buffered
+            // in the hash cache.
+            // we have two options to find the url:
+            // - look into the hash cache
+            // - look into the filed properties
+            // if the url cannot be found, this returns null
+            this.hash = hash;
+            byte[][] entry = urlHashCache.get(hash.getBytes());
+            if (entry != null) {
+                this.referrer = new String(entry[1]);
+                this.initiator = new String(entry[2]);
+                this.executor = new String(entry[3]);
+                this.url = new URL(new String(entry[4]).trim());
+                this.name = new String(entry[5]).trim();
+                this.initdate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6])));
+                this.trydate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7])));
+                this.trycount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
+                this.failreason = new String(entry[9]);
+                this.flags = new bitfield(entry[10]);
+                return;
+            }
+        }
 
         private void store() {
             // stores the values from the object variables into the database
@@ -257,16 +255,20 @@ public class plasmaCrawlEURL extends plasmaURL {
     }
 
     public class kenum implements Enumeration {
-        // enumerates entry elements
-        Iterator i;
-        public kenum(boolean up, boolean rotating) throws IOException {
+        // enumerates entry elements
+        Iterator i;
+        public kenum(boolean up, boolean rotating) throws IOException {
             i = urlHashCache.rows(up, rotating);
         }
-        public boolean hasMoreElements() {
+        public boolean hasMoreElements() {
             return i.hasNext();
         }
-        public Object nextElement() {
-            return new Entry(new String(((byte[][]) i.next())[0]));
+        public Object nextElement() {
+            try {
+                return new Entry(new String(((byte[][]) i.next())[0]));
+            } catch (IOException e) {
+                return null;
+            }
         }
     }
diff --git a/source/de/anomic/plasma/plasmaCrawlLURL.java b/source/de/anomic/plasma/plasmaCrawlLURL.java
index b45aa8eca..870571581 100644
--- a/source/de/anomic/plasma/plasmaCrawlLURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlLURL.java
@@ -57,17 +57,13 @@ import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.text.SimpleDateFormat;
-import java.util.Collections;
 import java.util.Date;
-import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.Locale;
 import java.util.Properties;
-import java.util.Set;
 
 import de.anomic.http.httpc;
-import de.anomic.kelondro.kelondroException;
 import de.anomic.kelondro.kelondroTree;
 import de.anomic.server.serverCodings;
 import de.anomic.server.serverObjects;
@@ -89,7 +85,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
     private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
     private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
 
-    public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
+    //public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
 
     public plasmaCrawlLURL(File cachePath, int bufferkb) throws IOException {
         super();
@@ -173,7 +169,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
         gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
     }
 
-    public synchronized Entry getEntry(String hash) {
+    public synchronized Entry getEntry(String hash) throws IOException {
         return new Entry(hash);
     }
 
@@ -347,9 +343,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
//              serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
                 urlHash = getUrlHash(tabletype, i);
//              serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
-                urle = getEntry(urlHash);
-//              serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
-                if (urle != null) try {
+                try {
+                    urle = getEntry(urlHash);
+//                  serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
                     initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
                     executorSeed = yacyCore.seedDB.getConnected(executorHash);
@@ -457,41 +453,38 @@ public final class plasmaCrawlLURL extends plasmaURL {
             store();
         }
 
-        public Entry(String urlHash) {
-            // generates an plasmaLURLEntry using the url hash
-            // to speed up the access, the url-hashes are buffered
-            // in the hash cache.
-            // we have two options to find the url:
-            // - look into the hash cache
-            // - look into the filed properties
-            // if the url cannot be found, this returns null
-            this.urlHash = urlHash;
-            try {
+        public Entry(String urlHash) throws IOException {
+            // generates an plasmaLURLEntry using the url hash
+            // to speed up the access, the url-hashes are buffered
+            // in the hash cache.
+            // we have two options to find the url:
+            // - look into the hash cache
+            // - look into the filed properties
+            // if the url cannot be found, this returns null
+            this.urlHash = urlHash;
             byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
-            if (entry != null) {
-                this.url = new URL(new String(entry[1]).trim());
-                this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2]).trim();
-                this.moddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[3])));
-                this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[4])));
-                this.referrerHash = (entry[5]==null)?dummyHash:new String(entry[5]);
-                this.copyCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6]));
-                this.flags = new String(entry[7]);
-                this.quality = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
-                this.language = new String(entry[9]);
-                this.doctype = (char) entry[10][0];
-                this.size = serverCodings.enhancedCoder.decodeBase64Long(new String(entry[11]));
-                this.wordCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[12]));
-                this.snippet = null;
-                return;
+            try {
+                if (entry != null) {
+                    this.url = new URL(new String(entry[1]).trim());
+                    this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2]).trim();
+                    this.moddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[3])));
+                    this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[4])));
+                    this.referrerHash = (entry[5] == null) ? dummyHash : new String(entry[5]);
+                    this.copyCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6]));
+                    this.flags = new String(entry[7]);
+                    this.quality = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
+                    this.language = new String(entry[9]);
+                    this.doctype = (char) entry[10][0];
+                    this.size = serverCodings.enhancedCoder.decodeBase64Long(new String(entry[11]));
+                    this.wordCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[12]));
+                    this.snippet = null;
+                    return;
+                }
+            } catch (Exception e) {
+                serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
+                throw new IOException("plasmaLURL.entry/1: " + e.toString());
             }
-            } catch (MalformedURLException e) {
-                plasmaCrawlLURL.damagedURLS.add(this.urlHash);
-                System.out.println("DEBUG: Marked damaged Entry for removal (malformedURL). UrlHash: " + this.urlHash);
-                //serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
-            } catch (Exception e) {
-                serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
             }
-        }
 
         public Entry(Properties prop, boolean setGlobal) {
             // generates an plasmaLURLEntry using the properties from the argument
@@ -742,14 +735,15 @@ public final class plasmaCrawlLURL extends plasmaURL {
             return i.hasNext();
         }
 
-        public Object next() {
+        public Object next() throws RuntimeException {
+            byte[] e = ((byte[][]) i.next())[0];
+            if (e == null) return null;
+            String hash = null;
             try {
-                byte[] e = ((byte[][])i.next())[0];
-                if (e == null) return null; else return new Entry(new String(e));
-            } catch (kelondroException e) {
-                e.printStackTrace();
-                error = true;
-                return null;
+                hash = new String(e);
+                return new Entry(hash);
+            } catch (IOException ex) {
+                throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + hash);
             }
         }
diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java
index 77e044b7c..108e7dd9a 100644
--- a/source/de/anomic/plasma/plasmaCrawlNURL.java
+++ b/source/de/anomic/plasma/plasmaCrawlNURL.java
@@ -363,7 +363,7 @@ public class plasmaCrawlNURL extends plasmaURL {
         }
     }
 
-    public synchronized Entry getEntry(String hash) {
+    public synchronized Entry getEntry(String hash) throws IOException {
         return new Entry(hash);
     }
 
@@ -431,7 +431,7 @@ public class plasmaCrawlNURL extends plasmaURL {
             return str.toString();
         }
 
-        public Entry(String hash) {
+        public Entry(String hash) throws IOException {
             // generates an plasmaNURLEntry using the url hash
             // to speed up the access, the url-hashes are buffered
             // in the hash cache.
@@ -440,26 +440,28 @@ public class plasmaCrawlNURL extends plasmaURL {
             // - look into the filed properties
             // if the url cannot be found, this returns null
             this.hash = hash;
-            try {
-                byte[][] entry = urlHashCache.get(hash.getBytes());
-                if (entry != null) {
-                    this.initiator = new String(entry[1]);
-                    this.url = new URL(new String(entry[2]).trim());
-                    this.referrer = (entry[3]==null) ? dummyHash : new String(entry[3]);
-                    this.name = (entry[4] == null) ? "" : new String(entry[4]).trim();
-                    this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[5])));
+            byte[][] entry = urlHashCache.get(hash.getBytes());
+            if (entry != null) {
+                //try {
+                this.initiator = new String(entry[1]);
+                this.url = new URL(new String(entry[2]).trim());
+                this.referrer = (entry[3] == null) ? dummyHash : new String(entry[3]);
+                this.name = (entry[4] == null) ? "" : new String(entry[4]).trim();
+                this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[5])));
                 this.profileHandle = (entry[6] == null) ? null : new String(entry[6]).trim();
-                this.depth = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7]));
-                this.anchors = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
-                this.forkfactor = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[9]));
-                this.flags = new bitfield(entry[10]);
-                this.handle = Integer.parseInt(new String(entry[11]));
+                this.depth = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7]));
+                this.anchors = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
+                this.forkfactor = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[9]));
+                this.flags = new bitfield(entry[10]);
+                this.handle = Integer.parseInt(new String(entry[11]));
                 return;
-            } else {
-                // show that we found nothing
-                this.url = null;
-            }
-            } catch (Exception e) {
+                //} catch (MalformedURLException e) {
+                //    throw new IOException("plasmaCrawlNURL/Entry: " + e);
+                //}
+            } else {
+                // show that we found nothing
+                throw new IOException("hash not found");
+                //this.url = null;
             }
         }
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 368ca45a7..d64cd760d 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1230,8 +1230,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 
         // create index
         String descr = document.getMainLongTitle();
-        URL referrerURL = entry.referrerURL();
-        String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL);
+        String referrerHash;
+        try {
+            URL referrerURL = entry.referrerURL();
+            referrerHash = plasmaURL.urlHash(referrerURL);
+        } catch (IOException e) {
+            referrerHash = plasmaURL.dummyHash;
+        }
 
         String noIndexReason = "unspecified";
         if (processCase == 4) {
             // proxy-load
@@ -1480,8 +1485,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
 
                 URL refererURL = null;
                 String refererHash = urlEntry.referrerHash();
-                if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) {
+                if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try {
                     refererURL = this.urlPool.getURL(refererHash);
+                } catch (IOException e) {
+                    refererURL = null;
                 }
                 cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile);
                 log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]");
@@ -1519,60 +1526,63 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         }
 
         // do the request
-        HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()));
-
-
-        // check success
-        /*
-         the result of the 'response' value can have one of the following values:
-         negative cases, no retry
-           denied - the peer does not want to crawl that
-           exception - an exception occurred
-
-         negative case, retry possible
-           rejected - the peer has rejected to process, but a re-try should be possible
-
-         positive case with crawling
-           stacked - the resource is processed asap
-
-         positive case without crawling
-           double - the resource is already in database, believed to be fresh and not reloaded
-                    the resource is also returned in lurl
-        */
-        if ((page == null) || (page.get("delay") == null)) {
-            log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + ")");
-            if (remoteSeed != null) yacyCore.peerActions.peerDeparture(remoteSeed);
-            return false;
-        } else try {
-            log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG
-
-            int newdelay = Integer.parseInt((String) page.get("delay"));
-            yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
-            String response = (String) page.get("response");
-            if (response.equals("stacked")) {
-                log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay);
-                return true;
-            } else if (response.equals("double")) {
-                String lurl = (String) page.get("lurl");
-                if ((lurl != null) && (lurl.length() != 0)) {
-                    String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
-                    plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(
-                        urlPool.loadedURL.newEntry(propStr, true),
-                        yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
-                    urlPool.noticeURL.remove(entry.hash());
-                    log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
-                    return true;
-                } else {
-                    log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")");
+        try {
+            HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()));
+
+            // check success
+            /*
+             * the result of the 'response' value can have one of the following
+             * values: negative cases, no retry denied - the peer does not want
+             * to crawl that exception - an exception occurred
+             *
+             * negative case, retry possible rejected - the peer has rejected to
+             * process, but a re-try should be possible
+             *
+             * positive case with crawling stacked - the resource is processed
+             * asap
+             *
+             * positive case without crawling double - the resource is already
+             * in database, believed to be fresh and not reloaded the resource
+             * is also returned in lurl
+             */
+            if ((page == null) || (page.get("delay") == null)) {
+                log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + ")");
+                if (remoteSeed != null)
+                    yacyCore.peerActions.peerDeparture(remoteSeed);
+                return false;
+            } else
+                try {
+                    log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG
+
+                    int newdelay = Integer.parseInt((String) page.get("delay"));
+                    yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
+                    String response = (String) page.get("response");
+                    if (response.equals("stacked")) {
+                        log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay);
+                        return true;
+                    } else if (response.equals("double")) {
+                        String lurl = (String) page.get("lurl");
+                        if ((lurl != null) && (lurl.length() != 0)) {
+                            String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
+                            plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(urlPool.loadedURL.newEntry(propStr, true), yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
+                            urlPool.noticeURL.remove(entry.hash());
+                            log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
+                            return true;
+                        } else {
+                            log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")");
+                            return false;
+                        }
+                    } else {
+                        log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString());
+                        return false;
+                    }
+                } catch (Exception e) {
+                    // wrong values
+                    log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e);
                     return false;
                 }
-            } else {
-                log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString());
-                return false;
-            }
-        } catch (Exception e) {
-            // wrong values
-            log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e);
+        } catch (IOException e) {
+            log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerHash(), e);
             return false;
         }
     }
@@ -1825,11 +1835,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
     }
 
     // method for index deletion
-    public int removeAllUrlReferences(URL url, boolean fetchOnline) {
+    public int removeAllUrlReferences(URL url, boolean fetchOnline) throws IOException {
         return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline);
     }
 
-    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) {
+    public int removeAllUrlReferences(String urlhash, boolean fetchOnline) throws IOException {
         // find all the words in a specific resource and remove the url reference from every word index
         // finally, delete the url entry
diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
index 17499bac4..ff8b5cda8 100644
--- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java
+++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java
@@ -273,7 +273,7 @@ public class plasmaSwitchboardQueue {
             return responseHeader;
         }
 
-        public URL referrerURL() {
+        public URL referrerURL() throws IOException {
             if (referrerURL == null) {
                 if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null;
                 referrerURL = lurls.getEntry(referrerHash).url();
diff --git a/source/de/anomic/plasma/plasmaURLPool.java b/source/de/anomic/plasma/plasmaURLPool.java
index 3a81553cf..3b31888b5 100644
--- a/source/de/anomic/plasma/plasmaURLPool.java
+++ b/source/de/anomic/plasma/plasmaURLPool.java
@@ -71,7 +71,7 @@ public class plasmaURLPool {
         return null;
     }
 
-    public URL getURL(String urlhash) {
+    public URL getURL(String urlhash) throws IOException {
         if (urlhash.equals(plasmaURL.dummyHash)) return null;
         plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
         if (ne != null) return ne.url();
diff --git a/source/yacy.java b/source/yacy.java
index 9dfaa8654..62147f23f 100644
--- a/source/yacy.java
+++ b/source/yacy.java
@@ -1180,70 +1180,67 @@ public final class yacy {
     private static void urldbcleanup(String homePath) {
         File root = new File(homePath);
         File dbroot = new File(root, "DATA/PLASMADB");
+        HashSet damagedURLS = new HashSet();
         try {
             plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304);
             Iterator eiter = currentUrlDB.entries(true, false);
-            int iteratorCount=0;
-            while (eiter.hasNext()) {
+            int iteratorCount = 0;
+            while (eiter.hasNext()) try {
                 eiter.next();
                 iteratorCount++;
+            } catch (RuntimeException e) {
+                String m = e.getMessage();
+                damagedURLS.add(m.substring(m.length() - 12));
             }
-            try { Thread.sleep(1000); } catch (InterruptedException e) {}
-            System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size());
-            synchronized(plasmaCrawlLURL.damagedURLS)
-            {
-                Iterator eiter2 = plasmaCrawlLURL.damagedURLS.iterator();
-                String urlHash;
-                while (eiter2.hasNext()) {
-                    urlHash = (String) eiter2.next();
-
-                    // trying to fix the invalid URL
-                    httpc theHttpc = null;
-                    String oldUrlStr = null;
-                    try {
-                        // getting the url data as byte array
-                        byte[][] entry = currentUrlDB.urlHashCache.get(urlHash.getBytes());
-
-                        // getting the wrong url string
-                        oldUrlStr = new String(entry[1]).trim();
-
-                        int pos = -1;
-                        if ((pos = oldUrlStr.indexOf("://"))!= -1) {
-                            // trying to correct the url
-                            String newUrlStr = "http://" + oldUrlStr.substring(pos+3);
-                            URL newUrl = new URL(newUrlStr);
-
-                            // doing a http head request to test if the url is correct
-                            theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getPort(), 30000, false);
-                            response res = theHttpc.HEAD(newUrl.getPath(), null);
-
-                            if (res.statusCode == 200) {
-                                entry[1] = newUrl.toString().getBytes();
-                                currentUrlDB.urlHashCache.put(entry);
-                                System.out.println("UrlDB-Entry with urlHash '" + urlHash +
-                                    "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
-                            } else {
-                                currentUrlDB.remove(urlHash);
-                                System.out.println("UrlDB-Entry with urlHash '" + urlHash +
-                                    "' removed\n\tURL: " + oldUrlStr +
-                                    "\n\tConnection Status: " + res.status);
-                            }
-                        }
-                    } catch (Exception e) {
-                        currentUrlDB.remove(urlHash);
-                        System.out.println("UrlDB-Entry with urlHash '" + urlHash +
-                            "' removed\n\tURL: " + oldUrlStr +
-                            "\n\tExecption: " + e.getMessage());
-                    } finally {
-                        if (theHttpc != null) try {
-                            theHttpc.close();
-                            httpc.returnInstance(theHttpc);
-                        } catch (Exception e) {}
-                    }
-                }
-            }
-            plasmaCrawlLURL.damagedURLS.clear();
-            System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size());
+            try { Thread.sleep(1000); } catch (InterruptedException e) { }
+            System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
+
+            Iterator eiter2 = damagedURLS.iterator();
+            String urlHash;
+            while (eiter2.hasNext()) {
+                urlHash = (String) eiter2.next();
+
+                // trying to fix the invalid URL
+                httpc theHttpc = null;
+                String oldUrlStr = null;
+                try {
+                    // getting the url data as byte array
+                    byte[][] entry = currentUrlDB.urlHashCache.get(urlHash.getBytes());
+
+                    // getting the wrong url string
+                    oldUrlStr = new String(entry[1]).trim();
+
+                    int pos = -1;
+                    if ((pos = oldUrlStr.indexOf("://")) != -1) {
+                        // trying to correct the url
+                        String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
+                        URL newUrl = new URL(newUrlStr);
+
+                        // doing a http head request to test if the url is correct
+                        theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getPort(), 30000, false);
+                        response res = theHttpc.HEAD(newUrl.getPath(), null);
+
+                        if (res.statusCode == 200) {
+                            entry[1] = newUrl.toString().getBytes();
+                            currentUrlDB.urlHashCache.put(entry);
+                            System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
+                        } else {
+                            currentUrlDB.remove(urlHash);
+                            System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + res.status);
+                        }
+                    }
+                } catch (Exception e) {
+                    currentUrlDB.remove(urlHash);
+                    System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tException: " + e.getMessage());
+                } finally {
+                    if (theHttpc != null) try {
+                        theHttpc.close();
+                        httpc.returnInstance(theHttpc);
+                    } catch (Exception e) { }
+                }
+            }
+
+            System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + damagedURLS.size());
             currentUrlDB.close();
         } catch (IOException e) {
             e.printStackTrace();
@@ -1251,16 +1248,23 @@ public final class yacy {
     }
 
     /**
-     * Main-method which is started by java. Checks for special arguments or
-     * starts up the application.
-     *
-     * @param args Given arguments from the command line.
-     */
+     * Main-method which is started by java. Checks for special arguments or
+     * starts up the application.
+     *
+     * @param args
+     *            Given arguments from the command line.
+     */
     public static void main(String args[]) {
 
         // check memory amount
         System.gc();
-        long startupMemFree = Runtime.getRuntime().freeMemory(); // the amount of free memory in the Java Virtual Machine
+        long startupMemFree = Runtime.getRuntime().freeMemory(); // the
+        // amount of
+        // free
+        // memory in
+        // the Java
+        // Virtual
+        // Machine
         long startupMemTotal = Runtime.getRuntime().totalMemory(); // the total amount of memory in the Java virtual machine; may vary over time
 
         // go into headless awt mode
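
A minimal sketch of the calling convention this patch introduces (names taken
from the call sites above; the surrounding setup is illustrative only, not
part of the patch):

    // Before: getEntry() could return null, so every call site needed a null
    // check and could still crash on a damaged record.
    // After: getEntry() throws IOException when the hash cannot be resolved,
    // so call sites branch with try/catch instead of null checks.
    try {
        plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
        URL url = entry.url(); // no null check needed on this path
        // ... work with the entry ...
    } catch (IOException e) {
        // hash unknown or stored record damaged: handle as "no entry"
    }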
URL String" + htmlFilterContentScraper.urlNormalform(url) + "
Description" + entry.descr() + "
Modified-Date" + entry.moddate() + "
Loaded-Date" + entry.loaddate() + "
Referrer" + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + "
Referrer" + referrer + "
Doctype" + entry.doctype() + "
Copy-Count" + entry.copyCount() + "
Local-Flag" + entry.local() + "