- changed handling of error cases when retrieving URLs from the database

(NULL values are no longer returned; instead, an IOException is thrown)
- removed ugly damagedURLS implementation from plasmaCrawlLURL.java
  (this inserted a static value into the object, which is not really good style)
- re-coded the damagedURLS collection in yacy.java by catching an exception and evaluating the exception message
to do:
- the urldbcleanup feature must be re-tested


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1200 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent fed92d364b
commit bb79fb5d91

@ -47,7 +47,6 @@
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet; import java.util.HashSet;
@ -167,9 +166,9 @@ public class IndexControl_p {
} }
} }
if (delurlref) { if (delurlref) {
for (int i = 0; i < urlx.length; i++) { for (int i = 0; i < urlx.length; i++) try {
switchboard.removeAllUrlReferences(urlx[i], true); switchboard.removeAllUrlReferences(urlx[i], true);
} } catch (IOException e) {}
} }
if (delurl || delurlref) { if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) { for (int i = 0; i < urlx.length; i++) {
@ -189,9 +188,9 @@ public class IndexControl_p {
// delete selected URLs // delete selected URLs
if (post.containsKey("keyhashdelete")) { if (post.containsKey("keyhashdelete")) {
if (delurlref) { if (delurlref) {
for (int i = 0; i < urlx.length; i++) { for (int i = 0; i < urlx.length; i++) try {
switchboard.removeAllUrlReferences(urlx[i], true); switchboard.removeAllUrlReferences(urlx[i], true);
} } catch (IOException e) {}
} }
if (delurl || delurlref) { if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) { for (int i = 0; i < urlx.length; i++) {
@ -212,20 +211,24 @@ public class IndexControl_p {
} }
if (post.containsKey("urlhashdeleteall")) { if (post.containsKey("urlhashdeleteall")) {
try {
int i = switchboard.removeAllUrlReferences(urlhash, true); int i = switchboard.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes."); prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
} catch (IOException e) {
prop.put("result", "Deleted nothing because the url-hash could not be resolved");
}
} }
if (post.containsKey("urlhashdelete")) { if (post.containsKey("urlhashdelete")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url(); URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
urlstring = htmlFilterContentScraper.urlNormalform(url); urlstring = htmlFilterContentScraper.urlNormalform(url);
prop.put("urlstring", ""); prop.put("urlstring", "");
switchboard.urlPool.loadedURL.remove(urlhash); switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring); prop.put("result", "Removed URL " + urlstring);
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} }
} }
@ -267,16 +270,16 @@ public class IndexControl_p {
plasmaCrawlLURL.Entry lurl; plasmaCrawlLURL.Entry lurl;
while (urlIter.hasNext()) { while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next(); indexEntry = (plasmaWordIndexEntry) urlIter.next();
try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash()); lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if (lurl == null) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
if (lurl.toString() == null) { if (lurl.toString() == null) {
switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash()); switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash()); unknownURLEntries.add(indexEntry.getUrlHash());
} else { } else {
knownURLs.put(indexEntry.getUrlHash(), lurl); knownURLs.put(indexEntry.getUrlHash(), lurl);
} }
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
} }
} }
// now delete all entries that have no url entry // now delete all entries that have no url entry
@ -327,21 +330,21 @@ public class IndexControl_p {
prop.put("urlhash", urlhash); prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
prop.put("result", genUrlProfile(switchboard, entry, urlhash)); prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (MalformedURLException e) { } catch (Exception e) {
prop.put("urlstring", "wrong url: " + urlstring); prop.put("urlstring", "wrong url: " + urlstring);
prop.put("urlhash", ""); prop.put("urlhash", "");
} }
} }
if (post.containsKey("urlhashsearch")) { if (post.containsKey("urlhashsearch")) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash); plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url(); URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
urlstring = url.toString(); urlstring = url.toString();
prop.put("urlstring", urlstring); prop.put("urlstring", urlstring);
prop.put("result", genUrlProfile(switchboard, entry, urlhash)); prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash);
} }
} }
@ -391,6 +394,12 @@ public class IndexControl_p {
public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) { public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) {
if (entry == null) { return "No entry found for URL-hash " + urlhash; } if (entry == null) { return "No entry found for URL-hash " + urlhash; }
URL url = entry.url(); URL url = entry.url();
String referrer = null;
try {
referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url().toString();
} catch (IOException e) {
referrer = "<unknown>";
}
if (url == null) { return "No entry found for URL-hash " + urlhash; } if (url == null) { return "No entry found for URL-hash " + urlhash; }
String result = "<table>" + String result = "<table>" +
"<tr><td class=\"small\">URL String</td><td class=\"tt\">" + htmlFilterContentScraper.urlNormalform(url) + "</td></tr>" + "<tr><td class=\"small\">URL String</td><td class=\"tt\">" + htmlFilterContentScraper.urlNormalform(url) + "</td></tr>" +
@ -398,7 +407,7 @@ public class IndexControl_p {
"<tr><td class=\"small\">Description</td><td class=\"tt\">" + entry.descr() + "</td></tr>" + "<tr><td class=\"small\">Description</td><td class=\"tt\">" + entry.descr() + "</td></tr>" +
"<tr><td class=\"small\">Modified-Date</td><td class=\"tt\">" + entry.moddate() + "</td></tr>" + "<tr><td class=\"small\">Modified-Date</td><td class=\"tt\">" + entry.moddate() + "</td></tr>" +
"<tr><td class=\"small\">Loaded-Date</td><td class=\"tt\">" + entry.loaddate() + "</td></tr>" + "<tr><td class=\"small\">Loaded-Date</td><td class=\"tt\">" + entry.loaddate() + "</td></tr>" +
"<tr><td class=\"small\">Referrer</td><td class=\"tt\">" + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + "</td></tr>" + "<tr><td class=\"small\">Referrer</td><td class=\"tt\">" + referrer + "</td></tr>" +
"<tr><td class=\"small\">Doctype</td><td class=\"tt\">" + entry.doctype() + "</td></tr>" + "<tr><td class=\"small\">Doctype</td><td class=\"tt\">" + entry.doctype() + "</td></tr>" +
"<tr><td class=\"small\">Copy-Count</td><td class=\"tt\">" + entry.copyCount() + "</td></tr>" + "<tr><td class=\"small\">Copy-Count</td><td class=\"tt\">" + entry.copyCount() + "</td></tr>" +
"<tr><td class=\"small\">Local-Flag</td><td class=\"tt\">" + entry.local() + "</td></tr>" + "<tr><td class=\"small\">Local-Flag</td><td class=\"tt\">" + entry.local() + "</td></tr>" +

@ -43,6 +43,7 @@
// javac -classpath .:../classes IndexCreate_p.java // javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.io.IOException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
@ -96,9 +97,12 @@ public class IndexCreateWWWLocalQueue_p {
while (iter.hasNext()) { while (iter.hasNext()) {
String value = null; String value = null;
String nextHash = new String((byte[]) iter.next()); String nextHash = new String((byte[]) iter.next());
Entry entry = switchboard.urlPool.noticeURL.getEntry(nextHash); Entry entry = null;
if (entry == null) continue; try {
entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
} catch (IOException e) {
continue;
}
if ((option.equals("URL")&&(entry.url() != null))) { if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString(); value = entry.url().toString();
} else if ((option.equals("AnchorName"))) { } else if ((option.equals("AnchorName"))) {

@ -102,8 +102,10 @@ public class ViewFile {
String viewMode = post.get("viewMode","sentences"); String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash // getting the urlEntry that belongs to the url hash
Entry urlEntry = sb.urlPool.loadedURL.getEntry(urlHash); Entry urlEntry = null;
if (urlEntry == null) { try {
urlEntry = sb.urlPool.loadedURL.getEntry(urlHash);
} catch (IOException e) {
prop.put("error",2); prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT); prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop; return prop;

@ -45,6 +45,7 @@
// You must compile this file with // You must compile this file with
// javac -classpath .:../classes crawlOrder.java // javac -classpath .:../classes crawlOrder.java
import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -243,12 +244,12 @@ public final class crawlOrder {
// case where we have already the url loaded; // case where we have already the url loaded;
reason = reasonString; reason = reasonString;
// send lurl-Entry as response // send lurl-Entry as response
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url)); plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url));
if (entry != null) {
response = "double"; response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare); switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString()); lurl = crypt.simpleEncode(entry.toString());
} else { } catch (IOException e) {
response = "rejected"; response = "rejected";
lurl = ""; lurl = "";
} }

@ -43,6 +43,8 @@
// javac -classpath .:../classes crawlOrder.java // javac -classpath .:../classes crawlOrder.java
import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
@ -135,10 +137,12 @@ public final class crawlReceipt {
// ready for more // ready for more
prop.put("delay", "10"); prop.put("delay", "10");
} else { } else {
try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash); plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
if (en != null) {
switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false); switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false);
switchboard.urlPool.noticeURL.remove(receivedUrlhash); switchboard.urlPool.noticeURL.remove(receivedUrlhash);
} catch (IOException e) {
} }
prop.put("delay", "100"); // what shall we do with that??? prop.put("delay", "100"); // what shall we do with that???
} }

@ -109,7 +109,7 @@ public class plasmaCrawlEURL extends plasmaURL {
return e; return e;
} }
public synchronized Entry getEntry(String hash) { public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash); return new Entry(hash);
} }
@ -157,7 +157,7 @@ public class plasmaCrawlEURL extends plasmaURL {
} }
public Entry(String hash) { public Entry(String hash) throws IOException {
// generates an plasmaEURLEntry using the url hash // generates an plasmaEURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
// in the hash cache. // in the hash cache.
@ -166,7 +166,6 @@ public class plasmaCrawlEURL extends plasmaURL {
// - look into the filed properties // - look into the filed properties
// if the url cannot be found, this returns null // if the url cannot be found, this returns null
this.hash = hash; this.hash = hash;
try {
byte[][] entry = urlHashCache.get(hash.getBytes()); byte[][] entry = urlHashCache.get(hash.getBytes());
if (entry != null) { if (entry != null) {
this.referrer = new String(entry[1]); this.referrer = new String(entry[1]);
@ -181,7 +180,6 @@ public class plasmaCrawlEURL extends plasmaURL {
this.flags = new bitfield(entry[10]); this.flags = new bitfield(entry[10]);
return; return;
} }
} catch (Exception e) {}
} }
private void store() { private void store() {
@ -266,7 +264,11 @@ public class plasmaCrawlEURL extends plasmaURL {
return i.hasNext(); return i.hasNext();
} }
public Object nextElement() { public Object nextElement() {
try {
return new Entry(new String(((byte[][]) i.next())[0])); return new Entry(new String(((byte[][]) i.next())[0]));
} catch (IOException e) {
return null;
}
} }
} }

@ -57,17 +57,13 @@ import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.net.URL; import java.net.URL;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date; import java.util.Date;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Locale; import java.util.Locale;
import java.util.Properties; import java.util.Properties;
import java.util.Set;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -89,7 +85,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
public static Set damagedURLS = Collections.synchronizedSet(new HashSet()); //public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
public plasmaCrawlLURL(File cachePath, int bufferkb) throws IOException { public plasmaCrawlLURL(File cachePath, int bufferkb) throws IOException {
super(); super();
@ -173,7 +169,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash); gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
} }
public synchronized Entry getEntry(String hash) { public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash); return new Entry(hash);
} }
@ -347,9 +343,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = getUrlHash(tabletype, i); urlHash = getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
try {
urle = getEntry(urlHash); urle = getEntry(urlHash);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString()); // serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
if (urle != null) try {
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash); initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash); executorSeed = yacyCore.seedDB.getConnected(executorHash);
@ -457,7 +453,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
store(); store();
} }
public Entry(String urlHash) { public Entry(String urlHash) throws IOException {
// generates an plasmaLURLEntry using the url hash // generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
// in the hash cache. // in the hash cache.
@ -466,8 +462,8 @@ public final class plasmaCrawlLURL extends plasmaURL {
// - look into the filed properties // - look into the filed properties
// if the url cannot be found, this returns null // if the url cannot be found, this returns null
this.urlHash = urlHash; this.urlHash = urlHash;
try {
byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes()); byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
try {
if (entry != null) { if (entry != null) {
this.url = new URL(new String(entry[1]).trim()); this.url = new URL(new String(entry[1]).trim());
this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2]).trim(); this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2]).trim();
@ -484,12 +480,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
this.snippet = null; this.snippet = null;
return; return;
} }
} catch (MalformedURLException e) {
plasmaCrawlLURL.damagedURLS.add(this.urlHash);
System.out.println("DEBUG: Marked damaged Entry for removal (malformedURL). UrlHash: " + this.urlHash);
//serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
} catch (Exception e) { } catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e); serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
throw new IOException("plasmaLURL.entry/1: " + e.toString());
} }
} }
@ -742,14 +735,15 @@ public final class plasmaCrawlLURL extends plasmaURL {
return i.hasNext(); return i.hasNext();
} }
public Object next() { public Object next() throws RuntimeException {
try {
byte[] e = ((byte[][]) i.next())[0]; byte[] e = ((byte[][]) i.next())[0];
if (e == null) return null; else return new Entry(new String(e)); if (e == null) return null;
} catch (kelondroException e) { String hash = null;
e.printStackTrace(); try {
error = true; hash = new String(e);
return null; return new Entry(hash);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + hash);
} }
} }

@ -363,7 +363,7 @@ public class plasmaCrawlNURL extends plasmaURL {
} }
} }
public synchronized Entry getEntry(String hash) { public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash); return new Entry(hash);
} }
@ -431,7 +431,7 @@ public class plasmaCrawlNURL extends plasmaURL {
return str.toString(); return str.toString();
} }
public Entry(String hash) { public Entry(String hash) throws IOException {
// generates an plasmaNURLEntry using the url hash // generates an plasmaNURLEntry using the url hash
// to speed up the access, the url-hashes are buffered // to speed up the access, the url-hashes are buffered
// in the hash cache. // in the hash cache.
@ -440,9 +440,9 @@ public class plasmaCrawlNURL extends plasmaURL {
// - look into the filed properties // - look into the filed properties
// if the url cannot be found, this returns null // if the url cannot be found, this returns null
this.hash = hash; this.hash = hash;
try {
byte[][] entry = urlHashCache.get(hash.getBytes()); byte[][] entry = urlHashCache.get(hash.getBytes());
if (entry != null) { if (entry != null) {
//try {
this.initiator = new String(entry[1]); this.initiator = new String(entry[1]);
this.url = new URL(new String(entry[2]).trim()); this.url = new URL(new String(entry[2]).trim());
this.referrer = (entry[3] == null) ? dummyHash : new String(entry[3]); this.referrer = (entry[3] == null) ? dummyHash : new String(entry[3]);
@ -455,11 +455,13 @@ public class plasmaCrawlNURL extends plasmaURL {
this.flags = new bitfield(entry[10]); this.flags = new bitfield(entry[10]);
this.handle = Integer.parseInt(new String(entry[11])); this.handle = Integer.parseInt(new String(entry[11]));
return; return;
//} catch (MalformedURLException e) {
// throw new IOException("plasmaCrawlNURL/Entry: " + e);
//}
} else { } else {
// show that we found nothing // show that we found nothing
this.url = null; throw new IOException("hash not found");
} //this.url = null;
} catch (Exception e) {
} }
} }

@ -1230,8 +1230,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create index // create index
String descr = document.getMainLongTitle(); String descr = document.getMainLongTitle();
String referrerHash;
try {
URL referrerURL = entry.referrerURL(); URL referrerURL = entry.referrerURL();
String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL); referrerHash = plasmaURL.urlHash(referrerURL);
} catch (IOException e) {
referrerHash = plasmaURL.dummyHash;
}
String noIndexReason = "unspecified"; String noIndexReason = "unspecified";
if (processCase == 4) { if (processCase == 4) {
// proxy-load // proxy-load
@ -1480,8 +1485,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL refererURL = null; URL refererURL = null;
String refererHash = urlEntry.referrerHash(); String refererHash = urlEntry.referrerHash();
if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) { if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try {
refererURL = this.urlPool.getURL(refererHash); refererURL = this.urlPool.getURL(refererHash);
} catch (IOException e) {
refererURL = null;
} }
cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile); cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]"); log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]");
@ -1519,31 +1526,32 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
// do the request // do the request
try {
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash())); HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()));
// check success // check success
/* /*
the result of the 'response' value can have one of the following values: * the result of the 'response' value can have one of the following
negative cases, no retry * values: negative cases, no retry denied - the peer does not want
denied - the peer does not want to crawl that * to crawl that exception - an exception occurred
exception - an exception occurred *
* negative case, retry possible rejected - the peer has rejected to
negative case, retry possible * process, but a re-try should be possible
rejected - the peer has rejected to process, but a re-try should be possible *
* positive case with crawling stacked - the resource is processed
positive case with crawling * asap
stacked - the resource is processed asap *
* positive case without crawling double - the resource is already
positive case without crawling * in database, believed to be fresh and not reloaded the resource
double - the resource is already in database, believed to be fresh and not reloaded * is also returned in lurl
the resource is also returned in lurl
*/ */
if ((page == null) || (page.get("delay") == null)) { if ((page == null) || (page.get("delay") == null)) {
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + ")"); log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + ")");
if (remoteSeed != null) yacyCore.peerActions.peerDeparture(remoteSeed); if (remoteSeed != null)
yacyCore.peerActions.peerDeparture(remoteSeed);
return false; return false;
} else try { } else
try {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG
int newdelay = Integer.parseInt((String) page.get("delay")); int newdelay = Integer.parseInt((String) page.get("delay"));
@ -1556,9 +1564,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String lurl = (String) page.get("lurl"); String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) { if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry( plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(urlPool.loadedURL.newEntry(propStr, true), yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
urlPool.loadedURL.newEntry(propStr, true),
yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
urlPool.noticeURL.remove(entry.hash()); urlPool.noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'"); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true; return true;
@ -1575,6 +1581,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e); log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e);
return false; return false;
} }
} catch (IOException e) {
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerHash(), e);
return false;
}
} }
private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy"); private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");
@ -1825,11 +1835,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
// method for index deletion // method for index deletion
public int removeAllUrlReferences(URL url, boolean fetchOnline) { public int removeAllUrlReferences(URL url, boolean fetchOnline) throws IOException {
return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline); return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline);
} }
public int removeAllUrlReferences(String urlhash, boolean fetchOnline) { public int removeAllUrlReferences(String urlhash, boolean fetchOnline) throws IOException {
// find all the words in a specific resource and remove the url reference from every word index // find all the words in a specific resource and remove the url reference from every word index
// finally, delete the url entry // finally, delete the url entry

@ -273,7 +273,7 @@ public class plasmaSwitchboardQueue {
return responseHeader; return responseHeader;
} }
public URL referrerURL() { public URL referrerURL() throws IOException {
if (referrerURL == null) { if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null; if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null;
referrerURL = lurls.getEntry(referrerHash).url(); referrerURL = lurls.getEntry(referrerHash).url();

@ -71,7 +71,7 @@ public class plasmaURLPool {
return null; return null;
} }
public URL getURL(String urlhash) { public URL getURL(String urlhash) throws IOException {
if (urlhash.equals(plasmaURL.dummyHash)) return null; if (urlhash.equals(plasmaURL.dummyHash)) return null;
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url(); if (ne != null) return ne.url();

@ -1180,19 +1180,22 @@ public final class yacy {
private static void urldbcleanup(String homePath) { private static void urldbcleanup(String homePath) {
File root = new File(homePath); File root = new File(homePath);
File dbroot = new File(root, "DATA/PLASMADB"); File dbroot = new File(root, "DATA/PLASMADB");
HashSet damagedURLS = new HashSet();
try { try {
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304); plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304);
Iterator eiter = currentUrlDB.entries(true, false); Iterator eiter = currentUrlDB.entries(true, false);
int iteratorCount = 0; int iteratorCount = 0;
while (eiter.hasNext()) { while (eiter.hasNext()) try {
eiter.next(); eiter.next();
iteratorCount++; iteratorCount++;
} catch (RuntimeException e) {
String m = e.getMessage();
damagedURLS.add(m.substring(m.length() - 12));
} }
try { Thread.sleep(1000); } catch (InterruptedException e) { } try { Thread.sleep(1000); } catch (InterruptedException e) { }
System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size()); System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
synchronized(plasmaCrawlLURL.damagedURLS)
{ Iterator eiter2 = damagedURLS.iterator();
Iterator eiter2 = plasmaCrawlLURL.damagedURLS.iterator();
String urlHash; String urlHash;
while (eiter2.hasNext()) { while (eiter2.hasNext()) {
urlHash = (String) eiter2.next(); urlHash = (String) eiter2.next();
@ -1220,20 +1223,15 @@ public final class yacy {
if (res.statusCode == 200) { if (res.statusCode == 200) {
entry[1] = newUrl.toString().getBytes(); entry[1] = newUrl.toString().getBytes();
currentUrlDB.urlHashCache.put(entry); currentUrlDB.urlHashCache.put(entry);
System.out.println("UrlDB-Entry with urlHash '" + urlHash + System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
"' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else { } else {
currentUrlDB.remove(urlHash); currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash + System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + res.status);
"' removed\n\tURL: " + oldUrlStr +
"\n\tConnection Status: " + res.status);
} }
} }
} catch (Exception e) { } catch (Exception e) {
currentUrlDB.remove(urlHash); currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash + System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
"' removed\n\tURL: " + oldUrlStr +
"\n\tExecption: " + e.getMessage());
} finally { } finally {
if (theHttpc != null) try { if (theHttpc != null) try {
theHttpc.close(); theHttpc.close();
@ -1241,9 +1239,8 @@ public final class yacy {
} catch (Exception e) { } } catch (Exception e) { }
} }
} }
}
plasmaCrawlLURL.damagedURLS.clear(); System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + damagedURLS.size());
System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size());
currentUrlDB.close(); currentUrlDB.close();
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
@ -1254,13 +1251,20 @@ public final class yacy {
* Main-method which is started by java. Checks for special arguments or * Main-method which is started by java. Checks for special arguments or
* starts up the application. * starts up the application.
* *
* @param args Given arguments from the command line. * @param args
* Given arguments from the command line.
*/ */
public static void main(String args[]) { public static void main(String args[]) {
// check memory amount // check memory amount
System.gc(); System.gc();
long startupMemFree = Runtime.getRuntime().freeMemory(); // the amount of free memory in the Java Virtual Machine long startupMemFree = Runtime.getRuntime().freeMemory(); // the
// amount of
// free
// memory in
// the Java
// Virtual
// Machine
long startupMemTotal = Runtime.getRuntime().totalMemory(); // the total amount of memory in the Java virtual machine; may vary over time long startupMemTotal = Runtime.getRuntime().totalMemory(); // the total amount of memory in the Java virtual machine; may vary over time
// go into headless awt mode // go into headless awt mode

Loading…
Cancel
Save