- changed handling of error cases retrieving urls from database

(NULL values are no longer returned; instead, an IOException is thrown)
- removed ugly damagedURLS implementation from plasmaCrawlLURL.java
  (this inserted a static value into the object, which is not really good style)
- re-coded the damagedURLS collection in yacy.java by catching an exception and evaluating the exception message
to do:
- the urldbcleanup feature must be re-tested


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1200 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent fed92d364b
commit bb79fb5d91

@ -47,7 +47,6 @@
// if the shell's current path is HTROOT
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Enumeration;
import java.util.HashSet;
@ -167,9 +166,9 @@ public class IndexControl_p {
}
}
if (delurlref) {
for (int i = 0; i < urlx.length; i++) {
for (int i = 0; i < urlx.length; i++) try {
switchboard.removeAllUrlReferences(urlx[i], true);
}
} catch (IOException e) {}
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
@ -189,9 +188,9 @@ public class IndexControl_p {
// delete selected URLs
if (post.containsKey("keyhashdelete")) {
if (delurlref) {
for (int i = 0; i < urlx.length; i++) {
for (int i = 0; i < urlx.length; i++) try {
switchboard.removeAllUrlReferences(urlx[i], true);
}
} catch (IOException e) {}
}
if (delurl || delurlref) {
for (int i = 0; i < urlx.length; i++) {
@ -212,20 +211,24 @@ public class IndexControl_p {
}
if (post.containsKey("urlhashdeleteall")) {
int i = switchboard.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
try {
int i = switchboard.removeAllUrlReferences(urlhash, true);
prop.put("result", "Deleted URL and " + i + " references from " + i + " word indexes.");
} catch (IOException e) {
prop.put("result", "Deleted nothing because the url-hash could not be resolved");
}
}
if (post.containsKey("urlhashdelete")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
urlstring = htmlFilterContentScraper.urlNormalform(url);
prop.put("urlstring", "");
switchboard.urlPool.loadedURL.remove(urlhash);
prop.put("result", "Removed URL " + urlstring);
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
}
}
@ -267,16 +270,16 @@ public class IndexControl_p {
plasmaCrawlLURL.Entry lurl;
while (urlIter.hasNext()) {
indexEntry = (plasmaWordIndexEntry) urlIter.next();
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if (lurl == null) {
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
try {
lurl = switchboard.urlPool.loadedURL.getEntry(indexEntry.getUrlHash());
if (lurl.toString() == null) {
switchboard.urlPool.loadedURL.remove(indexEntry.getUrlHash());
unknownURLEntries.add(indexEntry.getUrlHash());
} else {
knownURLs.put(indexEntry.getUrlHash(), lurl);
}
} catch (IOException e) {
unknownURLEntries.add(indexEntry.getUrlHash());
}
}
// now delete all entries that have no url entry
@ -327,21 +330,21 @@ public class IndexControl_p {
prop.put("urlhash", urlhash);
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (MalformedURLException e) {
} catch (Exception e) {
prop.put("urlstring", "wrong url: " + urlstring);
prop.put("urlhash", "");
}
}
if (post.containsKey("urlhashsearch")) {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
if (url == null) {
prop.put("result", "No Entry for URL hash " + urlhash);
} else {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(urlhash);
URL url = entry.url();
urlstring = url.toString();
prop.put("urlstring", urlstring);
prop.put("result", genUrlProfile(switchboard, entry, urlhash));
} catch (IOException e) {
prop.put("result", "No Entry for URL hash " + urlhash);
}
}
@ -391,6 +394,12 @@ public class IndexControl_p {
public static String genUrlProfile(plasmaSwitchboard switchboard, plasmaCrawlLURL.Entry entry, String urlhash) {
if (entry == null) { return "No entry found for URL-hash " + urlhash; }
URL url = entry.url();
String referrer = null;
try {
referrer = switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url().toString();
} catch (IOException e) {
referrer = "<unknown>";
}
if (url == null) { return "No entry found for URL-hash " + urlhash; }
String result = "<table>" +
"<tr><td class=\"small\">URL String</td><td class=\"tt\">" + htmlFilterContentScraper.urlNormalform(url) + "</td></tr>" +
@ -398,7 +407,7 @@ public class IndexControl_p {
"<tr><td class=\"small\">Description</td><td class=\"tt\">" + entry.descr() + "</td></tr>" +
"<tr><td class=\"small\">Modified-Date</td><td class=\"tt\">" + entry.moddate() + "</td></tr>" +
"<tr><td class=\"small\">Loaded-Date</td><td class=\"tt\">" + entry.loaddate() + "</td></tr>" +
"<tr><td class=\"small\">Referrer</td><td class=\"tt\">" + switchboard.urlPool.loadedURL.getEntry(entry.referrerHash()).url() + "</td></tr>" +
"<tr><td class=\"small\">Referrer</td><td class=\"tt\">" + referrer + "</td></tr>" +
"<tr><td class=\"small\">Doctype</td><td class=\"tt\">" + entry.doctype() + "</td></tr>" +
"<tr><td class=\"small\">Copy-Count</td><td class=\"tt\">" + entry.copyCount() + "</td></tr>" +
"<tr><td class=\"small\">Local-Flag</td><td class=\"tt\">" + entry.local() + "</td></tr>" +

@ -43,6 +43,7 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
@ -96,9 +97,12 @@ public class IndexCreateWWWLocalQueue_p {
while (iter.hasNext()) {
String value = null;
String nextHash = new String((byte[]) iter.next());
Entry entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
if (entry == null) continue;
Entry entry = null;
try {
entry = switchboard.urlPool.noticeURL.getEntry(nextHash);
} catch (IOException e) {
continue;
}
if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString();
} else if ((option.equals("AnchorName"))) {

@ -102,8 +102,10 @@ public class ViewFile {
String viewMode = post.get("viewMode","sentences");
// getting the urlEntry that belongs to the url hash
Entry urlEntry = sb.urlPool.loadedURL.getEntry(urlHash);
if (urlEntry == null) {
Entry urlEntry = null;
try {
urlEntry = sb.urlPool.loadedURL.getEntry(urlHash);
} catch (IOException e) {
prop.put("error",2);
prop.put("viewMode",VIEW_MODE_NO_TEXT);
return prop;

@ -45,6 +45,7 @@
// You must compile this file with
// javac -classpath .:../classes crawlOrder.java
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.htmlFilter.htmlFilterContentScraper;
@ -243,12 +244,12 @@ public final class crawlOrder {
// case where we have already the url loaded;
reason = reasonString;
// send lurl-Entry as response
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url));
if (entry != null) {
try {
plasmaCrawlLURL.Entry entry = switchboard.urlPool.loadedURL.getEntry(plasmaURL.urlHash(url));
response = "double";
switchboard.urlPool.loadedURL.notifyGCrawl(entry.hash(), iam, youare);
lurl = crypt.simpleEncode(entry.toString());
} else {
} catch (IOException e) {
response = "rejected";
lurl = "";
}

@ -43,6 +43,8 @@
// javac -classpath .:../classes crawlOrder.java
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlLURL;
@ -135,10 +137,12 @@ public final class crawlReceipt {
// ready for more
prop.put("delay", "10");
} else {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
if (en != null) {
try {
plasmaCrawlNURL.Entry en = switchboard.urlPool.noticeURL.getEntry(receivedUrlhash);
switchboard.urlPool.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new bitfield(plasmaURL.urlFlagLength), false);
switchboard.urlPool.noticeURL.remove(receivedUrlhash);
} catch (IOException e) {
}
prop.put("delay", "100"); // what shall we do with that???
}

@ -109,7 +109,7 @@ public class plasmaCrawlEURL extends plasmaURL {
return e;
}
public synchronized Entry getEntry(String hash) {
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
}
@ -157,32 +157,30 @@ public class plasmaCrawlEURL extends plasmaURL {
}
public Entry(String hash) {
// generates an plasmaEURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
// we have two options to find the url:
// - look into the hash cache
// - look into the filed properties
// if the url cannot be found, this returns null
this.hash = hash;
try {
byte[][] entry = urlHashCache.get(hash.getBytes());
if (entry != null) {
this.referrer = new String(entry[1]);
this.initiator = new String(entry[2]);
this.executor = new String(entry[3]);
this.url = new URL(new String(entry[4]).trim());
this.name = new String(entry[5]).trim();
this.initdate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6])));
this.trydate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7])));
this.trycount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
this.failreason = new String(entry[9]);
this.flags = new bitfield(entry[10]);
return;
}
} catch (Exception e) {}
}
public Entry(String hash) throws IOException {
// generates an plasmaEURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
// we have two options to find the url:
// - look into the hash cache
// - look into the filed properties
// if the url cannot be found, this returns null
this.hash = hash;
byte[][] entry = urlHashCache.get(hash.getBytes());
if (entry != null) {
this.referrer = new String(entry[1]);
this.initiator = new String(entry[2]);
this.executor = new String(entry[3]);
this.url = new URL(new String(entry[4]).trim());
this.name = new String(entry[5]).trim();
this.initdate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6])));
this.trydate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7])));
this.trycount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
this.failreason = new String(entry[9]);
this.flags = new bitfield(entry[10]);
return;
}
}
private void store() {
// stores the values from the object variables into the database
@ -257,16 +255,20 @@ public class plasmaCrawlEURL extends plasmaURL {
}
public class kenum implements Enumeration {
// enumerates entry elements
Iterator i;
public kenum(boolean up, boolean rotating) throws IOException {
// enumerates entry elements
Iterator i;
public kenum(boolean up, boolean rotating) throws IOException {
i = urlHashCache.rows(up, rotating);
}
public boolean hasMoreElements() {
public boolean hasMoreElements() {
return i.hasNext();
}
public Object nextElement() {
return new Entry(new String(((byte[][]) i.next())[0]));
public Object nextElement() {
try {
return new Entry(new String(((byte[][]) i.next())[0]));
} catch (IOException e) {
return null;
}
}
}

@ -57,17 +57,13 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Collections;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Properties;
import java.util.Set;
import de.anomic.http.httpc;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroTree;
import de.anomic.server.serverCodings;
import de.anomic.server.serverObjects;
@ -89,7 +85,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
private final LinkedList lcrawlResultStack; // 5 - local index: result of local crawling
private final LinkedList gcrawlResultStack; // 6 - local index: triggered external
public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
//public static Set damagedURLS = Collections.synchronizedSet(new HashSet());
public plasmaCrawlLURL(File cachePath, int bufferkb) throws IOException {
super();
@ -173,7 +169,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
gcrawlResultStack.add(urlHash + initiatorHash + executorHash);
}
public synchronized Entry getEntry(String hash) {
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
}
@ -347,9 +343,9 @@ public final class plasmaCrawlLURL extends plasmaURL {
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps initiatorHash=" + initiatorHash + " executorHash=" + executorHash);
urlHash = getUrlHash(tabletype, i);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urlHash=" + urlHash);
urle = getEntry(urlHash);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
if (urle != null) try {
try {
urle = getEntry(urlHash);
// serverLog.logFinest("PLASMA", "plasmaCrawlLURL/genTableProps urle=" + urle.toString());
initiatorSeed = yacyCore.seedDB.getConnected(initiatorHash);
executorSeed = yacyCore.seedDB.getConnected(executorHash);
@ -457,41 +453,38 @@ public final class plasmaCrawlLURL extends plasmaURL {
store();
}
public Entry(String urlHash) {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
// we have two options to find the url:
// - look into the hash cache
// - look into the filed properties
// if the url cannot be found, this returns null
this.urlHash = urlHash;
try {
public Entry(String urlHash) throws IOException {
// generates an plasmaLURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
// we have two options to find the url:
// - look into the hash cache
// - look into the filed properties
// if the url cannot be found, this returns null
this.urlHash = urlHash;
byte[][] entry = plasmaCrawlLURL.this.urlHashCache.get(urlHash.getBytes());
if (entry != null) {
this.url = new URL(new String(entry[1]).trim());
this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2]).trim();
this.moddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[3])));
this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[4])));
this.referrerHash = (entry[5]==null)?dummyHash:new String(entry[5]);
this.copyCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6]));
this.flags = new String(entry[7]);
this.quality = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
this.language = new String(entry[9]);
this.doctype = (char) entry[10][0];
this.size = serverCodings.enhancedCoder.decodeBase64Long(new String(entry[11]));
this.wordCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[12]));
this.snippet = null;
return;
try {
if (entry != null) {
this.url = new URL(new String(entry[1]).trim());
this.descr = (entry[2] == null) ? this.url.toString() : new String(entry[2]).trim();
this.moddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[3])));
this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[4])));
this.referrerHash = (entry[5] == null) ? dummyHash : new String(entry[5]);
this.copyCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[6]));
this.flags = new String(entry[7]);
this.quality = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
this.language = new String(entry[9]);
this.doctype = (char) entry[10][0];
this.size = serverCodings.enhancedCoder.decodeBase64Long(new String(entry[11]));
this.wordCount = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[12]));
this.snippet = null;
return;
}
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
throw new IOException("plasmaLURL.entry/1: " + e.toString());
}
} catch (MalformedURLException e) {
plasmaCrawlLURL.damagedURLS.add(this.urlHash);
System.out.println("DEBUG: Marked damaged Entry for removal (malformedURL). UrlHash: " + this.urlHash);
//serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/1: " + e.toString(), e);
}
}
public Entry(Properties prop, boolean setGlobal) {
// generates an plasmaLURLEntry using the properties from the argument
@ -742,14 +735,15 @@ public final class plasmaCrawlLURL extends plasmaURL {
return i.hasNext();
}
public Object next() {
public Object next() throws RuntimeException {
byte[] e = ((byte[][]) i.next())[0];
if (e == null) return null;
String hash = null;
try {
byte[] e = ((byte[][])i.next())[0];
if (e == null) return null; else return new Entry(new String(e));
} catch (kelondroException e) {
e.printStackTrace();
error = true;
return null;
hash = new String(e);
return new Entry(hash);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + hash);
}
}

@ -363,7 +363,7 @@ public class plasmaCrawlNURL extends plasmaURL {
}
}
public synchronized Entry getEntry(String hash) {
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
}
@ -431,7 +431,7 @@ public class plasmaCrawlNURL extends plasmaURL {
return str.toString();
}
public Entry(String hash) {
public Entry(String hash) throws IOException {
// generates an plasmaNURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
@ -440,26 +440,28 @@ public class plasmaCrawlNURL extends plasmaURL {
// - look into the filed properties
// if the url cannot be found, this returns null
this.hash = hash;
try {
byte[][] entry = urlHashCache.get(hash.getBytes());
if (entry != null) {
this.initiator = new String(entry[1]);
this.url = new URL(new String(entry[2]).trim());
this.referrer = (entry[3]==null) ? dummyHash : new String(entry[3]);
this.name = (entry[4] == null) ? "" : new String(entry[4]).trim();
this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[5])));
byte[][] entry = urlHashCache.get(hash.getBytes());
if (entry != null) {
//try {
this.initiator = new String(entry[1]);
this.url = new URL(new String(entry[2]).trim());
this.referrer = (entry[3] == null) ? dummyHash : new String(entry[3]);
this.name = (entry[4] == null) ? "" : new String(entry[4]).trim();
this.loaddate = new Date(86400000 * serverCodings.enhancedCoder.decodeBase64Long(new String(entry[5])));
this.profileHandle = (entry[6] == null) ? null : new String(entry[6]).trim();
this.depth = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7]));
this.anchors = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
this.forkfactor = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[9]));
this.flags = new bitfield(entry[10]);
this.handle = Integer.parseInt(new String(entry[11]));
this.depth = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[7]));
this.anchors = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[8]));
this.forkfactor = (int) serverCodings.enhancedCoder.decodeBase64Long(new String(entry[9]));
this.flags = new bitfield(entry[10]);
this.handle = Integer.parseInt(new String(entry[11]));
return;
} else {
// show that we found nothing
this.url = null;
}
} catch (Exception e) {
//} catch (MalformedURLException e) {
// throw new IOException("plasmaCrawlNURL/Entry: " + e);
//}
} else {
// show that we found nothing
throw new IOException("hash not found");
//this.url = null;
}
}

@ -1230,8 +1230,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// create index
String descr = document.getMainLongTitle();
URL referrerURL = entry.referrerURL();
String referrerHash = (referrerURL == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerURL);
String referrerHash;
try {
URL referrerURL = entry.referrerURL();
referrerHash = plasmaURL.urlHash(referrerURL);
} catch (IOException e) {
referrerHash = plasmaURL.dummyHash;
}
String noIndexReason = "unspecified";
if (processCase == 4) {
// proxy-load
@ -1480,8 +1485,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
URL refererURL = null;
String refererHash = urlEntry.referrerHash();
if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) {
if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try {
refererURL = this.urlPool.getURL(refererHash);
} catch (IOException e) {
refererURL = null;
}
cacheLoader.loadParallel(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]");
@ -1519,60 +1526,63 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// do the request
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()));
// check success
/*
the result of the 'response' value can have one of the following values:
negative cases, no retry
denied - the peer does not want to crawl that
exception - an exception occurred
negative case, retry possible
rejected - the peer has rejected to process, but a re-try should be possible
positive case with crawling
stacked - the resource is processed asap
positive case without crawling
double - the resource is already in database, believed to be fresh and not reloaded
the resource is also returned in lurl
*/
if ((page == null) || (page.get("delay") == null)) {
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + ")");
if (remoteSeed != null) yacyCore.peerActions.peerDeparture(remoteSeed);
return false;
} else try {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG
int newdelay = Integer.parseInt((String) page.get("delay"));
yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
String response = (String) page.get("response");
if (response.equals("stacked")) {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay);
return true;
} else if (response.equals("double")) {
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(
urlPool.loadedURL.newEntry(propStr, true),
yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
urlPool.noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")");
try {
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), urlPool.getURL(urlEntry.referrerHash()));
// check success
/*
* the result of the 'response' value can have one of the following
* values: negative cases, no retry denied - the peer does not want
* to crawl that exception - an exception occurred
*
* negative case, retry possible rejected - the peer has rejected to
* process, but a re-try should be possible
*
* positive case with crawling stacked - the resource is processed
* asap
*
* positive case without crawling double - the resource is already
* in database, believed to be fresh and not reloaded the resource
* is also returned in lurl
*/
if ((page == null) || (page.get("delay") == null)) {
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + ")");
if (remoteSeed != null)
yacyCore.peerActions.peerDeparture(remoteSeed);
return false;
} else
try {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG
int newdelay = Integer.parseInt((String) page.get("delay"));
yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
String response = (String) page.get("response");
if (response.equals("stacked")) {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay);
return true;
} else if (response.equals("double")) {
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
plasmaCrawlLURL.Entry entry = urlPool.loadedURL.addEntry(urlPool.loadedURL.newEntry(propStr, true), yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1);
urlPool.noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")");
return false;
}
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString());
return false;
}
} catch (Exception e) {
// wrong values
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e);
return false;
}
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString());
return false;
}
} catch (Exception e) {
// wrong values
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e);
} catch (IOException e) {
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerHash(), e);
return false;
}
}
@ -1825,11 +1835,11 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// method for index deletion
public int removeAllUrlReferences(URL url, boolean fetchOnline) {
public int removeAllUrlReferences(URL url, boolean fetchOnline) throws IOException {
return removeAllUrlReferences(plasmaURL.urlHash(url), fetchOnline);
}
public int removeAllUrlReferences(String urlhash, boolean fetchOnline) {
public int removeAllUrlReferences(String urlhash, boolean fetchOnline) throws IOException {
// find all the words in a specific resource and remove the url reference from every word index
// finally, delete the url entry

@ -273,7 +273,7 @@ public class plasmaSwitchboardQueue {
return responseHeader;
}
public URL referrerURL() {
public URL referrerURL() throws IOException {
if (referrerURL == null) {
if ((referrerHash == null) || (referrerHash.equals(plasmaURL.dummyHash))) return null;
referrerURL = lurls.getEntry(referrerHash).url();

@ -71,7 +71,7 @@ public class plasmaURLPool {
return null;
}
public URL getURL(String urlhash) {
public URL getURL(String urlhash) throws IOException {
if (urlhash.equals(plasmaURL.dummyHash)) return null;
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();

@ -1180,70 +1180,67 @@ public final class yacy {
private static void urldbcleanup(String homePath) {
File root = new File(homePath);
File dbroot = new File(root, "DATA/PLASMADB");
HashSet damagedURLS = new HashSet();
try {
plasmaCrawlLURL currentUrlDB = new plasmaCrawlLURL(new File(dbroot, "urlHash.db"), 4194304);
Iterator eiter = currentUrlDB.entries(true, false);
int iteratorCount=0;
while (eiter.hasNext()) {
int iteratorCount = 0;
while (eiter.hasNext()) try {
eiter.next();
iteratorCount++;
} catch (RuntimeException e) {
String m = e.getMessage();
damagedURLS.add(m.substring(m.length() - 12));
}
try { Thread.sleep(1000); } catch (InterruptedException e) {}
System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size());
synchronized(plasmaCrawlLURL.damagedURLS)
{
Iterator eiter2 = plasmaCrawlLURL.damagedURLS.iterator();
String urlHash;
while (eiter2.hasNext()) {
urlHash = (String) eiter2.next();
// trying to fix the invalid URL
httpc theHttpc = null;
String oldUrlStr = null;
try {
// getting the url data as byte array
byte[][] entry = currentUrlDB.urlHashCache.get(urlHash.getBytes());
// getting the wrong url string
oldUrlStr = new String(entry[1]).trim();
int pos = -1;
if ((pos = oldUrlStr.indexOf("://"))!= -1) {
// trying to correct the url
String newUrlStr = "http://" + oldUrlStr.substring(pos+3);
URL newUrl = new URL(newUrlStr);
// doing a http head request to test if the url is correct
theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getPort(), 30000, false);
response res = theHttpc.HEAD(newUrl.getPath(), null);
if (res.statusCode == 200) {
entry[1] = newUrl.toString().getBytes();
currentUrlDB.urlHashCache.put(entry);
System.out.println("UrlDB-Entry with urlHash '" + urlHash +
"' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash +
"' removed\n\tURL: " + oldUrlStr +
"\n\tConnection Status: " + res.status);
}
try { Thread.sleep(1000); } catch (InterruptedException e) { }
System.out.println("URLs vorher: " + currentUrlDB.size() + " Entries loaded during Iteratorloop: " + iteratorCount + " kaputte URLs: " + damagedURLS.size());
Iterator eiter2 = damagedURLS.iterator();
String urlHash;
while (eiter2.hasNext()) {
urlHash = (String) eiter2.next();
// trying to fix the invalid URL
httpc theHttpc = null;
String oldUrlStr = null;
try {
// getting the url data as byte array
byte[][] entry = currentUrlDB.urlHashCache.get(urlHash.getBytes());
// getting the wrong url string
oldUrlStr = new String(entry[1]).trim();
int pos = -1;
if ((pos = oldUrlStr.indexOf("://")) != -1) {
// trying to correct the url
String newUrlStr = "http://" + oldUrlStr.substring(pos + 3);
URL newUrl = new URL(newUrlStr);
// doing a http head request to test if the url is correct
theHttpc = httpc.getInstance(newUrl.getHost(), newUrl.getPort(), 30000, false);
response res = theHttpc.HEAD(newUrl.getPath(), null);
if (res.statusCode == 200) {
entry[1] = newUrl.toString().getBytes();
currentUrlDB.urlHashCache.put(entry);
System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' corrected\n\tURL: " + oldUrlStr + " -> " + newUrlStr);
} else {
currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tConnection Status: " + res.status);
}
} catch (Exception e) {
currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash +
"' removed\n\tURL: " + oldUrlStr +
"\n\tExecption: " + e.getMessage());
} finally {
if (theHttpc != null) try {
theHttpc.close();
httpc.returnInstance(theHttpc);
} catch (Exception e) {}
}
} catch (Exception e) {
currentUrlDB.remove(urlHash);
System.out.println("UrlDB-Entry with urlHash '" + urlHash + "' removed\n\tURL: " + oldUrlStr + "\n\tExecption: " + e.getMessage());
} finally {
if (theHttpc != null) try {
theHttpc.close();
httpc.returnInstance(theHttpc);
} catch (Exception e) { }
}
}
plasmaCrawlLURL.damagedURLS.clear();
System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + plasmaCrawlLURL.damagedURLS.size());
System.out.println("URLs nachher: " + currentUrlDB.size() + " kaputte URLs: " + damagedURLS.size());
currentUrlDB.close();
} catch (IOException e) {
e.printStackTrace();
@ -1251,16 +1248,23 @@ public final class yacy {
}
/**
* Main-method which is started by java. Checks for special arguments or
* starts up the application.
*
* @param args Given arguments from the command line.
*/
* Main-method which is started by java. Checks for special arguments or
* starts up the application.
*
* @param args
* Given arguments from the command line.
*/
public static void main(String args[]) {
// check memory amount
System.gc();
long startupMemFree = Runtime.getRuntime().freeMemory(); // the amount of free memory in the Java Virtual Machine
long startupMemFree = Runtime.getRuntime().freeMemory(); // the
// amount of
// free
// memory in
// the Java
// Virtual
// Machine
long startupMemTotal = Runtime.getRuntime().totalMemory(); // the total amount of memory in the Java virtual machine; may vary over time
// go into headless awt mode

Loading…
Cancel
Save