redesigned NURL-handling:

- the general NURL-index for all crawl stack types was split into separate indexes for these stacks
- the new NURL-index is managed by the crawl balancer
- the crawl balancer does not need an internal index any more, it is replaced by the NURL-index
- the NURL.Entry was generalized and is now a new class plasmaCrawlEntry
- the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future
- the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names)
- the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information
- the EURL index is now filled with ZURL objects
- a new index delegatedURL holds ZURL objects about plasmaCrawlEntry objects to track which url is handed over to other peers
- redesigned plasmaCrawlEntry handover handling, because there is no longer any need to convert one entry object into another
- found and fixed numerous bugs in the context of crawl state handling
- fixed a serious bug in kelondroCache which caused that entries could not be removed
- fixed some bugs in online interface and adopted monitor output to new entry objects
- adopted yacy protocol to handle new delegatedURL entries
all old crawl queues will disappear after this update!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 094a1482f4
commit 861f41e67e

@ -56,6 +56,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern; import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -272,7 +273,7 @@ public class CrawlURLFetchStack_p {
} }
private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) { private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) {
plasmaCrawlNURL.Entry entry; plasmaCrawlEntry entry;
int failed = 0; int failed = 0;
for (int i=0; i<count; i++) try { for (int i=0; i<count; i++) try {
entry = nurl.pop(fromStackType); entry = nurl.pop(fromStackType);

@ -49,10 +49,9 @@ import java.util.Iterator;
import java.util.Random; import java.util.Random;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
@ -499,14 +498,9 @@ public class CrawlURLFetch_p {
totalFailed++; totalFailed++;
this.failed.put(urls[i], reason); this.failed.put(urls[i], reason);
try { try {
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry( plasmaCrawlZURL.Entry ee = this.sb.errorURL.newEntry(
new URL(urls[i]), new URL(urls[i]),
null, reason);
yacyCore.seedDB.mySeed.hash,
yacyCore.seedDB.mySeed.hash,
"",
reason,
new kelondroBitfield());
ee.store(); ee.store();
this.sb.errorURL.stackPushEntry(ee); this.sb.errorURL.stackPushEntry(ee);
} catch (MalformedURLException e) { } } catch (MalformedURLException e) { }

@ -50,7 +50,7 @@ import java.util.ArrayList;
import de.anomic.data.wikiCode; import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardQueue; import de.anomic.plasma.plasmaSwitchboardQueue;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -186,7 +186,7 @@ public class IndexCreateIndexingQueue_p {
dark = true; dark = true;
URL url; URL url;
String initiatorHash, executorHash; String initiatorHash, executorHash;
plasmaCrawlEURL.Entry entry; plasmaCrawlZURL.Entry entry;
yacySeed initiatorSeed, executorSeed; yacySeed initiatorSeed, executorSeed;
int j=0; int j=0;
for (int i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) { for (int i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) {
@ -202,7 +202,7 @@ public class IndexCreateIndexingQueue_p {
prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : wikiCode.replaceHTML(initiatorSeed.getName()))); prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : wikiCode.replaceHTML(initiatorSeed.getName())));
prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : wikiCode.replaceHTML(executorSeed.getName()))); prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : wikiCode.replaceHTML(executorSeed.getName())));
prop.put("rejected_list_"+j+"_url", wikiCode.replaceHTML(url.toString())); prop.put("rejected_list_"+j+"_url", wikiCode.replaceHTML(url.toString()));
prop.put("rejected_list_"+j+"_failreason", entry.failreason()); prop.put("rejected_list_"+j+"_failreason", entry.anycause());
prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0)); prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
dark = !dark; dark = !dark;
j++; j++;

@ -49,6 +49,7 @@ import java.util.Locale;
import de.anomic.data.wikiCode; import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -99,9 +100,9 @@ public class IndexCreateWWWGlobalQueue_p {
prop.put("crawler-queue", 0); prop.put("crawler-queue", 0);
} else { } else {
prop.put("crawler-queue", 1); prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit); plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
prop.put("crawler-queue_num", stackSize);//num Entries prop.put("crawler-queue_num", stackSize);//num Entries
plasmaCrawlNURL.Entry urle; plasmaCrawlEntry urle;
boolean dark = true; boolean dark = true;
yacySeed initiator; yacySeed initiator;
String profileHandle; String profileHandle;

@ -43,7 +43,6 @@
// javac -classpath .:../classes IndexCreate_p.java // javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT // if the shell's current path is HTROOT
import java.io.IOException;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.Date; import java.util.Date;
import java.util.Iterator; import java.util.Iterator;
@ -54,10 +53,10 @@ import java.util.regex.PatternSyntaxException;
import de.anomic.data.wikiCode; import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaCrawlNURL.Entry;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyCore;
@ -101,15 +100,11 @@ public class IndexCreateWWWLocalQueue_p {
// iterating through the list of URLs // iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) { while (iter.hasNext()) {
entry = (plasmaCrawlEntry) iter.next();
String value = null; String value = null;
String nextHash = (String) iter.next(); String nextHash = entry.urlhash();
Entry entry = null;
try {
entry = switchboard.noticeURL.getEntry(nextHash);
} catch (IOException e) {
continue;
}
if ((option.equals("URL")&&(entry.url() != null))) { if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString(); value = entry.url().toString();
} else if ((option.equals("AnchorName"))) { } else if ((option.equals("AnchorName"))) {
@ -162,9 +157,9 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("crawler-queue", 0); prop.put("crawler-queue", 0);
} else { } else {
prop.put("crawler-queue", 1); prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
plasmaCrawlNURL.Entry urle; plasmaCrawlEntry urle;
boolean dark = true; boolean dark = true;
yacySeed initiator; yacySeed initiator;
String profileHandle; String profileHandle;
@ -183,7 +178,7 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) ); prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
prop.put("crawler-queue_list_"+showNum+"_anchor", wikiCode.replaceHTML(urle.name())); prop.put("crawler-queue_list_"+showNum+"_anchor", wikiCode.replaceHTML(urle.name()));
prop.put("crawler-queue_list_"+showNum+"_url", wikiCode.replaceHTML(urle.url().toString())); prop.put("crawler-queue_list_"+showNum+"_url", wikiCode.replaceHTML(urle.url().toString()));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.hash()); prop.put("crawler-queue_list_"+showNum+"_hash", urle.urlhash());
dark = !dark; dark = !dark;
showNum++; showNum++;
} else { } else {

@ -27,11 +27,6 @@
<td>Starting Point:</td> <td>Starting Point:</td>
<td> <td>
<table cellpadding="0" cellspacing="0"> <table cellpadding="0" cellspacing="0">
<tr>
<td>From&nbsp;File:</td>
<td><input type="radio" name="crawlingMode" value="file" /></td>
<td><input type="file" name="crawlingFile" size="28" /></td>
</tr>
<tr> <tr>
<td>From&nbsp;URL:</td> <td>From&nbsp;URL:</td>
<td><input type="radio" name="crawlingMode" value="url" checked="checked" /></td> <td><input type="radio" name="crawlingMode" value="url" checked="checked" /></td>
@ -41,7 +36,12 @@
</td> </td>
</tr> </tr>
<tr> <tr>
<td colspan="2"><span id="title"></span></td> <td>From&nbsp;File:</td>
<td><input type="radio" name="crawlingMode" value="file" /></td>
<td><input type="file" name="crawlingFile" size="28" /></td>
</tr>
<tr>
<td colspan="3" class="commit"><span id="title"><br></span></td>
</tr> </tr>
</table> </table>
</td> </td>
@ -125,7 +125,7 @@
</td> </td>
</tr> </tr>
<tr valign="top" class="TableCellLight"> <tr valign="top" class="TableCellLight">
<td>Store to Proxy Cache:</td> <td>Store to Web Cache:</td>
<td><input type="checkbox" name="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td> <td><input type="checkbox" name="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
<td> <td>
This option is used by default for proxy prefetch, but is not needed for explicit crawling. This option is used by default for proxy prefetch, but is not needed for explicit crawling.
@ -194,9 +194,9 @@
<tr valign="top" class="TableCellLight"> <tr valign="top" class="TableCellLight">
<td>Wanted Performance:</td> <td>Wanted Performance:</td>
<td> <td>
<input type="radio" name="crawlingSpeed" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# />maximum&nbsp;&nbsp; <input type="radio" name="crawlingPerformance" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# />maximum&nbsp;&nbsp;
<input type="radio" name="crawlingSpeed" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# />custom: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM&nbsp;&nbsp; <input type="radio" name="crawlingPerformance" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# />custom: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM&nbsp;&nbsp;
<input type="radio" name="crawlingSpeed" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# />optimal as background process <input type="radio" name="crawlingPerformance" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# />optimal as background process
</td> </td>
<td colspan="3"> <td colspan="3">
Set wanted level of computing power, used for this and other running crawl tasks. (PPM = pages per minute) Set wanted level of computing power, used for this and other running crawl tasks. (PPM = pages per minute)

@ -53,7 +53,6 @@ import java.util.Iterator;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.data.messageBoard; import de.anomic.data.messageBoard;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;

@ -39,10 +39,9 @@ import de.anomic.data.wikiCode;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
@ -222,8 +221,7 @@ public class WatchCrawler_p {
prop.put("info_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL")))); prop.put("info_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
prop.put("info_reasonString", reasonString); prop.put("info_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, reasonString);
crawlingStartURL.getHost(), reasonString, new kelondroBitfield());
ee.store(); ee.store();
switchboard.errorURL.stackPushEntry(ee); switchboard.errorURL.stackPushEntry(ee);
} }
@ -300,8 +298,7 @@ public class WatchCrawler_p {
if (rejectReason == null) { if (rejectReason == null) {
c++; c++;
} else { } else {
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, rejectReason);
(String) e.getValue(), rejectReason, new kelondroBitfield());
ee.store(); ee.store();
switchboard.errorURL.stackPushEntry(ee); switchboard.errorURL.stackPushEntry(ee);
} }
@ -401,9 +398,10 @@ public class WatchCrawler_p {
private static void setPerformance(plasmaSwitchboard sb, serverObjects post) { private static void setPerformance(plasmaSwitchboard sb, serverObjects post) {
String crawlingPerformance = post.get("crawlingPerformance","custom"); String crawlingPerformance = post.get("crawlingPerformance","custom");
int wantedPPM = 1000; long LCbusySleep = Integer.parseInt(sb.getConfig(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100"));
int wantedPPM = (int) (60000L / LCbusySleep);
try { try {
wantedPPM = Integer.parseInt(post.get("customPPM","1000")); wantedPPM = Integer.parseInt(post.get("customPPM",Integer.toString(wantedPPM)));
} catch (NumberFormatException e) {} } catch (NumberFormatException e) {}
if (crawlingPerformance.equals("minimum")) wantedPPM = 10; if (crawlingPerformance.equals("minimum")) wantedPPM = 10;
if (crawlingPerformance.equals("maximum")) wantedPPM = 1000; if (crawlingPerformance.equals("maximum")) wantedPPM = 1000;

@ -54,6 +54,7 @@ import java.util.Locale;
import de.anomic.data.wikiCode; import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlLoaderMessage;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -183,10 +184,10 @@ public class queues_p {
} }
public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlNURL.Entry[] crawlerList) { public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlEntry[] crawlerList) {
int showNum = 0; int showNum = 0;
plasmaCrawlNURL.Entry urle; plasmaCrawlEntry urle;
yacySeed initiator; yacySeed initiator;
for (int i = 0; i < crawlerList.length; i++) { for (int i = 0; i < crawlerList.length; i++) {
urle = crawlerList[i]; urle = crawlerList[i];
@ -198,7 +199,7 @@ public class queues_p {
prop.put(tableName + "_" + showNum + "_modified", daydate(urle.loaddate())); prop.put(tableName + "_" + showNum + "_modified", daydate(urle.loaddate()));
prop.putSafeXML(tableName + "_" + showNum + "_anchor", urle.name()); prop.putSafeXML(tableName + "_" + showNum + "_anchor", urle.name());
prop.putSafeXML(tableName + "_" + showNum + "_url", urle.url().toString()); prop.putSafeXML(tableName + "_" + showNum + "_url", urle.url().toString());
prop.put(tableName + "_" + showNum + "_hash", urle.hash()); prop.put(tableName + "_" + showNum + "_hash", urle.urlhash());
showNum++; showNum++;
} }
} }

@ -66,7 +66,7 @@ public class snippet {
prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
} else { } else {
// problems with snippet fetch // problems with snippet fetch
prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError()); prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, queryHashes) : snippet.getError());
} }
prop.put("link", 0); prop.put("link", 0);
prop.put("links", 0); prop.put("links", 0);

@ -49,11 +49,8 @@
import java.io.IOException; import java.io.IOException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch; import de.anomic.server.serverSwitch;
@ -85,7 +82,7 @@ public final class crawlReceipt {
String youare = post.get("youare", ""); // seed hash of the target peer, needed for network stability String youare = post.get("youare", ""); // seed hash of the target peer, needed for network stability
//String process = post.get("process", ""); // process type //String process = post.get("process", ""); // process type
String key = post.get("key", ""); // transmission key String key = post.get("key", ""); // transmission key
String receivedUrlhash = post.get("urlhash", ""); // the url hash that has been crawled //String receivedUrlhash = post.get("urlhash", ""); // the url hash that has been crawled
String result = post.get("result", ""); // the result; either "ok" or "fail" String result = post.get("result", ""); // the result; either "ok" or "fail"
String reason = post.get("reason", ""); // the reason for that result String reason = post.get("reason", ""); // the reason for that result
//String words = post.get("wordh", ""); // priority word hashes //String words = post.get("wordh", ""); // priority word hashes
@ -114,60 +111,60 @@ public final class crawlReceipt {
final yacySeed otherPeer = yacyCore.seedDB.get(iam); final yacySeed otherPeer = yacyCore.seedDB.get(iam);
final String otherPeerName = iam + ":" + ((otherPeer == null) ? "NULL" : (otherPeer.getName() + "/" + otherPeer.getVersion())); final String otherPeerName = iam + ":" + ((otherPeer == null) ? "NULL" : (otherPeer.getName() + "/" + otherPeer.getVersion()));
if ((yacyCore.seedDB.mySeed == null) || (!(yacyCore.seedDB.mySeed.hash.equals(youare)))) { if ((yacyCore.seedDB.mySeed == null) || (!(yacyCore.seedDB.mySeed.hash.equals(youare)))) {
// no yacy connection / unknown peers // no yacy connection / unknown peers
prop.putASIS("delay", "3600"); prop.putASIS("delay", "3600");
} else if (propStr == null) { return prop;
}
if (propStr == null) {
// error with url / wrong key // error with url / wrong key
prop.putASIS("delay", "3600"); prop.putASIS("delay", "3600");
} else if (result.equals("fill")) { return prop;
// generating a new loaded URL entry }
indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr);
if (entry == null) { // generating a new loaded URL entry
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam + indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr);
"\n\tURL properties: "+ propStr); if (entry == null) {
} else { log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
indexURLEntry.Components comp = entry.comp(); prop.putASIS("delay", "3600");
if (comp.url() == null) { return prop;
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam + }
"\n\tURL properties: "+ propStr);
} else try { indexURLEntry.Components comp = entry.comp();
// put new entry into database if (comp.url() == null) {
switchboard.wordIndex.loadedURL.store(entry); log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1); prop.putASIS("delay", "3600");
return prop;
// generating url hash }
String newUrlHash = plasmaURL.urlHash(comp.url());
String oldUrlHash = plasmaURL.oldurlHash(comp.url()); if (result.equals("fill")) try {
// put new entry into database
// removing URL from notice URL switchboard.wordIndex.loadedURL.store(entry);
switchboard.noticeURL.remove(newUrlHash); switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1);
switchboard.noticeURL.remove(oldUrlHash); switchboard.delegatedURL.remove(entry.hash()); // the delegated work has been done
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + comp.url().toNormalform());
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform());
} catch (IOException e) {
e.printStackTrace();
}
}
// ready for more // ready for more
prop.putASIS("delay", "10"); prop.putASIS("delay", "10");
} else { return prop;
try { } catch (IOException e) {
plasmaCrawlNURL.Entry en = switchboard.noticeURL.getEntry(receivedUrlhash); e.printStackTrace();
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield()); prop.putASIS("delay", "3600");
ee.store(); return prop;
switchboard.errorURL.stackPushEntry(ee);
switchboard.noticeURL.remove(receivedUrlhash);
} catch (IOException e) {
}
prop.putASIS("delay", "100"); // what shall we do with that???
} }
switchboard.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(entry.toBalancerEntry(), youare, null, 0, result + ":" + reason);
ee.store();
switchboard.errorURL.stackPushEntry(ee);
//switchboard.noticeURL.remove(receivedUrlhash);
prop.putASIS("delay", "3600");
return prop;
// return rewrite properties
// return rewrite properties
return prop;
} }
} }

@ -34,6 +34,7 @@ import java.util.Date;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.index.indexRWIEntry; import de.anomic.index.indexRWIEntry;
public interface indexURLEntry { public interface indexURLEntry {
@ -60,6 +61,7 @@ public interface indexURLEntry {
public indexRWIEntry word(); public indexRWIEntry word();
public boolean isOlder(indexURLEntry other); public boolean isOlder(indexURLEntry other);
public String toString(String snippet); public String toString(String snippet);
public plasmaCrawlEntry toBalancerEntry();
public String toString(); public String toString();
public class Components { public class Components {

@ -13,6 +13,7 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverCharBuffer; import de.anomic.server.serverCharBuffer;
@ -367,6 +368,19 @@ public class indexURLEntryNew implements indexURLEntry {
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
} }
public plasmaCrawlEntry toBalancerEntry() {
return new plasmaCrawlEntry(
null,
comp().url(),
referrerHash(),
comp().descr(),
loaddate(),
null,
0,
0,
0);
}
/** /**
* Returns this object as String.<br> * Returns this object as String.<br>
* This e.g. looks like this: * This e.g. looks like this:

@ -35,6 +35,7 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
@ -335,6 +336,19 @@ public class indexURLEntryOld implements indexURLEntry {
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
} }
public plasmaCrawlEntry toBalancerEntry() {
return new plasmaCrawlEntry(
null,
comp().url(),
referrerHash(),
comp().descr(),
loaddate(),
null,
0,
0,
0);
}
/** /**
* Returns this object as String.<br> * Returns this object as String.<br>
* This e.g. looks like this: * This e.g. looks like this:

@ -557,7 +557,6 @@ public class kelondroCache implements kelondroIndex {
} else { } else {
this.hasnotHit++; this.hasnotHit++;
this.hasnotDouble++; this.hasnotDouble++;
return null;
} }
} }
@ -569,8 +568,6 @@ public class kelondroCache implements kelondroIndex {
} else { } else {
this.readHit++; this.readHit++;
this.cacheDelete++; this.cacheDelete++;
index.remove(key);
return entry;
} }
} }

@ -223,14 +223,12 @@ public class kelondroFlexWidthArray implements kelondroArray {
assert rowentry.bytes().length == this.rowdef.objectsize; assert rowentry.bytes().length == this.rowdef.objectsize;
int c = 0; int c = 0;
kelondroRow.Entry e; kelondroRow.Entry e;
int lastcol;
synchronized (col) { synchronized (col) {
while (c < rowdef.columns()) { while (c < rowdef.columns()) {
lastcol = c + col[c].row().columns() - 1;
e = col[c].row().newEntry( e = col[c].row().newEntry(
rowentry.bytes(), rowentry.bytes(),
rowdef.colstart[c], rowdef.colstart[c],
rowdef.colstart[lastcol] - rowdef.colstart[c] + rowdef.width(lastcol)); col[c].row().objectsize());
col[c].set(index, e); col[c].set(index, e);
c = c + col[c].row().columns(); c = c + col[c].row().columns();
} }

@ -176,20 +176,20 @@ public class kelondroRow {
for (int i = 0; i < objectsize; i++) this.rowinstance[i] = 0; for (int i = 0; i < objectsize; i++) this.rowinstance[i] = 0;
} }
public Entry(byte[] rowinstance) { public Entry(byte[] newrow) {
this(rowinstance, 0, rowinstance.length); this(newrow, 0, newrow.length);
} }
public Entry(byte[] rowinstance, int start, int length) { public Entry(byte[] newrow, int start, int length) {
assert objectsize == length : "objectsize = " + objectsize + ", length = " + length; assert newrow.length >= (length + start) : "objectsize = " + objectsize + ", start = " + start + ", length = " + length;
assert objectsize == length : "objectsize = " + objectsize + ", start = " + start + ", length = " + length;
this.rowinstance = new byte[objectsize]; this.rowinstance = new byte[objectsize];
int ll = Math.min(objectsize, length); System.arraycopy(newrow, start, this.rowinstance, 0, objectsize);
System.arraycopy(rowinstance, start, this.rowinstance, 0, ll); //for (int i = ll; i < objectsize; i++) this.rowinstance[i] = 0;
for (int i = ll; i < objectsize; i++) this.rowinstance[i] = 0;
} }
public Entry(byte[][] cols) { public Entry(byte[][] cols) {
assert row.length == cols.length; assert row.length == cols.length : "cols.length = " + cols.length + ", row.length = " + row.length;
rowinstance = new byte[objectsize]; rowinstance = new byte[objectsize];
int ll; int ll;
int cs, cw; int cs, cw;

@ -311,6 +311,7 @@ public final class kelondroStack extends kelondroRecords {
} }
public void remove() { public void remove() {
ni.remove();
} }
} }

@ -51,11 +51,11 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlLoaderMessage;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
@ -290,15 +290,19 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
String referrerHash = (this.refererURLString==null)?null:plasmaURL.urlHash(this.refererURLString); String referrerHash = (this.refererURLString==null)?null:plasmaURL.urlHash(this.refererURLString);
// create a new errorURL DB entry // create a new errorURL DB entry
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry( plasmaCrawlEntry bentry = new plasmaCrawlEntry(
this.url,
referrerHash,
this.initiator, this.initiator,
yacyCore.seedDB.mySeed.hash, this.url,
this.name, referrerHash,
(failreason==null)?"Unknown reason":failreason, this.name,
new kelondroBitfield() null,
); this.profile.handle(),
this.depth,
0,
0);
plasmaCrawlZURL.Entry ee = this.sb.errorURL.newEntry(
bentry, yacyCore.seedDB.mySeed.hash, null,
0, (failreason==null)?"Unknown reason":failreason);
// store the entry // store the entry
ee.store(); ee.store();

@ -6,6 +6,7 @@ import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
@ -89,7 +90,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// init noticeUrlDB // init noticeUrlDB
this.log.logInfo("Initializing the source noticeUrlDB"); this.log.logInfo("Initializing the source noticeUrlDB");
this.importNurlDB = new plasmaCrawlNURL(this.importPath, preloadTime); this.importNurlDB = new plasmaCrawlNURL(this.importPath);
this.importStartSize = this.importNurlDB.size(); this.importStartSize = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize(); //int stackSize = this.importNurlDB.stackSize();
@ -101,7 +102,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
public void run() { public void run() {
try { try {
// waiting on init thread to finish // waiting on init thread to finish
this.importNurlDB.waitOnInitThread(); //this.importNurlDB.waitOnInitThread();
// the stack types we want to import // the stack types we want to import
int[] stackTypes = new int[] {plasmaCrawlNURL.STACK_TYPE_CORE, int[] stackTypes = new int[] {plasmaCrawlNURL.STACK_TYPE_CORE,
@ -110,38 +111,38 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
-1}; -1};
// looping through the various stacks // looping through the various stacks
for (int i=0; i< stackTypes.length; i++) { for (int stackType=0; stackType< stackTypes.length; stackType++) {
if (stackTypes[i] != -1) { if (stackTypes[stackType] != -1) {
this.log.logInfo("Starting to import stacktype '" + stackTypes[i] + "' containing '" + this.importNurlDB.stackSize(stackTypes[i]) + "' entries."); this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries.");
} else { } else {
this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack."); this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack.");
} }
// getting an interator and loop through the URL entries // getting an interator and loop through the URL entries
Iterator entryIter = (stackTypes[i] == -1) ? this.importNurlDB.entries(true, null) : null; Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
while (true) { while (true) {
String nextHash = null; String nextHash = null;
plasmaCrawlNURL.Entry nextEntry = null; plasmaCrawlEntry nextEntry = null;
try { try {
if (stackTypes[i] != -1) { if (stackTypes[stackType] != -1) {
if (this.importNurlDB.stackSize(stackTypes[i]) == 0) break; if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;
this.urlCount++; this.urlCount++;
nextEntry = this.importNurlDB.pop(stackTypes[i]); nextEntry = this.importNurlDB.pop(stackTypes[stackType]);
nextHash = nextEntry.hash(); nextHash = nextEntry.urlhash();
} else { } else {
if (!entryIter.hasNext()) break; if (!entryIter.hasNext()) break;
this.urlCount++; this.urlCount++;
nextEntry = (plasmaCrawlNURL.Entry) entryIter.next(); nextEntry = (plasmaCrawlEntry) entryIter.next();
nextHash = nextEntry.hash(); nextHash = nextEntry.urlhash();
} }
} catch (IOException e) { } catch (IOException e) {
this.log.logWarning("Unable to import entry: " + e.toString()); this.log.logWarning("Unable to import entry: " + e.toString());
if ((stackTypes[i] != -1) &&(this.importNurlDB.stackSize(stackTypes[i]) == 0)) break; if ((stackTypes[stackType] != -1) &&(this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break;
continue; continue;
} }
@ -176,9 +177,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// if the url does not alredy exists in the destination stack we insert it now // if the url does not alredy exists in the destination stack we insert it now
if (!this.sb.noticeURL.existsInStack(nextHash)) { if (!this.sb.noticeURL.existsInStack(nextHash)) {
plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(nextEntry); this.sb.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : plasmaCrawlNURL.STACK_TYPE_CORE, nextEntry);
ne.store();
this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.hash());
} }
// removing hash from the import db // removing hash from the import db
@ -191,7 +190,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
} }
if (this.isAborted()) break; if (this.isAborted()) break;
} }
this.log.logInfo("Finished to import stacktype '" + stackTypes[i] + "'"); this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'");
} }
//int size = this.importNurlDB.size(); //int size = this.importNurlDB.size();

@ -43,17 +43,18 @@ package de.anomic.plasma;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack; import de.anomic.kelondro.kelondroStack;
@ -61,86 +62,136 @@ import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB; import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlBalancer { public class plasmaCrawlBalancer {
private static final String stackSuffix = "7.stack";
private static final String indexSuffix = "7.db";
// a shared domainAccess map for all balancers // a shared domainAccess map for all balancers
private static final Map domainAccess = Collections.synchronizedMap(new HashMap()); private static final Map domainAccess = Collections.synchronizedMap(new HashMap());
// definition of payload for fileStack // definition of payload for fileStack
private static final kelondroRow payload = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0); private static final kelondroRow stackrow = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0);
// class variables // class variables
private ArrayList ramStack; // a list that is flused first private ArrayList urlRAMStack; // a list that is flused first
private kelondroStack fileStack; // a file with url hashes private kelondroStack urlFileStack; // a file with url hashes
private HashMap domainStacks; // a map from domain name part to Lists with url hashs private kelondroIndex urlFileIndex;
private HashSet ramIndex; // an index is needed externally, we provide that internally private HashMap domainStacks; // a map from domain name part to Lists with url hashs
private File cacheStacksPath;
private String stackname;
public plasmaCrawlBalancer(File stackFile) { public plasmaCrawlBalancer(File cachePath, String stackname) {
fileStack = kelondroStack.open(stackFile, payload); this.cacheStacksPath = cachePath;
this.stackname = stackname;
File stackFile = new File(cachePath, stackname + stackSuffix);
urlFileStack = kelondroStack.open(stackFile, stackrow);
domainStacks = new HashMap(); domainStacks = new HashMap();
ramStack = new ArrayList(); urlRAMStack = new ArrayList();
ramIndex = makeIndex();
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
openFileIndex();
} }
public synchronized void close() { public synchronized void close() {
ramIndex = null;
while (sizeDomainStacks() > 0) flushOnceDomStacks(true); while (sizeDomainStacks() > 0) flushOnceDomStacks(true);
try { flushAllRamStack(); } catch (IOException e) {} try { flushAllRamStack(); } catch (IOException e) {}
fileStack.close(); if (urlFileIndex != null) {
fileStack = null; urlFileIndex.close();
urlFileIndex = null;
}
if (urlFileStack != null) {
urlFileStack.close();
urlFileStack = null;
}
} }
public void finalize() { public void finalize() {
if (fileStack != null) close(); if (urlFileStack != null) close();
} }
public synchronized void clear() { public synchronized void clear() {
fileStack = kelondroStack.reset(fileStack); urlFileStack = kelondroStack.reset(urlFileStack);
domainStacks.clear(); domainStacks.clear();
ramStack.clear(); urlRAMStack.clear();
ramIndex = new HashSet(); resetFileIndex();
} }
private HashSet makeIndex() {
HashSet index = new HashSet(); // TODO: replace with kelondroIndex private void openFileIndex() {
cacheStacksPath.mkdirs();
// take all elements from the file stack
try { try {
Iterator i = fileStack.keyIterator(); // iterates byte[] - objects urlFileIndex = new kelondroCache(new kelondroFlexTable(cacheStacksPath, stackname + indexSuffix, -1, plasmaCrawlEntry.rowdef), true, false);
while (i.hasNext()) index.add(new String((byte[]) i.next(), "UTF-8")); } catch (IOException e) {
} catch (UnsupportedEncodingException e) {} e.printStackTrace();
System.exit(-1);
// take elements from the ram stack
for (int i = 0; i < ramStack.size(); i++) index.add(ramStack.get(i));
// take elememts from domain stacks
Iterator i = domainStacks.entrySet().iterator();
Map.Entry entry;
LinkedList list;
Iterator ii;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
list = (LinkedList) entry.getValue();
ii = list.iterator();
while (ii.hasNext()) index.add(ii.next());
} }
return index;
} }
public boolean has(String urlhash) { private void resetFileIndex() {
return ramIndex.contains(urlhash); if (urlFileIndex != null) {
urlFileIndex.close();
urlFileIndex = null;
File cacheFile = new File(cacheStacksPath, stackname + indexSuffix);
cacheFile.delete();
}
openFileIndex();
} }
public Iterator iterator() { public synchronized plasmaCrawlEntry get(String urlhash) throws IOException {
return ramIndex.iterator(); kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes());
if (entry == null) return null;
return new plasmaCrawlEntry(entry);
}
public synchronized plasmaCrawlEntry remove(String urlhash) throws IOException {
// this method is only here, because so many import/export methods need it
// and it was implemented in the previous architecture
// however, usage is not recommendet
kelondroRow.Entry entry = urlFileIndex.remove(urlhash.getBytes());
if (entry == null) return null;
// now delete that thing also from the queues
// iterate through the RAM stack
Iterator i = urlRAMStack.iterator();
String h;
while (i.hasNext()) {
h = (String) i.next();
if (h.equals(urlhash)) {
i.remove();
break;
}
}
// we cannot iterate through the file stack, because the stack iterator
// has not yet a delete method implemented. It would also be a bad idea
// to do that, it would make too much IO load
// instead, the top/pop methods that aquire elements from the stack, that
// cannot be found in the urlFileIndex must handle that case silently
return new plasmaCrawlEntry(entry);
}
public boolean has(String urlhash) {
try {
return urlFileIndex.has(urlhash.getBytes());
} catch (IOException e) {
e.printStackTrace();
return false;
}
} }
public synchronized int size() { public synchronized int size() {
int componentsize = fileStack.size() + ramStack.size() + sizeDomainStacks(); int componentsize = urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks();
if ((kelondroRecords.debugmode) && (componentsize != ramIndex.size())) { try {
// hier ist ramIndex.size() immer grš§er. warum? if ((kelondroRecords.debugmode) && (componentsize != urlFileIndex.size())) {
serverLog.logWarning("PLASMA BALANCER", "size operation wrong - componentsize = " + componentsize + ", ramIndex.size() = " + ramIndex.size()); // hier ist urlIndexFile.size() immer grš§er. warum?
} serverLog.logWarning("PLASMA BALANCER", "size operation wrong - componentsize = " + componentsize + ", ramIndex.size() = " + urlFileIndex.size());
}
} catch (IOException e) {
e.printStackTrace();
}
return componentsize; return componentsize;
} }
@ -163,9 +214,9 @@ public class plasmaCrawlBalancer {
list = (LinkedList) entry.getValue(); list = (LinkedList) entry.getValue();
if (list.size() != 0) { if (list.size() != 0) {
if (ram) { if (ram) {
ramStack.add(list.removeFirst()); urlRAMStack.add(list.removeFirst());
} else try { } else try {
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) list.removeFirst()).getBytes()})); urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) list.removeFirst()).getBytes()}));
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} }
@ -176,34 +227,36 @@ public class plasmaCrawlBalancer {
private void flushAllRamStack() throws IOException { private void flushAllRamStack() throws IOException {
// this flushes only the ramStack to the fileStack, but does not flush the domainStacks // this flushes only the ramStack to the fileStack, but does not flush the domainStacks
for (int i = 0; i < ramStack.size() / 2; i++) { for (int i = 0; i < urlRAMStack.size() / 2; i++) {
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(i)).getBytes()})); urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(i)).getBytes()}));
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() - i - 1)).getBytes()})); urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(urlRAMStack.size() - i - 1)).getBytes()}));
} }
if (ramStack.size() % 2 == 1) if (urlRAMStack.size() % 2 == 1)
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() / 2)).getBytes()})); urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(urlRAMStack.size() / 2)).getBytes()}));
} }
public synchronized void push(String urlhash) throws IOException { public synchronized void push(plasmaCrawlEntry entry) throws IOException {
assert urlhash != null; assert entry != null;
if (ramIndex.contains(urlhash)) { if (urlFileIndex.has(entry.urlhash().getBytes())) {
serverLog.logWarning("PLASMA BALANCER", "double-check has failed for urlhash " + urlhash + " - fixed"); serverLog.logWarning("PLASMA BALANCER", "double-check has failed for urlhash " + entry.urlhash() + " - fixed");
return; return;
} }
String dom = urlhash.substring(6);
// extend domain stack
String dom = entry.urlhash().substring(6);
LinkedList domainList = (LinkedList) domainStacks.get(dom); LinkedList domainList = (LinkedList) domainStacks.get(dom);
if (domainList == null) { if (domainList == null) {
// create new list // create new list
domainList = new LinkedList(); domainList = new LinkedList();
domainList.addLast(urlhash); domainList.addLast(entry.urlhash());
domainStacks.put(dom, domainList); domainStacks.put(dom, domainList);
} else { } else {
// extend existent domain list // extend existent domain list
domainList.add(urlhash); domainList.add(entry.urlhash());
} }
// add to index // add to index
ramIndex.add(urlhash); urlFileIndex.put(entry.toRow());
// check size of domainStacks and flush // check size of domainStacks and flush
if ((domainStacks.size() > 20) || (sizeDomainStacks() > 1000)) { if ((domainStacks.size() > 20) || (sizeDomainStacks() > 1000)) {
@ -211,15 +264,15 @@ public class plasmaCrawlBalancer {
} }
} }
public synchronized String pop(long minimumDelta, long maximumAge) throws IOException { public synchronized plasmaCrawlEntry pop(long minimumDelta, long maximumAge) throws IOException {
// returns an url-hash from the stack and ensures minimum delta times // returns an url-hash from the stack and ensures minimum delta times
// we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack
String result = null; // the result String result = null; // the result
// 1st: check ramStack // 1st: check ramStack
if (ramStack.size() > 0) { if (urlRAMStack.size() > 0) {
result = (String) ramStack.remove(0); result = (String) urlRAMStack.remove(0);
} }
// 2nd-a: check domainStacks for latest arrivals // 2nd-a: check domainStacks for latest arrivals
@ -301,12 +354,12 @@ public class plasmaCrawlBalancer {
} }
// 3rd: take entry from file // 3rd: take entry from file
if ((result == null) && (fileStack.size() > 0)) { if ((result == null) && (urlFileStack.size() > 0)) {
kelondroRow.Entry topentry = fileStack.top(); kelondroRow.Entry topentry = urlFileStack.top();
if (topentry == null) { if (topentry == null) {
// emergency case: this means that something with the stack organization is wrong // emergency case: this means that something with the stack organization is wrong
// the file appears to be broken. We kill the file. // the file appears to be broken. We kill the file.
kelondroStack.reset(fileStack); kelondroStack.reset(urlFileStack);
serverLog.logSevere("PLASMA BALANCER", "get() failed to fetch entry from file stack. reset stack file."); serverLog.logSevere("PLASMA BALANCER", "get() failed to fetch entry from file stack. reset stack file.");
} else { } else {
String top = new String(topentry.getColBytes(0)); String top = new String(topentry.getColBytes(0));
@ -316,10 +369,10 @@ public class plasmaCrawlBalancer {
long delta = lastAccessDelta(top); long delta = lastAccessDelta(top);
if (delta > minimumDelta) { if (delta > minimumDelta) {
// the entry from top is fine // the entry from top is fine
result = new String(fileStack.pop().getColBytes(0)); result = new String(urlFileStack.pop().getColBytes(0));
} else { } else {
// try entry from bottom // try entry from bottom
result = new String(fileStack.pot().getColBytes(0)); result = new String(urlFileStack.pot().getColBytes(0));
delta = lastAccessDelta(result); delta = lastAccessDelta(result);
} }
} }
@ -327,7 +380,7 @@ public class plasmaCrawlBalancer {
// check case where we did not found anything // check case where we did not found anything
if (result == null) { if (result == null) {
serverLog.logSevere("PLASMA BALANCER", "get() was not able to find a valid urlhash - total size = " + size() + ", fileStack.size() = " + fileStack.size() + ", ramStack.size() = " + ramStack.size() + ", domainStacks.size() = " + domainStacks.size()); serverLog.logSevere("PLASMA BALANCER", "get() was not able to find a valid urlhash - total size = " + size() + ", fileStack.size() = " + urlFileStack.size() + ", ramStack.size() = " + urlRAMStack.size() + ", domainStacks.size() = " + domainStacks.size());
return null; return null;
} }
@ -344,8 +397,9 @@ public class plasmaCrawlBalancer {
// update statistical data // update statistical data
domainAccess.put(result.substring(6), new Long(System.currentTimeMillis())); domainAccess.put(result.substring(6), new Long(System.currentTimeMillis()));
ramIndex.remove(result); kelondroRow.Entry entry = urlFileIndex.remove(result.getBytes());
return result; if (entry == null) return null;
return new plasmaCrawlEntry(entry);
} }
private long lastAccessDelta(String hash) { private long lastAccessDelta(String hash) {
@ -355,19 +409,55 @@ public class plasmaCrawlBalancer {
return System.currentTimeMillis() - lastAccess.longValue(); return System.currentTimeMillis() - lastAccess.longValue();
} }
public synchronized String top(int dist) { public synchronized plasmaCrawlEntry top(int dist) throws IOException {
int availableInRam = ramStack.size() + sizeDomainStacks(); int availableInRam = urlRAMStack.size() + sizeDomainStacks();
if ((availableInRam < dist) && (fileStack.size() > (dist - availableInRam))) { if ((availableInRam <= dist) && (urlFileStack.size() > (dist - availableInRam))) {
// flush some entries from disc to domain stacks // flush some entries from disc to domain stacks
try { try {
for (int i = 0; i < (dist - availableInRam); i++) { for (int i = 0; i <= (dist - availableInRam); i++) {
ramStack.add(new String(fileStack.pop().getColBytes(0))); if (urlFileStack.size() == 0) break;
urlRAMStack.add(new String(urlFileStack.pop().getColBytes(0)));
} }
} catch (IOException e) {} } catch (IOException e) {}
} }
while ((sizeDomainStacks() > 0) && (ramStack.size() <= dist)) flushOnceDomStacks(true); // flush only that much as we need to display while ((sizeDomainStacks() > 0) && (urlRAMStack.size() <= dist)) flushOnceDomStacks(true); // flush only that much as we need to display
if (dist >= ramStack.size()) return null; if (dist >= urlRAMStack.size()) return null;
return (String) ramStack.get(dist); String urlhash = (String) urlRAMStack.get(dist);
kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes());
if (entry == null) return null;
return new plasmaCrawlEntry(entry);
}
public Iterator iterator() throws IOException {
return new EntryIterator();
}
public class EntryIterator implements Iterator {
Iterator rowIterator;
public EntryIterator() throws IOException {
rowIterator = urlFileIndex.rows(true, null);
}
public boolean hasNext() {
return (rowIterator == null) ? false : rowIterator.hasNext();
}
public Object next() {
kelondroRow.Entry entry = (kelondroRow.Entry) rowIterator.next();
try {
return (entry == null) ? null : new plasmaCrawlEntry(entry);
} catch (IOException e) {
rowIterator = null;
return null;
}
}
public void remove() {
if (rowIterator != null) rowIterator.remove();
}
} }
} }

@ -1,15 +1,15 @@
// plasmaEURL.java // plasmaCrawlEURL.java
// ----------------------- // (C) 2004 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// part of YaCy // first published 09.08.2004 on http://www.anomic.de
// (C) by Michael Peter Christen; mc@anomic.de //
// first published on http://www.anomic.de // This is a part of YaCy, a peer-to-peer based web search engine
// Frankfurt, Germany, 2004
// last major change: 09.08.2004
// //
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $ // $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $ // $LastChangedBy: orbiter $
// //
// LICENSE
//
// This program is free software; you can redistribute it and/or modify // This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by // it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or // the Free Software Foundation; either version 2 of the License, or
@ -23,50 +23,15 @@
// You should have received a copy of the GNU General Public License // You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software // along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// EURL - noticed (known but not loaded) URL's
package de.anomic.plasma; package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlEURL { public class plasmaCrawlEURL {
/* ======================================================================= /* =======================================================================
* Failure reason constants * Failure reason constants
* ======================================================================= */ * ======================================================================= */
// invalid urls // invalid urls
public static final String DENIED_URL_NULL = "denied_(url_null)"; public static final String DENIED_URL_NULL = "denied_(url_null)";
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)"; public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
@ -125,290 +90,5 @@ public class plasmaCrawlEURL {
// indexing errors // indexing errors
public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)"; public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)";
public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)"; public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)";
/* =======================================================================
* Other object variables
* ======================================================================= */
private LinkedList rejectedStack = new LinkedList(); // strings: url
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-256, " + // the url as string
"String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-4 {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-2 {b64e}, " + // number of load retries
"String failcause-80, " + // string describing load failure
"byte[] flags-2", // extra space
kelondroBase64Order.enhancedCoder,
0);
// the class object
private kelondroIndex urlIndexFile = null;
public plasmaCrawlEURL(File cachePath, long preloadTime) {
super();
String newCacheName = "urlErr3.table";
cachePath.mkdirs();
try {
urlIndexFile = new kelondroFlexTable(cachePath, newCacheName, preloadTime, rowdef);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
}
public int size() {
try {
return urlIndexFile.size() ;
} catch (IOException e) {
return 0;
}
}
public void close() {
if (urlIndexFile != null) {
urlIndexFile.close();
urlIndexFile = null;
}
}
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
String name, String failreason, kelondroBitfield flags) {
if ((referrer == null) || (referrer.length() < yacySeedDB.commonHashLength)) referrer = plasmaURL.dummyHash;
if ((initiator == null) || (initiator.length() < yacySeedDB.commonHashLength)) initiator = plasmaURL.dummyHash;
if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash;
if (failreason == null) failreason = "unknown";
return new Entry(url, referrer, initiator, executor, name, failreason, flags);
}
public boolean remove(String hash) {
if (hash == null) return false;
try {
urlIndexFile.remove(hash.getBytes());
return true;
} catch (IOException e) {
return false;
}
}
public synchronized void stackPushEntry(Entry e) {
rejectedStack.add(e.hash);
}
public Entry stackPopEntry(int pos) throws IOException {
String urlhash = (String) rejectedStack.get(pos);
if (urlhash == null) return null;
return new Entry(urlhash);
}
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
}
public boolean getUseNewDB() {
return (urlIndexFile instanceof kelondroFlexTable);
}
public boolean exists(String urlHash) {
try {
return urlIndexFile.has(urlHash.getBytes());
} catch (IOException e) {
return false;
}
}
public void clearStack() {
rejectedStack.clear();
}
public int stackSize() {
return rejectedStack.size();
}
public class Entry {
    // One record of the failure/tracking url index: a url together with the
    // peers involved in its crawl attempt, timing information and the reason
    // why the load failed.
    private String hash;            // the url's hash
    private String referrer;        // the url's referrer hash
    private String initiator;       // hash of the peer that initiated the crawl
    private String executor;        // hash of the peer that executed the crawl attempt
    private URL url;                // the url itself
    private String name;            // the name of the url, from anchor tag <a>name</a>
    private Date initdate;          // the time when the url first appeared
    private Date trydate;           // the time when the url was last tried to load
    private int trycount;           // number of load attempts so far
    private String failreason;      // string describing reason for load fail (may be null)
    private kelondroBitfield flags; // extra space
    private boolean stored;         // true once this entry has been written to urlIndexFile

    /**
     * Creates a fresh, not-yet-persisted entry; call store() to write it
     * to the index. referrer and name may be null; failreason may be null
     * when no failure is recorded yet.
     */
    public Entry(URL url, String referrer, String initiator,
                 String executor, String name, String failreason, kelondroBitfield flags) {
        // create new entry
        this.hash = plasmaURL.urlHash(url);
        this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
        this.initiator = initiator;
        this.executor = executor;
        this.url = url;
        this.name = (name == null) ? "" : name; // keep store() null-safe
        this.initdate = new Date();
        this.trydate = new Date();
        this.trycount = 0;
        this.failreason = failreason;
        this.flags = flags;
        this.stored = false;
    }

    /**
     * Reads an entry from the index using its url hash.
     * NOTE(review): when the hash is not found the row lookup yields null
     * and all fields except the hash stay unset, but the entry is still
     * marked as stored — original behavior, kept as-is; confirm callers
     * check url() for null before use.
     */
    public Entry(String hash) throws IOException {
        this.hash = hash;
        kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
        if (entry != null) {
            insertEntry(entry);
        }
        this.stored = true;
    }

    /** Wraps an already-loaded index row; the result is not marked as stored. */
    public Entry(kelondroRow.Entry entry) throws IOException {
        insertEntry(entry);
        this.stored = false;
    }

    /** Decodes one index row into the object fields. */
    private void insertEntry(kelondroRow.Entry entry) throws IOException {
        assert (entry != null);
        this.hash = entry.getColString(0, null);
        this.referrer = entry.getColString(1, "UTF-8");
        this.initiator = entry.getColString(2, "UTF-8");
        this.executor = entry.getColString(3, "UTF-8");
        this.url = new URL(entry.getColString(4, "UTF-8").trim());
        String n = entry.getColString(5, "UTF-8");
        this.name = (n == null) ? "" : n.trim();
        // dates are persisted with day resolution (see store(): time / 86400000)
        this.initdate = new Date(86400000 * entry.getColLong(6));
        this.trydate = new Date(86400000 * entry.getColLong(7));
        this.trycount = (int) entry.getColLong(8);
        this.failreason = entry.getColString(9, "UTF-8");
        this.flags = new kelondroBitfield(entry.getColBytes(10));
        return;
    }

    /**
     * Persists the object fields into the url index. No-op when the entry
     * was already stored or carries no hash; an existing row for the same
     * hash is simply overwritten. Nullable string fields are written as
     * empty strings to avoid NullPointerExceptions.
     */
    public void store() {
        // stores the values from the object variables into the database
        if (this.stored) return;
        if (this.hash == null) return;
        String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, rowdef.width(6));
        String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, rowdef.width(7));
        // store the hash in the hash cache
        try {
            // even if the entry exists, we simply overwrite it
            byte[][] entry = new byte[][] {
                this.hash.getBytes(),
                this.referrer.getBytes(),
                ((this.initiator == null) ? "" : this.initiator).getBytes(),
                ((this.executor == null) ? "" : this.executor).getBytes(),
                this.url.toString().getBytes(),
                ((this.name == null) ? "" : this.name).getBytes(),
                initdatestr.getBytes(),
                trydatestr.getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, rowdef.width(8)).getBytes(),
                ((this.failreason == null) ? "" : this.failreason).getBytes(), // failreason may be null
                this.flags.bytes()
            };
            urlIndexFile.put(urlIndexFile.row().newEntry(entry));
            this.stored = true;
        } catch (IOException e) {
            System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
        }
    }

    public String hash() {
        // return a url-hash, based on the md5 algorithm
        // the result is a String of 12 bytes within a 72-bit space
        // (each byte has an 6-bit range)
        // that should be enough for all web pages on the world
        return this.hash;
    }

    public String referrer() {
        // hash of the referring url
        return this.referrer;
    }

    public URL url() {
        return url;
    }

    public Date initdate() {
        // the time when the url first appeared
        // FIX: this previously returned trydate by mistake
        return initdate;
    }

    public Date trydate() {
        // the time of the last load attempt
        return trydate;
    }

    public String initiator() {
        // hash of the peer that initiated the crawl
        return initiator;
    }

    public String executor() {
        // hash of the peer that executed the crawl attempt
        return executor;
    }

    public String name() {
        // the anchor name of the url (text inside the <a> tag)
        return name;
    }

    public String failreason() {
        // description of why the load failed, may be null
        return failreason;
    }
}
public class kiter implements Iterator {
    // iterates over Entry objects decoded from the rows of the url index file
    Iterator i;
    boolean error = false;

    public kiter(boolean up, String firstHash) throws IOException {
        i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes());
        error = false;
    }

    public boolean hasNext() {
        // NOTE(review): error is never set to true anywhere visible here,
        // so this guard is currently inert — kept for compatibility
        return error ? false : i.hasNext();
    }

    public Object next() throws RuntimeException {
        kelondroRow.Entry row = (kelondroRow.Entry) i.next();
        if (row == null) return null;
        try {
            return new Entry(row);
        } catch (IOException ex) {
            throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + row.getColString(0, null));
        }
    }

    public void remove() {
        i.remove();
    }
}
public Iterator entries(boolean up, String firstHash) throws IOException {
    // enumerates entry elements
    // returns a kiter over Entry objects from the url index file, walking
    // up or down and starting at firstHash (or at the edge when null)
    return new kiter(up, firstHash);
}
} }

@ -0,0 +1,238 @@
// plasmaCrawlBalancerEntry.java
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 14.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlEntry {
    // A crawl queue entry: one url with its crawl provenance (initiator,
    // referrer), scheduling data (depth, anchors, forkfactor, profile) and
    // millisecond-precision timestamps. Serializes to/from a kelondroRow
    // for storage in the balancer-managed NURL indexes.

    // row definition for balancer-related NURL-entries
    public final static kelondroRow rowdef = new kelondroRow(
        "String urlhash-" + yacySeedDB.commonHashLength + ", " +   // the url's hash
        "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
        "String urlstring-256, " +                                 // the url as string
        "String refhash-" + yacySeedDB.commonHashLength + ", " +   // the url's referrer hash
        "String urlname-80, " +                                    // the name of the url, from anchor tag <a>name</a>
        "Cardinal appdate-8 {b256}, " +                            // the time when the url was first time appeared
        "String profile-4, " +                                     // the name of the prefetch profile handle
        "Cardinal depth-2 {b256}, " +                              // the prefetch depth so far, starts at 0
        "Cardinal parentbr-3 {b256}, " +                           // number of anchors of the parent
        "Cardinal forkfactor-4 {b256}, " +                         // sum of anchors of all ancestors
        "byte[] flags-4, " +                                       // flags
        "String handle-4, " +                                      // extra handle
        "Cardinal loaddate-8 {b256}," +                            // time when the file was loaded
        "Cardinal serverdate-8 {b256}," +                          // time when that the server returned as document date
        "Cardinal modifiedSince-8 {b256}",                         // time that was given to server as ifModifiedSince
        kelondroBase64Order.enhancedCoder,
        0
    );

    private String initiator;     // the initiator hash, is NULL or "" if it is the own proxy;
                                  // if this is generated by a crawl, the own peer hash in entered
    private String urlhash;       // the url's hash
    private String referrer;      // the url's referrer hash
    private URL url;              // the url itself
    private String name;          // the name of the url, from anchor tag <a>name</a>
    private long appdate;         // the time when the url first appeared (ms)
    private long loaddate;        // the time when the url was loaded (ms)
    private long serverdate;      // the document date from the target server (ms)
    private long imsdate;         // the time of a ifModifiedSince request (ms)
    private String profileHandle; // the name of the prefetch profile
    private int depth;            // the prefetch depth so far, starts at 0
    private int anchors;          // number of anchors of the parent
    private int forkfactor;       // sum of anchors of all ancestors
    private kelondroBitfield flags;
    private int handle;

    /** Convenience constructor: a proxy-less entry initiated by this peer, appearing now. */
    public plasmaCrawlEntry(URL url) {
        this(yacyCore.seedDB.mySeed.hash, url, null, null, new Date(), null, 0, 0, 0);
    }

    /**
     * Creates a new crawl entry.
     * referrer and name may be null (defaulted to dummyHash / "");
     * appdate null means "epoch 0"; profileHandle must not be null for
     * entries that are to be crawled.
     */
    public plasmaCrawlEntry(
            String initiator,
            URL url,
            String referrer,
            String name,
            Date appdate,
            String profileHandle,
            int depth,
            int anchors,
            int forkfactor
    ) {
        // create new entry and store it into database
        this.urlhash = plasmaURL.urlHash(url);
        this.initiator = initiator;
        this.url = url;
        this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
        this.name = (name == null) ? "" : name;
        this.appdate = (appdate == null) ? 0 : appdate.getTime();
        this.profileHandle = profileHandle; // must not be null
        this.depth = depth;
        this.anchors = anchors;
        this.forkfactor = forkfactor;
        this.flags = new kelondroBitfield(rowdef.width(10));
        this.handle = 0;
        this.loaddate = 0;
        this.serverdate = 0;
        this.imsdate = 0;
    }

    /** Reconstructs an entry from a stored index row. */
    public plasmaCrawlEntry(kelondroRow.Entry entry) throws IOException {
        assert (entry != null);
        insertEntry(entry);
    }

    /** Decodes one index row into the object fields; throws when the url column is missing. */
    private void insertEntry(kelondroRow.Entry entry) throws IOException {
        String urlstring = entry.getColString(2, null);
        if (urlstring == null) throw new IOException ("url string is null");
        this.urlhash = entry.getColString(0, null);
        this.initiator = entry.getColString(1, null);
        this.url = new URL(urlstring);
        this.referrer = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, null);
        this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
        this.appdate = entry.getColLong(5);
        this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
        this.depth = (int) entry.getColLong(7);
        this.anchors = (int) entry.getColLong(8);
        this.forkfactor = (int) entry.getColLong(9);
        this.flags = new kelondroBitfield(entry.getColBytes(10));
        // FIX: guard against an empty handle column like the other optional
        // columns above; an unguarded parseInt would throw on such rows
        this.handle = (entry.empty(11)) ? 0 : Integer.parseInt(entry.getColString(11, null), 16);
        this.loaddate = entry.getColLong(12);
        this.serverdate = entry.getColLong(13);
        this.imsdate = entry.getColLong(14);
        return;
    }

    /** Renders the handle as a zero-padded hex string of the handle column width. */
    private static String normalizeHandle(int h) {
        String d = Integer.toHexString(h);
        while (d.length() < rowdef.width(11)) d = "0" + d;
        return d;
    }

    /** Encodes this entry into a kelondroRow.Entry matching rowdef. */
    public kelondroRow.Entry toRow() {
        byte[] appdatestr = kelondroNaturalOrder.encodeLong(appdate, rowdef.width(5));
        byte[] loaddatestr = kelondroNaturalOrder.encodeLong(loaddate, rowdef.width(12));
        byte[] serverdatestr = kelondroNaturalOrder.encodeLong(serverdate, rowdef.width(13));
        byte[] imsdatestr = kelondroNaturalOrder.encodeLong(imsdate, rowdef.width(14));
        // store the hash in the hash cache
        byte[] namebytes;
        try {
            namebytes = this.name.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            namebytes = this.name.getBytes(); // platform charset fallback
        }
        byte[][] entry = new byte[][] {
            this.urlhash.getBytes(),
            (initiator == null) ? "".getBytes() : this.initiator.getBytes(),
            this.url.toString().getBytes(),
            this.referrer.getBytes(),
            namebytes,
            appdatestr,
            (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
            kelondroNaturalOrder.encodeLong(this.depth, rowdef.width(7)),
            kelondroNaturalOrder.encodeLong(this.anchors, rowdef.width(8)),
            kelondroNaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)),
            this.flags.bytes(),
            normalizeHandle(this.handle).getBytes(),
            loaddatestr,
            serverdatestr,
            imsdatestr};
        return rowdef.newEntry(entry);
    }

    public URL url() {
        // the url
        return url;
    }

    public String urlhash() {
        // the hash of this url
        return this.urlhash;
    }

    public String referrerhash() {
        // the urlhash of a referer url
        return this.referrer;
    }

    public String initiator() {
        // returns the hash of the initiating peer, or null for proxy entries
        if (initiator == null) return null;
        if (initiator.length() == 0) return null;
        return initiator;
    }

    public boolean proxy() {
        // true when the url was retrieved using the proxy
        return (initiator() == null);
    }

    public Date appdate() {
        // the date when the url appeared first
        return new Date(appdate);
    }

    public Date loaddate() {
        // the date when the url was loaded
        return new Date(loaddate);
    }

    public Date serverdate() {
        // the date that the server returned as document date
        return new Date(serverdate);
    }

    public Date imsdate() {
        // the date that the client (browser) send as ifModifiedSince in proxy mode
        return new Date(imsdate);
    }

    public String name() {
        // return the anchor name (text inside <a> tag)
        return name;
    }

    public int depth() {
        // crawl depth where the url appeared
        return depth;
    }

    public String profileHandle() {
        // the handle of the crawl profile
        return profileHandle;
    }
}

@ -46,24 +46,9 @@ package de.anomic.plasma;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet; import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlNURL { public class plasmaCrawlNURL {
public static final int STACK_TYPE_NULL = 0; // do not stack public static final int STACK_TYPE_NULL = 0; // do not stack
@ -78,166 +63,33 @@ public class plasmaCrawlNURL {
private static final long minimumDelta = 500; // the minimum time difference between access of the same domain private static final long minimumDelta = 500; // the minimum time difference between access of the same domain
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
/**
* column length definition for the {@link plasmaURL#urlIndexFile} DB
*/
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-256, " + // the url as string
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"String profile-4, " + // the name of the prefetch profile handle
"Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-3 {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-4 {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-4, " + // flags
"String handle-4", // extra handle
kelondroBase64Order.enhancedCoder,
0
);
private kelondroIndex urlIndexFile = null;
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1
private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders
private kelondroStack imageStack; // links pointing to image resources //private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1
private kelondroStack movieStack; // links pointing to movie resources //private kelondroStack imageStack; // links pointing to image resources
private kelondroStack musicStack; // links pointing to music resources //private kelondroStack movieStack; // links pointing to movie resources
//private kelondroStack musicStack; // links pointing to music resources
private final HashSet imageStackIndex, movieStackIndex, musicStackIndex; // to find out if a specific link is already on any stack public plasmaCrawlNURL(File cachePath) {
private File cacheStacksPath;
private long preloadTime;
private initStackIndex initThead;
public plasmaCrawlNURL(File cachePath, long preloadTime) {
super(); super();
this.cacheStacksPath = cachePath; coreStack = new plasmaCrawlBalancer(cachePath, "urlNoticeCoreStack");
this.preloadTime = preloadTime; limitStack = new plasmaCrawlBalancer(cachePath, "urlNoticeLimitStack");
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
// create a stack for newly entered entries remoteStack = new plasmaCrawlBalancer(cachePath, "urlNoticeRemoteStack");
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
openHashCache();
File coreStackFile = new File(cachePath, "urlNoticeLocal0.stack");
File limitStackFile = new File(cachePath, "urlNoticeLimit0.stack");
File overhangStackFile = new File(cachePath, "urlNoticeOverhang0.stack");
File remoteStackFile = new File(cachePath, "urlNoticeRemote0.stack");
File imageStackFile = new File(cachePath, "urlNoticeImage0.stack");
File movieStackFile = new File(cachePath, "urlNoticeMovie0.stack");
File musicStackFile = new File(cachePath, "urlNoticeMusic0.stack");
coreStack = new plasmaCrawlBalancer(coreStackFile);
limitStack = new plasmaCrawlBalancer(limitStackFile);
overhangStack = new plasmaCrawlBalancer(overhangStackFile);
remoteStack = new plasmaCrawlBalancer(remoteStackFile);
kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0);
imageStack = kelondroStack.open(imageStackFile, rowdef);
movieStack = kelondroStack.open(movieStackFile, rowdef);
musicStack = kelondroStack.open(musicStackFile, rowdef);
// init stack Index
imageStackIndex = new HashSet();
movieStackIndex = new HashSet();
musicStackIndex = new HashSet();
(initThead = new initStackIndex()).start();
} }
public int size() { public int size() {
try { return coreStack.size() + limitStack.size() + remoteStack.size();
return urlIndexFile.size() ;
} catch (IOException e) {
return 0;
}
}
public void waitOnInitThread() {
try {
if (this.initThead != null) {
this.initThead.join();
}
} catch (NullPointerException e) {
} catch (InterruptedException e) {}
}
private void openHashCache() {
String newCacheName = "urlNotice5.table";
cacheStacksPath.mkdirs();
try {
urlIndexFile = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, rowdef), true, false);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
}
private void resetHashCache() {
if (urlIndexFile != null) {
urlIndexFile.close();
urlIndexFile = null;
File cacheFile = new File(cacheStacksPath, "urlNotice2.db");
cacheFile.delete();
}
openHashCache();
} }
public void close() { public void close() {
coreStack.close(); coreStack.close();
limitStack.close(); limitStack.close();
overhangStack.close(); //overhangStack.close();
remoteStack.close(); remoteStack.close();
imageStack.close();
movieStack.close();
musicStack.close();
if (urlIndexFile != null) {
urlIndexFile.close();
urlIndexFile = null;
}
}
public class initStackIndex extends Thread {
public void run() {
Iterator i;
try {
i = imageStack.iterator();
while (i.hasNext()) imageStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8"));
} catch (Exception e) {
imageStack = kelondroStack.reset(imageStack);
}
try {
i = movieStack.iterator();
while (i.hasNext()) movieStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8"));
} catch (Exception e) {
movieStack = kelondroStack.reset(movieStack);
}
try {
i = musicStack.iterator();
while (i.hasNext()) musicStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8"));
} catch (Exception e) {
musicStack = kelondroStack.reset(musicStack);
}
plasmaCrawlNURL.this.initThead = null;
}
}
public boolean remove(String hash) {
if (hash == null) return false;
try {
urlIndexFile.remove(hash.getBytes());
return true;
} catch (IOException e) {
return false;
}
} }
private static String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < rowdef.width(11)) d = "0" + d;
return d;
}
public int stackSize() { public int stackSize() {
// this does not count the overhang stack size // this does not count the overhang stack size
return coreStack.size() + limitStack.size() + remoteStack.size(); return coreStack.size() + limitStack.size() + remoteStack.size();
@ -247,11 +99,8 @@ public class plasmaCrawlNURL {
switch (stackType) { switch (stackType) {
case STACK_TYPE_CORE: return coreStack.size(); case STACK_TYPE_CORE: return coreStack.size();
case STACK_TYPE_LIMIT: return limitStack.size(); case STACK_TYPE_LIMIT: return limitStack.size();
case STACK_TYPE_OVERHANG: return overhangStack.size(); case STACK_TYPE_OVERHANG: return 0;
case STACK_TYPE_REMOTE: return remoteStack.size(); case STACK_TYPE_REMOTE: return remoteStack.size();
case STACK_TYPE_IMAGE: return imageStack.size();
case STACK_TYPE_MOVIE: return movieStack.size();
case STACK_TYPE_MUSIC: return musicStack.size();
default: return -1; default: return -1;
} }
} }
@ -260,111 +109,65 @@ public class plasmaCrawlNURL {
return return
coreStack.has(urlhash) || coreStack.has(urlhash) ||
limitStack.has(urlhash) || limitStack.has(urlhash) ||
overhangStack.has(urlhash) || //overhangStack.has(urlhash) ||
remoteStack.has(urlhash) || remoteStack.has(urlhash);
imageStackIndex.contains(urlhash) ||
movieStackIndex.contains(urlhash) ||
musicStackIndex.contains(urlhash);
}
public synchronized Entry newEntry(String initiator, URL url, Date loaddate,
String referrer, String name, String profile,
int depth, int anchors, int forkfactor) {
return new Entry(initiator, url, referrer, name, loaddate,
profile, depth, anchors, forkfactor);
} }
public synchronized Entry newEntry(Entry oldEntry) { public void push(int stackType, plasmaCrawlEntry entry) {
if (oldEntry == null) return null;
return new Entry(
oldEntry.initiator(),
oldEntry.url(),
oldEntry.referrerHash(),
oldEntry.name(),
oldEntry.loaddate(),
oldEntry.profileHandle(),
oldEntry.depth(),
oldEntry.anchors,
oldEntry.forkfactor
);
}
public void push(int stackType, String urlhash) {
try { try {
switch (stackType) { switch (stackType) {
case STACK_TYPE_CORE: case STACK_TYPE_CORE:
coreStack.push(urlhash); coreStack.push(entry);
break; break;
case STACK_TYPE_LIMIT: case STACK_TYPE_LIMIT:
limitStack.push(urlhash); limitStack.push(entry);
break;
case STACK_TYPE_OVERHANG:
overhangStack.push(urlhash);
break; break;
case STACK_TYPE_REMOTE: case STACK_TYPE_REMOTE:
remoteStack.push(urlhash); remoteStack.push(entry);
break;
case STACK_TYPE_IMAGE:
imageStack.push(imageStack.row().newEntry(new byte[][] {urlhash.getBytes()}));
imageStackIndex.add(urlhash);
break;
case STACK_TYPE_MOVIE:
movieStack.push(movieStack.row().newEntry(new byte[][] {urlhash.getBytes()}));
movieStackIndex.add(urlhash);
break;
case STACK_TYPE_MUSIC:
musicStack.push(musicStack.row().newEntry(new byte[][] {urlhash.getBytes()}));
musicStackIndex.add(urlhash);
break; break;
default: break; default: break;
} }
} catch (IOException er) {} } catch (IOException er) {}
} }
public Entry[] top(int stackType, int count) { public plasmaCrawlEntry get(String urlhash) {
plasmaCrawlEntry entry = null;
try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (IOException e) {}
return null;
}
public plasmaCrawlEntry remove(String urlhash) {
plasmaCrawlEntry entry = null;
try {if ((entry = coreStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = limitStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = remoteStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
return null;
}
public plasmaCrawlEntry[] top(int stackType, int count) {
switch (stackType) { switch (stackType) {
case STACK_TYPE_CORE: return top(coreStack, count); case STACK_TYPE_CORE: return top(coreStack, count);
case STACK_TYPE_LIMIT: return top(limitStack, count); case STACK_TYPE_LIMIT: return top(limitStack, count);
case STACK_TYPE_OVERHANG: return top(overhangStack, count);
case STACK_TYPE_REMOTE: return top(remoteStack, count); case STACK_TYPE_REMOTE: return top(remoteStack, count);
case STACK_TYPE_IMAGE: return top(imageStack, count);
case STACK_TYPE_MOVIE: return top(movieStack, count);
case STACK_TYPE_MUSIC: return top(musicStack, count);
default: return null; default: return null;
} }
} }
public Iterator iterator(int stackType) { public plasmaCrawlEntry pop(int stackType) throws IOException {
// returns an iterator of String objects
switch (stackType) {
case STACK_TYPE_CORE: return coreStack.iterator();
case STACK_TYPE_LIMIT: return limitStack.iterator();
case STACK_TYPE_OVERHANG: return overhangStack.iterator();
case STACK_TYPE_REMOTE: return remoteStack.iterator();
case STACK_TYPE_IMAGE: return imageStackIndex.iterator();
case STACK_TYPE_MOVIE: return movieStackIndex.iterator();
case STACK_TYPE_MUSIC: return musicStackIndex.iterator();
default: return null;
}
}
public Entry pop(int stackType) throws IOException {
switch (stackType) { switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack); case STACK_TYPE_CORE: return pop(coreStack);
case STACK_TYPE_LIMIT: return pop(limitStack); case STACK_TYPE_LIMIT: return pop(limitStack);
case STACK_TYPE_OVERHANG: return pop(overhangStack);
case STACK_TYPE_REMOTE: return pop(remoteStack); case STACK_TYPE_REMOTE: return pop(remoteStack);
case STACK_TYPE_IMAGE: return pop(imageStack);
case STACK_TYPE_MOVIE: return pop(movieStack);
case STACK_TYPE_MUSIC: return pop(musicStack);
default: return null; default: return null;
} }
} }
public void shift(int fromStack, int toStack) { public void shift(int fromStack, int toStack) {
try { try {
Entry entry = pop(fromStack); plasmaCrawlEntry entry = pop(fromStack);
push(toStack, entry.hash()); if (entry != null) push(toStack, entry);
} catch (IOException e) { } catch (IOException e) {
return; return;
} }
@ -374,329 +177,55 @@ public class plasmaCrawlNURL {
switch (stackType) { switch (stackType) {
case STACK_TYPE_CORE: coreStack.clear(); break; case STACK_TYPE_CORE: coreStack.clear(); break;
case STACK_TYPE_LIMIT: limitStack.clear(); break; case STACK_TYPE_LIMIT: limitStack.clear(); break;
case STACK_TYPE_OVERHANG: overhangStack.clear(); break;
case STACK_TYPE_REMOTE: remoteStack.clear(); break; case STACK_TYPE_REMOTE: remoteStack.clear(); break;
case STACK_TYPE_IMAGE: imageStack = kelondroStack.reset(imageStack); break;
case STACK_TYPE_MOVIE: movieStack = kelondroStack.reset(movieStack); break;
case STACK_TYPE_MUSIC: musicStack = kelondroStack.reset(musicStack); break;
default: return; default: return;
} }
} }
private Entry pop(kelondroStack stack) throws IOException { private plasmaCrawlEntry pop(plasmaCrawlBalancer balancer) throws IOException {
// this is a filo - pop
int s;
Entry entry;
kelondroRow.Entry re;
synchronized (stack) {
while ((s = stack.size()) > 0) {
re = stack.pop();
if (re == null) {
if (s > stack.size()) continue;
stack = kelondroStack.reset(stack); // the stack is not able to shrink
throw new IOException("hash is null, stack cannot shrink; reset of stack (1)");
}
try {
entry = new Entry(new String(re.getColBytes(0)));
} catch (IOException e) {
serverLog.logWarning("NURL", e.getMessage());
if (s > stack.size()) continue;
stack = kelondroStack.reset(stack); // the stack is not able to shrink
throw new IOException("hash is null, stack cannot shrink; reset of stack (2)");
}
imageStackIndex.remove(entry.hash);
movieStackIndex.remove(entry.hash);
musicStackIndex.remove(entry.hash);
return entry;
}
}
throw new IOException("crawl stack is empty");
}
private Entry pop(plasmaCrawlBalancer balancer) throws IOException {
// this is a filo - pop // this is a filo - pop
String hash;
int s; int s;
Entry entry; plasmaCrawlEntry entry;
synchronized (balancer) { synchronized (balancer) {
while ((s = balancer.size()) > 0) { while ((s = balancer.size()) > 0) {
hash = balancer.pop(minimumDelta, maximumDomAge); entry = balancer.pop(minimumDelta, maximumDomAge);
if (hash == null) { if (entry == null) {
if (s > balancer.size()) continue;
balancer.clear(); // the balancer is broken and cannot shrink
throw new IOException("hash is null, balancer cannot shrink; reset of balancer (1)");
}
try {
entry = new Entry(hash);
} catch (IOException e) {
serverLog.logWarning("NURL", e.getMessage());
if (s > balancer.size()) continue; if (s > balancer.size()) continue;
balancer.clear(); // the balancer is broken and cannot shrink balancer.clear(); // the balancer is broken and cannot shrink
throw new IOException("IO error, balancer cannot shrink: " + e.getMessage() + "; reset of balancer (2)"); throw new IOException("entry is null, balancer cannot shrink; reset of balancer");
} }
imageStackIndex.remove(entry.hash);
movieStackIndex.remove(entry.hash);
musicStackIndex.remove(entry.hash);
return entry; return entry;
} }
} }
throw new IOException("balancer stack is empty"); throw new IOException("balancer stack is empty");
} }
private Entry[] top(kelondroStack stack, int count) { private plasmaCrawlEntry[] top(plasmaCrawlBalancer balancer, int count) {
// this is a filo - top
if (count > stack.size()) count = stack.size();
ArrayList list = new ArrayList(count);
for (int i = 0; i < count; i++) {
try {
byte[] hash = stack.top(i).getColBytes(0);
list.add(new Entry(new String(hash)));
} catch (IOException e) {
continue;
}
}
return (Entry[]) list.toArray(new Entry[list.size()]);
}
private Entry[] top(plasmaCrawlBalancer balancer, int count) {
// this is a filo - top // this is a filo - top
if (count > balancer.size()) count = balancer.size(); if (count > balancer.size()) count = balancer.size();
ArrayList list = new ArrayList(count); ArrayList list = new ArrayList(count);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
try { try {
String urlhash = balancer.top(i); plasmaCrawlEntry entry = balancer.top(i);
if (urlhash == null) break; if (entry == null) break;
list.add(new Entry(urlhash)); list.add(entry);
} catch (IOException e) { } catch (IOException e) {
break; break;
} }
} }
return (Entry[])list.toArray(new Entry[list.size()]); return (plasmaCrawlEntry[]) list.toArray(new plasmaCrawlEntry[list.size()]);
}
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
} }
public class Entry { public Iterator iterator(int stackType) {
private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; // returns an iterator of plasmaCrawlBalancerEntry Objects
// if this is generated by a crawl, the own peer hash in entered try {switch (stackType) {
private String hash; // the url's hash case STACK_TYPE_CORE: return coreStack.iterator();
private String referrer; // the url's referrer hash case STACK_TYPE_LIMIT: return limitStack.iterator();
private URL url; // the url as string case STACK_TYPE_REMOTE: return remoteStack.iterator();
private String name; // the name of the url, from anchor tag <a>name</a> default: return null;
private Date loaddate; // the time when the url was first time appeared }} catch (IOException e) {
private String profileHandle; // the name of the prefetch profile return new HashSet().iterator();
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private kelondroBitfield flags;
private int handle;
private boolean stored;
public Entry(String initiator,
URL url,
String referrer,
String name,
Date loaddate,
String profileHandle,
int depth,
int anchors,
int forkfactor
) {
// create new entry and store it into database
this.hash = plasmaURL.urlHash(url);
this.initiator = initiator;
this.url = url;
this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
this.name = (name == null) ? "" : name;
this.loaddate = (loaddate == null) ? new Date() : loaddate;
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new kelondroBitfield(rowdef.width(10));
this.handle = 0;
this.stored = false;
}
public Entry(String hash) throws IOException {
    // Load an existing entry from the url index by its url hash.
    // To speed up the access, the url hashes are buffered in the hash cache.
    // Throws IOException when the hash is null or no record exists for it.
    if (hash == null) throw new IOException("hash is null");
    this.hash = hash;
    kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
    if (entry == null) {
        // signal that we found nothing (callers must not receive a half-initialized entry)
        throw new IOException("NURL: hash " + hash + " not found during initialization of entry object");
    }
    insertEntry(entry);
    this.stored = true; // this record originates from the database
}
public Entry(kelondroRow.Entry entry) throws IOException {
    // Wrap an existing index row. The entry is marked as not stored so that
    // a later store() call will (re-)write it to the index.
    assert (entry != null);
    insertEntry(entry);
    this.stored = false;
}
private void insertEntry(kelondroRow.Entry entry) throws IOException {
    // Unpack one index row into the object fields (inverse of store()).
    // Throws IOException when the row carries no url string.
    String urlText = entry.getColString(2, null);
    if (urlText == null) throw new IOException ("url string is null");
    this.hash = entry.getColString(0, null);
    this.initiator = entry.getColString(1, null);
    this.url = new URL(urlText);
    if (entry.empty(3)) {
        this.referrer = plasmaURL.dummyHash;
    } else {
        this.referrer = entry.getColString(3, null);
    }
    if (entry.empty(4)) {
        this.name = "";
    } else {
        this.name = entry.getColString(4, "UTF-8").trim();
    }
    // the load date is stored with day resolution only (days since epoch)
    this.loaddate = new Date(86400000 * entry.getColLong(5));
    if (entry.empty(6)) {
        this.profileHandle = null;
    } else {
        this.profileHandle = entry.getColString(6, null).trim();
    }
    this.depth = (int) entry.getColLong(7);
    this.anchors = (int) entry.getColLong(8);
    this.forkfactor = (int) entry.getColLong(9);
    this.flags = new kelondroBitfield(entry.getColBytes(10));
    // the handle was written as a zero-padded hex string; parse with radix 16
    this.handle = Integer.parseInt(entry.getColString(11, null), 16);
}
public void store() {
    // stores the values from the object variables into the database;
    // a no-op if this entry was already persisted (stored flag set)
    if (this.stored) return;
    // the load date is kept with day resolution only (days since epoch)
    String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(5));
    // store the hash in the hash cache
    try {
        // even if the entry exists, we simply overwrite it;
        // the column order here must match the layout read back by insertEntry()
        byte[][] entry = new byte[][] {
            this.hash.getBytes(),
            (initiator == null) ? "".getBytes() : this.initiator.getBytes(),
            this.url.toString().getBytes(),
            this.referrer.getBytes(),
            this.name.getBytes("UTF-8"),
            loaddatestr.getBytes(),
            (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
            kelondroBase64Order.enhancedCoder.encodeLong(this.depth, rowdef.width(7)).getBytes(),
            kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, rowdef.width(8)).getBytes(),
            kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, rowdef.width(9)).getBytes(),
            this.flags.bytes(),
            normalizeHandle(this.handle).getBytes()
        };
        // diagnostic output for a known inconsistency; the put below will then NPE
        if (urlIndexFile == null) System.out.println("urlHashCache is NULL");
        if ((urlIndexFile != null) && (urlIndexFile.row() == null)) System.out.println("row() is NULL");
        urlIndexFile.put(urlIndexFile.row().newEntry(entry));
        this.stored = true;
    } catch (IOException e) {
        // an unusable index is reset rather than left corrupt
        serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
        e.printStackTrace();
        resetHashCache();
    } catch (kelondroException e) {
        serverLog.logSevere("PLASMA", "plasmaCrawlNURL.store failed: " + e.toString() + ", resetting NURL-DB");
        e.printStackTrace();
        resetHashCache();
    }
}
public String toString() {
    // Render all fields in a fixed, pipe-separated order for monitor/debug output.
    StringBuffer s = new StringBuffer();
    s.append("hash: ").append((hash == null) ? "null" : hash).append(" | ");
    s.append("initiator: ").append((initiator == null) ? "null" : initiator).append(" | ");
    s.append("url: ").append((url == null) ? "null" : url.toString()).append(" | ");
    s.append("referrer: ").append((referrer == null) ? plasmaURL.dummyHash : referrer).append(" | ");
    s.append("name: ").append((name == null) ? "null" : name).append(" | ");
    s.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ");
    s.append("profile: ").append((profileHandle == null) ? "null" : profileHandle).append(" | ");
    s.append("depth: ").append(Integer.toString(depth)).append(" | ");
    s.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ");
    s.append("flags: ").append((flags == null) ? "null" : flags.exportB64());
    return s.toString();
}
/**
 * return a url-hash, based on the md5 algorithm
 * the result is a String of 12 bytes within a 72-bit space
 * (each byte has an 6-bit range)
 * that should be enough for all web pages on the world
 */
public String hash() {
    return this.hash;
}
// returns the hash of the initiating peer, or null when the entry was
// created by the own proxy (initiator unset or empty)
public String initiator() {
    if (initiator == null) return null;
    if (initiator.length() == 0) return null;
    return initiator;
}
// true if this url was generated by the own proxy (no initiating peer)
public boolean proxy() {
    return (initiator() == null);
}
// the hash of the url where this url was found as a link
public String referrerHash() {
    return this.referrer;
}
public URL url() {
    return url;
}
// the date when the url appeared for the first time
public Date loaddate() {
    return loaddate;
}
// the name of the url, taken from the anchor tag <a>name</a>
public String name() {
    return name;
}
// the crawl depth at which this url was found (0 = start url)
public int depth() {
    return depth;
}
// the handle of the crawl profile that controls fetching of this url
public String profileHandle() {
    return profileHandle;
}
}
public class kiter implements Iterator {
    // enumerates Entry objects by walking the rows of the url index
    Iterator i;       // underlying row iterator of urlIndexFile
    boolean error = false;
    public kiter(boolean up, String firstHash) throws IOException {
        // up: iteration direction; firstHash: start position (null = from the beginning)
        i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes());
        error = false;
    }
    public boolean hasNext() {
        if (error) return false;
        return i.hasNext();
    }
    public Object next() throws RuntimeException {
        // Iterator.next cannot declare IOException, so row parsing failures
        // are escalated as RuntimeException carrying the row's url hash
        kelondroRow.Entry e = (kelondroRow.Entry) i.next();
        if (e == null) return null;
        try {
            return new Entry(e);
        } catch (IOException ex) {
            throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
        }
    }
    public void remove() {
        i.remove();
    } }
}
public Iterator entries(boolean up, String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, firstHash);
} }
} }

@ -48,7 +48,6 @@ package de.anomic.plasma;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Date; import java.util.Date;
@ -61,8 +60,6 @@ import de.anomic.data.robotsParser;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry; import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexTable; import de.anomic.kelondro.kelondroFlexTable;
@ -171,7 +168,7 @@ public final class plasmaCrawlStacker {
try { try {
// getting a new message from the crawler queue // getting a new message from the crawler queue
checkInterruption(); checkInterruption();
stackCrawlMessage theMsg = this.queue.waitForMessage(); plasmaCrawlEntry theMsg = this.queue.waitForMessage();
if (theMsg != null) { if (theMsg != null) {
// getting a free session thread from the pool // getting a free session thread from the pool
@ -196,18 +193,18 @@ public final class plasmaCrawlStacker {
} }
public void enqueue( public void enqueue(
String nexturlString, URL nexturl,
String referrerString, String referrerhash,
String initiatorHash, String initiatorHash,
String name, String name,
Date loadDate, Date loadDate,
int currentdepth, int currentdepth,
plasmaCrawlProfile.entry profile) { plasmaCrawlProfile.entry profile) {
if (profile != null) try { if (profile != null) try {
this.queue.addMessage(new stackCrawlMessage( this.queue.addMessage(new plasmaCrawlEntry(
initiatorHash, initiatorHash,
nexturlString, nexturl,
referrerString, referrerhash,
name, name,
loadDate, loadDate,
profile.handle(), profile.handle(),
@ -220,7 +217,7 @@ public final class plasmaCrawlStacker {
} }
} }
public String dequeue(stackCrawlMessage theMsg) throws InterruptedException { public String dequeue(plasmaCrawlEntry theMsg) throws InterruptedException {
plasmaCrawlProfile.entry profile = this.sb.profiles.getEntry(theMsg.profileHandle()); plasmaCrawlProfile.entry profile = this.sb.profiles.getEntry(theMsg.profileHandle());
if (profile == null) { if (profile == null) {
@ -231,8 +228,8 @@ public final class plasmaCrawlStacker {
return stackCrawl( return stackCrawl(
theMsg.url().toString(), theMsg.url().toString(),
theMsg.referrerHash(), theMsg.referrerhash(),
theMsg.initiatorHash(), theMsg.initiator(),
theMsg.name(), theMsg.name(),
theMsg.loaddate(), theMsg.loaddate(),
theMsg.depth(), theMsg.depth(),
@ -424,175 +421,23 @@ public final class plasmaCrawlStacker {
// add the url into the crawling queue // add the url into the crawling queue
checkInterruption(); checkInterruption();
plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ plasmaCrawlEntry ne = new plasmaCrawlEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */ nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */ referrerHash, /* last url in crawling queue */
name, /* the anchor name */ name, /* load date */
loadDate, /* the anchor name */
(profile == null) ? null : profile.handle(), // profile must not be null! (profile == null) ? null : profile.handle(), // profile must not be null!
currentdepth, /*depth so far*/ currentdepth, /*depth so far*/
0, /*anchors, default value */ 0, /*anchors, default value */
0 /*forkfactor, default value */ 0 /*forkfactor, default value */
); );
ne.store();
this.sb.noticeURL.push( this.sb.noticeURL.push(
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/, ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/,
ne.hash()); ne);
return null; return null;
} }
public final class stackCrawlMessage {
    // Transport object for urls waiting in the crawl stacker queue.
    // Serialized via getBytes() into the column layout of plasmaCrawlNURL.rowdef
    // and deserialized by the (String, kelondroRow.Entry) constructor.
    private String initiator;     // the initiator hash, is NULL or "" if it is the own proxy;
                                  // if this is generated by a crawl, the own peer hash is entered
    String urlHash;               // the url's hash
    private String referrerHash;  // the url's referrer hash
    private String url;           // the url as string
    String name;                  // the name of the url, from anchor tag <a>name</a>
    private Date loaddate;        // the time when the url appeared for the first time
    private String profileHandle; // the name of the prefetch profile
    private int depth;            // the prefetch depth so far, starts at 0
    private int anchors;          // number of anchors of the parent
    private int forkfactor;       // sum of anchors of all ancestors
    private kelondroBitfield flags;
    private int handle;

    public stackCrawlMessage(
            String initiator,
            String urlString,
            String referrerUrlString,
            String name,
            Date loaddate,
            String profileHandle,
            int depth,
            int anchors,
            int forkfactor) {
        // create new entry; defaults are substituted for missing referrer/name/loaddate
        try {
            this.urlHash = plasmaURL.urlHash(urlString);
            this.initiator = initiator;
            this.url = urlString;
            this.referrerHash = (referrerUrlString == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerUrlString);
            this.name = (name == null) ? "" : name;
            this.loaddate = (loaddate == null) ? new Date() : loaddate;
            this.profileHandle = profileHandle; // must not be null
            this.depth = depth;
            this.anchors = anchors;
            this.forkfactor = forkfactor;
            this.flags = new kelondroBitfield();
            this.handle = 0;
        } catch (Exception e) {
            // NOTE(review): failures are only logged; the message may stay
            // partially initialized — consider rethrowing in a future revision
            e.printStackTrace();
        }
    }

    public stackCrawlMessage(String urlHash, kelondroRow.Entry entry) {
        // deserialize from a database row; see getBytes() for the column layout
        if (urlHash == null) throw new NullPointerException("Url hash was null");
        if (entry == null) throw new NullPointerException("kelondroRow.Entry was null");
        try {
            this.urlHash = urlHash;
            this.initiator = entry.getColString(1, "UTF-8");
            this.url = entry.getColString(2, "UTF-8").trim();
            this.referrerHash = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, "UTF-8");
            this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
            this.loaddate = new Date(86400000 * entry.getColLong(5)); // stored with day resolution
            this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, "UTF-8").trim();
            this.depth = (int) entry.getColLong(7);
            this.anchors = (int) entry.getColLong(8);
            this.forkfactor = (int) entry.getColLong(9);
            this.flags = new kelondroBitfield(entry.getColBytes(10));
            try {
                // fix: the handle is written as hex by normalizeHandle
                // (Integer.toHexString), so it must be parsed with radix 16
                this.handle = Integer.parseInt(new String(entry.getColBytes(11), "UTF-8"), 16);
            } catch (NumberFormatException ee) {
                System.out.println("BUG in stackCrawlMessage. entry = " + entry.toString());
                throw new RuntimeException(ee.getMessage());
            }
        } catch (Exception e) {
            e.printStackTrace();
            throw new IllegalStateException(e.toString());
        }
    }

    public String url() {
        return this.url;
    }

    public String referrerHash() {
        return this.referrerHash;
    }

    public String initiatorHash() {
        // null means: the own proxy is the initiator
        if (this.initiator == null) return null;
        if (this.initiator.length() == 0) return null;
        return this.initiator;
    }

    public Date loaddate() {
        return this.loaddate;
    }

    public String name() {
        // the name of the url, from the anchor tag <a>name</a>
        return this.name;
    }

    public int depth() {
        return this.depth;
    }

    public String profileHandle() {
        return this.profileHandle;
    }

    public String toString() {
        // pipe-separated field dump for monitor/debug output
        StringBuffer str = new StringBuffer();
        str.append("urlHash: ").append(urlHash==null ? "null" : urlHash).append(" | ")
           .append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
           .append("url: ").append(url==null?"null":url).append(" | ")
           .append("referrer: ").append((referrerHash == null) ? plasmaURL.dummyHash : referrerHash).append(" | ")
           .append("name: ").append((name == null) ? "null" : name).append(" | ")
           .append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
           .append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
           .append("depth: ").append(Integer.toString(depth)).append(" | ")
           .append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
           //.append("flags: ").append((flags==null) ? "null" : flags.toString())
           ;
        return str.toString();
    }

    public byte[][] getBytes() {
        // serializes the object fields into the column layout of
        // plasmaCrawlNURL.rowdef (inverse of the row constructor above)
        String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, plasmaCrawlNURL.rowdef.width(5));
        // even if the entry exists, we simply overwrite it
        byte[][] entry = null;
        try {
            entry = new byte[][] {
                this.urlHash.getBytes(),
                (this.initiator == null) ? "".getBytes() : this.initiator.getBytes(),
                this.url.getBytes(),
                this.referrerHash.getBytes(),
                this.name.getBytes("UTF-8"),
                loaddatestr.getBytes(),
                (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(this.depth, plasmaCrawlNURL.rowdef.width(7)).getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, plasmaCrawlNURL.rowdef.width(8)).getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, plasmaCrawlNURL.rowdef.width(9)).getBytes(),
                this.flags.bytes(),
                normalizeHandle(this.handle).getBytes()
            };
        } catch (UnsupportedEncodingException e) { /* UTF-8 is always available */ }
        return entry;
    }

    private String normalizeHandle(int h) {
        // zero-padded hex representation, width as defined by column 11
        String d = Integer.toHexString(h);
        while (d.length() < plasmaCrawlNURL.rowdef.width(11)) d = "0" + d;
        return d;
    }
}
final class stackCrawlQueue { final class stackCrawlQueue {
private final serverSemaphore readSync; private final serverSemaphore readSync;
@ -657,10 +502,10 @@ public final class plasmaCrawlStacker {
// do nothing.. // do nothing..
} }
if (this.dbtype == QUEUE_DB_TYPE_FLEX) { if (this.dbtype == QUEUE_DB_TYPE_FLEX) {
kelondroFlexWidthArray.delete(cacheStacksPath, "urlPreNotice2.table"); kelondroFlexWidthArray.delete(cacheStacksPath, "urlNoticeStacker7.db");
} }
if (this.dbtype == QUEUE_DB_TYPE_TREE) { if (this.dbtype == QUEUE_DB_TYPE_TREE) {
File cacheFile = new File(cacheStacksPath, "urlPreNotice.db"); File cacheFile = new File(cacheStacksPath, "urlNoticeStacker.db");
cacheFile.delete(); cacheFile.delete();
} }
} }
@ -669,19 +514,19 @@ public final class plasmaCrawlStacker {
if (!(cacheStacksPath.exists())) cacheStacksPath.mkdir(); // make the path if (!(cacheStacksPath.exists())) cacheStacksPath.mkdir(); // make the path
if (this.dbtype == QUEUE_DB_TYPE_RAM) { if (this.dbtype == QUEUE_DB_TYPE_RAM) {
this.urlEntryCache = new kelondroRowSet(plasmaCrawlNURL.rowdef, 0); this.urlEntryCache = new kelondroRowSet(plasmaCrawlEntry.rowdef, 0);
} }
if (this.dbtype == QUEUE_DB_TYPE_FLEX) { if (this.dbtype == QUEUE_DB_TYPE_FLEX) {
String newCacheName = "urlPreNotice2.table"; String newCacheName = "urlNoticeStacker7.db";
cacheStacksPath.mkdirs(); cacheStacksPath.mkdirs();
try { try {
this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlNURL.rowdef), true, false); this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef), true, false);
} catch (Exception e) { } catch (Exception e) {
e.printStackTrace(); e.printStackTrace();
// kill DB and try again // kill DB and try again
kelondroFlexTable.delete(cacheStacksPath, newCacheName); kelondroFlexTable.delete(cacheStacksPath, newCacheName);
try { try {
this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlNURL.rowdef), true, false); this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef), true, false);
} catch (Exception ee) { } catch (Exception ee) {
ee.printStackTrace(); ee.printStackTrace();
System.exit(-1); System.exit(-1);
@ -689,10 +534,10 @@ public final class plasmaCrawlStacker {
} }
} }
if (this.dbtype == QUEUE_DB_TYPE_TREE) { if (this.dbtype == QUEUE_DB_TYPE_TREE) {
File cacheFile = new File(cacheStacksPath, "urlPreNotice.db"); File cacheFile = new File(cacheStacksPath, "urlNoticeStacker.db");
cacheFile.getParentFile().mkdirs(); cacheFile.getParentFile().mkdirs();
try { try {
this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, preloadTime, plasmaCrawlNURL.rowdef), true, true); this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, preloadTime, plasmaCrawlEntry.rowdef), true, true);
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
System.exit(-1); System.exit(-1);
@ -708,7 +553,7 @@ public final class plasmaCrawlStacker {
this.urlEntryHashCache.clear(); this.urlEntryHashCache.clear();
} }
public void addMessage(stackCrawlMessage newMessage) public void addMessage(plasmaCrawlEntry newMessage)
throws InterruptedException, IOException { throws InterruptedException, IOException {
if (newMessage == null) throw new NullPointerException(); if (newMessage == null) throw new NullPointerException();
@ -717,9 +562,9 @@ public final class plasmaCrawlStacker {
boolean insertionDoneSuccessfully = false; boolean insertionDoneSuccessfully = false;
synchronized(this.urlEntryHashCache) { synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue = this.urlEntryCache.put(this.urlEntryCache.row().newEntry(newMessage.getBytes())); kelondroRow.Entry oldValue = this.urlEntryCache.put(newMessage.toRow());
if (oldValue == null) { if (oldValue == null) {
insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.urlHash); insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.urlhash());
} }
} }
@ -741,7 +586,7 @@ public final class plasmaCrawlStacker {
return this.dbtype; return this.dbtype;
} }
public stackCrawlMessage waitForMessage() throws InterruptedException, IOException { public plasmaCrawlEntry waitForMessage() throws InterruptedException, IOException {
this.readSync.P(); this.readSync.P();
this.writeSync.P(); this.writeSync.P();
@ -759,7 +604,7 @@ public final class plasmaCrawlStacker {
} }
if ((urlHash == null) || (entry == null)) return null; if ((urlHash == null) || (entry == null)) return null;
return new stackCrawlMessage(urlHash, entry); return new plasmaCrawlEntry(entry);
} }
} }
@ -941,7 +786,7 @@ public final class plasmaCrawlStacker {
private boolean running = false; private boolean running = false;
private boolean stopped = false; private boolean stopped = false;
private boolean done = false; private boolean done = false;
private stackCrawlMessage theMsg; private plasmaCrawlEntry theMsg;
public Worker(ThreadGroup theThreadGroup) { public Worker(ThreadGroup theThreadGroup) {
super(theThreadGroup,"stackCrawlThread_created"); super(theThreadGroup,"stackCrawlThread_created");
@ -963,7 +808,7 @@ public final class plasmaCrawlStacker {
} }
} }
public synchronized void execute(stackCrawlMessage newMsg) { public synchronized void execute(plasmaCrawlEntry newMsg) {
this.theMsg = newMsg; this.theMsg = newMsg;
this.done = false; this.done = false;
@ -1020,7 +865,7 @@ public final class plasmaCrawlStacker {
private void execute() throws InterruptedException { private void execute() throws InterruptedException {
try { try {
this.setName("stackCrawlThread_" + this.theMsg.url); this.setName("stackCrawlThread_" + this.theMsg.url());
String rejectReason = dequeue(this.theMsg); String rejectReason = dequeue(this.theMsg);
// check for interruption // check for interruption
@ -1028,15 +873,9 @@ public final class plasmaCrawlStacker {
// if the url was rejected we store it into the error URL db // if the url was rejected we store it into the error URL db
if (rejectReason != null) { if (rejectReason != null) {
plasmaCrawlEURL.Entry ee = sb.errorURL.newEntry( plasmaCrawlZURL.Entry ee = sb.errorURL.newEntry(
new URL(this.theMsg.url()), this.theMsg, yacyCore.seedDB.mySeed.hash, null,
this.theMsg.referrerHash(), 0, rejectReason);
this.theMsg.initiatorHash(),
yacyCore.seedDB.mySeed.hash,
this.theMsg.name,
rejectReason,
new kelondroBitfield()
);
ee.store(); ee.store();
sb.errorURL.stackPushEntry(ee); sb.errorURL.stackPushEntry(ee);
} }

@ -0,0 +1,274 @@
// plasmaCrawlZURL.java
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 15.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlZURL {

    // ZURL is a persistent container for plasmaCrawlEntry objects plus
    // tracking information: which peer worked on the url, when, how often,
    // and why it failed (or to whom it was delegated). It backs the
    // errorURL and delegatedURL indexes.

    public final static kelondroRow rowdef = new kelondroRow(
            "String urlhash-" + yacySeedDB.commonHashLength + ", " +  // the url's hash
            "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
            "Cardinal workdate-8 {b256}, " +                          // the time when the url was last time tried to load
            "Cardinal workcount-4 {b256}, " +                         // number of load retries
            "String anycause-80, " +                                  // string describing load failure
            "byte[] entry-" + plasmaCrawlEntry.rowdef.objectsize(),   // the serialized plasmaCrawlEntry
            kelondroBase64Order.enhancedCoder,
            0);

    // the class object
    private kelondroIndex urlIndexFile = null;
    private LinkedList rejectedStack = new LinkedList(); // strings: url hashes, in push order

    public plasmaCrawlZURL(File cachePath, String tablename) {
        // open (or create) the backing FlexTable below cachePath
        cachePath.mkdirs();
        try {
            urlIndexFile = new kelondroFlexTable(cachePath, tablename, -1, rowdef);
        } catch (IOException e) {
            // the index is essential; without it the peer cannot operate
            e.printStackTrace();
            System.exit(-1);
        }
    }

    public int size() {
        // best-effort: an unreadable index is reported as empty
        try {
            return urlIndexFile.size();
        } catch (IOException e) {
            return 0;
        }
    }

    public void close() {
        if (urlIndexFile != null) {
            urlIndexFile.close();
            urlIndexFile = null;
        }
    }

    public synchronized Entry newEntry(
            plasmaCrawlEntry bentry, String executor, Date workdate,
            int workcount, String anycause) {
        // create a tracking entry for an existing crawl entry; invalid
        // executor hashes and missing causes are replaced by defaults
        if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash;
        if (anycause == null) anycause = "unknown";
        return new Entry(bentry, executor, workdate, workcount, anycause);
    }

    public synchronized Entry newEntry(URL url, String anycause) {
        return new Entry(url, anycause);
    }

    public boolean remove(String hash) {
        // returns false when the hash is null or the removal failed
        if (hash == null) return false;
        try {
            urlIndexFile.remove(hash.getBytes());
            return true;
        } catch (IOException e) {
            return false;
        }
    }

    public synchronized void stackPushEntry(Entry e) {
        // remembers only the hash; the entry itself must be store()d to be
        // retrievable again via stackPopEntry/getEntry
        rejectedStack.add(e.hash());
    }

    public Entry stackPopEntry(int pos) throws IOException {
        // NOTE: despite the name this does not remove the element from the stack
        String urlhash = (String) rejectedStack.get(pos);
        if (urlhash == null) return null;
        return getEntry(urlhash);
    }

    public synchronized Entry getEntry(String hash) throws IOException {
        // returns null when the hash is unknown
        // (fix: formerly a half-initialized Entry with a null bentry was
        // returned for unknown hashes, causing NPEs in callers)
        if ((hash == null) || (!urlIndexFile.has(hash.getBytes()))) return null;
        return new Entry(hash);
    }

    public boolean getUseNewDB() {
        return (urlIndexFile instanceof kelondroFlexTable);
    }

    public boolean exists(String urlHash) {
        if (urlHash == null) return false;
        try {
            return urlIndexFile.has(urlHash.getBytes());
        } catch (IOException e) {
            return false;
        }
    }

    public void clearStack() {
        rejectedStack.clear();
    }

    public int stackSize() {
        return rejectedStack.size();
    }

    public class Entry {

        plasmaCrawlEntry bentry;    // the balancer entry
        private String   executor;  // the crawling initiator
        private Date     workdate;  // the time when the url was last time tried to load
        private int      workcount; // number of tryings
        private String   anycause;  // string describing reason for load fail
        private boolean  stored;    // true if this entry is already persisted

        public Entry(URL url, String reason) {
            this(new plasmaCrawlEntry(url), null, new Date(), 0, reason);
        }

        public Entry(
                plasmaCrawlEntry bentry, String executor, Date workdate,
                int workcount, String anycause) {
            // create new entry; null values are replaced by defaults
            this.bentry = bentry;
            this.executor = (executor == null) ? yacyCore.seedDB.mySeed.hash : executor;
            this.workdate = (workdate == null) ? new Date() : workdate;
            this.workcount = workcount;
            this.anycause = (anycause == null) ? "" : anycause;
            stored = false;
        }

        public Entry(String hash) throws IOException {
            // load an existing entry from the index; fix: formerly a missing
            // hash left all fields null but marked the entry as stored
            kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
            if (entry == null) throw new IOException("ZURL: hash " + hash + " not found in index");
            insertEntry(entry);
            this.stored = true;
        }

        public Entry(kelondroRow.Entry entry) throws IOException {
            insertEntry(entry);
            this.stored = false;
        }

        private void insertEntry(kelondroRow.Entry entry) throws IOException {
            // unpack one index row into the object fields (inverse of store())
            assert (entry != null);
            this.executor = entry.getColString(1, "UTF-8");
            this.workdate = new Date(entry.getColLong(2));
            this.workcount = (int) entry.getColLong(3);
            this.anycause = entry.getColString(4, "UTF-8");
            this.bentry = new plasmaCrawlEntry(plasmaCrawlEntry.rowdef.newEntry(entry.getColBytes(5)));
            assert ((new String(entry.getColBytes(0))).equals(bentry.urlhash()));
        }

        public void store() {
            // stores the values from the object variables into the database;
            // a no-op if already stored or if no balancer entry is attached
            if (this.stored) return;
            if (this.bentry == null) return;
            kelondroRow.Entry newrow = rowdef.newEntry();
            newrow.setCol(0, this.bentry.urlhash().getBytes());
            newrow.setCol(1, this.executor.getBytes());
            newrow.setCol(2, this.workdate.getTime());
            newrow.setCol(3, this.workcount);
            newrow.setCol(4, this.anycause.getBytes());
            newrow.setCol(5, this.bentry.toRow().bytes());
            try {
                urlIndexFile.put(newrow);
                this.stored = true;
            } catch (IOException e) {
                // fix: former message wrongly referenced plasmaEURL:url2hash
                System.out.println("INTERNAL ERROR AT plasmaCrawlZURL:store:" + e.toString());
            }
        }

        public URL url() {
            return this.bentry.url();
        }

        public String initiator() {
            return this.bentry.initiator();
        }

        public String hash() {
            // return a url-hash, based on the md5 algorithm
            // the result is a String of 12 bytes within a 72-bit space
            // (each byte has an 6-bit range)
            // that should be enough for all web pages on the world
            return this.bentry.urlhash();
        }

        public Date workdate() {
            return workdate;
        }

        public String executor() {
            // return the hash of the peer that executed the work on this url
            return executor;
        }

        public String anycause() {
            return anycause;
        }
    }

    public class kiter implements Iterator {
        // enumerates Entry objects by walking the rows of the url index
        Iterator i;
        boolean error = false;

        public kiter(boolean up, String firstHash) throws IOException {
            // up: iteration direction; firstHash: start position (null = begin)
            i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes());
            error = false;
        }

        public boolean hasNext() {
            if (error) return false;
            return i.hasNext();
        }

        public Object next() throws RuntimeException {
            // Iterator.next cannot declare IOException, so row parsing
            // failures are escalated as RuntimeException
            kelondroRow.Entry e = (kelondroRow.Entry) i.next();
            if (e == null) return null;
            try {
                return new Entry(e);
            } catch (IOException ex) {
                throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
            }
        }

        public void remove() {
            i.remove();
        }
    }

    public Iterator entries(boolean up, String firstHash) throws IOException {
        // enumerates entry elements
        return new kiter(up, firstHash);
    }
}

@ -454,7 +454,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (rcLocal == null) return; if (rcLocal == null) return;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis()); plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis());
preorder.remove(true, true); if (preorder.filteredCount()> query.wantedResults) preorder.remove(true, true);
// start url-fetch // start url-fetch
indexRWIEntryNew entry; indexRWIEntryNew entry;

@ -512,6 +512,7 @@ public class plasmaSnippetCache {
maxLength = maxLength - result.length(); maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20; if (maxLength < 20) maxLength = 20;
tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength); tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
if (tsr == null) return null;
String nextSnippet = (String) tsr[0]; String nextSnippet = (String) tsr[0];
if (nextSnippet == null) return tsr; if (nextSnippet == null) return tsr;
return new Object[]{result + (" / " + nextSnippet), tsr[1]}; return new Object[]{result + (" / " + nextSnippet), tsr[1]};

@ -209,7 +209,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public File workPath; public File workPath;
public HashMap rankingPermissions; public HashMap rankingPermissions;
public plasmaCrawlNURL noticeURL; public plasmaCrawlNURL noticeURL;
public plasmaCrawlEURL errorURL; public plasmaCrawlZURL errorURL, delegatedURL;
public plasmaWordIndex wordIndex; public plasmaWordIndex wordIndex;
public plasmaHTCache cacheManager; public plasmaHTCache cacheManager;
public plasmaSnippetCache snippetCache; public plasmaSnippetCache snippetCache;
@ -1038,8 +1038,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management // start indexing management
log.logConfig("Starting Indexing Management"); log.logConfig("Starting Indexing Management");
noticeURL = new plasmaCrawlNURL(plasmaPath, -1); noticeURL = new plasmaCrawlNURL(plasmaPath);
errorURL = new plasmaCrawlEURL(plasmaPath, -1); errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db");
delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db");
wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log); wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log);
// set a high maximum cache size to current size; this is adopted later automatically // set a high maximum cache size to current size; this is adopted later automatically
@ -1330,19 +1331,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// if it not exists, null is returned // if it not exists, null is returned
if (wordIndex.loadedURL.exists(hash)) return "loaded"; if (wordIndex.loadedURL.exists(hash)) return "loaded";
if (noticeURL.existsInStack(hash)) return "crawler"; if (noticeURL.existsInStack(hash)) return "crawler";
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors"; if (errorURL.exists(hash)) return "errors";
return null; return null;
} }
public URL getURL(String urlhash) throws IOException { public URL getURL(String urlhash) throws IOException {
if (urlhash.equals(plasmaURL.dummyHash)) return null; if (urlhash.equals(plasmaURL.dummyHash)) return null;
try { plasmaCrawlEntry ne = noticeURL.get(urlhash);
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); if (ne != null) return ne.url();
if (ne != null) return ne.url();
} catch (IOException e) {}
indexURLEntry le = wordIndex.loadedURL.load(urlhash, null); indexURLEntry le = wordIndex.loadedURL.load(urlhash, null);
if (le != null) return le.comp().url(); if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); plasmaCrawlZURL.Entry ee = delegatedURL.getEntry(urlhash);
if (ee != null) return ee.url();
ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url(); if (ee != null) return ee.url();
return null; return null;
} }
@ -1602,6 +1604,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
flushCitationReference(crg, "crg"); flushCitationReference(crg, "crg");
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)"); log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
noticeURL.close(); noticeURL.close();
delegatedURL.close();
errorURL.close(); errorURL.close();
wordIndex.close(); wordIndex.close();
yc.close(); yc.close();
@ -1739,6 +1742,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public int cleanupJobSize() { public int cleanupJobSize() {
int c = 0; int c = 0;
if ((delegatedURL.stackSize() > 1000)) c++;
if ((errorURL.stackSize() > 1000)) c++; if ((errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) { for (int i = 1; i <= 6; i++) {
if (wordIndex.loadedURL.getStackSize(i) > 1000) c++; if (wordIndex.loadedURL.getStackSize(i) > 1000) c++;
@ -1758,6 +1762,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
rankingOwnDistribution.transferRanking(count); rankingOwnDistribution.transferRanking(count);
rankingOtherDistribution.transferRanking(1); rankingOtherDistribution.transferRanking(1);
// clean up delegated stack
checkInterruption();
if ((delegatedURL.stackSize() > 1000)) {
log.logFine("Cleaning Delegated-URLs report stack, " + delegatedURL.stackSize() + " entries on stack");
delegatedURL.clearStack();
hasDoneSomething = true;
}
// clean up error stack // clean up error stack
checkInterruption(); checkInterruption();
if ((errorURL.stackSize() > 1000)) { if ((errorURL.stackSize() > 1000)) {
@ -1765,6 +1777,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
errorURL.clearStack(); errorURL.clearStack();
hasDoneSomething = true; hasDoneSomething = true;
} }
// clean up loadedURL stack // clean up loadedURL stack
for (int i = 1; i <= 6; i++) { for (int i = 1; i <= 6; i++) {
checkInterruption(); checkInterruption();
@ -1774,6 +1787,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
hasDoneSomething = true; hasDoneSomething = true;
} }
} }
// clean up profiles // clean up profiles
checkInterruption(); checkInterruption();
if (cleanProfiles()) hasDoneSomething = true; if (cleanProfiles()) hasDoneSomething = true;
@ -1883,7 +1897,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
// do a local crawl // do a local crawl
plasmaCrawlNURL.Entry urlEntry = null; plasmaCrawlEntry urlEntry = null;
while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) { while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try { try {
@ -1953,7 +1967,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try { try {
plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
String profileHandle = urlEntry.profileHandle(); String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling: // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@ -2040,7 +2054,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try { try {
plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
String profileHandle = urlEntry.profileHandle(); String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling: // System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + // profileHandle = " + profileHandle + ", urlEntry.url = " +
@ -2155,6 +2169,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
Map hl = document.getHyperlinks(); Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator(); Iterator i = hl.entrySet().iterator();
String nextUrlString; String nextUrlString;
URL nextUrl;
Map.Entry nextEntry; Map.Entry nextEntry;
while (i.hasNext()) { while (i.hasNext()) {
// check for interruption // check for interruption
@ -2164,10 +2179,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
nextEntry = (Map.Entry) i.next(); nextEntry = (Map.Entry) i.next();
nextUrlString = (String) nextEntry.getKey(); nextUrlString = (String) nextEntry.getKey();
try { try {
nextUrlString = new URL(nextUrlString).toNormalform(); nextUrl = new URL(nextUrlString);
// enqueue the hyperlink into the pre-notice-url db // enqueue the hyperlink into the pre-notice-url db
sbStackCrawlThread.enqueue(nextUrlString, entry.url().toString(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile()); sbStackCrawlThread.enqueue(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
} catch (MalformedURLException e1) {} } catch (MalformedURLException e1) {}
} }
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() + log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() +
@ -2447,11 +2462,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
// removing current entry from notice URL queue // removing current entry from notice URL queue
/*
boolean removed = noticeURL.remove(entry.urlHash()); // worked-off boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
if (!removed) { if (!removed) {
log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect."); log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
} }
*/
// explicit delete/free resources // explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) { if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
plasmaHTCache.filesInUse.remove(entry.cacheFile()); plasmaHTCache.filesInUse.remove(entry.cacheFile());
@ -2540,7 +2557,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
} }
} }
private void processLocalCrawling(plasmaCrawlNURL.Entry urlEntry, plasmaCrawlProfile.entry profile, String stats) { private void processLocalCrawling(plasmaCrawlEntry urlEntry, plasmaCrawlProfile.entry profile, String stats) {
// work off one Crawl stack entry // work off one Crawl stack entry
if ((urlEntry == null) || (urlEntry.url() == null)) { if ((urlEntry == null) || (urlEntry.url() == null)) {
log.logInfo(stats + ": urlEntry=null"); log.logInfo(stats + ": urlEntry=null");
@ -2549,114 +2566,117 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// convert the referrer hash into the corresponding URL // convert the referrer hash into the corresponding URL
URL refererURL = null; URL refererURL = null;
String refererHash = urlEntry.referrerHash(); String refererHash = urlEntry.referrerhash();
if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try { if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try {
refererURL = this.getURL(refererHash); refererURL = this.getURL(refererHash);
} catch (IOException e) { } catch (IOException e) {
refererURL = null; refererURL = null;
} }
cacheLoader.loadAsync(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile); cacheLoader.loadAsync(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]"); log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.urlhash() + "]");
return; return;
} }
private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.Entry urlEntry) { private boolean processRemoteCrawlTrigger(plasmaCrawlEntry urlEntry) {
// if this returns true, then the urlEntry is considered as stored somewhere and the case is finished
// if this returns false, the urlEntry will be enqueued to the local crawl again
// return true iff another peer has/will index(ed) the url // wrong access
if (urlEntry == null) { if (urlEntry == null) {
log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return true; // superfluous request; true correct in this context return true; // superfluous request; true correct in this context because the urlEntry shall not be tracked any more
}
// are we qualified?
if ((yacyCore.seedDB.mySeed == null) ||
(yacyCore.seedDB.mySeed.isJunior())) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
return false;
} }
// check url // check url
if (urlEntry.url() == null) { if (urlEntry.url() == null) {
log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name()); log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
return true; return true; // same case as above: no more consideration
} }
String urlhash = plasmaURL.urlHash(urlEntry.url());
// check remote crawl // are we qualified for a remote crawl?
yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlhash); if ((yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.isJunior())) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
return false; // no, we must crawl this page ourselves
}
// check if peer for remote crawl is available
yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlEntry.urlhash());
if (remoteSeed == null) { if (remoteSeed == null) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available"); log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
return false; return false;
} }
// do the request // do the request
HashMap page = null;
try { try {
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerHash()), 6000); page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerhash()), 6000);
} catch (IOException e1) {
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerhash(), e1);
return false;
}
// check success // check if we got contact to peer and the peer respondet
/* if ((page == null) || (page.get("delay") == null)) {
* the result of the 'response' value can have one of the following log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer.");
* values: negative cases, no retry denied - the peer does not want yacyCore.peerActions.peerDeparture(remoteSeed);
* to crawl that exception - an exception occurred return false; // no response from peer, we will crawl this ourself
* }
* negative case, retry possible rejected - the peer has rejected to
* process, but a re-try should be possible log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed="
* + remoteSeed.getName() + ", url=" + urlEntry.url().toString()
* positive case with crawling stacked - the resource is processed + ", response=" + page.toString()); // DEBUG
* asap
* // we received an answer and we are told to wait a specific time until we shall ask again for another crawl
* positive case without crawling double - the resource is already int newdelay = Integer.parseInt((String) page.get("delay"));
* in database, believed to be fresh and not reloaded the resource yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
* is also returned in lurl String response = (String) page.get("response");
*/ if (response.equals("stacked")) {
if ((page == null) || (page.get("delay") == null)) { // success, the remote peer accepted the crawl
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer."); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
if (remoteSeed != null) { + " PLACED URL=" + urlEntry.url().toString()
yacyCore.peerActions.peerDeparture(remoteSeed); + "; NEW DELAY=" + newdelay);
} // track this remote crawl
return false; this.delegatedURL.newEntry(urlEntry, remoteSeed.hash, new Date(), 0, response).store();
} else return true;
}
// check other cases: the remote peer may respond that it already knows that url
if (response.equals("double")) {
// in case the peer answers double, it transmits the complete lurl data
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr);
try { try {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG wordIndex.loadedURL.store(entry);
wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
int newdelay = Integer.parseInt((String) page.get("delay")); // noticeURL.remove(entry.hash());
yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay); } catch (IOException e) {
String response = (String) page.get("response"); // TODO Auto-generated catch block
if (response.equals("stacked")) { e.printStackTrace();
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay);
return true;
} else if (response.equals("double")) {
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr);
wordIndex.loadedURL.store(entry);
wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")");
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString());
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
} catch (Exception e) {
// wrong values
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e);
return false;
} }
} catch (IOException e) {
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerHash(), e); log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
return false; + " SUPERFLUOUS. CAUSE: " + page.get("reason")
+ " (URL=" + urlEntry.url().toString()
+ "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " REJECTED. CAUSE: bad lurl response / " + page.get("reason") + " (URL="
+ urlEntry.url().toString() + ")");
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
} }
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " DENIED. RESPONSE=" + response + ", CAUSE="
+ page.get("reason") + ", URL=" + urlEntry.url().toString());
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
} }
private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy"); private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");
@ -3165,20 +3185,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBitfield flags kelondroBitfield flags
) { ) {
// create a new errorURL DB entry // create a new errorURL DB entry
plasmaCrawlEURL.Entry ee = this.errorURL.newEntry( plasmaCrawlEntry bentry = new plasmaCrawlEntry(
url, initiator,
referrerHash, url,
initiator, referrerHash,
yacyCore.seedDB.mySeed.hash, (name == null) ? "" : name,
(name==null)?"":name, new Date(),
failreason, null,
flags 0,
); 0,
0);
plasmaCrawlZURL.Entry ee = this.errorURL.newEntry(
bentry, initiator, new Date(),
0, failreason);
// store the entry // store the entry
ee.store(); ee.store();
// push it onto the stack // push it onto the stack
this.errorURL.stackPushEntry(ee); this.errorURL.stackPushEntry(ee);
} }
public void checkInterruption() throws InterruptedException { public void checkInterruption() throws InterruptedException {
Thread curThread = Thread.currentThread(); Thread curThread = Thread.currentThread();

@ -253,20 +253,7 @@ public class plasmaSwitchboardQueue {
this.contentInfo = null; this.contentInfo = null;
this.referrerURL = null; this.referrerURL = null;
} }
public String toString() {
StringBuffer str = new StringBuffer();
str.append("url: ") .append(this.url==null ? "null" : this.url.toString()).append(" | ")
.append("referrer: ") .append(this.referrerHash==null?"null":this.referrerHash).append(" | ")
.append("ifModifiedSince: ").append(this.ifModifiedSince==null?"null":this.ifModifiedSince.toString()).append(" | ")
.append("flags: ") .append(Byte.toString(this.flags)).append(" | ")
.append("initiator: ") .append(this.initiator==null ? "null" : this.initiator).append(" | ")
.append("depth: ") .append(Integer.toString(this.depth)).append(" | ")
.append("profile: ") .append(this.profileHandle==null?"null":this.profileHandle).append(" | ")
.append("anchorName: ") .append(this.anchorName==null?"null":this.anchorName);
return str.toString();
}
public URL url() { public URL url() {
return url; return url;
} }

@ -84,9 +84,10 @@ import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL; import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexAssortment; import de.anomic.plasma.plasmaWordIndexAssortment;
@ -1011,11 +1012,11 @@ public final class yacy {
} }
if (source.equals("eurl")) { if (source.equals("eurl")) {
Iterator eiter = sb.errorURL.entries(true, null); Iterator eiter = sb.errorURL.entries(true, null);
plasmaCrawlEURL.Entry entry; plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
try { try {
entry = (plasmaCrawlEURL.Entry) eiter.next(); entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.failreason()); if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause());
} catch (Exception e) { } catch (Exception e) {
// here a MalformedURLException may occur // here a MalformedURLException may occur
// just ignore // just ignore
@ -1029,11 +1030,11 @@ public final class yacy {
} }
} }
if (source.equals("nurl")) { if (source.equals("nurl")) {
Iterator eiter = sb.noticeURL.entries(true, null); Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlNURL.Entry entry; plasmaCrawlEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
try { try {
entry = (plasmaCrawlNURL.Entry) eiter.next(); entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth()); if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
} catch (Exception e) { } catch (Exception e) {
// here a MalformedURLException may occur // here a MalformedURLException may occur
@ -1120,12 +1121,12 @@ public final class yacy {
} }
if (source.equals("eurl")) { if (source.equals("eurl")) {
Iterator eiter = sb.errorURL.entries(true, null); Iterator eiter = sb.errorURL.entries(true, null);
plasmaCrawlEURL.Entry entry; plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
entry = (plasmaCrawlEURL.Entry) eiter.next(); entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) { if ((entry != null) && (entry.url() != null)) {
if (html) { if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + entry.failreason() + "<br>").getBytes("UTF-8")); bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + entry.anycause() + "<br>").getBytes("UTF-8"));
bos.write(serverCore.crlf); bos.write(serverCore.crlf);
} else { } else {
bos.write(entry.url().toString().getBytes()); bos.write(entry.url().toString().getBytes());
@ -1135,10 +1136,10 @@ public final class yacy {
} }
} }
if (source.equals("nurl")) { if (source.equals("nurl")) {
Iterator eiter = sb.noticeURL.entries(true, null); Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlNURL.Entry entry; plasmaCrawlEntry entry;
while (eiter.hasNext()) { while (eiter.hasNext()) {
entry = (plasmaCrawlNURL.Entry) eiter.next(); entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) { if ((entry != null) && (entry.url() != null)) {
if (html) { if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "<br>").getBytes("UTF-8")); bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "<br>").getBytes("UTF-8"));

Loading…
Cancel
Save