redesigned NURL-handling:

- the general NURL-index for all crawl stack types was split into separate indexes for these stacks
- the new NURL-index is managed by the crawl balancer
- the crawl balancer does not need an internal index any more, it is replaced by the NURL-index
- the NURL.Entry was generalized and is now a new class plasmaCrawlEntry
- the new class plasmaCrawlEntry replaces also the preNURL.Entry class, and will also replace the switchboardEntry class in the future
- the new class plasmaCrawlEntry is more accurate for date entries (holds milliseconds) and can contain larger 'name' entries (anchor tag names)
- the EURL object was replaced by a new ZURL object, which is a container for the plasmaCrawlEntry and some tracking information
- the EURL index is now filled with ZURL objects
- a new index delegatedURL holds ZURL objects about plasmaCrawlEntry objects to track which url is handed over to other peers
- redesigned handling of plasmaCrawlEntry - handover, because there is no need any more to convert one entry object into another
- found and fixed numerous bugs in the context of crawl state handling
- fixed a serious bug in kelondroCache which caused that entries could not be removed
- fixed some bugs in online interface and adopted monitor output to new entry objects
- adopted yacy protocol to handle new delegatedURL entries
all old crawl queues will disappear after this update!

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3483 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 094a1482f4
commit 861f41e67e

@ -56,6 +56,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
@ -272,7 +273,7 @@ public class CrawlURLFetchStack_p {
}
private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) {
plasmaCrawlNURL.Entry entry;
plasmaCrawlEntry entry;
int failed = 0;
for (int i=0; i<count; i++) try {
entry = nurl.pop(fromStackType);

@ -49,10 +49,9 @@ import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverSwitch;
import de.anomic.http.httpHeader;
@ -499,14 +498,9 @@ public class CrawlURLFetch_p {
totalFailed++;
this.failed.put(urls[i], reason);
try {
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
plasmaCrawlZURL.Entry ee = this.sb.errorURL.newEntry(
new URL(urls[i]),
null,
yacyCore.seedDB.mySeed.hash,
yacyCore.seedDB.mySeed.hash,
"",
reason,
new kelondroBitfield());
reason);
ee.store();
this.sb.errorURL.stackPushEntry(ee);
} catch (MalformedURLException e) { }

@ -50,7 +50,7 @@ import java.util.ArrayList;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardQueue;
import de.anomic.server.serverObjects;
@ -186,7 +186,7 @@ public class IndexCreateIndexingQueue_p {
dark = true;
URL url;
String initiatorHash, executorHash;
plasmaCrawlEURL.Entry entry;
plasmaCrawlZURL.Entry entry;
yacySeed initiatorSeed, executorSeed;
int j=0;
for (int i = switchboard.errorURL.stackSize() - 1; i >= (switchboard.errorURL.stackSize() - showRejectedCount); i--) {
@ -202,7 +202,7 @@ public class IndexCreateIndexingQueue_p {
prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : wikiCode.replaceHTML(initiatorSeed.getName())));
prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : wikiCode.replaceHTML(executorSeed.getName())));
prop.put("rejected_list_"+j+"_url", wikiCode.replaceHTML(url.toString()));
prop.put("rejected_list_"+j+"_failreason", entry.failreason());
prop.put("rejected_list_"+j+"_failreason", entry.anycause());
prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0));
dark = !dark;
j++;

@ -49,6 +49,7 @@ import java.util.Locale;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
@ -99,9 +100,9 @@ public class IndexCreateWWWGlobalQueue_p {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit);
prop.put("crawler-queue_num", stackSize);//num Entries
plasmaCrawlNURL.Entry urle;
plasmaCrawlEntry urle;
boolean dark = true;
yacySeed initiator;
String profileHandle;

@ -43,7 +43,6 @@
// javac -classpath .:../classes IndexCreate_p.java
// if the shell's current path is HTROOT
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
@ -54,10 +53,10 @@ import java.util.regex.PatternSyntaxException;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaCrawlNURL.Entry;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyCore;
@ -101,15 +100,11 @@ public class IndexCreateWWWLocalQueue_p {
// iterating through the list of URLs
Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (iter.hasNext()) {
entry = (plasmaCrawlEntry) iter.next();
String value = null;
String nextHash = (String) iter.next();
Entry entry = null;
try {
entry = switchboard.noticeURL.getEntry(nextHash);
} catch (IOException e) {
continue;
}
String nextHash = entry.urlhash();
if ((option.equals("URL")&&(entry.url() != null))) {
value = entry.url().toString();
} else if ((option.equals("AnchorName"))) {
@ -162,9 +157,9 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("crawler-queue", 0);
} else {
prop.put("crawler-queue", 1);
plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20));
plasmaCrawlNURL.Entry urle;
plasmaCrawlEntry urle;
boolean dark = true;
yacySeed initiator;
String profileHandle;
@ -183,7 +178,7 @@ public class IndexCreateWWWLocalQueue_p {
prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) );
prop.put("crawler-queue_list_"+showNum+"_anchor", wikiCode.replaceHTML(urle.name()));
prop.put("crawler-queue_list_"+showNum+"_url", wikiCode.replaceHTML(urle.url().toString()));
prop.put("crawler-queue_list_"+showNum+"_hash", urle.hash());
prop.put("crawler-queue_list_"+showNum+"_hash", urle.urlhash());
dark = !dark;
showNum++;
} else {

@ -27,11 +27,6 @@
<td>Starting Point:</td>
<td>
<table cellpadding="0" cellspacing="0">
<tr>
<td>From&nbsp;File:</td>
<td><input type="radio" name="crawlingMode" value="file" /></td>
<td><input type="file" name="crawlingFile" size="28" /></td>
</tr>
<tr>
<td>From&nbsp;URL:</td>
<td><input type="radio" name="crawlingMode" value="url" checked="checked" /></td>
@ -41,7 +36,12 @@
</td>
</tr>
<tr>
<td colspan="2"><span id="title"></span></td>
<td>From&nbsp;File:</td>
<td><input type="radio" name="crawlingMode" value="file" /></td>
<td><input type="file" name="crawlingFile" size="28" /></td>
</tr>
<tr>
<td colspan="3" class="commit"><span id="title"><br></span></td>
</tr>
</table>
</td>
@ -125,7 +125,7 @@
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td>Store to Proxy Cache:</td>
<td>Store to Web Cache:</td>
<td><input type="checkbox" name="storeHTCache" #(storeHTCacheChecked)#::checked="checked"#(/storeHTCacheChecked)# /></td>
<td>
This option is used by default for proxy prefetch, but is not needed for explicit crawling.
@ -194,9 +194,9 @@
<tr valign="top" class="TableCellLight">
<td>Wanted Performance:</td>
<td>
<input type="radio" name="crawlingSpeed" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# />maximum&nbsp;&nbsp;
<input type="radio" name="crawlingSpeed" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# />custom: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM&nbsp;&nbsp;
<input type="radio" name="crawlingSpeed" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# />optimal as background process
<input type="radio" name="crawlingPerformance" value="maximum" #(crawlingSpeedMaxChecked)#::checked="checked"#(/crawlingSpeedMaxChecked)# />maximum&nbsp;&nbsp;
<input type="radio" name="crawlingPerformance" value="custom" #(crawlingSpeedCustChecked)#::checked="checked"#(/crawlingSpeedCustChecked)# />custom: <input name="customPPM" type="text" size="4" maxlength="4" value="#[customPPMdefault]#" />PPM&nbsp;&nbsp;
<input type="radio" name="crawlingPerformance" value="minimum" #(crawlingSpeedMinChecked)#::checked="checked"#(/crawlingSpeedMinChecked)# />optimal as background process
</td>
<td colspan="3">
Set wanted level of computing power, used for this and other running crawl tasks. (PPM = pages per minute)

@ -53,7 +53,6 @@ import java.util.Iterator;
import java.util.TreeMap;
import de.anomic.data.messageBoard;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaSwitchboard;

@ -39,10 +39,9 @@ import de.anomic.data.wikiCode;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.serverFileUtils;
@ -222,8 +221,7 @@ public class WatchCrawler_p {
prop.put("info_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL"))));
prop.put("info_reasonString", reasonString);
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
crawlingStartURL.getHost(), reasonString, new kelondroBitfield());
plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, reasonString);
ee.store();
switchboard.errorURL.stackPushEntry(ee);
}
@ -300,8 +298,7 @@ public class WatchCrawler_p {
if (rejectReason == null) {
c++;
} else {
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new kelondroBitfield());
plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, rejectReason);
ee.store();
switchboard.errorURL.stackPushEntry(ee);
}
@ -401,9 +398,10 @@ public class WatchCrawler_p {
private static void setPerformance(plasmaSwitchboard sb, serverObjects post) {
String crawlingPerformance = post.get("crawlingPerformance","custom");
int wantedPPM = 1000;
long LCbusySleep = Integer.parseInt(sb.getConfig(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100"));
int wantedPPM = (int) (60000L / LCbusySleep);
try {
wantedPPM = Integer.parseInt(post.get("customPPM","1000"));
wantedPPM = Integer.parseInt(post.get("customPPM",Integer.toString(wantedPPM)));
} catch (NumberFormatException e) {}
if (crawlingPerformance.equals("minimum")) wantedPPM = 10;
if (crawlingPerformance.equals("maximum")) wantedPPM = 1000;

@ -54,6 +54,7 @@ import java.util.Locale;
import de.anomic.data.wikiCode;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLoaderMessage;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaSwitchboard;
@ -183,10 +184,10 @@ public class queues_p {
}
public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlNURL.Entry[] crawlerList) {
public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlEntry[] crawlerList) {
int showNum = 0;
plasmaCrawlNURL.Entry urle;
plasmaCrawlEntry urle;
yacySeed initiator;
for (int i = 0; i < crawlerList.length; i++) {
urle = crawlerList[i];
@ -198,7 +199,7 @@ public class queues_p {
prop.put(tableName + "_" + showNum + "_modified", daydate(urle.loaddate()));
prop.putSafeXML(tableName + "_" + showNum + "_anchor", urle.name());
prop.putSafeXML(tableName + "_" + showNum + "_url", urle.url().toString());
prop.put(tableName + "_" + showNum + "_hash", urle.hash());
prop.put(tableName + "_" + showNum + "_hash", urle.urlhash());
showNum++;
}
}

@ -66,7 +66,7 @@ public class snippet {
prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files
} else {
// problems with snippet fetch
prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError());
prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, queryHashes) : snippet.getError());
}
prop.put("link", 0);
prop.put("links", 0);

@ -49,11 +49,8 @@
import java.io.IOException;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -85,7 +82,7 @@ public final class crawlReceipt {
String youare = post.get("youare", ""); // seed hash of the target peer, needed for network stability
//String process = post.get("process", ""); // process type
String key = post.get("key", ""); // transmission key
String receivedUrlhash = post.get("urlhash", ""); // the url hash that has been crawled
//String receivedUrlhash = post.get("urlhash", ""); // the url hash that has been crawled
String result = post.get("result", ""); // the result; either "ok" or "fail"
String reason = post.get("reason", ""); // the reason for that result
//String words = post.get("wordh", ""); // priority word hashes
@ -114,60 +111,60 @@ public final class crawlReceipt {
final yacySeed otherPeer = yacyCore.seedDB.get(iam);
final String otherPeerName = iam + ":" + ((otherPeer == null) ? "NULL" : (otherPeer.getName() + "/" + otherPeer.getVersion()));
if ((yacyCore.seedDB.mySeed == null) || (!(yacyCore.seedDB.mySeed.hash.equals(youare)))) {
// no yacy connection / unknown peers
prop.putASIS("delay", "3600");
} else if (propStr == null) {
return prop;
}
if (propStr == null) {
// error with url / wrong key
prop.putASIS("delay", "3600");
} else if (result.equals("fill")) {
// generating a new loaded URL entry
indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else {
indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam +
"\n\tURL properties: "+ propStr);
} else try {
// put new entry into database
switchboard.wordIndex.loadedURL.store(entry);
switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1);
// generating url hash
String newUrlHash = plasmaURL.urlHash(comp.url());
String oldUrlHash = plasmaURL.oldurlHash(comp.url());
// removing URL from notice URL
switchboard.noticeURL.remove(newUrlHash);
switchboard.noticeURL.remove(oldUrlHash);
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform());
} catch (IOException e) {
e.printStackTrace();
}
}
return prop;
}
// generating a new loaded URL entry
indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr);
if (entry == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
prop.putASIS("delay", "3600");
return prop;
}
indexURLEntry.Components comp = entry.comp();
if (comp.url() == null) {
log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr);
prop.putASIS("delay", "3600");
return prop;
}
if (result.equals("fill")) try {
// put new entry into database
switchboard.wordIndex.loadedURL.store(entry);
switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1);
switchboard.delegatedURL.remove(entry.hash()); // the delegated work has been done
log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + comp.url().toNormalform());
// ready for more
prop.putASIS("delay", "10");
} else {
try {
plasmaCrawlNURL.Entry en = switchboard.noticeURL.getEntry(receivedUrlhash);
plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield());
ee.store();
switchboard.errorURL.stackPushEntry(ee);
switchboard.noticeURL.remove(receivedUrlhash);
} catch (IOException e) {
}
prop.putASIS("delay", "100"); // what shall we do with that???
return prop;
} catch (IOException e) {
e.printStackTrace();
prop.putASIS("delay", "3600");
return prop;
}
switchboard.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case
plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(entry.toBalancerEntry(), youare, null, 0, result + ":" + reason);
ee.store();
switchboard.errorURL.stackPushEntry(ee);
//switchboard.noticeURL.remove(receivedUrlhash);
prop.putASIS("delay", "3600");
return prop;
// return rewrite properties
// return rewrite properties
return prop;
}
}

@ -34,6 +34,7 @@ import java.util.Date;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.index.indexRWIEntry;
public interface indexURLEntry {
@ -60,6 +61,7 @@ public interface indexURLEntry {
public indexRWIEntry word();
public boolean isOlder(indexURLEntry other);
public String toString(String snippet);
public plasmaCrawlEntry toBalancerEntry();
public String toString();
public class Components {

@ -13,6 +13,7 @@ import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverCharBuffer;
@ -367,6 +368,19 @@ public class indexURLEntryNew implements indexURLEntry {
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
}
/**
 * Converts this loaded-URL entry back into a {@code plasmaCrawlEntry}
 * so it can be handed to the crawl balancer / error queue.
 * Fields that this index entry does not carry (initiator, profile
 * handle, depth and the trailing counters) are filled with null/0.
 * NOTE(review): parameter order presumably is (initiator, url,
 * referrerHash, name, date, profileHandle, depth, ...) — matches the
 * constructor usage elsewhere in this commit; confirm against
 * plasmaCrawlEntry.
 */
public plasmaCrawlEntry toBalancerEntry() {
return new plasmaCrawlEntry(
null,
comp().url(),
referrerHash(),
comp().descr(),
loaddate(),
null,
0,
0,
0);
}
/**
* Returns this object as String.<br>
* This e.g. looks like this:

@ -35,6 +35,7 @@ import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.plasma.plasmaURL;
import de.anomic.server.logging.serverLog;
@ -335,6 +336,19 @@ public class indexURLEntryOld implements indexURLEntry {
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
}
/**
 * Converts this (old-format) loaded-URL entry back into a
 * {@code plasmaCrawlEntry} for hand-over to the crawl balancer /
 * error queue. Fields not carried by this index entry (initiator,
 * profile handle, depth and the trailing counters) are null/0.
 * NOTE(review): parameter order presumably mirrors the new-format
 * implementation of this interface method — confirm against
 * plasmaCrawlEntry.
 */
public plasmaCrawlEntry toBalancerEntry() {
return new plasmaCrawlEntry(
null,
comp().url(),
referrerHash(),
comp().descr(),
loaddate(),
null,
0,
0,
0);
}
/**
* Returns this object as String.<br>
* This e.g. looks like this:

@ -557,7 +557,6 @@ public class kelondroCache implements kelondroIndex {
} else {
this.hasnotHit++;
this.hasnotDouble++;
return null;
}
}
@ -569,8 +568,6 @@ public class kelondroCache implements kelondroIndex {
} else {
this.readHit++;
this.cacheDelete++;
index.remove(key);
return entry;
}
}

@ -223,14 +223,12 @@ public class kelondroFlexWidthArray implements kelondroArray {
assert rowentry.bytes().length == this.rowdef.objectsize;
int c = 0;
kelondroRow.Entry e;
int lastcol;
synchronized (col) {
while (c < rowdef.columns()) {
lastcol = c + col[c].row().columns() - 1;
e = col[c].row().newEntry(
rowentry.bytes(),
rowdef.colstart[c],
rowdef.colstart[lastcol] - rowdef.colstart[c] + rowdef.width(lastcol));
col[c].row().objectsize());
col[c].set(index, e);
c = c + col[c].row().columns();
}

@ -176,20 +176,20 @@ public class kelondroRow {
for (int i = 0; i < objectsize; i++) this.rowinstance[i] = 0;
}
public Entry(byte[] rowinstance) {
this(rowinstance, 0, rowinstance.length);
public Entry(byte[] newrow) {
this(newrow, 0, newrow.length);
}
public Entry(byte[] rowinstance, int start, int length) {
assert objectsize == length : "objectsize = " + objectsize + ", length = " + length;
public Entry(byte[] newrow, int start, int length) {
assert newrow.length >= (length + start) : "objectsize = " + objectsize + ", start = " + start + ", length = " + length;
assert objectsize == length : "objectsize = " + objectsize + ", start = " + start + ", length = " + length;
this.rowinstance = new byte[objectsize];
int ll = Math.min(objectsize, length);
System.arraycopy(rowinstance, start, this.rowinstance, 0, ll);
for (int i = ll; i < objectsize; i++) this.rowinstance[i] = 0;
System.arraycopy(newrow, start, this.rowinstance, 0, objectsize);
//for (int i = ll; i < objectsize; i++) this.rowinstance[i] = 0;
}
public Entry(byte[][] cols) {
assert row.length == cols.length;
assert row.length == cols.length : "cols.length = " + cols.length + ", row.length = " + row.length;
rowinstance = new byte[objectsize];
int ll;
int cs, cw;

@ -311,6 +311,7 @@ public final class kelondroStack extends kelondroRecords {
}
/**
 * Removes the element most recently returned by next(), delegating
 * to the wrapped node iterator {@code ni}.
 */
public void remove() {
ni.remove();
}
}

@ -51,11 +51,11 @@ import java.io.File;
import java.io.IOException;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLoaderMessage;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaHTCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.logging.serverLog;
@ -290,15 +290,19 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW
String referrerHash = (this.refererURLString==null)?null:plasmaURL.urlHash(this.refererURLString);
// create a new errorURL DB entry
plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry(
this.url,
referrerHash,
plasmaCrawlEntry bentry = new plasmaCrawlEntry(
this.initiator,
yacyCore.seedDB.mySeed.hash,
this.name,
(failreason==null)?"Unknown reason":failreason,
new kelondroBitfield()
);
this.url,
referrerHash,
this.name,
null,
this.profile.handle(),
this.depth,
0,
0);
plasmaCrawlZURL.Entry ee = this.sb.errorURL.newEntry(
bentry, yacyCore.seedDB.mySeed.hash, null,
0, (failreason==null)?"Unknown reason":failreason);
// store the entry
ee.store();

@ -6,6 +6,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.TreeMap;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlProfile;
import de.anomic.plasma.plasmaSwitchboard;
@ -89,7 +90,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// init noticeUrlDB
this.log.logInfo("Initializing the source noticeUrlDB");
this.importNurlDB = new plasmaCrawlNURL(this.importPath, preloadTime);
this.importNurlDB = new plasmaCrawlNURL(this.importPath);
this.importStartSize = this.importNurlDB.size();
//int stackSize = this.importNurlDB.stackSize();
@ -101,7 +102,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
public void run() {
try {
// waiting on init thread to finish
this.importNurlDB.waitOnInitThread();
//this.importNurlDB.waitOnInitThread();
// the stack types we want to import
int[] stackTypes = new int[] {plasmaCrawlNURL.STACK_TYPE_CORE,
@ -110,38 +111,38 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
-1};
// looping through the various stacks
for (int i=0; i< stackTypes.length; i++) {
if (stackTypes[i] != -1) {
this.log.logInfo("Starting to import stacktype '" + stackTypes[i] + "' containing '" + this.importNurlDB.stackSize(stackTypes[i]) + "' entries.");
for (int stackType=0; stackType< stackTypes.length; stackType++) {
if (stackTypes[stackType] != -1) {
this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries.");
} else {
this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack.");
}
// getting an iterator and loop through the URL entries
Iterator entryIter = (stackTypes[i] == -1) ? this.importNurlDB.entries(true, null) : null;
Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null;
while (true) {
String nextHash = null;
plasmaCrawlNURL.Entry nextEntry = null;
plasmaCrawlEntry nextEntry = null;
try {
if (stackTypes[i] != -1) {
if (this.importNurlDB.stackSize(stackTypes[i]) == 0) break;
if (stackTypes[stackType] != -1) {
if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break;
this.urlCount++;
nextEntry = this.importNurlDB.pop(stackTypes[i]);
nextHash = nextEntry.hash();
nextEntry = this.importNurlDB.pop(stackTypes[stackType]);
nextHash = nextEntry.urlhash();
} else {
if (!entryIter.hasNext()) break;
this.urlCount++;
nextEntry = (plasmaCrawlNURL.Entry) entryIter.next();
nextHash = nextEntry.hash();
nextEntry = (plasmaCrawlEntry) entryIter.next();
nextHash = nextEntry.urlhash();
}
} catch (IOException e) {
this.log.logWarning("Unable to import entry: " + e.toString());
if ((stackTypes[i] != -1) &&(this.importNurlDB.stackSize(stackTypes[i]) == 0)) break;
if ((stackTypes[stackType] != -1) &&(this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break;
continue;
}
@ -176,9 +177,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
// if the url does not already exist in the destination stack we insert it now
if (!this.sb.noticeURL.existsInStack(nextHash)) {
plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(nextEntry);
ne.store();
this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.hash());
this.sb.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : plasmaCrawlNURL.STACK_TYPE_CORE, nextEntry);
}
// removing hash from the import db
@ -191,7 +190,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor
}
if (this.isAborted()) break;
}
this.log.logInfo("Finished to import stacktype '" + stackTypes[i] + "'");
this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'");
}
//int size = this.importNurlDB.size();

@ -43,17 +43,18 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
@ -61,86 +62,136 @@ import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlBalancer {
private static final String stackSuffix = "7.stack";
private static final String indexSuffix = "7.db";
// a shared domainAccess map for all balancers
private static final Map domainAccess = Collections.synchronizedMap(new HashMap());
// definition of payload for fileStack
private static final kelondroRow payload = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0);
private static final kelondroRow stackrow = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0);
// class variables
private ArrayList ramStack; // a list that is flushed first
private kelondroStack fileStack; // a file with url hashes
private HashMap domainStacks; // a map from domain name part to Lists with url hashs
private HashSet ramIndex; // an index is needed externally, we provide that internally
private ArrayList urlRAMStack; // a list that is flushed first
private kelondroStack urlFileStack; // a file with url hashes
private kelondroIndex urlFileIndex;
private HashMap domainStacks; // a map from domain name part to Lists with url hashs
private File cacheStacksPath;
private String stackname;
public plasmaCrawlBalancer(File stackFile) {
fileStack = kelondroStack.open(stackFile, payload);
public plasmaCrawlBalancer(File cachePath, String stackname) {
this.cacheStacksPath = cachePath;
this.stackname = stackname;
File stackFile = new File(cachePath, stackname + stackSuffix);
urlFileStack = kelondroStack.open(stackFile, stackrow);
domainStacks = new HashMap();
ramStack = new ArrayList();
ramIndex = makeIndex();
urlRAMStack = new ArrayList();
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
openFileIndex();
}
public synchronized void close() {
ramIndex = null;
while (sizeDomainStacks() > 0) flushOnceDomStacks(true);
try { flushAllRamStack(); } catch (IOException e) {}
fileStack.close();
fileStack = null;
if (urlFileIndex != null) {
urlFileIndex.close();
urlFileIndex = null;
}
if (urlFileStack != null) {
urlFileStack.close();
urlFileStack = null;
}
}
public void finalize() {
if (fileStack != null) close();
if (urlFileStack != null) close();
}
public synchronized void clear() {
fileStack = kelondroStack.reset(fileStack);
urlFileStack = kelondroStack.reset(urlFileStack);
domainStacks.clear();
ramStack.clear();
ramIndex = new HashSet();
urlRAMStack.clear();
resetFileIndex();
}
private HashSet makeIndex() {
HashSet index = new HashSet(); // TODO: replace with kelondroIndex
// take all elements from the file stack
private void openFileIndex() {
cacheStacksPath.mkdirs();
try {
Iterator i = fileStack.keyIterator(); // iterates byte[] - objects
while (i.hasNext()) index.add(new String((byte[]) i.next(), "UTF-8"));
} catch (UnsupportedEncodingException e) {}
// take elements from the ram stack
for (int i = 0; i < ramStack.size(); i++) index.add(ramStack.get(i));
// take elements from domain stacks
Iterator i = domainStacks.entrySet().iterator();
Map.Entry entry;
LinkedList list;
Iterator ii;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
list = (LinkedList) entry.getValue();
ii = list.iterator();
while (ii.hasNext()) index.add(ii.next());
urlFileIndex = new kelondroCache(new kelondroFlexTable(cacheStacksPath, stackname + indexSuffix, -1, plasmaCrawlEntry.rowdef), true, false);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
return index;
}
public boolean has(String urlhash) {
return ramIndex.contains(urlhash);
private void resetFileIndex() {
if (urlFileIndex != null) {
urlFileIndex.close();
urlFileIndex = null;
File cacheFile = new File(cacheStacksPath, stackname + indexSuffix);
cacheFile.delete();
}
openFileIndex();
}
public Iterator iterator() {
return ramIndex.iterator();
public synchronized plasmaCrawlEntry get(String urlhash) throws IOException {
    // look the url hash up in the file-backed index; unknown hashes yield null
    kelondroRow.Entry indexRow = urlFileIndex.get(urlhash.getBytes());
    return (indexRow == null) ? null : new plasmaCrawlEntry(indexRow);
}
public synchronized plasmaCrawlEntry remove(String urlhash) throws IOException {
    // This method only exists because many import/export methods from the
    // previous architecture need it; direct usage is not recommended.
    kelondroRow.Entry indexRow = urlFileIndex.remove(urlhash.getBytes());
    if (indexRow == null) return null;
    // Also delete the hash from the RAM stack. List.remove(Object) deletes
    // the first occurrence, which is exactly what iterating and removing
    // the first match did before.
    urlRAMStack.remove(urlhash);
    // The file stack is intentionally left untouched: its iterator has no
    // delete method yet, and scanning it would cause too much IO load.
    // The top/pop methods that acquire elements from that stack must
    // therefore silently skip hashes no longer present in urlFileIndex.
    return new plasmaCrawlEntry(indexRow);
}
public boolean has(String urlhash) {
    // membership test against the file-backed url index;
    // an IO failure is reported (stack trace) and counts as "not present"
    try {
        return urlFileIndex.has(urlhash.getBytes());
    } catch (IOException e) {
        e.printStackTrace();
    }
    return false;
}
public synchronized int size() {
int componentsize = fileStack.size() + ramStack.size() + sizeDomainStacks();
if ((kelondroRecords.debugmode) && (componentsize != ramIndex.size())) {
// hier ist ramIndex.size() immer grš§er. warum?
serverLog.logWarning("PLASMA BALANCER", "size operation wrong - componentsize = " + componentsize + ", ramIndex.size() = " + ramIndex.size());
}
int componentsize = urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks();
try {
if ((kelondroRecords.debugmode) && (componentsize != urlFileIndex.size())) {
// hier ist urlIndexFile.size() immer grš§er. warum?
serverLog.logWarning("PLASMA BALANCER", "size operation wrong - componentsize = " + componentsize + ", ramIndex.size() = " + urlFileIndex.size());
}
} catch (IOException e) {
e.printStackTrace();
}
return componentsize;
}
@ -163,9 +214,9 @@ public class plasmaCrawlBalancer {
list = (LinkedList) entry.getValue();
if (list.size() != 0) {
if (ram) {
ramStack.add(list.removeFirst());
urlRAMStack.add(list.removeFirst());
} else try {
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) list.removeFirst()).getBytes()}));
urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) list.removeFirst()).getBytes()}));
} catch (IOException e) {
e.printStackTrace();
}
@ -176,34 +227,36 @@ public class plasmaCrawlBalancer {
private void flushAllRamStack() throws IOException {
// this flushes only the ramStack to the fileStack, but does not flush the domainStacks
for (int i = 0; i < ramStack.size() / 2; i++) {
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(i)).getBytes()}));
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() - i - 1)).getBytes()}));
for (int i = 0; i < urlRAMStack.size() / 2; i++) {
urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(i)).getBytes()}));
urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(urlRAMStack.size() - i - 1)).getBytes()}));
}
if (ramStack.size() % 2 == 1)
fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() / 2)).getBytes()}));
if (urlRAMStack.size() % 2 == 1)
urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(urlRAMStack.size() / 2)).getBytes()}));
}
public synchronized void push(String urlhash) throws IOException {
assert urlhash != null;
if (ramIndex.contains(urlhash)) {
serverLog.logWarning("PLASMA BALANCER", "double-check has failed for urlhash " + urlhash + " - fixed");
public synchronized void push(plasmaCrawlEntry entry) throws IOException {
assert entry != null;
if (urlFileIndex.has(entry.urlhash().getBytes())) {
serverLog.logWarning("PLASMA BALANCER", "double-check has failed for urlhash " + entry.urlhash() + " - fixed");
return;
}
String dom = urlhash.substring(6);
// extend domain stack
String dom = entry.urlhash().substring(6);
LinkedList domainList = (LinkedList) domainStacks.get(dom);
if (domainList == null) {
// create new list
domainList = new LinkedList();
domainList.addLast(urlhash);
domainList.addLast(entry.urlhash());
domainStacks.put(dom, domainList);
} else {
// extend existent domain list
domainList.add(urlhash);
domainList.add(entry.urlhash());
}
// add to index
ramIndex.add(urlhash);
urlFileIndex.put(entry.toRow());
// check size of domainStacks and flush
if ((domainStacks.size() > 20) || (sizeDomainStacks() > 1000)) {
@ -211,15 +264,15 @@ public class plasmaCrawlBalancer {
}
}
public synchronized String pop(long minimumDelta, long maximumAge) throws IOException {
public synchronized plasmaCrawlEntry pop(long minimumDelta, long maximumAge) throws IOException {
// returns an url-hash from the stack and ensures minimum delta times
// we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack
String result = null; // the result
// 1st: check ramStack
if (ramStack.size() > 0) {
result = (String) ramStack.remove(0);
if (urlRAMStack.size() > 0) {
result = (String) urlRAMStack.remove(0);
}
// 2nd-a: check domainStacks for latest arrivals
@ -301,12 +354,12 @@ public class plasmaCrawlBalancer {
}
// 3rd: take entry from file
if ((result == null) && (fileStack.size() > 0)) {
kelondroRow.Entry topentry = fileStack.top();
if ((result == null) && (urlFileStack.size() > 0)) {
kelondroRow.Entry topentry = urlFileStack.top();
if (topentry == null) {
// emergency case: this means that something with the stack organization is wrong
// the file appears to be broken. We kill the file.
kelondroStack.reset(fileStack);
kelondroStack.reset(urlFileStack);
serverLog.logSevere("PLASMA BALANCER", "get() failed to fetch entry from file stack. reset stack file.");
} else {
String top = new String(topentry.getColBytes(0));
@ -316,10 +369,10 @@ public class plasmaCrawlBalancer {
long delta = lastAccessDelta(top);
if (delta > minimumDelta) {
// the entry from top is fine
result = new String(fileStack.pop().getColBytes(0));
result = new String(urlFileStack.pop().getColBytes(0));
} else {
// try entry from bottom
result = new String(fileStack.pot().getColBytes(0));
result = new String(urlFileStack.pot().getColBytes(0));
delta = lastAccessDelta(result);
}
}
@ -327,7 +380,7 @@ public class plasmaCrawlBalancer {
// check case where we did not found anything
if (result == null) {
serverLog.logSevere("PLASMA BALANCER", "get() was not able to find a valid urlhash - total size = " + size() + ", fileStack.size() = " + fileStack.size() + ", ramStack.size() = " + ramStack.size() + ", domainStacks.size() = " + domainStacks.size());
serverLog.logSevere("PLASMA BALANCER", "get() was not able to find a valid urlhash - total size = " + size() + ", fileStack.size() = " + urlFileStack.size() + ", ramStack.size() = " + urlRAMStack.size() + ", domainStacks.size() = " + domainStacks.size());
return null;
}
@ -344,8 +397,9 @@ public class plasmaCrawlBalancer {
// update statistical data
domainAccess.put(result.substring(6), new Long(System.currentTimeMillis()));
ramIndex.remove(result);
return result;
kelondroRow.Entry entry = urlFileIndex.remove(result.getBytes());
if (entry == null) return null;
return new plasmaCrawlEntry(entry);
}
private long lastAccessDelta(String hash) {
@ -355,19 +409,55 @@ public class plasmaCrawlBalancer {
return System.currentTimeMillis() - lastAccess.longValue();
}
public synchronized String top(int dist) {
int availableInRam = ramStack.size() + sizeDomainStacks();
if ((availableInRam < dist) && (fileStack.size() > (dist - availableInRam))) {
public synchronized plasmaCrawlEntry top(int dist) throws IOException {
int availableInRam = urlRAMStack.size() + sizeDomainStacks();
if ((availableInRam <= dist) && (urlFileStack.size() > (dist - availableInRam))) {
// flush some entries from disc to domain stacks
try {
for (int i = 0; i < (dist - availableInRam); i++) {
ramStack.add(new String(fileStack.pop().getColBytes(0)));
for (int i = 0; i <= (dist - availableInRam); i++) {
if (urlFileStack.size() == 0) break;
urlRAMStack.add(new String(urlFileStack.pop().getColBytes(0)));
}
} catch (IOException e) {}
}
while ((sizeDomainStacks() > 0) && (ramStack.size() <= dist)) flushOnceDomStacks(true); // flush only that much as we need to display
if (dist >= ramStack.size()) return null;
return (String) ramStack.get(dist);
while ((sizeDomainStacks() > 0) && (urlRAMStack.size() <= dist)) flushOnceDomStacks(true); // flush only that much as we need to display
if (dist >= urlRAMStack.size()) return null;
String urlhash = (String) urlRAMStack.get(dist);
kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes());
if (entry == null) return null;
return new plasmaCrawlEntry(entry);
}
public Iterator iterator() throws IOException {
    // returns an iterator over plasmaCrawlEntry objects,
    // backed by the rows of the url index file
    return new EntryIterator();
}
public class EntryIterator implements Iterator {
    // Iterates plasmaCrawlEntry objects parsed from urlFileIndex rows.
    // The row iterator is set to null as soon as a row fails to parse,
    // which terminates the iteration on the next hasNext() call.
    Iterator rowIterator;

    public EntryIterator() throws IOException {
        this.rowIterator = urlFileIndex.rows(true, null);
    }

    public boolean hasNext() {
        if (rowIterator == null) return false;
        return rowIterator.hasNext();
    }

    public Object next() {
        kelondroRow.Entry row = (kelondroRow.Entry) rowIterator.next();
        if (row == null) return null;
        try {
            return new plasmaCrawlEntry(row);
        } catch (IOException e) {
            // broken row: abort the iteration and report no element
            rowIterator = null;
            return null;
        }
    }

    public void remove() {
        if (rowIterator == null) return;
        rowIterator.remove();
    }
}
}

@ -1,15 +1,15 @@
// plasmaEURL.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last major change: 09.08.2004
// plasmaCrawlEURL.java
// (C) 2004 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 09.08.2004 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -23,50 +23,15 @@
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this softare or this documentation. The usage of this software
// is on your own risk. The installation and usage (starting/running) of this
// software may allow other people or application to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follows this copyright notice here, but changes must not be
// done inside the copyright notive above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.
// EURL - noticed (known but not loaded) URL's
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlEURL {
/* =======================================================================
* Failure reason constants
* ======================================================================= */
// invalid urls
public static final String DENIED_URL_NULL = "denied_(url_null)";
public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)";
@ -125,290 +90,5 @@ public class plasmaCrawlEURL {
// indexing errors
public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)";
public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)";
/* =======================================================================
* Other object variables
* ======================================================================= */
private LinkedList rejectedStack = new LinkedList(); // strings: url
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
"String urlstring-256, " + // the url as string
"String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"Cardinal loaddate-4 {b64e}, " + // the time when the url was last time tried to load
"Cardinal retrycount-2 {b64e}, " + // number of load retries
"String failcause-80, " + // string describing load failure
"byte[] flags-2", // extra space
kelondroBase64Order.enhancedCoder,
0);
// the class object
private kelondroIndex urlIndexFile = null;
public plasmaCrawlEURL(File cachePath, long preloadTime) {
    // opens (or creates) the error-url table below cachePath;
    // preloadTime is handed to the table for cache warm-up.
    // NOTE(review): a failure to open the table terminates the whole
    // process via System.exit(-1) — confirm this is intended at call sites
    super();
    String newCacheName = "urlErr3.table";
    cachePath.mkdirs();
    try {
        urlIndexFile = new kelondroFlexTable(cachePath, newCacheName, preloadTime, rowdef);
    } catch (IOException e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
public int size() {
    // number of stored error-url records; 0 if the index cannot be read
    try {
        return urlIndexFile.size() ;
    } catch (IOException e) {
        return 0;
    }
}
public void close() {
    // closes the underlying index file; safe to call more than once
    // (the field is nulled so a second call is a no-op)
    if (urlIndexFile != null) {
        urlIndexFile.close();
        urlIndexFile = null;
    }
}
public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor,
    String name, String failreason, kelondroBitfield flags) {
    // creates a new (not yet stored) error-url entry; missing or
    // too-short hashes are replaced by the dummy hash, a missing
    // fail reason by "unknown"
    if ((referrer == null) || (referrer.length() < yacySeedDB.commonHashLength)) referrer = plasmaURL.dummyHash;
    if ((initiator == null) || (initiator.length() < yacySeedDB.commonHashLength)) initiator = plasmaURL.dummyHash;
    if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash;
    if (failreason == null) failreason = "unknown";
    return new Entry(url, referrer, initiator, executor, name, failreason, flags);
}
public boolean remove(String hash) {
    // deletes the record for the given url hash;
    // returns false when the hash is null or the IO operation fails
    if (hash == null) return false;
    try {
        urlIndexFile.remove(hash.getBytes());
    } catch (IOException e) {
        return false;
    }
    return true;
}
public synchronized void stackPushEntry(Entry e) {
    // remembers only the hash of the entry on the in-memory rejected stack
    rejectedStack.add(e.hash);
}
public Entry stackPopEntry(int pos) throws IOException {
    // NOTE(review): despite the name this is a peek, not a pop — the
    // hash stays on rejectedStack; confirm whether removal was intended
    String urlhash = (String) rejectedStack.get(pos);
    if (urlhash == null) return null;
    return new Entry(urlhash);
}
public synchronized Entry getEntry(String hash) throws IOException {
    // loads the entry for the given hash from the index file
    return new Entry(hash);
}
public boolean getUseNewDB() {
    // true when the new kelondroFlexTable backend is in use
    return (urlIndexFile instanceof kelondroFlexTable);
}
public boolean exists(String urlHash) {
    // membership test; an IO failure counts as "not present"
    try {
        return urlIndexFile.has(urlHash.getBytes());
    } catch (IOException e) {
        return false;
    }
}
public void clearStack() {
    // empties the in-memory rejected stack; the index file is untouched
    rejectedStack.clear();
}
public int stackSize() {
    // number of hashes currently on the in-memory rejected stack
    return rejectedStack.size();
}
public class Entry {

    // One failed-crawl record, mirroring the columns of rowdef.
    // An Entry built from scratch is persisted only when store() is called.
    private String hash;       // the url's hash
    private String referrer;   // the url's referrer hash
    private String initiator;  // the crawling initiator
    private String executor;   // the crawling executor
    private URL url;           // the url
    private String name;       // the name of the url, from anchor tag <a>name</a>
    private Date initdate;     // the time when the url first appeared
    private Date trydate;      // the time when the url was last tried to load
    private int trycount;      // number of tryings
    private String failreason; // string describing reason for load fail
    private kelondroBitfield flags; // extra space
    private boolean stored;    // true when this entry is known to be in urlIndexFile

    public Entry(URL url, String referrer, String initiator,
                 String executor, String name, String failreason, kelondroBitfield flags) {
        // create new, not-yet-stored entry; both dates start at "now"
        this.hash = plasmaURL.urlHash(url);
        this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
        this.initiator = initiator;
        this.executor = executor;
        this.url = url;
        this.name = name;
        this.initdate = new Date();
        this.trydate = new Date();
        this.trycount = 0;
        this.failreason = failreason;
        this.flags = flags;
        this.stored = false;
    }

    public Entry(String hash) throws IOException {
        // generates an plasmaEURLEntry using the url hash;
        // the record is looked up in the index file.
        // NOTE(review): if the hash is unknown, all fields except 'hash'
        // remain null — callers should check url() != null. TODO confirm.
        this.hash = hash;
        kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
        if (entry != null) {
            insertEntry(entry);
        }
        this.stored = true;
    }

    public Entry(kelondroRow.Entry entry) throws IOException {
        insertEntry(entry);
        this.stored = false;
    }

    private void insertEntry(kelondroRow.Entry entry) throws IOException {
        // fills the object fields from one row; the column indices
        // must match rowdef exactly
        assert (entry != null);
        this.hash = entry.getColString(0, null);
        this.referrer = entry.getColString(1, "UTF-8");
        this.initiator = entry.getColString(2, "UTF-8");
        this.executor = entry.getColString(3, "UTF-8");
        this.url = new URL(entry.getColString(4, "UTF-8").trim());
        String n = entry.getColString(5, "UTF-8");
        this.name = (n == null) ? "" : n.trim();
        // dates are stored with day resolution (see store(): time / 86400000)
        this.initdate = new Date(86400000 * entry.getColLong(6));
        this.trydate = new Date(86400000 * entry.getColLong(7));
        this.trycount = (int) entry.getColLong(8);
        this.failreason = entry.getColString(9, "UTF-8");
        this.flags = new kelondroBitfield(entry.getColBytes(10));
        return;
    }

    public void store() {
        // stores the values from the object variables into the database
        if (this.stored) return;
        if (this.hash == null) return;
        String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, rowdef.width(6));
        String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, rowdef.width(7));
        // store the hash in the hash cache
        try {
            // even if the entry exists, we simply overwrite it
            byte[][] entry = new byte[][] {
                this.hash.getBytes(),
                this.referrer.getBytes(),
                this.initiator.getBytes(),
                this.executor.getBytes(),
                this.url.toString().getBytes(),
                this.name.getBytes(),
                initdatestr.getBytes(),
                trydatestr.getBytes(),
                kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, rowdef.width(8)).getBytes(),
                this.failreason.getBytes(),
                this.flags.bytes()
            };
            urlIndexFile.put(urlIndexFile.row().newEntry(entry));
            this.stored = true;
        } catch (IOException e) {
            System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString());
        }
    }

    public String hash() {
        // return a url-hash, based on the md5 algorithm
        // the result is a String of 12 bytes within a 72-bit space
        // (each byte has an 6-bit range)
        // that should be enough for all web pages on the world
        return this.hash;
    }

    public String referrer() {
        return this.referrer;
    }

    public URL url() {
        return url;
    }

    public Date initdate() {
        // the date when the url first appeared
        // (bug fix: this accidentally returned trydate before)
        return initdate;
    }

    public Date trydate() {
        // the date of the last load attempt
        return trydate;
    }

    public String initiator() {
        // return the initiator's peer hash
        return initiator;
    }

    public String executor() {
        // return the executor's peer hash
        return executor;
    }

    public String name() {
        // return the anchor name (text inside the <a> tag)
        return name;
    }

    public String failreason() {
        return failreason;
    }
}
public class kiter implements Iterator {
    // enumerates Entry elements from urlIndexFile; once a row fails
    // to parse, 'error' stays true and the iteration is over
    Iterator i;
    boolean error = false;

    public kiter(boolean up, String firstHash) throws IOException {
        byte[] startKey = (firstHash == null) ? null : firstHash.getBytes();
        this.i = urlIndexFile.rows(up, startKey);
        this.error = false;
    }

    public boolean hasNext() {
        return (!error) && i.hasNext();
    }

    public Object next() throws RuntimeException {
        kelondroRow.Entry row = (kelondroRow.Entry) i.next();
        if (row == null) return null;
        try {
            return new Entry(row);
        } catch (IOException ex) {
            throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + row.getColString(0, null));
        }
    }

    public void remove() {
        i.remove();
    }
}
public Iterator entries(boolean up, String firstHash) throws IOException {
    // enumerates Entry elements, starting at firstHash (or the first/last
    // record when firstHash is null), ascending when up is true
    return new kiter(up, firstHash);
}
}

@ -0,0 +1,238 @@
// plasmaCrawlBalancerEntry.java
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 14.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.util.Date;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlEntry {

    // One entry of a crawl queue (replaces the former NURL.Entry and
    // preNURL.Entry). Serialized into a kelondroRow; toRow() and
    // insertEntry() address the columns below BY INDEX, so the column
    // order here must never change without adapting both methods.
    // row definition for balancer-related NURL-entries
    public final static kelondroRow rowdef = new kelondroRow(
        "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
        "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
        "String urlstring-256, " + // the url as string
        "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
        "String urlname-80, " + // the name of the url, from anchor tag <a>name</a>
        "Cardinal appdate-8 {b256}, " + // the time when the url was first time appeared
        "String profile-4, " + // the name of the prefetch profile handle
        "Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0
        "Cardinal parentbr-3 {b256}, " + // number of anchors of the parent
        "Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors
        "byte[] flags-4, " + // flags
        "String handle-4, " + // extra handle
        "Cardinal loaddate-8 {b256}," + // time when the file was loaded
        "Cardinal serverdate-8 {b256}," + // time when that the server returned as document date
        "Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince
        kelondroBase64Order.enhancedCoder,
        0
    );

    private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
                              // if this is generated by a crawl, the own peer hash in entered
    private String urlhash; // the url's hash
    private String referrer; // the url's referrer hash
    private URL url; // the url as string
    private String name; // the name of the url, from anchor tag <a>name</a>
    private long appdate; // the time when the url was first time appeared
    private long loaddate; // the time when the url was loaded
    private long serverdate; // the document date from the target server
    private long imsdate; // the time of a ifModifiedSince request
    private String profileHandle; // the name of the prefetch profile
    private int depth; // the prefetch depth so far, starts at 0
    private int anchors; // number of anchors of the parent
    private int forkfactor; // sum of anchors of all ancestors
    private kelondroBitfield flags; // extra flag bits, stored in column 10
    private int handle; // extra handle, stored hex-encoded in column 11

    public plasmaCrawlEntry(URL url) {
        // convenience constructor: own peer as initiator, current time as
        // appearance date, no referrer/name/profile, all counters 0
        this(yacyCore.seedDB.mySeed.hash, url, null, null, new Date(), null, 0, 0, 0);
    }

    public plasmaCrawlEntry(
            String initiator,
            URL url,
            String referrer,
            String name,
            Date appdate,
            String profileHandle,
            int depth,
            int anchors,
            int forkfactor
    ) {
        // create new entry and store it into database
        this.urlhash = plasmaURL.urlHash(url);
        this.initiator = initiator;
        this.url = url;
        this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
        this.name = (name == null) ? "" : name;
        // dates are kept as millisecond longs (0 = unknown/not yet)
        this.appdate = (appdate == null) ? 0 : appdate.getTime();
        this.profileHandle = profileHandle; // must not be null
        this.depth = depth;
        this.anchors = anchors;
        this.forkfactor = forkfactor;
        this.flags = new kelondroBitfield(rowdef.width(10));
        this.handle = 0;
        this.loaddate = 0;
        this.serverdate = 0;
        this.imsdate = 0;
    }

    public plasmaCrawlEntry(kelondroRow.Entry entry) throws IOException {
        // deserializing constructor; throws IOException on a broken row
        assert (entry != null);
        insertEntry(entry);
    }

    private void insertEntry(kelondroRow.Entry entry) throws IOException {
        // fills the object fields from a row; the column indices must
        // match rowdef exactly
        String urlstring = entry.getColString(2, null);
        if (urlstring == null) throw new IOException ("url string is null");
        this.urlhash = entry.getColString(0, null);
        this.initiator = entry.getColString(1, null);
        this.url = new URL(urlstring);
        this.referrer = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, null);
        this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
        this.appdate = entry.getColLong(5);
        this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
        this.depth = (int) entry.getColLong(7);
        this.anchors = (int) entry.getColLong(8);
        this.forkfactor = (int) entry.getColLong(9);
        this.flags = new kelondroBitfield(entry.getColBytes(10));
        // NOTE(review): a corrupt handle column makes parseInt throw
        // NumberFormatException (unchecked), not IOException — TODO confirm
        this.handle = Integer.parseInt(entry.getColString(11, null), 16);
        this.loaddate = entry.getColLong(12);
        this.serverdate = entry.getColLong(13);
        this.imsdate = entry.getColLong(14);
        return;
    }

    private static String normalizeHandle(int h) {
        // hex-encodes the handle, zero-padded to the width of column 11
        String d = Integer.toHexString(h);
        while (d.length() < rowdef.width(11)) d = "0" + d;
        return d;
    }

    public kelondroRow.Entry toRow() {
        // serializes this entry into a row; inverse of insertEntry()
        byte[] appdatestr = kelondroNaturalOrder.encodeLong(appdate, rowdef.width(5));
        byte[] loaddatestr = kelondroNaturalOrder.encodeLong(loaddate, rowdef.width(12));
        byte[] serverdatestr = kelondroNaturalOrder.encodeLong(serverdate, rowdef.width(13));
        byte[] imsdatestr = kelondroNaturalOrder.encodeLong(imsdate, rowdef.width(14));
        // store the hash in the hash cache
        byte[] namebytes;
        try {
            namebytes = this.name.getBytes("UTF-8");
        } catch (UnsupportedEncodingException e) {
            // fall back to the platform charset if UTF-8 is unavailable
            namebytes = this.name.getBytes();
        }
        byte[][] entry = new byte[][] {
            this.urlhash.getBytes(),
            (initiator == null) ? "".getBytes() : this.initiator.getBytes(),
            this.url.toString().getBytes(),
            this.referrer.getBytes(),
            namebytes,
            appdatestr,
            (this.profileHandle == null) ? null : this.profileHandle.getBytes(),
            kelondroNaturalOrder.encodeLong(this.depth, rowdef.width(7)),
            kelondroNaturalOrder.encodeLong(this.anchors, rowdef.width(8)),
            kelondroNaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)),
            this.flags.bytes(),
            normalizeHandle(this.handle).getBytes(),
            loaddatestr,
            serverdatestr,
            imsdatestr};
        return rowdef.newEntry(entry);
    }

    public URL url() {
        // the url
        return url;
    }

    public String urlhash() {
        // the hash of this url
        return this.urlhash;
    }

    public String referrerhash() {
        // the urlhash of a referer url
        return this.referrer;
    }

    public String initiator() {
        // returns the hash of the initiating peer; null for proxy entries
        if (initiator == null) return null;
        if (initiator.length() == 0) return null;
        return initiator;
    }

    public boolean proxy() {
        // true when the url was retrieved using the proxy
        return (initiator() == null);
    }

    public Date appdate() {
        // the date when the url appeared first
        return new Date(appdate);
    }

    public Date loaddate() {
        // the date when the url was loaded
        return new Date(loaddate);
    }

    public Date serverdate() {
        // the date that the server returned as document date
        return new Date(serverdate);
    }

    public Date imsdate() {
        // the date that the client (browser) send as ifModifiedSince in proxy mode
        return new Date(imsdate);
    }

    public String name() {
        // return the anchor name (text inside <a> tag)
        return name;
    }

    public int depth() {
        // crawl depth where the url appeared
        return depth;
    }

    public String profileHandle() {
        // the handle of the crawl profile
        return profileHandle;
    }
}

@ -46,24 +46,9 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import de.anomic.plasma.plasmaURL;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroStack;
import de.anomic.net.URL;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlNURL {
public static final int STACK_TYPE_NULL = 0; // do not stack
@ -78,166 +63,33 @@ public class plasmaCrawlNURL {
private static final long minimumDelta = 500; // the minimum time difference between access of the same domain
private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt
/**
* column length definition for the {@link plasmaURL#urlIndexFile} DB
*/
public final static kelondroRow rowdef = new kelondroRow(
"String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
"String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator
"String urlstring-256, " + // the url as string
"String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash
"String urlname-40, " + // the name of the url, from anchor tag <a>name</a>
"Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared
"String profile-4, " + // the name of the prefetch profile handle
"Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0
"Cardinal parentbr-3 {b64e}, " + // number of anchors of the parent
"Cardinal forkfactor-4 {b64e}, " + // sum of anchors of all ancestors
"byte[] flags-4, " + // flags
"String handle-4", // extra handle
kelondroBase64Order.enhancedCoder,
0
);
private kelondroIndex urlIndexFile = null;
private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1
private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth
private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1
private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders
private kelondroStack imageStack; // links pointing to image resources
private kelondroStack movieStack; // links pointing to movie resources
private kelondroStack musicStack; // links pointing to music resources
//private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1
//private kelondroStack imageStack; // links pointing to image resources
//private kelondroStack movieStack; // links pointing to movie resources
//private kelondroStack musicStack; // links pointing to music resources
private final HashSet imageStackIndex, movieStackIndex, musicStackIndex; // to find out if a specific link is already on any stack
private File cacheStacksPath;
private long preloadTime;
private initStackIndex initThead;
public plasmaCrawlNURL(File cachePath, long preloadTime) {
public plasmaCrawlNURL(File cachePath) {
super();
this.cacheStacksPath = cachePath;
this.preloadTime = preloadTime;
// create a stack for newly entered entries
if (!(cachePath.exists())) cachePath.mkdir(); // make the path
openHashCache();
File coreStackFile = new File(cachePath, "urlNoticeLocal0.stack");
File limitStackFile = new File(cachePath, "urlNoticeLimit0.stack");
File overhangStackFile = new File(cachePath, "urlNoticeOverhang0.stack");
File remoteStackFile = new File(cachePath, "urlNoticeRemote0.stack");
File imageStackFile = new File(cachePath, "urlNoticeImage0.stack");
File movieStackFile = new File(cachePath, "urlNoticeMovie0.stack");
File musicStackFile = new File(cachePath, "urlNoticeMusic0.stack");
coreStack = new plasmaCrawlBalancer(coreStackFile);
limitStack = new plasmaCrawlBalancer(limitStackFile);
overhangStack = new plasmaCrawlBalancer(overhangStackFile);
remoteStack = new plasmaCrawlBalancer(remoteStackFile);
kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0);
imageStack = kelondroStack.open(imageStackFile, rowdef);
movieStack = kelondroStack.open(movieStackFile, rowdef);
musicStack = kelondroStack.open(musicStackFile, rowdef);
// init stack Index
imageStackIndex = new HashSet();
movieStackIndex = new HashSet();
musicStackIndex = new HashSet();
(initThead = new initStackIndex()).start();
coreStack = new plasmaCrawlBalancer(cachePath, "urlNoticeCoreStack");
limitStack = new plasmaCrawlBalancer(cachePath, "urlNoticeLimitStack");
//overhangStack = new plasmaCrawlBalancer(overhangStackFile);
remoteStack = new plasmaCrawlBalancer(cachePath, "urlNoticeRemoteStack");
}
public int size() {
try {
return urlIndexFile.size() ;
} catch (IOException e) {
return 0;
}
}
public void waitOnInitThread() {
try {
if (this.initThead != null) {
this.initThead.join();
}
} catch (NullPointerException e) {
} catch (InterruptedException e) {}
}
private void openHashCache() {
String newCacheName = "urlNotice5.table";
cacheStacksPath.mkdirs();
try {
urlIndexFile = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, rowdef), true, false);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
}
}
private void resetHashCache() {
if (urlIndexFile != null) {
urlIndexFile.close();
urlIndexFile = null;
File cacheFile = new File(cacheStacksPath, "urlNotice2.db");
cacheFile.delete();
}
openHashCache();
return coreStack.size() + limitStack.size() + remoteStack.size();
}
public void close() {
coreStack.close();
limitStack.close();
overhangStack.close();
//overhangStack.close();
remoteStack.close();
imageStack.close();
movieStack.close();
musicStack.close();
if (urlIndexFile != null) {
urlIndexFile.close();
urlIndexFile = null;
}
}
public class initStackIndex extends Thread {
public void run() {
Iterator i;
try {
i = imageStack.iterator();
while (i.hasNext()) imageStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8"));
} catch (Exception e) {
imageStack = kelondroStack.reset(imageStack);
}
try {
i = movieStack.iterator();
while (i.hasNext()) movieStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8"));
} catch (Exception e) {
movieStack = kelondroStack.reset(movieStack);
}
try {
i = musicStack.iterator();
while (i.hasNext()) musicStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8"));
} catch (Exception e) {
musicStack = kelondroStack.reset(musicStack);
}
plasmaCrawlNURL.this.initThead = null;
}
}
public boolean remove(String hash) {
if (hash == null) return false;
try {
urlIndexFile.remove(hash.getBytes());
return true;
} catch (IOException e) {
return false;
}
}
private static String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < rowdef.width(11)) d = "0" + d;
return d;
}
public int stackSize() {
// this does not count the overhang stack size
return coreStack.size() + limitStack.size() + remoteStack.size();
@ -247,11 +99,8 @@ public class plasmaCrawlNURL {
switch (stackType) {
case STACK_TYPE_CORE: return coreStack.size();
case STACK_TYPE_LIMIT: return limitStack.size();
case STACK_TYPE_OVERHANG: return overhangStack.size();
case STACK_TYPE_OVERHANG: return 0;
case STACK_TYPE_REMOTE: return remoteStack.size();
case STACK_TYPE_IMAGE: return imageStack.size();
case STACK_TYPE_MOVIE: return movieStack.size();
case STACK_TYPE_MUSIC: return musicStack.size();
default: return -1;
}
}
@ -260,111 +109,65 @@ public class plasmaCrawlNURL {
return
coreStack.has(urlhash) ||
limitStack.has(urlhash) ||
overhangStack.has(urlhash) ||
remoteStack.has(urlhash) ||
imageStackIndex.contains(urlhash) ||
movieStackIndex.contains(urlhash) ||
musicStackIndex.contains(urlhash);
}
public synchronized Entry newEntry(String initiator, URL url, Date loaddate,
String referrer, String name, String profile,
int depth, int anchors, int forkfactor) {
return new Entry(initiator, url, referrer, name, loaddate,
profile, depth, anchors, forkfactor);
//overhangStack.has(urlhash) ||
remoteStack.has(urlhash);
}
public synchronized Entry newEntry(Entry oldEntry) {
if (oldEntry == null) return null;
return new Entry(
oldEntry.initiator(),
oldEntry.url(),
oldEntry.referrerHash(),
oldEntry.name(),
oldEntry.loaddate(),
oldEntry.profileHandle(),
oldEntry.depth(),
oldEntry.anchors,
oldEntry.forkfactor
);
}
public void push(int stackType, String urlhash) {
public void push(int stackType, plasmaCrawlEntry entry) {
try {
switch (stackType) {
case STACK_TYPE_CORE:
coreStack.push(urlhash);
coreStack.push(entry);
break;
case STACK_TYPE_LIMIT:
limitStack.push(urlhash);
break;
case STACK_TYPE_OVERHANG:
overhangStack.push(urlhash);
limitStack.push(entry);
break;
case STACK_TYPE_REMOTE:
remoteStack.push(urlhash);
break;
case STACK_TYPE_IMAGE:
imageStack.push(imageStack.row().newEntry(new byte[][] {urlhash.getBytes()}));
imageStackIndex.add(urlhash);
break;
case STACK_TYPE_MOVIE:
movieStack.push(movieStack.row().newEntry(new byte[][] {urlhash.getBytes()}));
movieStackIndex.add(urlhash);
break;
case STACK_TYPE_MUSIC:
musicStack.push(musicStack.row().newEntry(new byte[][] {urlhash.getBytes()}));
musicStackIndex.add(urlhash);
remoteStack.push(entry);
break;
default: break;
}
} catch (IOException er) {}
}
public Entry[] top(int stackType, int count) {
public plasmaCrawlEntry get(String urlhash) {
plasmaCrawlEntry entry = null;
try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (IOException e) {}
return null;
}
public plasmaCrawlEntry remove(String urlhash) {
plasmaCrawlEntry entry = null;
try {if ((entry = coreStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = limitStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
try {if ((entry = remoteStack.remove(urlhash)) != null) return entry;} catch (IOException e) {}
return null;
}
public plasmaCrawlEntry[] top(int stackType, int count) {
switch (stackType) {
case STACK_TYPE_CORE: return top(coreStack, count);
case STACK_TYPE_LIMIT: return top(limitStack, count);
case STACK_TYPE_OVERHANG: return top(overhangStack, count);
case STACK_TYPE_REMOTE: return top(remoteStack, count);
case STACK_TYPE_IMAGE: return top(imageStack, count);
case STACK_TYPE_MOVIE: return top(movieStack, count);
case STACK_TYPE_MUSIC: return top(musicStack, count);
default: return null;
}
}
public Iterator iterator(int stackType) {
// returns an iterator of String objects
switch (stackType) {
case STACK_TYPE_CORE: return coreStack.iterator();
case STACK_TYPE_LIMIT: return limitStack.iterator();
case STACK_TYPE_OVERHANG: return overhangStack.iterator();
case STACK_TYPE_REMOTE: return remoteStack.iterator();
case STACK_TYPE_IMAGE: return imageStackIndex.iterator();
case STACK_TYPE_MOVIE: return movieStackIndex.iterator();
case STACK_TYPE_MUSIC: return musicStackIndex.iterator();
default: return null;
}
}
public Entry pop(int stackType) throws IOException {
public plasmaCrawlEntry pop(int stackType) throws IOException {
switch (stackType) {
case STACK_TYPE_CORE: return pop(coreStack);
case STACK_TYPE_LIMIT: return pop(limitStack);
case STACK_TYPE_OVERHANG: return pop(overhangStack);
case STACK_TYPE_REMOTE: return pop(remoteStack);
case STACK_TYPE_IMAGE: return pop(imageStack);
case STACK_TYPE_MOVIE: return pop(movieStack);
case STACK_TYPE_MUSIC: return pop(musicStack);
default: return null;
}
}
public void shift(int fromStack, int toStack) {
try {
Entry entry = pop(fromStack);
push(toStack, entry.hash());
plasmaCrawlEntry entry = pop(fromStack);
if (entry != null) push(toStack, entry);
} catch (IOException e) {
return;
}
@ -374,329 +177,55 @@ public class plasmaCrawlNURL {
switch (stackType) {
case STACK_TYPE_CORE: coreStack.clear(); break;
case STACK_TYPE_LIMIT: limitStack.clear(); break;
case STACK_TYPE_OVERHANG: overhangStack.clear(); break;
case STACK_TYPE_REMOTE: remoteStack.clear(); break;
case STACK_TYPE_IMAGE: imageStack = kelondroStack.reset(imageStack); break;
case STACK_TYPE_MOVIE: movieStack = kelondroStack.reset(movieStack); break;
case STACK_TYPE_MUSIC: musicStack = kelondroStack.reset(musicStack); break;
default: return;
}
}
private Entry pop(kelondroStack stack) throws IOException {
// this is a filo - pop
int s;
Entry entry;
kelondroRow.Entry re;
synchronized (stack) {
while ((s = stack.size()) > 0) {
re = stack.pop();
if (re == null) {
if (s > stack.size()) continue;
stack = kelondroStack.reset(stack); // the stack is not able to shrink
throw new IOException("hash is null, stack cannot shrink; reset of stack (1)");
}
try {
entry = new Entry(new String(re.getColBytes(0)));
} catch (IOException e) {
serverLog.logWarning("NURL", e.getMessage());
if (s > stack.size()) continue;
stack = kelondroStack.reset(stack); // the stack is not able to shrink
throw new IOException("hash is null, stack cannot shrink; reset of stack (2)");
}
imageStackIndex.remove(entry.hash);
movieStackIndex.remove(entry.hash);
musicStackIndex.remove(entry.hash);
return entry;
}
}
throw new IOException("crawl stack is empty");
}
private Entry pop(plasmaCrawlBalancer balancer) throws IOException {
private plasmaCrawlEntry pop(plasmaCrawlBalancer balancer) throws IOException {
// this is a filo - pop
String hash;
int s;
Entry entry;
plasmaCrawlEntry entry;
synchronized (balancer) {
while ((s = balancer.size()) > 0) {
hash = balancer.pop(minimumDelta, maximumDomAge);
if (hash == null) {
if (s > balancer.size()) continue;
balancer.clear(); // the balancer is broken and cannot shrink
throw new IOException("hash is null, balancer cannot shrink; reset of balancer (1)");
}
try {
entry = new Entry(hash);
} catch (IOException e) {
serverLog.logWarning("NURL", e.getMessage());
entry = balancer.pop(minimumDelta, maximumDomAge);
if (entry == null) {
if (s > balancer.size()) continue;
balancer.clear(); // the balancer is broken and cannot shrink
throw new IOException("IO error, balancer cannot shrink: " + e.getMessage() + "; reset of balancer (2)");
throw new IOException("entry is null, balancer cannot shrink; reset of balancer");
}
imageStackIndex.remove(entry.hash);
movieStackIndex.remove(entry.hash);
musicStackIndex.remove(entry.hash);
return entry;
}
}
throw new IOException("balancer stack is empty");
}
private Entry[] top(kelondroStack stack, int count) {
// this is a filo - top
if (count > stack.size()) count = stack.size();
ArrayList list = new ArrayList(count);
for (int i = 0; i < count; i++) {
try {
byte[] hash = stack.top(i).getColBytes(0);
list.add(new Entry(new String(hash)));
} catch (IOException e) {
continue;
}
}
return (Entry[]) list.toArray(new Entry[list.size()]);
}
private Entry[] top(plasmaCrawlBalancer balancer, int count) {
private plasmaCrawlEntry[] top(plasmaCrawlBalancer balancer, int count) {
// this is a filo - top
if (count > balancer.size()) count = balancer.size();
ArrayList list = new ArrayList(count);
for (int i = 0; i < count; i++) {
try {
String urlhash = balancer.top(i);
if (urlhash == null) break;
list.add(new Entry(urlhash));
plasmaCrawlEntry entry = balancer.top(i);
if (entry == null) break;
list.add(entry);
} catch (IOException e) {
break;
}
}
return (Entry[])list.toArray(new Entry[list.size()]);
}
public synchronized Entry getEntry(String hash) throws IOException {
return new Entry(hash);
return (plasmaCrawlEntry[]) list.toArray(new plasmaCrawlEntry[list.size()]);
}
public class Entry {
private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
// if this is generated by a crawl, the own peer hash in entered
private String hash; // the url's hash
private String referrer; // the url's referrer hash
private URL url; // the url as string
private String name; // the name of the url, from anchor tag <a>name</a>
private Date loaddate; // the time when the url was first time appeared
private String profileHandle; // the name of the prefetch profile
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private kelondroBitfield flags;
private int handle;
private boolean stored;
public Entry(String initiator,
URL url,
String referrer,
String name,
Date loaddate,
String profileHandle,
int depth,
int anchors,
int forkfactor
) {
// create new entry and store it into database
this.hash = plasmaURL.urlHash(url);
this.initiator = initiator;
this.url = url;
this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer;
this.name = (name == null) ? "" : name;
this.loaddate = (loaddate == null) ? new Date() : loaddate;
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new kelondroBitfield(rowdef.width(10));
this.handle = 0;
this.stored = false;
}
public Entry(String hash) throws IOException {
// generates an plasmaNURLEntry using the url hash
// to speed up the access, the url-hashes are buffered
// in the hash cache.
// we have two options to find the url:
// - look into the hash cache
// - look into the filed properties
// if the url cannot be found, this returns null
this.hash = hash;
if (hash == null) throw new IOException("hash is null");
kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
if (entry != null) {
insertEntry(entry);
this.stored = true;
return;
} else {
// show that we found nothing
throw new IOException("NURL: hash " + hash + " not found during initialization of entry object");
//this.url = null;
}
}
public Entry(kelondroRow.Entry entry) throws IOException {
assert (entry != null);
insertEntry(entry);
this.stored = false;
}
private void insertEntry(kelondroRow.Entry entry) throws IOException {
String urlstring = entry.getColString(2, null);
if (urlstring == null) throw new IOException ("url string is null");
this.hash = entry.getColString(0, null);
this.initiator = entry.getColString(1, null);
this.url = new URL(urlstring);
this.referrer = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, null);
this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
this.loaddate = new Date(86400000 * entry.getColLong(5));
this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim();
this.depth = (int) entry.getColLong(7);
this.anchors = (int) entry.getColLong(8);
this.forkfactor = (int) entry.getColLong(9);
this.flags = new kelondroBitfield(entry.getColBytes(10));
this.handle = Integer.parseInt(entry.getColString(11, null), 16);
return;
}
public void store() {
// stores the values from the object variables into the database
if (this.stored) return;
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(5));
// store the hash in the hash cache
try {
// even if the entry exists, we simply overwrite it
byte[][] entry = new byte[][] {
this.hash.getBytes(),
(initiator == null) ? "".getBytes() : this.initiator.getBytes(),
this.url.toString().getBytes(),
this.referrer.getBytes(),
this.name.getBytes("UTF-8"),
loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, rowdef.width(9)).getBytes(),
this.flags.bytes(),
normalizeHandle(this.handle).getBytes()
};
if (urlIndexFile == null) System.out.println("urlHashCache is NULL");
if ((urlIndexFile != null) && (urlIndexFile.row() == null)) System.out.println("row() is NULL");
urlIndexFile.put(urlIndexFile.row().newEntry(entry));
this.stored = true;
} catch (IOException e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB");
e.printStackTrace();
resetHashCache();
} catch (kelondroException e) {
serverLog.logSevere("PLASMA", "plasmaCrawlNURL.store failed: " + e.toString() + ", resetting NURL-DB");
e.printStackTrace();
resetHashCache();
}
}
public String toString() {
StringBuffer str = new StringBuffer();
str.append("hash: ").append(hash==null ? "null" : hash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url.toString()).append(" | ")
.append("referrer: ").append((referrer == null) ? plasmaURL.dummyHash : referrer).append(" | ")
.append("name: ").append((name == null) ? "null" : name).append(" | ")
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
.append("depth: ").append(Integer.toString(depth)).append(" | ")
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
.append("flags: ").append((flags==null) ? "null" : flags.exportB64());
return str.toString();
}
/**
* return a url-hash, based on the md5 algorithm
* the result is a String of 12 bytes within a 72-bit space
* (each byte has an 6-bit range)
* that should be enough for all web pages on the world
*/
public String hash() {
return this.hash;
}
public String initiator() {
if (initiator == null) return null;
if (initiator.length() == 0) return null;
return initiator;
}
public boolean proxy() {
return (initiator() == null);
}
public String referrerHash() {
return this.referrer;
}
public URL url() {
return url;
}
public Date loaddate() {
return loaddate;
}
public String name() {
// return the creator's hash
return name;
}
public int depth() {
return depth;
}
public String profileHandle() {
return profileHandle;
}
}
public class kiter implements Iterator {
// enumerates entry elements
Iterator i;
boolean error = false;
public kiter(boolean up, String firstHash) throws IOException {
i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes());
error = false;
}
public boolean hasNext() {
if (error) return false;
return i.hasNext();
}
public Object next() throws RuntimeException {
kelondroRow.Entry e = (kelondroRow.Entry) i.next();
if (e == null) return null;
try {
return new Entry(e);
} catch (IOException ex) {
throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
}
}
public void remove() {
i.remove();
public Iterator iterator(int stackType) {
// returns an iterator of plasmaCrawlBalancerEntry Objects
try {switch (stackType) {
case STACK_TYPE_CORE: return coreStack.iterator();
case STACK_TYPE_LIMIT: return limitStack.iterator();
case STACK_TYPE_REMOTE: return remoteStack.iterator();
default: return null;
}} catch (IOException e) {
return new HashSet().iterator();
}
}
public Iterator entries(boolean up, String firstHash) throws IOException {
// enumerates entry elements
return new kiter(up, firstHash);
}
}

@ -48,7 +48,6 @@ package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.net.InetAddress;
import java.net.MalformedURLException;
import java.util.Date;
@ -61,8 +60,6 @@ import de.anomic.data.robotsParser;
import de.anomic.http.httpc;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroFlexTable;
@ -171,7 +168,7 @@ public final class plasmaCrawlStacker {
try {
// getting a new message from the crawler queue
checkInterruption();
stackCrawlMessage theMsg = this.queue.waitForMessage();
plasmaCrawlEntry theMsg = this.queue.waitForMessage();
if (theMsg != null) {
// getting a free session thread from the pool
@ -196,18 +193,18 @@ public final class plasmaCrawlStacker {
}
public void enqueue(
String nexturlString,
String referrerString,
URL nexturl,
String referrerhash,
String initiatorHash,
String name,
Date loadDate,
int currentdepth,
plasmaCrawlProfile.entry profile) {
if (profile != null) try {
this.queue.addMessage(new stackCrawlMessage(
this.queue.addMessage(new plasmaCrawlEntry(
initiatorHash,
nexturlString,
referrerString,
nexturl,
referrerhash,
name,
loadDate,
profile.handle(),
@ -220,7 +217,7 @@ public final class plasmaCrawlStacker {
}
}
public String dequeue(stackCrawlMessage theMsg) throws InterruptedException {
public String dequeue(plasmaCrawlEntry theMsg) throws InterruptedException {
plasmaCrawlProfile.entry profile = this.sb.profiles.getEntry(theMsg.profileHandle());
if (profile == null) {
@ -231,8 +228,8 @@ public final class plasmaCrawlStacker {
return stackCrawl(
theMsg.url().toString(),
theMsg.referrerHash(),
theMsg.initiatorHash(),
theMsg.referrerhash(),
theMsg.initiator(),
theMsg.name(),
theMsg.loaddate(),
theMsg.depth(),
@ -424,175 +421,23 @@ public final class plasmaCrawlStacker {
// add the url into the crawling queue
checkInterruption();
plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */
plasmaCrawlEntry ne = new plasmaCrawlEntry(initiatorHash, /* initiator, needed for p2p-feedback */
nexturl, /* url clear text string */
loadDate, /* load date */
referrerHash, /* last url in crawling queue */
name, /* the anchor name */
name, /* load date */
loadDate, /* the anchor name */
(profile == null) ? null : profile.handle(), // profile must not be null!
currentdepth, /*depth so far*/
0, /*anchors, default value */
0 /*forkfactor, default value */
);
ne.store();
this.sb.noticeURL.push(
((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT :
((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/,
ne.hash());
ne);
return null;
}
public final class stackCrawlMessage {
private String initiator; // the initiator hash, is NULL or "" if it is the own proxy;
String urlHash; // the url's hash
private String referrerHash; // the url's referrer hash
private String url; // the url as string
String name; // the name of the url, from anchor tag <a>name</a>
private Date loaddate; // the time when the url was first time appeared
private String profileHandle; // the name of the prefetch profile
private int depth; // the prefetch depth so far, starts at 0
private int anchors; // number of anchors of the parent
private int forkfactor; // sum of anchors of all ancestors
private kelondroBitfield flags;
private int handle;
// loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) {
public stackCrawlMessage(
String initiator,
String urlString,
String referrerUrlString,
String name,
Date loaddate,
String profileHandle,
int depth,
int anchors,
int forkfactor) {
try {
// create new entry and store it into database
this.urlHash = plasmaURL.urlHash(urlString);
this.initiator = initiator;
this.url = urlString;
this.referrerHash = (referrerUrlString == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerUrlString);
this.name = (name == null) ? "" : name;
this.loaddate = (loaddate == null) ? new Date() : loaddate;
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new kelondroBitfield();
this.handle = 0;
} catch (Exception e) {
e.printStackTrace();
}
}
public stackCrawlMessage(String urlHash, kelondroRow.Entry entry) {
if (urlHash == null) throw new NullPointerException("Url hash was null");
if (entry == null) throw new NullPointerException("kelondroRow.Entry was null");
try {
this.urlHash = urlHash;
this.initiator = entry.getColString(1, "UTF-8");
this.url = entry.getColString(2, "UTF-8").trim();
this.referrerHash = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, "UTF-8");
this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim();
this.loaddate = new Date(86400000 * entry.getColLong(5));
this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, "UTF-8").trim();
this.depth = (int) entry.getColLong(7);
this.anchors = (int) entry.getColLong(8);
this.forkfactor = (int) entry.getColLong(9);
this.flags = new kelondroBitfield(entry.getColBytes(10));
try {
this.handle = Integer.parseInt(new String(entry.getColBytes(11), "UTF-8"));
} catch (NumberFormatException ee) {
System.out.println("BUG in stackCrawlMessage. entry = " + entry.toString());
throw new RuntimeException(ee.getMessage());
}
} catch (Exception e) {
e.printStackTrace();
throw new IllegalStateException(e.toString());
}
}
public String url() {
return this.url;
}
public String referrerHash() {
return this.referrerHash;
}
public String initiatorHash() {
if (this.initiator == null) return null;
if (this.initiator.length() == 0) return null;
return this.initiator;
}
public Date loaddate() {
return this.loaddate;
}
public String name() {
return this.name;
}
public int depth() {
return this.depth;
}
public String profileHandle() {
return this.profileHandle;
}
public String toString() {
StringBuffer str = new StringBuffer();
str.append("urlHash: ").append(urlHash==null ? "null" : urlHash).append(" | ")
.append("initiator: ").append(initiator==null?"null":initiator).append(" | ")
.append("url: ").append(url==null?"null":url).append(" | ")
.append("referrer: ").append((referrerHash == null) ? plasmaURL.dummyHash : referrerHash).append(" | ")
.append("name: ").append((name == null) ? "null" : name).append(" | ")
.append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ")
.append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ")
.append("depth: ").append(Integer.toString(depth)).append(" | ")
.append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ")
//.append("flags: ").append((flags==null) ? "null" : flags.toString())
;
return str.toString();
}
public byte[][] getBytes() {
// stores the values from the object variables into the database
String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, plasmaCrawlNURL.rowdef.width(5));
// store the hash in the hash cache
// even if the entry exists, we simply overwrite it
byte[][] entry = null;
try {
entry = new byte[][] {
this.urlHash.getBytes(),
(this.initiator == null) ? "".getBytes() : this.initiator.getBytes(),
this.url.getBytes(),
this.referrerHash.getBytes(),
this.name.getBytes("UTF-8"),
loaddatestr.getBytes(),
(this.profileHandle == null) ? null : this.profileHandle.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.depth, plasmaCrawlNURL.rowdef.width(7)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, plasmaCrawlNURL.rowdef.width(8)).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, plasmaCrawlNURL.rowdef.width(9)).getBytes(),
this.flags.bytes(),
normalizeHandle(this.handle).getBytes()
};
} catch (UnsupportedEncodingException e) { /* ignore this */ }
return entry;
}
private String normalizeHandle(int h) {
String d = Integer.toHexString(h);
while (d.length() < plasmaCrawlNURL.rowdef.width(11)) d = "0" + d;
return d;
}
}
final class stackCrawlQueue {
private final serverSemaphore readSync;
@ -657,10 +502,10 @@ public final class plasmaCrawlStacker {
// do nothing..
}
if (this.dbtype == QUEUE_DB_TYPE_FLEX) {
kelondroFlexWidthArray.delete(cacheStacksPath, "urlPreNotice2.table");
kelondroFlexWidthArray.delete(cacheStacksPath, "urlNoticeStacker7.db");
}
if (this.dbtype == QUEUE_DB_TYPE_TREE) {
File cacheFile = new File(cacheStacksPath, "urlPreNotice.db");
File cacheFile = new File(cacheStacksPath, "urlNoticeStacker.db");
cacheFile.delete();
}
}
@ -669,19 +514,19 @@ public final class plasmaCrawlStacker {
if (!(cacheStacksPath.exists())) cacheStacksPath.mkdir(); // make the path
if (this.dbtype == QUEUE_DB_TYPE_RAM) {
this.urlEntryCache = new kelondroRowSet(plasmaCrawlNURL.rowdef, 0);
this.urlEntryCache = new kelondroRowSet(plasmaCrawlEntry.rowdef, 0);
}
if (this.dbtype == QUEUE_DB_TYPE_FLEX) {
String newCacheName = "urlPreNotice2.table";
String newCacheName = "urlNoticeStacker7.db";
cacheStacksPath.mkdirs();
try {
this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlNURL.rowdef), true, false);
this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef), true, false);
} catch (Exception e) {
e.printStackTrace();
// kill DB and try again
kelondroFlexTable.delete(cacheStacksPath, newCacheName);
try {
this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlNURL.rowdef), true, false);
this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef), true, false);
} catch (Exception ee) {
ee.printStackTrace();
System.exit(-1);
@ -689,10 +534,10 @@ public final class plasmaCrawlStacker {
}
}
if (this.dbtype == QUEUE_DB_TYPE_TREE) {
File cacheFile = new File(cacheStacksPath, "urlPreNotice.db");
File cacheFile = new File(cacheStacksPath, "urlNoticeStacker.db");
cacheFile.getParentFile().mkdirs();
try {
this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, preloadTime, plasmaCrawlNURL.rowdef), true, true);
this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, preloadTime, plasmaCrawlEntry.rowdef), true, true);
} catch (IOException e) {
e.printStackTrace();
System.exit(-1);
@ -708,7 +553,7 @@ public final class plasmaCrawlStacker {
this.urlEntryHashCache.clear();
}
public void addMessage(stackCrawlMessage newMessage)
public void addMessage(plasmaCrawlEntry newMessage)
throws InterruptedException, IOException {
if (newMessage == null) throw new NullPointerException();
@ -717,9 +562,9 @@ public final class plasmaCrawlStacker {
boolean insertionDoneSuccessfully = false;
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue = this.urlEntryCache.put(this.urlEntryCache.row().newEntry(newMessage.getBytes()));
kelondroRow.Entry oldValue = this.urlEntryCache.put(newMessage.toRow());
if (oldValue == null) {
insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.urlHash);
insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.urlhash());
}
}
@ -741,7 +586,7 @@ public final class plasmaCrawlStacker {
return this.dbtype;
}
public stackCrawlMessage waitForMessage() throws InterruptedException, IOException {
public plasmaCrawlEntry waitForMessage() throws InterruptedException, IOException {
this.readSync.P();
this.writeSync.P();
@ -759,7 +604,7 @@ public final class plasmaCrawlStacker {
}
if ((urlHash == null) || (entry == null)) return null;
return new stackCrawlMessage(urlHash, entry);
return new plasmaCrawlEntry(entry);
}
}
@ -941,7 +786,7 @@ public final class plasmaCrawlStacker {
private boolean running = false;
private boolean stopped = false;
private boolean done = false;
private stackCrawlMessage theMsg;
private plasmaCrawlEntry theMsg;
public Worker(ThreadGroup theThreadGroup) {
super(theThreadGroup,"stackCrawlThread_created");
@ -963,7 +808,7 @@ public final class plasmaCrawlStacker {
}
}
public synchronized void execute(stackCrawlMessage newMsg) {
public synchronized void execute(plasmaCrawlEntry newMsg) {
this.theMsg = newMsg;
this.done = false;
@ -1020,7 +865,7 @@ public final class plasmaCrawlStacker {
private void execute() throws InterruptedException {
try {
this.setName("stackCrawlThread_" + this.theMsg.url);
this.setName("stackCrawlThread_" + this.theMsg.url());
String rejectReason = dequeue(this.theMsg);
// check for interruption
@ -1028,15 +873,9 @@ public final class plasmaCrawlStacker {
// if the url was rejected we store it into the error URL db
if (rejectReason != null) {
plasmaCrawlEURL.Entry ee = sb.errorURL.newEntry(
new URL(this.theMsg.url()),
this.theMsg.referrerHash(),
this.theMsg.initiatorHash(),
yacyCore.seedDB.mySeed.hash,
this.theMsg.name,
rejectReason,
new kelondroBitfield()
);
plasmaCrawlZURL.Entry ee = sb.errorURL.newEntry(
this.theMsg, yacyCore.seedDB.mySeed.hash, null,
0, rejectReason);
ee.store();
sb.errorURL.stackPushEntry(ee);
}

@ -0,0 +1,274 @@
// plasmaCrawlZURL.java
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 15.03.2007 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.LinkedList;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroFlexTable;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.net.URL;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacySeedDB;
public class plasmaCrawlZURL {

    // The ZURL is a container around a plasmaCrawlEntry plus tracking
    // information about why, when and by whom a crawl for that url failed
    // or was delegated to another peer. Records are persisted in a
    // kelondroFlexTable; a small in-memory stack of url hashes remembers
    // the most recently pushed entries for monitor output.

    // record layout: tracking fields first, the serialized crawl entry last
    public final static kelondroRow rowdef = new kelondroRow(
            "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash
            "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor
            "Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load
            "Cardinal workcount-4 {b256}, " + // number of load retries
            "String anycause-80, " + // string describing load failure
            "byte[] entry-" + plasmaCrawlEntry.rowdef.objectsize(), // extra space
            kelondroBase64Order.enhancedCoder,
            0);

    // the class object
    private kelondroIndex urlIndexFile = null;
    private LinkedList rejectedStack = new LinkedList(); // strings: url hashes of recently pushed entries

    public plasmaCrawlZURL(File cachePath, String tablename) {
        // opens (or creates) the backing table file below cachePath
        cachePath.mkdirs();
        try {
            urlIndexFile = new kelondroFlexTable(cachePath, tablename, -1, rowdef);
        } catch (IOException e) {
            // the index is essential for crawl-state tracking; give up hard
            e.printStackTrace();
            System.exit(-1);
        }
    }

    public int size() {
        // number of stored records; 0 if the index cannot be read
        try {
            return urlIndexFile.size();
        } catch (IOException e) {
            return 0;
        }
    }

    public void close() {
        // releases the backing table; the object must not be used afterwards
        if (urlIndexFile != null) {
            urlIndexFile.close();
            urlIndexFile = null;
        }
    }

    public synchronized Entry newEntry(
            plasmaCrawlEntry bentry, String executor, Date workdate,
            int workcount, String anycause) {
        // creates a tracking entry for the given crawl entry;
        // a missing/short executor defaults to the dummy hash, a
        // missing cause to "unknown"
        if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash;
        if (anycause == null) anycause = "unknown";
        return new Entry(bentry, executor, workdate, workcount, anycause);
    }

    public synchronized Entry newEntry(URL url, String anycause) {
        // convenience factory: wraps the url into a fresh plasmaCrawlEntry
        return new Entry(url, anycause);
    }

    public boolean remove(String hash) {
        // deletes the record for the given url hash; returns false on error
        if (hash == null) return false;
        try {
            urlIndexFile.remove(hash.getBytes());
            return true;
        } catch (IOException e) {
            return false;
        }
    }

    public synchronized void stackPushEntry(Entry e) {
        // remembers the entry's hash for later monitor display
        rejectedStack.add(e.hash());
    }

    public Entry stackPopEntry(int pos) throws IOException {
        // fetches the entry referenced at the given stack position;
        // returns null if the position is out of range or the record
        // is no longer present in the index
        if ((pos < 0) || (pos >= rejectedStack.size())) return null; // fix: avoid IndexOutOfBoundsException on stale positions
        String urlhash = (String) rejectedStack.get(pos);
        if (urlhash == null) return null;
        return getEntry(urlhash);
    }

    public synchronized Entry getEntry(String hash) throws IOException {
        // fix: return null for unknown hashes instead of an Entry whose
        // fields (including bentry) are all null, which would NPE later
        kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
        if (entry == null) return null;
        Entry e = new Entry(entry);
        e.stored = true; // the record was just read from the index; no need to write it back
        return e;
    }

    public boolean getUseNewDB() {
        // true when the (new) FlexTable backend is in use
        return (urlIndexFile instanceof kelondroFlexTable);
    }

    public boolean exists(String urlHash) {
        // membership test on the persistent index; false on read errors
        try {
            return urlIndexFile.has(urlHash.getBytes());
        } catch (IOException e) {
            return false;
        }
    }

    public void clearStack() {
        rejectedStack.clear();
    }

    public int stackSize() {
        return rejectedStack.size();
    }

    public class Entry {

        plasmaCrawlEntry bentry;  // the balancer entry
        private String executor;  // the crawling initiator
        private Date workdate;    // the time when the url was last time tried to load
        private int workcount;    // number of tryings
        private String anycause;  // string describing reason for load fail
        private boolean stored;   // true when this record is already persisted

        public Entry(URL url, String reason) {
            this(new plasmaCrawlEntry(url), null, new Date(), 0, reason);
        }

        public Entry(
                plasmaCrawlEntry bentry, String executor, Date workdate,
                int workcount, String anycause) {
            // create new entry; null executor/workdate/anycause get defaults
            this.bentry = bentry;
            this.executor = (executor == null) ? yacyCore.seedDB.mySeed.hash : executor;
            this.workdate = (workdate == null) ? new Date() : workdate;
            this.workcount = workcount;
            this.anycause = (anycause == null) ? "" : anycause;
            stored = false;
        }

        public Entry(String hash) throws IOException {
            // NOTE(review): if the hash is unknown all fields stay null;
            // prefer plasmaCrawlZURL.getEntry(hash), which returns null instead
            kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes());
            if (entry != null) {
                insertEntry(entry);
            }
            this.stored = true;
        }

        public Entry(kelondroRow.Entry entry) throws IOException {
            insertEntry(entry);
            this.stored = false;
        }

        private void insertEntry(kelondroRow.Entry entry) throws IOException {
            // decodes a raw table row into the object fields
            assert (entry != null);
            this.executor = entry.getColString(1, "UTF-8");
            this.workdate = new Date(entry.getColLong(2));
            this.workcount = (int) entry.getColLong(3);
            this.anycause = entry.getColString(4, "UTF-8");
            this.bentry = new plasmaCrawlEntry(plasmaCrawlEntry.rowdef.newEntry(entry.getColBytes(5)));
            assert ((new String(entry.getColBytes(0))).equals(bentry.urlhash()));
            return;
        }

        public void store() {
            // stores the values from the object variables into the database
            if (this.stored) return;
            if (this.bentry == null) return;
            kelondroRow.Entry newrow = rowdef.newEntry();
            newrow.setCol(0, this.bentry.urlhash().getBytes());
            newrow.setCol(1, this.executor.getBytes());
            newrow.setCol(2, this.workdate.getTime());
            newrow.setCol(3, this.workcount);
            newrow.setCol(4, this.anycause.getBytes());
            newrow.setCol(5, this.bentry.toRow().bytes());
            try {
                urlIndexFile.put(newrow);
                this.stored = true;
            } catch (IOException e) {
                // fix: message previously named "plasmaEURL:url2hash" (copy-paste residue)
                System.out.println("INTERNAL ERROR AT plasmaCrawlZURL:store:" + e.toString());
            }
        }

        public URL url() {
            return this.bentry.url();
        }

        public String initiator() {
            return this.bentry.initiator();
        }

        public String hash() {
            // return a url-hash, based on the md5 algorithm
            // the result is a String of 12 bytes within a 72-bit space
            // (each byte has an 6-bit range)
            // that should be enough for all web pages on the world
            return this.bentry.urlhash();
        }

        public Date workdate() {
            return workdate;
        }

        public String executor() {
            // return the creator's hash
            return executor;
        }

        public String anycause() {
            return anycause;
        }
    }

    public class kiter implements Iterator {
        // enumerates entry elements
        Iterator i;
        boolean error = false;

        public kiter(boolean up, String firstHash) throws IOException {
            i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes());
            error = false;
        }

        public boolean hasNext() {
            if (error) return false;
            return i.hasNext();
        }

        public Object next() throws RuntimeException {
            // wraps the next raw row into an Entry; decoding failures are
            // rethrown unchecked because Iterator.next cannot throw IOException
            kelondroRow.Entry e = (kelondroRow.Entry) i.next();
            if (e == null) return null;
            try {
                return new Entry(e);
            } catch (IOException ex) {
                throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null));
            }
        }

        public void remove() {
            i.remove();
        }
    }

    public Iterator entries(boolean up, String firstHash) throws IOException {
        // enumerates entry elements
        return new kiter(up, firstHash);
    }
}

@ -454,7 +454,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
if (rcLocal == null) return;
plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis());
preorder.remove(true, true);
if (preorder.filteredCount()> query.wantedResults) preorder.remove(true, true);
// start url-fetch
indexRWIEntryNew entry;

@ -512,6 +512,7 @@ public class plasmaSnippetCache {
maxLength = maxLength - result.length();
if (maxLength < 20) maxLength = 20;
tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength);
if (tsr == null) return null;
String nextSnippet = (String) tsr[0];
if (nextSnippet == null) return tsr;
return new Object[]{result + (" / " + nextSnippet), tsr[1]};

@ -209,7 +209,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public File workPath;
public HashMap rankingPermissions;
public plasmaCrawlNURL noticeURL;
public plasmaCrawlEURL errorURL;
public plasmaCrawlZURL errorURL, delegatedURL;
public plasmaWordIndex wordIndex;
public plasmaHTCache cacheManager;
public plasmaSnippetCache snippetCache;
@ -1038,8 +1038,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// start indexing management
log.logConfig("Starting Indexing Management");
noticeURL = new plasmaCrawlNURL(plasmaPath, -1);
errorURL = new plasmaCrawlEURL(plasmaPath, -1);
noticeURL = new plasmaCrawlNURL(plasmaPath);
errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db");
delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db");
wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log);
// set a high maximum cache size to current size; this is adopted later automatically
@ -1330,19 +1331,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// if it not exists, null is returned
if (wordIndex.loadedURL.exists(hash)) return "loaded";
if (noticeURL.existsInStack(hash)) return "crawler";
if (delegatedURL.exists(hash)) return "delegated";
if (errorURL.exists(hash)) return "errors";
return null;
}
public URL getURL(String urlhash) throws IOException {
if (urlhash.equals(plasmaURL.dummyHash)) return null;
try {
plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash);
if (ne != null) return ne.url();
} catch (IOException e) {}
plasmaCrawlEntry ne = noticeURL.get(urlhash);
if (ne != null) return ne.url();
indexURLEntry le = wordIndex.loadedURL.load(urlhash, null);
if (le != null) return le.comp().url();
plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash);
plasmaCrawlZURL.Entry ee = delegatedURL.getEntry(urlhash);
if (ee != null) return ee.url();
ee = errorURL.getEntry(urlhash);
if (ee != null) return ee.url();
return null;
}
@ -1602,6 +1604,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
flushCitationReference(crg, "crg");
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
noticeURL.close();
delegatedURL.close();
errorURL.close();
wordIndex.close();
yc.close();
@ -1739,6 +1742,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public int cleanupJobSize() {
int c = 0;
if ((delegatedURL.stackSize() > 1000)) c++;
if ((errorURL.stackSize() > 1000)) c++;
for (int i = 1; i <= 6; i++) {
if (wordIndex.loadedURL.getStackSize(i) > 1000) c++;
@ -1758,6 +1762,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
rankingOwnDistribution.transferRanking(count);
rankingOtherDistribution.transferRanking(1);
// clean up delegated stack
checkInterruption();
if ((delegatedURL.stackSize() > 1000)) {
log.logFine("Cleaning Delegated-URLs report stack, " + delegatedURL.stackSize() + " entries on stack");
delegatedURL.clearStack();
hasDoneSomething = true;
}
// clean up error stack
checkInterruption();
if ((errorURL.stackSize() > 1000)) {
@ -1765,6 +1777,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
errorURL.clearStack();
hasDoneSomething = true;
}
// clean up loadedURL stack
for (int i = 1; i <= 6; i++) {
checkInterruption();
@ -1774,6 +1787,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
hasDoneSomething = true;
}
}
// clean up profiles
checkInterruption();
if (cleanProfiles()) hasDoneSomething = true;
@ -1883,7 +1897,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// do a local crawl
plasmaCrawlNURL.Entry urlEntry = null;
plasmaCrawlEntry urlEntry = null;
while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) {
String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
@ -1953,7 +1967,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url());
@ -2040,7 +2054,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", "
+ noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]";
try {
plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE);
String profileHandle = urlEntry.profileHandle();
// System.out.println("DEBUG plasmaSwitchboard.processCrawling:
// profileHandle = " + profileHandle + ", urlEntry.url = " +
@ -2155,6 +2169,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
Map hl = document.getHyperlinks();
Iterator i = hl.entrySet().iterator();
String nextUrlString;
URL nextUrl;
Map.Entry nextEntry;
while (i.hasNext()) {
// check for interruption
@ -2164,10 +2179,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
nextEntry = (Map.Entry) i.next();
nextUrlString = (String) nextEntry.getKey();
try {
nextUrlString = new URL(nextUrlString).toNormalform();
nextUrl = new URL(nextUrlString);
// enqueue the hyperlink into the pre-notice-url db
sbStackCrawlThread.enqueue(nextUrlString, entry.url().toString(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
sbStackCrawlThread.enqueue(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
} catch (MalformedURLException e1) {}
}
log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() +
@ -2447,11 +2462,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
// removing current entry from notice URL queue
/*
boolean removed = noticeURL.remove(entry.urlHash()); // worked-off
if (!removed) {
log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect.");
}
*/
// explicit delete/free resources
if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) {
plasmaHTCache.filesInUse.remove(entry.cacheFile());
@ -2540,7 +2557,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
}
private void processLocalCrawling(plasmaCrawlNURL.Entry urlEntry, plasmaCrawlProfile.entry profile, String stats) {
private void processLocalCrawling(plasmaCrawlEntry urlEntry, plasmaCrawlProfile.entry profile, String stats) {
// work off one Crawl stack entry
if ((urlEntry == null) || (urlEntry.url() == null)) {
log.logInfo(stats + ": urlEntry=null");
@ -2549,114 +2566,117 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// convert the referrer hash into the corresponding URL
URL refererURL = null;
String refererHash = urlEntry.referrerHash();
String refererHash = urlEntry.referrerhash();
if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try {
refererURL = this.getURL(refererHash);
} catch (IOException e) {
refererURL = null;
}
cacheLoader.loadAsync(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile);
log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]");
log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.urlhash() + "]");
return;
}
private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.Entry urlEntry) {
private boolean processRemoteCrawlTrigger(plasmaCrawlEntry urlEntry) {
// if this returns true, then the urlEntry is considered as stored somewhere and the case is finished
// if this returns false, the urlEntry will be enqueued to the local crawl again
// return true iff another peer has/will index(ed) the url
// wrong access
if (urlEntry == null) {
log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null");
return true; // superfluous request; true correct in this context
}
// are we qualified?
if ((yacyCore.seedDB.mySeed == null) ||
(yacyCore.seedDB.mySeed.isJunior())) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
return false;
return true; // superfluous request; true correct in this context because the urlEntry shall not be tracked any more
}
// check url
if (urlEntry.url() == null) {
log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name());
return true;
return true; // same case as above: no more consideration
}
String urlhash = plasmaURL.urlHash(urlEntry.url());
// check remote crawl
yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlhash);
// are we qualified for a remote crawl?
if ((yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.isJunior())) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission");
return false; // no, we must crawl this page ourselves
}
// check if peer for remote crawl is available
yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlEntry.urlhash());
if (remoteSeed == null) {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available");
return false;
}
// do the request
HashMap page = null;
try {
HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerHash()), 6000);
page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerhash()), 6000);
} catch (IOException e1) {
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerhash(), e1);
return false;
}
// check success
/*
* the result of the 'response' value can have one of the following
* values: negative cases, no retry denied - the peer does not want
* to crawl that exception - an exception occurred
*
* negative case, retry possible rejected - the peer has rejected to
* process, but a re-try should be possible
*
* positive case with crawling stacked - the resource is processed
* asap
*
* positive case without crawling double - the resource is already
* in database, believed to be fresh and not reloaded the resource
* is also returned in lurl
*/
if ((page == null) || (page.get("delay") == null)) {
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer.");
if (remoteSeed != null) {
yacyCore.peerActions.peerDeparture(remoteSeed);
}
return false;
} else
// check if we got contact to peer and the peer respondet
if ((page == null) || (page.get("delay") == null)) {
log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer.");
yacyCore.peerActions.peerDeparture(remoteSeed);
return false; // no response from peer, we will crawl this ourself
}
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed="
+ remoteSeed.getName() + ", url=" + urlEntry.url().toString()
+ ", response=" + page.toString()); // DEBUG
// we received an answer and we are told to wait a specific time until we shall ask again for another crawl
int newdelay = Integer.parseInt((String) page.get("delay"));
yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
String response = (String) page.get("response");
if (response.equals("stacked")) {
// success, the remote peer accepted the crawl
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " PLACED URL=" + urlEntry.url().toString()
+ "; NEW DELAY=" + newdelay);
// track this remote crawl
this.delegatedURL.newEntry(urlEntry, remoteSeed.hash, new Date(), 0, response).store();
return true;
}
// check other cases: the remote peer may respond that it already knows that url
if (response.equals("double")) {
// in case the peer answers double, it transmits the complete lurl data
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr);
try {
log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG
int newdelay = Integer.parseInt((String) page.get("delay"));
yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay);
String response = (String) page.get("response");
if (response.equals("stacked")) {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay);
return true;
} else if (response.equals("double")) {
String lurl = (String) page.get("lurl");
if ((lurl != null) && (lurl.length() != 0)) {
String propStr = crypt.simpleDecode(lurl, (String) page.get("key"));
indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr);
wordIndex.loadedURL.store(entry);
wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
noticeURL.remove(entry.hash());
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")");
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString());
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
} catch (Exception e) {
// wrong values
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e);
return false;
wordIndex.loadedURL.store(entry);
wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt?
// noticeURL.remove(entry.hash());
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
} catch (IOException e) {
log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerHash(), e);
return false;
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " SUPERFLUOUS. CAUSE: " + page.get("reason")
+ " (URL=" + urlEntry.url().toString()
+ "). URL IS CONSIDERED AS 'LOADED!'");
return true;
} else {
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " REJECTED. CAUSE: bad lurl response / " + page.get("reason") + " (URL="
+ urlEntry.url().toString() + ")");
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
}
log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName()
+ " DENIED. RESPONSE=" + response + ", CAUSE="
+ page.get("reason") + ", URL=" + urlEntry.url().toString());
remoteSeed.setFlagAcceptRemoteCrawl(false);
yacyCore.seedDB.update(remoteSeed.hash, remoteSeed);
return false;
}
private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");
@ -3165,20 +3185,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
kelondroBitfield flags
) {
// create a new errorURL DB entry
plasmaCrawlEURL.Entry ee = this.errorURL.newEntry(
url,
referrerHash,
initiator,
yacyCore.seedDB.mySeed.hash,
(name==null)?"":name,
failreason,
flags
);
plasmaCrawlEntry bentry = new plasmaCrawlEntry(
initiator,
url,
referrerHash,
(name == null) ? "" : name,
new Date(),
null,
0,
0,
0);
plasmaCrawlZURL.Entry ee = this.errorURL.newEntry(
bentry, initiator, new Date(),
0, failreason);
// store the entry
ee.store();
// push it onto the stack
this.errorURL.stackPushEntry(ee);
}
}
public void checkInterruption() throws InterruptedException {
Thread curThread = Thread.currentThread();

@ -253,20 +253,7 @@ public class plasmaSwitchboardQueue {
this.contentInfo = null;
this.referrerURL = null;
}
public String toString() {
StringBuffer str = new StringBuffer();
str.append("url: ") .append(this.url==null ? "null" : this.url.toString()).append(" | ")
.append("referrer: ") .append(this.referrerHash==null?"null":this.referrerHash).append(" | ")
.append("ifModifiedSince: ").append(this.ifModifiedSince==null?"null":this.ifModifiedSince.toString()).append(" | ")
.append("flags: ") .append(Byte.toString(this.flags)).append(" | ")
.append("initiator: ") .append(this.initiator==null ? "null" : this.initiator).append(" | ")
.append("depth: ") .append(Integer.toString(this.depth)).append(" | ")
.append("profile: ") .append(this.profileHandle==null?"null":this.profileHandle).append(" | ")
.append("anchorName: ") .append(this.anchorName==null?"null":this.anchorName);
return str.toString();
}
public URL url() {
return url;
}

@ -84,9 +84,10 @@ import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroTree;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlEURL;
import de.anomic.plasma.plasmaCrawlEntry;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaCrawlNURL;
import de.anomic.plasma.plasmaCrawlZURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.plasma.plasmaWordIndexAssortment;
@ -1011,11 +1012,11 @@ public final class yacy {
}
if (source.equals("eurl")) {
Iterator eiter = sb.errorURL.entries(true, null);
plasmaCrawlEURL.Entry entry;
plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlEURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.failreason());
entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause());
} catch (Exception e) {
// here a MalformedURLException may occur
// just ignore
@ -1029,11 +1030,11 @@ public final class yacy {
}
}
if (source.equals("nurl")) {
Iterator eiter = sb.noticeURL.entries(true, null);
plasmaCrawlNURL.Entry entry;
Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (eiter.hasNext()) {
try {
entry = (plasmaCrawlNURL.Entry) eiter.next();
entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth());
} catch (Exception e) {
// here a MalformedURLException may occur
@ -1120,12 +1121,12 @@ public final class yacy {
}
if (source.equals("eurl")) {
Iterator eiter = sb.errorURL.entries(true, null);
plasmaCrawlEURL.Entry entry;
plasmaCrawlZURL.Entry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlEURL.Entry) eiter.next();
entry = (plasmaCrawlZURL.Entry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + entry.failreason() + "<br>").getBytes("UTF-8"));
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + entry.anycause() + "<br>").getBytes("UTF-8"));
bos.write(serverCore.crlf);
} else {
bos.write(entry.url().toString().getBytes());
@ -1135,10 +1136,10 @@ public final class yacy {
}
}
if (source.equals("nurl")) {
Iterator eiter = sb.noticeURL.entries(true, null);
plasmaCrawlNURL.Entry entry;
Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE);
plasmaCrawlEntry entry;
while (eiter.hasNext()) {
entry = (plasmaCrawlNURL.Entry) eiter.next();
entry = (plasmaCrawlEntry) eiter.next();
if ((entry != null) && (entry.url() != null)) {
if (html) {
bos.write(("<a href=\"" + entry.url() + "\">" + entry.url() + "</a> " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "<br>").getBytes("UTF-8"));

Loading…
Cancel
Save