diff --git a/htroot/CrawlURLFetchStack_p.java b/htroot/CrawlURLFetchStack_p.java index d612320e4..9e612c736 100644 --- a/htroot/CrawlURLFetchStack_p.java +++ b/htroot/CrawlURLFetchStack_p.java @@ -56,6 +56,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.urlPattern.plasmaURLPattern; @@ -272,7 +273,7 @@ public class CrawlURLFetchStack_p { } private static int shiftFromNotice(plasmaCrawlNURL nurl, int fromStackType, URLFetcherStack stack, int count) { - plasmaCrawlNURL.Entry entry; + plasmaCrawlEntry entry; int failed = 0; for (int i=0; i= (switchboard.errorURL.stackSize() - showRejectedCount); i--) { @@ -202,7 +202,7 @@ public class IndexCreateIndexingQueue_p { prop.put("rejected_list_"+j+"_initiator", ((initiatorSeed == null) ? "proxy" : wikiCode.replaceHTML(initiatorSeed.getName()))); prop.put("rejected_list_"+j+"_executor", ((executorSeed == null) ? "proxy" : wikiCode.replaceHTML(executorSeed.getName()))); prop.put("rejected_list_"+j+"_url", wikiCode.replaceHTML(url.toString())); - prop.put("rejected_list_"+j+"_failreason", entry.failreason()); + prop.put("rejected_list_"+j+"_failreason", entry.anycause()); prop.put("rejected_list_"+j+"_dark", ((dark) ? 1 : 0)); dark = !dark; j++; diff --git a/htroot/IndexCreateWWWGlobalQueue_p.java b/htroot/IndexCreateWWWGlobalQueue_p.java index 3c8767480..b1ebfb634 100644 --- a/htroot/IndexCreateWWWGlobalQueue_p.java +++ b/htroot/IndexCreateWWWGlobalQueue_p.java @@ -49,6 +49,7 @@ import java.util.Locale; import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaSwitchboard; @@ -99,9 +100,9 @@ public class IndexCreateWWWGlobalQueue_p { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); - plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit); + plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_LIMIT, showLimit); prop.put("crawler-queue_num", stackSize);//num Entries - plasmaCrawlNURL.Entry urle; + plasmaCrawlEntry urle; boolean dark = true; yacySeed initiator; String profileHandle; diff --git a/htroot/IndexCreateWWWLocalQueue_p.java b/htroot/IndexCreateWWWLocalQueue_p.java index 607d0bc37..176d208e5 100644 --- a/htroot/IndexCreateWWWLocalQueue_p.java +++ b/htroot/IndexCreateWWWLocalQueue_p.java @@ -43,7 +43,6 @@ // javac -classpath .:../classes IndexCreate_p.java // if the shell's current path is HTROOT -import java.io.IOException; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Iterator; @@ -54,10 +53,10 @@ import java.util.regex.PatternSyntaxException; import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaCrawlNURL.Entry; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyCore; @@ -101,15 +100,11 @@ public class IndexCreateWWWLocalQueue_p { // iterating through the list of URLs Iterator iter = switchboard.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); + plasmaCrawlEntry entry; while (iter.hasNext()) { + entry = (plasmaCrawlEntry) iter.next(); String value = null; - String nextHash = (String) iter.next(); - Entry entry = null; - try { - entry = switchboard.noticeURL.getEntry(nextHash); - } catch (IOException e) { - continue; - } + String nextHash = entry.urlhash(); if ((option.equals("URL")&&(entry.url() != null))) { value = entry.url().toString(); } else if ((option.equals("AnchorName"))) { @@ -162,9 +157,9 @@ public class IndexCreateWWWLocalQueue_p { prop.put("crawler-queue", 0); } else { prop.put("crawler-queue", 1); - plasmaCrawlNURL.Entry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); + plasmaCrawlEntry[] crawlerList = switchboard.noticeURL.top(plasmaCrawlNURL.STACK_TYPE_CORE, (int) (showLimit * 1.20)); - plasmaCrawlNURL.Entry urle; + plasmaCrawlEntry urle; boolean dark = true; yacySeed initiator; String profileHandle; @@ -183,7 +178,7 @@ public class IndexCreateWWWLocalQueue_p { prop.put("crawler-queue_list_"+showNum+"_modified", daydate(urle.loaddate()) ); prop.put("crawler-queue_list_"+showNum+"_anchor", wikiCode.replaceHTML(urle.name())); prop.put("crawler-queue_list_"+showNum+"_url", wikiCode.replaceHTML(urle.url().toString())); - prop.put("crawler-queue_list_"+showNum+"_hash", urle.hash()); + prop.put("crawler-queue_list_"+showNum+"_hash", urle.urlhash()); dark = !dark; showNum++; } else { diff --git a/htroot/IndexCreate_p.html b/htroot/IndexCreate_p.html index 1df797804..edcd14977 100644 --- a/htroot/IndexCreate_p.html +++ b/htroot/IndexCreate_p.html @@ -27,11 +27,6 @@ Starting Point: - - - - - @@ -41,7 +36,12 @@ - + + + + + +
From File:
From URL:
From File:

@@ -125,7 +125,7 @@ - Store to Proxy Cache: + Store to Web Cache: This option is used by default for proxy prefetch, but is not needed for explicit crawling. @@ -194,9 +194,9 @@ Wanted Performance: - maximum   - custom: PPM   - optimal as background process + maximum   + custom: PPM   + optimal as background process Set wanted level of computing power, used for this and other running crawl tasks. (PPM = pages per minute) diff --git a/htroot/Messages_p.java b/htroot/Messages_p.java index 35099aac7..8420da73a 100644 --- a/htroot/Messages_p.java +++ b/htroot/Messages_p.java @@ -53,7 +53,6 @@ import java.util.Iterator; import java.util.TreeMap; import de.anomic.data.messageBoard; -import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; import de.anomic.http.httpc; import de.anomic.plasma.plasmaSwitchboard; diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java index af43ccf88..0d9466930 100644 --- a/htroot/WatchCrawler_p.java +++ b/htroot/WatchCrawler_p.java @@ -39,10 +39,9 @@ import de.anomic.data.wikiCode; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterWriter; import de.anomic.http.httpHeader; -import de.anomic.kelondro.kelondroBitfield; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlEURL; import de.anomic.plasma.plasmaCrawlProfile; +import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; import de.anomic.server.serverFileUtils; @@ -222,8 +221,7 @@ public class WatchCrawler_p { prop.put("info_crawlingURL", wikiCode.replaceHTML(((String) post.get("crawlingURL")))); prop.put("info_reasonString", reasonString); - plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - crawlingStartURL.getHost(), reasonString, new kelondroBitfield()); + plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(crawlingStartURL, reasonString); ee.store(); switchboard.errorURL.stackPushEntry(ee); } @@ -300,8 +298,7 @@ public class WatchCrawler_p { if (rejectReason == null) { c++; } else { - plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, null, yacyCore.seedDB.mySeed.hash, yacyCore.seedDB.mySeed.hash, - (String) e.getValue(), rejectReason, new kelondroBitfield()); + plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(nexturlURL, rejectReason); ee.store(); switchboard.errorURL.stackPushEntry(ee); } @@ -401,9 +398,10 @@ public class WatchCrawler_p { private static void setPerformance(plasmaSwitchboard sb, serverObjects post) { String crawlingPerformance = post.get("crawlingPerformance","custom"); - int wantedPPM = 1000; + long LCbusySleep = Integer.parseInt(sb.getConfig(plasmaSwitchboard.CRAWLJOB_LOCAL_CRAWL_BUSYSLEEP, "100")); + int wantedPPM = (int) (60000L / LCbusySleep); try { - wantedPPM = Integer.parseInt(post.get("customPPM","1000")); + wantedPPM = Integer.parseInt(post.get("customPPM",Integer.toString(wantedPPM))); } catch (NumberFormatException e) {} if (crawlingPerformance.equals("minimum")) wantedPPM = 10; if (crawlingPerformance.equals("maximum")) wantedPPM = 1000; diff --git a/htroot/xml/queues_p.java b/htroot/xml/queues_p.java index 8800ca63f..12b3f3b63 100644 --- a/htroot/xml/queues_p.java +++ b/htroot/xml/queues_p.java @@ -54,6 +54,7 @@ import java.util.Locale; import de.anomic.data.wikiCode; import de.anomic.http.httpHeader; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaSwitchboard; @@ -183,10 +184,10 @@ public class queues_p { } - public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlNURL.Entry[] crawlerList) { + public static final void addNTable(serverObjects prop, String tableName, plasmaCrawlEntry[] crawlerList) { int showNum = 0; - plasmaCrawlNURL.Entry urle; + plasmaCrawlEntry urle; yacySeed initiator; for (int i = 0; i < crawlerList.length; i++) { urle = crawlerList[i]; @@ -198,7 +199,7 @@ public class queues_p { prop.put(tableName + "_" + showNum + "_modified", daydate(urle.loaddate())); prop.putSafeXML(tableName + "_" + showNum + "_anchor", urle.name()); prop.putSafeXML(tableName + "_" + showNum + "_url", urle.url().toString()); - prop.put(tableName + "_" + showNum + "_hash", urle.hash()); + prop.put(tableName + "_" + showNum + "_hash", urle.urlhash()); showNum++; } } diff --git a/htroot/xml/snippet.java b/htroot/xml/snippet.java index 22f5b3dd8..a70858d05 100644 --- a/htroot/xml/snippet.java +++ b/htroot/xml/snippet.java @@ -66,7 +66,7 @@ public class snippet { prop.putASIS("text", (snippet.exists()) ? snippet.getLineMarked(queryHashes) : "unknown"); //FIXME: the ASIS should not be needed, but we have still htmlcode in .java files } else { // problems with snippet fetch - prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, query) : snippet.getError()); + prop.put("text", (remove) ? switchboard.snippetCache.failConsequences(snippet, queryHashes) : snippet.getError()); } prop.put("link", 0); prop.put("links", 0); diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index bc91d9dae..9567e78c0 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -49,11 +49,8 @@ import java.io.IOException; import de.anomic.http.httpHeader; -import de.anomic.plasma.plasmaURL; import de.anomic.index.indexURLEntry; -import de.anomic.kelondro.kelondroBitfield; -import de.anomic.plasma.plasmaCrawlEURL; -import de.anomic.plasma.plasmaCrawlNURL; +import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -85,7 +82,7 @@ public final class crawlReceipt { String youare = post.get("youare", ""); // seed hash of the target peer, needed for network stability //String process = post.get("process", ""); // process type String key = post.get("key", ""); // transmission key - String receivedUrlhash = post.get("urlhash", ""); // the url hash that has been crawled + //String receivedUrlhash = post.get("urlhash", ""); // the url hash that has been crawled String result = post.get("result", ""); // the result; either "ok" or "fail" String reason = post.get("reason", ""); // the reason for that result //String words = post.get("wordh", ""); // priority word hashes @@ -114,60 +111,60 @@ public final class crawlReceipt { final yacySeed otherPeer = yacyCore.seedDB.get(iam); final String otherPeerName = iam + ":" + ((otherPeer == null) ? "NULL" : (otherPeer.getName() + "/" + otherPeer.getVersion())); - - + if ((yacyCore.seedDB.mySeed == null) || (!(yacyCore.seedDB.mySeed.hash.equals(youare)))) { // no yacy connection / unknown peers prop.putASIS("delay", "3600"); - } else if (propStr == null) { + return prop; + } + + if (propStr == null) { // error with url / wrong key prop.putASIS("delay", "3600"); - } else if (result.equals("fill")) { - // generating a new loaded URL entry - indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr); - if (entry == null) { - log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) for hash " + receivedUrlhash + " from peer " + iam + - "\n\tURL properties: "+ propStr); - } else { - indexURLEntry.Components comp = entry.comp(); - if (comp.url() == null) { - log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + receivedUrlhash + " from peer " + iam + - "\n\tURL properties: "+ propStr); - } else try { - // put new entry into database - switchboard.wordIndex.loadedURL.store(entry); - switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1); - - // generating url hash - String newUrlHash = plasmaURL.urlHash(comp.url()); - String oldUrlHash = plasmaURL.oldurlHash(comp.url()); - - // removing URL from notice URL - switchboard.noticeURL.remove(newUrlHash); - switchboard.noticeURL.remove(oldUrlHash); - - log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + receivedUrlhash + ":" + comp.url().toNormalform()); - } catch (IOException e) { - e.printStackTrace(); - } - } + return prop; + } + + // generating a new loaded URL entry + indexURLEntry entry = switchboard.wordIndex.loadedURL.newEntry(propStr); + if (entry == null) { + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); + prop.putASIS("delay", "3600"); + return prop; + } + + indexURLEntry.Components comp = entry.comp(); + if (comp.url() == null) { + log.logWarning("crawlReceipt: RECEIVED wrong RECEIPT (url null) for hash " + entry.hash() + " from peer " + iam + "\n\tURL properties: "+ propStr); + prop.putASIS("delay", "3600"); + return prop; + } + + if (result.equals("fill")) try { + // put new entry into database + switchboard.wordIndex.loadedURL.store(entry); + switchboard.wordIndex.loadedURL.stack(entry, youare, iam, 1); + switchboard.delegatedURL.remove(entry.hash()); // the delegated work has been done + log.logInfo("crawlReceipt: RECEIVED RECEIPT from " + otherPeerName + " for URL " + entry.hash() + ":" + comp.url().toNormalform()); + // ready for more prop.putASIS("delay", "10"); - } else { - try { - plasmaCrawlNURL.Entry en = switchboard.noticeURL.getEntry(receivedUrlhash); - plasmaCrawlEURL.Entry ee = switchboard.errorURL.newEntry(en.url(), en.referrerHash(), en.initiator(), iam, en.name(), result + ":" + reason, new kelondroBitfield()); - ee.store(); - switchboard.errorURL.stackPushEntry(ee); - switchboard.noticeURL.remove(receivedUrlhash); - } catch (IOException e) { - - } - prop.putASIS("delay", "100"); // what shall we do with that??? + return prop; + } catch (IOException e) { + e.printStackTrace(); + prop.putASIS("delay", "3600"); + return prop; } + + switchboard.delegatedURL.remove(entry.hash()); // the delegated work is transformed into an error case + plasmaCrawlZURL.Entry ee = switchboard.errorURL.newEntry(entry.toBalancerEntry(), youare, null, 0, result + ":" + reason); + ee.store(); + switchboard.errorURL.stackPushEntry(ee); + //switchboard.noticeURL.remove(receivedUrlhash); + prop.putASIS("delay", "3600"); + return prop; + + // return rewrite properties - // return rewrite properties - return prop; } } diff --git a/source/de/anomic/index/indexURLEntry.java b/source/de/anomic/index/indexURLEntry.java index 07c9e59ba..341e0feeb 100644 --- a/source/de/anomic/index/indexURLEntry.java +++ b/source/de/anomic/index/indexURLEntry.java @@ -34,6 +34,7 @@ import java.util.Date; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroRow; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.index.indexRWIEntry; public interface indexURLEntry { @@ -60,6 +61,7 @@ public interface indexURLEntry { public indexRWIEntry word(); public boolean isOlder(indexURLEntry other); public String toString(String snippet); + public plasmaCrawlEntry toBalancerEntry(); public String toString(); public class Components { diff --git a/source/de/anomic/index/indexURLEntryNew.java b/source/de/anomic/index/indexURLEntryNew.java index dbde6c86c..0c41c93e5 100644 --- a/source/de/anomic/index/indexURLEntryNew.java +++ b/source/de/anomic/index/indexURLEntryNew.java @@ -13,6 +13,7 @@ import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroRow; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.server.serverCharBuffer; @@ -367,6 +368,19 @@ public class indexURLEntryNew implements indexURLEntry { //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; } + public plasmaCrawlEntry toBalancerEntry() { + return new plasmaCrawlEntry( + null, + comp().url(), + referrerHash(), + comp().descr(), + loaddate(), + null, + 0, + 0, + 0); + } + /** * Returns this object as String.
* This e.g. looks like this: diff --git a/source/de/anomic/index/indexURLEntryOld.java b/source/de/anomic/index/indexURLEntryOld.java index 601fdc18d..1558ebb1f 100644 --- a/source/de/anomic/index/indexURLEntryOld.java +++ b/source/de/anomic/index/indexURLEntryOld.java @@ -35,6 +35,7 @@ import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroRow; import de.anomic.net.URL; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaURL; import de.anomic.server.logging.serverLog; @@ -335,6 +336,19 @@ public class indexURLEntryOld implements indexURLEntry { //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; } + public plasmaCrawlEntry toBalancerEntry() { + return new plasmaCrawlEntry( + null, + comp().url(), + referrerHash(), + comp().descr(), + loaddate(), + null, + 0, + 0, + 0); + } + /** * Returns this object as String.
* This e.g. looks like this: diff --git a/source/de/anomic/kelondro/kelondroCache.java b/source/de/anomic/kelondro/kelondroCache.java index dfd4e700c..0a260715b 100644 --- a/source/de/anomic/kelondro/kelondroCache.java +++ b/source/de/anomic/kelondro/kelondroCache.java @@ -557,7 +557,6 @@ public class kelondroCache implements kelondroIndex { } else { this.hasnotHit++; this.hasnotDouble++; - return null; } } @@ -569,8 +568,6 @@ public class kelondroCache implements kelondroIndex { } else { this.readHit++; this.cacheDelete++; - index.remove(key); - return entry; } } diff --git a/source/de/anomic/kelondro/kelondroFlexWidthArray.java b/source/de/anomic/kelondro/kelondroFlexWidthArray.java index 3d166d7de..0598bc4ff 100644 --- a/source/de/anomic/kelondro/kelondroFlexWidthArray.java +++ b/source/de/anomic/kelondro/kelondroFlexWidthArray.java @@ -223,14 +223,12 @@ public class kelondroFlexWidthArray implements kelondroArray { assert rowentry.bytes().length == this.rowdef.objectsize; int c = 0; kelondroRow.Entry e; - int lastcol; synchronized (col) { while (c < rowdef.columns()) { - lastcol = c + col[c].row().columns() - 1; e = col[c].row().newEntry( rowentry.bytes(), rowdef.colstart[c], - rowdef.colstart[lastcol] - rowdef.colstart[c] + rowdef.width(lastcol)); + col[c].row().objectsize()); col[c].set(index, e); c = c + col[c].row().columns(); } diff --git a/source/de/anomic/kelondro/kelondroRow.java b/source/de/anomic/kelondro/kelondroRow.java index 4d30f34eb..d8a59bd40 100644 --- a/source/de/anomic/kelondro/kelondroRow.java +++ b/source/de/anomic/kelondro/kelondroRow.java @@ -176,20 +176,20 @@ public class kelondroRow { for (int i = 0; i < objectsize; i++) this.rowinstance[i] = 0; } - public Entry(byte[] rowinstance) { - this(rowinstance, 0, rowinstance.length); + public Entry(byte[] newrow) { + this(newrow, 0, newrow.length); } - public Entry(byte[] rowinstance, int start, int length) { - assert objectsize == length : "objectsize = " + objectsize + ", length = " + length; + public Entry(byte[] newrow, int start, int length) { + assert newrow.length >= (length + start) : "objectsize = " + objectsize + ", start = " + start + ", length = " + length; + assert objectsize == length : "objectsize = " + objectsize + ", start = " + start + ", length = " + length; this.rowinstance = new byte[objectsize]; - int ll = Math.min(objectsize, length); - System.arraycopy(rowinstance, start, this.rowinstance, 0, ll); - for (int i = ll; i < objectsize; i++) this.rowinstance[i] = 0; + System.arraycopy(newrow, start, this.rowinstance, 0, objectsize); + //for (int i = ll; i < objectsize; i++) this.rowinstance[i] = 0; } public Entry(byte[][] cols) { - assert row.length == cols.length; + assert row.length == cols.length : "cols.length = " + cols.length + ", row.length = " + row.length; rowinstance = new byte[objectsize]; int ll; int cs, cw; diff --git a/source/de/anomic/kelondro/kelondroStack.java b/source/de/anomic/kelondro/kelondroStack.java index f103904d4..882bf4038 100644 --- a/source/de/anomic/kelondro/kelondroStack.java +++ b/source/de/anomic/kelondro/kelondroStack.java @@ -311,6 +311,7 @@ public final class kelondroStack extends kelondroRecords { } public void remove() { + ni.remove(); } } diff --git a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java index 4e302ef94..b926c501f 100644 --- a/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java +++ b/source/de/anomic/plasma/crawler/AbstractCrawlWorker.java @@ -51,11 +51,11 @@ import java.io.File; import java.io.IOException; import de.anomic.plasma.plasmaURL; -import de.anomic.kelondro.kelondroBitfield; import de.anomic.net.URL; -import de.anomic.plasma.plasmaCrawlEURL; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlLoaderMessage; import de.anomic.plasma.plasmaCrawlProfile; +import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaHTCache; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.logging.serverLog; @@ -290,15 +290,19 @@ public abstract class AbstractCrawlWorker extends Thread implements plasmaCrawlW String referrerHash = (this.refererURLString==null)?null:plasmaURL.urlHash(this.refererURLString); // create a new errorURL DB entry - plasmaCrawlEURL.Entry ee = this.sb.errorURL.newEntry( - this.url, - referrerHash, + plasmaCrawlEntry bentry = new plasmaCrawlEntry( this.initiator, - yacyCore.seedDB.mySeed.hash, - this.name, - (failreason==null)?"Unknown reason":failreason, - new kelondroBitfield() - ); + this.url, + referrerHash, + this.name, + null, + this.profile.handle(), + this.depth, + 0, + 0); + plasmaCrawlZURL.Entry ee = this.sb.errorURL.newEntry( + bentry, yacyCore.seedDB.mySeed.hash, null, + 0, (failreason==null)?"Unknown reason":failreason); // store the entry ee.store(); diff --git a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java index 08765dd28..998c2aa0d 100644 --- a/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java +++ b/source/de/anomic/plasma/dbImport/plasmaCrawlNURLImporter.java @@ -6,6 +6,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.TreeMap; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlNURL; import de.anomic.plasma.plasmaCrawlProfile; import de.anomic.plasma.plasmaSwitchboard; @@ -89,7 +90,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor // init noticeUrlDB this.log.logInfo("Initializing the source noticeUrlDB"); - this.importNurlDB = new plasmaCrawlNURL(this.importPath, preloadTime); + this.importNurlDB = new plasmaCrawlNURL(this.importPath); this.importStartSize = this.importNurlDB.size(); //int stackSize = this.importNurlDB.stackSize(); @@ -101,7 +102,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor public void run() { try { // waiting on init thread to finish - this.importNurlDB.waitOnInitThread(); + //this.importNurlDB.waitOnInitThread(); // the stack types we want to import int[] stackTypes = new int[] {plasmaCrawlNURL.STACK_TYPE_CORE, @@ -110,38 +111,38 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor -1}; // looping through the various stacks - for (int i=0; i< stackTypes.length; i++) { - if (stackTypes[i] != -1) { - this.log.logInfo("Starting to import stacktype '" + stackTypes[i] + "' containing '" + this.importNurlDB.stackSize(stackTypes[i]) + "' entries."); + for (int stackType=0; stackType< stackTypes.length; stackType++) { + if (stackTypes[stackType] != -1) { + this.log.logInfo("Starting to import stacktype '" + stackTypes[stackType] + "' containing '" + this.importNurlDB.stackSize(stackTypes[stackType]) + "' entries."); } else { this.log.logInfo("Starting to import '" + this.importNurlDB.size() + "' entries not available in any stack."); } // getting an interator and loop through the URL entries - Iterator entryIter = (stackTypes[i] == -1) ? this.importNurlDB.entries(true, null) : null; + Iterator entryIter = (stackTypes[stackType] == -1) ? this.importNurlDB.iterator(stackType) : null; while (true) { String nextHash = null; - plasmaCrawlNURL.Entry nextEntry = null; + plasmaCrawlEntry nextEntry = null; try { - if (stackTypes[i] != -1) { - if (this.importNurlDB.stackSize(stackTypes[i]) == 0) break; + if (stackTypes[stackType] != -1) { + if (this.importNurlDB.stackSize(stackTypes[stackType]) == 0) break; this.urlCount++; - nextEntry = this.importNurlDB.pop(stackTypes[i]); - nextHash = nextEntry.hash(); + nextEntry = this.importNurlDB.pop(stackTypes[stackType]); + nextHash = nextEntry.urlhash(); } else { if (!entryIter.hasNext()) break; this.urlCount++; - nextEntry = (plasmaCrawlNURL.Entry) entryIter.next(); - nextHash = nextEntry.hash(); + nextEntry = (plasmaCrawlEntry) entryIter.next(); + nextHash = nextEntry.urlhash(); } } catch (IOException e) { this.log.logWarning("Unable to import entry: " + e.toString()); - if ((stackTypes[i] != -1) &&(this.importNurlDB.stackSize(stackTypes[i]) == 0)) break; + if ((stackTypes[stackType] != -1) &&(this.importNurlDB.stackSize(stackTypes[stackType]) == 0)) break; continue; } @@ -176,9 +177,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor // if the url does not alredy exists in the destination stack we insert it now if (!this.sb.noticeURL.existsInStack(nextHash)) { - plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(nextEntry); - ne.store(); - this.sb.noticeURL.push((stackTypes[i] != -1) ? stackTypes[i] : plasmaCrawlNURL.STACK_TYPE_CORE, ne.hash()); + this.sb.noticeURL.push((stackTypes[stackType] != -1) ? stackTypes[stackType] : plasmaCrawlNURL.STACK_TYPE_CORE, nextEntry); } // removing hash from the import db @@ -191,7 +190,7 @@ public class plasmaCrawlNURLImporter extends AbstractImporter implements dbImpor } if (this.isAborted()) break; } - this.log.logInfo("Finished to import stacktype '" + stackTypes[i] + "'"); + this.log.logInfo("Finished to import stacktype '" + stackTypes[stackType] + "'"); } //int size = this.importNurlDB.size(); diff --git a/source/de/anomic/plasma/plasmaCrawlBalancer.java b/source/de/anomic/plasma/plasmaCrawlBalancer.java index 556d07081..a85a93589 100644 --- a/source/de/anomic/plasma/plasmaCrawlBalancer.java +++ b/source/de/anomic/plasma/plasmaCrawlBalancer.java @@ -43,17 +43,18 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.Map; import java.util.TreeMap; import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroCache; +import de.anomic.kelondro.kelondroFlexTable; +import de.anomic.kelondro.kelondroIndex; import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroStack; @@ -61,86 +62,136 @@ import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySeedDB; public class plasmaCrawlBalancer { + + private static final String stackSuffix = "7.stack"; + private static final String indexSuffix = "7.db"; // a shared domainAccess map for all balancers private static final Map domainAccess = Collections.synchronizedMap(new HashMap()); // definition of payload for fileStack - private static final kelondroRow payload = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0); + private static final kelondroRow stackrow = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0); // class variables - private ArrayList ramStack; // a list that is flused first - private kelondroStack fileStack; // a file with url hashes - private HashMap domainStacks; // a map from domain name part to Lists with url hashs - private HashSet ramIndex; // an index is needed externally, we provide that internally + private ArrayList urlRAMStack; // a list that is flused first + private kelondroStack urlFileStack; // a file with url hashes + private kelondroIndex urlFileIndex; + private HashMap domainStacks; // a map from domain name part to Lists with url hashs + private File cacheStacksPath; + private String stackname; - public plasmaCrawlBalancer(File stackFile) { - fileStack = kelondroStack.open(stackFile, payload); + public plasmaCrawlBalancer(File cachePath, String stackname) { + this.cacheStacksPath = cachePath; + this.stackname = stackname; + File stackFile = new File(cachePath, stackname + stackSuffix); + urlFileStack = kelondroStack.open(stackFile, stackrow); domainStacks = new HashMap(); - ramStack = new ArrayList(); - ramIndex = makeIndex(); + urlRAMStack = new ArrayList(); + + // create a stack for newly entered entries + if (!(cachePath.exists())) cachePath.mkdir(); // make the path + openFileIndex(); } public synchronized void close() { - ramIndex = null; while (sizeDomainStacks() > 0) flushOnceDomStacks(true); try { flushAllRamStack(); } catch (IOException e) {} - fileStack.close(); - fileStack = null; + if (urlFileIndex != null) { + urlFileIndex.close(); + urlFileIndex = null; + } + if (urlFileStack != null) { + urlFileStack.close(); + urlFileStack = null; + } } public void finalize() { - if (fileStack != null) close(); + if (urlFileStack != null) close(); } public synchronized void clear() { - fileStack = kelondroStack.reset(fileStack); + urlFileStack = kelondroStack.reset(urlFileStack); domainStacks.clear(); - ramStack.clear(); - ramIndex = new HashSet(); + urlRAMStack.clear(); + resetFileIndex(); } - private HashSet makeIndex() { - HashSet index = new HashSet(); // TODO: replace with kelondroIndex - - // take all elements from the file stack + + private void openFileIndex() { + cacheStacksPath.mkdirs(); try { - Iterator i = fileStack.keyIterator(); // iterates byte[] - objects - while (i.hasNext()) index.add(new String((byte[]) i.next(), "UTF-8")); - } catch (UnsupportedEncodingException e) {} - - // take elements from the ram stack - for (int i = 0; i < ramStack.size(); i++) index.add(ramStack.get(i)); - - // take elememts from domain stacks - Iterator i = domainStacks.entrySet().iterator(); - Map.Entry entry; - LinkedList list; - Iterator ii; - while (i.hasNext()) { - entry = (Map.Entry) i.next(); - list = (LinkedList) entry.getValue(); - ii = list.iterator(); - while (ii.hasNext()) index.add(ii.next()); + urlFileIndex = new kelondroCache(new kelondroFlexTable(cacheStacksPath, stackname + indexSuffix, -1, plasmaCrawlEntry.rowdef), true, false); + } catch (IOException e) { + e.printStackTrace(); + System.exit(-1); } - - return index; } - public boolean has(String urlhash) { - return ramIndex.contains(urlhash); + private void resetFileIndex() { + if (urlFileIndex != null) { + urlFileIndex.close(); + urlFileIndex = null; + File cacheFile = new File(cacheStacksPath, stackname + indexSuffix); + cacheFile.delete(); + } + openFileIndex(); } - public Iterator iterator() { - return ramIndex.iterator(); + public synchronized plasmaCrawlEntry get(String urlhash) throws IOException { + kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes()); + if (entry == null) return null; + return new plasmaCrawlEntry(entry); + } + + public synchronized plasmaCrawlEntry remove(String urlhash) throws IOException { + // this method is only here, because so many import/export methods need it + // and it was implemented in the previous architecture + // however, usage is not recommendet + kelondroRow.Entry entry = urlFileIndex.remove(urlhash.getBytes()); + if (entry == null) return null; + + // now delete that thing also from the queues + + // iterate through the RAM stack + Iterator i = urlRAMStack.iterator(); + String h; + while (i.hasNext()) { + h = (String) i.next(); + if (h.equals(urlhash)) { + i.remove(); + break; + } + } + + // we cannot iterate through the file stack, because the stack iterator + // has not yet a delete method implemented. It would also be a bad idea + // to do that, it would make too much IO load + // instead, the top/pop methods that aquire elements from the stack, that + // cannot be found in the urlFileIndex must handle that case silently + + return new plasmaCrawlEntry(entry); + } + + public boolean has(String urlhash) { + try { + return urlFileIndex.has(urlhash.getBytes()); + } catch (IOException e) { + e.printStackTrace(); + return false; + } } public synchronized int size() { - int componentsize = fileStack.size() + ramStack.size() + sizeDomainStacks(); - if ((kelondroRecords.debugmode) && (componentsize != ramIndex.size())) { - // hier ist ramIndex.size() immer grš§er. warum? - serverLog.logWarning("PLASMA BALANCER", "size operation wrong - componentsize = " + componentsize + ", ramIndex.size() = " + ramIndex.size()); - } + int componentsize = urlFileStack.size() + urlRAMStack.size() + sizeDomainStacks(); + try { + if ((kelondroRecords.debugmode) && (componentsize != urlFileIndex.size())) { + // hier ist urlIndexFile.size() immer grš§er. warum? + serverLog.logWarning("PLASMA BALANCER", "size operation wrong - componentsize = " + componentsize + ", ramIndex.size() = " + urlFileIndex.size()); + } + } catch (IOException e) { + e.printStackTrace(); + } return componentsize; } @@ -163,9 +214,9 @@ public class plasmaCrawlBalancer { list = (LinkedList) entry.getValue(); if (list.size() != 0) { if (ram) { - ramStack.add(list.removeFirst()); + urlRAMStack.add(list.removeFirst()); } else try { - fileStack.push(fileStack.row().newEntry(new byte[][]{((String) list.removeFirst()).getBytes()})); + urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) list.removeFirst()).getBytes()})); } catch (IOException e) { e.printStackTrace(); } @@ -176,34 +227,36 @@ public class plasmaCrawlBalancer { private void flushAllRamStack() throws IOException { // this flushes only the ramStack to the fileStack, but does not flush the domainStacks - for (int i = 0; i < ramStack.size() / 2; i++) { - fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(i)).getBytes()})); - fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() - i - 1)).getBytes()})); + for (int i = 0; i < urlRAMStack.size() / 2; i++) { + urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(i)).getBytes()})); + urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(urlRAMStack.size() - i - 1)).getBytes()})); } - if (ramStack.size() % 2 == 1) - fileStack.push(fileStack.row().newEntry(new byte[][]{((String) ramStack.get(ramStack.size() / 2)).getBytes()})); + if (urlRAMStack.size() % 2 == 1) + urlFileStack.push(urlFileStack.row().newEntry(new byte[][]{((String) urlRAMStack.get(urlRAMStack.size() / 2)).getBytes()})); } - public synchronized void push(String urlhash) throws IOException { - assert urlhash != null; - if (ramIndex.contains(urlhash)) { - serverLog.logWarning("PLASMA BALANCER", "double-check has failed for urlhash " + urlhash + " - fixed"); + public synchronized void push(plasmaCrawlEntry entry) throws IOException { + assert entry != null; + if (urlFileIndex.has(entry.urlhash().getBytes())) { + serverLog.logWarning("PLASMA BALANCER", "double-check has failed for urlhash " + entry.urlhash() + " - fixed"); return; } - String dom = urlhash.substring(6); + + // extend domain stack + String dom = entry.urlhash().substring(6); LinkedList domainList = (LinkedList) domainStacks.get(dom); if (domainList == null) { // create new list domainList = new LinkedList(); - domainList.addLast(urlhash); + domainList.addLast(entry.urlhash()); domainStacks.put(dom, domainList); } else { // extend existent domain list - domainList.add(urlhash); + domainList.add(entry.urlhash()); } // add to index - ramIndex.add(urlhash); + urlFileIndex.put(entry.toRow()); // check size of domainStacks and flush if ((domainStacks.size() > 20) || (sizeDomainStacks() > 1000)) { @@ -211,15 +264,15 @@ public class plasmaCrawlBalancer { } } - public synchronized String pop(long minimumDelta, long maximumAge) throws IOException { + public synchronized plasmaCrawlEntry pop(long minimumDelta, long maximumAge) throws IOException { // returns an url-hash from the stack and ensures minimum delta times // we have 3 sources to choose from: the ramStack, the domainStacks and the fileStack String result = null; // the result // 1st: check ramStack - if (ramStack.size() > 0) { - result = (String) ramStack.remove(0); + if (urlRAMStack.size() > 0) { + result = (String) urlRAMStack.remove(0); } // 2nd-a: check domainStacks for latest arrivals @@ -301,12 +354,12 @@ public class plasmaCrawlBalancer { } // 3rd: take entry from file - if ((result == null) && (fileStack.size() > 0)) { - kelondroRow.Entry topentry = fileStack.top(); + if ((result == null) && (urlFileStack.size() > 0)) { + kelondroRow.Entry topentry = urlFileStack.top(); if (topentry == null) { // emergency case: this means that something with the stack organization is wrong // the file appears to be broken. We kill the file. - kelondroStack.reset(fileStack); + kelondroStack.reset(urlFileStack); serverLog.logSevere("PLASMA BALANCER", "get() failed to fetch entry from file stack. reset stack file."); } else { String top = new String(topentry.getColBytes(0)); @@ -316,10 +369,10 @@ public class plasmaCrawlBalancer { long delta = lastAccessDelta(top); if (delta > minimumDelta) { // the entry from top is fine - result = new String(fileStack.pop().getColBytes(0)); + result = new String(urlFileStack.pop().getColBytes(0)); } else { // try entry from bottom - result = new String(fileStack.pot().getColBytes(0)); + result = new String(urlFileStack.pot().getColBytes(0)); delta = lastAccessDelta(result); } } @@ -327,7 +380,7 @@ public class plasmaCrawlBalancer { // check case where we did not found anything if (result == null) { - serverLog.logSevere("PLASMA BALANCER", "get() was not able to find a valid urlhash - total size = " + size() + ", fileStack.size() = " + fileStack.size() + ", ramStack.size() = " + ramStack.size() + ", domainStacks.size() = " + domainStacks.size()); + serverLog.logSevere("PLASMA BALANCER", "get() was not able to find a valid urlhash - total size = " + size() + ", fileStack.size() = " + urlFileStack.size() + ", ramStack.size() = " + urlRAMStack.size() + ", domainStacks.size() = " + domainStacks.size()); return null; } @@ -344,8 +397,9 @@ public class plasmaCrawlBalancer { // update statistical data domainAccess.put(result.substring(6), new Long(System.currentTimeMillis())); - ramIndex.remove(result); - return result; + kelondroRow.Entry entry = urlFileIndex.remove(result.getBytes()); + if (entry == null) return null; + return new plasmaCrawlEntry(entry); } private long lastAccessDelta(String hash) { @@ -355,19 +409,55 @@ public class plasmaCrawlBalancer { return System.currentTimeMillis() - lastAccess.longValue(); } - public synchronized String top(int dist) { - int availableInRam = ramStack.size() + sizeDomainStacks(); - if ((availableInRam < dist) && (fileStack.size() > (dist - availableInRam))) { + public synchronized plasmaCrawlEntry top(int dist) throws IOException { + int availableInRam = urlRAMStack.size() + sizeDomainStacks(); + if ((availableInRam <= dist) && (urlFileStack.size() > (dist - availableInRam))) { // flush some entries from disc to domain stacks try { - for (int i = 0; i < (dist - availableInRam); i++) { - ramStack.add(new String(fileStack.pop().getColBytes(0))); + for (int i = 0; i <= (dist - availableInRam); i++) { + if (urlFileStack.size() == 0) break; + urlRAMStack.add(new String(urlFileStack.pop().getColBytes(0))); } } catch (IOException e) {} } - while ((sizeDomainStacks() > 0) && (ramStack.size() <= dist)) flushOnceDomStacks(true); // flush only that much as we need to display - if (dist >= ramStack.size()) return null; - return (String) ramStack.get(dist); + while ((sizeDomainStacks() > 0) && (urlRAMStack.size() <= dist)) flushOnceDomStacks(true); // flush only that much as we need to display + if (dist >= urlRAMStack.size()) return null; + String urlhash = (String) urlRAMStack.get(dist); + kelondroRow.Entry entry = urlFileIndex.get(urlhash.getBytes()); + if (entry == null) return null; + return new plasmaCrawlEntry(entry); + } + + public Iterator iterator() throws IOException { + return new EntryIterator(); + } + + public class EntryIterator implements Iterator { + + Iterator rowIterator; + + public EntryIterator() throws IOException { + rowIterator = urlFileIndex.rows(true, null); + } + + public boolean hasNext() { + return (rowIterator == null) ? false : rowIterator.hasNext(); + } + + public Object next() { + kelondroRow.Entry entry = (kelondroRow.Entry) rowIterator.next(); + try { + return (entry == null) ? null : new plasmaCrawlEntry(entry); + } catch (IOException e) { + rowIterator = null; + return null; + } + } + + public void remove() { + if (rowIterator != null) rowIterator.remove(); + } + } } diff --git a/source/de/anomic/plasma/plasmaCrawlEURL.java b/source/de/anomic/plasma/plasmaCrawlEURL.java index c68575cb7..a801f013f 100644 --- a/source/de/anomic/plasma/plasmaCrawlEURL.java +++ b/source/de/anomic/plasma/plasmaCrawlEURL.java @@ -1,15 +1,15 @@ -// plasmaEURL.java -// ----------------------- -// part of YaCy -// (C) by Michael Peter Christen; mc@anomic.de -// first published on http://www.anomic.de -// Frankfurt, Germany, 2004 -// last major change: 09.08.2004 +// plasmaCrawlEURL.java +// (C) 2004 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 09.08.2004 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ // $LastChangedRevision: 1986 $ // $LastChangedBy: orbiter $ // +// LICENSE +// // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or @@ -23,50 +23,15 @@ // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// -// Using this software in any meaning (reading, learning, copying, compiling, -// running) means that you agree that the Author(s) is (are) not responsible -// for cost, loss of data or any harm that may be caused directly or indirectly -// by usage of this softare or this documentation. The usage of this software -// is on your own risk. The installation and usage (starting/running) of this -// software may allow other people or application to access your computer and -// any attached devices and is highly dependent on the configuration of the -// software which must be done by the user of the software; the author(s) is -// (are) also not responsible for proper configuration and usage of the -// software, even if provoked by documentation provided together with -// the software. -// -// Any changes to this file according to the GPL as documented in the file -// gpl.txt aside this file in the shipment you received can be done to the -// lines that follows this copyright notice here, but changes must not be -// done inside the copyright notive above. A re-distribution must contain -// the intact and unchanged copyright notice. -// Contributions and changes to the program code must be marked as such. - -// EURL - noticed (known but not loaded) URL's package de.anomic.plasma; -import java.io.File; -import java.io.IOException; -import java.util.Date; -import java.util.Iterator; -import java.util.LinkedList; - -import de.anomic.plasma.plasmaURL; -import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroBitfield; -import de.anomic.kelondro.kelondroFlexTable; -import de.anomic.kelondro.kelondroIndex; -import de.anomic.kelondro.kelondroRow; -import de.anomic.net.URL; -import de.anomic.yacy.yacySeedDB; - public class plasmaCrawlEURL { /* ======================================================================= * Failure reason constants * ======================================================================= */ + // invalid urls public static final String DENIED_URL_NULL = "denied_(url_null)"; public static final String DENIED_MALFORMED_URL = "denied_(malformed_url)"; @@ -125,290 +90,5 @@ public class plasmaCrawlEURL { // indexing errors public static final String DENIED_UNSPECIFIED_INDEXING_ERROR = "denied_(unspecified_indexing_error)"; public static final String DENIED_UNKNOWN_INDEXING_PROCESS_CASE = "denied_(unknown_indexing_process_case)"; - - - /* ======================================================================= - * Other object variables - * ======================================================================= */ - private LinkedList rejectedStack = new LinkedList(); // strings: url - - public final static kelondroRow rowdef = new kelondroRow( - "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash - "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash - "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator - "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor - "String urlstring-256, " + // the url as string - "String urlname-40, " + // the name of the url, from anchor tag name - "Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared - "Cardinal loaddate-4 {b64e}, " + // the time when the url was last time tried to load - "Cardinal retrycount-2 {b64e}, " + // number of load retries - "String failcause-80, " + // string describing load failure - "byte[] flags-2", // extra space - kelondroBase64Order.enhancedCoder, - 0); - - // the class object - private kelondroIndex urlIndexFile = null; - - public plasmaCrawlEURL(File cachePath, long preloadTime) { - super(); - String newCacheName = "urlErr3.table"; - cachePath.mkdirs(); - try { - urlIndexFile = new kelondroFlexTable(cachePath, newCacheName, preloadTime, rowdef); - } catch (IOException e) { - e.printStackTrace(); - System.exit(-1); - } - } - - - public int size() { - try { - return urlIndexFile.size() ; - } catch (IOException e) { - return 0; - } - } - - public void close() { - if (urlIndexFile != null) { - urlIndexFile.close(); - urlIndexFile = null; - } - } - - public synchronized Entry newEntry(URL url, String referrer, String initiator, String executor, - String name, String failreason, kelondroBitfield flags) { - if ((referrer == null) || (referrer.length() < yacySeedDB.commonHashLength)) referrer = plasmaURL.dummyHash; - if ((initiator == null) || (initiator.length() < yacySeedDB.commonHashLength)) initiator = plasmaURL.dummyHash; - if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash; - if (failreason == null) failreason = "unknown"; - return new Entry(url, referrer, initiator, executor, name, failreason, flags); - } - - public boolean remove(String hash) { - if (hash == null) return false; - try { - urlIndexFile.remove(hash.getBytes()); - return true; - } catch (IOException e) { - return false; - } - } - - public synchronized void stackPushEntry(Entry e) { - rejectedStack.add(e.hash); - } - - public Entry stackPopEntry(int pos) throws IOException { - String urlhash = (String) rejectedStack.get(pos); - if (urlhash == null) return null; - return new Entry(urlhash); - } - - public synchronized Entry getEntry(String hash) throws IOException { - return new Entry(hash); - } - - public boolean getUseNewDB() { - return (urlIndexFile instanceof kelondroFlexTable); - } - - public boolean exists(String urlHash) { - try { - return urlIndexFile.has(urlHash.getBytes()); - } catch (IOException e) { - return false; - } - } - - public void clearStack() { - rejectedStack.clear(); - } - - public int stackSize() { - return rejectedStack.size(); - } - - public class Entry { - - private String hash; // the url's hash - private String referrer; // the url's referrer hash - private String initiator; // the crawling initiator - private String executor; // the crawling initiator - private URL url; // the url as string - private String name; // the name of the url, from anchor tag name - private Date initdate; // the time when the url was first time appeared - private Date trydate; // the time when the url was last time tried to load - private int trycount; // number of tryings - private String failreason; // string describing reason for load fail - private kelondroBitfield flags; // extra space - private boolean stored; - - public Entry(URL url, String referrer, String initiator, - String executor, String name, String failreason, kelondroBitfield flags) { - // create new entry - this.hash = plasmaURL.urlHash(url); - this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer; - this.initiator = initiator; - this.executor = executor; - this.url = url; - this.name = name; - this.initdate = new Date(); - this.trydate = new Date(); - this.trycount = 0; - this.failreason = failreason; - this.flags = flags; - this.stored = false; - } - - public Entry(String hash) throws IOException { - // generates an plasmaEURLEntry using the url hash - // to speed up the access, the url-hashes are buffered - // in the hash cache. - // we have two options to find the url: - // - look into the hash cache - // - look into the filed properties - // if the url cannot be found, this returns null - this.hash = hash; - kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes()); - if (entry != null) { - insertEntry(entry); - } - this.stored = true; - } - - public Entry(kelondroRow.Entry entry) throws IOException { - insertEntry(entry); - this.stored = false; - } - - private void insertEntry(kelondroRow.Entry entry) throws IOException { - assert (entry != null); - this.hash = entry.getColString(0, null); - this.referrer = entry.getColString(1, "UTF-8"); - this.initiator = entry.getColString(2, "UTF-8"); - this.executor = entry.getColString(3, "UTF-8"); - this.url = new URL(entry.getColString(4, "UTF-8").trim()); - String n = entry.getColString(5, "UTF-8"); - this.name = (n == null) ? "" : n.trim(); - this.initdate = new Date(86400000 * entry.getColLong(6)); - this.trydate = new Date(86400000 * entry.getColLong(7)); - this.trycount = (int) entry.getColLong(8); - this.failreason = entry.getColString(9, "UTF-8"); - this.flags = new kelondroBitfield(entry.getColBytes(10)); - return; - } - - public void store() { - // stores the values from the object variables into the database - if (this.stored) return; - if (this.hash == null) return; - String initdatestr = kelondroBase64Order.enhancedCoder.encodeLong(initdate.getTime() / 86400000, rowdef.width(6)); - String trydatestr = kelondroBase64Order.enhancedCoder.encodeLong(trydate.getTime() / 86400000, rowdef.width(7)); - - // store the hash in the hash cache - try { - // even if the entry exists, we simply overwrite it - byte[][] entry = new byte[][] { - this.hash.getBytes(), - this.referrer.getBytes(), - this.initiator.getBytes(), - this.executor.getBytes(), - this.url.toString().getBytes(), - this.name.getBytes(), - initdatestr.getBytes(), - trydatestr.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.trycount, rowdef.width(8)).getBytes(), - this.failreason.getBytes(), - this.flags.bytes() - }; - urlIndexFile.put(urlIndexFile.row().newEntry(entry)); - this.stored = true; - } catch (IOException e) { - System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); - } - } - - public String hash() { - // return a url-hash, based on the md5 algorithm - // the result is a String of 12 bytes within a 72-bit space - // (each byte has an 6-bit range) - // that should be enough for all web pages on the world - return this.hash; - } - - public String referrer() { - return this.referrer; - } - - public URL url() { - return url; - } - - public Date initdate() { - return trydate; - } - - public Date trydate() { - return trydate; - } - - public String initiator() { - // return the creator's hash - return initiator; - } - - public String executor() { - // return the creator's hash - return executor; - } - - public String name() { - // return the creator's hash - return name; - } - - public String failreason() { - return failreason; - } - - } - - public class kiter implements Iterator { - // enumerates entry elements - Iterator i; - boolean error = false; - - public kiter(boolean up, String firstHash) throws IOException { - i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes()); - error = false; - } - - public boolean hasNext() { - if (error) return false; - return i.hasNext(); - } - - public Object next() throws RuntimeException { - kelondroRow.Entry e = (kelondroRow.Entry) i.next(); - if (e == null) return null; - try { - return new Entry(e); - } catch (IOException ex) { - throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); - } - } - - public void remove() { - i.remove(); - } - - } - public Iterator entries(boolean up, String firstHash) throws IOException { - // enumerates entry elements - return new kiter(up, firstHash); - } } diff --git a/source/de/anomic/plasma/plasmaCrawlEntry.java b/source/de/anomic/plasma/plasmaCrawlEntry.java new file mode 100644 index 000000000..310e267b4 --- /dev/null +++ b/source/de/anomic/plasma/plasmaCrawlEntry.java @@ -0,0 +1,238 @@ +// plasmaCrawlBalancerEntry.java +// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 14.03.2007 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.plasma; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.util.Date; + +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroBitfield; +import de.anomic.kelondro.kelondroNaturalOrder; +import de.anomic.kelondro.kelondroRow; +import de.anomic.net.URL; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeedDB; + +public class plasmaCrawlEntry { + + // row definition for balancer-related NURL-entries + public final static kelondroRow rowdef = new kelondroRow( + "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash + "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator + "String urlstring-256, " + // the url as string + "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash + "String urlname-80, " + // the name of the url, from anchor tag name + "Cardinal appdate-8 {b256}, " + // the time when the url was first time appeared + "String profile-4, " + // the name of the prefetch profile handle + "Cardinal depth-2 {b256}, " + // the prefetch depth so far, starts at 0 + "Cardinal parentbr-3 {b256}, " + // number of anchors of the parent + "Cardinal forkfactor-4 {b256}, " + // sum of anchors of all ancestors + "byte[] flags-4, " + // flags + "String handle-4, " + // extra handle + "Cardinal loaddate-8 {b256}," + // time when the file was loaded + "Cardinal serverdate-8 {b256}," + // time when that the server returned as document date + "Cardinal modifiedSince-8 {b256}", // time that was given to server as ifModifiedSince + kelondroBase64Order.enhancedCoder, + 0 + ); + + private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; + // if this is generated by a crawl, the own peer hash in entered + private String urlhash; // the url's hash + private String referrer; // the url's referrer hash + private URL url; // the url as string + private String name; // the name of the url, from anchor tag name + private long appdate; // the time when the url was first time appeared + private long loaddate; // the time when the url was loaded + private long serverdate; // the document date from the target server + private long imsdate; // the time of a ifModifiedSince request + private String profileHandle; // the name of the prefetch profile + private int depth; // the prefetch depth so far, starts at 0 + private int anchors; // number of anchors of the parent + private int forkfactor; // sum of anchors of all ancestors + private kelondroBitfield flags; + private int handle; + + public plasmaCrawlEntry(URL url) { + this(yacyCore.seedDB.mySeed.hash, url, null, null, new Date(), null, 0, 0, 0); + } + + public plasmaCrawlEntry( + String initiator, + URL url, + String referrer, + String name, + Date appdate, + String profileHandle, + int depth, + int anchors, + int forkfactor + ) { + // create new entry and store it into database + this.urlhash = plasmaURL.urlHash(url); + this.initiator = initiator; + this.url = url; + this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer; + this.name = (name == null) ? "" : name; + this.appdate = (appdate == null) ? 0 : appdate.getTime(); + this.profileHandle = profileHandle; // must not be null + this.depth = depth; + this.anchors = anchors; + this.forkfactor = forkfactor; + this.flags = new kelondroBitfield(rowdef.width(10)); + this.handle = 0; + this.loaddate = 0; + this.serverdate = 0; + this.imsdate = 0; + } + + public plasmaCrawlEntry(kelondroRow.Entry entry) throws IOException { + assert (entry != null); + insertEntry(entry); + } + + private void insertEntry(kelondroRow.Entry entry) throws IOException { + String urlstring = entry.getColString(2, null); + if (urlstring == null) throw new IOException ("url string is null"); + this.urlhash = entry.getColString(0, null); + this.initiator = entry.getColString(1, null); + this.url = new URL(urlstring); + this.referrer = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, null); + this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim(); + this.appdate = entry.getColLong(5); + this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim(); + this.depth = (int) entry.getColLong(7); + this.anchors = (int) entry.getColLong(8); + this.forkfactor = (int) entry.getColLong(9); + this.flags = new kelondroBitfield(entry.getColBytes(10)); + this.handle = Integer.parseInt(entry.getColString(11, null), 16); + this.loaddate = entry.getColLong(12); + this.serverdate = entry.getColLong(13); + this.imsdate = entry.getColLong(14); + return; + } + + private static String normalizeHandle(int h) { + String d = Integer.toHexString(h); + while (d.length() < rowdef.width(11)) d = "0" + d; + return d; + } + + public kelondroRow.Entry toRow() { + byte[] appdatestr = kelondroNaturalOrder.encodeLong(appdate, rowdef.width(5)); + byte[] loaddatestr = kelondroNaturalOrder.encodeLong(loaddate, rowdef.width(12)); + byte[] serverdatestr = kelondroNaturalOrder.encodeLong(serverdate, rowdef.width(13)); + byte[] imsdatestr = kelondroNaturalOrder.encodeLong(imsdate, rowdef.width(14)); + // store the hash in the hash cache + byte[] namebytes; + try { + namebytes = this.name.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + namebytes = this.name.getBytes(); + } + byte[][] entry = new byte[][] { + this.urlhash.getBytes(), + (initiator == null) ? "".getBytes() : this.initiator.getBytes(), + this.url.toString().getBytes(), + this.referrer.getBytes(), + namebytes, + appdatestr, + (this.profileHandle == null) ? null : this.profileHandle.getBytes(), + kelondroNaturalOrder.encodeLong(this.depth, rowdef.width(7)), + kelondroNaturalOrder.encodeLong(this.anchors, rowdef.width(8)), + kelondroNaturalOrder.encodeLong(this.forkfactor, rowdef.width(9)), + this.flags.bytes(), + normalizeHandle(this.handle).getBytes(), + loaddatestr, + serverdatestr, + imsdatestr}; + return rowdef.newEntry(entry); + } + + public URL url() { + // the url + return url; + } + + public String urlhash() { + // the hash of this url + return this.urlhash; + } + + public String referrerhash() { + // the urlhash of a referer url + return this.referrer; + } + + public String initiator() { + // returns the hash of the initiating peer + if (initiator == null) return null; + if (initiator.length() == 0) return null; + return initiator; + } + + public boolean proxy() { + // true when the url was retrieved using the proxy + return (initiator() == null); + } + + public Date appdate() { + // the date when the url appeared first + return new Date(appdate); + } + + public Date loaddate() { + // the date when the url was loaded + return new Date(loaddate); + } + + public Date serverdate() { + // the date that the server returned as document date + return new Date(serverdate); + } + + public Date imsdate() { + // the date that the client (browser) send as ifModifiedSince in proxy mode + return new Date(imsdate); + } + + public String name() { + // return the anchor name (text inside tag) + return name; + } + + public int depth() { + // crawl depth where the url appeared + return depth; + } + + public String profileHandle() { + // the handle of the crawl profile + return profileHandle; + } +} \ No newline at end of file diff --git a/source/de/anomic/plasma/plasmaCrawlNURL.java b/source/de/anomic/plasma/plasmaCrawlNURL.java index a72b17dc7..2728cef68 100644 --- a/source/de/anomic/plasma/plasmaCrawlNURL.java +++ b/source/de/anomic/plasma/plasmaCrawlNURL.java @@ -46,24 +46,9 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; import java.util.ArrayList; -import java.util.Date; import java.util.HashSet; import java.util.Iterator; -import de.anomic.plasma.plasmaURL; -import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroBitfield; -import de.anomic.kelondro.kelondroCache; -import de.anomic.kelondro.kelondroException; -import de.anomic.kelondro.kelondroFlexTable; -import de.anomic.kelondro.kelondroIndex; -import de.anomic.kelondro.kelondroRecords; -import de.anomic.kelondro.kelondroRow; -import de.anomic.kelondro.kelondroStack; -import de.anomic.net.URL; -import de.anomic.server.logging.serverLog; -import de.anomic.yacy.yacySeedDB; - public class plasmaCrawlNURL { public static final int STACK_TYPE_NULL = 0; // do not stack @@ -78,166 +63,33 @@ public class plasmaCrawlNURL { private static final long minimumDelta = 500; // the minimum time difference between access of the same domain private static final long maximumDomAge = 60000; // the maximum age of a domain until it is used for another crawl attempt - /** - * column length definition for the {@link plasmaURL#urlIndexFile} DB - */ - public final static kelondroRow rowdef = new kelondroRow( - "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash - "String initiator-" + yacySeedDB.commonHashLength + ", " + // the crawling initiator - "String urlstring-256, " + // the url as string - "String refhash-" + yacySeedDB.commonHashLength + ", " + // the url's referrer hash - "String urlname-40, " + // the name of the url, from anchor tag name - "Cardinal appdate-4 {b64e}, " + // the time when the url was first time appeared - "String profile-4, " + // the name of the prefetch profile handle - "Cardinal depth-2 {b64e}, " + // the prefetch depth so far, starts at 0 - "Cardinal parentbr-3 {b64e}, " + // number of anchors of the parent - "Cardinal forkfactor-4 {b64e}, " + // sum of anchors of all ancestors - "byte[] flags-4, " + // flags - "String handle-4", // extra handle - kelondroBase64Order.enhancedCoder, - 0 - ); - - private kelondroIndex urlIndexFile = null; private final plasmaCrawlBalancer coreStack; // links found by crawling to depth-1 private final plasmaCrawlBalancer limitStack; // links found by crawling at target depth - private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1 private final plasmaCrawlBalancer remoteStack; // links from remote crawl orders - private kelondroStack imageStack; // links pointing to image resources - private kelondroStack movieStack; // links pointing to movie resources - private kelondroStack musicStack; // links pointing to music resources + //private final plasmaCrawlBalancer overhangStack; // links found by crawling at depth+1 + //private kelondroStack imageStack; // links pointing to image resources + //private kelondroStack movieStack; // links pointing to movie resources + //private kelondroStack musicStack; // links pointing to music resources - private final HashSet imageStackIndex, movieStackIndex, musicStackIndex; // to find out if a specific link is already on any stack - private File cacheStacksPath; - private long preloadTime; - private initStackIndex initThead; - - public plasmaCrawlNURL(File cachePath, long preloadTime) { + public plasmaCrawlNURL(File cachePath) { super(); - this.cacheStacksPath = cachePath; - this.preloadTime = preloadTime; - - // create a stack for newly entered entries - if (!(cachePath.exists())) cachePath.mkdir(); // make the path - openHashCache(); - - File coreStackFile = new File(cachePath, "urlNoticeLocal0.stack"); - File limitStackFile = new File(cachePath, "urlNoticeLimit0.stack"); - File overhangStackFile = new File(cachePath, "urlNoticeOverhang0.stack"); - File remoteStackFile = new File(cachePath, "urlNoticeRemote0.stack"); - File imageStackFile = new File(cachePath, "urlNoticeImage0.stack"); - File movieStackFile = new File(cachePath, "urlNoticeMovie0.stack"); - File musicStackFile = new File(cachePath, "urlNoticeMusic0.stack"); - coreStack = new plasmaCrawlBalancer(coreStackFile); - limitStack = new plasmaCrawlBalancer(limitStackFile); - overhangStack = new plasmaCrawlBalancer(overhangStackFile); - remoteStack = new plasmaCrawlBalancer(remoteStackFile); - kelondroRow rowdef = new kelondroRow("byte[] urlhash-" + yacySeedDB.commonHashLength, kelondroBase64Order.enhancedCoder, 0); - imageStack = kelondroStack.open(imageStackFile, rowdef); - movieStack = kelondroStack.open(movieStackFile, rowdef); - musicStack = kelondroStack.open(musicStackFile, rowdef); - - // init stack Index - imageStackIndex = new HashSet(); - movieStackIndex = new HashSet(); - musicStackIndex = new HashSet(); - (initThead = new initStackIndex()).start(); + coreStack = new plasmaCrawlBalancer(cachePath, "urlNoticeCoreStack"); + limitStack = new plasmaCrawlBalancer(cachePath, "urlNoticeLimitStack"); + //overhangStack = new plasmaCrawlBalancer(overhangStackFile); + remoteStack = new plasmaCrawlBalancer(cachePath, "urlNoticeRemoteStack"); } public int size() { - try { - return urlIndexFile.size() ; - } catch (IOException e) { - return 0; - } - } - - public void waitOnInitThread() { - try { - if (this.initThead != null) { - this.initThead.join(); - } - } catch (NullPointerException e) { - } catch (InterruptedException e) {} - - } - - private void openHashCache() { - String newCacheName = "urlNotice5.table"; - cacheStacksPath.mkdirs(); - try { - urlIndexFile = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, rowdef), true, false); - } catch (IOException e) { - e.printStackTrace(); - System.exit(-1); - } - } - - private void resetHashCache() { - if (urlIndexFile != null) { - urlIndexFile.close(); - urlIndexFile = null; - File cacheFile = new File(cacheStacksPath, "urlNotice2.db"); - cacheFile.delete(); - } - openHashCache(); + return coreStack.size() + limitStack.size() + remoteStack.size(); } public void close() { coreStack.close(); limitStack.close(); - overhangStack.close(); + //overhangStack.close(); remoteStack.close(); - imageStack.close(); - movieStack.close(); - musicStack.close(); - if (urlIndexFile != null) { - urlIndexFile.close(); - urlIndexFile = null; - } - } - - public class initStackIndex extends Thread { - public void run() { - Iterator i; - try { - i = imageStack.iterator(); - while (i.hasNext()) imageStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); - } catch (Exception e) { - imageStack = kelondroStack.reset(imageStack); - } - try { - i = movieStack.iterator(); - while (i.hasNext()) movieStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); - } catch (Exception e) { - movieStack = kelondroStack.reset(movieStack); - } - try { - i = musicStack.iterator(); - while (i.hasNext()) musicStackIndex.add(new String(((kelondroRecords.Node) i.next()).getKey(), "UTF-8")); - } catch (Exception e) { - musicStack = kelondroStack.reset(musicStack); - } - plasmaCrawlNURL.this.initThead = null; - } - } - - public boolean remove(String hash) { - if (hash == null) return false; - try { - urlIndexFile.remove(hash.getBytes()); - return true; - } catch (IOException e) { - return false; - } } - private static String normalizeHandle(int h) { - String d = Integer.toHexString(h); - while (d.length() < rowdef.width(11)) d = "0" + d; - return d; - } - public int stackSize() { // this does not count the overhang stack size return coreStack.size() + limitStack.size() + remoteStack.size(); @@ -247,11 +99,8 @@ public class plasmaCrawlNURL { switch (stackType) { case STACK_TYPE_CORE: return coreStack.size(); case STACK_TYPE_LIMIT: return limitStack.size(); - case STACK_TYPE_OVERHANG: return overhangStack.size(); + case STACK_TYPE_OVERHANG: return 0; case STACK_TYPE_REMOTE: return remoteStack.size(); - case STACK_TYPE_IMAGE: return imageStack.size(); - case STACK_TYPE_MOVIE: return movieStack.size(); - case STACK_TYPE_MUSIC: return musicStack.size(); default: return -1; } } @@ -260,111 +109,65 @@ public class plasmaCrawlNURL { return coreStack.has(urlhash) || limitStack.has(urlhash) || - overhangStack.has(urlhash) || - remoteStack.has(urlhash) || - imageStackIndex.contains(urlhash) || - movieStackIndex.contains(urlhash) || - musicStackIndex.contains(urlhash); - } - - public synchronized Entry newEntry(String initiator, URL url, Date loaddate, - String referrer, String name, String profile, - int depth, int anchors, int forkfactor) { - return new Entry(initiator, url, referrer, name, loaddate, - profile, depth, anchors, forkfactor); + //overhangStack.has(urlhash) || + remoteStack.has(urlhash); } - public synchronized Entry newEntry(Entry oldEntry) { - if (oldEntry == null) return null; - return new Entry( - oldEntry.initiator(), - oldEntry.url(), - oldEntry.referrerHash(), - oldEntry.name(), - oldEntry.loaddate(), - oldEntry.profileHandle(), - oldEntry.depth(), - oldEntry.anchors, - oldEntry.forkfactor - ); - } - - public void push(int stackType, String urlhash) { + public void push(int stackType, plasmaCrawlEntry entry) { try { switch (stackType) { case STACK_TYPE_CORE: - coreStack.push(urlhash); + coreStack.push(entry); break; case STACK_TYPE_LIMIT: - limitStack.push(urlhash); - break; - case STACK_TYPE_OVERHANG: - overhangStack.push(urlhash); + limitStack.push(entry); break; case STACK_TYPE_REMOTE: - remoteStack.push(urlhash); - break; - case STACK_TYPE_IMAGE: - imageStack.push(imageStack.row().newEntry(new byte[][] {urlhash.getBytes()})); - imageStackIndex.add(urlhash); - break; - case STACK_TYPE_MOVIE: - movieStack.push(movieStack.row().newEntry(new byte[][] {urlhash.getBytes()})); - movieStackIndex.add(urlhash); - break; - case STACK_TYPE_MUSIC: - musicStack.push(musicStack.row().newEntry(new byte[][] {urlhash.getBytes()})); - musicStackIndex.add(urlhash); + remoteStack.push(entry); break; default: break; } } catch (IOException er) {} } - public Entry[] top(int stackType, int count) { + public plasmaCrawlEntry get(String urlhash) { + plasmaCrawlEntry entry = null; + try {if ((entry = coreStack.get(urlhash)) != null) return entry;} catch (IOException e) {} + try {if ((entry = limitStack.get(urlhash)) != null) return entry;} catch (IOException e) {} + try {if ((entry = remoteStack.get(urlhash)) != null) return entry;} catch (IOException e) {} + return null; + } + + public plasmaCrawlEntry remove(String urlhash) { + plasmaCrawlEntry entry = null; + try {if ((entry = coreStack.remove(urlhash)) != null) return entry;} catch (IOException e) {} + try {if ((entry = limitStack.remove(urlhash)) != null) return entry;} catch (IOException e) {} + try {if ((entry = remoteStack.remove(urlhash)) != null) return entry;} catch (IOException e) {} + return null; + } + + public plasmaCrawlEntry[] top(int stackType, int count) { switch (stackType) { case STACK_TYPE_CORE: return top(coreStack, count); case STACK_TYPE_LIMIT: return top(limitStack, count); - case STACK_TYPE_OVERHANG: return top(overhangStack, count); case STACK_TYPE_REMOTE: return top(remoteStack, count); - case STACK_TYPE_IMAGE: return top(imageStack, count); - case STACK_TYPE_MOVIE: return top(movieStack, count); - case STACK_TYPE_MUSIC: return top(musicStack, count); default: return null; } } - public Iterator iterator(int stackType) { - // returns an iterator of String objects - switch (stackType) { - case STACK_TYPE_CORE: return coreStack.iterator(); - case STACK_TYPE_LIMIT: return limitStack.iterator(); - case STACK_TYPE_OVERHANG: return overhangStack.iterator(); - case STACK_TYPE_REMOTE: return remoteStack.iterator(); - case STACK_TYPE_IMAGE: return imageStackIndex.iterator(); - case STACK_TYPE_MOVIE: return movieStackIndex.iterator(); - case STACK_TYPE_MUSIC: return musicStackIndex.iterator(); - default: return null; - } - } - - public Entry pop(int stackType) throws IOException { + public plasmaCrawlEntry pop(int stackType) throws IOException { switch (stackType) { case STACK_TYPE_CORE: return pop(coreStack); case STACK_TYPE_LIMIT: return pop(limitStack); - case STACK_TYPE_OVERHANG: return pop(overhangStack); case STACK_TYPE_REMOTE: return pop(remoteStack); - case STACK_TYPE_IMAGE: return pop(imageStack); - case STACK_TYPE_MOVIE: return pop(movieStack); - case STACK_TYPE_MUSIC: return pop(musicStack); default: return null; } } public void shift(int fromStack, int toStack) { try { - Entry entry = pop(fromStack); - push(toStack, entry.hash()); + plasmaCrawlEntry entry = pop(fromStack); + if (entry != null) push(toStack, entry); } catch (IOException e) { return; } @@ -374,329 +177,55 @@ public class plasmaCrawlNURL { switch (stackType) { case STACK_TYPE_CORE: coreStack.clear(); break; case STACK_TYPE_LIMIT: limitStack.clear(); break; - case STACK_TYPE_OVERHANG: overhangStack.clear(); break; case STACK_TYPE_REMOTE: remoteStack.clear(); break; - case STACK_TYPE_IMAGE: imageStack = kelondroStack.reset(imageStack); break; - case STACK_TYPE_MOVIE: movieStack = kelondroStack.reset(movieStack); break; - case STACK_TYPE_MUSIC: musicStack = kelondroStack.reset(musicStack); break; default: return; } } - - private Entry pop(kelondroStack stack) throws IOException { - // this is a filo - pop - int s; - Entry entry; - kelondroRow.Entry re; - synchronized (stack) { - while ((s = stack.size()) > 0) { - re = stack.pop(); - if (re == null) { - if (s > stack.size()) continue; - stack = kelondroStack.reset(stack); // the stack is not able to shrink - throw new IOException("hash is null, stack cannot shrink; reset of stack (1)"); - } - try { - entry = new Entry(new String(re.getColBytes(0))); - } catch (IOException e) { - serverLog.logWarning("NURL", e.getMessage()); - if (s > stack.size()) continue; - stack = kelondroStack.reset(stack); // the stack is not able to shrink - throw new IOException("hash is null, stack cannot shrink; reset of stack (2)"); - } - imageStackIndex.remove(entry.hash); - movieStackIndex.remove(entry.hash); - musicStackIndex.remove(entry.hash); - return entry; - } - } - throw new IOException("crawl stack is empty"); - } - - private Entry pop(plasmaCrawlBalancer balancer) throws IOException { + + private plasmaCrawlEntry pop(plasmaCrawlBalancer balancer) throws IOException { // this is a filo - pop - String hash; int s; - Entry entry; + plasmaCrawlEntry entry; synchronized (balancer) { while ((s = balancer.size()) > 0) { - hash = balancer.pop(minimumDelta, maximumDomAge); - if (hash == null) { - if (s > balancer.size()) continue; - balancer.clear(); // the balancer is broken and cannot shrink - throw new IOException("hash is null, balancer cannot shrink; reset of balancer (1)"); - } - try { - entry = new Entry(hash); - } catch (IOException e) { - serverLog.logWarning("NURL", e.getMessage()); + entry = balancer.pop(minimumDelta, maximumDomAge); + if (entry == null) { if (s > balancer.size()) continue; balancer.clear(); // the balancer is broken and cannot shrink - throw new IOException("IO error, balancer cannot shrink: " + e.getMessage() + "; reset of balancer (2)"); + throw new IOException("entry is null, balancer cannot shrink; reset of balancer"); } - imageStackIndex.remove(entry.hash); - movieStackIndex.remove(entry.hash); - musicStackIndex.remove(entry.hash); return entry; } } throw new IOException("balancer stack is empty"); } - - private Entry[] top(kelondroStack stack, int count) { - // this is a filo - top - if (count > stack.size()) count = stack.size(); - ArrayList list = new ArrayList(count); - for (int i = 0; i < count; i++) { - try { - byte[] hash = stack.top(i).getColBytes(0); - list.add(new Entry(new String(hash))); - } catch (IOException e) { - continue; - } - } - return (Entry[]) list.toArray(new Entry[list.size()]); - } - - private Entry[] top(plasmaCrawlBalancer balancer, int count) { + + private plasmaCrawlEntry[] top(plasmaCrawlBalancer balancer, int count) { // this is a filo - top if (count > balancer.size()) count = balancer.size(); ArrayList list = new ArrayList(count); for (int i = 0; i < count; i++) { try { - String urlhash = balancer.top(i); - if (urlhash == null) break; - list.add(new Entry(urlhash)); + plasmaCrawlEntry entry = balancer.top(i); + if (entry == null) break; + list.add(entry); } catch (IOException e) { break; } } - return (Entry[])list.toArray(new Entry[list.size()]); - } - - public synchronized Entry getEntry(String hash) throws IOException { - return new Entry(hash); + return (plasmaCrawlEntry[]) list.toArray(new plasmaCrawlEntry[list.size()]); } - - public class Entry { - private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; - // if this is generated by a crawl, the own peer hash in entered - private String hash; // the url's hash - private String referrer; // the url's referrer hash - private URL url; // the url as string - private String name; // the name of the url, from anchor tag name - private Date loaddate; // the time when the url was first time appeared - private String profileHandle; // the name of the prefetch profile - private int depth; // the prefetch depth so far, starts at 0 - private int anchors; // number of anchors of the parent - private int forkfactor; // sum of anchors of all ancestors - private kelondroBitfield flags; - private int handle; - private boolean stored; - - public Entry(String initiator, - URL url, - String referrer, - String name, - Date loaddate, - String profileHandle, - int depth, - int anchors, - int forkfactor - ) { - // create new entry and store it into database - this.hash = plasmaURL.urlHash(url); - this.initiator = initiator; - this.url = url; - this.referrer = (referrer == null) ? plasmaURL.dummyHash : referrer; - this.name = (name == null) ? "" : name; - this.loaddate = (loaddate == null) ? new Date() : loaddate; - this.profileHandle = profileHandle; // must not be null - this.depth = depth; - this.anchors = anchors; - this.forkfactor = forkfactor; - this.flags = new kelondroBitfield(rowdef.width(10)); - this.handle = 0; - this.stored = false; - } - - public Entry(String hash) throws IOException { - // generates an plasmaNURLEntry using the url hash - // to speed up the access, the url-hashes are buffered - // in the hash cache. - // we have two options to find the url: - // - look into the hash cache - // - look into the filed properties - // if the url cannot be found, this returns null - this.hash = hash; - if (hash == null) throw new IOException("hash is null"); - kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes()); - if (entry != null) { - insertEntry(entry); - this.stored = true; - return; - } else { - // show that we found nothing - throw new IOException("NURL: hash " + hash + " not found during initialization of entry object"); - //this.url = null; - } - } - - public Entry(kelondroRow.Entry entry) throws IOException { - assert (entry != null); - insertEntry(entry); - this.stored = false; - } - - private void insertEntry(kelondroRow.Entry entry) throws IOException { - String urlstring = entry.getColString(2, null); - if (urlstring == null) throw new IOException ("url string is null"); - this.hash = entry.getColString(0, null); - this.initiator = entry.getColString(1, null); - this.url = new URL(urlstring); - this.referrer = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, null); - this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim(); - this.loaddate = new Date(86400000 * entry.getColLong(5)); - this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, null).trim(); - this.depth = (int) entry.getColLong(7); - this.anchors = (int) entry.getColLong(8); - this.forkfactor = (int) entry.getColLong(9); - this.flags = new kelondroBitfield(entry.getColBytes(10)); - this.handle = Integer.parseInt(entry.getColString(11, null), 16); - return; - } - - public void store() { - // stores the values from the object variables into the database - if (this.stored) return; - String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, rowdef.width(5)); - // store the hash in the hash cache - try { - // even if the entry exists, we simply overwrite it - byte[][] entry = new byte[][] { - this.hash.getBytes(), - (initiator == null) ? "".getBytes() : this.initiator.getBytes(), - this.url.toString().getBytes(), - this.referrer.getBytes(), - this.name.getBytes("UTF-8"), - loaddatestr.getBytes(), - (this.profileHandle == null) ? null : this.profileHandle.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.depth, rowdef.width(7)).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, rowdef.width(8)).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, rowdef.width(9)).getBytes(), - this.flags.bytes(), - normalizeHandle(this.handle).getBytes() - }; - if (urlIndexFile == null) System.out.println("urlHashCache is NULL"); - if ((urlIndexFile != null) && (urlIndexFile.row() == null)) System.out.println("row() is NULL"); - urlIndexFile.put(urlIndexFile.row().newEntry(entry)); - this.stored = true; - } catch (IOException e) { - serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaNURL:store:" + e.toString() + ", resetting NURL-DB"); - e.printStackTrace(); - resetHashCache(); - } catch (kelondroException e) { - serverLog.logSevere("PLASMA", "plasmaCrawlNURL.store failed: " + e.toString() + ", resetting NURL-DB"); - e.printStackTrace(); - resetHashCache(); - } - } - - public String toString() { - StringBuffer str = new StringBuffer(); - str.append("hash: ").append(hash==null ? "null" : hash).append(" | ") - .append("initiator: ").append(initiator==null?"null":initiator).append(" | ") - .append("url: ").append(url==null?"null":url.toString()).append(" | ") - .append("referrer: ").append((referrer == null) ? plasmaURL.dummyHash : referrer).append(" | ") - .append("name: ").append((name == null) ? "null" : name).append(" | ") - .append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ") - .append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ") - .append("depth: ").append(Integer.toString(depth)).append(" | ") - .append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ") - .append("flags: ").append((flags==null) ? "null" : flags.exportB64()); - return str.toString(); - } - - /** - * return a url-hash, based on the md5 algorithm - * the result is a String of 12 bytes within a 72-bit space - * (each byte has an 6-bit range) - * that should be enough for all web pages on the world - */ - public String hash() { - return this.hash; - } - - public String initiator() { - if (initiator == null) return null; - if (initiator.length() == 0) return null; - return initiator; - } - - public boolean proxy() { - return (initiator() == null); - } - - public String referrerHash() { - return this.referrer; - } - - public URL url() { - return url; - } - - public Date loaddate() { - return loaddate; - } - - public String name() { - // return the creator's hash - return name; - } - - public int depth() { - return depth; - } - - public String profileHandle() { - return profileHandle; - } - } - - public class kiter implements Iterator { - // enumerates entry elements - Iterator i; - boolean error = false; - - public kiter(boolean up, String firstHash) throws IOException { - i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes()); - error = false; - } - - public boolean hasNext() { - if (error) return false; - return i.hasNext(); - } - - public Object next() throws RuntimeException { - kelondroRow.Entry e = (kelondroRow.Entry) i.next(); - if (e == null) return null; - try { - return new Entry(e); - } catch (IOException ex) { - throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); - } - } - - public void remove() { - i.remove(); + + public Iterator iterator(int stackType) { + // returns an iterator of plasmaCrawlBalancerEntry Objects + try {switch (stackType) { + case STACK_TYPE_CORE: return coreStack.iterator(); + case STACK_TYPE_LIMIT: return limitStack.iterator(); + case STACK_TYPE_REMOTE: return remoteStack.iterator(); + default: return null; + }} catch (IOException e) { + return new HashSet().iterator(); } - - } - - public Iterator entries(boolean up, String firstHash) throws IOException { - // enumerates entry elements - return new kiter(up, firstHash); } } diff --git a/source/de/anomic/plasma/plasmaCrawlStacker.java b/source/de/anomic/plasma/plasmaCrawlStacker.java index 2791114fe..757dc1f93 100644 --- a/source/de/anomic/plasma/plasmaCrawlStacker.java +++ b/source/de/anomic/plasma/plasmaCrawlStacker.java @@ -48,7 +48,6 @@ package de.anomic.plasma; import java.io.File; import java.io.IOException; -import java.io.UnsupportedEncodingException; import java.net.InetAddress; import java.net.MalformedURLException; import java.util.Date; @@ -61,8 +60,6 @@ import de.anomic.data.robotsParser; import de.anomic.http.httpc; import de.anomic.plasma.plasmaURL; import de.anomic.index.indexURLEntry; -import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.kelondro.kelondroBitfield; import de.anomic.kelondro.kelondroCache; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroFlexTable; @@ -171,7 +168,7 @@ public final class plasmaCrawlStacker { try { // getting a new message from the crawler queue checkInterruption(); - stackCrawlMessage theMsg = this.queue.waitForMessage(); + plasmaCrawlEntry theMsg = this.queue.waitForMessage(); if (theMsg != null) { // getting a free session thread from the pool @@ -196,18 +193,18 @@ public final class plasmaCrawlStacker { } public void enqueue( - String nexturlString, - String referrerString, + URL nexturl, + String referrerhash, String initiatorHash, String name, Date loadDate, int currentdepth, plasmaCrawlProfile.entry profile) { if (profile != null) try { - this.queue.addMessage(new stackCrawlMessage( + this.queue.addMessage(new plasmaCrawlEntry( initiatorHash, - nexturlString, - referrerString, + nexturl, + referrerhash, name, loadDate, profile.handle(), @@ -220,7 +217,7 @@ public final class plasmaCrawlStacker { } } - public String dequeue(stackCrawlMessage theMsg) throws InterruptedException { + public String dequeue(plasmaCrawlEntry theMsg) throws InterruptedException { plasmaCrawlProfile.entry profile = this.sb.profiles.getEntry(theMsg.profileHandle()); if (profile == null) { @@ -231,8 +228,8 @@ public final class plasmaCrawlStacker { return stackCrawl( theMsg.url().toString(), - theMsg.referrerHash(), - theMsg.initiatorHash(), + theMsg.referrerhash(), + theMsg.initiator(), theMsg.name(), theMsg.loaddate(), theMsg.depth(), @@ -424,175 +421,23 @@ public final class plasmaCrawlStacker { // add the url into the crawling queue checkInterruption(); - plasmaCrawlNURL.Entry ne = this.sb.noticeURL.newEntry(initiatorHash, /* initiator, needed for p2p-feedback */ + plasmaCrawlEntry ne = new plasmaCrawlEntry(initiatorHash, /* initiator, needed for p2p-feedback */ nexturl, /* url clear text string */ - loadDate, /* load date */ referrerHash, /* last url in crawling queue */ - name, /* the anchor name */ + name, /* load date */ + loadDate, /* the anchor name */ (profile == null) ? null : profile.handle(), // profile must not be null! currentdepth, /*depth so far*/ 0, /*anchors, default value */ 0 /*forkfactor, default value */ ); - ne.store(); this.sb.noticeURL.push( ((global) ? plasmaCrawlNURL.STACK_TYPE_LIMIT : ((local) ? plasmaCrawlNURL.STACK_TYPE_CORE : plasmaCrawlNURL.STACK_TYPE_REMOTE)) /*local/remote stack*/, - ne.hash()); + ne); return null; } - public final class stackCrawlMessage { - private String initiator; // the initiator hash, is NULL or "" if it is the own proxy; - String urlHash; // the url's hash - private String referrerHash; // the url's referrer hash - private String url; // the url as string - String name; // the name of the url, from anchor tag name - private Date loaddate; // the time when the url was first time appeared - private String profileHandle; // the name of the prefetch profile - private int depth; // the prefetch depth so far, starts at 0 - private int anchors; // number of anchors of the parent - private int forkfactor; // sum of anchors of all ancestors - private kelondroBitfield flags; - private int handle; - - // loadParallel(URL url, String referer, String initiator, int depth, plasmaCrawlProfile.entry profile) { - public stackCrawlMessage( - String initiator, - String urlString, - String referrerUrlString, - String name, - Date loaddate, - String profileHandle, - int depth, - int anchors, - int forkfactor) { - try { - // create new entry and store it into database - this.urlHash = plasmaURL.urlHash(urlString); - this.initiator = initiator; - this.url = urlString; - this.referrerHash = (referrerUrlString == null) ? plasmaURL.dummyHash : plasmaURL.urlHash(referrerUrlString); - this.name = (name == null) ? "" : name; - this.loaddate = (loaddate == null) ? new Date() : loaddate; - this.profileHandle = profileHandle; // must not be null - this.depth = depth; - this.anchors = anchors; - this.forkfactor = forkfactor; - this.flags = new kelondroBitfield(); - this.handle = 0; - } catch (Exception e) { - e.printStackTrace(); - } - } - - public stackCrawlMessage(String urlHash, kelondroRow.Entry entry) { - if (urlHash == null) throw new NullPointerException("Url hash was null"); - if (entry == null) throw new NullPointerException("kelondroRow.Entry was null"); - - try { - this.urlHash = urlHash; - this.initiator = entry.getColString(1, "UTF-8"); - this.url = entry.getColString(2, "UTF-8").trim(); - this.referrerHash = (entry.empty(3)) ? plasmaURL.dummyHash : entry.getColString(3, "UTF-8"); - this.name = (entry.empty(4)) ? "" : entry.getColString(4, "UTF-8").trim(); - this.loaddate = new Date(86400000 * entry.getColLong(5)); - this.profileHandle = (entry.empty(6)) ? null : entry.getColString(6, "UTF-8").trim(); - this.depth = (int) entry.getColLong(7); - this.anchors = (int) entry.getColLong(8); - this.forkfactor = (int) entry.getColLong(9); - this.flags = new kelondroBitfield(entry.getColBytes(10)); - try { - this.handle = Integer.parseInt(new String(entry.getColBytes(11), "UTF-8")); - } catch (NumberFormatException ee) { - System.out.println("BUG in stackCrawlMessage. entry = " + entry.toString()); - throw new RuntimeException(ee.getMessage()); - } - } catch (Exception e) { - e.printStackTrace(); - throw new IllegalStateException(e.toString()); - } - } - - public String url() { - return this.url; - } - - public String referrerHash() { - return this.referrerHash; - } - - public String initiatorHash() { - if (this.initiator == null) return null; - if (this.initiator.length() == 0) return null; - return this.initiator; - } - - public Date loaddate() { - return this.loaddate; - } - - public String name() { - return this.name; - } - - public int depth() { - return this.depth; - } - - public String profileHandle() { - return this.profileHandle; - } - - public String toString() { - StringBuffer str = new StringBuffer(); - str.append("urlHash: ").append(urlHash==null ? "null" : urlHash).append(" | ") - .append("initiator: ").append(initiator==null?"null":initiator).append(" | ") - .append("url: ").append(url==null?"null":url).append(" | ") - .append("referrer: ").append((referrerHash == null) ? plasmaURL.dummyHash : referrerHash).append(" | ") - .append("name: ").append((name == null) ? "null" : name).append(" | ") - .append("loaddate: ").append((loaddate == null) ? new Date() : loaddate).append(" | ") - .append("profile: ").append(profileHandle==null?"null":profileHandle).append(" | ") - .append("depth: ").append(Integer.toString(depth)).append(" | ") - .append("forkfactor: ").append(Integer.toString(forkfactor)).append(" | ") - //.append("flags: ").append((flags==null) ? "null" : flags.toString()) - ; - return str.toString(); - } - - public byte[][] getBytes() { - // stores the values from the object variables into the database - String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, plasmaCrawlNURL.rowdef.width(5)); - // store the hash in the hash cache - - // even if the entry exists, we simply overwrite it - byte[][] entry = null; - try { - entry = new byte[][] { - this.urlHash.getBytes(), - (this.initiator == null) ? "".getBytes() : this.initiator.getBytes(), - this.url.getBytes(), - this.referrerHash.getBytes(), - this.name.getBytes("UTF-8"), - loaddatestr.getBytes(), - (this.profileHandle == null) ? null : this.profileHandle.getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.depth, plasmaCrawlNURL.rowdef.width(7)).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.anchors, plasmaCrawlNURL.rowdef.width(8)).getBytes(), - kelondroBase64Order.enhancedCoder.encodeLong(this.forkfactor, plasmaCrawlNURL.rowdef.width(9)).getBytes(), - this.flags.bytes(), - normalizeHandle(this.handle).getBytes() - }; - } catch (UnsupportedEncodingException e) { /* ignore this */ } - return entry; - } - - private String normalizeHandle(int h) { - String d = Integer.toHexString(h); - while (d.length() < plasmaCrawlNURL.rowdef.width(11)) d = "0" + d; - return d; - } - } - final class stackCrawlQueue { private final serverSemaphore readSync; @@ -657,10 +502,10 @@ public final class plasmaCrawlStacker { // do nothing.. } if (this.dbtype == QUEUE_DB_TYPE_FLEX) { - kelondroFlexWidthArray.delete(cacheStacksPath, "urlPreNotice2.table"); + kelondroFlexWidthArray.delete(cacheStacksPath, "urlNoticeStacker7.db"); } if (this.dbtype == QUEUE_DB_TYPE_TREE) { - File cacheFile = new File(cacheStacksPath, "urlPreNotice.db"); + File cacheFile = new File(cacheStacksPath, "urlNoticeStacker.db"); cacheFile.delete(); } } @@ -669,19 +514,19 @@ public final class plasmaCrawlStacker { if (!(cacheStacksPath.exists())) cacheStacksPath.mkdir(); // make the path if (this.dbtype == QUEUE_DB_TYPE_RAM) { - this.urlEntryCache = new kelondroRowSet(plasmaCrawlNURL.rowdef, 0); + this.urlEntryCache = new kelondroRowSet(plasmaCrawlEntry.rowdef, 0); } if (this.dbtype == QUEUE_DB_TYPE_FLEX) { - String newCacheName = "urlPreNotice2.table"; + String newCacheName = "urlNoticeStacker7.db"; cacheStacksPath.mkdirs(); try { - this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlNURL.rowdef), true, false); + this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef), true, false); } catch (Exception e) { e.printStackTrace(); // kill DB and try again kelondroFlexTable.delete(cacheStacksPath, newCacheName); try { - this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlNURL.rowdef), true, false); + this.urlEntryCache = new kelondroCache(new kelondroFlexTable(cacheStacksPath, newCacheName, preloadTime, plasmaCrawlEntry.rowdef), true, false); } catch (Exception ee) { ee.printStackTrace(); System.exit(-1); @@ -689,10 +534,10 @@ public final class plasmaCrawlStacker { } } if (this.dbtype == QUEUE_DB_TYPE_TREE) { - File cacheFile = new File(cacheStacksPath, "urlPreNotice.db"); + File cacheFile = new File(cacheStacksPath, "urlNoticeStacker.db"); cacheFile.getParentFile().mkdirs(); try { - this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, preloadTime, plasmaCrawlNURL.rowdef), true, true); + this.urlEntryCache = new kelondroCache(kelondroTree.open(cacheFile, true, preloadTime, plasmaCrawlEntry.rowdef), true, true); } catch (IOException e) { e.printStackTrace(); System.exit(-1); @@ -708,7 +553,7 @@ public final class plasmaCrawlStacker { this.urlEntryHashCache.clear(); } - public void addMessage(stackCrawlMessage newMessage) + public void addMessage(plasmaCrawlEntry newMessage) throws InterruptedException, IOException { if (newMessage == null) throw new NullPointerException(); @@ -717,9 +562,9 @@ public final class plasmaCrawlStacker { boolean insertionDoneSuccessfully = false; synchronized(this.urlEntryHashCache) { - kelondroRow.Entry oldValue = this.urlEntryCache.put(this.urlEntryCache.row().newEntry(newMessage.getBytes())); + kelondroRow.Entry oldValue = this.urlEntryCache.put(newMessage.toRow()); if (oldValue == null) { - insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.urlHash); + insertionDoneSuccessfully = this.urlEntryHashCache.add(newMessage.urlhash()); } } @@ -741,7 +586,7 @@ public final class plasmaCrawlStacker { return this.dbtype; } - public stackCrawlMessage waitForMessage() throws InterruptedException, IOException { + public plasmaCrawlEntry waitForMessage() throws InterruptedException, IOException { this.readSync.P(); this.writeSync.P(); @@ -759,7 +604,7 @@ public final class plasmaCrawlStacker { } if ((urlHash == null) || (entry == null)) return null; - return new stackCrawlMessage(urlHash, entry); + return new plasmaCrawlEntry(entry); } } @@ -941,7 +786,7 @@ public final class plasmaCrawlStacker { private boolean running = false; private boolean stopped = false; private boolean done = false; - private stackCrawlMessage theMsg; + private plasmaCrawlEntry theMsg; public Worker(ThreadGroup theThreadGroup) { super(theThreadGroup,"stackCrawlThread_created"); @@ -963,7 +808,7 @@ public final class plasmaCrawlStacker { } } - public synchronized void execute(stackCrawlMessage newMsg) { + public synchronized void execute(plasmaCrawlEntry newMsg) { this.theMsg = newMsg; this.done = false; @@ -1020,7 +865,7 @@ public final class plasmaCrawlStacker { private void execute() throws InterruptedException { try { - this.setName("stackCrawlThread_" + this.theMsg.url); + this.setName("stackCrawlThread_" + this.theMsg.url()); String rejectReason = dequeue(this.theMsg); // check for interruption @@ -1028,15 +873,9 @@ public final class plasmaCrawlStacker { // if the url was rejected we store it into the error URL db if (rejectReason != null) { - plasmaCrawlEURL.Entry ee = sb.errorURL.newEntry( - new URL(this.theMsg.url()), - this.theMsg.referrerHash(), - this.theMsg.initiatorHash(), - yacyCore.seedDB.mySeed.hash, - this.theMsg.name, - rejectReason, - new kelondroBitfield() - ); + plasmaCrawlZURL.Entry ee = sb.errorURL.newEntry( + this.theMsg, yacyCore.seedDB.mySeed.hash, null, + 0, rejectReason); ee.store(); sb.errorURL.stackPushEntry(ee); } diff --git a/source/de/anomic/plasma/plasmaCrawlZURL.java b/source/de/anomic/plasma/plasmaCrawlZURL.java new file mode 100644 index 000000000..e60400d00 --- /dev/null +++ b/source/de/anomic/plasma/plasmaCrawlZURL.java @@ -0,0 +1,274 @@ +// plasmaCrawlZURL.java +// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 15.03.2007 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.plasma; + +import java.io.File; +import java.io.IOException; +import java.util.Date; +import java.util.Iterator; +import java.util.LinkedList; + +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.kelondro.kelondroFlexTable; +import de.anomic.kelondro.kelondroIndex; +import de.anomic.kelondro.kelondroRow; +import de.anomic.net.URL; +import de.anomic.yacy.yacyCore; +import de.anomic.yacy.yacySeedDB; + +public class plasmaCrawlZURL { + + public final static kelondroRow rowdef = new kelondroRow( + "String urlhash-" + yacySeedDB.commonHashLength + ", " + // the url's hash + "String executor-" + yacySeedDB.commonHashLength + ", " + // the crawling executor + "Cardinal workdate-8 {b256}, " + // the time when the url was last time tried to load + "Cardinal workcount-4 {b256}, " + // number of load retries + "String anycause-80, " + // string describing load failure + "byte[] entry-" + plasmaCrawlEntry.rowdef.objectsize(), // extra space + kelondroBase64Order.enhancedCoder, + 0); + + // the class object + private kelondroIndex urlIndexFile = null; + private LinkedList rejectedStack = new LinkedList(); // strings: url + + public plasmaCrawlZURL(File cachePath, String tablename) { + cachePath.mkdirs(); + try { + urlIndexFile = new kelondroFlexTable(cachePath, tablename, -1, rowdef); + } catch (IOException e) { + e.printStackTrace(); + System.exit(-1); + } + } + + public int size() { + try { + return urlIndexFile.size() ; + } catch (IOException e) { + return 0; + } + } + + public void close() { + if (urlIndexFile != null) { + urlIndexFile.close(); + urlIndexFile = null; + } + } + + public synchronized Entry newEntry( + plasmaCrawlEntry bentry, String executor, Date workdate, + int workcount, String anycause) { + if ((executor == null) || (executor.length() < yacySeedDB.commonHashLength)) executor = plasmaURL.dummyHash; + if (anycause == null) anycause = "unknown"; + return new Entry(bentry, executor, workdate, workcount, anycause); + } + + public synchronized Entry newEntry(URL url, String anycause) { + return new Entry(url, anycause); + } + + public boolean remove(String hash) { + if (hash == null) return false; + try { + urlIndexFile.remove(hash.getBytes()); + return true; + } catch (IOException e) { + return false; + } + } + + public synchronized void stackPushEntry(Entry e) { + rejectedStack.add(e.hash()); + } + + public Entry stackPopEntry(int pos) throws IOException { + String urlhash = (String) rejectedStack.get(pos); + if (urlhash == null) return null; + return new Entry(urlhash); + } + + public synchronized Entry getEntry(String hash) throws IOException { + return new Entry(hash); + } + + public boolean getUseNewDB() { + return (urlIndexFile instanceof kelondroFlexTable); + } + + public boolean exists(String urlHash) { + try { + return urlIndexFile.has(urlHash.getBytes()); + } catch (IOException e) { + return false; + } + } + + public void clearStack() { + rejectedStack.clear(); + } + + public int stackSize() { + return rejectedStack.size(); + } + + public class Entry { + + plasmaCrawlEntry bentry; // the balancer entry + private String executor; // the crawling initiator + private Date workdate; // the time when the url was last time tried to load + private int workcount; // number of tryings + private String anycause; // string describing reason for load fail + private boolean stored; + + public Entry(URL url, String reason) { + this(new plasmaCrawlEntry(url), null, new Date(), 0, reason); + } + + public Entry( + plasmaCrawlEntry bentry, String executor, Date workdate, + int workcount, String anycause) { + // create new entry + this.bentry = bentry; + this.executor = (executor == null) ? yacyCore.seedDB.mySeed.hash : executor; + this.workdate = (workdate == null) ? new Date() : workdate; + this.workcount = workcount; + this.anycause = (anycause == null) ? "" : anycause; + stored = false; + } + + public Entry(String hash) throws IOException { + kelondroRow.Entry entry = urlIndexFile.get(hash.getBytes()); + if (entry != null) { + insertEntry(entry); + } + this.stored = true; + } + + public Entry(kelondroRow.Entry entry) throws IOException { + insertEntry(entry); + this.stored = false; + } + + private void insertEntry(kelondroRow.Entry entry) throws IOException { + assert (entry != null); + this.executor = entry.getColString(1, "UTF-8"); + this.workdate = new Date(entry.getColLong(2)); + this.workcount = (int) entry.getColLong(3); + this.anycause = entry.getColString(4, "UTF-8"); + this.bentry = new plasmaCrawlEntry(plasmaCrawlEntry.rowdef.newEntry(entry.getColBytes(5))); + assert ((new String(entry.getColBytes(0))).equals(bentry.urlhash())); + return; + } + + public void store() { + // stores the values from the object variables into the database + if (this.stored) return; + if (this.bentry == null) return; + kelondroRow.Entry newrow = rowdef.newEntry(); + newrow.setCol(0, this.bentry.urlhash().getBytes()); + newrow.setCol(1, this.executor.getBytes()); + newrow.setCol(2, this.workdate.getTime()); + newrow.setCol(3, this.workcount); + newrow.setCol(4, this.anycause.getBytes()); + newrow.setCol(5, this.bentry.toRow().bytes()); + try { + urlIndexFile.put(newrow); + this.stored = true; + } catch (IOException e) { + System.out.println("INTERNAL ERROR AT plasmaEURL:url2hash:" + e.toString()); + } + } + + public URL url() { + return this.bentry.url(); + } + + public String initiator() { + return this.bentry.initiator(); + } + + public String hash() { + // return a url-hash, based on the md5 algorithm + // the result is a String of 12 bytes within a 72-bit space + // (each byte has an 6-bit range) + // that should be enough for all web pages on the world + return this.bentry.urlhash(); + } + + public Date workdate() { + return workdate; + } + + public String executor() { + // return the creator's hash + return executor; + } + + public String anycause() { + return anycause; + } + + } + + public class kiter implements Iterator { + // enumerates entry elements + Iterator i; + boolean error = false; + + public kiter(boolean up, String firstHash) throws IOException { + i = urlIndexFile.rows(up, (firstHash == null) ? null : firstHash.getBytes()); + error = false; + } + + public boolean hasNext() { + if (error) return false; + return i.hasNext(); + } + + public Object next() throws RuntimeException { + kelondroRow.Entry e = (kelondroRow.Entry) i.next(); + if (e == null) return null; + try { + return new Entry(e); + } catch (IOException ex) { + throw new RuntimeException("error '" + ex.getMessage() + "' for hash " + e.getColString(0, null)); + } + } + + public void remove() { + i.remove(); + } + + } + + public Iterator entries(boolean up, String firstHash) throws IOException { + // enumerates entry elements + return new kiter(up, firstHash); + } +} diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java index b064e83c2..88d6c10d8 100644 --- a/source/de/anomic/plasma/plasmaSearchEvent.java +++ b/source/de/anomic/plasma/plasmaSearchEvent.java @@ -454,7 +454,7 @@ public final class plasmaSearchEvent extends Thread implements Runnable { if (rcLocal == null) return; plasmaSearchPreOrder preorder = new plasmaSearchPreOrder(query, ranking, rcLocal, timeout - System.currentTimeMillis()); - preorder.remove(true, true); + if (preorder.filteredCount()> query.wantedResults) preorder.remove(true, true); // start url-fetch indexRWIEntryNew entry; diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index ab4c16ef2..c6ca8a9dd 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -512,6 +512,7 @@ public class plasmaSnippetCache { maxLength = maxLength - result.length(); if (maxLength < 20) maxLength = 20; tsr = computeTextSnippet(os.values().iterator(), remaininghashes, maxLength); + if (tsr == null) return null; String nextSnippet = (String) tsr[0]; if (nextSnippet == null) return tsr; return new Object[]{result + (" / " + nextSnippet), tsr[1]}; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index cba80353d..194b5d73a 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -209,7 +209,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public File workPath; public HashMap rankingPermissions; public plasmaCrawlNURL noticeURL; - public plasmaCrawlEURL errorURL; + public plasmaCrawlZURL errorURL, delegatedURL; public plasmaWordIndex wordIndex; public plasmaHTCache cacheManager; public plasmaSnippetCache snippetCache; @@ -1038,8 +1038,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // start indexing management log.logConfig("Starting Indexing Management"); - noticeURL = new plasmaCrawlNURL(plasmaPath, -1); - errorURL = new plasmaCrawlEURL(plasmaPath, -1); + noticeURL = new plasmaCrawlNURL(plasmaPath); + errorURL = new plasmaCrawlZURL(plasmaPath, "urlError.db"); + delegatedURL = new plasmaCrawlZURL(plasmaPath, "urlDelegated.db"); wordIndex = new plasmaWordIndex(indexPath, ramRWI_time, log); // set a high maximum cache size to current size; this is adopted later automatically @@ -1330,19 +1331,20 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // if it not exists, null is returned if (wordIndex.loadedURL.exists(hash)) return "loaded"; if (noticeURL.existsInStack(hash)) return "crawler"; + if (delegatedURL.exists(hash)) return "delegated"; if (errorURL.exists(hash)) return "errors"; return null; } public URL getURL(String urlhash) throws IOException { if (urlhash.equals(plasmaURL.dummyHash)) return null; - try { - plasmaCrawlNURL.Entry ne = noticeURL.getEntry(urlhash); - if (ne != null) return ne.url(); - } catch (IOException e) {} + plasmaCrawlEntry ne = noticeURL.get(urlhash); + if (ne != null) return ne.url(); indexURLEntry le = wordIndex.loadedURL.load(urlhash, null); if (le != null) return le.comp().url(); - plasmaCrawlEURL.Entry ee = errorURL.getEntry(urlhash); + plasmaCrawlZURL.Entry ee = delegatedURL.getEntry(urlhash); + if (ee != null) return ee.url(); + ee = errorURL.getEntry(urlhash); if (ee != null) return ee.url(); return null; } @@ -1602,6 +1604,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser flushCitationReference(crg, "crg"); log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)"); noticeURL.close(); + delegatedURL.close(); errorURL.close(); wordIndex.close(); yc.close(); @@ -1739,6 +1742,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser public int cleanupJobSize() { int c = 0; + if ((delegatedURL.stackSize() > 1000)) c++; if ((errorURL.stackSize() > 1000)) c++; for (int i = 1; i <= 6; i++) { if (wordIndex.loadedURL.getStackSize(i) > 1000) c++; @@ -1758,6 +1762,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser rankingOwnDistribution.transferRanking(count); rankingOtherDistribution.transferRanking(1); + // clean up delegated stack + checkInterruption(); + if ((delegatedURL.stackSize() > 1000)) { + log.logFine("Cleaning Delegated-URLs report stack, " + delegatedURL.stackSize() + " entries on stack"); + delegatedURL.clearStack(); + hasDoneSomething = true; + } + // clean up error stack checkInterruption(); if ((errorURL.stackSize() > 1000)) { @@ -1765,6 +1777,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser errorURL.clearStack(); hasDoneSomething = true; } + // clean up loadedURL stack for (int i = 1; i <= 6; i++) { checkInterruption(); @@ -1774,6 +1787,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser hasDoneSomething = true; } } + // clean up profiles checkInterruption(); if (cleanProfiles()) hasDoneSomething = true; @@ -1883,7 +1897,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // do a local crawl - plasmaCrawlNURL.Entry urlEntry = null; + plasmaCrawlEntry urlEntry = null; while (urlEntry == null && noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) > 0) { String stats = "LOCALCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; try { @@ -1953,7 +1967,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String stats = "REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; try { - plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); + plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_LIMIT); String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + urlEntry.url()); @@ -2040,7 +2054,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser String stats = "REMOTETRIGGEREDCRAWL[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_LIMIT) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_OVERHANG) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]"; try { - plasmaCrawlNURL.Entry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); + plasmaCrawlEntry urlEntry = noticeURL.pop(plasmaCrawlNURL.STACK_TYPE_REMOTE); String profileHandle = urlEntry.profileHandle(); // System.out.println("DEBUG plasmaSwitchboard.processCrawling: // profileHandle = " + profileHandle + ", urlEntry.url = " + @@ -2155,6 +2169,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser Map hl = document.getHyperlinks(); Iterator i = hl.entrySet().iterator(); String nextUrlString; + URL nextUrl; Map.Entry nextEntry; while (i.hasNext()) { // check for interruption @@ -2164,10 +2179,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser nextEntry = (Map.Entry) i.next(); nextUrlString = (String) nextEntry.getKey(); try { - nextUrlString = new URL(nextUrlString).toNormalform(); + nextUrl = new URL(nextUrlString); // enqueue the hyperlink into the pre-notice-url db - sbStackCrawlThread.enqueue(nextUrlString, entry.url().toString(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile()); + sbStackCrawlThread.enqueue(nextUrl, entry.urlHash(), initiatorPeerHash, (String) nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile()); } catch (MalformedURLException e1) {} } log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.normalizedURLString() + @@ -2447,11 +2462,13 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } // removing current entry from notice URL queue + /* boolean removed = noticeURL.remove(entry.urlHash()); // worked-off if (!removed) { log.logFinest("Unable to remove indexed URL " + entry.url() + " from Crawler Queue. This could be because of an URL redirect."); } - + */ + // explicit delete/free resources if ((entry != null) && (entry.profile() != null) && (!(entry.profile().storeHTCache()))) { plasmaHTCache.filesInUse.remove(entry.cacheFile()); @@ -2540,7 +2557,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser } } - private void processLocalCrawling(plasmaCrawlNURL.Entry urlEntry, plasmaCrawlProfile.entry profile, String stats) { + private void processLocalCrawling(plasmaCrawlEntry urlEntry, plasmaCrawlProfile.entry profile, String stats) { // work off one Crawl stack entry if ((urlEntry == null) || (urlEntry.url() == null)) { log.logInfo(stats + ": urlEntry=null"); @@ -2549,114 +2566,117 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser // convert the referrer hash into the corresponding URL URL refererURL = null; - String refererHash = urlEntry.referrerHash(); + String refererHash = urlEntry.referrerhash(); if ((refererHash != null) && (!refererHash.equals(plasmaURL.dummyHash))) try { refererURL = this.getURL(refererHash); } catch (IOException e) { refererURL = null; } cacheLoader.loadAsync(urlEntry.url(), urlEntry.name(), (refererURL!=null)?refererURL.toString():null, urlEntry.initiator(), urlEntry.depth(), profile); - log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.hash() + "]"); + log.logInfo(stats + ": enqueued for load " + urlEntry.url() + " [" + urlEntry.urlhash() + "]"); return; } - private boolean processRemoteCrawlTrigger(plasmaCrawlNURL.Entry urlEntry) { + private boolean processRemoteCrawlTrigger(plasmaCrawlEntry urlEntry) { + // if this returns true, then the urlEntry is considered as stored somewhere and the case is finished + // if this returns false, the urlEntry will be enqueued to the local crawl again - // return true iff another peer has/will index(ed) the url + // wrong access if (urlEntry == null) { log.logInfo("REMOTECRAWLTRIGGER[" + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_CORE) + ", " + noticeURL.stackSize(plasmaCrawlNURL.STACK_TYPE_REMOTE) + "]: urlEntry=null"); - return true; // superfluous request; true correct in this context - } - - // are we qualified? - if ((yacyCore.seedDB.mySeed == null) || - (yacyCore.seedDB.mySeed.isJunior())) { - log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission"); - return false; + return true; // superfluous request; true correct in this context because the urlEntry shall not be tracked any more } // check url if (urlEntry.url() == null) { log.logFine("ERROR: plasmaSwitchboard.processRemoteCrawlTrigger - url is null. name=" + urlEntry.name()); - return true; + return true; // same case as above: no more consideration } - String urlhash = plasmaURL.urlHash(urlEntry.url()); - // check remote crawl - yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlhash); + // are we qualified for a remote crawl? + if ((yacyCore.seedDB.mySeed == null) || (yacyCore.seedDB.mySeed.isJunior())) { + log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no permission"); + return false; // no, we must crawl this page ourselves + } + // check if peer for remote crawl is available + yacySeed remoteSeed = yacyCore.dhtAgent.getCrawlSeed(urlEntry.urlhash()); if (remoteSeed == null) { log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: no remote crawl seed available"); return false; } // do the request + HashMap page = null; try { - HashMap page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerHash()), 6000); + page = yacyClient.crawlOrder(remoteSeed, urlEntry.url(), getURL(urlEntry.referrerhash()), 6000); + } catch (IOException e1) { + log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerhash(), e1); + return false; + } - // check success - /* - * the result of the 'response' value can have one of the following - * values: negative cases, no retry denied - the peer does not want - * to crawl that exception - an exception occurred - * - * negative case, retry possible rejected - the peer has rejected to - * process, but a re-try should be possible - * - * positive case with crawling stacked - the resource is processed - * asap - * - * positive case without crawling double - the resource is already - * in database, believed to be fresh and not reloaded the resource - * is also returned in lurl - */ - if ((page == null) || (page.get("delay") == null)) { - log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer."); - if (remoteSeed != null) { - yacyCore.peerActions.peerDeparture(remoteSeed); - } - return false; - } else + // check if we got contact to peer and the peer respondet + if ((page == null) || (page.get("delay") == null)) { + log.logInfo("CRAWL: REMOTE CRAWL TO PEER " + remoteSeed.getName() + " FAILED. CAUSE: unknown (URL=" + urlEntry.url().toString() + "). Removed peer."); + yacyCore.peerActions.peerDeparture(remoteSeed); + return false; // no response from peer, we will crawl this ourself + } + + log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + + ", response=" + page.toString()); // DEBUG + + // we received an answer and we are told to wait a specific time until we shall ask again for another crawl + int newdelay = Integer.parseInt((String) page.get("delay")); + yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay); + String response = (String) page.get("response"); + if (response.equals("stacked")) { + // success, the remote peer accepted the crawl + log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + + " PLACED URL=" + urlEntry.url().toString() + + "; NEW DELAY=" + newdelay); + // track this remote crawl + this.delegatedURL.newEntry(urlEntry, remoteSeed.hash, new Date(), 0, response).store(); + return true; + } + + // check other cases: the remote peer may respond that it already knows that url + if (response.equals("double")) { + // in case the peer answers double, it transmits the complete lurl data + String lurl = (String) page.get("lurl"); + if ((lurl != null) && (lurl.length() != 0)) { + String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); + indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr); try { - log.logFine("plasmaSwitchboard.processRemoteCrawlTrigger: remoteSeed=" + remoteSeed.getName() + ", url=" + urlEntry.url().toString() + ", response=" + page.toString()); // DEBUG - - int newdelay = Integer.parseInt((String) page.get("delay")); - yacyCore.dhtAgent.setCrawlDelay(remoteSeed.hash, newdelay); - String response = (String) page.get("response"); - if (response.equals("stacked")) { - log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " PLACED URL=" + urlEntry.url().toString() + "; NEW DELAY=" + newdelay); - return true; - } else if (response.equals("double")) { - String lurl = (String) page.get("lurl"); - if ((lurl != null) && (lurl.length() != 0)) { - String propStr = crypt.simpleDecode(lurl, (String) page.get("key")); - indexURLEntry entry = wordIndex.loadedURL.newEntry(propStr); - wordIndex.loadedURL.store(entry); - wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? - noticeURL.remove(entry.hash()); - log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " SUPERFLUOUS. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + "). URL IS CONSIDERED AS 'LOADED!'"); - return true; - } else { - log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " REJECTED. CAUSE: " + page.get("reason") + " (URL=" + urlEntry.url().toString() + ")"); - remoteSeed.setFlagAcceptRemoteCrawl(false); - yacyCore.seedDB.update(remoteSeed.hash, remoteSeed); - return false; - } - } else { - log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " DENIED. RESPONSE=" + response + ", CAUSE=" + page.get("reason") + ", URL=" + urlEntry.url().toString()); - remoteSeed.setFlagAcceptRemoteCrawl(false); - yacyCore.seedDB.update(remoteSeed.hash, remoteSeed); - return false; - } - } catch (Exception e) { - // wrong values - log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. CLIENT RETURNED: " + page.toString(), e); - return false; + wordIndex.loadedURL.store(entry); + wordIndex.loadedURL.stack(entry, yacyCore.seedDB.mySeed.hash, remoteSeed.hash, 1); // *** ueberfluessig/doppelt? + // noticeURL.remove(entry.hash()); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); } - } catch (IOException e) { - log.logSevere(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + " FAILED. URL CANNOT BE RETRIEVED from referrer hash: " + urlEntry.referrerHash(), e); - return false; + + log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + + " SUPERFLUOUS. CAUSE: " + page.get("reason") + + " (URL=" + urlEntry.url().toString() + + "). URL IS CONSIDERED AS 'LOADED!'"); + return true; + } else { + log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + + " REJECTED. CAUSE: bad lurl response / " + page.get("reason") + " (URL=" + + urlEntry.url().toString() + ")"); + remoteSeed.setFlagAcceptRemoteCrawl(false); + yacyCore.seedDB.update(remoteSeed.hash, remoteSeed); + return false; + } } + + log.logInfo(STR_REMOTECRAWLTRIGGER + remoteSeed.getName() + + " DENIED. RESPONSE=" + response + ", CAUSE=" + + page.get("reason") + ", URL=" + urlEntry.url().toString()); + remoteSeed.setFlagAcceptRemoteCrawl(false); + yacyCore.seedDB.update(remoteSeed.hash, remoteSeed); + return false; } private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy"); @@ -3165,20 +3185,24 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser kelondroBitfield flags ) { // create a new errorURL DB entry - plasmaCrawlEURL.Entry ee = this.errorURL.newEntry( - url, - referrerHash, - initiator, - yacyCore.seedDB.mySeed.hash, - (name==null)?"":name, - failreason, - flags - ); + plasmaCrawlEntry bentry = new plasmaCrawlEntry( + initiator, + url, + referrerHash, + (name == null) ? "" : name, + new Date(), + null, + 0, + 0, + 0); + plasmaCrawlZURL.Entry ee = this.errorURL.newEntry( + bentry, initiator, new Date(), + 0, failreason); // store the entry ee.store(); // push it onto the stack this.errorURL.stackPushEntry(ee); - } + } public void checkInterruption() throws InterruptedException { Thread curThread = Thread.currentThread(); diff --git a/source/de/anomic/plasma/plasmaSwitchboardQueue.java b/source/de/anomic/plasma/plasmaSwitchboardQueue.java index 9645fa9d8..4a39f14dd 100644 --- a/source/de/anomic/plasma/plasmaSwitchboardQueue.java +++ b/source/de/anomic/plasma/plasmaSwitchboardQueue.java @@ -253,20 +253,7 @@ public class plasmaSwitchboardQueue { this.contentInfo = null; this.referrerURL = null; } - - public String toString() { - StringBuffer str = new StringBuffer(); - str.append("url: ") .append(this.url==null ? "null" : this.url.toString()).append(" | ") - .append("referrer: ") .append(this.referrerHash==null?"null":this.referrerHash).append(" | ") - .append("ifModifiedSince: ").append(this.ifModifiedSince==null?"null":this.ifModifiedSince.toString()).append(" | ") - .append("flags: ") .append(Byte.toString(this.flags)).append(" | ") - .append("initiator: ") .append(this.initiator==null ? "null" : this.initiator).append(" | ") - .append("depth: ") .append(Integer.toString(this.depth)).append(" | ") - .append("profile: ") .append(this.profileHandle==null?"null":this.profileHandle).append(" | ") - .append("anchorName: ") .append(this.anchorName==null?"null":this.anchorName); - return str.toString(); - } - + public URL url() { return url; } diff --git a/source/yacy.java b/source/yacy.java index 4b9887c6e..60039f149 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -84,9 +84,10 @@ import de.anomic.kelondro.kelondroRow; import de.anomic.kelondro.kelondroTree; import de.anomic.net.URL; import de.anomic.plasma.plasmaCondenser; -import de.anomic.plasma.plasmaCrawlEURL; +import de.anomic.plasma.plasmaCrawlEntry; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlNURL; +import de.anomic.plasma.plasmaCrawlZURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaWordIndex; import de.anomic.plasma.plasmaWordIndexAssortment; @@ -1011,11 +1012,11 @@ public final class yacy { } if (source.equals("eurl")) { Iterator eiter = sb.errorURL.entries(true, null); - plasmaCrawlEURL.Entry entry; + plasmaCrawlZURL.Entry entry; while (eiter.hasNext()) { try { - entry = (plasmaCrawlEURL.Entry) eiter.next(); - if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.failreason()); + entry = (plasmaCrawlZURL.Entry) eiter.next(); + if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), entry.anycause()); } catch (Exception e) { // here a MalformedURLException may occur // just ignore @@ -1029,11 +1030,11 @@ public final class yacy { } } if (source.equals("nurl")) { - Iterator eiter = sb.noticeURL.entries(true, null); - plasmaCrawlNURL.Entry entry; + Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); + plasmaCrawlEntry entry; while (eiter.hasNext()) { try { - entry = (plasmaCrawlNURL.Entry) eiter.next(); + entry = (plasmaCrawlEntry) eiter.next(); if ((entry != null) && (entry.url() != null)) doms.put(entry.url().getHost(), "profile=" + entry.profileHandle() + ", depth=" + entry.depth()); } catch (Exception e) { // here a MalformedURLException may occur @@ -1120,12 +1121,12 @@ public final class yacy { } if (source.equals("eurl")) { Iterator eiter = sb.errorURL.entries(true, null); - plasmaCrawlEURL.Entry entry; + plasmaCrawlZURL.Entry entry; while (eiter.hasNext()) { - entry = (plasmaCrawlEURL.Entry) eiter.next(); + entry = (plasmaCrawlZURL.Entry) eiter.next(); if ((entry != null) && (entry.url() != null)) { if (html) { - bos.write(("" + entry.url() + " " + entry.failreason() + "
").getBytes("UTF-8")); + bos.write(("" + entry.url() + " " + entry.anycause() + "
").getBytes("UTF-8")); bos.write(serverCore.crlf); } else { bos.write(entry.url().toString().getBytes()); @@ -1135,10 +1136,10 @@ public final class yacy { } } if (source.equals("nurl")) { - Iterator eiter = sb.noticeURL.entries(true, null); - plasmaCrawlNURL.Entry entry; + Iterator eiter = sb.noticeURL.iterator(plasmaCrawlNURL.STACK_TYPE_CORE); + plasmaCrawlEntry entry; while (eiter.hasNext()) { - entry = (plasmaCrawlNURL.Entry) eiter.next(); + entry = (plasmaCrawlEntry) eiter.next(); if ((entry != null) && (entry.url() != null)) { if (html) { bos.write(("" + entry.url() + " " + "profile=" + entry.profileHandle() + ", depth=" + entry.depth() + "
").getBytes("UTF-8"));