diff --git a/defaults/yacy.init b/defaults/yacy.init
index 972604e05..0a42d7836 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -568,9 +568,12 @@ performanceSpeed=100
80_indexing_idlesleep=1000
80_indexing_busysleep=10
80_indexing_memprereq=6291456
-82_crawlstack_idlesleep=5000
+82_crawlstack_idlesleep=1000
82_crawlstack_busysleep=0
82_crawlstack_memprereq=1048576
+83_crawlstack_idlesleep=1200
+83_crawlstack_busysleep=0
+83_crawlstack_memprereq=1048576
90_cleanup_idlesleep=300000
90_cleanup_busysleep=300000
90_cleanup_memprereq=0
@@ -818,12 +821,6 @@ svnRevision=0
currentSkin=default
-# temporary flag for new database structure. set only true for testing
-# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
-# table-types: RAM = 0, TREE = 1, FLEX = 2;
-# if you set this to a non-RAM value, you should increase the stacker.slots value
-tableTypeForPreNURL=0
-
# flag to show if pages shall be usable for non-admin users
# this can be applied to the Surftips.html and yacysearch.html page
publicSurftips = true
diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java
index afe797d24..00372ed19 100644
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@@ -109,11 +109,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("deletecomplete") && post.containsKey("confirmDelete")) {
sb.webIndex.clear();
sb.crawlQueues.clear();
- try {
- sb.crawlStacker.clear();
- } catch (final IOException e) {
- e.printStackTrace();
- }
+ sb.crawlStacker.clear();
try {
sb.robots.clear();
} catch (final IOException e) {
diff --git a/htroot/PeerLoadPicture.java b/htroot/PeerLoadPicture.java
index 6eb25c0d0..30a5d2797 100644
--- a/htroot/PeerLoadPicture.java
+++ b/htroot/PeerLoadPicture.java
@@ -29,7 +29,8 @@ public class PeerLoadPicture {
final CircleThreadPiece misc = new CircleThreadPiece("Misc.", new Color(190, 50, 180));
final HashMap pieces = new HashMap();
pieces.put(null, idle);
- pieces.put(plasmaSwitchboardConstants.CRAWLSTACK, new CircleThreadPiece("Stacking", new Color(115, 200, 210)));
+ pieces.put(plasmaSwitchboardConstants.CRAWLSTACK0, new CircleThreadPiece("Stacking0", new Color(115, 200, 210)));
+ pieces.put(plasmaSwitchboardConstants.CRAWLSTACK1, new CircleThreadPiece("Stacking1", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.INDEXER, new CircleThreadPiece("Parsing/Indexing", new Color(255, 130, 0)));
pieces.put(plasmaSwitchboardConstants.INDEX_DIST, new CircleThreadPiece("DHT-Distribution", new Color(119, 136, 153)));
pieces.put(plasmaSwitchboardConstants.PEER_PING, new CircleThreadPiece("YaCy Core", new Color(255, 230, 160)));
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 5af704396..c7b94aa17 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -34,6 +34,7 @@ import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
+import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
@@ -158,15 +159,18 @@ public class QuickCrawlLink_p {
// stack URL
String reasonString = null;
- reasonString = sb.crawlStacker.stackCrawl(
- crawlingStartURL,
- null,
- sb.webIndex.seedDB.mySeed().hash,
- (title==null)?"CRAWLING-ROOT":title,
- new Date(),
- 0,
- pe
- );
+ reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
+ sb.webIndex.seedDB.mySeed().hash,
+ crawlingStartURL,
+ null,
+ (title==null)?"CRAWLING-ROOT":title,
+ new Date(),
+ null,
+ pe.handle(),
+ 0,
+ 0,
+ 0
+ ));
// validate rejection reason
if (reasonString == null) {
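
The old multi-argument stackCrawl()/enqueueEntry() overloads are replaced throughout this change by a single CrawlEntry argument. A minimal construction sketch, reusing the variable names from QuickCrawlLink_p.java above and the ten-argument order of the new constructor in CrawlEntry.java below:

    CrawlEntry entry = new CrawlEntry(
            sb.webIndex.seedDB.mySeed().hash,   // initiator peer hash
            crawlingStartURL,                   // URL to be stacked
            null,                               // referrer hash; null is stored as ""
            "CRAWLING-ROOT",                    // anchor/link name
            new Date(),                         // appdate: when the URL appeared
            null,                               // loaddate: not loaded yet
            pe.handle(),                        // crawl profile handle (must not be null)
            0,                                  // depth
            0,                                  // anchors
            0);                                 // forkfactor
    String reason = sb.crawlStacker.stackCrawl(entry); // null means the URL was accepted
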
diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java
index 831a5a0de..804b9de0c 100644
--- a/htroot/WatchCrawler_p.java
+++ b/htroot/WatchCrawler_p.java
@@ -211,7 +211,18 @@ public class WatchCrawler_p {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
- final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
+ final String reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
+ sb.webIndex.seedDB.mySeed().hash,
+ url,
+ null,
+ "CRAWLING-ROOT",
+ new Date(),
+ null,
+ pe.handle(),
+ 0,
+ 0,
+ 0
+ ));
if (reasonString == null) {
// create a bookmark from crawl start url
@@ -260,6 +271,7 @@ public class WatchCrawler_p {
"",
"",
new Date(),
+ null,
pe.handle(),
0,
0,
@@ -338,14 +350,18 @@ public class WatchCrawler_p {
if (nexturl == null) continue;
// enqueuing the url for crawling
- sb.crawlStacker.enqueueEntry(
+ sb.crawlStacker.enqueueEntry(new CrawlEntry(
+ sb.webIndex.seedDB.mySeed().hash,
nexturl,
"",
- sb.webIndex.seedDB.mySeed().hash,
e.getValue(),
- new Date(),
- 0,
- profile);
+ new Date(),
+ null,
+ profile.handle(),
+ 0,
+ 0,
+ 0
+ ));
}
} catch (final PatternSyntaxException e) {
diff --git a/htroot/rct_p.java b/htroot/rct_p.java
index cb2e2942d..b5b47babb 100644
--- a/htroot/rct_p.java
+++ b/htroot/rct_p.java
@@ -30,6 +30,7 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
+import de.anomic.crawler.CrawlEntry;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
@@ -76,7 +77,18 @@ public class rct_p {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
- sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
+ sb.crawlStacker.enqueueEntry(new CrawlEntry(
+ peerhash,
+ url,
+ (referrer == null) ? null : referrer.hash(),
+ "REMOTE-CRAWLING",
+ null,
+ loaddate,
+ sb.webIndex.defaultRemoteProfile.handle(),
+ 0,
+ 0,
+ 0
+ ));
} else {
env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}
diff --git a/source/de/anomic/crawler/CrawlEntry.java b/source/de/anomic/crawler/CrawlEntry.java
index 24a58c057..08ded65a2 100755
--- a/source/de/anomic/crawler/CrawlEntry.java
+++ b/source/de/anomic/crawler/CrawlEntry.java
@@ -36,10 +36,11 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
-public class CrawlEntry {
+public class CrawlEntry extends serverProcessorJob {
// row definition for balancer-related NURL-entries
public final static kelondroRow rowdef = new kelondroRow(
@@ -80,7 +81,7 @@ public class CrawlEntry {
private int forkfactor; // sum of anchors of all ancestors
private kelondroBitfield flags;
private int handle;
- private String status;
+ private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
public static class domaccess {
@@ -116,38 +117,38 @@ public class CrawlEntry {
* @param forkfactor sum of anchors of all ancestors
*/
public CrawlEntry(
- final String initiator,
- final yacyURL url,
- final String referrerhash,
- final String name,
- final Date appdate,
- final String profileHandle,
- final int depth,
- final int anchors,
- final int forkfactor
+ final String initiator,
+ final yacyURL url,
+ final String referrerhash,
+ final String name,
+ final Date appdate,
+ final Date loaddate,
+ final String profileHandle,
+ final int depth,
+ final int anchors,
+ final int forkfactor
) {
// create new entry and store it into database
- assert appdate != null;
assert url != null;
assert initiator != null;
- assert referrerhash != null;
assert profileHandle.length() == yacySeedDB.commonHashLength : profileHandle + " != " + yacySeedDB.commonHashLength;
this.initiator = initiator;
this.url = url;
- this.refhash = referrerhash;
+ this.refhash = (referrerhash == null) ? "" : referrerhash;
this.name = (name == null) ? "" : name;
this.appdate = (appdate == null) ? 0 : appdate.getTime();
+ this.loaddate = (loaddate == null) ? 0 : loaddate.getTime();
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new kelondroBitfield(rowdef.width(10));
this.handle = 0;
- this.loaddate = 0;
this.serverdate = 0;
this.imsdate = 0;
- this.status = "loaded(args)";
+ this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
+ this.status = serverProcessorJob.STATUS_INITIATED;
}
public CrawlEntry(final kelondroRow.Entry entry) throws IOException {
@@ -172,7 +173,7 @@ public class CrawlEntry {
this.loaddate = entry.getColLong(12);
this.serverdate = entry.getColLong(13);
this.imsdate = entry.getColLong(14);
- this.status = "loaded(kelondroRow.Entry)";
+ this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = url.hashCode();
return;
}
@@ -182,12 +183,13 @@ public class CrawlEntry {
return this.initialHash;
}
- public void setStatus(final String s) {
- this.status = s;
+ public void setStatus(final String s, int code) {
+ this.statusMessage = s;
+ this.status = code;
}
public String getStatus() {
- return this.status;
+ return this.statusMessage;
}
private static String normalizeHandle(final int h) {
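
CrawlEntry now extends serverProcessorJob, so every entry carries two kinds of status: the free-text statusMessage shown on monitoring pages and the inherited numeric status code of the processor-job life cycle; setStatus() sets both at once. A short usage sketch with the constants used elsewhere in this change:

    entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);    // message + life-cycle code
    // ... load and process the entry ...
    entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
    String shown = entry.getStatus();   // returns the textual statusMessage
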
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 9e2e0621e..62a298c8b 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -42,6 +42,7 @@ import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.server.serverDate;
+import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
@@ -397,7 +398,18 @@ public class CrawlQueues {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
- sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
+ sb.crawlStacker.enqueueEntry(new CrawlEntry(
+ hash,
+ url,
+ (referrer == null) ? null : referrer.hash(),
+ item.getDescription(),
+ null,
+ loaddate,
+ sb.webIndex.defaultRemoteProfile.handle(),
+ 0,
+ 0,
+ 0
+ ));
} else {
log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}
@@ -474,6 +486,7 @@ public class CrawlQueues {
"",
"",
new Date(),
+ new Date(),
(forText) ?
((global) ?
sb.webIndex.defaultTextSnippetGlobalProfile.handle() :
@@ -500,7 +513,7 @@ public class CrawlQueues {
public crawlWorker(final CrawlEntry entry) {
this.entry = entry;
- this.entry.setStatus("worker-initialized");
+ this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
if (!workers.containsKey(code)) {
workers.put(code, this);
@@ -511,7 +524,7 @@ public class CrawlQueues {
public void run() {
try {
// checking robots.txt for http(s) resources
- this.entry.setStatus("worker-checkingrobots");
+ this.entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && sb.robots.isDisallowed(entry.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
final ZURL.Entry eentry = errorURL.newEntry(
@@ -524,7 +537,7 @@ public class CrawlQueues {
errorURL.push(eentry);
} else {
// starting a load from the internet
- this.entry.setStatus("worker-loading");
+ this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
final String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
if (result != null) {
final ZURL.Entry eentry = errorURL.newEntry(
@@ -536,7 +549,7 @@ public class CrawlQueues {
eentry.store();
errorURL.push(eentry);
} else {
- this.entry.setStatus("worker-processed");
+ this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
}
}
} catch (final Exception e) {
@@ -551,7 +564,7 @@ public class CrawlQueues {
e.printStackTrace();
} finally {
workers.remove(code);
- this.entry.setStatus("worker-finalized");
+ this.entry.setStatus("worker-finalized", serverProcessorJob.STATUS_FINISHED);
}
}
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 4271b4241..82a13da11 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -28,17 +28,15 @@
package de.anomic.crawler;
-import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
-import java.util.LinkedList;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.index.indexURLReference;
-import de.anomic.kelondro.kelondroIndex;
-import de.anomic.kelondro.kelondroRow;
-import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDomains;
@@ -49,13 +47,11 @@ public final class CrawlStacker {
final serverLog log = new serverLog("STACKCRAWL");
- private final LinkedList urlEntryHashCache; // the order how this queue is processed; entries with known DNS entries go first
- private kelondroIndex urlEntryCache; // the entries in the queue
- private long dnsHit, dnsMiss;
- private int alternateCount;
- private CrawlQueues nextQueue;
- private plasmaWordIndex wordIndex;
- private boolean acceptLocalURLs, acceptGlobalURLs;
+ private BlockingQueue fastQueue, slowQueue;
+ private long dnsHit, dnsMiss;
+ private CrawlQueues nextQueue;
+ private plasmaWordIndex wordIndex;
+ private boolean acceptLocalURLs, acceptGlobalURLs;
// objects for the prefetch task
private final ArrayList dnsfetchHosts = new ArrayList();
@@ -68,26 +64,21 @@ public final class CrawlStacker {
this.wordIndex = wordIndex;
this.dnsHit = 0;
this.dnsMiss = 0;
- this.alternateCount = 0;
this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs;
- // init the message list
- this.urlEntryHashCache = new LinkedList();
-
- this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
+ this.fastQueue = new LinkedBlockingQueue();
+ this.slowQueue = new ArrayBlockingQueue(1000);
this.log.logInfo("STACKCRAWL thread initialized.");
}
public int size() {
- synchronized (this.urlEntryHashCache) {
- return this.urlEntryHashCache.size();
- }
+ return this.fastQueue.size() + this.slowQueue.size();
}
- public void clear() throws IOException {
- this.urlEntryHashCache.clear();
- this.urlEntryCache.clear();
+ public void clear() {
+ this.fastQueue.clear();
+ this.slowQueue.clear();
}
public void close() {
@@ -98,11 +89,7 @@ public final class CrawlStacker {
this.log.logInfo("Shutdown. Closing stackCrawl queue.");
- // closing the db
- this.urlEntryCache.close();
-
- // clearing the hash list
- this.urlEntryHashCache.clear();
+ clear();
}
private boolean prefetchHost(final String host) {
@@ -121,41 +108,17 @@ public final class CrawlStacker {
}
public boolean job() {
+ if (this.fastQueue.size() > 0 && job(this.fastQueue)) return true;
+ if (this.slowQueue.size() == 0) return false;
+ return job(this.slowQueue);
+ }
+
+ private boolean job(BlockingQueue queue) {
// this is the method that is called by the busy thread from outside
- if (this.urlEntryHashCache.size() == 0) return false;
+ if (queue.size() == 0) return false;
// get the next entry from the queue
- String urlHash = null;
- kelondroRow.Entry ec = null;
- synchronized (this.urlEntryHashCache) {
- urlHash = this.urlEntryHashCache.removeFirst();
- if (urlHash == null) {
- urlEntryHashCache.clear();
- try {
- urlEntryCache.clear();
- } catch (IOException e) {
- e.printStackTrace();
- }
- return false;
- }
- try {
- ec = this.urlEntryCache.remove(urlHash.getBytes());
- } catch (IOException e) {
- e.printStackTrace();
- return false;
- }
- }
- if (urlHash == null || ec == null) return false;
-
- // make a crawl Entry out of it
- CrawlEntry entry = null;
- try {
- entry = new CrawlEntry(ec);
- } catch (IOException e1) {
- e1.printStackTrace();
- return false;
- }
-
+ CrawlEntry entry = queue.poll();
if (entry == null) return false;
try {
@@ -173,95 +136,30 @@ public final class CrawlStacker {
}
return true;
}
-
- public String stackCrawl(
- final yacyURL url,
- final String referrerhash,
- final String initiatorHash,
- final String name,
- final Date loadDate,
- final int currentdepth,
- final CrawlProfile.entry profile) {
- // stacks a crawl item. The position can also be remote
- // returns null if successful, a reason string if not successful
- //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-
- // add the url into the crawling queue
- final CrawlEntry entry = new CrawlEntry(
- initiatorHash, // initiator, needed for p2p-feedback
- url, // url clear text string
- (referrerhash == null) ? "" : referrerhash, // last url in crawling queue
- name, // load date
- loadDate, // the anchor name
- (profile == null) ? null : profile.handle(), // profile must not be null!
- currentdepth, // depth so far
- 0, // anchors, default value
- 0 // forkfactor, default value
- );
- return stackCrawl(entry);
- }
-
- public void enqueueEntry(
- final yacyURL nexturl,
- final String referrerhash,
- final String initiatorHash,
- final String name,
- final Date loadDate,
- final int currentdepth,
- final CrawlProfile.entry profile) {
- if (profile == null) return;
-
+
+ public void enqueueEntry(final CrawlEntry entry) {
+
// DEBUG
- if (log.isFinest()) log.logFinest("ENQUEUE "+ nexturl +", referer="+referrerhash +", initiator="+initiatorHash +", name="+name +", load="+loadDate +", depth="+currentdepth);
-
- // check first before we create a big object
- if (this.urlEntryCache.has(nexturl.hash().getBytes())) return;
+ if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth());
- // now create the big object before we enter the synchronized block
- final CrawlEntry newEntry = new CrawlEntry(
- initiatorHash,
- nexturl,
- referrerhash,
- name,
- loadDate,
- profile.handle(),
- currentdepth,
- 0,
- 0
- );
- if (newEntry == null) return;
- final kelondroRow.Entry newEntryRow = newEntry.toRow();
-
- synchronized(this.urlEntryHashCache) {
- kelondroRow.Entry oldValue;
+ if (prefetchHost(entry.url().getHost())) {
try {
- oldValue = this.urlEntryCache.put(newEntryRow);
- } catch (final IOException e) {
- oldValue = null;
- }
- if (oldValue == null) {
- //System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
- if (prefetchHost(nexturl.getHost())) {
- this.alternateCount++;
- this.urlEntryHashCache.addFirst(newEntry.url().hash());
- this.dnsHit++;
- } else {
- if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
- this.urlEntryHashCache.addFirst(newEntry.url().hash());
- this.alternateCount = 0;
- //System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
- } else {
- this.urlEntryHashCache.addLast(newEntry.url().hash());
- }
- this.dnsMiss++;
- }
+ this.fastQueue.put(entry);
+ this.dnsHit++;
+ } catch (InterruptedException e) {
+ e.printStackTrace();
+ }
+ } else {
+ try {
+ this.slowQueue.put(entry);
+ this.dnsMiss++;
+ } catch (InterruptedException e) {
+ e.printStackTrace();
}
}
}
-
-
- private String stackCrawl(final CrawlEntry entry) {
+ public String stackCrawl(final CrawlEntry entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
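
The kelondro-backed pre-NURL cache is replaced by two in-memory queues: entries whose host already has a cached DNS entry (prefetchHost() returns true) go into the unbounded fastQueue, all others into the slowQueue, which is bounded at 1000 entries and therefore blocks producers when full. job() always drains the fast queue first. A stripped-down sketch of the pattern, assuming only java.util.concurrent and the prefetchHost() helper of this class:

    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    final BlockingQueue<CrawlEntry> fastQueue = new LinkedBlockingQueue<CrawlEntry>();
    final BlockingQueue<CrawlEntry> slowQueue = new ArrayBlockingQueue<CrawlEntry>(1000);

    void enqueue(final CrawlEntry entry) throws InterruptedException {
        if (prefetchHost(entry.url().getHost())) {
            fastQueue.put(entry);   // DNS already known: stack as soon as possible
        } else {
            slowQueue.put(entry);   // DNS still unknown: bounded queue limits memory use
        }
    }

    CrawlEntry nextEntry() {
        final CrawlEntry e = fastQueue.poll();       // prefer entries that are ready to go
        return (e != null) ? e : slowQueue.poll();   // may return null if both queues are empty
    }
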
diff --git a/source/de/anomic/crawler/ProtocolLoader.java b/source/de/anomic/crawler/ProtocolLoader.java
index ff142ab1d..2dd643d87 100644
--- a/source/de/anomic/crawler/ProtocolLoader.java
+++ b/source/de/anomic/crawler/ProtocolLoader.java
@@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.index.indexDocumentMetadata;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
+import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
public final class ProtocolLoader {
@@ -111,14 +112,15 @@ public final class ProtocolLoader {
// returns null if everything went fine, a fail reason string if a problem occurred
indexDocumentMetadata h;
try {
+ entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
h = load(entry, parserMode);
assert h != null;
- entry.setStatus("loaded");
+ entry.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.htEntryStoreProcess(h);
- entry.setStatus("stored-" + ((stored) ? "ok" : "fail"));
+ entry.setStatus("stored-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
return (stored) ? null : "not stored";
} catch (IOException e) {
- entry.setStatus("error");
+ entry.setStatus("error", serverProcessorJob.STATUS_FINISHED);
log.logWarning("problem loading " + entry.url().toString());
return "load error - " + e.getMessage();
}
diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java
index ab91f2ec7..250e49ac0 100644
--- a/source/de/anomic/data/SitemapParser.java
+++ b/source/de/anomic/data/SitemapParser.java
@@ -41,7 +41,6 @@ import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.HTTPLoader;
-import de.anomic.crawler.ZURL;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpRequestHeader;
@@ -272,42 +271,20 @@ public class SitemapParser extends DefaultHandler {
}
// URL needs to crawled
- String error = null;
- error = this.sb.crawlStacker.stackCrawl(url,
- null, // this.siteMapURL.toString(),
- this.sb.webIndex.seedDB.mySeed().hash, this.nextURL, new Date(),
- 0, this.crawlingProfile);
-
- if (error != null) {
- try {
- this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error);
-
- // insert URL into the error DB
- final ZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
- new CrawlEntry(
- sb.webIndex.seedDB.mySeed().hash,
- new yacyURL(this.nextURL, null),
- "",
- "",
- new Date(),
- null,
- 0,
- 0,
- 0),
- this.sb.webIndex.seedDB.mySeed().hash,
- new Date(),
- 1,
- error);
- ee.store();
- this.sb.crawlQueues.errorURL.push(ee);
- } catch (final MalformedURLException e) {/* ignore this */
- }
- } else {
- this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
-
- // count successfully added URLs
- this.urlCounter++;
- }
+ this.sb.crawlStacker.enqueueEntry(new CrawlEntry(
+ this.sb.webIndex.seedDB.mySeed().hash,
+ url,
+ null, // this.siteMapURL.toString(),
+ this.nextURL,
+ new Date(),
+ null,
+ this.crawlingProfile.handle(),
+ 0,
+ 0,
+ 0
+ ));
+ this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
+ this.urlCounter++;
}
}
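
The sitemap parser, like bookmarksDB below, now hands its entries to enqueueEntry() instead of calling stackCrawl() directly, so the immediate rejection handling and the error-DB insert disappear from this call site; the double-check happens later, inside CrawlStacker.job(), once the queued entry is actually stacked. The difference between the two calls, sketched with the names used in the removed SitemapParser code:

    // synchronous: the caller learns the rejection reason right away
    final String error = sb.crawlStacker.stackCrawl(entry);
    if (error != null) this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error);

    // asynchronous: the entry is only queued here; CrawlStacker.job()
    // performs the stackCrawl() check in the background
    sb.crawlStacker.enqueueEntry(entry);
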
diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java
index fa021caac..153788b48 100644
--- a/source/de/anomic/data/bookmarksDB.java
+++ b/source/de/anomic/data/bookmarksDB.java
@@ -62,7 +62,6 @@ import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.ZURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.index.indexWord;
@@ -259,49 +258,37 @@ public class bookmarksDB {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
- String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
-
- if (reasonString == null) {
- serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
- // serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
- // generate a YaCyNews if the global flag was set
- if (crawlOrder) {
- Map m = new HashMap(pe.map()); // must be cloned
- m.remove("specificDepth");
- m.remove("indexText");
- m.remove("indexMedia");
- m.remove("remoteIndexing");
- m.remove("xsstopw");
- m.remove("xpstopw");
- m.remove("xdstopw");
- m.remove("storeTXCache");
- m.remove("storeHTCache");
- m.remove("generalFilter");
- m.remove("specificFilter");
- m.put("intention", "Automatic ReCrawl!");
- sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
- }
- } else {
- serverLog.logInfo("BOOKMARKS", "autoReCrawl - error adding crawl profile: " + crawlingStart + "- " + reasonString);
- ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
- new CrawlEntry(
- sb.webIndex.seedDB.mySeed().hash,
- crawlingStartURL,
- "",
- "",
- new Date(),
- pe.handle(),
- 0,
- 0,
- 0),
- sb.webIndex.seedDB.mySeed().hash,
- new Date(),
- 1,
- reasonString);
-
- ee.store();
- sb.crawlQueues.errorURL.push(ee);
- }
+ sb.crawlStacker.enqueueEntry(new CrawlEntry(
+ sb.webIndex.seedDB.mySeed().hash,
+ crawlingStartURL,
+ null,
+ "CRAWLING-ROOT",
+ new Date(),
+ null,
+ pe.handle(),
+ 0,
+ 0,
+ 0
+ ));
+ serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
+ // serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
+ // generate a YaCyNews if the global flag was set
+ if (crawlOrder) {
+ Map m = new HashMap(pe.map()); // must be cloned
+ m.remove("specificDepth");
+ m.remove("indexText");
+ m.remove("indexMedia");
+ m.remove("remoteIndexing");
+ m.remove("xsstopw");
+ m.remove("xpstopw");
+ m.remove("xdstopw");
+ m.remove("storeTXCache");
+ m.remove("storeHTCache");
+ m.remove("generalFilter");
+ m.remove("specificFilter");
+ m.put("intention", "Automatic ReCrawl!");
+ sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
+ }
} catch (MalformedURLException e1) {}
} // if
} // while(bit.hasNext())
diff --git a/source/de/anomic/index/indexURLReference.java b/source/de/anomic/index/indexURLReference.java
index 7e1c742d4..f767637c0 100644
--- a/source/de/anomic/index/indexURLReference.java
+++ b/source/de/anomic/index/indexURLReference.java
@@ -462,6 +462,7 @@ public class indexURLReference {
comp().url(),
referrerHash(),
comp().dc_title(),
+ null,
loaddate(),
null,
0,
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index 4fea492a4..d77218ac9 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -578,8 +578,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch
      * public static final String CRAWLSTACK = "82_crawlstack"
* Name of the crawl stacker thread, performing several checks on new URLs to crawl, i.e. double-check
*/
- public static final String CRAWLSTACK = "82_crawlstack";
+ public static final String CRAWLSTACK0 = "82_crawlstack";
+ public static final String CRAWLSTACK1 = "83_crawlstack";
public static final String CRAWLSTACK_METHOD_START = "job";
public static final String CRAWLSTACK_METHOD_JOBCOUNT = "size";
public static final String CRAWLSTACK_METHOD_FREEMEM = null;
diff --git a/source/de/anomic/server/serverDomains.java b/source/de/anomic/server/serverDomains.java
index 1bdea4a77..3e3ee8f80 100644
--- a/source/de/anomic/server/serverDomains.java
+++ b/source/de/anomic/server/serverDomains.java
@@ -389,6 +389,7 @@ public class serverDomains {
public static final int TLD_NorthAmericaOceania_ID = 4; // english-speaking countries
public static final int TLD_Africa_ID = 5; // africa
public static final int TLD_Generic_ID = 6; // anything else, also raw ip numbers
+ public static final int TLD_Local_ID = 7; // a local address
static {
// assign TLD-ids and names
@@ -552,7 +553,7 @@ public class serverDomains {
}
final Integer i = TLDID.get(tld);
if (i == null) {
- return (isLocal(host)) ? 7 : TLD_Generic_ID;
+ return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID;
}
return i.intValue();
}
diff --git a/source/de/anomic/server/serverInstantBlockingThread.java b/source/de/anomic/server/serverInstantBlockingThread.java
index c55506af8..a143012e1 100644
--- a/source/de/anomic/server/serverInstantBlockingThread.java
+++ b/source/de/anomic/server/serverInstantBlockingThread.java
@@ -76,7 +76,7 @@ public class serverInstantBlockingThread extends s
@SuppressWarnings("unchecked")
public J job(final J next) throws Exception {
- if (next == null) return null; // poison pill: shutdown
+ if (next == null || next == serverProcessorJob.poisonPill) return null; // poison pill: shutdown
instantThreadCounter++;
//System.out.println("started job " + this.handle + ": " + this.getName());
jobs.put(this.handle, this.getName());
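
The blocking thread now treats the shared serverProcessorJob.poisonPill object, and not only null, as its shutdown marker. A minimal sketch of the poison-pill idiom this relies on, assuming the pill is a shared serverProcessorJob instance, a java.util.concurrent.BlockingQueue feeds the worker, and process() stands in for the real job method:

    void workerLoop(final BlockingQueue<serverProcessorJob> in) throws InterruptedException {
        while (true) {
            final serverProcessorJob next = in.take();
            if (next == null || next == serverProcessorJob.poisonPill) break;  // shutdown requested
            process(next);  // hypothetical placeholder for the real job
        }
    }
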
diff --git a/source/de/anomic/server/serverProcessorJob.java b/source/de/anomic/server/serverProcessorJob.java
index b8bd9ae0f..c91a88033 100644
--- a/source/de/anomic/server/serverProcessorJob.java
+++ b/source/de/anomic/server/serverProcessorJob.java
@@ -32,7 +32,7 @@ public class serverProcessorJob {
public final static int STATUS_FINISHED = 3;
public final static int STATUS_POISON = 99;
- public int status = 0;
+ public int status = STATUS_INITIATED;
public serverProcessorJob() {
this.status = STATUS_INITIATED;
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index ce781ae14..8e40d7778 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -7,6 +7,7 @@ import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Date;
+import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.userDB;
import de.anomic.http.HttpClient;
@@ -195,15 +196,18 @@ public class urlRedirectord implements serverHandler, Cloneable {
sb.crawlQueues.errorURL.remove(urlhash);
// enqueuing URL for crawling
- sb.crawlStacker.enqueueEntry(
+ sb.crawlStacker.enqueueEntry(new CrawlEntry(
+ sb.webIndex.seedDB.mySeed().hash,
reqURL,
null,
- sb.webIndex.seedDB.mySeed().hash,
"URL Redirector",
new Date(),
+ null,
+ profile.handle(),
0,
- profile
- );
+ 0,
+ 0
+ ));
} else {
reasonString = "Unsupporte file extension";
}