- refactoring of CrawlEntry and CrawlStacker

- introduced blocking queues in CrawlStacker to make it ready for concurrency
- added a second busy thread for the CrawlStacker
The CrawlStacker is now multithreaded; it will be transformed into a BlockingThread in a later step.
The added concurrency should mitigate the cases where a blocking DNS lookup stalls the whole stacker (a minimal sketch of the queueing idea follows the commit metadata below).

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5395 6c8d7289-2bf4-0310-a012-ef5d649a1542
author orbiter
parent 6569cbbec1
commit 7535fd7447
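
The core of this change shows up in the CrawlStacker hunks below: the kelondro-backed URL hash list is replaced by two java.util.concurrent blocking queues, and entries are routed by whether their host already has a DNS-cache hit, so a URL that still needs a blocking lookup cannot stall URLs that are already resolvable. A minimal, self-contained sketch of that pattern (hypothetical class, field and method names; not the actual YaCy API) might look like this:

import java.net.InetAddress;
import java.net.UnknownHostException;
import java.util.Collections;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;

// Hypothetical sketch, not the actual YaCy classes: entries whose host is already
// in a local DNS cache go into an unbounded "fast" queue, everything else into a
// bounded "slow" queue. Each busy thread drains the fast queue first, so a URL
// that still needs a (possibly blocking) DNS lookup cannot hold up resolvable URLs.
public class TwoQueueStackerSketch {

    static final class Entry {
        final String host;
        final String url;
        Entry(final String host, final String url) { this.host = host; this.url = url; }
    }

    private final BlockingQueue<Entry> fastQueue = new LinkedBlockingQueue<Entry>();
    private final BlockingQueue<Entry> slowQueue = new ArrayBlockingQueue<Entry>(1000);
    private final Set<String> dnsCache =
            Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());

    // producer side: route by DNS-cache hit, mirroring enqueueEntry()/prefetchHost()
    public void enqueueEntry(final Entry entry) throws InterruptedException {
        if (dnsCache.contains(entry.host)) this.fastQueue.put(entry);
        else this.slowQueue.put(entry);
    }

    // consumer side: called by each busy thread; returns true if work was done
    public boolean job() {
        Entry entry = this.fastQueue.poll();              // resolvable hosts first
        if (entry == null) entry = this.slowQueue.poll(); // then the ones that may block
        if (entry == null) return false;
        try {
            InetAddress.getByName(entry.host);            // may block only for slow-queue entries
            dnsCache.add(entry.host);
            System.out.println("stacking " + entry.url);  // real code would run the stackCrawl() checks here
        } catch (final UnknownHostException e) {
            System.out.println("rejecting " + entry.url + ": unknown host");
        }
        return true;
    }

    public int size() {
        return this.fastQueue.size() + this.slowQueue.size();
    }
}

With two busy threads calling job() (see the CRAWLSTACK0/CRAWLSTACK1 constants and the second deployThread call further down), one thread can sit in a slow DNS resolution while the other keeps draining the fast queue.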

@ -568,9 +568,12 @@ performanceSpeed=100
80_indexing_idlesleep=1000
80_indexing_busysleep=10
80_indexing_memprereq=6291456
82_crawlstack_idlesleep=5000
82_crawlstack_idlesleep=1000
82_crawlstack_busysleep=0
82_crawlstack_memprereq=1048576
83_crawlstack_idlesleep=1200
83_crawlstack_busysleep=0
83_crawlstack_memprereq=1048576
90_cleanup_idlesleep=300000
90_cleanup_busysleep=300000
90_cleanup_memprereq=0
@ -818,12 +821,6 @@ svnRevision=0
currentSkin=default
# temporary flag for new database structure. set only true for testing
# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
# table-types: RAM = 0, TREE = 1, FLEX = 2;
# if you set this to a non-RAM value, you should increase the stacker.slots value
tableTypeForPreNURL=0
# flag to show if pages shall be usable for non-admin users
# this can be applied to the Surftips.html and yacysearch.html page
publicSurftips = true

@ -109,11 +109,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("deletecomplete") && post.containsKey("confirmDelete")) {
sb.webIndex.clear();
sb.crawlQueues.clear();
try {
sb.crawlStacker.clear();
} catch (final IOException e) {
e.printStackTrace();
}
try {
sb.robots.clear();
} catch (final IOException e) {

@ -29,7 +29,8 @@ public class PeerLoadPicture {
final CircleThreadPiece misc = new CircleThreadPiece("Misc.", new Color(190, 50, 180));
final HashMap<String, CircleThreadPiece> pieces = new HashMap<String, CircleThreadPiece>();
pieces.put(null, idle);
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK, new CircleThreadPiece("Stacking", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK0, new CircleThreadPiece("Stacking0", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK1, new CircleThreadPiece("Stacking1", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.INDEXER, new CircleThreadPiece("Parsing/Indexing", new Color(255, 130, 0)));
pieces.put(plasmaSwitchboardConstants.INDEX_DIST, new CircleThreadPiece("DHT-Distribution", new Color(119, 136, 153)));
pieces.put(plasmaSwitchboardConstants.PEER_PING, new CircleThreadPiece("YaCy Core", new Color(255, 230, 160)));

@ -34,6 +34,7 @@ import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
@ -158,15 +159,18 @@ public class QuickCrawlLink_p {
// stack URL
String reasonString = null;
reasonString = sb.crawlStacker.stackCrawl(
reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
null,
sb.webIndex.seedDB.mySeed().hash,
(title==null)?"CRAWLING-ROOT":title,
new Date(),
null,
pe.handle(),
0,
pe
);
0,
0
));
// validate rejection reason
if (reasonString == null) {

@ -211,7 +211,18 @@ public class WatchCrawler_p {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
final String reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
url,
null,
"CRAWLING-ROOT",
new Date(),
null,
pe.handle(),
0,
0,
0
));
if (reasonString == null) {
// create a bookmark from crawl start url
@ -260,6 +271,7 @@ public class WatchCrawler_p {
"",
"",
new Date(),
null,
pe.handle(),
0,
0,
@ -338,14 +350,18 @@ public class WatchCrawler_p {
if (nexturl == null) continue;
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
nexturl,
"",
sb.webIndex.seedDB.mySeed().hash,
e.getValue(),
new Date(),
null,
profile.handle(),
0,
0,
profile);
0
));
}
} catch (final PatternSyntaxException e) {

@ -30,6 +30,7 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import de.anomic.crawler.CrawlEntry;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
@ -76,7 +77,18 @@ public class rct_p {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
sb.crawlStacker.enqueueEntry(new CrawlEntry(
peerhash,
url,
(referrer == null) ? null : referrer.hash(),
"REMOTE-CRAWLING",
null,
loaddate,
sb.webIndex.defaultRemoteProfile.handle(),
0,
0,
0
));
} else {
env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}

@ -36,10 +36,11 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public class CrawlEntry {
public class CrawlEntry extends serverProcessorJob {
// row definition for balancer-related NURL-entries
public final static kelondroRow rowdef = new kelondroRow(
@ -80,7 +81,7 @@ public class CrawlEntry {
private int forkfactor; // sum of anchors of all ancestors
private kelondroBitfield flags;
private int handle;
private String status;
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
public static class domaccess {
@ -121,33 +122,33 @@ public class CrawlEntry {
final String referrerhash,
final String name,
final Date appdate,
final Date loaddate,
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor
) {
// create new entry and store it into database
assert appdate != null;
assert url != null;
assert initiator != null;
assert referrerhash != null;
assert profileHandle.length() == yacySeedDB.commonHashLength : profileHandle + " != " + yacySeedDB.commonHashLength;
this.initiator = initiator;
this.url = url;
this.refhash = referrerhash;
this.refhash = (referrerhash == null) ? "" : referrerhash;
this.name = (name == null) ? "" : name;
this.appdate = (appdate == null) ? 0 : appdate.getTime();
this.loaddate = (loaddate == null) ? 0 : loaddate.getTime();
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new kelondroBitfield(rowdef.width(10));
this.handle = 0;
this.loaddate = 0;
this.serverdate = 0;
this.imsdate = 0;
this.status = "loaded(args)";
this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
this.status = serverProcessorJob.STATUS_INITIATED;
}
public CrawlEntry(final kelondroRow.Entry entry) throws IOException {
@ -172,7 +173,7 @@ public class CrawlEntry {
this.loaddate = entry.getColLong(12);
this.serverdate = entry.getColLong(13);
this.imsdate = entry.getColLong(14);
this.status = "loaded(kelondroRow.Entry)";
this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = url.hashCode();
return;
}
@ -182,12 +183,13 @@ public class CrawlEntry {
return this.initialHash;
}
public void setStatus(final String s) {
this.status = s;
public void setStatus(final String s, int code) {
this.statusMessage = s;
this.status = code;
}
public String getStatus() {
return this.status;
return this.statusMessage;
}
private static String normalizeHandle(final int h) {

@ -42,6 +42,7 @@ import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.server.serverDate;
import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
@ -397,7 +398,18 @@ public class CrawlQueues {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
sb.crawlStacker.enqueueEntry(new CrawlEntry(
hash,
url,
(referrer == null) ? null : referrer.hash(),
item.getDescription(),
null,
loaddate,
sb.webIndex.defaultRemoteProfile.handle(),
0,
0,
0
));
} else {
log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}
@ -474,6 +486,7 @@ public class CrawlQueues {
"",
"",
new Date(),
new Date(),
(forText) ?
((global) ?
sb.webIndex.defaultTextSnippetGlobalProfile.handle() :
@ -500,7 +513,7 @@ public class CrawlQueues {
public crawlWorker(final CrawlEntry entry) {
this.entry = entry;
this.entry.setStatus("worker-initialized");
this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
if (!workers.containsKey(code)) {
workers.put(code, this);
@ -511,7 +524,7 @@ public class CrawlQueues {
public void run() {
try {
// checking robots.txt for http(s) resources
this.entry.setStatus("worker-checkingrobots");
this.entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && sb.robots.isDisallowed(entry.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
final ZURL.Entry eentry = errorURL.newEntry(
@ -524,7 +537,7 @@ public class CrawlQueues {
errorURL.push(eentry);
} else {
// starting a load from the internet
this.entry.setStatus("worker-loading");
this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
final String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
if (result != null) {
final ZURL.Entry eentry = errorURL.newEntry(
@ -536,7 +549,7 @@ public class CrawlQueues {
eentry.store();
errorURL.push(eentry);
} else {
this.entry.setStatus("worker-processed");
this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
}
}
} catch (final Exception e) {
@ -551,7 +564,7 @@ public class CrawlQueues {
e.printStackTrace();
} finally {
workers.remove(code);
this.entry.setStatus("worker-finalized");
this.entry.setStatus("worker-finalized", serverProcessorJob.STATUS_FINISHED);
}
}

@ -28,17 +28,15 @@
package de.anomic.crawler;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.index.indexURLReference;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDomains;
@ -49,10 +47,8 @@ public final class CrawlStacker {
final serverLog log = new serverLog("STACKCRAWL");
private final LinkedList<String> urlEntryHashCache; // the order how this queue is processed; entries with known DNS entries go first
private kelondroIndex urlEntryCache; // the entries in the queue
private BlockingQueue<CrawlEntry> fastQueue, slowQueue;
private long dnsHit, dnsMiss;
private int alternateCount;
private CrawlQueues nextQueue;
private plasmaWordIndex wordIndex;
private boolean acceptLocalURLs, acceptGlobalURLs;
@ -68,26 +64,21 @@ public final class CrawlStacker {
this.wordIndex = wordIndex;
this.dnsHit = 0;
this.dnsMiss = 0;
this.alternateCount = 0;
this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs;
// init the message list
this.urlEntryHashCache = new LinkedList<String>();
this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
this.fastQueue = new LinkedBlockingQueue<CrawlEntry>();
this.slowQueue = new ArrayBlockingQueue<CrawlEntry>(1000);
this.log.logInfo("STACKCRAWL thread initialized.");
}
public int size() {
synchronized (this.urlEntryHashCache) {
return this.urlEntryHashCache.size();
}
return this.fastQueue.size() + this.slowQueue.size();
}
public void clear() throws IOException {
this.urlEntryHashCache.clear();
this.urlEntryCache.clear();
public void clear() {
this.fastQueue.clear();
this.slowQueue.clear();
}
public void close() {
@ -98,11 +89,7 @@ public final class CrawlStacker {
this.log.logInfo("Shutdown. Closing stackCrawl queue.");
// closing the db
this.urlEntryCache.close();
// clearing the hash list
this.urlEntryHashCache.clear();
clear();
}
private boolean prefetchHost(final String host) {
@ -121,41 +108,17 @@ public final class CrawlStacker {
}
public boolean job() {
// this is the method that is called by the busy thread from outside
if (this.urlEntryHashCache.size() == 0) return false;
// get the next entry from the queue
String urlHash = null;
kelondroRow.Entry ec = null;
synchronized (this.urlEntryHashCache) {
urlHash = this.urlEntryHashCache.removeFirst();
if (urlHash == null) {
urlEntryHashCache.clear();
try {
urlEntryCache.clear();
} catch (IOException e) {
e.printStackTrace();
if (this.fastQueue.size() > 0 && job(this.fastQueue)) return true;
if (this.slowQueue.size() == 0) return false;
return job(this.slowQueue);
}
return false;
}
try {
ec = this.urlEntryCache.remove(urlHash.getBytes());
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
if (urlHash == null || ec == null) return false;
// make a crawl Entry out of it
CrawlEntry entry = null;
try {
entry = new CrawlEntry(ec);
} catch (IOException e1) {
e1.printStackTrace();
return false;
}
private boolean job(BlockingQueue<CrawlEntry> queue) {
// this is the method that is called by the busy thread from outside
if (queue.size() == 0) return false;
// get the next entry from the queue
CrawlEntry entry = queue.poll();
if (entry == null) return false;
try {
@ -174,94 +137,29 @@ public final class CrawlStacker {
return true;
}
public String stackCrawl(
final yacyURL url,
final String referrerhash,
final String initiatorHash,
final String name,
final Date loadDate,
final int currentdepth,
final CrawlProfile.entry profile) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
// add the url into the crawling queue
final CrawlEntry entry = new CrawlEntry(
initiatorHash, // initiator, needed for p2p-feedback
url, // url clear text string
(referrerhash == null) ? "" : referrerhash, // last url in crawling queue
name, // load date
loadDate, // the anchor name
(profile == null) ? null : profile.handle(), // profile must not be null!
currentdepth, // depth so far
0, // anchors, default value
0 // forkfactor, default value
);
return stackCrawl(entry);
}
public void enqueueEntry(
final yacyURL nexturl,
final String referrerhash,
final String initiatorHash,
final String name,
final Date loadDate,
final int currentdepth,
final CrawlProfile.entry profile) {
if (profile == null) return;
public void enqueueEntry(final CrawlEntry entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE "+ nexturl +", referer="+referrerhash +", initiator="+initiatorHash +", name="+name +", load="+loadDate +", depth="+currentdepth);
// check first before we create a big object
if (this.urlEntryCache.has(nexturl.hash().getBytes())) return;
// now create the big object before we enter the synchronized block
final CrawlEntry newEntry = new CrawlEntry(
initiatorHash,
nexturl,
referrerhash,
name,
loadDate,
profile.handle(),
currentdepth,
0,
0
);
if (newEntry == null) return;
final kelondroRow.Entry newEntryRow = newEntry.toRow();
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue;
if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth());
if (prefetchHost(entry.url().getHost())) {
try {
oldValue = this.urlEntryCache.put(newEntryRow);
} catch (final IOException e) {
oldValue = null;
}
if (oldValue == null) {
//System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
if (prefetchHost(nexturl.getHost())) {
this.alternateCount++;
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.fastQueue.put(entry);
this.dnsHit++;
} else {
if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.alternateCount = 0;
//System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
} else {
this.urlEntryHashCache.addLast(newEntry.url().hash());
} catch (InterruptedException e) {
e.printStackTrace();
}
} else {
try {
this.slowQueue.put(entry);
this.dnsMiss++;
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
private String stackCrawl(final CrawlEntry entry) {
public String stackCrawl(final CrawlEntry entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.index.indexDocumentMetadata;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
public final class ProtocolLoader {
@ -111,14 +112,15 @@ public final class ProtocolLoader {
// returns null if everything went fine, a fail reason string if a problem occurred
indexDocumentMetadata h;
try {
entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
h = load(entry, parserMode);
assert h != null;
entry.setStatus("loaded");
entry.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.htEntryStoreProcess(h);
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"));
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
return (stored) ? null : "not stored";
} catch (IOException e) {
entry.setStatus("error");
entry.setStatus("error", serverProcessorJob.STATUS_FINISHED);
log.logWarning("problem loading " + entry.url().toString());
return "load error - " + e.getMessage();
}

@ -41,7 +41,6 @@ import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.ZURL;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpRequestHeader;
@ -272,44 +271,22 @@ public class SitemapParser extends DefaultHandler {
}
// URL needs to crawled
String error = null;
error = this.sb.crawlStacker.stackCrawl(url,
this.sb.crawlStacker.enqueueEntry(new CrawlEntry(
this.sb.webIndex.seedDB.mySeed().hash,
url,
null, // this.siteMapURL.toString(),
this.sb.webIndex.seedDB.mySeed().hash, this.nextURL, new Date(),
0, this.crawlingProfile);
if (error != null) {
try {
this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error);
// insert URL into the error DB
final ZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
new yacyURL(this.nextURL, null),
"",
"",
this.nextURL,
new Date(),
null,
this.crawlingProfile.handle(),
0,
0,
0),
this.sb.webIndex.seedDB.mySeed().hash,
new Date(),
1,
error);
ee.store();
this.sb.crawlQueues.errorURL.push(ee);
} catch (final MalformedURLException e) {/* ignore this */
}
} else {
0
));
this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
// count successfully added URLs
this.urlCounter++;
}
}
}
public void characters(final char[] buf, final int offset, final int len) throws SAXException {
if (this.currentElement.equalsIgnoreCase(SITEMAP_URL_LOC)) {

@ -62,7 +62,6 @@ import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.ZURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.index.indexWord;
@ -259,9 +258,18 @@ public class bookmarksDB {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
null,
"CRAWLING-ROOT",
new Date(),
null,
pe.handle(),
0,
0,
0
));
serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
// serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
// generate a YaCyNews if the global flag was set
@ -281,27 +289,6 @@ public class bookmarksDB {
m.put("intention", "Automatic ReCrawl!");
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}
} else {
serverLog.logInfo("BOOKMARKS", "autoReCrawl - error adding crawl profile: " + crawlingStart + "- " + reasonString);
ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
"",
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.webIndex.seedDB.mySeed().hash,
new Date(),
1,
reasonString);
ee.store();
sb.crawlQueues.errorURL.push(ee);
}
} catch (MalformedURLException e1) {}
} // if
} // while(bit.hasNext())

@ -462,6 +462,7 @@ public class indexURLReference {
comp().url(),
referrerHash(),
comp().dc_title(),
null,
loaddate(),
null,
0,

@ -578,7 +578,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
deployThread(plasmaSwitchboardConstants.CLEANUP, "Cleanup", "simple cleaning process for monitoring information", null,
new serverInstantBusyThread(this, plasmaSwitchboardConstants.CLEANUP_METHOD_START, plasmaSwitchboardConstants.CLEANUP_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CLEANUP_METHOD_FREEMEM), 600000); // all 5 Minutes, wait 10 minutes until first run
deployThread(plasmaSwitchboardConstants.CRAWLSTACK, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
deployThread(plasmaSwitchboardConstants.CRAWLSTACK0, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
new serverInstantBusyThread(crawlStacker, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_START, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_FREEMEM), 8000);
deployThread(plasmaSwitchboardConstants.CRAWLSTACK1, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
new serverInstantBusyThread(crawlStacker, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_START, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_FREEMEM), 8000);
deployThread(plasmaSwitchboardConstants.INDEXER, "Indexing", "thread that either initiates a parsing/indexing queue, distributes the index into the DHT, stores parsed documents or flushes the index cache", "/IndexCreateIndexingQueue_p.html",
new serverInstantBusyThread(this, plasmaSwitchboardConstants.INDEXER_METHOD_START, plasmaSwitchboardConstants.INDEXER_METHOD_JOBCOUNT, plasmaSwitchboardConstants.INDEXER_METHOD_FREEMEM), 10000);
@ -716,6 +718,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
synchronized (this.webIndex) {
this.webIndex.close();
}
// TODO: restart CrawlStacker
setConfig("network.unit.definition", networkDefinition);
overwriteNetworkDefinition();
final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
@ -1557,7 +1560,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
nextEntry = i.next();
nextUrl = nextEntry.getKey();
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), entry.initiator(), nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
crawlStacker.enqueueEntry(new CrawlEntry(
entry.initiator(),
nextUrl,
entry.urlHash(),
nextEntry.getValue(),
null,
docDate,
entry.profile().handle(),
entry.depth() + 1,
0,
0
));
}
final long stackEndTime = System.currentTimeMillis();
if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
@ -2051,6 +2065,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
(name == null) ? "" : name,
new Date(),
null,
null,
0,
0,
0);

@ -140,7 +140,8 @@ public final class plasmaSwitchboardConstants {
* <p><code>public static final String <strong>CRAWLSTACK</strong> = "82_crawlstack"</code></p>
* <p>Name of the crawl stacker thread, performing several checks on new URLs to crawl, i.e. double-check</p>
*/
public static final String CRAWLSTACK = "82_crawlstack";
public static final String CRAWLSTACK0 = "82_crawlstack";
public static final String CRAWLSTACK1 = "83_crawlstack";
public static final String CRAWLSTACK_METHOD_START = "job";
public static final String CRAWLSTACK_METHOD_JOBCOUNT = "size";
public static final String CRAWLSTACK_METHOD_FREEMEM = null;

@ -389,6 +389,7 @@ public class serverDomains {
public static final int TLD_NorthAmericaOceania_ID = 4; // english-speaking countries
public static final int TLD_Africa_ID = 5; // africa
public static final int TLD_Generic_ID = 6; // anything else, also raw ip numbers
public static final int TLD_Local_ID = 7; // a local address
static {
// assign TLD-ids and names
@ -552,7 +553,7 @@ public class serverDomains {
}
final Integer i = TLDID.get(tld);
if (i == null) {
return (isLocal(host)) ? 7 : TLD_Generic_ID;
return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID;
}
return i.intValue();
}

@ -76,7 +76,7 @@ public class serverInstantBlockingThread<J extends serverProcessorJob> extends s
@SuppressWarnings("unchecked")
public J job(final J next) throws Exception {
if (next == null) return null; // poison pill: shutdown
if (next == null || next == serverProcessorJob.poisonPill) return null; // poison pill: shutdown
instantThreadCounter++;
//System.out.println("started job " + this.handle + ": " + this.getName());
jobs.put(this.handle, this.getName());
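
The serverInstantBlockingThread hunk above adds a dedicated poison-pill object (serverProcessorJob.poisonPill) as a shutdown signal next to the existing null check. A sentinel element is useful with blocking queues because it wakes a worker that is parked in take() without having to interrupt it. A minimal sketch of the pattern, using a hypothetical Job class rather than the real serverProcessorJob:

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Minimal sketch of the poison-pill shutdown pattern (hypothetical Job class,
// not the actual serverProcessorJob/serverInstantBlockingThread API).
public class PoisonPillSketch {

    static class Job {
        static final Job poisonPill = new Job(null);
        final String payload;
        Job(final String payload) { this.payload = payload; }
    }

    public static void main(final String[] args) throws InterruptedException {
        final BlockingQueue<Job> queue = new LinkedBlockingQueue<Job>();

        final Thread worker = new Thread(() -> {
            try {
                while (true) {
                    final Job next = queue.take();
                    if (next == Job.poisonPill) return; // shutdown signal, stop the worker
                    System.out.println("processing " + next.payload);
                }
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        });
        worker.start();

        queue.put(new Job("entry-1"));
        queue.put(new Job("entry-2"));
        queue.put(Job.poisonPill); // wakes the worker even when it is blocked in take()
        worker.join();
    }
}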

@ -32,7 +32,7 @@ public class serverProcessorJob {
public final static int STATUS_FINISHED = 3;
public final static int STATUS_POISON = 99;
public int status = 0;
public int status = STATUS_INITIATED;
public serverProcessorJob() {
this.status = STATUS_INITIATED;
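
Together with the CrawlEntry hunk further up (setStatus(String, int), statusMessage), jobs now carry both a human-readable status message and a numeric state code for the thread framework. A small sketch of that life cycle follows; note that only STATUS_FINISHED = 3 and STATUS_POISON = 99 appear in the hunk above, so the other constant values here are assumptions:

// Hedged sketch of the message/code status pair introduced in this commit
// (constant names from serverProcessorJob; INITIATED/STARTED/RUNNING values assumed).
public class StatusSketch {

    public static final int STATUS_INITIATED = 0; // assumed value
    public static final int STATUS_STARTED   = 1; // assumed value
    public static final int STATUS_RUNNING   = 2; // assumed value
    public static final int STATUS_FINISHED  = 3;

    private String statusMessage = "loaded(args)";
    private int status = STATUS_INITIATED;

    // mirrors CrawlEntry.setStatus(String, int): message for display, code for the thread framework
    public void setStatus(final String message, final int code) {
        this.statusMessage = message;
        this.status = code;
    }

    public static void main(final String[] args) {
        final StatusSketch job = new StatusSketch();
        job.setStatus("worker-checkingrobots", STATUS_STARTED);
        job.setStatus("worker-loading", STATUS_RUNNING);
        job.setStatus("worker-processed", STATUS_FINISHED);
        System.out.println(job.statusMessage + " / " + job.status);
    }
}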

@ -7,6 +7,7 @@ import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.userDB;
import de.anomic.http.HttpClient;
@ -195,15 +196,18 @@ public class urlRedirectord implements serverHandler, Cloneable {
sb.crawlQueues.errorURL.remove(urlhash);
// enqueuing URL for crawling
sb.crawlStacker.enqueueEntry(
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
reqURL,
null,
sb.webIndex.seedDB.mySeed().hash,
"URL Redirector",
new Date(),
null,
profile.handle(),
0,
profile
);
0,
0
));
} else {
reasonString = "Unsupporte file extension";
}
