- refactoring of CrawlEntry and CrawlStacker

- introduced blocking queues in CrawlStacker to prepare it for concurrent use
- added a second busy thread for the CrawlStacker

The CrawlStacker is now multithreaded; it shall be transformed into a BlockingThread in a later step.
The added concurrency of the stacker should help in cases where DNS lookups block.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5395 6c8d7289-2bf4-0310-a012-ef5d649a1542
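The diff below removes the old hash-list/row-cache stacking and replaces it with two java.util.concurrent queues: entries whose host already resolved go into an unbounded fast queue, everything else into a bounded slow queue, and the worker drains the fast queue first. The following is a minimal standalone sketch of that dispatch idea only; class and method names are simplified assumptions, not the YaCy API.

```java
import java.net.InetAddress;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Sketch of the fast/slow dispatch used by the new CrawlStacker: DNS hits are
// stacked quickly, DNS misses are deferred so a blocking lookup cannot stall
// the whole stacker.
public class TwoQueueStackerSketch {
    private final BlockingQueue<String> fastQueue = new LinkedBlockingQueue<String>();
    private final BlockingQueue<String> slowQueue = new ArrayBlockingQueue<String>(1000);

    public void enqueue(String host) throws InterruptedException {
        if (resolves(host)) fastQueue.put(host);   // DNS hit: process soon
        else slowQueue.put(host);                  // DNS miss: may block later, defer
    }

    public boolean job() {                         // called by a busy thread
        String next = fastQueue.poll();
        if (next == null) next = slowQueue.poll();
        if (next == null) return false;            // nothing to do: thread goes idle
        process(next);
        return true;
    }

    private boolean resolves(String host) {
        try { InetAddress.getByName(host); return true; }
        catch (Exception e) { return false; }
    }

    private void process(String host) { /* robots.txt check, double-occurrence check, ... */ }
}
```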
orbiter 16 years ago
parent 6569cbbec1
commit 7535fd7447

@ -568,9 +568,12 @@ performanceSpeed=100
80_indexing_idlesleep=1000
80_indexing_busysleep=10
80_indexing_memprereq=6291456
82_crawlstack_idlesleep=5000
82_crawlstack_idlesleep=1000
82_crawlstack_busysleep=0
82_crawlstack_memprereq=1048576
83_crawlstack_idlesleep=1200
83_crawlstack_busysleep=0
83_crawlstack_memprereq=1048576
90_cleanup_idlesleep=300000
90_cleanup_busysleep=300000
90_cleanup_memprereq=0
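The config change above lowers the idle sleep of the existing 82_crawlstack thread from 5000 ms to 1000 ms and adds a second 83_crawlstack thread. The sketch below shows how these idlesleep/busysleep/memprereq values are assumed to drive each stacker thread; it is a conceptual illustration only, not the actual serverInstantBusyThread implementation.

```java
// Conceptual sketch (assumed semantics): a busy thread runs its job while memory
// allows, sleeping busysleep after productive runs and idlesleep otherwise.
public final class BusyLoopSketch {
    interface Job { boolean run(); }                         // e.g. CrawlStacker.job()

    static void loop(Job job, long idlesleep, long busysleep, long memprereq)
            throws InterruptedException {
        while (!Thread.currentThread().isInterrupted()) {
            if (Runtime.getRuntime().freeMemory() < memprereq) {
                Thread.sleep(idlesleep);                     // not enough free memory: wait
                continue;
            }
            boolean busy = job.run();                        // true = an entry was processed
            Thread.sleep(busy ? busysleep : idlesleep);      // stackers: busysleep=0, idlesleep=1000/1200 ms
        }
    }
}
```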
@ -818,12 +821,6 @@ svnRevision=0
currentSkin=default
# temporary flag for new database structure. set only true for testing
# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
# table-types: RAM = 0, TREE = 1, FLEX = 2;
# if you set this to a non-RAM value, you should increase the stacker.slots value
tableTypeForPreNURL=0
# flag to show if pages shall be usable for non-admin users
# this can be applied to the Surftips.html and yacysearch.html page
publicSurftips = true

@ -109,11 +109,7 @@ public class IndexControlRWIs_p {
if (post.containsKey("deletecomplete") && post.containsKey("confirmDelete")) {
sb.webIndex.clear();
sb.crawlQueues.clear();
try {
sb.crawlStacker.clear();
} catch (final IOException e) {
e.printStackTrace();
}
sb.crawlStacker.clear();
try {
sb.robots.clear();
} catch (final IOException e) {

@ -29,7 +29,8 @@ public class PeerLoadPicture {
final CircleThreadPiece misc = new CircleThreadPiece("Misc.", new Color(190, 50, 180));
final HashMap<String, CircleThreadPiece> pieces = new HashMap<String, CircleThreadPiece>();
pieces.put(null, idle);
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK, new CircleThreadPiece("Stacking", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK0, new CircleThreadPiece("Stacking0", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.CRAWLSTACK1, new CircleThreadPiece("Stacking1", new Color(115, 200, 210)));
pieces.put(plasmaSwitchboardConstants.INDEXER, new CircleThreadPiece("Parsing/Indexing", new Color(255, 130, 0)));
pieces.put(plasmaSwitchboardConstants.INDEX_DIST, new CircleThreadPiece("DHT-Distribution", new Color(119, 136, 153)));
pieces.put(plasmaSwitchboardConstants.PEER_PING, new CircleThreadPiece("YaCy Core", new Color(255, 230, 160)));

@ -34,6 +34,7 @@ import java.net.MalformedURLException;
import java.net.URLDecoder;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
@ -158,15 +159,18 @@ public class QuickCrawlLink_p {
// stack URL
String reasonString = null;
reasonString = sb.crawlStacker.stackCrawl(
crawlingStartURL,
null,
sb.webIndex.seedDB.mySeed().hash,
(title==null)?"CRAWLING-ROOT":title,
new Date(),
0,
pe
);
reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
null,
(title==null)?"CRAWLING-ROOT":title,
new Date(),
null,
pe.handle(),
0,
0,
0
));
// validate rejection reason
if (reasonString == null) {

@ -211,7 +211,18 @@ public class WatchCrawler_p {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
final String reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
url,
null,
"CRAWLING-ROOT",
new Date(),
null,
pe.handle(),
0,
0,
0
));
if (reasonString == null) {
// create a bookmark from crawl start url
@ -260,6 +271,7 @@ public class WatchCrawler_p {
"",
"",
new Date(),
null,
pe.handle(),
0,
0,
@ -338,14 +350,18 @@ public class WatchCrawler_p {
if (nexturl == null) continue;
// enqueuing the url for crawling
sb.crawlStacker.enqueueEntry(
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
nexturl,
"",
sb.webIndex.seedDB.mySeed().hash,
e.getValue(),
new Date(),
0,
profile);
new Date(),
null,
profile.handle(),
0,
0,
0
));
}
} catch (final PatternSyntaxException e) {

@ -30,6 +30,7 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import de.anomic.crawler.CrawlEntry;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverDate;
@ -76,7 +77,18 @@ public class rct_p {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
sb.crawlStacker.enqueueEntry(new CrawlEntry(
peerhash,
url,
(referrer == null) ? null : referrer.hash(),
"REMOTE-CRAWLING",
null,
loaddate,
sb.webIndex.defaultRemoteProfile.handle(),
0,
0,
0
));
} else {
env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}

@ -36,10 +36,11 @@ import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.kelondro.kelondroRow;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverProcessorJob;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;
public class CrawlEntry {
public class CrawlEntry extends serverProcessorJob {
// row definition for balancer-related NURL-entries
public final static kelondroRow rowdef = new kelondroRow(
@ -80,7 +81,7 @@ public class CrawlEntry {
private int forkfactor; // sum of anchors of all ancestors
private kelondroBitfield flags;
private int handle;
private String status;
private String statusMessage;
private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
public static class domaccess {
@ -116,38 +117,38 @@ public class CrawlEntry {
* @param forkfactor sum of anchors of all ancestors
*/
public CrawlEntry(
final String initiator,
final yacyURL url,
final String referrerhash,
final String name,
final Date appdate,
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor
final String initiator,
final yacyURL url,
final String referrerhash,
final String name,
final Date appdate,
final Date loaddate,
final String profileHandle,
final int depth,
final int anchors,
final int forkfactor
) {
// create new entry and store it into database
assert appdate != null;
assert url != null;
assert initiator != null;
assert referrerhash != null;
assert profileHandle.length() == yacySeedDB.commonHashLength : profileHandle + " != " + yacySeedDB.commonHashLength;
this.initiator = initiator;
this.url = url;
this.refhash = referrerhash;
this.refhash = (referrerhash == null) ? "" : referrerhash;
this.name = (name == null) ? "" : name;
this.appdate = (appdate == null) ? 0 : appdate.getTime();
this.loaddate = (loaddate == null) ? 0 : loaddate.getTime();
this.profileHandle = profileHandle; // must not be null
this.depth = depth;
this.anchors = anchors;
this.forkfactor = forkfactor;
this.flags = new kelondroBitfield(rowdef.width(10));
this.handle = 0;
this.loaddate = 0;
this.serverdate = 0;
this.imsdate = 0;
this.status = "loaded(args)";
this.statusMessage = "loaded(args)";
this.initialHash = url.hashCode();
this.status = serverProcessorJob.STATUS_INITIATED;
}
public CrawlEntry(final kelondroRow.Entry entry) throws IOException {
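For reference, this is the calling convention of the new 10-argument constructor as distilled from the call sites updated in this diff (it is an excerpt, not standalone code; sb, crawlingStartURL and pe are the surrounding servlet variables). The loaddate argument is new, and both the referrer hash and the loaddate may now be null.

```java
// Call pattern taken from the updated call sites in this commit:
CrawlEntry e = new CrawlEntry(
        sb.webIndex.seedDB.mySeed().hash,  // initiator peer hash
        crawlingStartURL,                  // url
        null,                              // referrer hash (normalized to "" internally)
        "CRAWLING-ROOT",                   // name / anchor text
        new Date(),                        // appdate
        null,                              // loaddate (new parameter)
        pe.handle(),                       // profile handle, must not be null
        0, 0, 0);                          // depth, anchors, forkfactor
String rejectReason = sb.crawlStacker.stackCrawl(e);  // null means the URL was accepted
```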
@ -172,7 +173,7 @@ public class CrawlEntry {
this.loaddate = entry.getColLong(12);
this.serverdate = entry.getColLong(13);
this.imsdate = entry.getColLong(14);
this.status = "loaded(kelondroRow.Entry)";
this.statusMessage = "loaded(kelondroRow.Entry)";
this.initialHash = url.hashCode();
return;
}
@ -182,12 +183,13 @@ public class CrawlEntry {
return this.initialHash;
}
public void setStatus(final String s) {
this.status = s;
public void setStatus(final String s, int code) {
this.statusMessage = s;
this.status = code;
}
public String getStatus() {
return this.status;
return this.statusMessage;
}
private static String normalizeHandle(final int h) {
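The single status String is split here into a human-readable statusMessage plus a numeric serverProcessorJob state code. Collected from the CrawlQueues hunks further down, the transitions look like this (excerpt, not standalone code):

```java
// Status transitions of a crawl worker: free-text message for display,
// numeric code mapping onto the serverProcessorJob lifecycle.
entry.setStatus("worker-initialized",    serverProcessorJob.STATUS_INITIATED);
entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
entry.setStatus("worker-loading",        serverProcessorJob.STATUS_RUNNING);
entry.setStatus("worker-processed",      serverProcessorJob.STATUS_FINISHED);
```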

@ -42,6 +42,7 @@ import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.server.serverDate;
import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
@ -397,7 +398,18 @@ public class CrawlQueues {
if (urlRejectReason == null) {
// stack url
if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
sb.crawlStacker.enqueueEntry(new CrawlEntry(
hash,
url,
(referrer == null) ? null : referrer.hash(),
item.getDescription(),
null,
loaddate,
sb.webIndex.defaultRemoteProfile.handle(),
0,
0,
0
));
} else {
log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
}
@ -474,6 +486,7 @@ public class CrawlQueues {
"",
"",
new Date(),
new Date(),
(forText) ?
((global) ?
sb.webIndex.defaultTextSnippetGlobalProfile.handle() :
@ -500,7 +513,7 @@ public class CrawlQueues {
public crawlWorker(final CrawlEntry entry) {
this.entry = entry;
this.entry.setStatus("worker-initialized");
this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);
this.code = Integer.valueOf(entry.hashCode());
if (!workers.containsKey(code)) {
workers.put(code, this);
@ -511,7 +524,7 @@ public class CrawlQueues {
public void run() {
try {
// checking robots.txt for http(s) resources
this.entry.setStatus("worker-checkingrobots");
this.entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && sb.robots.isDisallowed(entry.url())) {
if (log.isFine()) log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
final ZURL.Entry eentry = errorURL.newEntry(
@ -524,7 +537,7 @@ public class CrawlQueues {
errorURL.push(eentry);
} else {
// starting a load from the internet
this.entry.setStatus("worker-loading");
this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
final String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
if (result != null) {
final ZURL.Entry eentry = errorURL.newEntry(
@ -536,7 +549,7 @@ public class CrawlQueues {
eentry.store();
errorURL.push(eentry);
} else {
this.entry.setStatus("worker-processed");
this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
}
}
} catch (final Exception e) {
@ -551,7 +564,7 @@ public class CrawlQueues {
e.printStackTrace();
} finally {
workers.remove(code);
this.entry.setStatus("worker-finalized");
this.entry.setStatus("worker-finalized", serverProcessorJob.STATUS_FINISHED);
}
}

@ -28,17 +28,15 @@
package de.anomic.crawler;
import java.io.IOException;
import java.net.UnknownHostException;
import java.util.ArrayList;
import java.util.Date;
import java.util.LinkedList;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import de.anomic.index.indexReferenceBlacklist;
import de.anomic.index.indexURLReference;
import de.anomic.kelondro.kelondroIndex;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRowSet;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.server.serverDomains;
@ -49,13 +47,11 @@ public final class CrawlStacker {
final serverLog log = new serverLog("STACKCRAWL");
private final LinkedList<String> urlEntryHashCache; // the order how this queue is processed; entries with known DNS entries go first
private kelondroIndex urlEntryCache; // the entries in the queue
private long dnsHit, dnsMiss;
private int alternateCount;
private CrawlQueues nextQueue;
private plasmaWordIndex wordIndex;
private boolean acceptLocalURLs, acceptGlobalURLs;
private BlockingQueue<CrawlEntry> fastQueue, slowQueue;
private long dnsHit, dnsMiss;
private CrawlQueues nextQueue;
private plasmaWordIndex wordIndex;
private boolean acceptLocalURLs, acceptGlobalURLs;
// objects for the prefetch task
private final ArrayList<String> dnsfetchHosts = new ArrayList<String>();
@ -68,26 +64,21 @@ public final class CrawlStacker {
this.wordIndex = wordIndex;
this.dnsHit = 0;
this.dnsMiss = 0;
this.alternateCount = 0;
this.acceptLocalURLs = acceptLocalURLs;
this.acceptGlobalURLs = acceptGlobalURLs;
// init the message list
this.urlEntryHashCache = new LinkedList<String>();
this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
this.fastQueue = new LinkedBlockingQueue<CrawlEntry>();
this.slowQueue = new ArrayBlockingQueue<CrawlEntry>(1000);
this.log.logInfo("STACKCRAWL thread initialized.");
}
public int size() {
synchronized (this.urlEntryHashCache) {
return this.urlEntryHashCache.size();
}
return this.fastQueue.size() + this.slowQueue.size();
}
public void clear() throws IOException {
this.urlEntryHashCache.clear();
this.urlEntryCache.clear();
public void clear() {
this.fastQueue.clear();
this.slowQueue.clear();
}
public void close() {
@ -98,11 +89,7 @@ public final class CrawlStacker {
this.log.logInfo("Shutdown. Closing stackCrawl queue.");
// closing the db
this.urlEntryCache.close();
// clearing the hash list
this.urlEntryHashCache.clear();
clear();
}
private boolean prefetchHost(final String host) {
@ -121,41 +108,17 @@ public final class CrawlStacker {
}
public boolean job() {
if (this.fastQueue.size() > 0 && job(this.fastQueue)) return true;
if (this.slowQueue.size() == 0) return false;
return job(this.slowQueue);
}
private boolean job(BlockingQueue<CrawlEntry> queue) {
// this is the method that is called by the busy thread from outside
if (this.urlEntryHashCache.size() == 0) return false;
if (queue.size() == 0) return false;
// get the next entry from the queue
String urlHash = null;
kelondroRow.Entry ec = null;
synchronized (this.urlEntryHashCache) {
urlHash = this.urlEntryHashCache.removeFirst();
if (urlHash == null) {
urlEntryHashCache.clear();
try {
urlEntryCache.clear();
} catch (IOException e) {
e.printStackTrace();
}
return false;
}
try {
ec = this.urlEntryCache.remove(urlHash.getBytes());
} catch (IOException e) {
e.printStackTrace();
return false;
}
}
if (urlHash == null || ec == null) return false;
// make a crawl Entry out of it
CrawlEntry entry = null;
try {
entry = new CrawlEntry(ec);
} catch (IOException e1) {
e1.printStackTrace();
return false;
}
CrawlEntry entry = queue.poll();
if (entry == null) return false;
try {
@ -173,95 +136,30 @@ public final class CrawlStacker {
}
return true;
}
public String stackCrawl(
final yacyURL url,
final String referrerhash,
final String initiatorHash,
final String name,
final Date loadDate,
final int currentdepth,
final CrawlProfile.entry profile) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
// add the url into the crawling queue
final CrawlEntry entry = new CrawlEntry(
initiatorHash, // initiator, needed for p2p-feedback
url, // url clear text string
(referrerhash == null) ? "" : referrerhash, // last url in crawling queue
name, // load date
loadDate, // the anchor name
(profile == null) ? null : profile.handle(), // profile must not be null!
currentdepth, // depth so far
0, // anchors, default value
0 // forkfactor, default value
);
return stackCrawl(entry);
}
public void enqueueEntry(
final yacyURL nexturl,
final String referrerhash,
final String initiatorHash,
final String name,
final Date loadDate,
final int currentdepth,
final CrawlProfile.entry profile) {
if (profile == null) return;
public void enqueueEntry(final CrawlEntry entry) {
// DEBUG
if (log.isFinest()) log.logFinest("ENQUEUE "+ nexturl +", referer="+referrerhash +", initiator="+initiatorHash +", name="+name +", load="+loadDate +", depth="+currentdepth);
// check first before we create a big object
if (this.urlEntryCache.has(nexturl.hash().getBytes())) return;
if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth());
// now create the big object before we enter the synchronized block
final CrawlEntry newEntry = new CrawlEntry(
initiatorHash,
nexturl,
referrerhash,
name,
loadDate,
profile.handle(),
currentdepth,
0,
0
);
if (newEntry == null) return;
final kelondroRow.Entry newEntryRow = newEntry.toRow();
synchronized(this.urlEntryHashCache) {
kelondroRow.Entry oldValue;
if (prefetchHost(entry.url().getHost())) {
try {
oldValue = this.urlEntryCache.put(newEntryRow);
} catch (final IOException e) {
oldValue = null;
}
if (oldValue == null) {
//System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
if (prefetchHost(nexturl.getHost())) {
this.alternateCount++;
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.dnsHit++;
} else {
if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
this.urlEntryHashCache.addFirst(newEntry.url().hash());
this.alternateCount = 0;
//System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
} else {
this.urlEntryHashCache.addLast(newEntry.url().hash());
}
this.dnsMiss++;
}
this.fastQueue.put(entry);
this.dnsHit++;
} catch (InterruptedException e) {
e.printStackTrace();
}
} else {
try {
this.slowQueue.put(entry);
this.dnsMiss++;
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
private String stackCrawl(final CrawlEntry entry) {
public String stackCrawl(final CrawlEntry entry) {
// stacks a crawl item. The position can also be remote
// returns null if successful, a reason string if not successful
//this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");

@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentHashMap;
import de.anomic.index.indexDocumentMetadata;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverProcessorJob;
import de.anomic.server.logging.serverLog;
public final class ProtocolLoader {
@ -111,14 +112,15 @@ public final class ProtocolLoader {
// returns null if everything went fine, a fail reason string if a problem occurred
indexDocumentMetadata h;
try {
entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
h = load(entry, parserMode);
assert h != null;
entry.setStatus("loaded");
entry.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
final boolean stored = sb.htEntryStoreProcess(h);
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"));
entry.setStatus("stored-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
return (stored) ? null : "not stored";
} catch (IOException e) {
entry.setStatus("error");
entry.setStatus("error", serverProcessorJob.STATUS_FINISHED);
log.logWarning("problem loading " + entry.url().toString());
return "load error - " + e.getMessage();
}

@ -41,7 +41,6 @@ import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.HTTPLoader;
import de.anomic.crawler.ZURL;
import de.anomic.http.JakartaCommonsHttpClient;
import de.anomic.http.JakartaCommonsHttpResponse;
import de.anomic.http.httpRequestHeader;
@ -272,42 +271,20 @@ public class SitemapParser extends DefaultHandler {
}
// URL needs to crawled
String error = null;
error = this.sb.crawlStacker.stackCrawl(url,
null, // this.siteMapURL.toString(),
this.sb.webIndex.seedDB.mySeed().hash, this.nextURL, new Date(),
0, this.crawlingProfile);
if (error != null) {
try {
this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error);
// insert URL into the error DB
final ZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
new yacyURL(this.nextURL, null),
"",
"",
new Date(),
null,
0,
0,
0),
this.sb.webIndex.seedDB.mySeed().hash,
new Date(),
1,
error);
ee.store();
this.sb.crawlQueues.errorURL.push(ee);
} catch (final MalformedURLException e) {/* ignore this */
}
} else {
this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
// count successfully added URLs
this.urlCounter++;
}
this.sb.crawlStacker.enqueueEntry(new CrawlEntry(
this.sb.webIndex.seedDB.mySeed().hash,
url,
null, // this.siteMapURL.toString(),
this.nextURL,
new Date(),
null,
this.crawlingProfile.handle(),
0,
0,
0
));
this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
this.urlCounter++;
}
}

@ -62,7 +62,6 @@ import org.xml.sax.SAXException;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.ZURL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.index.indexWord;
@ -259,49 +258,37 @@ public class bookmarksDB {
crawlingQ,
indexText, indexMedia,
storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
String reasonString = sb.crawlStacker.stackCrawl(crawlingStartURL, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
if (reasonString == null) {
serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
// serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", "Automatic ReCrawl!");
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}
} else {
serverLog.logInfo("BOOKMARKS", "autoReCrawl - error adding crawl profile: " + crawlingStart + "- " + reasonString);
ZURL.Entry ee = sb.crawlQueues.errorURL.newEntry(
new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
"",
"",
new Date(),
pe.handle(),
0,
0,
0),
sb.webIndex.seedDB.mySeed().hash,
new Date(),
1,
reasonString);
ee.store();
sb.crawlQueues.errorURL.push(ee);
}
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
crawlingStartURL,
null,
"CRAWLING-ROOT",
new Date(),
null,
pe.handle(),
0,
0,
0
));
serverLog.logInfo("BOOKMARKS", "autoReCrawl - adding crawl profile for: " + crawlingStart);
// serverLog.logInfo("BOOKMARKS", "autoReCrawl - crawl filter is set to: " + newcrawlingfilter);
// generate a YaCyNews if the global flag was set
if (crawlOrder) {
Map<String, String> m = new HashMap<String, String>(pe.map()); // must be cloned
m.remove("specificDepth");
m.remove("indexText");
m.remove("indexMedia");
m.remove("remoteIndexing");
m.remove("xsstopw");
m.remove("xpstopw");
m.remove("xdstopw");
m.remove("storeTXCache");
m.remove("storeHTCache");
m.remove("generalFilter");
m.remove("specificFilter");
m.put("intention", "Automatic ReCrawl!");
sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m));
}
} catch (MalformedURLException e1) {}
} // if
} // while(bit.hasNext())

@ -462,6 +462,7 @@ public class indexURLReference {
comp().url(),
referrerHash(),
comp().dc_title(),
null,
loaddate(),
null,
0,

@ -578,8 +578,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
deployThread(plasmaSwitchboardConstants.CLEANUP, "Cleanup", "simple cleaning process for monitoring information", null,
new serverInstantBusyThread(this, plasmaSwitchboardConstants.CLEANUP_METHOD_START, plasmaSwitchboardConstants.CLEANUP_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CLEANUP_METHOD_FREEMEM), 600000); // all 5 Minutes, wait 10 minutes until first run
deployThread(plasmaSwitchboardConstants.CRAWLSTACK, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
deployThread(plasmaSwitchboardConstants.CRAWLSTACK0, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
new serverInstantBusyThread(crawlStacker, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_START, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_FREEMEM), 8000);
deployThread(plasmaSwitchboardConstants.CRAWLSTACK1, "Crawl URL Stacker", "process that checks url for double-occurrences and for allowance/disallowance by robots.txt", null,
new serverInstantBusyThread(crawlStacker, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_START, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_JOBCOUNT, plasmaSwitchboardConstants.CRAWLSTACK_METHOD_FREEMEM), 8000);
deployThread(plasmaSwitchboardConstants.INDEXER, "Indexing", "thread that either initiates a parsing/indexing queue, distributes the index into the DHT, stores parsed documents or flushes the index cache", "/IndexCreateIndexingQueue_p.html",
new serverInstantBusyThread(this, plasmaSwitchboardConstants.INDEXER_METHOD_START, plasmaSwitchboardConstants.INDEXER_METHOD_JOBCOUNT, plasmaSwitchboardConstants.INDEXER_METHOD_FREEMEM), 10000);
deployThread(plasmaSwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL, "Remote Crawl Job", "thread that performes a single crawl/indexing step triggered by a remote peer", null,
@ -716,6 +718,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
synchronized (this.webIndex) {
this.webIndex.close();
}
// TODO: restart CrawlStacker
setConfig("network.unit.definition", networkDefinition);
overwriteNetworkDefinition();
final File indexPrimaryPath = getConfigPath(plasmaSwitchboardConstants.INDEX_PRIMARY_PATH, plasmaSwitchboardConstants.INDEX_PATH_DEFAULT);
@ -1557,7 +1560,18 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
nextEntry = i.next();
nextUrl = nextEntry.getKey();
// enqueue the hyperlink into the pre-notice-url db
crawlStacker.enqueueEntry(nextUrl, entry.urlHash(), entry.initiator(), nextEntry.getValue(), docDate, entry.depth() + 1, entry.profile());
crawlStacker.enqueueEntry(new CrawlEntry(
entry.initiator(),
nextUrl,
entry.urlHash(),
nextEntry.getValue(),
null,
docDate,
entry.profile().handle(),
entry.depth() + 1,
0,
0
));
}
final long stackEndTime = System.currentTimeMillis();
if (log.isInfo()) log.logInfo("CRAWL: ADDED " + hl.size() + " LINKS FROM " + entry.url().toNormalform(false, true) +
@ -2049,7 +2063,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
url,
referrerHash,
(name == null) ? "" : name,
new Date(),
new Date(),
null,
null,
0,
0,

@ -140,7 +140,8 @@ public final class plasmaSwitchboardConstants {
* <p><code>public static final String <strong>CRAWLSTACK</strong> = "82_crawlstack"</code></p>
* <p>Name of the crawl stacker thread, performing several checks on new URLs to crawl, i.e. double-check</p>
*/
public static final String CRAWLSTACK = "82_crawlstack";
public static final String CRAWLSTACK0 = "82_crawlstack";
public static final String CRAWLSTACK1 = "83_crawlstack";
public static final String CRAWLSTACK_METHOD_START = "job";
public static final String CRAWLSTACK_METHOD_JOBCOUNT = "size";
public static final String CRAWLSTACK_METHOD_FREEMEM = null;

@ -389,6 +389,7 @@ public class serverDomains {
public static final int TLD_NorthAmericaOceania_ID = 4; // english-speaking countries
public static final int TLD_Africa_ID = 5; // africa
public static final int TLD_Generic_ID = 6; // anything else, also raw ip numbers
public static final int TLD_Local_ID = 7; // a local address
static {
// assign TLD-ids and names
@ -552,7 +553,7 @@ public class serverDomains {
}
final Integer i = TLDID.get(tld);
if (i == null) {
return (isLocal(host)) ? 7 : TLD_Generic_ID;
return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID;
}
return i.intValue();
}

@ -76,7 +76,7 @@ public class serverInstantBlockingThread<J extends serverProcessorJob> extends s
@SuppressWarnings("unchecked")
public J job(final J next) throws Exception {
if (next == null) return null; // poison pill: shutdown
if (next == null || next == serverProcessorJob.poisonPill) return null; // poison pill: shutdown
instantThreadCounter++;
//System.out.println("started job " + this.handle + ": " + this.getName());
jobs.put(this.handle, this.getName());
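The change above lets job() recognize serverProcessorJob.poisonPill in addition to null as a shutdown marker. This is the standard poison-pill pattern for terminating a consumer of a BlockingQueue; the sketch below illustrates the pattern generically and does not use YaCy's classes.

```java
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;

// Generic poison-pill shutdown: the consumer loops on take() and stops when it
// receives the sentinel object instead of a real job.
public final class PoisonPillSketch {
    static final Object POISON = new Object();

    public static void main(String[] args) throws InterruptedException {
        final BlockingQueue<Object> queue = new LinkedBlockingQueue<Object>();
        Thread consumer = new Thread(new Runnable() {
            public void run() {
                try {
                    while (true) {
                        Object job = queue.take();
                        if (job == POISON) return;   // shutdown marker: leave the loop
                        System.out.println("processing " + job);
                    }
                } catch (InterruptedException e) { /* interrupted: shut down */ }
            }
        });
        consumer.start();
        queue.put("job-1");
        queue.put(POISON);                           // tells the consumer to terminate
        consumer.join();
    }
}
```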

@ -32,7 +32,7 @@ public class serverProcessorJob {
public final static int STATUS_FINISHED = 3;
public final static int STATUS_POISON = 99;
public int status = 0;
public int status = STATUS_INITIATED;
public serverProcessorJob() {
this.status = STATUS_INITIATED;

@ -7,6 +7,7 @@ import java.io.PrintWriter;
import java.net.MalformedURLException;
import java.util.Date;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.userDB;
import de.anomic.http.HttpClient;
@ -195,15 +196,18 @@ public class urlRedirectord implements serverHandler, Cloneable {
sb.crawlQueues.errorURL.remove(urlhash);
// enqueuing URL for crawling
sb.crawlStacker.enqueueEntry(
sb.crawlStacker.enqueueEntry(new CrawlEntry(
sb.webIndex.seedDB.mySeed().hash,
reqURL,
null,
sb.webIndex.seedDB.mySeed().hash,
"URL Redirector",
new Date(),
null,
profile.handle(),
0,
profile
);
0,
0
));
} else {
reasonString = "Unsupporte file extension";
}
