From 7535fd7447b93ce2cd5a253fd15cfd672844761c Mon Sep 17 00:00:00 2001
From: orbiter
Date: Wed, 17 Dec 2008 22:53:06 +0000
Subject: [PATCH] - refactoring of CrawlEntry and CrawlStacker
 - introduced blocking queues in CrawlStacker to make it ready for concurrency
 - added a second busy thread for the CrawlStacker

The CrawlStacker is multithreaded. It shall be transformed into a
BlockingThread in another step. The concurrency of the stacker will
hopefully solve some problems with cases where DNS blocks.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5395 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 defaults/yacy.init                             |  11 +-
 htroot/IndexControlRWIs_p.java                 |   6 +-
 htroot/PeerLoadPicture.java                    |   3 +-
 htroot/QuickCrawlLink_p.java                   |  22 ++-
 htroot/WatchCrawler_p.java                     |  28 ++-
 htroot/rct_p.java                              |  14 +-
 source/de/anomic/crawler/CrawlEntry.java       |  42 ++--
 source/de/anomic/crawler/CrawlQueues.java      |  25 ++-
 source/de/anomic/crawler/CrawlStacker.java     | 182 ++++--------------
 source/de/anomic/crawler/ProtocolLoader.java   |   8 +-
 source/de/anomic/data/SitemapParser.java       |  51 ++---
 source/de/anomic/data/bookmarksDB.java         |  75 +++-----
 source/de/anomic/index/indexURLReference.java  |   1 +
 .../de/anomic/plasma/plasmaSwitchboard.java    |  21 +-
 .../plasma/plasmaSwitchboardConstants.java     |   3 +-
 source/de/anomic/server/serverDomains.java     |   3 +-
 .../server/serverInstantBlockingThread.java    |   2 +-
 .../de/anomic/server/serverProcessorJob.java   |   2 +-
 .../anomic/urlRedirector/urlRedirectord.java   |  12 +-
 19 files changed, 219 insertions(+), 292 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index 972604e05..0a42d7836 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -568,9 +568,12 @@ performanceSpeed=100
 80_indexing_idlesleep=1000
 80_indexing_busysleep=10
 80_indexing_memprereq=6291456
-82_crawlstack_idlesleep=5000
+82_crawlstack_idlesleep=1000
 82_crawlstack_busysleep=0
 82_crawlstack_memprereq=1048576
+83_crawlstack_idlesleep=1200
+83_crawlstack_busysleep=0
+83_crawlstack_memprereq=1048576
 90_cleanup_idlesleep=300000
 90_cleanup_busysleep=300000
 90_cleanup_memprereq=0
@@ -818,12 +821,6 @@ svnRevision=0
 currentSkin=default
 
-# temporary flag for new database structure. set only true for testing
-# ALL DATA THAT IS CREATED WITH THIS FLAG ON WILL BE VOID IN A FINAL VERSION
-# table-types: RAM = 0, TREE = 1, FLEX = 2;
-# if you set this to a non-RAM value, you should increase the stacker.slots value
-tableTypeForPreNURL=0
-
 # flag to show if pages shall be usable for non-admin users
 # this can be applied to the Surftips.html and yacysearch.html page
 publicSurftips = true
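A note on the new 83_crawlstack settings above: a YaCy busy thread calls a job
method in a loop and, by the assumed semantics of these keys, sleeps busysleep
milliseconds after a round that found work and idlesleep milliseconds after an
idle round. The following schematic loop only illustrates those assumed
semantics; BusyLoop is an invented name, not YaCy's serverBusyThread code:

    // Schematic busy-thread loop (illustrative; semantics assumed from the
    // idlesleep/busysleep keys above, e.g. 1200 / 0 for 83_crawlstack).
    public final class BusyLoop extends Thread {
        private final long idlesleep, busysleep;
        private final java.util.concurrent.Callable<Boolean> job;
        private volatile boolean running = true;

        public BusyLoop(final long idlesleep, final long busysleep,
                        final java.util.concurrent.Callable<Boolean> job) {
            this.idlesleep = idlesleep;
            this.busysleep = busysleep;
            this.job = job;
        }

        public void run() {
            try {
                while (running) {
                    final boolean didWork = job.call(); // e.g. CrawlStacker.job()
                    Thread.sleep(didWork ? busysleep : idlesleep);
                }
            } catch (final Exception e) {
                e.printStackTrace();
            }
        }

        public void terminate() { running = false; }
    }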
diff --git a/htroot/IndexControlRWIs_p.java b/htroot/IndexControlRWIs_p.java
index afe797d24..00372ed19 100644
--- a/htroot/IndexControlRWIs_p.java
+++ b/htroot/IndexControlRWIs_p.java
@@ -109,11 +109,7 @@ public class IndexControlRWIs_p {
         if (post.containsKey("deletecomplete") && post.containsKey("confirmDelete")) {
             sb.webIndex.clear();
             sb.crawlQueues.clear();
-            try {
-                sb.crawlStacker.clear();
-            } catch (final IOException e) {
-                e.printStackTrace();
-            }
+            sb.crawlStacker.clear();
             try {
                 sb.robots.clear();
             } catch (final IOException e) {
diff --git a/htroot/PeerLoadPicture.java b/htroot/PeerLoadPicture.java
index 6eb25c0d0..30a5d2797 100644
--- a/htroot/PeerLoadPicture.java
+++ b/htroot/PeerLoadPicture.java
@@ -29,7 +29,8 @@ public class PeerLoadPicture {
         final CircleThreadPiece misc = new CircleThreadPiece("Misc.", new Color(190, 50, 180));
         final HashMap<String, CircleThreadPiece> pieces = new HashMap<String, CircleThreadPiece>();
         pieces.put(null, idle);
-        pieces.put(plasmaSwitchboardConstants.CRAWLSTACK, new CircleThreadPiece("Stacking", new Color(115, 200, 210)));
+        pieces.put(plasmaSwitchboardConstants.CRAWLSTACK0, new CircleThreadPiece("Stacking0", new Color(115, 200, 210)));
+        pieces.put(plasmaSwitchboardConstants.CRAWLSTACK1, new CircleThreadPiece("Stacking1", new Color(115, 200, 210)));
         pieces.put(plasmaSwitchboardConstants.INDEXER, new CircleThreadPiece("Parsing/Indexing", new Color(255, 130, 0)));
         pieces.put(plasmaSwitchboardConstants.INDEX_DIST, new CircleThreadPiece("DHT-Distribution", new Color(119, 136, 153)));
         pieces.put(plasmaSwitchboardConstants.PEER_PING, new CircleThreadPiece("YaCy Core", new Color(255, 230, 160)));
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 5af704396..c7b94aa17 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -34,6 +34,7 @@ import java.net.MalformedURLException;
 import java.net.URLDecoder;
 import java.util.Date;
 
+import de.anomic.crawler.CrawlEntry;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.http.httpRequestHeader;
 import de.anomic.plasma.plasmaSwitchboard;
@@ -158,15 +159,18 @@ public class QuickCrawlLink_p {
 
         // stack URL
         String reasonString = null;
-        reasonString = sb.crawlStacker.stackCrawl(
-                crawlingStartURL,
-                null,
-                sb.webIndex.seedDB.mySeed().hash,
-                (title==null)?"CRAWLING-ROOT":title,
-                new Date(),
-                0,
-                pe
-        );
+        reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
+                sb.webIndex.seedDB.mySeed().hash,
+                crawlingStartURL,
+                null,
+                (title==null)?"CRAWLING-ROOT":title,
+                new Date(),
+                null,
+                pe.handle(),
+                0,
+                0,
+                0
+                ));
 
         // validate rejection reason
         if (reasonString == null) {
diff --git a/htroot/WatchCrawler_p.java b/htroot/WatchCrawler_p.java
index 831a5a0de..804b9de0c 100644
--- a/htroot/WatchCrawler_p.java
+++ b/htroot/WatchCrawler_p.java
@@ -211,7 +211,18 @@
                         crawlingQ,
                         indexText, indexMedia,
                         storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw);
-                final String reasonString = sb.crawlStacker.stackCrawl(url, null, sb.webIndex.seedDB.mySeed().hash, "CRAWLING-ROOT", new Date(), 0, pe);
+                final String reasonString = sb.crawlStacker.stackCrawl(new CrawlEntry(
+                        sb.webIndex.seedDB.mySeed().hash,
+                        url,
+                        null,
+                        "CRAWLING-ROOT",
+                        new Date(),
+                        null,
+                        pe.handle(),
+                        0,
+                        0,
+                        0
+                        ));
                 
                 if (reasonString == null) {
                     // create a bookmark from crawl start url
@@ -260,6 +271,7 @@ public class WatchCrawler_p {
                         "",
                         "",
                         new Date(),
+                        null,
                         pe.handle(),
                         0,
                         0,
@@ -338,14 +350,18 @@ public class WatchCrawler_p {
                     if (nexturl == null) continue;
                     
                     // enqueuing the url for crawling
-                    sb.crawlStacker.enqueueEntry(
+                    sb.crawlStacker.enqueueEntry(new CrawlEntry(
+                        sb.webIndex.seedDB.mySeed().hash,
                         nexturl,
                         "",
-                        sb.webIndex.seedDB.mySeed().hash,
                         e.getValue(),
-                        new Date(),
-                        0,
-                        profile);
+                        new Date(),
+                        null,
+                        profile.handle(),
+                        0,
+                        0,
+                        0
+                        ));
                 }
             } catch (final PatternSyntaxException e) {
diff --git a/htroot/rct_p.java b/htroot/rct_p.java
index cb2e2942d..b5b47babb 100644
--- a/htroot/rct_p.java
+++ b/htroot/rct_p.java
@@ -30,6 +30,7 @@ import java.text.ParseException;
 import java.util.Date;
 import java.util.Iterator;
 
+import de.anomic.crawler.CrawlEntry;
 import de.anomic.http.httpRequestHeader;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverDate;
@@ -76,7 +77,18 @@ public class rct_p {
                     if (urlRejectReason == null) {
                         // stack url
                         if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                        sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), peerhash, "REMOTE-CRAWLING", loaddate, 0, sb.webIndex.defaultRemoteProfile);
+                        sb.crawlStacker.enqueueEntry(new CrawlEntry(
+                            peerhash,
+                            url,
+                            (referrer == null) ? null : referrer.hash(),
+                            "REMOTE-CRAWLING",
+                            null,
+                            loaddate,
+                            sb.webIndex.defaultRemoteProfile.handle(),
+                            0,
+                            0,
+                            0
+                        ));
                     } else {
                         env.getLog().logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                     }
diff --git a/source/de/anomic/crawler/CrawlEntry.java b/source/de/anomic/crawler/CrawlEntry.java
index 24a58c057..08ded65a2 100755
--- a/source/de/anomic/crawler/CrawlEntry.java
+++ b/source/de/anomic/crawler/CrawlEntry.java
@@ -36,10 +36,11 @@ import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroNaturalOrder;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.server.serverProcessorJob;
 import de.anomic.yacy.yacySeedDB;
 import de.anomic.yacy.yacyURL;
 
-public class CrawlEntry {
+public class CrawlEntry extends serverProcessorJob {
     
     // row definition for balancer-related NURL-entries
     public final static kelondroRow rowdef = new kelondroRow(
@@ -80,7 +81,7 @@ public class CrawlEntry {
     private int forkfactor; // sum of anchors of all ancestors
     private kelondroBitfield flags;
     private int handle;
-    private String status;
+    private String statusMessage;
     private int initialHash; // to provide a object hash that does not change even if the url changes because of redirection
     
     public static class domaccess {
@@ -116,38 +117,38 @@ public class CrawlEntry {
      * @param forkfactor sum of anchors of all ancestors
      */
     public CrawlEntry(
-                    final String initiator,
-                    final yacyURL url,
-                    final String referrerhash,
-                    final String name,
-                    final Date appdate,
-                    final String profileHandle,
-                    final int depth,
-                    final int anchors,
-                    final int forkfactor
+            final String initiator,
+            final yacyURL url,
+            final String referrerhash,
+            final String name,
+            final Date appdate,
+            final Date loaddate,
+            final String profileHandle,
+            final int depth,
+            final int anchors,
+            final int forkfactor
     ) {
         // create new entry and store it into database
-        assert appdate != null;
         assert url != null;
         assert initiator != null;
-        assert referrerhash != null;
         assert profileHandle.length() == yacySeedDB.commonHashLength : profileHandle + " != " + yacySeedDB.commonHashLength;
         this.initiator = initiator;
         this.url = url;
-        this.refhash = referrerhash;
+        this.refhash = (referrerhash == null) ? "" : referrerhash;
         this.name = (name == null) ? "" : name;
         this.appdate = (appdate == null) ? 0 : appdate.getTime();
+        this.loaddate = (loaddate == null) ? 0 : loaddate.getTime();
         this.profileHandle = profileHandle; // must not be null
         this.depth = depth;
         this.anchors = anchors;
         this.forkfactor = forkfactor;
         this.flags = new kelondroBitfield(rowdef.width(10));
         this.handle = 0;
-        this.loaddate = 0;
         this.serverdate = 0;
         this.imsdate = 0;
-        this.status = "loaded(args)";
+        this.statusMessage = "loaded(args)";
         this.initialHash = url.hashCode();
+        this.status = serverProcessorJob.STATUS_INITIATED;
     }
     
     public CrawlEntry(final kelondroRow.Entry entry) throws IOException {
@@ -172,7 +173,7 @@ public class CrawlEntry {
         this.loaddate = entry.getColLong(12);
         this.serverdate = entry.getColLong(13);
         this.imsdate = entry.getColLong(14);
-        this.status = "loaded(kelondroRow.Entry)";
+        this.statusMessage = "loaded(kelondroRow.Entry)";
         this.initialHash = url.hashCode();
         return;
     }
@@ -182,12 +183,13 @@ public class CrawlEntry {
         return this.initialHash;
     }
     
-    public void setStatus(final String s) {
-        this.status = s;
+    public void setStatus(final String s, int code) {
+        this.statusMessage = s;
+        this.status = code;
     }
     
     public String getStatus() {
-        return this.status;
+        return this.statusMessage;
     }
     
     private static String normalizeHandle(final int h) {
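Every call site in this patch switches to the ten-argument constructor shown
above; the new loaddate parameter sits in sixth position, between appdate and
the profile handle. An annotated call for reference, where exampleEntry is a
hypothetical helper and its parameters stand in for whatever a call site
supplies (the per-argument comments follow the constructor's null handling and
the comments from the deleted CrawlStacker code below):

    // Annotated use of the new ten-argument constructor (illustrative helper):
    CrawlEntry exampleEntry(final String initiatorHash, final yacyURL url, final String profileHandle) {
        return new CrawlEntry(
                initiatorHash,   // initiator: peer hash, needed for p2p-feedback
                url,             // url: the yacyURL to be crawled
                null,            // referrerhash: may now be null, stored as ""
                "CRAWLING-ROOT", // name: anchor text or entry name
                new Date(),      // appdate: may now be null, stored as 0
                null,            // loaddate: the new parameter, may be null, stored as 0
                profileHandle,   // crawl profile handle, must not be null
                0,               // depth so far
                0,               // anchors, default value
                0);              // forkfactor, default value
    }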
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 9e2e0621e..62a298c8b 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -42,6 +42,7 @@ import de.anomic.plasma.plasmaParser;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaSwitchboardConstants;
 import de.anomic.server.serverDate;
+import de.anomic.server.serverProcessorJob;
 import de.anomic.server.logging.serverLog;
 import de.anomic.xml.RSSFeed;
 import de.anomic.xml.RSSMessage;
@@ -397,7 +398,18 @@ public class CrawlQueues {
                 if (urlRejectReason == null) {
                     // stack url
                     if (sb.getLog().isFinest()) sb.getLog().logFinest("crawlOrder: stack: url='" + url + "'");
-                    sb.crawlStacker.enqueueEntry(url, (referrer == null) ? null : referrer.hash(), hash, item.getDescription(), loaddate, 0, sb.webIndex.defaultRemoteProfile);
+                    sb.crawlStacker.enqueueEntry(new CrawlEntry(
+                        hash,
+                        url,
+                        (referrer == null) ? null : referrer.hash(),
+                        item.getDescription(),
+                        null,
+                        loaddate,
+                        sb.webIndex.defaultRemoteProfile.handle(),
+                        0,
+                        0,
+                        0
+                    ));
                 } else {
                     log.logWarning("crawlOrder: Rejected URL '" + urlToString(url) + "': " + urlRejectReason);
                 }
@@ -474,6 +486,7 @@ public class CrawlQueues {
             "",
             "",
             new Date(),
+            new Date(),
             (forText) ?
                 ((global) ?
                     sb.webIndex.defaultTextSnippetGlobalProfile.handle() :
@@ -500,7 +513,7 @@ public class CrawlQueues {
         
         public crawlWorker(final CrawlEntry entry) {
             this.entry = entry;
-            this.entry.setStatus("worker-initialized");
+            this.entry.setStatus("worker-initialized", serverProcessorJob.STATUS_INITIATED);
             this.code = Integer.valueOf(entry.hashCode());
             if (!workers.containsKey(code)) {
                 workers.put(code, this);
@@ -511,7 +524,7 @@ public class CrawlQueues {
         public void run() {
             try {
                 // checking robots.txt for http(s) resources
-                this.entry.setStatus("worker-checkingrobots");
+                this.entry.setStatus("worker-checkingrobots", serverProcessorJob.STATUS_STARTED);
                 if ((entry.url().getProtocol().equals("http") || entry.url().getProtocol().equals("https")) && sb.robots.isDisallowed(entry.url())) {
                     if (log.isFine()) log.logFine("Crawling of URL '" + entry.url().toString() + "' disallowed by robots.txt.");
                     final ZURL.Entry eentry = errorURL.newEntry(
@@ -524,7 +537,7 @@ public class CrawlQueues {
                     errorURL.push(eentry);
                 } else {
                     // starting a load from the internet
-                    this.entry.setStatus("worker-loading");
+                    this.entry.setStatus("worker-loading", serverProcessorJob.STATUS_RUNNING);
                     final String result = loader.process(this.entry, plasmaParser.PARSER_MODE_CRAWLER);
                     if (result != null) {
                         final ZURL.Entry eentry = errorURL.newEntry(
@@ -536,7 +549,7 @@ public class CrawlQueues {
                         eentry.store();
                         errorURL.push(eentry);
                     } else {
-                        this.entry.setStatus("worker-processed");
+                        this.entry.setStatus("worker-processed", serverProcessorJob.STATUS_FINISHED);
                     }
                 }
             } catch (final Exception e) {
@@ -551,7 +564,7 @@ public class CrawlQueues {
                 e.printStackTrace();
             } finally {
                 workers.remove(code);
-                this.entry.setStatus("worker-finalized", serverProcessorJob.STATUS_FINISHED);
             }
         }
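The crawlWorker now keeps the old human-readable message and, in parallel, the
numeric job state inherited from serverProcessorJob: initialized maps to
STATUS_INITIATED, the robots.txt check to STATUS_STARTED, loading to
STATUS_RUNNING, and processed/finalized to STATUS_FINISHED. A reduced sketch of
the paired fields; StatusCarrier is an invented name, and the 0..3 numbering is
an assumption extrapolated from the STATUS_FINISHED = 3 shown further below:

    // Sketch of the paired status fields after this change (simplified model):
    class StatusCarrier {
        static final int STATUS_INITIATED = 0, STATUS_STARTED = 1,
                         STATUS_RUNNING = 2, STATUS_FINISHED = 3; // numbering assumed
        int status = STATUS_INITIATED;       // machine-readable, from serverProcessorJob
        String statusMessage = "initiated";  // human-readable, was the old String 'status'

        void setStatus(final String message, final int code) {
            this.statusMessage = message;    // e.g. "worker-loading"
            this.status = code;              // e.g. STATUS_RUNNING
        }
    }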
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 4271b4241..82a13da11 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -28,17 +28,15 @@ package de.anomic.crawler;
 
-import java.io.IOException;
 import java.net.UnknownHostException;
 import java.util.ArrayList;
 import java.util.Date;
-import java.util.LinkedList;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
 
 import de.anomic.index.indexReferenceBlacklist;
 import de.anomic.index.indexURLReference;
-import de.anomic.kelondro.kelondroIndex;
-import de.anomic.kelondro.kelondroRow;
-import de.anomic.kelondro.kelondroRowSet;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.server.serverDomains;
@@ -49,13 +47,11 @@ public final class CrawlStacker {
     
     final serverLog log = new serverLog("STACKCRAWL");
     
-    private final LinkedList<String> urlEntryHashCache; // the order how this queue is processed; entries with known DNS entries go first
-    private kelondroIndex urlEntryCache; // the entries in the queue
-    private long dnsHit, dnsMiss;
-    private int alternateCount;
-    private CrawlQueues nextQueue;
-    private plasmaWordIndex wordIndex;
-    private boolean acceptLocalURLs, acceptGlobalURLs;
+    private BlockingQueue<CrawlEntry> fastQueue, slowQueue;
+    private long dnsHit, dnsMiss;
+    private CrawlQueues nextQueue;
+    private plasmaWordIndex wordIndex;
+    private boolean acceptLocalURLs, acceptGlobalURLs;
     
     // objects for the prefetch task
     private final ArrayList<String> dnsfetchHosts = new ArrayList<String>();
@@ -68,26 +64,21 @@ public final class CrawlStacker {
         this.wordIndex = wordIndex;
         this.dnsHit = 0;
         this.dnsMiss = 0;
-        this.alternateCount = 0;
         this.acceptLocalURLs = acceptLocalURLs;
         this.acceptGlobalURLs = acceptGlobalURLs;
         
-        // init the message list
-        this.urlEntryHashCache = new LinkedList<String>();
-        
-        this.urlEntryCache = new kelondroRowSet(CrawlEntry.rowdef, 0);
+        this.fastQueue = new LinkedBlockingQueue<CrawlEntry>();
+        this.slowQueue = new ArrayBlockingQueue<CrawlEntry>(1000);
         
         this.log.logInfo("STACKCRAWL thread initialized.");
     }
     
     public int size() {
-        synchronized (this.urlEntryHashCache) {
-            return this.urlEntryHashCache.size();
-        }
+        return this.fastQueue.size() + this.slowQueue.size();
     }
     
-    public void clear() throws IOException {
-        this.urlEntryHashCache.clear();
-        this.urlEntryCache.clear();
+    public void clear() {
+        this.fastQueue.clear();
+        this.slowQueue.clear();
     }
     
     public void close() {
@@ -98,11 +89,7 @@ public final class CrawlStacker {
         
         this.log.logInfo("Shutdown. Closing stackCrawl queue.");
         
-        // closing the db
-        this.urlEntryCache.close();
-        
-        // clearing the hash list
-        this.urlEntryHashCache.clear();
+        clear();
     }
     
     private boolean prefetchHost(final String host) {
@@ -121,41 +108,17 @@ public final class CrawlStacker {
     }
     
     public boolean job() {
+        if (this.fastQueue.size() > 0 && job(this.fastQueue)) return true;
+        if (this.slowQueue.size() == 0) return false;
+        return job(this.slowQueue);
+    }
+    
+    private boolean job(BlockingQueue<CrawlEntry> queue) {
         // this is the method that is called by the busy thread from outside
-        if (this.urlEntryHashCache.size() == 0) return false;
+        if (queue.size() == 0) return false;
         
         // get the next entry from the queue
-        String urlHash = null;
-        kelondroRow.Entry ec = null;
-        synchronized (this.urlEntryHashCache) {
-            urlHash = this.urlEntryHashCache.removeFirst();
-            if (urlHash == null) {
-                urlEntryHashCache.clear();
-                try {
-                    urlEntryCache.clear();
-                } catch (IOException e) {
-                    e.printStackTrace();
-                }
-                return false;
-            }
-            try {
-                ec = this.urlEntryCache.remove(urlHash.getBytes());
-            } catch (IOException e) {
-                e.printStackTrace();
-                return false;
-            }
-        }
-        if (urlHash == null || ec == null) return false;
-        
-        // make a crawl Entry out of it
-        CrawlEntry entry = null;
-        try {
-            entry = new CrawlEntry(ec);
-        } catch (IOException e1) {
-            e1.printStackTrace();
-            return false;
-        }
-        
+        CrawlEntry entry = queue.poll();
         if (entry == null) return false;
         
         try {
@@ -173,95 +136,30 @@ public final class CrawlStacker {
         }
         return true;
     }
-    
-    public String stackCrawl(
-            final yacyURL url,
-            final String referrerhash,
-            final String initiatorHash,
-            final String name,
-            final Date loadDate,
-            final int currentdepth,
-            final CrawlProfile.entry profile) {
-        // stacks a crawl item. The position can also be remote
-        // returns null if successful, a reason string if not successful
-        //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
-        
-        // add the url into the crawling queue
-        final CrawlEntry entry = new CrawlEntry(
-            initiatorHash, // initiator, needed for p2p-feedback
-            url,           // url clear text string
-            (referrerhash == null) ? "" : referrerhash, // last url in crawling queue
-            name,          // load date
-            loadDate,      // the anchor name
-            (profile == null) ? null : profile.handle(), // profile must not be null!
-            currentdepth,  // depth so far
-            0,             // anchors, default value
-            0              // forkfactor, default value
-        );
-        return stackCrawl(entry);
-    }
-    
-    public void enqueueEntry(
-            final yacyURL nexturl,
-            final String referrerhash,
-            final String initiatorHash,
-            final String name,
-            final Date loadDate,
-            final int currentdepth,
-            final CrawlProfile.entry profile) {
-        if (profile == null) return;
-        
+    
+    public void enqueueEntry(final CrawlEntry entry) {
         // DEBUG
-        if (log.isFinest()) log.logFinest("ENQUEUE "+ nexturl +", referer="+referrerhash +", initiator="+initiatorHash +", name="+name +", load="+loadDate +", depth="+currentdepth);
-        
-        // check first before we create a big object
-        if (this.urlEntryCache.has(nexturl.hash().getBytes())) return;
+        if (log.isFinest()) log.logFinest("ENQUEUE "+ entry.url() +", referer="+entry.referrerhash() +", initiator="+entry.initiator() +", name="+entry.name() +", load="+entry.loaddate() +", depth="+entry.depth());
         
-        // now create the big object before we enter the synchronized block
-        final CrawlEntry newEntry = new CrawlEntry(
-            initiatorHash,
-            nexturl,
-            referrerhash,
-            name,
-            loadDate,
-            profile.handle(),
-            currentdepth,
-            0,
-            0
-        );
-        if (newEntry == null) return;
-        final kelondroRow.Entry newEntryRow = newEntry.toRow();
-        
-        synchronized(this.urlEntryHashCache) {
-            kelondroRow.Entry oldValue;
+        if (prefetchHost(entry.url().getHost())) {
             try {
-                oldValue = this.urlEntryCache.put(newEntryRow);
-            } catch (final IOException e) {
-                oldValue = null;
-            }
-            if (oldValue == null) {
-                //System.out.println("*** debug crawlStacker dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ((this.dnsMiss > 0) ? (", Q=" + (this.dnsHit / this.dnsMiss)) : ""));
-                if (prefetchHost(nexturl.getHost())) {
-                    this.alternateCount++;
-                    this.urlEntryHashCache.addFirst(newEntry.url().hash());
-                    this.dnsHit++;
-                } else {
-                    if ((this.dnsMiss > 0) && (this.alternateCount > 2 * this.dnsHit / this.dnsMiss)) {
-                        this.urlEntryHashCache.addFirst(newEntry.url().hash());
-                        this.alternateCount = 0;
-                        //System.out.println("*** debug crawlStacker alternate switch, dnsHit=" + this.dnsHit + ", dnsMiss=" + this.dnsMiss + ", alternateCount=" + this.alternateCount + ", Q=" + (this.dnsHit / this.dnsMiss));
-                    } else {
-                        this.urlEntryHashCache.addLast(newEntry.url().hash());
-                    }
-                    this.dnsMiss++;
-                }
+                this.fastQueue.put(entry);
+                this.dnsHit++;
+            } catch (InterruptedException e) {
+                e.printStackTrace();
+            }
+        } else {
+            try {
+                this.slowQueue.put(entry);
+                this.dnsMiss++;
+            } catch (InterruptedException e) {
+                e.printStackTrace();
            }
         }
     }
-    
-    
-    private String stackCrawl(final CrawlEntry entry) {
+    public String stackCrawl(final CrawlEntry entry) {
         // stacks a crawl item. The position can also be remote
         // returns null if successful, a reason string if not successful
         //this.log.logFinest("stackCrawl: nexturlString='" + nexturlString + "'");
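The effect of the rewrite above: stacking no longer funnels through one
kelondro-backed cache but through two in-memory queues. enqueueEntry routes an
entry by whether its host already has a known DNS entry (prefetchHost), and
job() always drains the fast queue before touching the slow one, so a blocking
DNS lookup can only ever stall the slow lane. A condensed, self-contained
model of that routing follows; TwoLaneStacker and knownHosts are invented
stand-ins, with CrawlEntry reduced to a host string and the DNS cache to a set:

    import java.util.Set;
    import java.util.concurrent.ArrayBlockingQueue;
    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.LinkedBlockingQueue;

    // Condensed model of the two-lane stacker (illustrative, not the YaCy class).
    final class TwoLaneStacker {
        private final BlockingQueue<String> fastQueue = new LinkedBlockingQueue<String>();
        private final BlockingQueue<String> slowQueue = new ArrayBlockingQueue<String>(1000);
        private final Set<String> knownHosts = ConcurrentHashMap.newKeySet(); // stand-in for the DNS cache
        private long dnsHit, dnsMiss;

        public void enqueueEntry(final String host) throws InterruptedException {
            if (knownHosts.contains(host)) { // stand-in for prefetchHost(host)
                fastQueue.put(host);         // DNS already known: cheap to process
                dnsHit++;
            } else {
                slowQueue.put(host);         // DNS unknown: may block on resolution
                dnsMiss++;
            }
        }

        // called by each busy thread; false means: nothing to do, go idle
        public boolean job() {
            if (!fastQueue.isEmpty() && process(fastQueue)) return true;
            if (slowQueue.isEmpty()) return false;
            return process(slowQueue);
        }

        private boolean process(final BlockingQueue<String> queue) {
            final String host = queue.poll();
            if (host == null) return false;
            knownHosts.add(host); // once processed, the host's DNS entry is cached
            return true;
        }
    }

With two busy threads calling job() concurrently (the 82_ and 83_ threads from
yacy.init), entries with resolved hosts keep flowing even while a slow-queue
poll is stuck behind a DNS lookup.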
diff --git a/source/de/anomic/crawler/ProtocolLoader.java b/source/de/anomic/crawler/ProtocolLoader.java
index ff142ab1d..2dd643d87 100644
--- a/source/de/anomic/crawler/ProtocolLoader.java
+++ b/source/de/anomic/crawler/ProtocolLoader.java
@@ -36,6 +36,7 @@ import java.util.concurrent.ConcurrentHashMap;
 import de.anomic.index.indexDocumentMetadata;
 import de.anomic.plasma.plasmaSwitchboard;
 import de.anomic.server.serverCore;
+import de.anomic.server.serverProcessorJob;
 import de.anomic.server.logging.serverLog;
 
 public final class ProtocolLoader {
@@ -111,14 +112,15 @@ public final class ProtocolLoader {
         // returns null if everything went fine, a fail reason string if a problem occurred
         indexDocumentMetadata h;
         try {
+            entry.setStatus("loading", serverProcessorJob.STATUS_RUNNING);
             h = load(entry, parserMode);
             assert h != null;
-            entry.setStatus("loaded");
+            entry.setStatus("loaded", serverProcessorJob.STATUS_RUNNING);
             final boolean stored = sb.htEntryStoreProcess(h);
-            entry.setStatus("stored-" + ((stored) ? "ok" : "fail"));
+            entry.setStatus("stored-" + ((stored) ? "ok" : "fail"), serverProcessorJob.STATUS_FINISHED);
             return (stored) ? null : "not stored";
         } catch (IOException e) {
-            entry.setStatus("error");
+            entry.setStatus("error", serverProcessorJob.STATUS_FINISHED);
             log.logWarning("problem loading " + entry.url().toString());
             return "load error - " + e.getMessage();
         }
diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java
index ab91f2ec7..250e49ac0 100644
--- a/source/de/anomic/data/SitemapParser.java
+++ b/source/de/anomic/data/SitemapParser.java
@@ -41,7 +41,6 @@ import org.xml.sax.helpers.DefaultHandler;
 import de.anomic.crawler.CrawlEntry;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.crawler.HTTPLoader;
-import de.anomic.crawler.ZURL;
 import de.anomic.http.JakartaCommonsHttpClient;
 import de.anomic.http.JakartaCommonsHttpResponse;
 import de.anomic.http.httpRequestHeader;
@@ -272,42 +271,20 @@ public class SitemapParser extends DefaultHandler {
         }
 
         // URL needs to crawled
-        String error = null;
-        error = this.sb.crawlStacker.stackCrawl(url,
-                null, // this.siteMapURL.toString(),
-                this.sb.webIndex.seedDB.mySeed().hash, this.nextURL, new Date(),
-                0, this.crawlingProfile);
-
-        if (error != null) {
-            try {
-                this.logger.logInfo("The URL '" + this.nextURL + "' can not be crawled. Reason: " + error);
-
-                // insert URL into the error DB
-                final ZURL.Entry ee = this.sb.crawlQueues.errorURL.newEntry(
-                        new CrawlEntry(
-                                sb.webIndex.seedDB.mySeed().hash,
-                                new yacyURL(this.nextURL, null),
-                                "",
-                                "",
-                                new Date(),
-                                null,
-                                0,
-                                0,
-                                0),
-                        this.sb.webIndex.seedDB.mySeed().hash,
-                        new Date(),
-                        1,
-                        error);
-                ee.store();
-                this.sb.crawlQueues.errorURL.push(ee);
-            } catch (final MalformedURLException e) {/* ignore this */
-            }
-        } else {
-            this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
-
-            // count successfully added URLs
-            this.urlCounter++;
-        }
+        this.sb.crawlStacker.enqueueEntry(new CrawlEntry(
+                this.sb.webIndex.seedDB.mySeed().hash,
+                url,
+                null, // this.siteMapURL.toString(),
+                this.nextURL,
+                new Date(),
+                null,
+                this.crawlingProfile.handle(),
+                0,
+                0,
+                0
+                ));
+        this.logger.logInfo("New URL '" + this.nextURL + "' added for crawling.");
+        this.urlCounter++;
     }
 }
m.remove("indexMedia"); + m.remove("remoteIndexing"); + m.remove("xsstopw"); + m.remove("xpstopw"); + m.remove("xdstopw"); + m.remove("storeTXCache"); + m.remove("storeHTCache"); + m.remove("generalFilter"); + m.remove("specificFilter"); + m.put("intention", "Automatic ReCrawl!"); + sb.webIndex.newsPool.publishMyNews(yacyNewsRecord.newRecord(sb.webIndex.seedDB.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m)); + } } catch (MalformedURLException e1) {} } // if } // while(bit.hasNext()) diff --git a/source/de/anomic/index/indexURLReference.java b/source/de/anomic/index/indexURLReference.java index 7e1c742d4..f767637c0 100644 --- a/source/de/anomic/index/indexURLReference.java +++ b/source/de/anomic/index/indexURLReference.java @@ -462,6 +462,7 @@ public class indexURLReference { comp().url(), referrerHash(), comp().dc_title(), + null, loaddate(), null, 0, diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 4fea492a4..d77218ac9 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -578,8 +578,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitchpublic static final String CRAWLSTACK = "82_crawlstack"

*

Name of the crawl stacker thread, performing several checks on new URLs to crawl, i.e. double-check

      */
-    public static final String CRAWLSTACK = "82_crawlstack";
+    public static final String CRAWLSTACK0 = "82_crawlstack";
+    public static final String CRAWLSTACK1 = "83_crawlstack";
     public static final String CRAWLSTACK_METHOD_START = "job";
     public static final String CRAWLSTACK_METHOD_JOBCOUNT = "size";
     public static final String CRAWLSTACK_METHOD_FREEMEM = null;
diff --git a/source/de/anomic/server/serverDomains.java b/source/de/anomic/server/serverDomains.java
index 1bdea4a77..3e3ee8f80 100644
--- a/source/de/anomic/server/serverDomains.java
+++ b/source/de/anomic/server/serverDomains.java
@@ -389,6 +389,7 @@ public class serverDomains {
     public static final int TLD_NorthAmericaOceania_ID = 4; // english-speaking countries
     public static final int TLD_Africa_ID = 5; // africa
     public static final int TLD_Generic_ID = 6; // anything else, also raw ip numbers
+    public static final int TLD_Local_ID = 7; // a local address
     
     static {
         // assign TLD-ids and names
@@ -552,7 +553,7 @@ public class serverDomains {
         }
         final Integer i = TLDID.get(tld);
         if (i == null) {
-            return (isLocal(host)) ? 7 : TLD_Generic_ID;
+            return (isLocal(host)) ? TLD_Local_ID : TLD_Generic_ID;
         }
         return i.intValue();
     }
diff --git a/source/de/anomic/server/serverInstantBlockingThread.java b/source/de/anomic/server/serverInstantBlockingThread.java
index c55506af8..a143012e1 100644
--- a/source/de/anomic/server/serverInstantBlockingThread.java
+++ b/source/de/anomic/server/serverInstantBlockingThread.java
@@ -76,7 +76,7 @@ public class serverInstantBlockingThread<J extends serverProcessorJob> extends serverAbstractBlockingThread<J>
     
     @SuppressWarnings("unchecked")
     public J job(final J next) throws Exception {
-        if (next == null) return null; // poison pill: shutdown
+        if (next == null || next == serverProcessorJob.poisonPill) return null; // poison pill: shutdown
         instantThreadCounter++;
         //System.out.println("started job " + this.handle + ": " + this.getName());
         jobs.put(this.handle, this.getName());
diff --git a/source/de/anomic/server/serverProcessorJob.java b/source/de/anomic/server/serverProcessorJob.java
index b8bd9ae0f..c91a88033 100644
--- a/source/de/anomic/server/serverProcessorJob.java
+++ b/source/de/anomic/server/serverProcessorJob.java
@@ -32,7 +32,7 @@ public class serverProcessorJob {
     public final static int STATUS_FINISHED = 3;
     public final static int STATUS_POISON = 99;
     
-    public int status = 0;
+    public int status = STATUS_INITIATED;
     
     public serverProcessorJob() {
         this.status = STATUS_INITIATED;
diff --git a/source/de/anomic/urlRedirector/urlRedirectord.java b/source/de/anomic/urlRedirector/urlRedirectord.java
index ce781ae14..8e40d7778 100644
--- a/source/de/anomic/urlRedirector/urlRedirectord.java
+++ b/source/de/anomic/urlRedirector/urlRedirectord.java
@@ -7,6 +7,7 @@ import java.io.PrintWriter;
 import java.net.MalformedURLException;
 import java.util.Date;
 
+import de.anomic.crawler.CrawlEntry;
 import de.anomic.crawler.CrawlProfile;
 import de.anomic.data.userDB;
 import de.anomic.http.HttpClient;
@@ -195,15 +196,18 @@ public class urlRedirectord implements serverHandler, Cloneable {
                     sb.crawlQueues.errorURL.remove(urlhash);
                     
                     // enqueuing URL for crawling
-                    sb.crawlStacker.enqueueEntry(
+                    sb.crawlStacker.enqueueEntry(new CrawlEntry(
+                        sb.webIndex.seedDB.mySeed().hash,
                         reqURL,
                         null,
-                        sb.webIndex.seedDB.mySeed().hash,
                         "URL Redirector",
                        new Date(),
+                        null,
+                        profile.handle(),
                         0,
-                        profile
-                    );
+                        0,
+                        0
+                    ));
                 } else {
                     reasonString = "Unsupported file extension";
                 }
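Closing note on the poisonPill comparison in serverInstantBlockingThread:
terminating a consumer that blocks on a queue is conventionally done by
enqueuing a dedicated sentinel object and letting the worker exit when it takes
it, which is what the identity check against serverProcessorJob.poisonPill
enables. A minimal, runnable sketch of the pattern; PoisonPillDemo, Job and
POISON_PILL are generic names, not YaCy's classes:

    import java.util.concurrent.BlockingQueue;
    import java.util.concurrent.LinkedBlockingQueue;

    // Minimal poison-pill shutdown: a sentinel instance wakes the worker out
    // of take() and tells it to stop; regular jobs are processed as usual.
    public final class PoisonPillDemo {
        static final class Job {
            final String payload;
            Job(final String payload) { this.payload = payload; }
        }
        static final Job POISON_PILL = new Job(null); // compared by identity

        public static void main(final String[] args) throws InterruptedException {
            final BlockingQueue<Job> queue = new LinkedBlockingQueue<Job>();
            final Thread worker = new Thread(new Runnable() {
                public void run() {
                    try {
                        while (true) {
                            final Job next = queue.take();   // blocks until work arrives
                            if (next == POISON_PILL) return; // shutdown signal
                            System.out.println("processed " + next.payload);
                        }
                    } catch (final InterruptedException e) {
                        Thread.currentThread().interrupt();
                    }
                }
            });
            worker.start();
            queue.put(new Job("a"));
            queue.put(new Job("b"));
            queue.put(POISON_PILL); // ask the worker to finish after pending jobs
            worker.join();
        }
    }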