From 3a807e10cf1b91170bde1c02e74468f47701aa9b Mon Sep 17 00:00:00 2001
From: orbiter
Date: Tue, 8 Nov 2011 15:38:08 +0000
Subject: [PATCH] - added a cache for active crawl profiles to the crawl
 switchboard

- moved the domain cache for domain counter from the crawl switchboard to
  the crawl profiles. the crawl domain counter is now therefore relative for
  each crawl start, not for the whole crawler.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8018 6c8d7289-2bf4-0310-a012-ef5d649a1542
---
 htroot/CrawlProfileEditor_p.java            |   2 +-
 source/de/anomic/crawler/CrawlProfile.java  |  61 ++++++++++
 source/de/anomic/crawler/CrawlStacker.java  |  57 +--------
 .../de/anomic/crawler/CrawlSwitchboard.java | 110 +++++++++++-------
 source/net/yacy/search/Switchboard.java     |   2 +-
 5 files changed, 133 insertions(+), 99 deletions(-)

diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java
index 660970d8b..a87c1b977 100644
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -261,7 +261,7 @@ public class CrawlProfileEditor_p {
         if (active && profile.domMaxPages() > 0
                 && profile.domMaxPages() != Integer.MAX_VALUE) {
             String item;
-            while (i <= domlistlength && !(item = crawlStacker.domName(true, i)).isEmpty()){
+            while (i <= domlistlength && !(item = profile.domName(true, i)).isEmpty()){
                 if (i == domlistlength) {
                     item += " ...";
                 }
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 53ce0e231..4c095ae67 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -25,6 +25,7 @@
 package de.anomic.crawler;
 
+import java.util.Iterator;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.regex.Pattern;
 
@@ -72,6 +73,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 
     private Pattern urlmustmatch = null, urlmustnotmatch = null, ipmustmatch = null, ipmustnotmatch = null;
 
+    public final static class DomProfile {
+
+        public String referrer;
+        public int depth, count;
+
+        public DomProfile(final String ref, final int d) {
+            this.referrer = ref;
+            this.depth = d;
+            this.count = 1;
+        }
+
+        public void inc() {
+            this.count++;
+        }
+
+    }
+
+    private final Map<String, DomProfile> doms;
+
     /**
      * Constructor which creates CrawlPofile from parameters.
      * @param name name of the crawl profile
@@ -121,6 +141,8 @@
         if (name == null || name.isEmpty()) {
             throw new NullPointerException("name must not be null or empty");
         }
+        this.doms = new ConcurrentHashMap<String, DomProfile>();
+
         final String handle = (startURL == null)
                 ? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength)
                 : ASCII.String(startURL.hash());
@@ -154,6 +176,45 @@
     public CrawlProfile(final Map<String, String> ext) {
         super(ext == null ? 1 : ext.size());
         if (ext != null) putAll(ext);
+        this.doms = new ConcurrentHashMap<String, DomProfile>();
+    }
+
+
+    public void domInc(final String domain, final String referrer, final int depth) {
+        final DomProfile dp = this.doms.get(domain);
+        if (dp == null) {
+            // new domain
+            this.doms.put(domain, new DomProfile(referrer, depth));
+        } else {
+            // increase counter
+            dp.inc();
+        }
+    }
+
+    public String domName(final boolean attr, final int index){
+        final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
+        String domname="";
+        Map.Entry<String, DomProfile> ey;
+        DomProfile dp;
+        int i = 0;
+        while ((domnamesi.hasNext()) && (i < index)) {
+            ey = domnamesi.next();
+            i++;
+        }
+        if (domnamesi.hasNext()) {
+            ey = domnamesi.next();
+            dp = ey.getValue();
+            domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
+        }
+        return domname;
+    }
+
+    public void clearDoms() {
+        this.doms.clear();
+    }
+
+    public DomProfile getDom(final String domain) {
+        return this.doms.get(domain);
     }
 
     /**
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 0e00ca4c2..ab53a7d6c 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -33,12 +33,10 @@ import java.net.InetAddress;
 import java.net.MalformedURLException;
 import java.net.UnknownHostException;
 import java.util.Date;
-import java.util.Iterator;
 import java.util.Locale;
 import java.util.Map;
 import java.util.Properties;
 import java.util.concurrent.BlockingQueue;
-import java.util.concurrent.ConcurrentHashMap;
 
 import net.yacy.cora.document.ASCII;
 import net.yacy.cora.document.MultiProtocolURI;
@@ -55,6 +53,7 @@ import net.yacy.repository.Blacklist;
 import net.yacy.repository.FilterEngine;
 import net.yacy.search.Switchboard;
 import net.yacy.search.index.Segment;
+import de.anomic.crawler.CrawlProfile.DomProfile;
 import de.anomic.crawler.ResultURLs.EventOrigin;
 import de.anomic.crawler.ZURL.FailCategory;
 import de.anomic.crawler.retrieval.FTPLoader;
@@ -71,29 +70,10 @@ public final class CrawlStacker {
     private final CrawlQueues nextQueue;
     private final CrawlSwitchboard crawler;
     private final Segment indexSegment;
-    private final SeedDB peers;
+    private final SeedDB peers;
     private final boolean acceptLocalURLs, acceptGlobalURLs;
     private final FilterEngine domainList;
 
-    public final static class DomProfile {
-
-        public String referrer;
-        public int depth, count;
-
-        public DomProfile(final String ref, final int d) {
-            this.referrer = ref;
-            this.depth = d;
-            this.count = 1;
-        }
-
-        public void inc() {
-            this.count++;
-        }
-
-    }
-
-    private final Map<String, DomProfile> doms;
-
     // this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
 
     public CrawlStacker(
@@ -116,37 +96,9 @@
         this.fastQueue = new WorkflowProcessor("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
         this.slowQueue = new WorkflowProcessor("CrawlStackerSlow", "This is like CrawlStackerFast, but does additionaly a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
-        this.doms = new ConcurrentHashMap<String, DomProfile>();
 
         this.log.logInfo("STACKCRAWL thread initialized.");
     }
 
-    private void domInc(final String domain, final String referrer, final int depth) {
-        final DomProfile dp = this.doms.get(domain);
-        if (dp == null) {
-            // new domain
-            this.doms.put(domain, new DomProfile(referrer, depth));
-        } else {
-            // increase counter
-            dp.inc();
-        }
-    }
 
-    public String domName(final boolean attr, final int index){
-        final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
-        String domname="";
-        Map.Entry<String, DomProfile> ey;
-        DomProfile dp;
-        int i = 0;
-        while ((domnamesi.hasNext()) && (i < index)) {
-            ey = domnamesi.next();
-            i++;
-        }
-        if (domnamesi.hasNext()) {
-            ey = domnamesi.next();
-            dp = ey.getValue();
-            domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
-        }
-        return domname;
-    }
 
     public int size() {
         return this.fastQueue.queueSize() + this.slowQueue.queueSize();
@@ -160,7 +112,6 @@
     public void clear() {
         this.fastQueue.clear();
         this.slowQueue.clear();
-        this.doms.clear();
     }
 
     public void announceClose() {
@@ -412,7 +363,7 @@
 
         // add domain to profile domain list
         if (profile.domMaxPages() != Integer.MAX_VALUE) {
-            domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
+            profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
         }
 
         if (global) {
@@ -520,7 +471,7 @@
         // deny urls that exceed allowed number of occurrences
         final int maxAllowedPagesPerDomain = profile.domMaxPages();
         if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) {
-            final DomProfile dp = this.doms.get(url.getHost());
+            final DomProfile dp = profile.getDom(url.getHost());
             if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
                 if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
                 return "crawl stack domain counter exceeded";
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index ad4752697..d056c8e88 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -28,14 +28,18 @@ package de.anomic.crawler;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Collections;
 import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
 
 import net.yacy.cora.document.UTF8;
 import net.yacy.cora.services.federated.yacy.CacheStrategy;
 import net.yacy.kelondro.blob.MapHeap;
 import net.yacy.kelondro.data.word.Word;
+import net.yacy.kelondro.index.RowSpaceExceededException;
 import net.yacy.kelondro.logging.Log;
+import net.yacy.kelondro.order.Base64Order;
 import net.yacy.kelondro.order.NaturalOrder;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.kelondroException;
@@ -53,7 +57,6 @@ public final class CrawlSwitchboard {
 
     public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
     public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
-    public static final String DBFILE_INVALID_CRAWL_PROFILES = "crawlProfilesInvalid.heap";
 
     public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
     public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@@ -63,8 +66,9 @@ public final class CrawlSwitchboard {
     public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
 
     private final Log log;
-    private Map<byte[], Map<String, String>> profilesActiveCrawls;
-    private final Map<byte[], Map<String, String>> profilesPassiveCrawls, profilesInvalidCrawls;
+    private MapHeap profilesActiveCrawls;
+    private final MapHeap profilesPassiveCrawls;
+    private final Map<byte[], CrawlProfile> profilesActiveCrawlsCache; //TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder);
     public CrawlProfile defaultProxyProfile;
     public CrawlProfile defaultRemoteProfile;
     public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
@@ -84,28 +88,31 @@
             System.exit(0);
         }
         this.log = log;
+        this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
 
         // make crawl profiles database and default profiles
         this.queuesRoot = queuesRoot;
         this.queuesRoot.mkdirs();
         this.log.logConfig("Initializing Crawl Profiles");
 
-        final File profilesInvalidFile = new File(queuesRoot, DBFILE_INVALID_CRAWL_PROFILES);
-        this.profilesInvalidCrawls = loadFromDB(profilesInvalidFile);
-
         final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
         this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
         for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
-            final CrawlProfile p;
-            p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
+            CrawlProfile p;
+            try {
+                p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
+            } catch (final IOException e) {
+                p = null;
+            } catch (final RowSpaceExceededException e) {
+                p = null;
+            }
+            if (p == null) continue;
             if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
                 removeActive(handle);
-                putInvalid(handle, p);
                 Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
                         + " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
                         + " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
             } else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
-                putInvalid(handle, p);
                 removeActive(handle);
                 Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
                         + " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
@@ -121,8 +128,15 @@
         final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
         this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
         for (final byte[] handle : this.profilesPassiveCrawls.keySet()) {
-            final CrawlProfile p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
-            Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
+            CrawlProfile p;
+            try {
+                p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
+                Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
+            } catch (final IOException e) {
+                continue;
+            } catch (final RowSpaceExceededException e) {
+                continue;
+            }
         }
         log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
                 ", " + this.profilesPassiveCrawls.size() + " entries" +
@@ -131,21 +145,35 @@
 
     public CrawlProfile getActive(final byte[] profileKey) {
         if (profileKey == null) return null;
-        final Map<String, String> m = this.profilesActiveCrawls.get(profileKey);
-        if (m == null) return null;
-        return new CrawlProfile(m);
-    }
+        // get from cache
+        CrawlProfile p = this.profilesActiveCrawlsCache.get(profileKey);
+        if (p != null) return p;
 
-    public CrawlProfile getInvalid(final byte[] profileKey) {
-        if (profileKey == null) return null;
-        final Map<String, String> m = this.profilesInvalidCrawls.get(profileKey);
+        // get from db
+        Map<String, String> m;
+        try {
+            m = this.profilesActiveCrawls.get(profileKey);
+        } catch (final IOException e) {
+            m = null;
+        } catch (final RowSpaceExceededException e) {
+            m = null;
+        }
         if (m == null) return null;
-        return new CrawlProfile(m);
+        p = new CrawlProfile(m);
+        this.profilesActiveCrawlsCache.put(profileKey, p);
+        return p;
     }
 
     public CrawlProfile getPassive(final byte[] profileKey) {
         if (profileKey == null) return null;
-        final Map<String, String> m = this.profilesPassiveCrawls.get(profileKey);
+        Map<String, String> m;
+        try {
+            m = this.profilesPassiveCrawls.get(profileKey);
+        } catch (final IOException e) {
+            m = null;
+        } catch (final RowSpaceExceededException e) {
+            m = null;
+        }
         if (m == null) return null;
         return new CrawlProfile(m);
     }
@@ -154,24 +182,16 @@
         return this.profilesActiveCrawls.keySet();
     }
 
-    public Set<byte[]> getInvalid() {
-        return this.profilesInvalidCrawls.keySet();
-    }
-
     public Set<byte[]> getPassive() {
         return this.profilesPassiveCrawls.keySet();
     }
 
     public void removeActive(final byte[] profileKey) {
         if (profileKey == null) return;
+        this.profilesActiveCrawlsCache.remove(profileKey);
         this.profilesActiveCrawls.remove(profileKey);
     }
 
-    public void removeInvalid(final byte[] profileKey) {
-        if (profileKey == null) return;
-        this.profilesInvalidCrawls.remove(profileKey);
-    }
-
     public void removePassive(final byte[] profileKey) {
         if (profileKey == null) return;
         this.profilesPassiveCrawls.remove(profileKey);
@@ -179,19 +199,13 @@
 
     public void putActive(final byte[] profileKey, final CrawlProfile profile) {
         this.profilesActiveCrawls.put(profileKey, profile);
-    }
-
-    public void putInvalid(final byte[] profileKey, final CrawlProfile profile) {
-        this.profilesInvalidCrawls.put(profileKey, profile);
+        this.profilesActiveCrawlsCache.put(profileKey, profile);
     }
 
     public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
         this.profilesPassiveCrawls.put(profileKey, profile);
     }
 
-    public void clear() {
-    }
-
     private void initActiveCrawlProfiles() {
         this.defaultProxyProfile = null;
         this.defaultRemoteProfile = null;
@@ -282,6 +296,7 @@ public final class CrawlSwitchboard {
     }
 
     private void resetProfiles() {
+        this.profilesActiveCrawlsCache.clear();
        final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
        if (pdb.exists()) FileUtils.deletedelete(pdb);
        try {
@@ -293,7 +308,8 @@ public final class CrawlSwitchboard {
        initActiveCrawlProfiles();
     }
 
-    public boolean cleanProfiles() throws InterruptedException {
+    public boolean clear() throws InterruptedException {
+        this.profilesActiveCrawlsCache.clear();
         CrawlProfile entry;
         boolean hasDoneSomething = false;
         try {
@@ -302,7 +318,13 @@ public final class CrawlSwitchboard {
                 if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
 
                 // getting next profile
-                entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
+                try {
+                    entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
+                } catch (final IOException e) {
+                    continue;
+                } catch (final RowSpaceExceededException e) {
+                    continue;
+                }
                 if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
                       (entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
                       (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
@@ -325,9 +347,9 @@ public final class CrawlSwitchboard {
     }
 
     public void close() {
-        ((MapHeap) this.profilesActiveCrawls).close();
-        ((MapHeap) this.profilesInvalidCrawls).close();
-        ((MapHeap) this.profilesPassiveCrawls).close();
+        this.profilesActiveCrawlsCache.clear();
+        this.profilesActiveCrawls.close();
+        this.profilesPassiveCrawls.close();
     }
 
     /**
@@ -336,8 +358,8 @@ public final class CrawlSwitchboard {
      * @param file DB file
      * @return crawl profile data
      */
-    private Map<byte[], Map<String, String>> loadFromDB(final File file) {
-        Map<byte[], Map<String, String>> ret;
+    private MapHeap loadFromDB(final File file) {
+        MapHeap ret;
         try {
             ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
         } catch (final IOException e) {
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 1ab45c21e..64c8308b5 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -1283,7 +1283,7 @@ public final class Switchboard extends serverSwitch {
             (this.crawlStacker != null && !this.crawlStacker.isEmpty()) ||
             this.crawlQueues.noticeURL.notEmpty())
             return false;
-        return this.crawler.cleanProfiles();
+        return this.crawler.clear();
     }
 
     public void close() {
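
Taken together, the CrawlProfile and CrawlStacker hunks implement one idea: the per-domain page counter now lives inside the profile that started the crawl, so the domMaxPages limit is enforced per crawl start instead of once for the whole crawler. The following self-contained sketch condenses that scheme into a runnable toy; DomProfile mirrors the class added to CrawlProfile above, while PerProfileDomainCounter, acceptMore() and the limit of 3 pages are illustrative stand-ins that do not appear in the patch.

    import java.util.Map;
    import java.util.concurrent.ConcurrentHashMap;

    // Toy model of the per-profile domain counter that the patch moves into
    // CrawlProfile. DomProfile mirrors the class added above; the surrounding
    // class, acceptMore() and the demo limit of 3 are illustrative only.
    public class PerProfileDomainCounter {

        static final class DomProfile {
            final String referrer;
            final int depth;
            int count;

            DomProfile(final String referrer, final int depth) {
                this.referrer = referrer;
                this.depth = depth;
                this.count = 1; // a domain is entered on its first stacked URL
            }
        }

        // one map per profile, so the counts live and die with the crawl start
        private final Map<String, DomProfile> doms = new ConcurrentHashMap<String, DomProfile>();
        private final int domMaxPages;

        PerProfileDomainCounter(final int domMaxPages) {
            this.domMaxPages = domMaxPages;
        }

        // counterpart of CrawlProfile.domInc(): count one stacked URL per host;
        // like the patch, this is a plain get-then-put, so a concurrent first
        // hit on the same host could lose a single count
        void domInc(final String host, final String referrer, final int depth) {
            final DomProfile dp = this.doms.get(host);
            if (dp == null) {
                this.doms.put(host, new DomProfile(referrer, depth));
            } else {
                dp.count++;
            }
        }

        // counterpart of the deny check in CrawlStacker: refuse further URLs
        // from a host once domMaxPages is reached
        boolean acceptMore(final String host) {
            final DomProfile dp = this.doms.get(host);
            return dp == null || dp.count < this.domMaxPages;
        }

        public static void main(final String[] args) {
            final PerProfileDomainCounter profileA = new PerProfileDomainCounter(3);
            for (int i = 0; i < 5; i++) {
                System.out.println("accept example.org: " + profileA.acceptMore("example.org"));
                profileA.domInc("example.org", null, 0);
            }
            // a second profile keeps its own counters, unaffected by profileA
            final PerProfileDomainCounter profileB = new PerProfileDomainCounter(3);
            System.out.println("fresh profile accepts: " + profileB.acceptMore("example.org"));
        }
    }

Because each crawl start gets its own CrawlProfile (which is what the commit message relies on) and the map is an instance field initialized in both constructors, the counters are discarded together with the profile; no crawler-wide reset such as the removed this.doms.clear() in CrawlStacker.clear() is needed anymore.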
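The CrawlSwitchboard hunks add a read-through cache in front of the MapHeap-backed profile store: getActive() consults an in-memory map first, only then deserializes from the heap file, and putActive()/removeActive()/clear()/close() keep the cache in sync. The sketch below shows the same lookup-then-populate order under stated assumptions: ProfileStore stands in for MapHeap, the profile is kept as a plain Map<String, String>, and the lexicographic byte[] comparator stands in for Base64Order.enhancedCoder; none of these names are part of the patch.

    import java.io.IOException;
    import java.nio.charset.StandardCharsets;
    import java.util.Collections;
    import java.util.Comparator;
    import java.util.Map;
    import java.util.TreeMap;

    // Toy model of the read-through cache added to CrawlSwitchboard.getActive().
    // ProfileStore stands in for the MapHeap-backed profile database; the
    // comparator stands in for Base64Order.enhancedCoder.
    public class ProfileCacheSketch {

        interface ProfileStore {
            Map<String, String> get(byte[] key) throws IOException; // may read from disk
        }

        // byte[] has identity-based equals/hashCode, so a HashMap would never
        // hit; like the patch, use an ordered map with an explicit comparator
        private static final Comparator<byte[]> HANDLE_ORDER = new Comparator<byte[]>() {
            public int compare(final byte[] a, final byte[] b) {
                final int n = Math.min(a.length, b.length);
                for (int i = 0; i < n; i++) {
                    final int d = (a[i] & 0xff) - (b[i] & 0xff);
                    if (d != 0) return d;
                }
                return a.length - b.length;
            }
        };

        private final Map<byte[], Map<String, String>> cache =
                Collections.synchronizedMap(new TreeMap<byte[], Map<String, String>>(HANDLE_ORDER));
        private final ProfileStore store;

        ProfileCacheSketch(final ProfileStore store) {
            this.store = store;
        }

        Map<String, String> getActive(final byte[] key) {
            Map<String, String> p = this.cache.get(key);   // 1) try the cache
            if (p != null) return p;
            try {
                p = this.store.get(key);                   // 2) fall back to the store
            } catch (final IOException e) {
                p = null;                                  //    treat store errors as "absent"
            }
            if (p == null) return null;
            this.cache.put(key, p);                        // 3) remember for the next lookup
            return p;
        }

        void removeActive(final byte[] key) {
            this.cache.remove(key); // invalidate first, as the patch does in removeActive()
            // removal from the backing store would follow here
        }

        public static void main(final String[] args) {
            final Map<String, String> profile = Collections.singletonMap("name", "example");
            final ProfileCacheSketch sb = new ProfileCacheSketch(new ProfileStore() {
                public Map<String, String> get(final byte[] key) {
                    return profile; // toy store: always finds the same profile
                }
            });
            final byte[] handle = "exampleHandle".getBytes(StandardCharsets.UTF_8);
            System.out.println(sb.getActive(handle).get("name"));        // loaded via the store
            System.out.println(sb.getActive(handle.clone()) == profile); // cache hit despite a different array
        }
    }

The TreeMap-plus-comparator choice is not incidental: since byte[] inherits identity equals and hashCode, a HashMap keyed on profile handles would practically never hit, which is presumably why the patch pairs TreeMap with Base64Order.enhancedCoder rather than using a hash map for the cache.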