- added a cache for active crawl profiles to the crawl switchboard

- moved the domain counter cache from the crawl stacker into the crawl profiles; the crawl domain counter is therefore now relative to each crawl start, not global to the whole crawler (a short usage sketch follows the commit metadata below)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8018 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 13 years ago
parent 37e35f2741
commit 3a807e10cf
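For orientation, here is a minimal sketch of how the moved counter is meant to be used, assembled from the hunks that follow. It is an illustration only, not code from this commit: the helper countAndCheck and its parameters (handle, url, referrerURL, depth) are hypothetical stand-ins for what the caller (e.g. CrawlStacker.stackCrawl) already has at hand, and DigestURI is assumed to be the URL type used there.

    // Hypothetical helper for illustration -- not part of this commit.
    // Assumed to live inside CrawlStacker, where this.crawler is the CrawlSwitchboard.
    private String countAndCheck(final byte[] handle, final DigestURI url,
                                 final DigestURI referrerURL, final int depth) {
        // with this commit, repeated lookups of the same handle should return the same
        // cached CrawlProfile instance, so its domain map can accumulate counts
        final CrawlProfile profile = this.crawler.getActive(handle);
        if (profile == null) return "no profile";

        // count one stacked URL against its host (formerly CrawlStacker.domInc)
        if (profile.domMaxPages() != Integer.MAX_VALUE) {
            profile.domInc(url.getHost(),
                           (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(),
                           depth);
        }

        // reject further URLs of a host that has reached its per-crawl page limit
        final CrawlProfile.DomProfile dp = profile.getDom(url.getHost());
        if (dp != null && dp.count >= profile.domMaxPages()) {
            return "crawl stack domain counter exceeded";
        }
        return null; // accepted
    }

Because the domain map now lives inside the CrawlProfile object instead of the long-lived CrawlStacker, a new crawl start begins with an empty counter and clearDoms() can reset it explicitly; the new profilesActiveCrawlsCache in CrawlSwitchboard is presumably what keeps getActive() from rebuilding the profile (and thereby discarding the counter) on every lookup.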

@@ -261,7 +261,7 @@ public class CrawlProfileEditor_p {
if (active && profile.domMaxPages() > 0
&& profile.domMaxPages() != Integer.MAX_VALUE) {
String item;
while (i <= domlistlength && !(item = crawlStacker.domName(true, i)).isEmpty()){
while (i <= domlistlength && !(item = profile.domName(true, i)).isEmpty()){
if (i == domlistlength) {
item += " ...";
}

@@ -25,6 +25,7 @@
package de.anomic.crawler;
import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.regex.Pattern;
@@ -72,6 +73,25 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private Pattern urlmustmatch = null, urlmustnotmatch = null, ipmustmatch = null, ipmustnotmatch = null;
public final static class DomProfile {
public String referrer;
public int depth, count;
public DomProfile(final String ref, final int d) {
this.referrer = ref;
this.depth = d;
this.count = 1;
}
public void inc() {
this.count++;
}
}
private final Map<String, DomProfile> doms;
/**
* Constructor which creates CrawlProfile from parameters.
* @param name name of the crawl profile
@@ -121,6 +141,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty");
}
this.doms = new ConcurrentHashMap<String, DomProfile>();
final String handle = (startURL == null)
? Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength)
: ASCII.String(startURL.hash());
@@ -154,6 +176,45 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public CrawlProfile(final Map<String, String> ext) {
super(ext == null ? 1 : ext.size());
if (ext != null) putAll(ext);
this.doms = new ConcurrentHashMap<String, DomProfile>();
}
public void domInc(final String domain, final String referrer, final int depth) {
final DomProfile dp = this.doms.get(domain);
if (dp == null) {
// new domain
this.doms.put(domain, new DomProfile(referrer, depth));
} else {
// increase counter
dp.inc();
}
}
public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
String domname="";
Map.Entry<String, DomProfile> ey;
DomProfile dp;
int i = 0;
while ((domnamesi.hasNext()) && (i < index)) {
ey = domnamesi.next();
i++;
}
if (domnamesi.hasNext()) {
ey = domnamesi.next();
dp = ey.getValue();
domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
}
return domname;
}
public void clearDoms() {
this.doms.clear();
}
public DomProfile getDom(final String domain) {
return this.doms.get(domain);
}
/**

@@ -33,12 +33,10 @@ import java.net.InetAddress;
import java.net.MalformedURLException;
import java.net.UnknownHostException;
import java.util.Date;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@@ -55,6 +53,7 @@ import net.yacy.repository.Blacklist;
import net.yacy.repository.FilterEngine;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Segment;
import de.anomic.crawler.CrawlProfile.DomProfile;
import de.anomic.crawler.ResultURLs.EventOrigin;
import de.anomic.crawler.ZURL.FailCategory;
import de.anomic.crawler.retrieval.FTPLoader;
@@ -71,29 +70,10 @@ public final class CrawlStacker {
private final CrawlQueues nextQueue;
private final CrawlSwitchboard crawler;
private final Segment indexSegment;
private final SeedDB peers;
private final boolean acceptLocalURLs, acceptGlobalURLs;
private final FilterEngine domainList;
public final static class DomProfile {
public String referrer;
public int depth, count;
public DomProfile(final String ref, final int d) {
this.referrer = ref;
this.depth = d;
this.count = 1;
}
public void inc() {
this.count++;
}
}
private final Map<String, DomProfile> doms;
// this is the process that checks url for double-occurrences and for allowance/disallowance by robots.txt
public CrawlStacker(
@@ -116,37 +96,9 @@ public final class CrawlStacker {
this.fastQueue = new WorkflowProcessor<Request>("CrawlStackerFast", "This process checks new urls before they are enqueued into the balancer (proper, double-check, correct domain, filter)", new String[]{"Balancer"}, this, "job", 10000, null, 2);
this.slowQueue = new WorkflowProcessor<Request>("CrawlStackerSlow", "This is like CrawlStackerFast, but additionally does a DNS lookup. The CrawlStackerFast does not need this because it can use the DNS cache.", new String[]{"Balancer"}, this, "job", 1000, null, 5);
this.doms = new ConcurrentHashMap<String, DomProfile>();
this.log.logInfo("STACKCRAWL thread initialized.");
}
private void domInc(final String domain, final String referrer, final int depth) {
final DomProfile dp = this.doms.get(domain);
if (dp == null) {
// new domain
this.doms.put(domain, new DomProfile(referrer, depth));
} else {
// increase counter
dp.inc();
}
}
public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = this.doms.entrySet().iterator();
String domname="";
Map.Entry<String, DomProfile> ey;
DomProfile dp;
int i = 0;
while ((domnamesi.hasNext()) && (i < index)) {
ey = domnamesi.next();
i++;
}
if (domnamesi.hasNext()) {
ey = domnamesi.next();
dp = ey.getValue();
domname = ey.getKey() + ((attr) ? ("/r=" + dp.referrer + ", d=" + dp.depth + ", c=" + dp.count) : " ");
}
return domname;
}
public int size() {
return this.fastQueue.queueSize() + this.slowQueue.queueSize();
@@ -160,7 +112,6 @@ public final class CrawlStacker {
public void clear() {
this.fastQueue.clear();
this.slowQueue.clear();
this.doms.clear();
}
public void announceClose() {
@@ -412,7 +363,7 @@ public final class CrawlStacker {
// add domain to profile domain list
if (profile.domMaxPages() != Integer.MAX_VALUE) {
domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
}
if (global) {
@@ -520,7 +471,7 @@ public final class CrawlStacker {
// deny urls that exceed allowed number of occurrences
final int maxAllowedPagesPerDomain = profile.domMaxPages();
if (maxAllowedPagesPerDomain < Integer.MAX_VALUE) {
final DomProfile dp = this.doms.get(url.getHost());
final DomProfile dp = profile.getDom(url.getHost());
if (dp != null && dp.count >= maxAllowedPagesPerDomain) {
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' appeared too often in crawl stack, a maximum of " + profile.domMaxPages() + " is allowed.");
return "crawl stack domain counter exceeded";

@@ -28,14 +28,18 @@ package de.anomic.crawler;
import java.io.File;
import java.io.IOException;
import java.util.Collections;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.kelondro.blob.MapHeap;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.NaturalOrder;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.kelondroException;
@@ -53,7 +57,6 @@ public final class CrawlSwitchboard {
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final String DBFILE_INVALID_CRAWL_PROFILES = "crawlProfilesInvalid.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
@@ -63,8 +66,9 @@ public final class CrawlSwitchboard {
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
private final Log log;
private Map<byte[], Map<String, String>> profilesActiveCrawls;
private final Map<byte[], Map<String, String>> profilesPassiveCrawls, profilesInvalidCrawls;
private MapHeap profilesActiveCrawls;
private final MapHeap profilesPassiveCrawls;
private final Map<byte[], CrawlProfile> profilesActiveCrawlsCache; //TreeMap<byte[], DigestURI>(Base64Order.enhancedCoder);
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
@@ -84,28 +88,31 @@ public final class CrawlSwitchboard {
System.exit(0);
}
this.log = log;
this.profilesActiveCrawlsCache = Collections.synchronizedMap(new TreeMap<byte[], CrawlProfile>(Base64Order.enhancedCoder));
// make crawl profiles database and default profiles
this.queuesRoot = queuesRoot;
this.queuesRoot.mkdirs();
this.log.logConfig("Initializing Crawl Profiles");
final File profilesInvalidFile = new File(queuesRoot, DBFILE_INVALID_CRAWL_PROFILES);
this.profilesInvalidCrawls = loadFromDB(profilesInvalidFile);
final File profilesActiveFile = new File(queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
this.profilesActiveCrawls = loadFromDB(profilesActiveFile);
for (final byte[] handle : this.profilesActiveCrawls.keySet()) {
final CrawlProfile p;
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
CrawlProfile p;
try {
p = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException e) {
p = null;
} catch (final RowSpaceExceededException e) {
p = null;
}
if (p == null) continue;
if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTMATCH))) {
removeActive(handle);
putInvalid(handle, p);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTMATCH
+ " is no valid regular expression: " + p.get(CrawlProfile.FILTER_URL_MUSTMATCH));
} else if (!RegexHelper.isValidRegex(p.get(CrawlProfile.FILTER_URL_MUSTNOTMATCH))) {
putInvalid(handle, p);
removeActive(handle);
Log.logWarning("CrawlProfiles", "removed Profile " + p.handle() + ": " + p.name()
+ " from active crawls since " + CrawlProfile.FILTER_URL_MUSTNOTMATCH
@@ -121,8 +128,15 @@ public final class CrawlSwitchboard {
final File profilesPassiveFile = new File(queuesRoot, DBFILE_PASSIVE_CRAWL_PROFILES);
this.profilesPassiveCrawls = loadFromDB(profilesPassiveFile);
for (final byte[] handle : this.profilesPassiveCrawls.keySet()) {
final CrawlProfile p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
CrawlProfile p;
try {
p = new CrawlProfile(this.profilesPassiveCrawls.get(handle));
Log.logInfo("CrawlProfiles", "loaded Profile " + p.handle() + ": " + p.name());
} catch (final IOException e) {
continue;
} catch (final RowSpaceExceededException e) {
continue;
}
}
log.logInfo("Loaded passive crawl profiles from file " + profilesPassiveFile.getName() +
", " + this.profilesPassiveCrawls.size() + " entries" +
@@ -131,21 +145,35 @@ public final class CrawlSwitchboard {
public CrawlProfile getActive(final byte[] profileKey) {
if (profileKey == null) return null;
final Map<String, String> m = this.profilesActiveCrawls.get(profileKey);
if (m == null) return null;
return new CrawlProfile(m);
}
// get from cache
CrawlProfile p = this.profilesActiveCrawlsCache.get(profileKey);
if (p != null) return p;
public CrawlProfile getInvalid(final byte[] profileKey) {
if (profileKey == null) return null;
final Map<String, String> m = this.profilesInvalidCrawls.get(profileKey);
// get from db
Map<String, String> m;
try {
m = this.profilesActiveCrawls.get(profileKey);
} catch (final IOException e) {
m = null;
} catch (final RowSpaceExceededException e) {
m = null;
}
if (m == null) return null;
return new CrawlProfile(m);
p = new CrawlProfile(m);
this.profilesActiveCrawlsCache.put(profileKey, p);
return p;
}
public CrawlProfile getPassive(final byte[] profileKey) {
if (profileKey == null) return null;
final Map<String, String> m = this.profilesPassiveCrawls.get(profileKey);
Map<String, String> m;
try {
m = this.profilesPassiveCrawls.get(profileKey);
} catch (final IOException e) {
m = null;
} catch (final RowSpaceExceededException e) {
m = null;
}
if (m == null) return null;
return new CrawlProfile(m);
}
@@ -154,24 +182,16 @@ public final class CrawlSwitchboard {
return this.profilesActiveCrawls.keySet();
}
public Set<byte[]> getInvalid() {
return this.profilesInvalidCrawls.keySet();
}
public Set<byte[]> getPassive() {
return this.profilesPassiveCrawls.keySet();
}
public void removeActive(final byte[] profileKey) {
if (profileKey == null) return;
this.profilesActiveCrawlsCache.remove(profileKey);
this.profilesActiveCrawls.remove(profileKey);
}
public void removeInvalid(final byte[] profileKey) {
if (profileKey == null) return;
this.profilesInvalidCrawls.remove(profileKey);
}
public void removePassive(final byte[] profileKey) {
if (profileKey == null) return;
this.profilesPassiveCrawls.remove(profileKey);
@@ -179,19 +199,13 @@ public final class CrawlSwitchboard {
public void putActive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesActiveCrawls.put(profileKey, profile);
}
public void putInvalid(final byte[] profileKey, final CrawlProfile profile) {
this.profilesInvalidCrawls.put(profileKey, profile);
this.profilesActiveCrawlsCache.put(profileKey, profile);
}
public void putPassive(final byte[] profileKey, final CrawlProfile profile) {
this.profilesPassiveCrawls.put(profileKey, profile);
}
public void clear() {
}
private void initActiveCrawlProfiles() {
this.defaultProxyProfile = null;
this.defaultRemoteProfile = null;
@@ -282,6 +296,7 @@ public final class CrawlSwitchboard {
}
private void resetProfiles() {
this.profilesActiveCrawlsCache.clear();
final File pdb = new File(this.queuesRoot, DBFILE_ACTIVE_CRAWL_PROFILES);
if (pdb.exists()) FileUtils.deletedelete(pdb);
try {
@@ -293,7 +308,8 @@ public final class CrawlSwitchboard {
initActiveCrawlProfiles();
}
public boolean cleanProfiles() throws InterruptedException {
public boolean clear() throws InterruptedException {
this.profilesActiveCrawlsCache.clear();
CrawlProfile entry;
boolean hasDoneSomething = false;
try {
@@ -302,7 +318,13 @@ public final class CrawlSwitchboard {
if (Thread.currentThread().isInterrupted()) throw new InterruptedException("Shutdown in progress");
// getting next profile
entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
try {
entry = new CrawlProfile(this.profilesActiveCrawls.get(handle));
} catch (final IOException e) {
continue;
} catch (final RowSpaceExceededException e) {
continue;
}
if (!((entry.name().equals(CRAWL_PROFILE_PROXY)) ||
(entry.name().equals(CRAWL_PROFILE_REMOTE)) ||
(entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT)) ||
@@ -325,9 +347,9 @@ public final class CrawlSwitchboard {
public void close() {
((MapHeap) this.profilesActiveCrawls).close();
((MapHeap) this.profilesInvalidCrawls).close();
((MapHeap) this.profilesPassiveCrawls).close();
this.profilesActiveCrawlsCache.clear();
this.profilesActiveCrawls.close();
this.profilesPassiveCrawls.close();
}
@@ -336,8 +358,8 @@
* @param file DB file
* @return crawl profile data
*/
private Map<byte[], Map<String, String>> loadFromDB(final File file) {
Map<byte[], Map<String, String>> ret;
private MapHeap loadFromDB(final File file) {
MapHeap ret;
try {
ret = new MapHeap(file, Word.commonHashLength, NaturalOrder.naturalOrder, 1024 * 64, 500, ' ');
} catch (final IOException e) {

@@ -1283,7 +1283,7 @@ public final class Switchboard extends serverSwitch {
(this.crawlStacker != null && !this.crawlStacker.isEmpty()) ||
this.crawlQueues.noticeURL.notEmpty())
return false;
return this.crawler.cleanProfiles();
return this.crawler.clear();
}
public void close() {
