preparation for greedy crawl profiles and refactoring

pull/1/head
Michael Peter Christen 12 years ago
parent e6f361f474
commit f1c5338210

@ -32,6 +32,7 @@ import java.util.TreeMap;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
@ -124,7 +125,7 @@ public class CrawlProfileEditor_p {
final Map<String, String> orderdHandles = new TreeMap<String, String>();
for (final byte[] h : sb.crawler.getActive()) {
selentry = sb.crawler.getActive(h);
if (selentry != null && !CrawlProfile.ignoreNames.contains(selentry.name())) {
if (selentry != null && !CrawlSwitchboard.DEFAULT_PROFILES.contains(selentry.name())) {
orderdHandles.put(selentry.collectionName(), selentry.handle());
}
}

@ -40,6 +40,7 @@ import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.data.ZURL.FailCategory;
@ -532,7 +533,7 @@ public class Crawler_p {
String hosts = "";
for (final byte[] h: sb.crawler.getActive()) {
profile = sb.crawler.getActive(h);
if (CrawlProfile.ignoreNames.contains(profile.name())) continue;
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(profile.name())) continue;
profile.putProfileEntry("crawlProfilesShow_list_", prop, true, dark, count, domlistlength);
prop.put("crawlProfilesShow_list_" + count + "_debug", debug ? 1 : 0);
if (debug) {

@ -68,14 +68,7 @@ public class IndexCreateQueues_p {
for (final byte[] handle: sb.crawler.getActive()) {
entry = sb.crawler.getActive(handle);
final String name = entry.name();
if (name.equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
name.equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
continue;
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(name)) continue;
if (compiledPattern.matcher(name).find()) sb.crawler.removeActive(entry.handle().getBytes());
}
} else {

@ -56,14 +56,7 @@ public class WatchWebStructure_p {
CrawlProfile e;
for (final byte[] handle: sb.crawler.getActive()) {
e = sb.crawler.getActive(handle);
if (e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA) ||
e.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE))
continue;
if (CrawlSwitchboard.DEFAULT_PROFILES.contains(e.name())) continue;
host = e.name();
break; // take the first one
}

@ -60,16 +60,30 @@ public final class CrawlSwitchboard {
public static final String CRAWL_PROFILE_REMOTE = "remote";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_TEXT = "snippetLocalText";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT = "snippetGlobalText";
public static final String CRAWL_PROFILE_GREEDY_LEARNING_TEXT = "snippetGreedyLearningText";
public static final String CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA = "snippetLocalMedia";
public static final String CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA = "snippetGlobalMedia";
public static final String CRAWL_PROFILE_SURROGATE = "surrogates";
/**
 * Names of the built-in crawl profiles declared by the CRAWL_PROFILE_*
 * constants of this class. The set is filled exactly once by the static
 * initializer below; the reference is final so that it cannot be swapped
 * out for a different set at runtime.
 */
public static final Set<String> DEFAULT_PROFILES = new HashSet<String>();
static {
    DEFAULT_PROFILES.add(CRAWL_PROFILE_PROXY);
    DEFAULT_PROFILES.add(CRAWL_PROFILE_REMOTE);
    DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
    DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
    DEFAULT_PROFILES.add(CRAWL_PROFILE_GREEDY_LEARNING_TEXT);
    DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
    DEFAULT_PROFILES.add(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
    DEFAULT_PROFILES.add(CRAWL_PROFILE_SURROGATE);
}
public static final String DBFILE_ACTIVE_CRAWL_PROFILES = "crawlProfilesActive.heap";
public static final String DBFILE_PASSIVE_CRAWL_PROFILES = "crawlProfilesPassive.heap";
public static final long CRAWL_PROFILE_PROXY_RECRAWL_CYCLE = 60L * 24L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE = 60L * 24L * 30L;
public static final long CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE = 60L * 24L * 30L;
@ -82,6 +96,7 @@ public final class CrawlSwitchboard {
public CrawlProfile defaultProxyProfile;
public CrawlProfile defaultRemoteProfile;
public CrawlProfile defaultTextSnippetLocalProfile, defaultTextSnippetGlobalProfile;
public CrawlProfile defaultTextGreedyLearningProfile;
public CrawlProfile defaultMediaSnippetLocalProfile, defaultMediaSnippetGlobalProfile;
public CrawlProfile defaultSurrogateProfile;
private final File queuesRoot;
@ -344,6 +359,34 @@ public final class CrawlSwitchboard {
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CacheStrategy.IFEXIST);
// generate new default entry for greedy learning
this.defaultTextGreedyLearningProfile =
    new CrawlProfile(
        CRAWL_PROFILE_GREEDY_LEARNING_TEXT,
        CrawlProfile.MATCH_ALL_STRING, //crawlerUrlMustMatch
        CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
        CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
        CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
        CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
        CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
        CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
        CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
        CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
        CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
        0,
        false,
        CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
        -1,
        true,
        false,
        false,
        true,
        false,
        CacheStrategy.IFEXIST,
        "robot_" + CRAWL_PROFILE_GREEDY_LEARNING_TEXT);
// register the profile that was just created under its own handle;
// it must be the greedy learning profile here, not the global snippet profile
this.profilesActiveCrawls.put(
    UTF8.getBytes(this.defaultTextGreedyLearningProfile.handle()),
    this.defaultTextGreedyLearningProfile);
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile =
new CrawlProfile(
@ -465,13 +508,7 @@ public final class CrawlSwitchboard {
} catch ( final SpaceExceededException e ) {
continue;
}
if ( !((entry.name().equals(CRAWL_PROFILE_PROXY))
|| (entry.name().equals(CRAWL_PROFILE_REMOTE))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
|| (entry.name().equals(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA)) || (entry.name()
.equals(CRAWL_PROFILE_SURROGATE))) ) {
if (!DEFAULT_PROFILES.contains(entry.name())) {
final CrawlProfile p = new CrawlProfile(entry);
this.profilesPassiveCrawls.put(UTF8.getBytes(p.handle()), p);
this.profilesActiveCrawls.remove(handle);
@ -494,13 +531,7 @@ public final class CrawlSwitchboard {
for (final byte[] handle: this.getActive()) {
CrawlProfile entry;
entry = new CrawlProfile(this.getActive(handle));
if (!((entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_PROXY))
|| (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_REMOTE))
|| (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT))
|| (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT))
|| (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA))
|| (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA))
|| (entry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE)))) {
if (!CrawlSwitchboard.DEFAULT_PROFILES.contains(entry.name())) {
deletionCandidate.add(ASCII.String(handle));
}
}

@ -26,7 +26,6 @@
package net.yacy.crawler.data;
import java.text.DateFormat;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
@ -546,19 +545,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return new StringBuilder(u.length() + 5).append(Pattern.quote(u)).append(".*").toString();
}
public static final Set<String> ignoreNames = new HashSet<String>();
static {
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_PROXY);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_REMOTE);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
ignoreNames.add(CrawlSwitchboard.CRAWL_PROFILE_SURROGATE);
ignoreNames.add(CrawlSwitchboard.DBFILE_ACTIVE_CRAWL_PROFILES);
ignoreNames.add(CrawlSwitchboard.DBFILE_PASSIVE_CRAWL_PROFILES);
}
public void putProfileEntry(
final String CRAWL_PROFILE_PREFIX,
final serverObjects prop,
@ -569,7 +555,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.put(CRAWL_PROFILE_PREFIX + count + "_dark", dark ? "1" : "0");
prop.put(CRAWL_PROFILE_PREFIX + count + "_name", this.collectionName());
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", (!active || ignoreNames.contains(this.name())) ? "0" : "1");
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton", (!active || CrawlSwitchboard.DEFAULT_PROFILES.contains(this.name())) ? "0" : "1");
prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle());
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", (active) ? "0" : "1");
prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle());

@ -2076,6 +2076,10 @@ public final class Switchboard extends serverSwitch {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE)));
insert = true;
}
// greedy learning profile: recompute its RECRAWL_IF_OLDER value from the
// configured recrawl cycle and store it in the profile entry
if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT) ) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE)));
// NOTE(review): presumably makes the surrounding code write the modified entry back - confirm
insert = true;
}
if ( selentry.name().equals(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA) ) {
selentry.put(CrawlProfile.RECRAWL_IF_OLDER, Long.toString(CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE)));
insert = true;

Loading…
Cancel
Save