update to the default crawl profiles, which now have a must-not-match filter to deny some phpBB3 pages that are redundant in the index when crawling phpBB3.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6288 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 16 years ago
parent 44579fa06d
commit ce972ff4ef
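
For context: a crawl profile accepts a URL only if it matches the profile's mustmatch filter and does not match its mustnotmatch filter, and this commit changes the default mustnotmatch filter from the empty MATCH_NEVER to the phpBB3-aware MATCH_BAD_URL. A minimal sketch of that gate (hypothetical class and method names, not YaCy's actual crawl stacker code):

import java.util.regex.Pattern;

// Hypothetical helper illustrating the two-filter gate used by crawl profiles.
public class CrawlFilterSketch {

    static boolean accept(String url, String mustMatch, String mustNotMatch) {
        // matches() requires the whole URL to match, so the empty
        // MATCH_NEVER pattern ("") never matches a non-empty URL.
        return Pattern.matches(mustMatch, url) && !Pattern.matches(mustNotMatch, url);
    }

    public static void main(String[] args) {
        final String all = ".*";                                          // CrawlProfile.MATCH_ALL
        final String bad = ".*memberlist.*|.*previous.*|.*next.*|.*p=.*"; // CrawlProfile.MATCH_BAD_URL
        System.out.println(accept("http://forum.example/viewtopic.php?t=42", all, bad)); // true
        System.out.println(accept("http://forum.example/memberlist.php", all, bad));     // false
    }
}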

@@ -94,7 +94,7 @@ public class QuickCrawlLink_p {
         // getting other parameters if set
         final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
-        final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
+        final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_BAD_URL);
         final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0"));
         final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
         final boolean indexText = post.get("indexText", "on").equals("on");

@@ -47,6 +47,7 @@ public class CrawlProfile {
     public static final String MATCH_ALL = ".*";
     public static final String MATCH_NEVER = "";
+    public static final String MATCH_BAD_URL = ".*memberlist.*|.*previous.*|.*next.*|.*p=.*";
     static HashMap<String, ConcurrentHashMap<String, DomProfile>> domsCache = new HashMap<String, ConcurrentHashMap<String, DomProfile>>();
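
Note the shape of the new constant: each alternative is wrapped in .* because the filter is applied with full-match semantics (java.util.regex matches(), as assumed in the sketch above), where a bare "memberlist" would only match a URL that is exactly that string. A quick illustration:

// Assumes full-match semantics (String.matches / Pattern.matches).
public class DenyPatternDemo {
    public static void main(String[] args) {
        final String deny = ".*memberlist.*|.*previous.*|.*next.*|.*p=.*"; // CrawlProfile.MATCH_BAD_URL
        System.out.println("http://forum.example/memberlist.php".matches(deny));     // true  -> denied
        System.out.println("http://forum.example/viewtopic.php?p=77".matches(deny)); // true  -> denied (contains "p=")
        System.out.println("http://forum.example/viewforum.php?f=2".matches(deny));  // false -> crawled
    }
}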

@@ -164,7 +164,7 @@ public final class CrawlSwitchboard {
         if (this.defaultProxyProfile == null) {
             // generate new default entry for proxy crawling
-            this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
+            this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL,
                     0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
                     this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
                     true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
@@ -175,32 +175,32 @@ public final class CrawlSwitchboard {
         }
         if (this.defaultRemoteProfile == null) {
             // generate new default entry for remote crawling
-            defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
                     -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
         }
         if (this.defaultTextSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
                     this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH);
         }
         if (this.defaultTextSnippetGlobalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
                     this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_CACHEONLY);
         }
         if (this.defaultMediaSnippetLocalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
                     this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
         }
         if (this.defaultMediaSnippetGlobalProfile == null) {
             // generate new default entry for snippet fetch and optional crawling
-            defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
                     this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST);
         }
         if (this.defaultSurrogateProfile == null) {
             // generate new default entry for surrogate parsing
-            defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
+            defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0,
                     this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_NOCACHE);
         }
     }

@@ -311,7 +311,7 @@ public class SitemapParser extends DefaultHandler {
         return this.sb.crawler.profilesActiveCrawls.newEntry(
                 domainName, sitemapURL,
                 // crawling Filter
-                CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
+                CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL,
                 // Depth
                 0,
                 // force recrawling

@@ -264,7 +264,7 @@ public class bookmarksDB {
         CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry(
                 folder+"/"+crawlingStartURL, crawlingStartURL,
                 newcrawlingMustMatch,
-                CrawlProfile.MATCH_NEVER,
+                CrawlProfile.MATCH_BAD_URL,
                 newcrawlingdepth,
                 sb.crawler.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages,
                 crawlingQ,

@@ -100,21 +100,21 @@ public class MapView {
     private static String map2string(final Map<String, String> map, final String comment) {
         final StringBuilder bb = new StringBuilder(map.size() * 40);
-        bb.append("# ").append(comment).append("\r\n");
+        bb.append("# ").append(comment).append('\r').append('\n');
         for (Map.Entry<String, String> entry: map.entrySet()) {
             if (entry.getValue() != null) {
                 bb.append(entry.getKey());
                 bb.append('=');
                 bb.append(entry.getValue());
-                bb.append("\r\n");
+                bb.append('\r').append('\n');
             }
         }
         bb.append("# EOF\r\n");
         return bb.toString();
     }
-    private static Map<String, String> string2map(final String s) throws IOException {
-        final BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(s.getBytes())));
+    private static Map<String, String> bytes2map(byte[] b) throws IOException {
+        final BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(b)));
         final Map<String, String> map = new HashMap<String, String>();
         String line;
         int pos;
@@ -220,7 +220,7 @@ public class MapView {
         // read object
         final byte[] b = blob.get(key.getBytes());
         if (b == null) return null;
-        map = string2map(new String(b, "UTF-8"));
+        map = bytes2map(b);
         if (storeCache) {
             // write map to cache
