From ce972ff4ef3e797c390ac39967770c7caa1ec5b8 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 3 Sep 2009 20:54:47 +0000 Subject: [PATCH] update to default ranking profile which has now some settings to deny some phpbb3 pages which are redundant in the index when crawling phpbb3. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6288 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/QuickCrawlLink_p.java | 2 +- source/de/anomic/crawler/CrawlProfile.java | 1 + source/de/anomic/crawler/CrawlSwitchboard.java | 14 +++++++------- source/de/anomic/data/SitemapParser.java | 2 +- source/de/anomic/data/bookmarksDB.java | 2 +- source/de/anomic/kelondro/blob/MapView.java | 10 +++++----- 6 files changed, 16 insertions(+), 15 deletions(-) diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 504259e4a..372b72f5f 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -94,7 +94,7 @@ public class QuickCrawlLink_p { // getting other parameters if set final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); + final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_BAD_URL); final int CrawlingDepth = Integer.parseInt(post.get("crawlingDepth", "0")); final boolean crawlDynamic = post.get("crawlingQ", "").equals("on"); final boolean indexText = post.get("indexText", "on").equals("on"); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 78fcbc14d..85781e9f6 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -47,6 +47,7 @@ public class CrawlProfile { public static final String MATCH_ALL = ".*"; public static final String MATCH_NEVER = ""; + public static final String MATCH_BAD_URL = ".*memberlist.*|.*previous.*|.*next.*|.*p=.*"; static HashMap> domsCache = new HashMap>(); diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index b3811873b..177a83969 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -164,7 +164,7 @@ public final class CrawlSwitchboard { if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling - this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + this.defaultProxyProfile = this.profilesActiveCrawls.newEntry("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, @@ -175,32 +175,32 @@ public final class CrawlSwitchboard { } if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling - defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + defaultRemoteProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + defaultTextSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFFRESH); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + defaultTextSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_CACHEONLY); } if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + defaultMediaSnippetLocalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling - defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + defaultMediaSnippetGlobalProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CACHE_STRATEGY_IFEXIST); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing - defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, + defaultSurrogateProfile = this.profilesActiveCrawls.newEntry(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, 0, this.profilesActiveCrawls.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CACHE_STRATEGY_NOCACHE); } } diff --git a/source/de/anomic/data/SitemapParser.java b/source/de/anomic/data/SitemapParser.java index 6cc88b8ab..9df7bd1d7 100644 --- a/source/de/anomic/data/SitemapParser.java +++ b/source/de/anomic/data/SitemapParser.java @@ -311,7 +311,7 @@ public class SitemapParser extends DefaultHandler { return this.sb.crawler.profilesActiveCrawls.newEntry( domainName, sitemapURL, // crawling Filter - CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_BAD_URL, // Depth 0, // force recrawling diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index 3f460f0a4..b80b661d8 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -264,7 +264,7 @@ public class bookmarksDB { CrawlProfile.entry pe = sb.crawler.profilesActiveCrawls.newEntry( folder+"/"+crawlingStartURL, crawlingStartURL, newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, + CrawlProfile.MATCH_BAD_URL, newcrawlingdepth, sb.crawler.profilesActiveCrawls.getRecrawlDate(crawlingIfOlder), crawlingDomFilterDepth, crawlingDomMaxPages, crawlingQ, diff --git a/source/de/anomic/kelondro/blob/MapView.java b/source/de/anomic/kelondro/blob/MapView.java index 70edd653b..1a4d59d45 100644 --- a/source/de/anomic/kelondro/blob/MapView.java +++ b/source/de/anomic/kelondro/blob/MapView.java @@ -100,21 +100,21 @@ public class MapView { private static String map2string(final Map map, final String comment) { final StringBuilder bb = new StringBuilder(map.size() * 40); - bb.append("# ").append(comment).append("\r\n"); + bb.append("# ").append(comment).append('\r').append('\n'); for (Map.Entry entry: map.entrySet()) { if (entry.getValue() != null) { bb.append(entry.getKey()); bb.append('='); bb.append(entry.getValue()); - bb.append("\r\n"); + bb.append('\r').append('\n'); } } bb.append("# EOF\r\n"); return bb.toString(); } - private static Map string2map(final String s) throws IOException { - final BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(s.getBytes()))); + private static Map bytes2map(byte[] b) throws IOException { + final BufferedReader br = new BufferedReader(new InputStreamReader(new ByteArrayInputStream(b))); final Map map = new HashMap(); String line; int pos; @@ -220,7 +220,7 @@ public class MapView { // read object final byte[] b = blob.get(key.getBytes()); if (b == null) return null; - map = string2map(new String(b, "UTF-8")); + map = bytes2map(b); if (storeCache) { // write map to cache