From f6eebb6f99ef8fd0874be7bc29065c37fec90621 Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 30 Sep 2010 12:50:34 +0000 Subject: [PATCH] replaced auto-dom filter with easy-to-understand Site Link-List crawler option - nobody understands the auto-dom filter without a lengthy introduction about the function of a crawler - nobody ever used the auto-dom filter other than with a crawl depth of 1 - the auto-dom filter was buggy since the filter did not survive a restart, and the search index then contained waste - the function of the auto-dom filter was in fact just to load a link list from the given start url and then start separate crawls for all these urls, restricted to their domains - the new Site Link-List option shows the target urls in real time while the start url is entered (like the robots check) and gives transparent feedback on what it does before it is used - the new option also fits into the easy site-crawl start menu git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7213 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/CrawlProfileEditor_p.html | 4 +- htroot/CrawlProfileEditor_p.java | 3 +- htroot/CrawlProfileEditor_p.xml | 1 - htroot/CrawlStartExpert_p.html | 23 +- htroot/CrawlStartSite_p.html | 13 +- htroot/Crawler_p.java | 709 +++++++++--------- htroot/QuickCrawlLink_p.java | 1 - htroot/api/util/getpageinfo_p.java | 14 + htroot/api/util/getpageinfo_p.xml | 7 + htroot/js/IndexCreate.js | 28 +- source/de/anomic/crawler/CrawlProfile.java | 33 +- source/de/anomic/crawler/CrawlStacker.java | 8 +- .../de/anomic/crawler/CrawlSwitchboard.java | 17 +- 13 files changed, 437 insertions(+), 424 deletions(-) diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index f00c8d814..0a027828e 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -38,8 +38,7 @@ Must Match Must Not Match MaxAge - Auto Filter Depth - Auto Filter Content + Domain Counter Content Max Page Per Domain Accept '?' URLs Fill Proxy Cache @@ -70,7 +69,6 @@ #[mustmatch]# #[mustnotmatch]# #[crawlingIfOlder]# - #[crawlingDomFilterDepth]# #{crawlingDomFilterContent}##[item]#

#{/crawlingDomFilterContent}# #[crawlingDomMaxPages]# #(withQuery)#no::yes#(/withQuery)# diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index dd88cdb07..50f0cf8e2 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -87,7 +87,6 @@ public class CrawlProfileEditor_p { labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); - labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN)); labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN)); @@ -245,7 +244,7 @@ public class CrawlProfileEditor_p { prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString()); prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder())); - prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth())); + prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive"); // start contrib [MN] int i = 0; diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml index 5b5f54bee..67a254261 100644 --- a/htroot/CrawlProfileEditor_p.xml +++ b/htroot/CrawlProfileEditor_p.xml @@ -9,7 +9,6 @@ #[mustmatch]# #[mustnotmatch]# #[crawlingIfOlder]# - #[crawlingDomFilterDepth]# #{crawlingDomFilterContent}# #[item]# diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index c4f9d0dd4..a05c05abe 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -44,6 +44,13 @@ + + : + + +
+ + : @@ -154,22 +161,6 @@ If you don't know what this means, please leave this field empty. - - Auto-Dom-Filter: - - : -    - : - - - - This option will automatically create a domain-filter which limits the crawl on domains the crawler - will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while - restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth - for this example would be 1.
- The default value 0 gives no restrictions. - - Maximum Pages per Domain: diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index a679b4f4d..153f752e8 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -42,13 +42,18 @@ - - empty - + empty + + + Link-List of URL +
+ Sitemap URL -
+ +
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 79c0abf76..a9a25d0db 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch; import de.anomic.yacy.yacyNewsPool; public class Crawler_p { - public static final String CRAWLING_MODE_URL = "url"; - public static final String CRAWLING_MODE_FILE = "file"; - public static final String CRAWLING_MODE_SITEMAP = "sitemap"; - // this servlet does NOT create the Crawler servlet page content! // this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html @@ -102,372 +98,405 @@ public class Crawler_p { } prop.put("info", "0"); - if (post != null) { - // a crawl start - if (post.containsKey("continue")) { - // continue queue - final String queue = post.get("continue", ""); - if (queue.equals("localcrawler")) { - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } else if (queue.equals("remotecrawler")) { - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - } + if (post != null && post.containsKey("continue")) { + // continue queue + final String queue = post.get("continue", ""); + if (queue.equals("localcrawler")) { + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } else if (queue.equals("remotecrawler")) { + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); } + } - if (post.containsKey("pause")) { - // pause queue - final String queue = post.get("pause", ""); - if (queue.equals("localcrawler")) { - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } else if (queue.equals("remotecrawler")) { - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); - } + if (post != null && post.containsKey("pause")) { + // pause queue + final String queue = post.get("pause", ""); + if (queue.equals("localcrawler")) { + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } else if (queue.equals("remotecrawler")) { + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL); } - - if (post.containsKey("crawlingstart")) { - // init crawl - if (sb.peers == null) { - prop.put("info", "3"); - } else { - String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url - // add the prefix http:// if necessary - int pos = crawlingStart.indexOf("://"); - if (pos == -1) crawlingStart = "http://" + crawlingStart; + } + + if (post != null && post.containsKey("crawlingstart")) { + // init crawl + if (sb.peers == null) { + prop.put("info", "3"); + } else { + String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url + // add the prefix http:// if necessary + int pos = crawlingStart.indexOf("://"); + if (pos == -1) crawlingStart = "http://" + crawlingStart; - // normalizing URL - DigestURI crawlingStartURL = null; - try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} - crawlingStart = (crawlingStartURL == null) ? 
null : crawlingStartURL.toNormalform(true, true); - - // set new properties - final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start - final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start - - - // set the crawling filter - String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); - String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); - if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted - // special cases: - if (crawlingStartURL!= null && fullDomain) { - newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; - } - if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { - newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; - } - - final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); - env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false"); - - int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8")); - env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); - if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; - - // recrawl - final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler - boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); - int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); - String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour - int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); - final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays - - if (recrawl.equals("scheduler") && repeat_time > 0) { - // set crawlingIfOlder attributes that are appropriate for scheduled crawling - crawlingIfOlderCheck = true; - crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; - crawlingIfOlderUnit = "hour"; - } else if (recrawl.equals("reload")) { - repeat_time = -1; - crawlingIfOlderCheck = true; - } else if (recrawl.equals("nodoubles")) { - repeat_time = -1; - crawlingIfOlderCheck = false; - } - long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); - env.setConfig("crawlingIfOlder", crawlingIfOlder); + // normalize URL + DigestURI crawlingStartURL = null; + try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {} + crawlingStart = (crawlingStartURL == null) ? 
null : crawlingStartURL.toNormalform(true, true); + + // set new properties + final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start + final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start + + + // set the crawl filter + String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL); + String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER); + if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted + // special cases: + if (crawlingStartURL!= null && fullDomain) { + newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*"; + } + if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { + newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; + } + + final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); + env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false"); + + int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8")); + env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); + if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; + + // recrawl + final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler + boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on"); + int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1")); + String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour + int repeat_time = Integer.parseInt(post.get("repeat_time", "-1")); + final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays + + if (recrawl.equals("scheduler") && repeat_time > 0) { + // set crawlingIfOlder attributes that are appropriate for scheduled crawling + crawlingIfOlderCheck = true; + crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12; + crawlingIfOlderUnit = "hour"; + } else if (recrawl.equals("reload")) { + repeat_time = -1; + crawlingIfOlderCheck = true; + } else if (recrawl.equals("nodoubles")) { + repeat_time = -1; + crawlingIfOlderCheck = false; + } + long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit); + env.setConfig("crawlingIfOlder", crawlingIfOlder); - // store this call as api call - if (repeat_time > 0) { - // store as scheduled api call - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); - } else { - // store just a protocol - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); - } - final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on"); - final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1; - env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth)); - - final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on"); - final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? 
Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1; - env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); - - final boolean crawlingQ = post.get("crawlingQ", "off").equals("on"); - env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false"); - - final boolean indexText = post.get("indexText", "off").equals("on"); - env.setConfig("indexText", (indexText) ? "true" : "false"); - - final boolean indexMedia = post.get("indexMedia", "off").equals("on"); - env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); - - final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); - env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); - - final String cachePolicyString = post.get("cachePolicy", "iffresh"); - CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; - if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE; - if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; - if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST; - if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY; - - final boolean xsstopw = post.get("xsstopw", "off").equals("on"); - env.setConfig("xsstopw", (xsstopw) ? "true" : "false"); - - final boolean xdstopw = post.get("xdstopw", "off").equals("on"); - env.setConfig("xdstopw", (xdstopw) ? "true" : "false"); - - final boolean xpstopw = post.get("xpstopw", "off").equals("on"); - env.setConfig("xpstopw", (xpstopw) ? "true" : "false"); + // store this call as api call + if (repeat_time > 0) { + // store as scheduled api call + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3)); + } else { + // store just a protocol + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart); + } + + final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on"); + final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1; + env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); + + final boolean crawlingQ = post.get("crawlingQ", "off").equals("on"); + env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false"); + + final boolean indexText = post.get("indexText", "off").equals("on"); + env.setConfig("indexText", (indexText) ? "true" : "false"); + + final boolean indexMedia = post.get("indexMedia", "off").equals("on"); + env.setConfig("indexMedia", (indexMedia) ? "true" : "false"); + + final boolean storeHTCache = post.get("storeHTCache", "off").equals("on"); + env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false"); + + final String cachePolicyString = post.get("cachePolicy", "iffresh"); + CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; + if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE; + if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH; + if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST; + if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY; + + final boolean xsstopw = post.get("xsstopw", "off").equals("on"); + env.setConfig("xsstopw", (xsstopw) ? 
"true" : "false"); + + final boolean xdstopw = post.get("xdstopw", "off").equals("on"); + env.setConfig("xdstopw", (xdstopw) ? "true" : "false"); + + final boolean xpstopw = post.get("xpstopw", "off").equals("on"); + env.setConfig("xpstopw", (xpstopw) ? "true" : "false"); + + final String crawlingMode = post.get("crawlingMode","url"); + if (crawlingMode.equals("url")) { - final String crawlingMode = post.get("crawlingMode","url"); - if (crawlingMode.equals(CRAWLING_MODE_URL)) { + // check if pattern matches + if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { + // print error message + prop.put("info", "4"); //crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_crawlingStart", crawlingStart); + } else try { - // check if pattern matches - if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { - // print error message - prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_crawlingStart", crawlingStart); - } else try { + // check if the crawl filter works correctly + Pattern.compile(newcrawlingMustMatch); + + // stack request + // first delete old entry, if exists + final DigestURI url = new DigestURI(crawlingStart, null); + final byte[] urlhash = url.hash(); + indexSegment.urlMetadata().remove(urlhash); + sb.crawlQueues.noticeURL.removeByURLHash(urlhash); + sb.crawlQueues.errorURL.remove(urlhash); + + // stack url + sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it + final CrawlProfile pe = new CrawlProfile( + (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), + crawlingStartURL, + newcrawlingMustMatch, + newcrawlingMustNotMatch, + newcrawlingdepth, + crawlingIfOlder, crawlingDomMaxPages, + crawlingQ, + indexText, indexMedia, + storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); + sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); + final String reasonString = sb.crawlStacker.stackCrawl(new Request( + sb.peers.mySeed().hash.getBytes(), + url, + null, + "CRAWLING-ROOT", + new Date(), + pe.handle(), + 0, + 0, + 0 + )); + + if (reasonString == null) { + // create a bookmark from crawl start url + Set tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); + tags.add("crawlStart"); + if (post.get("createBookmark","off").equals("on")) { + bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin"); + if(bookmark != null){ + bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart)); + bookmark.setOwner("admin"); + bookmark.setPublic(false); + bookmark.setTags(tags, true); + sb.bookmarksDB.saveBookmark(bookmark); + } + } + // liftoff! 
+ prop.put("info", "8");//start msg + prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); + + // generate a YaCyNews if the global flag was set + if (crawlOrder) { + final Map m = new HashMap(pe); // must be cloned + m.remove("specificDepth"); + m.remove("indexText"); + m.remove("indexMedia"); + m.remove("remoteIndexing"); + m.remove("xsstopw"); + m.remove("xpstopw"); + m.remove("xdstopw"); + m.remove("storeTXCache"); + m.remove("storeHTCache"); + m.remove("generalFilter"); + m.remove("specificFilter"); + m.put("intention", post.get("intention", "").replace(',', '/')); + sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m); + } + } else { + prop.put("info", "5"); //Crawling failed + prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); + prop.putHTML("info_reasonString", reasonString); + sb.crawlQueues.errorURL.push( + new Request( + sb.peers.mySeed().hash.getBytes(), + crawlingStartURL, + null, + "", + new Date(), + pe.handle(), + 0, + 0, + 0), + sb.peers.mySeed().hash.getBytes(), + new Date(), + 1, + reasonString); + } + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); //crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_error", e.getMessage()); + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", crawlingStart); + prop.putHTML("info_error", e.getMessage()); + Log.logException(e); + } + + } else if (crawlingMode.equals("file")) { + if (post.containsKey("crawlingFile")) { + final String fileName = post.get("crawlingFile"); + try { // check if the crawl filter works correctly Pattern.compile(newcrawlingMustMatch); - - // stack request - // first delete old entry, if exists - final DigestURI url = new DigestURI(crawlingStart, null); - final byte[] urlhash = url.hash(); - indexSegment.urlMetadata().remove(urlhash); - sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - sb.crawlQueues.errorURL.remove(urlhash); - - // stack url - sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it - final CrawlProfile pe = new CrawlProfile( - (crawlingStartURL.getHost() == null) ? 
Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(), - crawlingStartURL, + final File file = new File(fileName); + final String fileString = post.get("crawlingFile$file"); + final ContentScraper scraper = new ContentScraper(new DigestURI(file)); + final Writer writer = new TransformerWriter(null, null, scraper, null, false); + FileUtils.copy(fileString, writer); + writer.close(); + final Map hyperlinks = scraper.getAnchors(); + final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); + final CrawlProfile profile = new CrawlProfile( + fileName, crawlURL, newcrawlingMustMatch, - newcrawlingMustNotMatch, + CrawlProfile.MATCH_NEVER, newcrawlingdepth, - crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, + crawlingIfOlder, + crawlingDomMaxPages, crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy); - sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); - final String reasonString = sb.crawlStacker.stackCrawl(new Request( - sb.peers.mySeed().hash.getBytes(), - url, - null, - "CRAWLING-ROOT", - new Date(), - pe.handle(), - 0, - 0, - 0 - )); - - if (reasonString == null) { - // create a bookmark from crawl start url - Set tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); - tags.add("crawlStart"); - if (post.get("createBookmark","off").equals("on")) { - bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin"); - if(bookmark != null){ - bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart)); - bookmark.setOwner("admin"); - bookmark.setPublic(false); - bookmark.setTags(tags, true); - sb.bookmarksDB.saveBookmark(bookmark); - } - } - // liftoff! 
- prop.put("info", "8");//start msg - prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); - - // generate a YaCyNews if the global flag was set - if (crawlOrder) { - final Map m = new HashMap(pe); // must be cloned - m.remove("specificDepth"); - m.remove("indexText"); - m.remove("indexMedia"); - m.remove("remoteIndexing"); - m.remove("xsstopw"); - m.remove("xpstopw"); - m.remove("xdstopw"); - m.remove("storeTXCache"); - m.remove("storeHTCache"); - m.remove("generalFilter"); - m.remove("specificFilter"); - m.put("intention", post.get("intention", "").replace(',', '/')); - sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m); - } - } else { - prop.put("info", "5"); //Crawling failed - prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); - prop.putHTML("info_reasonString", reasonString); - - sb.crawlQueues.errorURL.push( - new Request( - sb.peers.mySeed().hash.getBytes(), - crawlingStartURL, - null, - "", - new Date(), - pe.handle(), - 0, - 0, - 0), - sb.peers.mySeed().hash.getBytes(), + indexText, + indexMedia, + storeHTCache, + true, + crawlOrder, + xsstopw, xdstopw, xpstopw, + cachePolicy); + sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + final Iterator> linkiterator = hyperlinks.entrySet().iterator(); + DigestURI nexturl; + while (linkiterator.hasNext()) { + final Map.Entry e = linkiterator.next(); + if (e.getKey() == null) continue; + nexturl = new DigestURI(e.getKey()); + sb.crawlStacker.enqueueEntry(new Request( + sb.peers.mySeed().hash.getBytes(), + nexturl, + null, + e.getValue(), new Date(), - 1, - reasonString); + profile.handle(), + 0, + 0, + 0 + )); } + } catch (final PatternSyntaxException e) { prop.put("info", "4"); //crawlfilter does not match url prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); prop.putHTML("info_error", e.getMessage()); } catch (final Exception e) { // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", crawlingStart); + prop.put("info", "7");//Error with file + prop.putHTML("info_crawlingStart", fileName); prop.putHTML("info_error", e.getMessage()); Log.logException(e); } + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } + } else if (crawlingMode.equals("sitemap")) { + String sitemapURLStr = post.get("sitemapURL",""); + try { + final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); + final CrawlProfile pe = new CrawlProfile( + sitemapURLStr, sitemapURL, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, + crawlingIfOlder, crawlingDomMaxPages, + crawlingQ, + indexText, indexMedia, + storeHTCache, true, crawlOrder, + xsstopw, xdstopw, xpstopw, + cachePolicy); + sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); + final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe); + importer.start(); + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", sitemapURLStr); + prop.putHTML("info_error", e.getMessage()); + Log.logException(e); + } + } else if (crawlingMode.equals("sitelist")) { + try { + final DigestURI sitelistURL = new DigestURI(crawlingStart, null); + // download document + ContentScraper scraper = null; + scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH); + String title = scraper.getTitle(); + // String description = scraper.getDescription(); - } else if 
(crawlingMode.equals(CRAWLING_MODE_FILE)) { - if (post.containsKey("crawlingFile")) { - // getting the name of the uploaded file - final String fileName = post.get("crawlingFile"); - try { - // check if the crawl filter works correctly - Pattern.compile(newcrawlingMustMatch); - - // loading the file content - final File file = new File(fileName); - - // getting the content of the bookmark file - final String fileString = post.get("crawlingFile$file"); - - // parsing the bookmark file and fetching the headline and contained links - final ContentScraper scraper = new ContentScraper(new DigestURI(file)); - //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false); - final Writer writer = new TransformerWriter(null,null,scraper,null,false); - FileUtils.copy(fileString, writer); - writer.close(); - - //String headline = scraper.getHeadline(); - final Map hyperlinks = scraper.getAnchors(); - - // creating a crawler profile - final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null); - final CrawlProfile profile = new CrawlProfile( - fileName, crawlURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, - newcrawlingdepth, - crawlingIfOlder, - crawlingDomFilterDepth, - crawlingDomMaxPages, - crawlingQ, - indexText, - indexMedia, - storeHTCache, - true, - crawlOrder, - xsstopw, xdstopw, xpstopw, - cachePolicy); - sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); - - // pause local crawl here - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - - // loop through the contained links - final Iterator> linkiterator = hyperlinks.entrySet().iterator(); - DigestURI nexturl; - while (linkiterator.hasNext()) { - final Map.Entry e = linkiterator.next(); - if (e.getKey() == null) continue; - nexturl = new DigestURI(e.getKey()); - - // enqueuing the url for crawling - sb.crawlStacker.enqueueEntry(new Request( - sb.peers.mySeed().hash.getBytes(), - nexturl, - null, - e.getValue(), - new Date(), - profile.handle(), - 0, - 0, - 0 - )); - } - - } catch (final PatternSyntaxException e) { - // print error message - prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_error", e.getMessage()); - } catch (final Exception e) { - // mist - prop.put("info", "7");//Error with file - prop.putHTML("info_crawlingStart", fileName); - prop.putHTML("info_error", e.getMessage()); - Log.logException(e); - } - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + // get links and generate filter + StringBuilder filter = new StringBuilder(); + final Map hyperlinks = scraper.getAnchors(); + for (MultiProtocolURI uri: hyperlinks.keySet()) { + filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); } - } else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) { - String sitemapURLStr = null; - try { - // getting the sitemap URL - sitemapURLStr = post.get("sitemapURL",""); - final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null); - - // create a new profile - final CrawlProfile pe = new CrawlProfile( - sitemapURLStr, sitemapURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER, - newcrawlingdepth, - crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, true, crawlOrder, - xsstopw, xdstopw, xpstopw, - cachePolicy); - sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe); - - // create a new sitemap importer - final SitemapImporter importer = new 
SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe); - importer.start(); - - } catch (final Exception e) { - // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", sitemapURLStr); - prop.putHTML("info_error", e.getMessage()); - Log.logException(e); - } + newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : ""; + + // put links onto crawl queue + final CrawlProfile profile = new CrawlProfile( + title == null || title.length() == 0 ? sitelistURL.getHost() : title, + sitelistURL, + newcrawlingMustMatch, + CrawlProfile.MATCH_NEVER, + newcrawlingdepth, + crawlingIfOlder, + crawlingDomMaxPages, + crawlingQ, + indexText, + indexMedia, + storeHTCache, + true, + crawlOrder, + xsstopw, xdstopw, xpstopw, + cachePolicy); + sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile); + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + final Iterator> linkiterator = hyperlinks.entrySet().iterator(); + DigestURI nexturl; + while (linkiterator.hasNext()) { + final Map.Entry e = linkiterator.next(); + if (e.getKey() == null) continue; + nexturl = new DigestURI(e.getKey()); + // remove the url from the database to be prepared to crawl them again + final byte[] urlhash = nexturl.hash(); + indexSegment.urlMetadata().remove(urlhash); + sb.crawlQueues.noticeURL.removeByURLHash(urlhash); + sb.crawlQueues.errorURL.remove(urlhash); + sb.crawlStacker.enqueueEntry(new Request( + sb.peers.mySeed().hash.getBytes(), + nexturl, + null, + e.getValue(), + new Date(), + profile.handle(), + 0, + 0, + 0 + )); + } + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", crawlingStart); + prop.putHTML("info_error", e.getMessage()); + Log.logException(e); } } } - - if (post.containsKey("crawlingPerformance")) { - setPerformance(sb, post); - } + } + + if (post != null && post.containsKey("crawlingPerformance")) { + setPerformance(sb, post); } // performance settings diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index c951e7d10..c470db791 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -152,7 +152,6 @@ public class QuickCrawlLink_p { crawlingMustNotMatch, CrawlingDepth, 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month - -1, // domFilterDepth, if negative: no auto-filter -1, // domMaxPages, if negative: no count restriction crawlDynamic, indexText, diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java index acd2bcb68..89bc7ad8e 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/util/getpageinfo_p.java @@ -81,6 +81,20 @@ public class getpageinfo_p { // put language Set languages = scraper.getContentLanguages(); prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next()); + + // get links and put them into a semicolon-separated list + StringBuilder links = new StringBuilder(); + StringBuilder filter = new StringBuilder(); + count = 0; + for (MultiProtocolURI uri: scraper.getAnchors().keySet()) { + links.append(';').append(uri.toNormalform(true, false)); + filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*"); + prop.putXML("links_" + count + "_link", uri.toNormalform(true, false)); + count++; + } + prop.put("links", count); + prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : ""); + prop.putXML("filter", filter.length() > 0 ? 
filter.substring(1) : ".*"); } } if(actions.indexOf("robots")>=0){ diff --git a/htroot/api/util/getpageinfo_p.xml b/htroot/api/util/getpageinfo_p.xml index 4942826da..b9590c990 100644 --- a/htroot/api/util/getpageinfo_p.xml +++ b/htroot/api/util/getpageinfo_p.xml @@ -6,9 +6,16 @@ #(robots-allowed)#0::1::#(/robots-allowed)# #[sitemap]# #[favicon]# + #[sitelist]# + #[filter]# #{tags}# #{/tags}# + + #{links}# + + #{/links}# + diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js index fdb26ba84..b411f2261 100644 --- a/htroot/js/IndexCreate.js +++ b/htroot/js/IndexCreate.js @@ -3,12 +3,12 @@ var AJAX_ON="/env/grafics/ajax.gif"; var timeout=""; function handleResponse(){ - if(http.readyState == 4){ + if (http.readyState == 4){ var response = http.responseXML; - // getting the document title + // get the document title doctitle=""; - if(response.getElementsByTagName("title")[0].firstChild!=null){ + if (response.getElementsByTagName("title")[0].firstChild!=null){ doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue; } // document.getElementById("title").innerHTML=doctitle; @@ -23,43 +23,51 @@ function handleResponse(){ if(robotsOKspan.firstChild){ robotsOKspan.removeChild(robotsOKspan.firstChild); } - if(docrobotsOK==1){ + if (docrobotsOK==1){ img=document.createElement("img"); img.setAttribute("src", "/env/grafics/ok.png"); img.setAttribute("width", "32px"); img.setAttribute("height", "32px"); robotsOKspan.appendChild(img); - }else if(docrobotsOK==0){ + } else if(docrobotsOK==0){ img=document.createElement("img"); img.setAttribute("src", "/env/grafics/bad.png"); img.setAttribute("width", "32px"); img.setAttribute("height", "32px"); robotsOKspan.appendChild(img); robotsOKspan.appendChild(img); - }else{ + } else { robotsOKspan.appendChild(document.createTextNode("")); document.getElementById("robotsOK").innerHTML=""; } - // getting the sitemap URL contained in the robots.txt + // get the sitemap URL contained in the robots.txt if (document.getElementsByName("sitemapURL").length > 0) { sitemap=""; - if(response.getElementsByTagName("sitemap")[0].firstChild!=null){ + if (response.getElementsByTagName("sitemap")[0].firstChild!=null){ sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue; } document.getElementsByName("sitemapURL")[0].value=sitemap; document.getElementById("sitemap").disabled=false; } + sitelist=""; + if (response.getElementsByTagName("sitelist")[0].firstChild!=null){ + sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue; + } + document.getElementById("sitelistURLs").innerHTML = sitelist; + document.getElementById("sitelist").disabled=false; // clear the ajax image document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF); } } -function changed(){ + +function changed() { window.clearTimeout(timeout); timeout=window.setTimeout("loadInfos()", 1500); } -function loadInfos(){ + +function loadInfos() { // displaying ajax image document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON); diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 8dc5e13fc..23e26fa9d 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -48,7 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String FILTER_MUSTNOTMATCH = "nevermatch"; public static final String DEPTH = "generalDepth"; public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; - public static final String 
DOM_FILTER_DEPTH = "domFilterDepth"; public static final String DOM_MAX_PAGES = "domMaxPages"; public static final String CRAWLING_Q = "crawlingQ"; public static final String INDEX_TEXT = "indexText"; @@ -70,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String mustnotmatch, final int depth, final long recrawlIfOlder /*date*/, - final int domFilterDepth, final int domMaxPages, + final int domMaxPages, final boolean crawlingQ, final boolean indexText, final boolean indexMedia, final boolean storeHTCache, final boolean storeTXCache, @@ -87,7 +86,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch); put(DEPTH, depth); put(RECRAWL_IF_OLDER, recrawlIfOlder); - put(DOM_FILTER_DEPTH, domFilterDepth); put(DOM_MAX_PAGES, domMaxPages); put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' put(INDEX_TEXT, indexText); @@ -186,21 +184,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return 0L; } } - public int domFilterDepth() { - // if the depth is equal or less to this depth, - // then the current url feeds with its domain the crawl filter - // if this is -1, all domains are feeded - final String r = get(DOM_FILTER_DEPTH); - if (r == null) return Integer.MAX_VALUE; - try { - final int i = Integer.parseInt(r); - if (i < 0) return Integer.MAX_VALUE; - return i; - } catch (final NumberFormatException e) { - Log.logException(e); - return Integer.MAX_VALUE; - } - } public int domMaxPages() { // this is the maximum number of pages that are crawled for a single domain // if -1, this means no limit @@ -270,16 +253,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M dp.inc(); } } - public boolean grantedDomAppearance(final String domain) { - final int max = domFilterDepth(); - if (max == Integer.MAX_VALUE) return true; - final DomProfile dp = doms.get(domain); - if (dp == null) { - return 0 < max; - } - return dp.depth <= max; - } - public boolean grantedDomCount(final String domain) { final int max = domMaxPages(); if (max == Integer.MAX_VALUE) return true; @@ -292,10 +265,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M public int domSize() { return doms.size(); } - public boolean domExists(final String domain) { - if (domFilterDepth() == Integer.MAX_VALUE) return true; - return doms.containsKey(domain); - } public String domName(final boolean attr, final int index){ final Iterator> domnamesi = doms.entrySet().iterator(); diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java index 82c955440..8c056ec64 100644 --- a/source/de/anomic/crawler/CrawlStacker.java +++ b/source/de/anomic/crawler/CrawlStacker.java @@ -196,7 +196,7 @@ public final class CrawlStacker { final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash()); // add domain to profile domain list - if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) { + if (profile.domMaxPages() != Integer.MAX_VALUE) { profile.domInc(entry.url().getHost(), (referrerURL == null) ? 
null : referrerURL.getHost().toLowerCase(), entry.depth()); } @@ -296,12 +296,6 @@ public final class CrawlStacker { return "post url not allowed"; } - // deny urls that do not match with the profile domain list - if (!(profile.grantedDomAppearance(url.getHost()))) { - if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains."); - return "url does not match domain filter"; - } - // deny urls that exceed allowed number of occurrences if (!(profile.grantedDomCount(url.getHost()))) { if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed."); diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index 2b74c91b6..f90b0f40b 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -164,9 +164,10 @@ public final class CrawlSwitchboard { if (this.defaultProxyProfile == null) { // generate new default entry for proxy crawling - this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, + this.defaultProxyProfile = new CrawlProfile( + "proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false, + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, true, @@ -177,38 +178,38 @@ public final class CrawlSwitchboard { if (this.defaultRemoteProfile == null) { // generate new default entry for remote crawling this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile); } if (this.defaultTextSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH); this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile); } if (this.defaultTextSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, 
true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile); } this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST); if (this.defaultMediaSnippetLocalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile); } if (this.defaultMediaSnippetGlobalProfile == null) { // generate new default entry for snippet fetch and optional crawling this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST); this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile); } if (this.defaultSurrogateProfile == null) { // generate new default entry for surrogate parsing this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0, - CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); + CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE); this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile); } }
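
The heart of the new sitelist mode is the must-match filter built in the Crawler_p.java and getpageinfo_p.java hunks above: every link found on the start page contributes one "protocol://host.*" pattern, and the patterns are joined with '|' into the crawl profile's regular expression. The following minimal, standalone sketch mirrors that construction, but uses java.net.URI in place of YaCy's MultiProtocolURI; the class and method names are illustrative only and do not appear in the patch.

import java.net.URI;
import java.util.Arrays;
import java.util.List;

public class SiteListFilterSketch {

    // Build a must-match regex that restricts a crawl to the hosts of the given links,
    // following the '|'-joined "protocol://host.*" scheme used in the patch.
    public static String buildMustMatchFilter(final List<URI> links) {
        final StringBuilder filter = new StringBuilder();
        for (final URI link : links) {
            if (link.getHost() == null) continue; // skip anchors without a host part
            filter.append('|')
                  .append(link.getScheme()).append("://")
                  .append(link.getHost()).append(".*");
        }
        // drop the leading '|'; an empty link list yields an empty filter
        return filter.length() > 0 ? filter.substring(1) : "";
    }

    public static void main(final String[] args) {
        final List<URI> links = Arrays.asList(
                URI.create("http://example.org/bookmarks.html"),
                URI.create("http://wiki.example.net/start"));
        // prints: http://example.org.*|http://wiki.example.net.*
        System.out.println(buildMustMatchFilter(links));
    }
}

With such a filter set as the profile's must-match pattern, each link of the start page is enqueued as its own crawl start, so the crawl stays on the listed domains without the stateful auto-dom bookkeeping that this patch removes.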