diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html
index f00c8d814..0a027828e 100644
--- a/htroot/CrawlProfileEditor_p.html
+++ b/htroot/CrawlProfileEditor_p.html
@@ -38,8 +38,7 @@
Must Match
Must Not Match
MaxAge
- Auto Filter Depth
- Auto Filter Content
+ Domain Counter Content
Max Page Per Domain
Accept '?' URLs
Fill Proxy Cache
@@ -70,7 +69,6 @@
#[mustmatch]#
#[mustnotmatch]#
#[crawlingIfOlder]#
- #[crawlingDomFilterDepth]#
#{crawlingDomFilterContent}##[item]# #{/crawlingDomFilterContent}#
#[crawlingDomMaxPages]#
#(withQuery)#no::yes#(/withQuery)#
diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java
index dd88cdb07..50f0cf8e2 100644
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -87,7 +87,6 @@ public class CrawlProfileEditor_p {
labels.add(new eentry(CrawlProfile.FILTER_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
- labels.add(new eentry(CrawlProfile.DOM_FILTER_DEPTH, "Domain Filter Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.CRAWLING_Q, "CrawlingQ / '?'-URLs", false, eentry.BOOLEAN));
labels.add(new eentry(CrawlProfile.INDEX_TEXT, "Index Text", false, eentry.BOOLEAN));
@@ -245,7 +244,7 @@ public class CrawlProfileEditor_p {
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", profile.mustMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", profile.mustNotMatchPattern().toString());
prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (profile.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(profile.recrawlIfOlder()));
- prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", (profile.domFilterDepth() == Integer.MAX_VALUE) ? "inactive" : Integer.toString(profile.domFilterDepth()));
+ prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingDomFilterDepth", "inactive");
// start contrib [MN]
int i = 0;
diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 5b5f54bee..67a254261 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -9,7 +9,6 @@
#[mustmatch]#
#[mustnotmatch]#
#[crawlingIfOlder]#
- #[crawlingDomFilterDepth]#
#{crawlingDomFilterContent}#
- #[item]#
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index c4f9d0dd4..a05c05abe 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -44,6 +44,13 @@
+
+ From Link-List of URL :
+
+
+
+
+
From Sitemap :
@@ -154,22 +161,6 @@
If you don't know what this means, please leave this field empty.
-
- Auto-Dom-Filter:
-
- Use :
-
- Depth :
-
-
-
- This option will automatically create a domain-filter which limits the crawl on domains the crawler
- will find on the given depth. You can use this option i.e. to crawl a page with bookmarks while
- restricting the crawl on only those domains that appear on the bookmark-page. The adequate depth
- for this example would be 1.
- The default value 0 gives no restrictions.
-
-
Maximum Pages per Domain:
diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html
index a679b4f4d..153f752e8 100644
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@@ -42,13 +42,18 @@
-
-
-
+
+
+
+ Link-List of URL
+
+
Sitemap URL
-
+
+
Scheduler
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 79c0abf76..a9a25d0db 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -60,10 +60,6 @@ import de.anomic.server.serverSwitch;
import de.anomic.yacy.yacyNewsPool;
public class Crawler_p {
- public static final String CRAWLING_MODE_URL = "url";
- public static final String CRAWLING_MODE_FILE = "file";
- public static final String CRAWLING_MODE_SITEMAP = "sitemap";
-
// this servlet does NOT create the Crawler servlet page content!
// this servlet starts a web crawl. The interface for entering the web crawl parameters is in IndexCreate_p.html
@@ -102,372 +98,405 @@ public class Crawler_p {
}
prop.put("info", "0");
- if (post != null) {
- // a crawl start
- if (post.containsKey("continue")) {
- // continue queue
- final String queue = post.get("continue", "");
- if (queue.equals("localcrawler")) {
- sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
- } else if (queue.equals("remotecrawler")) {
- sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
- }
+ if (post != null && post.containsKey("continue")) {
+ // continue queue
+ final String queue = post.get("continue", "");
+ if (queue.equals("localcrawler")) {
+ sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+ } else if (queue.equals("remotecrawler")) {
+ sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
+ }
- if (post.containsKey("pause")) {
- // pause queue
- final String queue = post.get("pause", "");
- if (queue.equals("localcrawler")) {
- sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
- } else if (queue.equals("remotecrawler")) {
- sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
- }
+ if (post != null && post.containsKey("pause")) {
+ // pause queue
+ final String queue = post.get("pause", "");
+ if (queue.equals("localcrawler")) {
+ sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+ } else if (queue.equals("remotecrawler")) {
+ sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_REMOTE_TRIGGERED_CRAWL);
}
-
- if (post.containsKey("crawlingstart")) {
- // init crawl
- if (sb.peers == null) {
- prop.put("info", "3");
- } else {
- String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
- // add the prefix http:// if necessary
- int pos = crawlingStart.indexOf("://");
- if (pos == -1) crawlingStart = "http://" + crawlingStart;
+ }
+
+ if (post != null && post.containsKey("crawlingstart")) {
+ // init crawl
+ if (sb.peers == null) {
+ prop.put("info", "3");
+ } else {
+ String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url
+ // add the prefix http:// if necessary
+ int pos = crawlingStart.indexOf("://");
+ if (pos == -1) crawlingStart = "http://" + crawlingStart;
- // normalizing URL
- DigestURI crawlingStartURL = null;
- try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {}
- crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
-
- // set new properties
- final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
- final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
-
-
- // set the crawling filter
- String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
- String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
- if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
- // special cases:
- if (crawlingStartURL!= null && fullDomain) {
- newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
- }
- if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
- newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
- }
-
- final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
- env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
-
- int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
- env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
- if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
-
- // recrawl
- final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
- boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
- int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
- String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
- int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
- final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
-
- if (recrawl.equals("scheduler") && repeat_time > 0) {
- // set crawlingIfOlder attributes that are appropriate for scheduled crawling
- crawlingIfOlderCheck = true;
- crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
- crawlingIfOlderUnit = "hour";
- } else if (recrawl.equals("reload")) {
- repeat_time = -1;
- crawlingIfOlderCheck = true;
- } else if (recrawl.equals("nodoubles")) {
- repeat_time = -1;
- crawlingIfOlderCheck = false;
- }
- long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
- env.setConfig("crawlingIfOlder", crawlingIfOlder);
+ // normalize URL
+ DigestURI crawlingStartURL = null;
+ try {crawlingStartURL = new DigestURI(crawlingStart, null);} catch (final MalformedURLException e1) {}
+ crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true);
+
+ // set new properties
+ final boolean fullDomain = post.get("range", "wide").equals("domain"); // special property in simple crawl start
+ final boolean subPath = post.get("range", "wide").equals("subpath"); // special property in simple crawl start
+
+
+ // set the crawl filter
+ String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL);
+ String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER);
+ if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL; // avoid that all urls are filtered out if bad value was submitted
+ // special cases:
+ if (crawlingStartURL!= null && fullDomain) {
+ newcrawlingMustMatch = ".*" + crawlingStartURL.getHost() + ".*";
+ }
+ if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) {
+ newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*";
+ }
+
+ final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
+ env.setConfig("crawlOrder", (crawlOrder) ? "true" : "false");
+
+ int newcrawlingdepth = Integer.parseInt(post.get("crawlingDepth", "8"));
+ env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth));
+ if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8;
+
+ // recrawl
+ final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
+ boolean crawlingIfOlderCheck = post.get("crawlingIfOlderCheck", "off").equals("on");
+ int crawlingIfOlderNumber = Integer.parseInt(post.get("crawlingIfOlderNumber", "-1"));
+ String crawlingIfOlderUnit = post.get("crawlingIfOlderUnit","year"); // year, month, day, hour
+ int repeat_time = Integer.parseInt(post.get("repeat_time", "-1"));
+ final String repeat_unit = post.get("repeat_unit", "seldays"); // selminutes, selhours, seldays
+
+ if (recrawl.equals("scheduler") && repeat_time > 0) {
+ // set crawlingIfOlder attributes that are appropriate for scheduled crawling
+ crawlingIfOlderCheck = true;
+ crawlingIfOlderNumber = repeat_unit.equals("selminutes") ? 1 : repeat_unit.equals("selhours") ? repeat_time / 2 : repeat_time * 12;
+ crawlingIfOlderUnit = "hour";
+ } else if (recrawl.equals("reload")) {
+ repeat_time = -1;
+ crawlingIfOlderCheck = true;
+ } else if (recrawl.equals("nodoubles")) {
+ repeat_time = -1;
+ crawlingIfOlderCheck = false;
+ }
+ long crawlingIfOlder = recrawlIfOlderC(crawlingIfOlderCheck, crawlingIfOlderNumber, crawlingIfOlderUnit);
+ env.setConfig("crawlingIfOlder", crawlingIfOlder);
- // store this call as api call
- if (repeat_time > 0) {
- // store as scheduled api call
- sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
- } else {
- // store just a protocol
- sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
- }
- final boolean crawlingDomFilterCheck = post.get("crawlingDomFilterCheck", "off").equals("on");
- final int crawlingDomFilterDepth = (crawlingDomFilterCheck) ? Integer.parseInt(post.get("crawlingDomFilterDepth", "-1")) : -1;
- env.setConfig("crawlingDomFilterDepth", Integer.toString(crawlingDomFilterDepth));
-
- final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
- final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
- env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
-
- final boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
- env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
-
- final boolean indexText = post.get("indexText", "off").equals("on");
- env.setConfig("indexText", (indexText) ? "true" : "false");
-
- final boolean indexMedia = post.get("indexMedia", "off").equals("on");
- env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
-
- final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
- env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
-
- final String cachePolicyString = post.get("cachePolicy", "iffresh");
- CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
- if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
- if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
- if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
- if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
-
- final boolean xsstopw = post.get("xsstopw", "off").equals("on");
- env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
-
- final boolean xdstopw = post.get("xdstopw", "off").equals("on");
- env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
-
- final boolean xpstopw = post.get("xpstopw", "off").equals("on");
- env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
+ // store this call as api call
+ if (repeat_time > 0) {
+ // store as scheduled api call
+ sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart, repeat_time, repeat_unit.substring(3));
+ } else {
+ // store just a protocol
+ sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + crawlingStart);
+ }
+
+ final boolean crawlingDomMaxCheck = post.get("crawlingDomMaxCheck", "off").equals("on");
+ final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? Integer.parseInt(post.get("crawlingDomMaxPages", "-1")) : -1;
+ env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
+
+ final boolean crawlingQ = post.get("crawlingQ", "off").equals("on");
+ env.setConfig("crawlingQ", (crawlingQ) ? "true" : "false");
+
+ final boolean indexText = post.get("indexText", "off").equals("on");
+ env.setConfig("indexText", (indexText) ? "true" : "false");
+
+ final boolean indexMedia = post.get("indexMedia", "off").equals("on");
+ env.setConfig("indexMedia", (indexMedia) ? "true" : "false");
+
+ final boolean storeHTCache = post.get("storeHTCache", "off").equals("on");
+ env.setConfig("storeHTCache", (storeHTCache) ? "true" : "false");
+
+ final String cachePolicyString = post.get("cachePolicy", "iffresh");
+ CrawlProfile.CacheStrategy cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
+ if (cachePolicyString.equals("nocache")) cachePolicy = CrawlProfile.CacheStrategy.NOCACHE;
+ if (cachePolicyString.equals("iffresh")) cachePolicy = CrawlProfile.CacheStrategy.IFFRESH;
+ if (cachePolicyString.equals("ifexist")) cachePolicy = CrawlProfile.CacheStrategy.IFEXIST;
+ if (cachePolicyString.equals("cacheonly")) cachePolicy = CrawlProfile.CacheStrategy.CACHEONLY;
+
+ final boolean xsstopw = post.get("xsstopw", "off").equals("on");
+ env.setConfig("xsstopw", (xsstopw) ? "true" : "false");
+
+ final boolean xdstopw = post.get("xdstopw", "off").equals("on");
+ env.setConfig("xdstopw", (xdstopw) ? "true" : "false");
+
+ final boolean xpstopw = post.get("xpstopw", "off").equals("on");
+ env.setConfig("xpstopw", (xpstopw) ? "true" : "false");
+
+ final String crawlingMode = post.get("crawlingMode","url");
+ if (crawlingMode.equals("url")) {
- final String crawlingMode = post.get("crawlingMode","url");
- if (crawlingMode.equals(CRAWLING_MODE_URL)) {
+ // check if pattern matches
+ if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
+ // print error message
+ prop.put("info", "4"); //crawlfilter does not match url
+ prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
+ prop.putHTML("info_crawlingStart", crawlingStart);
+ } else try {
- // check if pattern matches
- if ((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) {
- // print error message
- prop.put("info", "4"); //crawlfilter does not match url
- prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
- prop.putHTML("info_crawlingStart", crawlingStart);
- } else try {
+ // check if the crawl filter works correctly
+ Pattern.compile(newcrawlingMustMatch);
+
+ // stack request
+ // first delete old entry, if exists
+ final DigestURI url = new DigestURI(crawlingStart, null);
+ final byte[] urlhash = url.hash();
+ indexSegment.urlMetadata().remove(urlhash);
+ sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
+ sb.crawlQueues.errorURL.remove(urlhash);
+
+ // stack url
+ sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
+ final CrawlProfile pe = new CrawlProfile(
+ (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
+ crawlingStartURL,
+ newcrawlingMustMatch,
+ newcrawlingMustNotMatch,
+ newcrawlingdepth,
+ crawlingIfOlder, crawlingDomMaxPages,
+ crawlingQ,
+ indexText, indexMedia,
+ storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
+ sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
+ final String reasonString = sb.crawlStacker.stackCrawl(new Request(
+ sb.peers.mySeed().hash.getBytes(),
+ url,
+ null,
+ "CRAWLING-ROOT",
+ new Date(),
+ pe.handle(),
+ 0,
+ 0,
+ 0
+ ));
+
+ if (reasonString == null) {
+ // create a bookmark from crawl start url
+ Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
+ tags.add("crawlStart");
+ if (post.get("createBookmark","off").equals("on")) {
+ bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
+ if(bookmark != null){
+ bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
+ bookmark.setOwner("admin");
+ bookmark.setPublic(false);
+ bookmark.setTags(tags, true);
+ sb.bookmarksDB.saveBookmark(bookmark);
+ }
+ }
+ // liftoff!
+ prop.put("info", "8");//start msg
+ prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
+
+ // generate a YaCyNews if the global flag was set
+ if (crawlOrder) {
+ final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
+ m.remove("specificDepth");
+ m.remove("indexText");
+ m.remove("indexMedia");
+ m.remove("remoteIndexing");
+ m.remove("xsstopw");
+ m.remove("xpstopw");
+ m.remove("xdstopw");
+ m.remove("storeTXCache");
+ m.remove("storeHTCache");
+ m.remove("generalFilter");
+ m.remove("specificFilter");
+ m.put("intention", post.get("intention", "").replace(',', '/'));
+ sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
+ }
+ } else {
+ prop.put("info", "5"); //Crawling failed
+ prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
+ prop.putHTML("info_reasonString", reasonString);
+ sb.crawlQueues.errorURL.push(
+ new Request(
+ sb.peers.mySeed().hash.getBytes(),
+ crawlingStartURL,
+ null,
+ "",
+ new Date(),
+ pe.handle(),
+ 0,
+ 0,
+ 0),
+ sb.peers.mySeed().hash.getBytes(),
+ new Date(),
+ 1,
+ reasonString);
+ }
+ } catch (final PatternSyntaxException e) {
+ prop.put("info", "4"); //crawlfilter does not match url
+ prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
+ prop.putHTML("info_error", e.getMessage());
+ } catch (final Exception e) {
+ // mist
+ prop.put("info", "6");//Error with url
+ prop.putHTML("info_crawlingStart", crawlingStart);
+ prop.putHTML("info_error", e.getMessage());
+ Log.logException(e);
+ }
+
+ } else if (crawlingMode.equals("file")) {
+ if (post.containsKey("crawlingFile")) {
+ final String fileName = post.get("crawlingFile");
+ try {
// check if the crawl filter works correctly
Pattern.compile(newcrawlingMustMatch);
-
- // stack request
- // first delete old entry, if exists
- final DigestURI url = new DigestURI(crawlingStart, null);
- final byte[] urlhash = url.hash();
- indexSegment.urlMetadata().remove(urlhash);
- sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
- sb.crawlQueues.errorURL.remove(urlhash);
-
- // stack url
- sb.crawler.profilesPassiveCrawls.remove(crawlingStartURL.hash()); // if there is an old entry, delete it
- final CrawlProfile pe = new CrawlProfile(
- (crawlingStartURL.getHost() == null) ? Long.toHexString(System.currentTimeMillis()) : crawlingStartURL.getHost(),
- crawlingStartURL,
+ final File file = new File(fileName);
+ final String fileString = post.get("crawlingFile$file");
+ final ContentScraper scraper = new ContentScraper(new DigestURI(file));
+ final Writer writer = new TransformerWriter(null, null, scraper, null, false);
+ FileUtils.copy(fileString, writer);
+ writer.close();
+ final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
+ final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
+ final CrawlProfile profile = new CrawlProfile(
+ fileName, crawlURL,
newcrawlingMustMatch,
- newcrawlingMustNotMatch,
+ CrawlProfile.MATCH_NEVER,
newcrawlingdepth,
- crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
+ crawlingIfOlder,
+ crawlingDomMaxPages,
crawlingQ,
- indexText, indexMedia,
- storeHTCache, true, crawlOrder, xsstopw, xdstopw, xpstopw, cachePolicy);
- sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
- final String reasonString = sb.crawlStacker.stackCrawl(new Request(
- sb.peers.mySeed().hash.getBytes(),
- url,
- null,
- "CRAWLING-ROOT",
- new Date(),
- pe.handle(),
- 0,
- 0,
- 0
- ));
-
- if (reasonString == null) {
- // create a bookmark from crawl start url
- Set<String> tags=listManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
- tags.add("crawlStart");
- if (post.get("createBookmark","off").equals("on")) {
- bookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin");
- if(bookmark != null){
- bookmark.setProperty(bookmarksDB.Bookmark.BOOKMARK_TITLE, post.get("bookmarkTitle", crawlingStart));
- bookmark.setOwner("admin");
- bookmark.setPublic(false);
- bookmark.setTags(tags, true);
- sb.bookmarksDB.saveBookmark(bookmark);
- }
- }
- // liftoff!
- prop.put("info", "8");//start msg
- prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
-
- // generate a YaCyNews if the global flag was set
- if (crawlOrder) {
- final Map<String, String> m = new HashMap<String, String>(pe); // must be cloned
- m.remove("specificDepth");
- m.remove("indexText");
- m.remove("indexMedia");
- m.remove("remoteIndexing");
- m.remove("xsstopw");
- m.remove("xpstopw");
- m.remove("xdstopw");
- m.remove("storeTXCache");
- m.remove("storeHTCache");
- m.remove("generalFilter");
- m.remove("specificFilter");
- m.put("intention", post.get("intention", "").replace(',', '/'));
- sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), yacyNewsPool.CATEGORY_CRAWL_START, m);
- }
- } else {
- prop.put("info", "5"); //Crawling failed
- prop.putHTML("info_crawlingURL", (post.get("crawlingURL")));
- prop.putHTML("info_reasonString", reasonString);
-
- sb.crawlQueues.errorURL.push(
- new Request(
- sb.peers.mySeed().hash.getBytes(),
- crawlingStartURL,
- null,
- "",
- new Date(),
- pe.handle(),
- 0,
- 0,
- 0),
- sb.peers.mySeed().hash.getBytes(),
+ indexText,
+ indexMedia,
+ storeHTCache,
+ true,
+ crawlOrder,
+ xsstopw, xdstopw, xpstopw,
+ cachePolicy);
+ sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
+ sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+ final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
+ DigestURI nexturl;
+ while (linkiterator.hasNext()) {
+ final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
+ if (e.getKey() == null) continue;
+ nexturl = new DigestURI(e.getKey());
+ sb.crawlStacker.enqueueEntry(new Request(
+ sb.peers.mySeed().hash.getBytes(),
+ nexturl,
+ null,
+ e.getValue(),
new Date(),
- 1,
- reasonString);
+ profile.handle(),
+ 0,
+ 0,
+ 0
+ ));
}
+
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); //crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// mist
- prop.put("info", "6");//Error with url
- prop.putHTML("info_crawlingStart", crawlingStart);
+ prop.put("info", "7");//Error with file
+ prop.putHTML("info_crawlingStart", fileName);
prop.putHTML("info_error", e.getMessage());
Log.logException(e);
}
+ sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+ }
+ } else if (crawlingMode.equals("sitemap")) {
+ String sitemapURLStr = post.get("sitemapURL","");
+ try {
+ final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
+ final CrawlProfile pe = new CrawlProfile(
+ sitemapURLStr, sitemapURL,
+ newcrawlingMustMatch,
+ CrawlProfile.MATCH_NEVER,
+ newcrawlingdepth,
+ crawlingIfOlder, crawlingDomMaxPages,
+ crawlingQ,
+ indexText, indexMedia,
+ storeHTCache, true, crawlOrder,
+ xsstopw, xdstopw, xpstopw,
+ cachePolicy);
+ sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
+ final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
+ importer.start();
+ } catch (final Exception e) {
+ // mist
+ prop.put("info", "6");//Error with url
+ prop.putHTML("info_crawlingStart", sitemapURLStr);
+ prop.putHTML("info_error", e.getMessage());
+ Log.logException(e);
+ }
+ } else if (crawlingMode.equals("sitelist")) {
+ try {
+ final DigestURI sitelistURL = new DigestURI(crawlingStart, null);
+ // download document
+ ContentScraper scraper = null;
+ scraper = sb.loader.parseResource(sitelistURL, CrawlProfile.CacheStrategy.IFFRESH);
+ String title = scraper.getTitle();
+ // String description = scraper.getDescription();
- } else if (crawlingMode.equals(CRAWLING_MODE_FILE)) {
- if (post.containsKey("crawlingFile")) {
- // getting the name of the uploaded file
- final String fileName = post.get("crawlingFile");
- try {
- // check if the crawl filter works correctly
- Pattern.compile(newcrawlingMustMatch);
-
- // loading the file content
- final File file = new File(fileName);
-
- // getting the content of the bookmark file
- final String fileString = post.get("crawlingFile$file");
-
- // parsing the bookmark file and fetching the headline and contained links
- final ContentScraper scraper = new ContentScraper(new DigestURI(file));
- //OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
- final Writer writer = new TransformerWriter(null,null,scraper,null,false);
- FileUtils.copy(fileString, writer);
- writer.close();
-
- //String headline = scraper.getHeadline();
- final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
-
- // creating a crawler profile
- final DigestURI crawlURL = new DigestURI("file://" + file.toString(), null);
- final CrawlProfile profile = new CrawlProfile(
- fileName, crawlURL,
- newcrawlingMustMatch,
- CrawlProfile.MATCH_NEVER,
- newcrawlingdepth,
- crawlingIfOlder,
- crawlingDomFilterDepth,
- crawlingDomMaxPages,
- crawlingQ,
- indexText,
- indexMedia,
- storeHTCache,
- true,
- crawlOrder,
- xsstopw, xdstopw, xpstopw,
- cachePolicy);
- sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
-
- // pause local crawl here
- sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
-
- // loop through the contained links
- final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
- DigestURI nexturl;
- while (linkiterator.hasNext()) {
- final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
- if (e.getKey() == null) continue;
- nexturl = new DigestURI(e.getKey());
-
- // enqueuing the url for crawling
- sb.crawlStacker.enqueueEntry(new Request(
- sb.peers.mySeed().hash.getBytes(),
- nexturl,
- null,
- e.getValue(),
- new Date(),
- profile.handle(),
- 0,
- 0,
- 0
- ));
- }
-
- } catch (final PatternSyntaxException e) {
- // print error message
- prop.put("info", "4"); //crawlfilter does not match url
- prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
- prop.putHTML("info_error", e.getMessage());
- } catch (final Exception e) {
- // mist
- prop.put("info", "7");//Error with file
- prop.putHTML("info_crawlingStart", fileName);
- prop.putHTML("info_error", e.getMessage());
- Log.logException(e);
- }
- sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+ // get links and generate filter
+ StringBuilder filter = new StringBuilder();
+ final Map<MultiProtocolURI, String> hyperlinks = scraper.getAnchors();
+ for (MultiProtocolURI uri: hyperlinks.keySet()) {
+ filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
}
- } else if (crawlingMode.equals(CRAWLING_MODE_SITEMAP)) {
- String sitemapURLStr = null;
- try {
- // getting the sitemap URL
- sitemapURLStr = post.get("sitemapURL","");
- final DigestURI sitemapURL = new DigestURI(sitemapURLStr, null);
-
- // create a new profile
- final CrawlProfile pe = new CrawlProfile(
- sitemapURLStr, sitemapURL,
- newcrawlingMustMatch,
- CrawlProfile.MATCH_NEVER,
- newcrawlingdepth,
- crawlingIfOlder, crawlingDomFilterDepth, crawlingDomMaxPages,
- crawlingQ,
- indexText, indexMedia,
- storeHTCache, true, crawlOrder,
- xsstopw, xdstopw, xpstopw,
- cachePolicy);
- sb.crawler.profilesActiveCrawls.put(pe.handle().getBytes(), pe);
-
- // create a new sitemap importer
- final SitemapImporter importer = new SitemapImporter(sb, new DigestURI(sitemapURLStr, null), pe);
- importer.start();
-
- } catch (final Exception e) {
- // mist
- prop.put("info", "6");//Error with url
- prop.putHTML("info_crawlingStart", sitemapURLStr);
- prop.putHTML("info_error", e.getMessage());
- Log.logException(e);
- }
+ newcrawlingMustMatch = filter.length() > 0 ? filter.substring(1) : "";
+
+ // put links onto crawl queue
+ final CrawlProfile profile = new CrawlProfile(
+ title == null || title.length() == 0 ? sitelistURL.getHost() : title,
+ sitelistURL,
+ newcrawlingMustMatch,
+ CrawlProfile.MATCH_NEVER,
+ newcrawlingdepth,
+ crawlingIfOlder,
+ crawlingDomMaxPages,
+ crawlingQ,
+ indexText,
+ indexMedia,
+ storeHTCache,
+ true,
+ crawlOrder,
+ xsstopw, xdstopw, xpstopw,
+ cachePolicy);
+ sb.crawler.profilesActiveCrawls.put(profile.handle().getBytes(), profile);
+ sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
+ final Iterator<Map.Entry<MultiProtocolURI, String>> linkiterator = hyperlinks.entrySet().iterator();
+ DigestURI nexturl;
+ while (linkiterator.hasNext()) {
+ final Map.Entry<MultiProtocolURI, String> e = linkiterator.next();
+ if (e.getKey() == null) continue;
+ nexturl = new DigestURI(e.getKey());
+ // remove the url from the database so it can be crawled again
+ final byte[] urlhash = nexturl.hash();
+ indexSegment.urlMetadata().remove(urlhash);
+ sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
+ sb.crawlQueues.errorURL.remove(urlhash);
+ sb.crawlStacker.enqueueEntry(new Request(
+ sb.peers.mySeed().hash.getBytes(),
+ nexturl,
+ null,
+ e.getValue(),
+ new Date(),
+ profile.handle(),
+ 0,
+ 0,
+ 0
+ ));
+ }
+ } catch (final Exception e) {
+ // mist
+ prop.put("info", "6");//Error with url
+ prop.putHTML("info_crawlingStart", crawlingStart);
+ prop.putHTML("info_error", e.getMessage());
+ Log.logException(e);
}
}
}
-
- if (post.containsKey("crawlingPerformance")) {
- setPerformance(sb, post);
- }
+ }
+
+ if (post != null && post.containsKey("crawlingPerformance")) {
+ setPerformance(sb, post);
}
// performance settings
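
The new "sitelist" branch above derives the crawl's must-match filter from the anchors of the downloaded link-list page: one `protocol://host.*` alternative per link, joined with '|'. A minimal, self-contained sketch of that string assembly, using java.net.URI as a stand-in for YaCy's MultiProtocolURI (the sample URLs are illustrative only):

    import java.net.URI;
    import java.util.LinkedHashSet;
    import java.util.Set;

    // Sketch only: java.net.URI replaces MultiProtocolURI, and the sample links
    // replace the anchors returned by ContentScraper.getAnchors().
    public class SitelistFilterSketch {

        // Builds a must-match regex like "http://example.org.*|https://wiki.example.net.*".
        static String buildMustMatchFilter(final Set<URI> links) {
            final StringBuilder filter = new StringBuilder();
            for (final URI uri : links) {
                filter.append('|').append(uri.getScheme()).append("://").append(uri.getHost()).append(".*");
            }
            // strip the leading '|'; an empty link list yields an empty filter
            return filter.length() > 0 ? filter.substring(1) : "";
        }

        public static void main(final String[] args) {
            final Set<URI> links = new LinkedHashSet<URI>();
            links.add(URI.create("http://example.org/page.html"));
            links.add(URI.create("https://wiki.example.net/start"));
            // prints: http://example.org.*|https://wiki.example.net.*
            System.out.println(buildMustMatchFilter(links));
        }
    }

In the patch itself the resulting string replaces newcrawlingMustMatch before the CrawlProfile for the sitelist crawl is created.
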
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index c951e7d10..c470db791 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -152,7 +152,6 @@ public class QuickCrawlLink_p {
crawlingMustNotMatch,
CrawlingDepth,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
- -1, // domFilterDepth, if negative: no auto-filter
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,
indexText,
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/util/getpageinfo_p.java
index acd2bcb68..89bc7ad8e 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/util/getpageinfo_p.java
@@ -81,6 +81,20 @@ public class getpageinfo_p {
// put language
Set<String> languages = scraper.getContentLanguages();
prop.putXML("lang", (languages == null) ? "unknown" : languages.iterator().next());
+
+ // get links and put them into a semicolon-separated list
+ StringBuilder links = new StringBuilder();
+ StringBuilder filter = new StringBuilder();
+ count = 0;
+ for (MultiProtocolURI uri: scraper.getAnchors().keySet()) {
+ links.append(';').append(uri.toNormalform(true, false));
+ filter.append('|').append(uri.getProtocol()).append("://").append(uri.getHost()).append(".*");
+ prop.putXML("links_" + count + "_link", uri.toNormalform(true, false));
+ count++;
+ }
+ prop.put("links", count);
+ prop.putXML("sitelist", links.length() > 0 ? links.substring(1) : "");
+ prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
if(actions.indexOf("robots")>=0){
diff --git a/htroot/api/util/getpageinfo_p.xml b/htroot/api/util/getpageinfo_p.xml
index 4942826da..b9590c990 100644
--- a/htroot/api/util/getpageinfo_p.xml
+++ b/htroot/api/util/getpageinfo_p.xml
@@ -6,9 +6,16 @@
#(robots-allowed)#0::1::#(/robots-allowed)#
#[sitemap]#
#[favicon]#
+ #[sitelist]#
+ #[filter]#
#{tags}#
#{/tags}#
+
+ #{links}#
+
+ #{/links}#
+
diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js
index fdb26ba84..b411f2261 100644
--- a/htroot/js/IndexCreate.js
+++ b/htroot/js/IndexCreate.js
@@ -3,12 +3,12 @@ var AJAX_ON="/env/grafics/ajax.gif";
var timeout="";
function handleResponse(){
- if(http.readyState == 4){
+ if (http.readyState == 4){
var response = http.responseXML;
- // getting the document title
+ // get the document title
doctitle="";
- if(response.getElementsByTagName("title")[0].firstChild!=null){
+ if (response.getElementsByTagName("title")[0].firstChild!=null){
doctitle=response.getElementsByTagName("title")[0].firstChild.nodeValue;
}
// document.getElementById("title").innerHTML=doctitle;
@@ -23,43 +23,51 @@ function handleResponse(){
if(robotsOKspan.firstChild){
robotsOKspan.removeChild(robotsOKspan.firstChild);
}
- if(docrobotsOK==1){
+ if (docrobotsOK==1){
img=document.createElement("img");
img.setAttribute("src", "/env/grafics/ok.png");
img.setAttribute("width", "32px");
img.setAttribute("height", "32px");
robotsOKspan.appendChild(img);
- }else if(docrobotsOK==0){
+ } else if(docrobotsOK==0){
img=document.createElement("img");
img.setAttribute("src", "/env/grafics/bad.png");
img.setAttribute("width", "32px");
img.setAttribute("height", "32px");
robotsOKspan.appendChild(img);
robotsOKspan.appendChild(img);
- }else{
+ } else {
robotsOKspan.appendChild(document.createTextNode(""));
document.getElementById("robotsOK").innerHTML="";
}
- // getting the sitemap URL contained in the robots.txt
+ // get the sitemap URL contained in the robots.txt
if (document.getElementsByName("sitemapURL").length > 0) {
sitemap="";
- if(response.getElementsByTagName("sitemap")[0].firstChild!=null){
+ if (response.getElementsByTagName("sitemap")[0].firstChild!=null){
sitemap=response.getElementsByTagName("sitemap")[0].firstChild.nodeValue;
}
document.getElementsByName("sitemapURL")[0].value=sitemap;
document.getElementById("sitemap").disabled=false;
}
+ sitelist="";
+ if (response.getElementsByTagName("sitelist")[0].firstChild!=null){
+ sitelist=response.getElementsByTagName("sitelist")[0].firstChild.nodeValue;
+ }
+ document.getElementById("sitelistURLs").innerHTML = sitelist;
+ document.getElementById("sitelist").disabled=false;
// clear the ajax image
document.getElementsByName("ajax")[0].setAttribute("src", AJAX_OFF);
}
}
-function changed(){
+
+function changed() {
window.clearTimeout(timeout);
timeout=window.setTimeout("loadInfos()", 1500);
}
-function loadInfos(){
+
+function loadInfos() {
// displaying ajax image
document.getElementsByName("ajax")[0].setAttribute("src",AJAX_ON);
diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java
index 8dc5e13fc..23e26fa9d 100644
--- a/source/de/anomic/crawler/CrawlProfile.java
+++ b/source/de/anomic/crawler/CrawlProfile.java
@@ -48,7 +48,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M
public static final String FILTER_MUSTNOTMATCH = "nevermatch";
public static final String DEPTH = "generalDepth";
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
- public static final String DOM_FILTER_DEPTH = "domFilterDepth";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
public static final String INDEX_TEXT = "indexText";
@@ -70,7 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
final String mustnotmatch,
final int depth,
final long recrawlIfOlder /*date*/,
- final int domFilterDepth, final int domMaxPages,
+ final int domMaxPages,
final boolean crawlingQ,
final boolean indexText, final boolean indexMedia,
final boolean storeHTCache, final boolean storeTXCache,
@@ -87,7 +86,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M
put(FILTER_MUSTNOTMATCH, (mustnotmatch == null) ? CrawlProfile.MATCH_NEVER : mustnotmatch);
put(DEPTH, depth);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
- put(DOM_FILTER_DEPTH, domFilterDepth);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(INDEX_TEXT, indexText);
@@ -186,21 +184,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M
return 0L;
}
}
- public int domFilterDepth() {
- // if the depth is equal or less to this depth,
- // then the current url feeds with its domain the crawl filter
- // if this is -1, all domains are feeded
- final String r = get(DOM_FILTER_DEPTH);
- if (r == null) return Integer.MAX_VALUE;
- try {
- final int i = Integer.parseInt(r);
- if (i < 0) return Integer.MAX_VALUE;
- return i;
- } catch (final NumberFormatException e) {
- Log.logException(e);
- return Integer.MAX_VALUE;
- }
- }
public int domMaxPages() {
// this is the maximum number of pages that are crawled for a single domain
// if -1, this means no limit
@@ -270,16 +253,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M
dp.inc();
}
}
- public boolean grantedDomAppearance(final String domain) {
- final int max = domFilterDepth();
- if (max == Integer.MAX_VALUE) return true;
- final DomProfile dp = doms.get(domain);
- if (dp == null) {
- return 0 < max;
- }
- return dp.depth <= max;
- }
-
public boolean grantedDomCount(final String domain) {
final int max = domMaxPages();
if (max == Integer.MAX_VALUE) return true;
@@ -292,10 +265,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M
public int domSize() {
return doms.size();
}
- public boolean domExists(final String domain) {
- if (domFilterDepth() == Integer.MAX_VALUE) return true;
- return doms.containsKey(domain);
- }
public String domName(final boolean attr, final int index){
final Iterator<Map.Entry<String, DomProfile>> domnamesi = doms.entrySet().iterator();
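
With DOM_FILTER_DEPTH removed, the CrawlProfile constructor takes one int less between recrawlIfOlder and crawlingQ, and every call site in this patch drops the old domFilterDepth argument. As orientation, a call against the reduced signature might look like the fragment below (placeholder values; YaCy types such as DigestURI are assumed to be in scope, so this is not a compilable unit on its own):

    // Fragment, not standalone: startURL is assumed to be a DigestURI already in scope.
    final CrawlProfile profile = new CrawlProfile(
            "example.org",                              // name
            startURL,                                   // start URL
            CrawlProfile.MATCH_ALL,                     // mustmatch
            CrawlProfile.MATCH_NEVER,                   // mustnotmatch
            3,                                          // depth
            CrawlProfile.getRecrawlDate(60 * 24 * 30),  // recrawlIfOlder (here: one month)
            -1,                                         // domMaxPages; negative means no per-domain limit
            true,                                       // crawlingQ: accept '?'-URLs
            true, true,                                 // indexText, indexMedia
            true, true,                                 // storeHTCache, storeTXCache
            false,                                      // remoteIndexing (crawlOrder)
            true, true, true,                           // xsstopw, xdstopw, xpstopw
            CrawlProfile.CacheStrategy.IFFRESH);        // cache strategy
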
diff --git a/source/de/anomic/crawler/CrawlStacker.java b/source/de/anomic/crawler/CrawlStacker.java
index 82c955440..8c056ec64 100644
--- a/source/de/anomic/crawler/CrawlStacker.java
+++ b/source/de/anomic/crawler/CrawlStacker.java
@@ -196,7 +196,7 @@ public final class CrawlStacker {
final DigestURI referrerURL = (entry.referrerhash() == null || entry.referrerhash().length == 0) ? null : nextQueue.getURL(entry.referrerhash());
// add domain to profile domain list
- if ((profile.domFilterDepth() != Integer.MAX_VALUE) || (profile.domMaxPages() != Integer.MAX_VALUE)) {
+ if (profile.domMaxPages() != Integer.MAX_VALUE) {
profile.domInc(entry.url().getHost(), (referrerURL == null) ? null : referrerURL.getHost().toLowerCase(), entry.depth());
}
@@ -296,12 +296,6 @@ public final class CrawlStacker {
return "post url not allowed";
}
- // deny urls that do not match with the profile domain list
- if (!(profile.grantedDomAppearance(url.getHost()))) {
- if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' is not listed in granted domains.");
- return "url does not match domain filter";
- }
-
// deny urls that exceed allowed number of occurrences
if (!(profile.grantedDomCount(url.getHost()))) {
if (this.log.isFine()) this.log.logFine("URL '" + url.toString() + "' appeared too often, a maximum of " + profile.domMaxPages() + " is allowed.");
diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java
index 2b74c91b6..f90b0f40b 100644
--- a/source/de/anomic/crawler/CrawlSwitchboard.java
+++ b/source/de/anomic/crawler/CrawlSwitchboard.java
@@ -164,9 +164,10 @@ public final class CrawlSwitchboard {
if (this.defaultProxyProfile == null) {
// generate new default entry for proxy crawling
- this.defaultProxyProfile = new CrawlProfile("proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
+ this.defaultProxyProfile = new CrawlProfile(
+ "proxy", null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER,
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
- CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, -1, false,
+ CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, false,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true, true,
@@ -177,38 +178,38 @@ public final class CrawlSwitchboard {
if (this.defaultRemoteProfile == null) {
// generate new default entry for remote crawling
this.defaultRemoteProfile = new CrawlProfile(CRAWL_PROFILE_REMOTE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
- -1, -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+ -1, -1, true, true, true, false, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultRemoteProfile.handle().getBytes(), this.defaultRemoteProfile);
}
if (this.defaultTextSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
- CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
+ CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, true, false, false, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFFRESH);
this.profilesActiveCrawls.put(this.defaultTextSnippetLocalProfile.handle().getBytes(), this.defaultTextSnippetLocalProfile);
}
if (this.defaultTextSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultTextSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
- CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
+ CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, true, true, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultTextSnippetGlobalProfile.handle().getBytes(), this.defaultTextSnippetGlobalProfile);
}
this.defaultTextSnippetGlobalProfile.setCacheStrategy(CrawlProfile.CacheStrategy.IFEXIST);
if (this.defaultMediaSnippetLocalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetLocalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
- CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
+ CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, true, false, false, true, false, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetLocalProfile.handle().getBytes(), this.defaultMediaSnippetLocalProfile);
}
if (this.defaultMediaSnippetGlobalProfile == null) {
// generate new default entry for snippet fetch and optional crawling
this.defaultMediaSnippetGlobalProfile = new CrawlProfile(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
- CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
+ CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, true, false, true, true, true, false, true, true, false, CrawlProfile.CacheStrategy.IFEXIST);
this.profilesActiveCrawls.put(this.defaultMediaSnippetGlobalProfile.handle().getBytes(), this.defaultMediaSnippetGlobalProfile);
}
if (this.defaultSurrogateProfile == null) {
// generate new default entry for surrogate parsing
this.defaultSurrogateProfile = new CrawlProfile(CRAWL_PROFILE_SURROGATE, null, CrawlProfile.MATCH_ALL, CrawlProfile.MATCH_NEVER, 0,
- CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
+ CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, true, true, false, false, false, false, true, true, false, CrawlProfile.CacheStrategy.NOCACHE);
this.profilesActiveCrawls.put(this.defaultSurrogateProfile.handle().getBytes(), this.defaultSurrogateProfile);
}
}