diff --git a/htroot/CrawlProfileEditor_p.html b/htroot/CrawlProfileEditor_p.html index c769d7d86..2ec64eb4a 100644 --- a/htroot/CrawlProfileEditor_p.html +++ b/htroot/CrawlProfileEditor_p.html @@ -33,7 +33,6 @@ Crawl Thread Status - Start URL Depth Must Match Must Not Match @@ -64,7 +63,6 @@ #(/deleteButton)# - #[startURL]# #[depth]# #[mustmatch]# #[mustnotmatch]# diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 1e6e8caf8..bb859e537 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -66,7 +66,6 @@ public class CrawlProfileEditor_p { private static final List labels = new ArrayList(); static { labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); - labels.add(new eentry(CrawlProfile.START_URL, "Start URL", true, eentry.STRING)); labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTMATCH, "Must-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.FILTER_URL_MUSTNOTMATCH, "Must-Not-Match Filter", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index f6b9d499e..8d3514708 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -44,7 +44,7 @@ : - + @@ -83,7 +83,8 @@ - Existing start URLs are always re-crawled. + Define the start URL(s) here. You can submit more than one URL; please enter one URL per line. + Each of these URLs is the root of a crawl start; existing start URLs are always re-loaded. Other already visited URLs are sorted out as "double", if they are not allowed using the re-crawl option. diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index ffb26c9c9..2ab83f865 100644 --- a/htroot/CrawlStartExpert_p.java +++ b/htroot/CrawlStartExpert_p.java @@ -40,7 +40,7 @@ public class CrawlStartExpert_p { final serverObjects prop = new serverObjects(); // define visible variables - prop.put("starturl", /*(intranet) ? repository :*/ "http://"); + prop.put("starturl", /*(intranet) ? repository :*/ ""); prop.put("proxyPrefetchDepth", env.getConfig("proxyPrefetchDepth", "0")); prop.put("crawlingDepth", Math.min(3, env.getConfigLong("crawlingDepth", 0))); prop.put("directDocByURLChecked", sb.getConfigBool("crawlingDirectDocByURL", true) ? "1" : "0"); diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index b9bc539ee..3b06b914c 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -161,12 +161,12 @@ - Start URL + Name Status #{list}# - #[startURL]# + #[name]# #(terminateButton)#::
Running
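The Crawler_p.java patch below replaces the single "crawlingURL" start URL with a set of root URLs parsed from the multi-line field introduced above. As a rough standalone sketch of that splitting rule — the class and method names here are invented for illustration and are not part of the patch — the input is split on line breaks when any are present, otherwise on '|', and bare "www"/"ftp" host names get a scheme prefix:

    import java.util.LinkedHashSet;
    import java.util.Set;
    import java.util.regex.Pattern;

    public class RootUrlSplitSketch {
        // Simplified version of the splitting/normalization step in the patched Crawler_p.java.
        static Set<String> parseRootUrls(final String crawlingStart0) {
            final String[] parts = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0
                    ? crawlingStart0.split("[\\r\\n]+")
                    : crawlingStart0.split(Pattern.quote("|"));
            final Set<String> rootUrls = new LinkedHashSet<String>();
            for (String u : parts) {
                u = u.trim();
                if (u.isEmpty()) continue;
                if (u.indexOf("://") == -1) {               // no protocol given
                    if (u.startsWith("www")) u = "http://" + u;
                    if (u.startsWith("ftp")) u = "ftp://" + u;
                }
                rootUrls.add(u);
            }
            return rootUrls;
        }

        public static void main(final String[] args) {
            // prints [http://www.example.org, ftp://ftp.example.org]
            System.out.println(parseRootUrls("www.example.org\nftp.example.org"));
        }
    }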
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index a7209da1e..0c78150a2 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -27,23 +27,25 @@ import java.io.File; import java.io.FileInputStream; +import java.io.IOException; import java.io.Writer; import java.net.MalformedURLException; import java.util.Date; import java.util.HashMap; import java.util.HashSet; -import java.util.Iterator; import java.util.Map; import java.util.Properties; import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.util.SpaceExceededException; import net.yacy.document.Document; +import net.yacy.document.Parser.Failure; import net.yacy.document.parser.html.ContentScraper; import net.yacy.document.parser.html.TransformerWriter; import net.yacy.kelondro.data.meta.DigestURI; @@ -138,14 +140,7 @@ public class Crawler_p { if (sb.peers == null) { prop.put("info", "3"); } else { - String crawlingStart = post.get("crawlingURL","").trim(); // the crawljob start url - // add the prefix http:// if necessary - int pos = crawlingStart.indexOf("://",0); - if (pos == -1) { - if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart; - if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; - } - + // remove crawlingFileContent before we record the call String crawlingFileName = post.get("crawlingFile"); final File crawlingFile; @@ -158,20 +153,49 @@ public class Crawler_p { if (crawlingFile != null && crawlingFile.exists()) { post.remove("crawlingFile$file"); } - - // normalize URL - DigestURI crawlingStartURL = null; - if (crawlingFile == null) try {crawlingStartURL = new DigestURI(crawlingStart);} catch (final MalformedURLException e1) {Log.logException(e1);} - crawlingStart = (crawlingStartURL == null) ? null : crawlingStartURL.toNormalform(true, true); - - // set new properties + + // prepare some filter that are adjusted in case that this is wanted + boolean storeHTCache = "on".equals(post.get("storeHTCache", "on")); + String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); + String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted final boolean fullDomain = "domain".equals(post.get("range", "wide")); // special property in simple crawl start final boolean subPath = "subpath".equals(post.get("range", "wide")); // special property in simple crawl start + String crawlingStart0 = post.get("crawlingURL","").trim(); // the crawljob start url + String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? 
crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|")); + Set rootURLs = new HashSet(); + String crawlName = ""; + if (crawlingFile == null) for (String crawlingStart: rootURLs0) { + if (crawlingStart == null || crawlingStart.length() == 0) continue; + // add the prefix http:// if necessary + int pos = crawlingStart.indexOf("://",0); + if (pos == -1) { + if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart; + if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; + } + try { + DigestURI crawlingStartURL = new DigestURI(crawlingStart); + rootURLs.add(crawlingStartURL); + crawlName += crawlingStartURL.getHost() + "_"; + if (fullDomain) { + newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL); + if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*"; + } + if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { + newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; + } + if (crawlingStartURL != null && (crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; + + } catch (MalformedURLException e) { + Log.logException(e); + } + } + if (crawlName.length() > 80) crawlName = crawlName.substring(0, 80); + if (crawlName.endsWith("_")) crawlName = crawlName.substring(0, crawlName.length() - 1); + + // set the crawl filter - String newcrawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); - final String newcrawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); - if (newcrawlingMustMatch.length() < 2) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // avoid that all urls are filtered out if bad value was submitted String ipMustMatch = post.get("ipMustmatch", CrawlProfile.MATCH_ALL_STRING); final String ipMustNotMatch = post.get("ipMustnotmatch", CrawlProfile.MATCH_NEVER_STRING); if (ipMustMatch.length() < 2) ipMustMatch = CrawlProfile.MATCH_ALL_STRING; @@ -180,15 +204,6 @@ public class Crawler_p { sb.setConfig("crawlingIPMustNotMatch", ipMustNotMatch); if (countryMustMatch.length() > 0) sb.setConfig("crawlingCountryMustMatch", countryMustMatch); - // special cases: - if (crawlingStartURL!= null && fullDomain) { - newcrawlingMustMatch = CrawlProfile.mustMatchFilterFullDomain(crawlingStartURL); - if (subPath) newcrawlingMustMatch = newcrawlingMustMatch.substring(0, newcrawlingMustMatch.length() - 2) + crawlingStartURL.getPath() + ".*"; - } - if (crawlingStart!= null && subPath && (pos = crawlingStart.lastIndexOf('/')) > 0) { - newcrawlingMustMatch = crawlingStart.substring(0, pos + 1) + ".*"; - } - final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); env.setConfig("crawlOrder", crawlOrder); @@ -196,7 +211,7 @@ public class Crawler_p { env.setConfig("crawlingDepth", Integer.toString(newcrawlingdepth)); if ((crawlOrder) && (newcrawlingdepth > 8)) newcrawlingdepth = 8; - final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on")); // catch also all linked media documents without loading them + boolean directDocByURL = "on".equals(post.get("directDocByURL", "on")); // catch also all linked media documents without loading them env.setConfig("crawlingDirectDocByURL", directDocByURL); final String collection = post.get("collection", sb.getConfig("collection", "user")); @@ -228,17 +243,17 @@ public class Crawler_p { // store this call as api call if (repeat_time > 0) { // store as 
scheduled api call - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart), repeat_time, repeat_unit.substring(3)); + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true, false)), repeat_time, repeat_unit.substring(3)); } else { // store just a protocol - sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((crawlingStart == null) ? post.get("crawlingFile", "") : crawlingStart)); + sb.tables.recordAPICall(post, "Crawler_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "crawl start for " + ((rootURLs.size() == 0) ? post.get("crawlingFile", "") : rootURLs.iterator().next().toNormalform(true, false))); } final boolean crawlingDomMaxCheck = "on".equals(post.get("crawlingDomMaxCheck", "off")); final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1; env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); - final boolean crawlingQ = "on".equals(post.get("crawlingQ", "off")); + boolean crawlingQ = "on".equals(post.get("crawlingQ", "off")); env.setConfig("crawlingQ", crawlingQ); final boolean indexText = "on".equals(post.get("indexText", "on")); @@ -247,8 +262,6 @@ public class Crawler_p { final boolean indexMedia = "on".equals(post.get("indexMedia", "on")); env.setConfig("indexMedia", indexMedia); - boolean storeHTCache = "on".equals(post.get("storeHTCache", "on")); - if (crawlingStartURL!= null &&(crawlingStartURL.isFile() || crawlingStartURL.isSMB())) storeHTCache = false; env.setConfig("storeHTCache", storeHTCache); CacheStrategy cachePolicy = CacheStrategy.parse(post.get("cachePolicy", "iffresh")); @@ -263,150 +276,118 @@ public class Crawler_p { final boolean xpstopw = "on".equals(post.get("xpstopw", "off")); env.setConfig("xpstopw", xpstopw); - final String crawlingMode = post.get("crawlingMode","url"); - if (crawlingStart != null && crawlingStart.startsWith("ftp")) { - try { - // check if the crawl filter works correctly - Pattern.compile(newcrawlingMustMatch); - final CrawlProfile profile = new CrawlProfile( - crawlingStart, - crawlingStartURL, - newcrawlingMustMatch, - newcrawlingMustNotMatch, - ipMustMatch, - ipMustNotMatch, - countryMustMatch, - newcrawlingdepth, - directDocByURL, - crawlingIfOlder, - crawlingDomMaxPages, - crawlingQ, - indexText, - indexMedia, - storeHTCache, - crawlOrder, - xsstopw, - xdstopw, - xpstopw, - cachePolicy, - collection); - sb.crawler.putActive(profile.handle().getBytes(), profile); - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - final DigestURI url = crawlingStartURL; - sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false); - } catch (final PatternSyntaxException e) { - prop.put("info", "4"); // crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_error", e.getMessage()); - } catch (final Exception e) { - // mist - prop.put("info", "7"); // Error with file - prop.putHTML("info_crawlingStart", crawlingStart); - prop.putHTML("info_error", e.getMessage()); - Log.logException(e); - } - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } else if ("url".equals(crawlingMode)) { - - // check if pattern matches - if 
((crawlingStart == null || crawlingStartURL == null) /* || (!(crawlingStart.matches(newcrawlingfilter))) */) { - // print error message - prop.put("info", "4"); //crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_crawlingStart", crawlingStart); - } else try { - - - // check if the crawl filter works correctly - Pattern.compile(newcrawlingMustMatch); - - // stack request - // first delete old entry, if exists - final DigestURI url = new DigestURI(crawlingStart); - final byte[] urlhash = url.hash(); - sb.index.fulltext().remove(urlhash); - sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - sb.crawlQueues.errorURL.remove(urlhash); - - // get a scraper to get the title - final Document scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); - final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title(); - final String description = scraper.dc_description(); - - // stack url - sb.crawler.removePassive(crawlingStartURL.hash()); // if there is an old entry, delete it - final CrawlProfile pe = new CrawlProfile( - (crawlingStartURL.getHost() == null) ? crawlingStartURL.toNormalform(true, false) : crawlingStartURL.getHost(), - crawlingStartURL, - newcrawlingMustMatch, - newcrawlingMustNotMatch, - ipMustMatch, - ipMustNotMatch, - countryMustMatch, - newcrawlingdepth, - directDocByURL, - crawlingIfOlder, - crawlingDomMaxPages, - crawlingQ, - indexText, indexMedia, - storeHTCache, - crawlOrder, - xsstopw, - xdstopw, - xpstopw, - cachePolicy, - collection); - sb.crawler.putActive(pe.handle().getBytes(), pe); - final String reasonString = sb.crawlStacker.stackCrawl(new Request( - sb.peers.mySeed().hash.getBytes(), - url, - null, - "CRAWLING-ROOT", - new Date(), - pe.handle(), - 0, - 0, - 0, - 0 - )); - - if (reasonString == null) { - // create a bookmark from crawl start url - //final Set tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); - final Set tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart")); - tags.add("crawlStart"); - final String[] keywords = scraper.dc_subject(); - if (keywords != null) { - for (final String k: keywords) { - final String kk = BookmarkHelper.cleanTagsString(k); - if (kk.length() > 0) tags.add(kk); - } - } - String tagStr = tags.toString(); - if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2); - - // we will create always a bookmark to use this to track crawled hosts - final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(crawlingStart, "admin"); - if (bookmark != null) { - bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title); - bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description); - bookmark.setOwner("admin"); - bookmark.setPublic(false); - bookmark.setTags(tags, true); - sb.bookmarksDB.saveBookmark(bookmark); + String crawlingMode = post.get("crawlingMode","url"); + + if ("file".equals(crawlingMode) && post.containsKey("crawlingFile")) { + newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; + directDocByURL = false; + } + + if ("sitemap".equals(crawlingMode)) { + newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; + newcrawlingMustNotMatch = CrawlProfile.MATCH_NEVER_STRING; + newcrawlingdepth = 0; + directDocByURL = false; + crawlingQ = true; + } + + if ("sitelist".equals(crawlingMode)) { + newcrawlingMustNotMatch = 
CrawlProfile.MATCH_NEVER_STRING; + Set newRootURLs = new HashSet(); + for (DigestURI sitelistURL: rootURLs) { + // download document + Document scraper; + try { + scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); + // get links and generate filter + for (MultiProtocolURI u: scraper.getAnchors().keySet()) { + newRootURLs.add(new DigestURI(u)); } - - // do the same for ymarks - // TODO: could a non admin user add crawls? - sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start"); - + } catch (IOException e) { + Log.logException(e); + } + } + rootURLs = newRootURLs; + crawlingMode = "url"; + if ((fullDomain || subPath) && newcrawlingdepth > 0) newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING; // to prevent that there is a restriction on the original urls + } + + // compute mustmatch filter according to rootURLs + if ((fullDomain || subPath) && newcrawlingdepth > 0) { + String siteFilter = ".*"; + if (fullDomain) { + siteFilter = siteFilter(rootURLs); + } else if (subPath) { + siteFilter = subpathFilter(rootURLs); + } + newcrawlingMustMatch = CrawlProfile.MATCH_ALL_STRING.equals(newcrawlingMustMatch) ? siteFilter : "(?=(" + newcrawlingMustMatch + "))(" + siteFilter + ")"; + } + + // check if the crawl filter works correctly + try { + Pattern.compile(newcrawlingMustMatch); + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); // crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_error", e.getMessage()); + } + try { + Pattern.compile(newcrawlingMustNotMatch); + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); // crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustNotMatch); + prop.putHTML("info_error", e.getMessage()); + } + + // prepare a new crawling profile + final CrawlProfile profile = new CrawlProfile( + crawlName, + newcrawlingMustMatch, + newcrawlingMustNotMatch, + ipMustMatch, + ipMustNotMatch, + countryMustMatch, + newcrawlingdepth, + directDocByURL, + crawlingIfOlder, + crawlingDomMaxPages, + crawlingQ, + indexText, + indexMedia, + storeHTCache, + crawlOrder, + xsstopw, + xdstopw, + xpstopw, + cachePolicy, + collection); + byte[] handle = ASCII.getBytes(profile.handle()); + + if ("url".equals(crawlingMode)) { + if (rootURLs.size() == 0) { + prop.put("info", "5"); //Crawling failed + prop.putHTML("info_crawlingURL", "(no url given)"); + prop.putHTML("info_reasonString", "you must submit at least one crawl url"); + } else { + + // stack requests + sb.crawler.putActive(handle, profile); + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + Set successurls = new HashSet(); + Map failurls = new HashMap(); + String failreason; + for (DigestURI url: rootURLs) { + if ((failreason = stackUrl(sb, profile, url)) == null) successurls.add(url); else failurls.put(url, failreason); + } + + if (failurls.size() == 0) { // liftoff! 
- prop.put("info", "8");//start msg + prop.put("info", "8"); prop.putHTML("info_crawlingURL", post.get("crawlingURL")); - + // generate a YaCyNews if the global flag was set if (!sb.isRobinsonMode() && crawlOrder) { - final Map m = new HashMap(pe); // must be cloned + final Map m = new HashMap(profile); // must be cloned m.remove("specificDepth"); m.remove("indexText"); m.remove("indexMedia"); @@ -422,40 +403,49 @@ public class Crawler_p { sb.peers.newsPool.publishMyNews(sb.peers.mySeed(), NewsPool.CATEGORY_CRAWL_START, m); } } else { + StringBuilder fr = new StringBuilder(); + for (Map.Entry failure: failurls.entrySet()) { + sb.crawlQueues.errorURL.push( + new Request( + sb.peers.mySeed().hash.getBytes(), + failure.getKey(), + null, + "", + new Date(), + profile.handle(), + 0, + 0, + 0, + 0), + sb.peers.mySeed().hash.getBytes(), + new Date(), + 1, + FailCategory.FINAL_LOAD_CONTEXT, + failure.getValue(), -1); + fr.append(failure.getValue()).append('/'); + } + prop.put("info", "5"); //Crawling failed prop.putHTML("info_crawlingURL", (post.get("crawlingURL"))); - prop.putHTML("info_reasonString", reasonString); - - sb.crawlQueues.errorURL.push( - new Request( - sb.peers.mySeed().hash.getBytes(), - crawlingStartURL, - null, - "", - new Date(), - pe.handle(), - 0, - 0, - 0, - 0), - sb.peers.mySeed().hash.getBytes(), - new Date(), - 1, - FailCategory.FINAL_LOAD_CONTEXT, - reasonString, -1); + prop.putHTML("info_reasonString", fr.toString()); } - } catch (final PatternSyntaxException e) { - prop.put("info", "4"); // crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_error", e.getMessage()); + if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } + } else if ("sitemap".equals(crawlingMode)) { + final String sitemapURLStr = post.get("sitemapURL",""); + try { + final DigestURI sitemapURL = new DigestURI(sitemapURLStr); + sb.crawler.putActive(handle, profile); + final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); + importer.start(); + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); } catch (final Exception e) { // mist - prop.put("info", "6"); // Error with url - prop.putHTML("info_crawlingStart", crawlingStart); + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", sitemapURLStr); prop.putHTML("info_error", e.getMessage()); - Log.logInfo("Crawler_p", "start url rejected: " + e.getMessage()); + Log.logException(e); } - } else if ("file".equals(crawlingMode)) { if (post.containsKey("crawlingFile")) { final String crawlingFileContent = post.get("crawlingFile$file", ""); @@ -481,30 +471,7 @@ public class Crawler_p { } } - final DigestURI crawlURL = new DigestURI("file://" + crawlingFile.toString()); - final CrawlProfile profile = new CrawlProfile( - crawlingFileName, - crawlURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER_STRING, - ipMustMatch, - ipMustNotMatch, - countryMustMatch, - newcrawlingdepth, - false, - crawlingIfOlder, - crawlingDomMaxPages, - crawlingQ, - indexText, - indexMedia, - storeHTCache, - crawlOrder, - xsstopw, - xdstopw, - xpstopw, - cachePolicy, - collection); - sb.crawler.putActive(profile.handle().getBytes(), profile); + sb.crawler.putActive(handle, profile); sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks); } catch (final PatternSyntaxException e) { @@ -520,110 +487,6 @@ public 
class Crawler_p { } sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); } - } else if ("sitemap".equals(crawlingMode)) { - final String sitemapURLStr = post.get("sitemapURL",""); - try { - final DigestURI sitemapURL = new DigestURI(sitemapURLStr); - final CrawlProfile pe = new CrawlProfile( - sitemapURLStr, - sitemapURL, - CrawlProfile.MATCH_ALL_STRING, - CrawlProfile.MATCH_NEVER_STRING, - ipMustMatch, - ipMustNotMatch, - countryMustMatch, - 0, - false, - crawlingIfOlder, - crawlingDomMaxPages, - true, - indexText, - indexMedia, - storeHTCache, - crawlOrder, - xsstopw, - xdstopw, - xpstopw, - cachePolicy, - collection); - sb.crawler.putActive(pe.handle().getBytes(), pe); - final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe); - importer.start(); - } catch (final Exception e) { - // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", sitemapURLStr); - prop.putHTML("info_error", e.getMessage()); - Log.logException(e); - } - } else if ("sitelist".equals(crawlingMode)) { - try { - final DigestURI sitelistURL = new DigestURI(crawlingStart); - // download document - Document scraper = sb.loader.loadDocument(sitelistURL, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); - // String title = scraper.getTitle(); - // String description = scraper.getDescription(); - - // get links and generate filter - final Map hyperlinks = scraper.getAnchors(); - if (fullDomain && newcrawlingdepth > 0) newcrawlingMustMatch = siteFilter(hyperlinks.keySet()); - - // put links onto crawl queue - final CrawlProfile profile = new CrawlProfile( - sitelistURL.getHost(), - sitelistURL, - newcrawlingMustMatch, - CrawlProfile.MATCH_NEVER_STRING, - ipMustMatch, - ipMustNotMatch, - countryMustMatch, - newcrawlingdepth, - directDocByURL, - crawlingIfOlder, - crawlingDomMaxPages, - crawlingQ, - indexText, - indexMedia, - storeHTCache, - crawlOrder, - xsstopw, - xdstopw, - xpstopw, - cachePolicy, - collection); - sb.crawler.putActive(profile.handle().getBytes(), profile); - sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - final Iterator> linkiterator = hyperlinks.entrySet().iterator(); - DigestURI nexturl; - while (linkiterator.hasNext()) { - final Map.Entry e = linkiterator.next(); - if (e.getKey() == null) continue; - nexturl = new DigestURI(e.getKey()); - // remove the url from the database to be prepared to crawl them again - final byte[] urlhash = nexturl.hash(); - sb.index.fulltext().remove(urlhash); - sb.crawlQueues.noticeURL.removeByURLHash(urlhash); - sb.crawlQueues.errorURL.remove(urlhash); - sb.crawlStacker.enqueueEntry(new Request( - sb.peers.mySeed().hash.getBytes(), - nexturl, - null, - e.getValue().getProperty("name", ""), - new Date(), - profile.handle(), - 0, - 0, - 0, - 0 - )); - } - } catch (final Exception e) { - // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", crawlingStart); - prop.putHTML("info_error", e.getMessage()); - Log.logException(e); - } } } } @@ -661,6 +524,106 @@ public class Crawler_p { return prop; } + /** + * stack the url to the crawler + * @param sb + * @param profile + * @param url + * @return null if this was ok. 
If this failed, return a string with a fail reason + */ + private static String stackUrl(Switchboard sb, CrawlProfile profile, DigestURI url) { + + byte[] handle = ASCII.getBytes(profile.handle()); + + // remove url from the index to be prepared for a re-crawl + final byte[] urlhash = url.hash(); + sb.index.fulltext().remove(urlhash); + sb.crawlQueues.noticeURL.removeByURLHash(urlhash); + sb.crawlQueues.errorURL.remove(urlhash); + + // special handling of ftp protocol + if (url.isFTP()) { + try { + sb.crawler.putActive(handle, profile); + sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + sb.crawlStacker.enqueueEntriesFTP(sb.peers.mySeed().hash.getBytes(), profile.handle(), url.getHost(), url.getPort(), false); + return null; + } catch (final Exception e) { + // mist + Log.logException(e); + return "problem crawling an ftp site: " + e.getMessage(); + } + } + + // get a scraper to get the title + Document scraper; + try { + scraper = sb.loader.loadDocument(url, CacheStrategy.IFFRESH, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); + } catch (IOException e) { + Log.logException(e); + return "scraper cannot load URL: " + e.getMessage(); + } + + final String title = scraper == null ? url.toNormalform(true, true) : scraper.dc_title(); + final String description = scraper.dc_description(); + + // add the url to the crawl stack + sb.crawler.removePassive(handle); // if there is an old entry, delete it + sb.crawler.putActive(handle, profile); + final String reasonString = sb.crawlStacker.stackCrawl(new Request( + sb.peers.mySeed().hash.getBytes(), + url, + null, + "CRAWLING-ROOT", + new Date(), + profile.handle(), + 0, + 0, + 0, + 0 + )); + + if (reasonString != null) return reasonString; + + // create a bookmark from crawl start url + //final Set tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart"))); + final Set tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart")); + tags.add("crawlStart"); + final String[] keywords = scraper.dc_subject(); + if (keywords != null) { + for (final String k: keywords) { + final String kk = BookmarkHelper.cleanTagsString(k); + if (kk.length() > 0) tags.add(kk); + } + } + String tagStr = tags.toString(); + if (tagStr.length() > 2 && tagStr.startsWith("[") && tagStr.endsWith("]")) tagStr = tagStr.substring(1, tagStr.length() - 2); + + // we will create always a bookmark to use this to track crawled hosts + final BookmarksDB.Bookmark bookmark = sb.bookmarksDB.createBookmark(url.toNormalform(true, false), "admin"); + if (bookmark != null) { + bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_TITLE, title); + bookmark.setProperty(BookmarksDB.Bookmark.BOOKMARK_DESCRIPTION, description); + bookmark.setOwner("admin"); + bookmark.setPublic(false); + bookmark.setTags(tags, true); + sb.bookmarksDB.saveBookmark(bookmark); + } + + // do the same for ymarks + // TODO: could a non admin user add crawls? 
+ try { + sb.tables.bookmarks.createBookmark(sb.loader, url, YMarkTables.USER_ADMIN, true, "crawlStart", "/Crawl Start"); + } catch (IOException e) { + Log.logException(e); + } catch (Failure e) { + Log.logException(e); + } + + // that was ok + return null; + } + private static long recrawlIfOlderC(final boolean recrawlIfOlderCheck, final int recrawlIfOlderNumber, final String crawlingIfOlderUnit) { if (!recrawlIfOlderCheck) return 0L; if ("year".equals(crawlingIfOlderUnit)) return System.currentTimeMillis() - recrawlIfOlderNumber * 1000L * 60L * 60L * 24L * 365L; @@ -682,7 +645,7 @@ public class Crawler_p { sb.setPerformance(wantedPPM); } - private static String siteFilter(final Set uris) { + private static String siteFilter(final Set uris) { final StringBuilder filter = new StringBuilder(); final Set filterSet = new HashSet(); for (final MultiProtocolURI uri: uris) { @@ -697,7 +660,7 @@ public class Crawler_p { return filter.length() > 0 ? filter.substring(1) : ""; } - private static String subpathFilter(final Set uris) { + private static String subpathFilter(final Set uris) { final StringBuilder filter = new StringBuilder(); final Set filterSet = new HashSet(); for (final MultiProtocolURI uri: uris) { diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index bfaaf0ba9..7ad59f3fd 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -133,8 +133,7 @@ public class QuickCrawlLink_p { CrawlProfile pe = null; try { pe = new CrawlProfile( - crawlingStartURL.getHost(), - crawlingStartURL, + crawlingStartURL.toNormalform(true, false), crawlingMustMatch, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, diff --git a/source/de/anomic/crawler/CrawlProfile.java b/source/de/anomic/crawler/CrawlProfile.java index 8697d9aa0..b44a6f872 100644 --- a/source/de/anomic/crawler/CrawlProfile.java +++ b/source/de/anomic/crawler/CrawlProfile.java @@ -33,10 +33,8 @@ import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; -import net.yacy.cora.document.ASCII; import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.services.federated.yacy.CacheStrategy; -import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -55,7 +53,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M // this is a simple record structure that hold all properties of a single crawl start public static final String HANDLE = "handle"; public static final String NAME = "name"; - public static final String START_URL = "startURL"; public static final String DEPTH = "generalDepth"; public static final String DIRECT_DOC_BY_URL= "directDocByURL"; public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; @@ -124,8 +121,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param collections a comma-separated list of tags which are attached to index entries */ public CrawlProfile( - final String name, - final DigestURI startURL, + String name, final String urlMustMatch, final String urlMustNotMatch, final String ipMustMatch, @@ -149,14 +145,11 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (name == null || name.isEmpty()) { throw new NullPointerException("name must not be null or empty"); } + if (name.length() > 60) name = name.substring(0, 60); this.doms = new ConcurrentHashMap(); - - final String handle = (startURL == null) - ? 
Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength) - : ASCII.String(startURL.hash()); + final String handle = Base64Order.enhancedCoder.encode(Digest.encodeMD5Raw(name)).substring(0, Word.commonHashLength); put(HANDLE, handle); put(NAME, name); - put(START_URL, (startURL == null) ? "" : startURL.toNormalform(true, false)); put(FILTER_URL_MUSTMATCH, (urlMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : urlMustMatch); put(FILTER_URL_MUSTNOTMATCH, (urlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : urlMustNotMatch); put(FILTER_IP_MUSTMATCH, (ipMustMatch == null) ? CrawlProfile.MATCH_ALL_STRING : ipMustMatch); @@ -258,6 +251,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M */ public String handle() { final String r = get(HANDLE); + assert r != null; //if (r == null) return null; return r; } @@ -282,15 +276,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M return r; } - /** - * Gets the root URL of the crawl job. - * @return root URL - */ - public String startURL() { - final String r = get(START_URL); - return r; - } - /** * Gets the regex which must be matched by URLs in order to be crawled. * @return regex which must be matched @@ -540,7 +525,6 @@ public class CrawlProfile extends ConcurrentHashMap implements M prop.put(CRAWL_PROFILE_PREFIX + count + "_terminateButton_handle", this.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton", (active) ? "0" : "1"); prop.put(CRAWL_PROFILE_PREFIX + count + "_deleteButton_handle", this.handle()); - prop.putXML(CRAWL_PROFILE_PREFIX + count + "_startURL", this.startURL()); prop.put(CRAWL_PROFILE_PREFIX + count + "_handle", this.handle()); prop.put(CRAWL_PROFILE_PREFIX + count + "_depth", this.depth()); prop.put(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index c61aeff47..a5c4822a9 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -315,6 +315,7 @@ public class CrawlQueues { * @param stats String for log prefixing * @return */ + @SuppressWarnings("unused") private void load(final Request urlEntry, final String stats, final String profileHandle) { final CrawlProfile profile = this.sb.crawler.getActive(UTF8.getBytes(profileHandle)); if (profile != null) { diff --git a/source/de/anomic/crawler/CrawlSwitchboard.java b/source/de/anomic/crawler/CrawlSwitchboard.java index b67b1bbc2..48f2430c8 100644 --- a/source/de/anomic/crawler/CrawlSwitchboard.java +++ b/source/de/anomic/crawler/CrawlSwitchboard.java @@ -292,7 +292,6 @@ public final class CrawlSwitchboard this.defaultProxyProfile = new CrawlProfile( CRAWL_PROFILE_PROXY, - null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, @@ -321,7 +320,6 @@ public final class CrawlSwitchboard this.defaultRemoteProfile = new CrawlProfile( CRAWL_PROFILE_REMOTE, - null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, @@ -350,7 +348,6 @@ public final class CrawlSwitchboard this.defaultTextSnippetLocalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_LOCAL_TEXT, - null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, @@ -379,7 +376,6 @@ public final class CrawlSwitchboard this.defaultTextSnippetGlobalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT, - null, 
CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, @@ -409,7 +405,6 @@ public final class CrawlSwitchboard this.defaultMediaSnippetLocalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA, - null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, @@ -438,7 +433,6 @@ public final class CrawlSwitchboard this.defaultMediaSnippetGlobalProfile = new CrawlProfile( CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA, - null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, @@ -467,7 +461,6 @@ public final class CrawlSwitchboard this.defaultSurrogateProfile = new CrawlProfile( CRAWL_PROFILE_SURROGATE, - null, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_ALL_STRING, diff --git a/source/de/anomic/data/ymark/YMarkCrawlStart.java b/source/de/anomic/data/ymark/YMarkCrawlStart.java index 573b31f0f..1cde8b4b6 100644 --- a/source/de/anomic/data/ymark/YMarkCrawlStart.java +++ b/source/de/anomic/data/ymark/YMarkCrawlStart.java @@ -98,9 +98,7 @@ public class YMarkCrawlStart extends HashMap{ while(iter.hasNext()) { final byte[] key = iter.next(); final CrawlProfile crawl = crawler.getActive(key); - if (crawl.startURL().equals(this.url)) { - return true; - } + if (crawl != null) return true; } return false; } @@ -175,7 +173,7 @@ public class YMarkCrawlStart extends HashMap{ final int depth, final boolean crawlingQ, final boolean medialink) { final CrawlProfile pe = new CrawlProfile( - (startURL.getHost() == null) ? startURL.toNormalform(true, false) : startURL.getHost(), null, + (startURL.getHost() == null) ? startURL.toNormalform(true, false) : startURL.getHost(), urlMustMatch, urlMustNotMatch, CrawlProfile.MATCH_ALL_STRING,
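A note on the new must-match computation in Crawler_p.java above: when a domain or subpath restriction is active and the user also supplied a must-match pattern, the patch combines the two as "(?=(" + newcrawlingMustMatch + "))(" + siteFilter + ")". A minimal self-contained check of how that lookahead combination behaves — the example patterns below are assumptions for illustration, not values taken from the patch:

    import java.util.regex.Pattern;

    public class CombinedFilterSketch {
        public static void main(final String[] args) {
            final String userMustMatch = ".*\\.html";                      // assumed user-supplied filter
            final String siteFilter = "https?://(www\\.)?example\\.org.*"; // assumed generated site filter
            // The lookahead keeps the user pattern as an additional condition
            // on top of the generated domain/subpath restriction.
            final Pattern p = Pattern.compile("(?=(" + userMustMatch + "))(" + siteFilter + ")");
            System.out.println(p.matcher("http://www.example.org/a.html").matches()); // true
            System.out.println(p.matcher("http://www.example.org/a.pdf").matches());  // false: user filter rejects
            System.out.println(p.matcher("http://example.net/a.html").matches());     // false: site filter rejects
        }
    }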