From abebb3b124759439ac8ac46604fb71a64efbdafe Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Wed, 10 Oct 2012 02:02:17 +0200
Subject: [PATCH] added a crawl start checker which makes a simple analysis on
 the list of all given urls: shows if the url can be loaded and if there is a
 robots and/or a sitemap.

---
 htroot/CrawlCheck_p.html                        |  74 ++++++++++
 htroot/CrawlCheck_p.java                        | 132 ++++++++++++++++++
 htroot/Crawler_p.java                           |   6 -
 .../env/templates/submenuIndexCreate.template   |   8 ++
 4 files changed, 214 insertions(+), 6 deletions(-)
 create mode 100644 htroot/CrawlCheck_p.html
 create mode 100644 htroot/CrawlCheck_p.java

diff --git a/htroot/CrawlCheck_p.html b/htroot/CrawlCheck_p.html
new file mode 100644
index 000000000..7635f54aa
--- /dev/null
+++ b/htroot/CrawlCheck_p.html
@@ -0,0 +1,74 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+  <title>YaCy '#[clientname]#': Crawl Start</title>
+  #%env/templates/metas.template%#
+</head>
+<body id="CrawlCheck">
+  #%env/templates/header.template%#
+  #%env/templates/submenuIndexCreate.template%#
+
+  <h2>Crawl Check</h2>
+  <p>This page gives you an analysis of the possible success of a web crawl on the given addresses.</p>
+
+  <form id="CrawlCheck" action="CrawlCheck_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
+    <fieldset>
+      <legend>Crawl Check</legend>
+      <dl>
+        <dt><label for="crawlingURLs">Start URLs (one per line)</label></dt>
+        <dd><textarea name="crawlingURLs" id="crawlingURLs" rows="10" cols="64">#[starturls]#</textarea></dd>
+        <dt></dt>
+        <dd><input type="submit" name="crawlcheck" value="Check" /></dd>
+      </dl>
+    </fieldset>
+  </form>
+
+  #(table)#::
+  <fieldset>
+    <legend>Analysis</legend>
+    <table border="0" cellpadding="2" cellspacing="1">
+      <tr class="TableHeader">
+        <td>URL</td>
+        <td>Access</td>
+        <td>Robots</td>
+        <td>Crawl-Delay</td>
+        <td>Sitemap</td>
+      </tr>
+      #{list}#
+      <tr class="TableCellLight">
+        <td>#[url]#</td>
+        <td>#[access]#</td>
+        <td>#[robots]#</td>
+        <td>#[crawldelay]#</td>
+        <td>#[sitemap]#</td>
+      </tr>
+      #{/list}#
+    </table>
+  </fieldset>
+  #(/table)#
+
+  #%env/templates/footer.template%#
+</body>
+</html>
diff --git a/htroot/CrawlCheck_p.java b/htroot/CrawlCheck_p.java
new file mode 100644
index 000000000..3f53425d2
--- /dev/null
+++ b/htroot/CrawlCheck_p.java
@@ -0,0 +1,132 @@
+/**
+ * CrawlCheck_p
+ * Copyright 2012 by Michael Peter Christen
+ * First released 10.10.2012 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+import net.yacy.cora.federate.yacy.CacheStrategy;
+import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.crawler.data.CrawlQueues;
+import net.yacy.crawler.retrieval.Request;
+import net.yacy.crawler.retrieval.Response;
+import net.yacy.crawler.robots.RobotsTxtEntry;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
+import net.yacy.repository.Blacklist.BlacklistType;
+import net.yacy.search.Switchboard;
+import net.yacy.server.serverObjects;
+import net.yacy.server.serverSwitch;
+
+
+public class CrawlCheck_p {
+
+    public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) {
+        final Switchboard sb = (Switchboard) env;
+        final serverObjects prop = new serverObjects();
+        prop.put("starturls", "");
+        if (post == null) return prop;
+
+        if (post.containsKey("crawlcheck")) {
+
+            // get the list of root URLs for this crawl start
+            Set<DigestURI> rootURLs = new HashSet<DigestURI>();
+            String crawlingStart0 = post.get("crawlingURLs","").trim();
+            String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ?
+                    crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|"));
+            for (String crawlingStart: rootURLs0) {
+                if (crawlingStart == null || crawlingStart.length() == 0) continue;
+                // add the prefix http:// if necessary
+                int pos = crawlingStart.indexOf("://",0);
+                if (pos == -1) {
+                    if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart;
+                    if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart;
+                }
+                try {
+                    DigestURI crawlingStartURL = new DigestURI(crawlingStart);
+                    rootURLs.add(crawlingStartURL);
+                } catch (MalformedURLException e) {
+                    Log.logException(e);
+                }
+            }
+
+            if (rootURLs.size() == 0) {
+                prop.put("table", 0);
+            } else {
+                prop.put("table", 1);
+
+                // make a string that is used to fill the starturls field again
+                // and analyze the urls to make the table rows
+                StringBuilder s = new StringBuilder(300);
+                int row = 0;
+                for (DigestURI u: rootURLs) {
+                    s.append(u.toNormalform(true, true)).append('\n');
+                    prop.put("table_list_" + row + "_url", u.toNormalform(true, true));
+
+                    // try to load the robots
+                    RobotsTxtEntry robotsEntry;
+                    boolean robotsAllowed = true;
+                    try {
+                        robotsEntry = sb.robots.getEntry(u, sb.peers.myBotIDs());
+                        if (robotsEntry == null) {
+                            prop.put("table_list_" + row + "_robots", "no robots");
+                            prop.put("table_list_" + row + "_crawldelay", CrawlQueues.queuedMinLoadDelay + " ms");
+                            prop.put("table_list_" + row + "_sitemap", "");
+                        } else {
+                            robotsAllowed = !robotsEntry.isDisallowed(u);
+                            prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed"));
+                            prop.put("table_list_" + row + "_crawldelay", Math.max(CrawlQueues.queuedMinLoadDelay, robotsEntry.getCrawlDelayMillis()) + " ms");
+                            prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ?
"-" : robotsEntry.getSitemap().toNormalform(true, true)); + } + } catch (final IOException e) { + } + + // try to load the url + if (robotsAllowed) try { + Request request = sb.loader.request(u, true, false); + final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, CrawlQueues.queuedMinLoadDelay); + if (response == null) { + prop.put("table_list_" + row + "_access", "no response"); + } else { + if (response.getResponseHeader().getStatusCode() == 200) { + prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + response.lastModified()); + } else { + prop.put("table_list_" + row + "_access", response.getResponseHeader().getStatusCode() + " - load failed"); + } + } + } catch (final IOException e) { + prop.put("table_list_" + row + "_access", "error response: " + e.getMessage()); + } else { + prop.put("table_list_" + row + "_access", "not loaded - prevented by robots.txt"); + } + row++; + + } + prop.put("table_list", row); + prop.put("starturls", s.toString()); + + } + } + + return prop; + } + +} diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index a7ee8a840..ba4d81ca0 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -3,12 +3,6 @@ // first published 18.12.2006 on http://www.anomic.de // this file was created using the an implementation from IndexCreate_p.java, published 02.12.2004 // -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// // LICENSE // // This program is free software; you can redistribute it and/or modify diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index 6387127b6..78db4173f 100644 --- a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -36,4 +36,12 @@
   • Dump Reader for
     MediaWiki dumps
   •
+  • <a href="CrawlCheck_p.html" class="MenuItemLink lock">Crawl Check</a>
+
\ No newline at end of file
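
A minimal, hypothetical smoke test for the servlet introduced above; it is not part of the commit. It calls CrawlCheck_p.html with the same parameters the form submits, which YaCy servlets generally also accept as GET query parameters. The parameter names crawlingURLs and crawlcheck are taken from CrawlCheck_p.java; the host localhost:8090 (YaCy's default port), the example URLs, the class name and the output filtering are assumptions, and since *_p pages are protected, a peer without password-free localhost admin access will additionally require credentials.

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;

public class CrawlCheckSmokeTest {

    public static void main(String[] args) throws Exception {
        // the servlet splits the list on newlines, or on '|' if no newline is present
        String urls = "http://example.org\nhttp://example.net";
        // 'crawlcheck' only needs to be present as a key to trigger the analysis
        String query = "crawlcheck=&crawlingURLs=" + URLEncoder.encode(urls, "UTF-8");
        URL target = new URL("http://localhost:8090/CrawlCheck_p.html?" + query);

        HttpURLConnection con = (HttpURLConnection) target.openConnection();
        System.out.println("HTTP " + con.getResponseCode());

        // crude filter: print the rendered analysis rows; a real test would parse the table
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(con.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.contains("robots") || line.contains("last-modified") || line.contains("no response")) {
                    System.out.println(line);
                }
            }
        }
    }
}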