/** * CrawlCheck_p * Copyright 2012 by Michael Peter Christen * First released 10.10.2011 at http://yacy.net * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General private * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public License * along with this program in the file lgpl21.txt * If not, see . */ import java.io.IOException; import java.net.MalformedURLException; import java.util.HashSet; import java.util.Set; import java.util.regex.Pattern; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.crawler.robots.RobotsTxtEntry; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; public class CrawlCheck_p { public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); prop.put("starturls", ""); if (post == null) return prop; if (post.containsKey("crawlcheck")) { // get the list of rootURls for this crawl start Set rootURLs = new HashSet(); String crawlingStart0 = post.get("crawlingURLs","").trim(); String[] rootURLs0 = crawlingStart0.indexOf('\n') > 0 || crawlingStart0.indexOf('\r') > 0 ? crawlingStart0.split("[\\r\\n]+") : crawlingStart0.split(Pattern.quote("|")); for (String crawlingStart: rootURLs0) { if (crawlingStart == null || crawlingStart.length() == 0) continue; // add the prefix http:// if necessary int pos = crawlingStart.indexOf("://",0); if (pos == -1) { if (crawlingStart.startsWith("www")) crawlingStart = "http://" + crawlingStart; if (crawlingStart.startsWith("ftp")) crawlingStart = "ftp://" + crawlingStart; } try { DigestURI crawlingStartURL = new DigestURI(crawlingStart); rootURLs.add(crawlingStartURL); } catch (final MalformedURLException e) { ConcurrentLog.logException(e); } } if (rootURLs.size() == 0) { prop.put("table", 0); } else { prop.put("table", 1); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); // make a string that is used to fill the starturls field again // and analyze the urls to make the table rows StringBuilder s = new StringBuilder(300); int row = 0; for (DigestURI u: rootURLs) { s.append(u.toNormalform(true)).append('\n'); prop.put("table_list_" + row + "_url", u.toNormalform(true)); // try to load the robots RobotsTxtEntry robotsEntry; boolean robotsAllowed = true; robotsEntry = sb.robots.getEntry(u, agent); if (robotsEntry == null) { prop.put("table_list_" + row + "_robots", "no robots"); prop.put("table_list_" + row + "_crawldelay", agent.minimumDelta + " ms"); prop.put("table_list_" + row + "_sitemap", ""); } else { robotsAllowed = !robotsEntry.isDisallowed(u); prop.put("table_list_" + row + "_robots", "robots exist: " + (robotsAllowed ? "crawl allowed" : "url disallowed")); prop.put("table_list_" + row + "_crawldelay", Math.max(agent.minimumDelta, robotsEntry.getCrawlDelayMillis()) + " ms"); prop.put("table_list_" + row + "_sitemap", robotsEntry.getSitemap() == null ? "-" : robotsEntry.getSitemap().toNormalform(true)); } // try to load the url if (robotsAllowed) try { Request request = sb.loader.request(u, true, false); final Response response = sb.loader.load(request, CacheStrategy.NOCACHE, BlacklistType.CRAWLER, agent); if (response == null) { prop.put("table_list_" + row + "_access", "no response"); } else { if (response.getResponseHeader().getStatusCode() == 200) { prop.put("table_list_" + row + "_access", "200 ok, last-modified = " + response.lastModified()); } else { prop.put("table_list_" + row + "_access", response.getResponseHeader().getStatusCode() + " - load failed"); } } } catch (final IOException e) { prop.put("table_list_" + row + "_access", "error response: " + e.getMessage()); } else { prop.put("table_list_" + row + "_access", "not loaded - prevented by robots.txt"); } row++; } prop.put("table_list", row); prop.put("starturls", s.toString()); } } return prop; } }