From 57ffdfad4c01ed79d461a18540bdf839eb263ae3 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 3 Jul 2013 14:50:06 +0200 Subject: [PATCH] added a crawl option to obey html-meta-robots-noindex. This is on by default. --- defaults/yacy.init | 2 ++ htroot/CrawlStartExpert_p.html | 7 +++++-- htroot/CrawlStartExpert_p.java | 2 ++ htroot/CrawlStartSite_p.html | 11 +++++------ htroot/Crawler_p.java | 14 ++++++++++---- htroot/QuickCrawlLink_p.html | 2 +- htroot/QuickCrawlLink_p.java | 12 ++++++------ source/net/yacy/crawler/CrawlStacker.java | 4 ++-- source/net/yacy/crawler/CrawlSwitchboard.java | 16 ++++++++-------- source/net/yacy/crawler/data/CrawlProfile.java | 18 +++++++++++++++++- .../net/yacy/data/ymark/YMarkCrawlStart.java | 2 +- source/net/yacy/search/Switchboard.java | 13 +++++++------ .../search/schema/CollectionConfiguration.java | 9 ++++++--- 13 files changed, 72 insertions(+), 40 deletions(-) diff --git a/defaults/yacy.init b/defaults/yacy.init index c20767fc3..4cb5ce370 100644 --- a/defaults/yacy.init +++ b/defaults/yacy.init @@ -537,6 +537,8 @@ indexMedia=true # URLs are only indexed and further crawled if they match this filter crawlingFilter=.* crawlingQ=true +followFrames=true +obeyHtmlRobotsNoindex=true storeHTCache=true storeTXCache=true diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index cc82de4c4..f3b562e72 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -109,14 +109,17 @@ -
+
info A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops. + Following frames is NOT done by Gxxg1e, but we do by default to have a richer content. 'nofollow' in robots metadata can be overridden; this does not affect obeying of the robots.txt which is never ignored. - + Accept URLs with query-part ('?'):    + Obey html-robots-noindex:
Load Filter on URLs
info diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index 3f9cfd0df..514302541 100644 --- a/htroot/CrawlStartExpert_p.java +++ b/htroot/CrawlStartExpert_p.java @@ -62,6 +62,8 @@ public class CrawlStartExpert_p { prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1"); prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages); prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? "1" : "0"); + prop.put("followFramesChecked", env.getConfigBool("followFrames", true) ? "1" : "0"); + prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? "1" : "0"); prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", true) ? "1" : "0"); prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0"); prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0"); diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index 357e58ef6..cfa384317 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -39,7 +39,7 @@
+ onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;"/>Start URL (must start with
http:// https:// ftp:// smb:// file://) + onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL
Start URL (must start with
http:// https:// ftp:// smb:// file://)

@@ -53,7 +53,7 @@
Sitemap URL

@@ -75,10 +75,6 @@ documents
-
-
- allow query-strings (urls with a '?' in the path) -
@@ -92,6 +88,9 @@ + + + diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index f0a112eca..3c368a95a 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -246,13 +246,19 @@ public class Crawler_p { final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1; env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages)); - boolean crawlingQ = "on".equals(post.get("crawlingQ", "off")); + boolean crawlingQ = "on".equals(post.get("crawlingQ", "on")); env.setConfig("crawlingQ", crawlingQ); + + boolean followFrames = "on".equals(post.get("followFrames", "on")); + env.setConfig("followFrames", followFrames); + + boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "on")); + env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex); - final boolean indexText = "on".equals(post.get("indexText", "off")); + final boolean indexText = "on".equals(post.get("indexText", "on")); env.setConfig("indexText", indexText); - final boolean indexMedia = "on".equals(post.get("indexMedia", "off")); + final boolean indexMedia = "on".equals(post.get("indexMedia", "on")); env.setConfig("indexMedia", indexMedia); env.setConfig("storeHTCache", storeHTCache); @@ -361,7 +367,7 @@ public class Crawler_p { directDocByURL, crawlingIfOlder, crawlingDomMaxPages, - crawlingQ, + crawlingQ, followFrames, obeyHtmlRobotsNoindex, indexText, indexMedia, storeHTCache, diff --git a/htroot/QuickCrawlLink_p.html b/htroot/QuickCrawlLink_p.html index 9d9a9d1b9..b2cfa73d3 100644 --- a/htroot/QuickCrawlLink_p.html +++ b/htroot/QuickCrawlLink_p.html @@ -15,7 +15,7 @@ If you click on it while browsing, the currently viewed website will be inserted into the YaCy crawling queue for indexing.

- Crawl with YaCy + Crawl with YaCy

:: diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 11e81c9df..d21f2e48e 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -97,7 +97,9 @@ public class QuickCrawlLink_p { final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING); final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final int CrawlingDepth = post.getInt("crawlingDepth", 0); - final boolean crawlDynamic = post.get("crawlingQ", "").equals("on"); + final boolean crawlingQ = post.get("crawlingQ", "").equals("on"); + final boolean followFrames = post.get("followFrames", "").equals("on"); + final boolean obeyHtmlRobotsNoindex = post.get("obeyHtmlRobotsNoindex", "").equals("on"); final boolean indexText = post.get("indexText", "off").equals("on"); final boolean indexMedia = post.get("indexMedia", "off").equals("on"); final boolean storeHTCache = post.get("storeHTCache", "").equals("on"); @@ -145,11 +147,9 @@ public class QuickCrawlLink_p { true, 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month -1, // domMaxPages, if negative: no count restriction - crawlDynamic, - indexText, - indexMedia, - storeHTCache, - remoteIndexing, + crawlingQ, followFrames, obeyHtmlRobotsNoindex, + indexText, indexMedia, + storeHTCache, remoteIndexing, CacheStrategy.IFFRESH, collection); sb.crawler.putActive(pe.handle().getBytes(), pe); diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index d97ec6706..daa5a71de 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -407,13 +407,13 @@ public final class CrawlStacker { } // deny cgi - if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual + if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual if (this.log.isFine()) this.log.logFine("URL '" + 
urlstring + "' is CGI URL."); return "individual url (sessionid etc) not wanted"; } // deny post properties - if (url.isPOST() && !(profile.crawlingQ())) { + if (url.isPOST() && !profile.crawlingQ()) { if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is post URL."); return "post url not allowed"; } diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index b56bece92..b7a273e89 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -264,7 +264,7 @@ public final class CrawlSwitchboard { true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, - false, + false, true, true, true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/, true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/, true, @@ -292,7 +292,7 @@ public final class CrawlSwitchboard { false, -1, -1, - true, + true, true, true, true, true, false, @@ -320,7 +320,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, - true, + true, true, true, false, false, true, @@ -348,7 +348,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, - true, + true, true, true, true, true, true, @@ -377,7 +377,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE), -1, - true, + true, true, true, false, false, true, @@ -405,7 +405,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), -1, - true, + true, true, true, false, false, true, @@ -433,7 +433,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), -1, - true, + true, true, true, false, true, true, @@ -461,7 +461,7 @@ public final class CrawlSwitchboard { 
false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), -1, - true, + true, true, true, true, false, false, diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 3d6619306..d1470a734 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -61,6 +61,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String RECRAWL_IF_OLDER = "recrawlIfOlder"; public static final String DOM_MAX_PAGES = "domMaxPages"; public static final String CRAWLING_Q = "crawlingQ"; + public static final String FOLLOW_FRAMES = "followFrames"; + public static final String OBEY_HTML_ROBOTS_NOINDEX = "obeyHtmlRobotsNoindex"; public static final String INDEX_TEXT = "indexText"; public static final String INDEX_MEDIA = "indexMedia"; public static final String STORE_HTCACHE = "storeHTCache"; @@ -127,7 +129,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final boolean directDocByURL, final long recrawlIfOlder /*date*/, final int domMaxPages, - final boolean crawlingQ, + final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex, final boolean indexText, final boolean indexMedia, final boolean storeHTCache, @@ -158,6 +160,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(RECRAWL_IF_OLDER, recrawlIfOlder); put(DOM_MAX_PAGES, domMaxPages); put(CRAWLING_Q, crawlingQ); // crawling of urls with '?' 
+ put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or iframes + put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored put(INDEX_TEXT, indexText); put(INDEX_MEDIA, indexMedia); put(STORE_HTCACHE, storeHTCache); @@ -491,6 +495,18 @@ public class CrawlProfile extends ConcurrentHashMap implements M return (r.equals(Boolean.TRUE.toString())); } + public boolean followFrames() { + final String r = get(FOLLOW_FRAMES); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + + public boolean obeyHtmlRobotsNoindex() { + final String r = get(OBEY_HTML_ROBOTS_NOINDEX); + if (r == null) return false; + return (r.equals(Boolean.TRUE.toString())); + } + public boolean indexText() { final String r = get(INDEX_TEXT); if (r == null) return true; diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index f81c5ac66..f4be6e982 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -183,7 +183,7 @@ public class YMarkCrawlStart extends HashMap{ CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ, - true, true, true, false, + true, true, true, true, true, false, CacheStrategy.IFFRESH, "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); // TODO: make this a default profile in CrawlSwitchboard sb.crawler.putActive(pe.handle().getBytes(), pe); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 3241b54d9..02c0b9b37 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2476,7 +2476,6 @@ public final class Switchboard extends serverSwitch { } final long parsingEndTime = System.currentTimeMillis(); - // put anchors on crawl stack final long stackStartTime = System.currentTimeMillis(); if ((processCase == 
EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) && @@ -2578,7 +2577,7 @@ public final class Switchboard extends serverSwitch { // check which files may take part in the indexing process final List doclist = new ArrayList(); docloop: for (final Document document : in.documents) { - if (document.indexingDenied()) { + if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex()) { if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule"); addURLtoErrorDB( in.queueEntry.url(), @@ -2671,8 +2670,9 @@ public final class Switchboard extends serverSwitch { final DigestURI url = document.dc_source(); final DigestURI referrerURL = queueEntry.referrerURL(); EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash); + CrawlProfile profile = queueEntry.profile(); - if ( condenser == null || document.indexingDenied() ) { + if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) { //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase); addURLtoErrorDB( url, @@ -2684,7 +2684,7 @@ public final class Switchboard extends serverSwitch { return; } - if ( !queueEntry.profile().indexText() && !queueEntry.profile().indexMedia() ) { + if ( !profile.indexText() && !profile.indexMedia() ) { //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name()); addURLtoErrorDB( url, @@ -2695,7 +2695,7 @@ public final class Switchboard extends serverSwitch { "denied by profile rule, process case=" + processCase + ", profile name = " - + queueEntry.profile().collectionName()); + + profile.collectionName()); return; } @@ -2993,7 +2993,8 @@ public final class Switchboard extends serverSwitch { 
final Document[] documents = response.parse(); if (documents != null) { for (final Document document: documents) { - if (document.indexingDenied()) { + final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle())); + if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) { throw new Parser.Failure("indexing is denied", url); } final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 5e2073940..f4c6aba0b 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -493,6 +493,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Map images = new HashMap(); int c = 0; final Object parser = document.getParserObject(); + boolean containsCanonical = false; if (parser instanceof ContentScraper) { final ContentScraper html = (ContentScraper) parser; images = html.getImages(); @@ -715,7 +716,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // canonical tag if (allAttr || contains(CollectionSchema.canonical_s)) { final DigestURI canonical = html.getCanonical(); - if (canonical != null) { + if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) { + containsCanonical = true; inboundLinks.remove(canonical); outboundLinks.remove(canonical); add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false)); @@ -811,10 +813,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); // create a subgraph - //if () { + if (!containsCanonical) { + // a document with 
canonical tag should not get a webgraph relation, because that belongs to the canonical document webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations); webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations); - //} + } // list all links doc.webgraphDocuments.addAll(subgraph.edges);