added a crawl option to obey html-meta-robots-noindex. This is on by default.
pull/1/head
Michael Peter Christen 12 years ago
parent 5a5d411ec0
commit 57ffdfad4c
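
For context, this is the check the new option controls: a page can opt out of indexing via a robots meta tag. A minimal sketch of the idea, assuming a regex shortcut; YaCy's real detection sits in its HTML scraper and Document.indexingDenied(), so the class and method names below are illustrative only.

// Hypothetical sketch: detect <meta name="robots" content="...noindex...">
// and combine it with the new per-profile obeyHtmlRobotsNoindex switch.
// A regex is a simplification of real HTML parsing (it assumes the name
// attribute precedes content).
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public final class MetaRobotsSketch {

    private static final Pattern META_ROBOTS = Pattern.compile(
            "<meta\\s+[^>]*name\\s*=\\s*[\"']robots[\"'][^>]*content\\s*=\\s*[\"']([^\"']*)[\"']",
            Pattern.CASE_INSENSITIVE);

    // true only if the page carries noindex AND the profile obeys it
    public static boolean indexingDenied(final String html, final boolean obeyHtmlRobotsNoindex) {
        if (!obeyHtmlRobotsNoindex) return false; // the new option: ignore meta robots
        final Matcher m = META_ROBOTS.matcher(html);
        return m.find() && m.group(1).toLowerCase().contains("noindex");
    }

    public static void main(final String[] args) {
        final String page = "<html><head><meta name=\"robots\" content=\"noindex,follow\"></head></html>";
        System.out.println(indexingDenied(page, true));  // true  -> skip indexing
        System.out.println(indexingDenied(page, false)); // false -> index anyway
    }
}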

@@ -537,6 +537,8 @@ indexMedia=true
# URLs are only indexed and further crawled if they match this filter
crawlingFilter=.*
crawlingQ=true
followFrames=true
obeyHtmlRobotsNoindex=true
storeHTCache=true
storeTXCache=true

@@ -109,14 +109,17 @@
<input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
</dd>
<dt><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label></dt>
<dt><label for="Constraints">misc. Constraints</label></dt>
<dd>
<span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
A question mark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled.
However, there are sometimes web pages with static content that
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
Following frames is NOT done by Gxxg1e, but we do it by default to get richer content. 'nofollow' in robots metadata can be overridden; this does not affect the obeying of robots.txt, which is never ignored.
</span></span>
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />
Accept URLs with query-part ('?'): <input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />&nbsp;&nbsp;
Obey html-robots-noindex: <input type="checkbox" name="obeyHtmlRobotsNoindex" id="obeyHtmlRobotsNoindex" #(obeyHtmlRobotsNoindexChecked)#::checked="checked"#(/obeyHtmlRobotsNoindexChecked)# /><!--&nbsp;&nbsp;
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />&nbsp;&nbsp;-->
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">

@@ -62,6 +62,8 @@ public class CrawlStartExpert_p {
prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? "1" : "0");
prop.put("followFramesChecked", env.getConfigBool("followFrames", true) ? "1" : "0");
prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? "1" : "0");
prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", true) ? "1" : "0");
prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0");
prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0");

@@ -39,7 +39,7 @@
<dd>
<table border="0" cellpadding="0" cellspacing="0"><tr valign="top">
<td valign="top"><input type="radio" name="crawlingMode" id="url" value="url" checked="checked"
onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;document.getElementById('crawlingQ').disabled=false;"/>Start URL&nbsp;(must start with<br/>http:// https:// ftp:// smb:// file://)</td>
onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;"/>Start URL&nbsp;(must start with<br/>http:// https:// ftp:// smb:// file://)</td>
<td valign="top">
<input name="crawlingURL" id="crawlingURL" type="text" size="50" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" style="font-size:16px"/><br/>
<input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
@@ -53,7 +53,7 @@
<td><div id="sitelistURLs"></div></td>
</tr><tr>
<td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td>
onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL</td>
<td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td>
</tr>
</table><br/>
@@ -75,10 +75,6 @@
<td valign="top">documents</td>
</tr></table>
</dd>
<dt><label>Dynamic URLs</label></dt>
<dd>
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
</dd>
<dt><label>Collection</label></dt>
<dd>
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
@@ -92,6 +88,9 @@
<input type="hidden" name="deleteold" id="deleteold" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="crawlingQ" id="crawlingQ" value="on" />
<input type="hidden" name="followFrames" id="followFrames" value="on" />
<input type="hidden" name="obeyHtmlRobotsNoindex" id="obeyHtmlRobotsNoindex" value="on" />
<input type="hidden" name="indexText" id="indexText" value="on" />
<input type="hidden" name="indexMedia" id="indexMedia" value="on" />
<input type="hidden" name="intention" id="intention" value="" />

@@ -246,13 +246,19 @@ public class Crawler_p {
final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
boolean crawlingQ = "on".equals(post.get("crawlingQ", "on"));
env.setConfig("crawlingQ", crawlingQ);
final boolean indexText = "on".equals(post.get("indexText", "off"));
boolean followFrames = "on".equals(post.get("followFrames", "on"));
env.setConfig("followFrames", followFrames);
boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "on"));
env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex);
final boolean indexText = "on".equals(post.get("indexText", "on"));
env.setConfig("indexText", indexText);
final boolean indexMedia = "on".equals(post.get("indexMedia", "off"));
final boolean indexMedia = "on".equals(post.get("indexMedia", "on"));
env.setConfig("indexMedia", indexMedia);
env.setConfig("storeHTCache", storeHTCache);
@@ -361,7 +367,7 @@ public class Crawler_p {
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ,
crawlingQ, followFrames, obeyHtmlRobotsNoindex,
indexText,
indexMedia,
storeHTCache,

@@ -15,7 +15,7 @@
If you click on it while browsing, the currently viewed website will be inserted into the YaCy crawling queue for indexing.
</p>
<p>
<a class="BookmarkLink" href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?indexText=on&amp;indexMedia=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+escape(location.href),'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();">Crawl with YaCy</a>
<a class="BookmarkLink" href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?indexText=on&amp;indexMedia=on&amp;crawlingQ=on&amp;followFrames=on&amp;obeyHtmlRobotsNoindex=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+escape(location.href),'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();">Crawl with YaCy</a>
</p>
::<!-- 1 -->
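
Decoded, the bookmarklet above opens QuickCrawlLink_p.html in a popup with these query parameters; followFrames and obeyHtmlRobotsNoindex are the two added by this commit:

http://#[host]#:#[port]#/QuickCrawlLink_p.html
    ?indexText=on
    &indexMedia=on
    &crawlingQ=on
    &followFrames=on
    &obeyHtmlRobotsNoindex=on
    &xdstopw=on
    &title=<escaped document.title>
    &url=<escaped location.href>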

@@ -97,7 +97,9 @@ public class QuickCrawlLink_p {
final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final int CrawlingDepth = post.getInt("crawlingDepth", 0);
final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
final boolean crawlingQ = post.get("crawlingQ", "").equals("on");
final boolean followFrames = post.get("followFrames", "").equals("on");
final boolean obeyHtmlRobotsNoindex = post.get("obeyHtmlRobotsNoindex", "").equals("on");
final boolean indexText = post.get("indexText", "off").equals("on");
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
final boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -145,11 +147,9 @@ public class QuickCrawlLink_p {
true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domMaxPages, if negative: no count restriction
crawlDynamic,
indexText,
indexMedia,
storeHTCache,
remoteIndexing,
crawlingQ, followFrames, obeyHtmlRobotsNoindex,
indexText, indexMedia,
storeHTCache, remoteIndexing,
CacheStrategy.IFFRESH,
collection);
sb.crawler.putActive(pe.handle().getBytes(), pe);

@@ -407,13 +407,13 @@ public final class CrawlStacker {
}
// deny cgi
if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual
if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is CGI URL.");
return "individual url (sessionid etc) not wanted";
}
// deny post properties
if (url.isPOST() && !(profile.crawlingQ())) {
if (url.isPOST() && !profile.crawlingQ()) {
if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is post URL.");
return "post url not allowed";
}
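
The only change in these two checks is dropping redundant parentheses; both rejections apply only while the profile's crawlingQ flag is off. Condensed into a standalone sketch, with the results of DigestURI's isIndividual() and isPOST() passed in as plain booleans:

public final class CrawlStackerSketch {
    // returns a rejection reason, or null if the URL passes these two checks
    static String deny(final boolean crawlingQ, final boolean isIndividual, final boolean isPOST) {
        if (isIndividual && !crawlingQ) return "individual url (sessionid etc) not wanted";
        if (isPOST && !crawlingQ) return "post url not allowed";
        return null;
    }

    public static void main(final String[] args) {
        System.out.println(deny(false, true, false)); // session-id style URL, crawlingQ off -> rejected
        System.out.println(deny(true, true, false));  // same URL, crawlingQ on -> null (accepted)
    }
}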

@@ -264,7 +264,7 @@ public final class CrawlSwitchboard {
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false,
false, true, true,
true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
true,
@@ -292,7 +292,7 @@ public final class CrawlSwitchboard {
false,
-1,
-1,
true,
true, true, true,
true,
true,
false,
@@ -320,7 +320,7 @@
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
-1,
true,
true, true, true,
false,
false,
true,
@@ -348,7 +348,7 @@
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
-1,
true,
true, true, true,
true,
true,
true,
@@ -377,7 +377,7 @@
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
-1,
true,
true, true, true,
false,
false,
true,
@@ -405,7 +405,7 @@
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
true, true, true,
false,
false,
true,
@@ -433,7 +433,7 @@
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
-1,
true,
true, true, true,
false,
true,
true,
@@ -461,7 +461,7 @@
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
-1,
true,
true, true, true,
true,
false,
false,
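
All of these default-profile constructors follow the same pattern: the former single crawlingQ argument grows into the triple crawlingQ, followFrames, obeyHtmlRobotsNoindex, matching the constructor signature in the CrawlProfile hunk below. Named for readability (illustrative only):

public final class ProfileFlagTriples {
    public static void main(final String[] args) {
        // proxy profile above ("false, true, true,"): no dynamic URLs,
        // but frames are followed and meta-robots noindex is obeyed
        final boolean[] proxy = { false, true, true }; // crawlingQ, followFrames, obeyHtmlRobotsNoindex
        // the remote, snippet and surrogate profiles ("true, true, true,"): all three on
        final boolean[] rest = { true, true, true };
        System.out.println(java.util.Arrays.toString(proxy) + " / " + java.util.Arrays.toString(rest));
    }
}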

@@ -61,6 +61,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
public static final String DOM_MAX_PAGES = "domMaxPages";
public static final String CRAWLING_Q = "crawlingQ";
public static final String FOLLOW_FRAMES = "followFrames";
public static final String OBEY_HTML_ROBOTS_NOINDEX = "obeyHtmlRobotsNoindex";
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
@@ -127,7 +129,7 @@
final boolean directDocByURL,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ,
final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex,
final boolean indexText,
final boolean indexMedia,
final boolean storeHTCache,
@@ -158,6 +160,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(RECRAWL_IF_OLDER, recrawlIfOlder);
put(DOM_MAX_PAGES, domMaxPages);
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or iframes
put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
@@ -491,6 +495,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString()));
}
public boolean followFrames() {
final String r = get(FOLLOW_FRAMES);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean obeyHtmlRobotsNoindex() {
final String r = get(OBEY_HTML_ROBOTS_NOINDEX);
if (r == null) return false;
return (r.equals(Boolean.TRUE.toString()));
}
public boolean indexText() {
final String r = get(INDEX_TEXT);
if (r == null) return true;
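
Note that both new accessors fall back to false when the key is absent, e.g. in a profile persisted before this commit, whereas indexText() above defaults to true. The map-backed accessor pattern in miniature:

import java.util.concurrent.ConcurrentHashMap;

public final class ProfileAccessorSketch extends ConcurrentHashMap<String, String> {
    public boolean obeyHtmlRobotsNoindex() {
        final String r = get("obeyHtmlRobotsNoindex");
        if (r == null) return false; // key absent: e.g. a profile from an older version
        return r.equals(Boolean.TRUE.toString());
    }

    public static void main(final String[] args) {
        final ProfileAccessorSketch legacy = new ProfileAccessorSketch();
        System.out.println(legacy.obeyHtmlRobotsNoindex()); // false: old profiles ignore noindex
        legacy.put("obeyHtmlRobotsNoindex", "true");
        System.out.println(legacy.obeyHtmlRobotsNoindex()); // true
    }
}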

@@ -183,7 +183,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
crawlingQ,
true, true, true, false,
true, true, true, true, true, false,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); // TODO: make this a default profile in CrawlSwitchboard
sb.crawler.putActive(pe.handle().getBytes(), pe);

@@ -2476,7 +2476,6 @@ public final class Switchboard extends serverSwitch {
}
final long parsingEndTime = System.currentTimeMillis();
// put anchors on crawl stack
final long stackStartTime = System.currentTimeMillis();
if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
@@ -2578,7 +2577,7 @@
// check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<Document>();
docloop: for (final Document document : in.documents) {
if (document.indexingDenied()) {
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex()) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
@@ -2671,8 +2670,9 @@
final DigestURI url = document.dc_source();
final DigestURI referrerURL = queueEntry.referrerURL();
EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
CrawlProfile profile = queueEntry.profile();
if ( condenser == null || document.indexingDenied() ) {
if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(
url,
@@ -2684,7 +2684,7 @@
return;
}
if ( !queueEntry.profile().indexText() && !queueEntry.profile().indexMedia() ) {
if ( !profile.indexText() && !profile.indexMedia() ) {
//if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
addURLtoErrorDB(
url,
@@ -2695,7 +2695,7 @@
"denied by profile rule, process case="
+ processCase
+ ", profile name = "
+ queueEntry.profile().collectionName());
+ profile.collectionName());
return;
}
@@ -2993,7 +2993,8 @@
final Document[] documents = response.parse();
if (documents != null) {
for (final Document document: documents) {
if (document.indexingDenied()) {
final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
throw new Parser.Failure("indexing is denied", url);
}
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);

@@ -493,6 +493,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
int c = 0;
final Object parser = document.getParserObject();
boolean containsCanonical = false;
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
images = html.getImages();
@@ -715,7 +716,8 @@
// canonical tag
if (allAttr || contains(CollectionSchema.canonical_s)) {
final DigestURI canonical = html.getCanonical();
if (canonical != null) {
if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
containsCanonical = true;
inboundLinks.remove(canonical);
outboundLinks.remove(canonical);
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
@@ -811,10 +813,11 @@
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
// create a subgraph
//if () {
if (!containsCanonical) {
// a document with a canonical tag should not get a webgraph relation, because that belongs to the canonical document
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations);
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations);
//}
}
// list all links
doc.webgraphDocuments.addAll(subgraph.edges);
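
The second change in this file is independent of the noindex option: a document whose canonical URL has a different hash than its own id no longer contributes webgraph edges, since those belong to the canonical target. The gate in miniature (the hash strings stand in for ASCII.String(url.hash())):

public final class CanonicalGateSketch {
    // should this document contribute edges to the webgraph?
    static boolean addWebgraphEdges(final String docId, final String canonicalHash) {
        // canonicalHash == null means the page has no canonical tag at all
        final boolean containsCanonical = canonicalHash != null && !canonicalHash.equals(docId);
        return !containsCanonical; // only the canonical document itself gets edges
    }

    public static void main(final String[] args) {
        System.out.println(addWebgraphEdges("abc", null));  // true: no canonical tag
        System.out.println(addWebgraphEdges("abc", "abc")); // true: self-canonical
        System.out.println(addWebgraphEdges("abc", "xyz")); // false: edges belong to xyz
    }
}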
