added a crawl option to obey html-meta-robots-noindex. This is on by default.
Michael Peter Christen 12 years ago
parent 5a5d411ec0
commit 57ffdfad4c
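
In short: when a fetched page carries `<meta name="robots" content="noindex">`, YaCy's parser marks it via `Document.indexingDenied()`, and the new per-profile switch decides whether that marker is honored. A minimal, hypothetical sketch of the gating rule introduced here (plain booleans stand in for the YaCy `Document` and `CrawlProfile` objects; this is not code from the commit):

```java
// Hypothetical illustration of the new gate; not code from this commit.
public class NoindexGateDemo {
    public static void main(String[] args) {
        // true when the page contains <meta name="robots" content="noindex">
        boolean indexingDenied = true;
        // the new profile switch, on by default (obeyHtmlRobotsNoindex=true)
        boolean obeyHtmlRobotsNoindex = true;
        boolean skip = indexingDenied && obeyHtmlRobotsNoindex;
        System.out.println(skip ? "page is not indexed" : "noindex meta is overridden; page is indexed");
    }
}
```

robots.txt is evaluated separately and is never overridden by this switch.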

@@ -537,6 +537,8 @@ indexMedia=true
 # URLs are only indexed and further crawled if they match this filter
 crawlingFilter=.*
 crawlingQ=true
+followFrames=true
+obeyHtmlRobotsNoindex=true
 storeHTCache=true
 storeTXCache=true

@@ -109,14 +109,17 @@
 <input name="crawlingDomMaxPages" id="crawlingDomMaxPages" type="text" size="6" maxlength="6" value="#[crawlingDomMaxPages]#" />
 </dd>
-<dt><label for="crawlingQ">Accept URLs with '?' / dynamic URLs</label></dt>
+<dt><label for="Constraints">misc. Constraints</label></dt>
 <dd>
 <span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">
 A questionmark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled.
 However, there are sometimes web pages with static content that
 is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
+Following frames is NOT done by Gxxg1e, but we do by default to have a richer content. 'nofollow' in robots metadata can be overridden; this does not affect obeying of the robots.txt which is never ignored.
 </span></span>
-<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />
+Accept URLs with query-part ('?'): <input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />&nbsp;&nbsp;
+Obey html-robots-noindex: <input type="checkbox" name="obeyHtmlRobotsNoindex" id="obeyHtmlRobotsNoindex" #(obeyHtmlRobotsNoindexChecked)#::checked="checked"#(/obeyHtmlRobotsNoindexChecked)# /><!--&nbsp;&nbsp;
+Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />&nbsp;&nbsp;-->
 </dd>
 <dt>Load Filter on URLs</dt>
 <dd><span class="info" style="float:right"><img src="/env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">

@@ -62,6 +62,8 @@ public class CrawlStartExpert_p {
 prop.put("crawlingDomMaxCheck", (crawlingDomMaxPages == -1) ? "0" : "1");
 prop.put("crawlingDomMaxPages", (crawlingDomMaxPages == -1) ? 10000 : crawlingDomMaxPages);
 prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? "1" : "0");
+prop.put("followFramesChecked", env.getConfigBool("followFrames", true) ? "1" : "0");
+prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? "1" : "0");
 prop.put("storeHTCacheChecked", env.getConfigBool("storeHTCache", true) ? "1" : "0");
 prop.put("indexingTextChecked", env.getConfigBool("indexText", true) ? "1" : "0");
 prop.put("indexingMediaChecked", env.getConfigBool("indexMedia", true) ? "1" : "0");

@@ -39,7 +39,7 @@
 <dd>
 <table border="0" cellpadding="0" cellspacing="0"><tr valign="top">
 <td valign="top"><input type="radio" name="crawlingMode" id="url" value="url" checked="checked"
-onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;document.getElementById('crawlingQ').disabled=false;"/>Start URL&nbsp;(must start with<br/>http:// https:// ftp:// smb:// file://)</td>
+onmousedown="document.getElementById('rangeDomain').disabled=false;document.getElementById('rangeSubpath').disabled=false;document.getElementById('crawlingDomMaxCheck').disabled=false;document.getElementById('crawlingDomMaxPages').disabled=false;"/>Start URL&nbsp;(must start with<br/>http:// https:// ftp:// smb:// file://)</td>
 <td valign="top">
 <input name="crawlingURL" id="crawlingURL" type="text" size="50" maxlength="256" value="#[starturl]#" onkeypress="changed()" onfocus="check('url')" style="font-size:16px"/><br/>
 <input name="bookmarkTitle" id="bookmarkTitle" type="text" size="50" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/>
@@ -53,7 +53,7 @@
 <td><div id="sitelistURLs"></div></td>
 </tr><tr>
 <td><input type="radio" name="crawlingMode" id="sitemap" value="sitemap" disabled="disabled"
-onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;document.getElementById('crawlingQ').disabled=true;"/>Sitemap URL</td>
+onmousedown="document.getElementById('rangeDomain').disabled=true;document.getElementById('rangeSubpath').disabled=true;document.getElementById('crawlingDomMaxCheck').disabled=true;document.getElementById('crawlingDomMaxPages').disabled=true;"/>Sitemap URL</td>
 <td><input name="sitemapURL" type="text" size="41" maxlength="256" value="" readonly="readonly" style="background:transparent; border:0px"/></td>
 </tr>
 </table><br/>
@@ -75,10 +75,6 @@
 <td valign="top">documents</td>
 </tr></table>
 </dd>
-<dt><label>Dynamic URLs</label></dt>
-<dd>
-<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
-</dd>
 <dt><label>Collection</label></dt>
 <dd>
 <input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
@@ -92,6 +88,9 @@
 <input type="hidden" name="deleteold" id="deleteold" value="on" />
 <input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
 <input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
+<input type="hidden" name="crawlingQ" id="crawlingQ" value="on" />
+<input type="hidden" name="followFrames" id="followFrames" value="on" />
+<input type="hidden" name="obeyHtmlRobotsNoindex" id="obeyHtmlRobotsNoindex" value="on" />
 <input type="hidden" name="indexText" id="indexText" value="on" />
 <input type="hidden" name="indexMedia" id="indexMedia" value="on" />
 <input type="hidden" name="intention" id="intention" value="" />

@@ -246,13 +246,19 @@ public class Crawler_p {
 final int crawlingDomMaxPages = (crawlingDomMaxCheck) ? post.getInt("crawlingDomMaxPages", -1) : -1;
 env.setConfig("crawlingDomMaxPages", Integer.toString(crawlingDomMaxPages));
-boolean crawlingQ = "on".equals(post.get("crawlingQ", "off"));
+boolean crawlingQ = "on".equals(post.get("crawlingQ", "on"));
 env.setConfig("crawlingQ", crawlingQ);
+boolean followFrames = "on".equals(post.get("followFrames", "on"));
+env.setConfig("followFrames", followFrames);
+boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "on"));
+env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex);
-final boolean indexText = "on".equals(post.get("indexText", "off"));
+final boolean indexText = "on".equals(post.get("indexText", "on"));
 env.setConfig("indexText", indexText);
-final boolean indexMedia = "on".equals(post.get("indexMedia", "off"));
+final boolean indexMedia = "on".equals(post.get("indexMedia", "on"));
 env.setConfig("indexMedia", indexMedia);
 env.setConfig("storeHTCache", storeHTCache);
@@ -361,7 +367,7 @@ public class Crawler_p {
 directDocByURL,
 crawlingIfOlder,
 crawlingDomMaxPages,
-crawlingQ,
+crawlingQ, followFrames, obeyHtmlRobotsNoindex,
 indexText,
 indexMedia,
 storeHTCache,

@@ -15,7 +15,7 @@
 If you click on it while browsing, the currently viewed website will be inserted into the YaCy crawling queue for indexing.
 </p>
 <p>
-<a class="BookmarkLink" href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?indexText=on&amp;indexMedia=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+escape(location.href),'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();">Crawl with YaCy</a>
+<a class="BookmarkLink" href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?indexText=on&amp;indexMedia=on&amp;crawlingQ=on&amp;followFrames=on&amp;obeyHtmlRobotsNoindex=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+escape(location.href),'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();">Crawl with YaCy</a>
 </p>
 ::<!-- 1 -->

@@ -97,7 +97,9 @@ public class QuickCrawlLink_p {
 final String crawlingMustMatch = post.get("mustmatch", CrawlProfile.MATCH_ALL_STRING);
 final String crawlingMustNotMatch = post.get("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
 final int CrawlingDepth = post.getInt("crawlingDepth", 0);
-final boolean crawlDynamic = post.get("crawlingQ", "").equals("on");
+final boolean crawlingQ = post.get("crawlingQ", "").equals("on");
+final boolean followFrames = post.get("followFrames", "").equals("on");
+final boolean obeyHtmlRobotsNoindex = post.get("obeyHtmlRobotsNoindex", "").equals("on");
 final boolean indexText = post.get("indexText", "off").equals("on");
 final boolean indexMedia = post.get("indexMedia", "off").equals("on");
 final boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -145,11 +147,9 @@ public class QuickCrawlLink_p {
 true,
 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
 -1, // domMaxPages, if negative: no count restriction
-crawlDynamic,
-indexText,
-indexMedia,
-storeHTCache,
-remoteIndexing,
+crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+indexText, indexMedia,
+storeHTCache, remoteIndexing,
 CacheStrategy.IFFRESH,
 collection);
 sb.crawler.putActive(pe.handle().getBytes(), pe);

@@ -407,13 +407,13 @@ public final class CrawlStacker {
 }
 // deny cgi
-if (url.isIndividual() && !(profile.crawlingQ())) { // TODO: make special property for crawlingIndividual
+if (url.isIndividual() && !profile.crawlingQ()) { // TODO: make special property for crawlingIndividual
 if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is CGI URL.");
 return "individual url (sessionid etc) not wanted";
 }
 // deny post properties
-if (url.isPOST() && !(profile.crawlingQ())) {
+if (url.isPOST() && !profile.crawlingQ()) {
 if (this.log.isFine()) this.log.logFine("URL '" + urlstring + "' is post URL.");
 return "post url not allowed";
 }
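
For context, `crawlingQ` gates URLs that look dynamic (query part, session ids) and POST-style URLs, as the two checks above show. A hypothetical, heavily simplified version of that filter (a string test stands in for `url.isIndividual()`; not the real CrawlStacker logic):

```java
// Hypothetical simplification of the CrawlStacker check above.
public class CrawlingQDemo {
    static String check(String urlstring, boolean crawlingQ) {
        boolean individual = urlstring.contains("?"); // crude stand-in for url.isIndividual()
        if (individual && !crawlingQ) return "individual url (sessionid etc) not wanted";
        return "accepted";
    }
    public static void main(String[] args) {
        System.out.println(check("http://example.org/page?sid=42", false)); // rejected
        System.out.println(check("http://example.org/page?sid=42", true));  // accepted
    }
}
```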

@@ -264,7 +264,7 @@ public final class CrawlSwitchboard {
 true,
 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
 -1,
-false,
+false, true, true,
 true /*getConfigBool(PROXY_INDEXING_LOCAL_TEXT, true)*/,
 true /*getConfigBool(PROXY_INDEXING_LOCAL_MEDIA, true)*/,
 true,
@@ -292,7 +292,7 @@
 false,
 -1,
 -1,
-true,
+true, true, true,
 true,
 true,
 false,
@@ -320,7 +320,7 @@
 false,
 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
 -1,
-true,
+true, true, true,
 false,
 false,
 true,
@@ -348,7 +348,7 @@
 false,
 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
 -1,
-true,
+true, true, true,
 true,
 true,
 true,
@@ -377,7 +377,7 @@
 false,
 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
 -1,
-true,
+true, true, true,
 false,
 false,
 true,
@@ -405,7 +405,7 @@
 false,
 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
 -1,
-true,
+true, true, true,
 false,
 false,
 true,
@@ -433,7 +433,7 @@
 false,
 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
 -1,
-true,
+true, true, true,
 false,
 true,
 true,
@@ -461,7 +461,7 @@
 false,
 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
 -1,
-true,
+true, true, true,
 true,
 false,
 false,

@@ -61,6 +61,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 public static final String RECRAWL_IF_OLDER = "recrawlIfOlder";
 public static final String DOM_MAX_PAGES = "domMaxPages";
 public static final String CRAWLING_Q = "crawlingQ";
+public static final String FOLLOW_FRAMES = "followFrames";
+public static final String OBEY_HTML_ROBOTS_NOINDEX = "obeyHtmlRobotsNoindex";
 public static final String INDEX_TEXT = "indexText";
 public static final String INDEX_MEDIA = "indexMedia";
 public static final String STORE_HTCACHE = "storeHTCache";
@@ -127,7 +129,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 final boolean directDocByURL,
 final long recrawlIfOlder /*date*/,
 final int domMaxPages,
-final boolean crawlingQ,
+final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex,
 final boolean indexText,
 final boolean indexMedia,
 final boolean storeHTCache,
@@ -158,6 +160,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 put(RECRAWL_IF_OLDER, recrawlIfOlder);
 put(DOM_MAX_PAGES, domMaxPages);
 put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
+put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or iframes
+put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
 put(INDEX_TEXT, indexText);
 put(INDEX_MEDIA, indexMedia);
 put(STORE_HTCACHE, storeHTCache);
@@ -491,6 +495,18 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
 return (r.equals(Boolean.TRUE.toString()));
 }
+public boolean followFrames() {
+    final String r = get(FOLLOW_FRAMES);
+    if (r == null) return false;
+    return (r.equals(Boolean.TRUE.toString()));
+}
+
+public boolean obeyHtmlRobotsNoindex() {
+    final String r = get(OBEY_HTML_ROBOTS_NOINDEX);
+    if (r == null) return false;
+    return (r.equals(Boolean.TRUE.toString()));
+}
+
 public boolean indexText() {
 final String r = get(INDEX_TEXT);
 if (r == null) return true;
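
The two accessors added above follow the existing pattern in `CrawlProfile`, which stores every option as a string in a map (the class extends `ConcurrentHashMap<String, String>`). A self-contained sketch of that lookup logic, using a plain map instead of a real `CrawlProfile` (names are illustrative):

```java
import java.util.concurrent.ConcurrentHashMap;

// Hypothetical sketch of the map-backed flag lookup used by followFrames()
// and obeyHtmlRobotsNoindex(); a missing key yields false.
public class ProfileFlagDemo {
    public static void main(String[] args) {
        ConcurrentHashMap<String, String> profile = new ConcurrentHashMap<>();
        profile.put("obeyHtmlRobotsNoindex", Boolean.TRUE.toString());
        String r = profile.get("obeyHtmlRobotsNoindex");
        boolean obey = (r != null) && r.equals(Boolean.TRUE.toString());
        System.out.println("obeyHtmlRobotsNoindex=" + obey);                      // true
        System.out.println("followFrames=" + (profile.get("followFrames") != null)); // false: key absent
    }
}
```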

@@ -183,7 +183,7 @@ public class YMarkCrawlStart extends HashMap<String,String>{
 CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
 -1,
 crawlingQ,
-true, true, true, false,
+true, true, true, true, true, false,
 CacheStrategy.IFFRESH,
 "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA); // TODO: make this a default profile in CrawlSwitchboard
 sb.crawler.putActive(pe.handle().getBytes(), pe);

@@ -2476,7 +2476,6 @@ public final class Switchboard extends serverSwitch {
 }
 final long parsingEndTime = System.currentTimeMillis();
-
 // put anchors on crawl stack
 final long stackStartTime = System.currentTimeMillis();
 if ((processCase == EventOrigin.PROXY_LOAD || processCase == EventOrigin.LOCAL_CRAWLING) &&
@@ -2578,7 +2577,7 @@ public final class Switchboard extends serverSwitch {
 // check which files may take part in the indexing process
 final List<Document> doclist = new ArrayList<Document>();
 docloop: for (final Document document : in.documents) {
-if (document.indexingDenied()) {
+if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex()) {
 if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
 addURLtoErrorDB(
 in.queueEntry.url(),
@@ -2671,8 +2670,9 @@ public final class Switchboard extends serverSwitch {
 final DigestURI url = document.dc_source();
 final DigestURI referrerURL = queueEntry.referrerURL();
 EventOrigin processCase = queueEntry.processCase(this.peers.mySeed().hash);
-if ( condenser == null || document.indexingDenied() ) {
+CrawlProfile profile = queueEntry.profile();
+if (condenser == null || (document.indexingDenied() && profile.obeyHtmlRobotsNoindex())) {
 //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
 addURLtoErrorDB(
 url,
@@ -2684,7 +2684,7 @@ public final class Switchboard extends serverSwitch {
 return;
 }
-if ( !queueEntry.profile().indexText() && !queueEntry.profile().indexMedia() ) {
+if ( !profile.indexText() && !profile.indexMedia() ) {
 //if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase + ", profile name = " + queueEntry.profile().name());
 addURLtoErrorDB(
 url,
@@ -2695,7 +2695,7 @@ public final class Switchboard extends serverSwitch {
 "denied by profile rule, process case="
 + processCase
 + ", profile name = "
-+ queueEntry.profile().collectionName());
++ profile.collectionName());
 return;
 }
@@ -2993,7 +2993,8 @@ public final class Switchboard extends serverSwitch {
 final Document[] documents = response.parse();
 if (documents != null) {
 for (final Document document: documents) {
-if (document.indexingDenied()) {
+final CrawlProfile profile = crawler.getActive(ASCII.getBytes(request.profileHandle()));
+if (document.indexingDenied() && (profile == null || profile.obeyHtmlRobotsNoindex())) {
 throw new Parser.Failure("indexing is denied", url);
 }
 final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);

@@ -493,6 +493,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
 Map<DigestURI, ImageEntry> images = new HashMap<DigestURI, ImageEntry>();
 int c = 0;
 final Object parser = document.getParserObject();
+boolean containsCanonical = false;
 if (parser instanceof ContentScraper) {
 final ContentScraper html = (ContentScraper) parser;
 images = html.getImages();
@@ -715,7 +716,8 @@
 // canonical tag
 if (allAttr || contains(CollectionSchema.canonical_s)) {
 final DigestURI canonical = html.getCanonical();
-if (canonical != null) {
+if (canonical != null && !ASCII.String(canonical.hash()).equals(id)) {
+containsCanonical = true;
 inboundLinks.remove(canonical);
 outboundLinks.remove(canonical);
 add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
@@ -811,10 +813,11 @@
 if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
 // create a subgraph
-//if () {
+if (!containsCanonical) {
+// a document with canonical tag should not get a webgraph relation, because that belongs to the canonical document
 webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations);
 webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations);
-//}
+}
 // list all links
 doc.webgraphDocuments.addAll(subgraph.edges);
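
The net effect of the canonical changes above: a self-referencing canonical link no longer counts, and a document that declares a different canonical URL contributes no webgraph edges, since those belong to the canonical target. A hypothetical reduction of that rule to plain strings (hash values are made up for illustration):

```java
// Hypothetical reduction of the canonical/webgraph rule above.
public class CanonicalDemo {
    public static void main(String[] args) {
        String id = "hashOfDocA";            // this document's URL hash
        String canonicalHash = "hashOfDocB"; // hash of the <link rel="canonical"> target
        boolean containsCanonical = canonicalHash != null && !canonicalHash.equals(id);
        if (!containsCanonical) {
            System.out.println("add inbound/outbound webgraph edges for this document");
        } else {
            System.out.println("skip webgraph edges; they belong to the canonical document");
        }
    }
}
```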
