added an option to set 'obey nofollow' for links with rel="nofollow"

attribute in the <a> tag for each crawl. This introduces a lot of
changes because it extends the usage of the AnchorURL Object type which
now also has a different toString method than the underlying
DigestURL.toString. It is therefore not advised to use .toString at all
for urls; just use toNormalform(false) instead.
pull/1/head
Michael Peter Christen 11 years ago
parent bf1b6b93e7
commit 2de159719b

@ -550,6 +550,7 @@ crawlingFilter=.*
crawlingQ=true
followFrames=true
obeyHtmlRobotsNoindex=true
obeyHtmlRobotsNofollow=false
storeHTCache=true
storeTXCache=true

@ -62,8 +62,8 @@ public class BlacklistTest_p {
testurl = null;
}
if(testurl != null) {
prop.putHTML("url",testurl.toString());
prop.putHTML("testlist_url",testurl.toString());
prop.putHTML("url",testurl.toNormalform(false));
prop.putHTML("testlist_url",testurl.toNormalform(false));
boolean isblocked = false;
if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, testurl)) {

@ -298,9 +298,10 @@
is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
Following frames is NOT done by Gxxg1e, but we do by default to have a richer content. 'nofollow' in robots metadata can be overridden; this does not affect obeying of the robots.txt which is never ignored.
</span></span>
Accept URLs with query-part ('?'): <input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# />&nbsp;&nbsp;
Obey html-robots-noindex: <input type="checkbox" name="obeyHtmlRobotsNoindex" id="obeyHtmlRobotsNoindex" #(obeyHtmlRobotsNoindexChecked)#::checked="checked"#(/obeyHtmlRobotsNoindexChecked)# /><!--&nbsp;&nbsp;
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />&nbsp;&nbsp;-->
Accept URLs with query-part ('?'): <input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /><br/>
Obey html-robots-noindex: <input type="checkbox" name="obeyHtmlRobotsNoindex" id="obeyHtmlRobotsNoindex" #(obeyHtmlRobotsNoindexChecked)#::checked="checked"#(/obeyHtmlRobotsNoindexChecked)# /><br/>
Obey html-robots-nofollow: <input type="checkbox" name="obeyHtmlRobotsNofollow" id="obeyHtmlRobotsNofollow" #(obeyHtmlRobotsNofollowChecked)#::checked="checked"#(/obeyHtmlRobotsNofollowChecked)# /><!--<br/>
Follow Frames: <input type="checkbox" name="followFrames" id="followFrames" #(followFramesChecked)#::checked="checked"#(/followFramesChecked)# />-->
</dd>
<dt>Load Filter on URLs</dt>
<dd><span class="info" style="float:right"><img src="env/grafics/i16.gif" width="16" height="16" alt="info"/><span style="right:0px;">

@ -192,16 +192,15 @@ public class CrawlStartExpert {
}
// Accept URLs with query-part?
// Obey html-robots-noindex?
// Obey html-robots-noindex, nofollow?
if (post == null) {
prop.put("crawlingQChecked",
env.getConfigBool("crawlingQ", true) ? 1 : 0);
prop.put("obeyHtmlRobotsNoindexChecked",
env.getConfigBool("obeyHtmlRobotsNoindex", true) ? 1 : 0);
prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? 1 : 0);
prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? 1 : 0);
prop.put("obeyHtmlRobotsNofollowChecked", env.getConfigBool("obeyHtmlRobotsNofollow", true) ? 1 : 0);
} else {
prop.put("crawlingQChecked", post.getBoolean("crawlingQ") ? 1 : 0);
prop.put("obeyHtmlRobotsNoindexChecked",
post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
}
// Load Filter on URLs (range)

@ -311,6 +311,9 @@ public class Crawler_p {
boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "false"));
env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex);
boolean obeyHtmlRobotsNofollow = "on".equals(post.get("obeyHtmlRobotsNofollow", "false"));
env.setConfig("obeyHtmlRobotsNofollow", obeyHtmlRobotsNofollow);
final boolean indexText = "on".equals(post.get("indexText", "false"));
env.setConfig("indexText", indexText);
@ -444,7 +447,8 @@ public class Crawler_p {
directDocByURL,
crawlingIfOlder,
crawlingDomMaxPages,
crawlingQ, followFrames, obeyHtmlRobotsNoindex,
crawlingQ, followFrames,
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText,
indexMedia,
storeHTCache,

@ -101,6 +101,7 @@ public class QuickCrawlLink_p {
final boolean crawlingQ = post.get("crawlingQ", "").equals("on");
final boolean followFrames = post.get("followFrames", "").equals("on");
final boolean obeyHtmlRobotsNoindex = post.get("obeyHtmlRobotsNoindex", "").equals("on");
final boolean obeyHtmlRobotsNofollow = post.get("obeyHtmlRobotsNofollow", "").equals("on");
final boolean indexText = post.get("indexText", "off").equals("on");
final boolean indexMedia = post.get("indexMedia", "off").equals("on");
final boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@ -147,7 +148,8 @@ public class QuickCrawlLink_p {
true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
-1, // domMaxPages, if negative: no count restriction
crawlingQ, followFrames, obeyHtmlRobotsNoindex,
crawlingQ, followFrames,
obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
indexText, indexMedia,
storeHTCache, remoteIndexing,
CacheStrategy.IFFRESH,

@ -243,7 +243,7 @@ public class ViewFile {
prop.put("viewMode_publisher", document.dc_publisher());
prop.put("viewMode_format", document.dc_format());
prop.put("viewMode_identifier", document.dc_identifier());
prop.put("viewMode_source", url.toString());
prop.put("viewMode_source", url.toNormalform(false));
prop.put("viewMode_lat", document.lat());
prop.put("viewMode_lon", document.lon());
prop.put("viewMode_parsedText", markup(wordArray, content).replaceAll("\n", "<br />").replaceAll("\t", "&nbsp;&nbsp;&nbsp;&nbsp;"));

@ -168,10 +168,8 @@ public class getpageinfo {
}
if (actions.indexOf("oai",0) >= 0) {
try {
final DigestURL theURL = new DigestURL(url
+ "?verb=Identify");
final String oairesult = checkOAI(theURL.toString());
final DigestURL theURL = new DigestURL(url + "?verb=Identify");
final String oairesult = checkOAI(theURL.toNormalform(false));
prop.put("oai", oairesult == "" ? 0 : 1);

@ -173,7 +173,7 @@ public class getpageinfo_p {
final DigestURL theURL = new DigestURL(url
+ "?verb=Identify");
final String oairesult = checkOAI(theURL.toString());
final String oairesult = checkOAI(theURL.toNormalform(false));
prop.put("oai", oairesult == "" ? 0 : 1);

@ -127,4 +127,16 @@ public class AnchorURL extends DigestURL {
return tagopts;
}
/**
 * Tells whether the {@code rel} property of this anchor mentions "nofollow".
 * The check is a plain substring search on the rel property.
 *
 * @return {@code true} if the rel property contains the text "nofollow"
 */
public boolean attachedNofollow() {
    final String rel = this.relProperty;
    return rel.contains("nofollow");
}
/**
 * Renders this anchor as an HTML {@code <a>} element. The href is the
 * normal form of the URL; the {@code name} and {@code rel} attributes are
 * emitted only when the corresponding property is non-empty, and the text
 * property becomes the element content. Values are inserted verbatim, no
 * escaping is applied here.
 *
 * @return the anchor written as an {@code <a href="...">text</a>} string
 */
@Override
public String toString() {
    final StringBuilder tag = new StringBuilder();
    tag.append("<a href=\"").append(this.toNormalform(false)).append("\"");
    if (this.nameProperty.length() > 0) {
        tag.append(" name=\"").append(this.nameProperty).append("\"");
    }
    if (this.relProperty.length() > 0) {
        tag.append(" rel=\"").append(this.relProperty).append("\"");
    }
    tag.append(">").append(this.textProperty).append("</a>");
    return tag.toString();
}
}

@ -857,6 +857,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
/**
 * Returns the same string as {@code toNormalform(false)}.
 * <p>
 * This method shall not be used, to avoid confusion with AnchorURL.toString;
 * when assertions are enabled every call fails with an AssertionError.
 * Call {@code toNormalform(false)} directly instead.
 *
 * @return the result of {@code toNormalform(false)}
 */
@Override
public String toString() {
    // the detail message names the replacement so a tripped assertion is self-explanatory
    assert false : "toString() shall not be used to avoid confusion with AnchorURL.toString; use toNormalform(false) instead";
    return toNormalform(false);
}
@ -2006,9 +2007,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isSMB()) try {
return TimeoutRequest.exists(getSmbFile(), SMB_TIMEOUT);
} catch (final SmbException e) {
throw new IOException("SMB.exists SmbException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.exists SmbException (" + e.getMessage() + ") for " + toNormalform(false));
} catch (final MalformedURLException e) {
throw new IOException("SMB.exists MalformedURLException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.exists MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false));
}
return false;
}
@ -2018,9 +2019,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isSMB()) try {
return TimeoutRequest.canRead(getSmbFile(), SMB_TIMEOUT);
} catch (final SmbException e) {
throw new IOException("SMB.canRead SmbException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.canRead SmbException (" + e.getMessage() + ") for " + toNormalform(false));
} catch (final MalformedURLException e) {
throw new IOException("SMB.canRead MalformedURLException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.canRead MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false));
}
return false;
}
@ -2030,9 +2031,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isSMB()) try {
return TimeoutRequest.canWrite(getSmbFile(), SMB_TIMEOUT);
} catch (final SmbException e) {
throw new IOException("SMB.canWrite SmbException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.canWrite SmbException (" + e.getMessage() + ") for " + toNormalform(false));
} catch (final MalformedURLException e) {
throw new IOException("SMB.canWrite MalformedURLException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.canWrite MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false));
}
return false;
}
@ -2042,9 +2043,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isSMB()) try {
return TimeoutRequest.isHidden(getSmbFile(), SMB_TIMEOUT);
} catch (final SmbException e) {
throw new IOException("SMB.isHidden SmbException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.isHidden SmbException (" + e.getMessage() + ") for " + toNormalform(false));
} catch (final MalformedURLException e) {
throw new IOException("SMB.isHidden MalformedURLException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.isHidden MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false));
}
return false;
}
@ -2054,9 +2055,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isSMB()) try {
return TimeoutRequest.isDirectory(getSmbFile(), SMB_TIMEOUT);
} catch (final SmbException e) {
throw new IOException("SMB.isDirectory SmbException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.isDirectory SmbException (" + e.getMessage() + ") for " + toNormalform(false));
} catch (final MalformedURLException e) {
throw new IOException("SMB.isDirectory MalformedURLException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.isDirectory MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false));
}
return false;
}
@ -2082,9 +2083,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isSMB()) try {
return TimeoutRequest.lastModified(getSmbFile(), SMB_TIMEOUT);
} catch (final SmbException e) {
throw new IOException("SMB.lastModified SmbException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.lastModified SmbException (" + e.getMessage() + ") for " + toNormalform(false));
} catch (final MalformedURLException e) {
throw new IOException("SMB.lastModified MalformedURLException (" + e.getMessage() + ") for " + toString());
throw new IOException("SMB.lastModified MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false));
}
return 0;
}
@ -2094,7 +2095,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
if (isSMB()) try {
return getSmbFile().getName();
} catch (final MalformedURLException e) {
throw new IOException("SMB.getName MalformedURLException (" + e.getMessage() + ") for " + toString() );
throw new IOException("SMB.getName MalformedURLException (" + e.getMessage() + ") for " + toNormalform(false) );
}
if (isFTP()) {
return this.getFileName();
@ -2113,7 +2114,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
throw new IOException("SMB.list SmbException for " + sf.toString() + ": " + e.getMessage());
}
} catch (final MalformedURLException e) {
throw new IOException("SMB.list MalformedURLException for " + toString() + ": " + e.getMessage());
throw new IOException("SMB.list MalformedURLException for " + toNormalform(false) + ": " + e.getMessage());
}
return null;
}
@ -2262,10 +2263,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
// check equality to java.net.URL
if (((aURL == null) && (jURL != null)) ||
((aURL != null) && (jURL == null)) ||
((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toString()))))) {
((aURL != null) && (jURL != null) && (!(jURL.toString().equals(aURL.toNormalform(false)))))) {
System.out.println("Difference for environment=" + environment + ", url=" + url + ":");
System.out.println((jURL == null) ? "jURL rejected input" : "jURL=" + jURL.toString());
System.out.println((aURL == null) ? "aURL rejected input" : "aURL=" + aURL.toString());
System.out.println((aURL == null) ? "aURL rejected input" : "aURL=" + aURL.toNormalform(false));
}
// check stability: the normalform of the normalform must be equal to the normalform
@ -2273,12 +2274,12 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
aURL1 = new MultiProtocolURL(aURL.toNormalform(false));
if (!(aURL1.toNormalform(false).equals(aURL.toNormalform(false)))) {
System.out.println("no stability for url:");
System.out.println("aURL0=" + aURL.toString());
System.out.println("aURL1=" + aURL1.toString());
System.out.println("aURL0=" + aURL.toNormalform(false));
System.out.println("aURL1=" + aURL1.toNormalform(false));
}
} catch (final MalformedURLException e) {
System.out.println("no stability for url:");
System.out.println("aURL0=" + aURL.toString());
System.out.println("aURL0=" + aURL.toNormalform(false));
System.out.println("aURL1 cannot be computed:" + e.getMessage());
}
}

@ -159,7 +159,7 @@ public class HTMLResponseWriter implements QueryResponseWriter {
// add a link to re-crawl this url (in case it is a remote metadata only entry)
String sku = tdoc.get(CollectionSchema.sku.getSolrFieldName());
final String jsc= "javascript:w = window.open('/QuickCrawlLink_p.html?indexText=on&indexMedia=on&crawlingQ=on&followFrames=on&obeyHtmlRobotsNoindex=on&xdstopw=on&title='+escape('"+title+"')+'&url='+escape('"+sku+"'),'_blank','height=250,width=600,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();";
final String jsc= "javascript:w = window.open('/QuickCrawlLink_p.html?indexText=on&indexMedia=on&crawlingQ=on&followFrames=on&obeyHtmlRobotsNoindex=on&obeyHtmlRobotsNofollow=off&xdstopw=on&title='+escape('"+title+"')+'&url='+escape('"+sku+"'),'_blank','height=250,width=600,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();";
writer.write("<div class='btn btn-default btn-sm' style='float:right' onclick=\""+jsc+"\">re-crawl url</div>\n");
writer.write("<h1 property=\"dc:Title\">" + title + "</h1>\n");

@ -395,7 +395,7 @@ public final class CrawlStacker {
return null; // no evidence that we know that url
}
final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue();
final String urlstring = url.toString();
final String urlstring = url.toNormalform(false);
if (recrawl) {
if (CrawlStacker.log.isInfo())
CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " +
@ -409,7 +409,7 @@ public final class CrawlStacker {
if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
final AtomicInteger dp = profile.getCount(url.getHost());
if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toNormalform(false) + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "crawl stack domain counter exceeded (test by profile)";
}
@ -435,7 +435,7 @@ public final class CrawlStacker {
// check if the protocol is supported
final String urlProtocol = url.getProtocol();
final String urlstring = url.toString();
final String urlstring = url.toNormalform(true);
if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) {
CrawlStacker.log.severe("Unsupported protocol in URL '" + urlstring + "'.");
return "unsupported protocol";

@ -288,7 +288,7 @@ public final class CrawlSwitchboard {
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
false, true, true,
false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true),
sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
true,
@ -317,7 +317,7 @@ public final class CrawlSwitchboard {
false,
-1,
-1,
true, true, true,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true,
true,
false,
@ -346,7 +346,7 @@ public final class CrawlSwitchboard {
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
-1,
true, true, true,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
false,
false,
true,
@ -375,7 +375,7 @@ public final class CrawlSwitchboard {
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
-1,
true, true, true,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true,
true,
true,
@ -405,7 +405,7 @@ public final class CrawlSwitchboard {
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
-1,
true, true, true,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
false,
false,
true,
@ -434,7 +434,7 @@ public final class CrawlSwitchboard {
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
-1,
true, true, true,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
false,
false,
true,
@ -463,7 +463,7 @@ public final class CrawlSwitchboard {
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
-1,
true, true, true,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
false,
true,
true,
@ -492,7 +492,7 @@ public final class CrawlSwitchboard {
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
-1,
true, true, false,
true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true,
false,
false,
@ -524,7 +524,7 @@ public final class CrawlSwitchboard {
false,
System.currentTimeMillis(),
-1,
true, true, false,
true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true,
true,
false,

@ -201,9 +201,9 @@ public final class Cache {
public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException {
if (maxCacheSize == 0) return;
if (responseHeader == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: responseHeader == null");
if (file == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: file == null");
log.info("storing content of url " + url.toString() + ", " + file.length + " bytes");
if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null");
if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null");
log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes");
// store the file
try {

@ -69,6 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String CRAWLING_Q = "crawlingQ";
public static final String FOLLOW_FRAMES = "followFrames";
public static final String OBEY_HTML_ROBOTS_NOINDEX = "obeyHtmlRobotsNoindex";
public static final String OBEY_HTML_ROBOTS_NOFOLLOW = "obeyHtmlRobotsNofollow";
public static final String INDEX_TEXT = "indexText";
public static final String INDEX_MEDIA = "indexMedia";
public static final String STORE_HTCACHE = "storeHTCache";
@ -135,7 +136,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean directDocByURL,
final long recrawlIfOlder /*date*/,
final int domMaxPages,
final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex,
final boolean crawlingQ, final boolean followFrames,
final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow,
final boolean indexText,
final boolean indexMedia,
final boolean storeHTCache,
@ -170,6 +172,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames
put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
put(OBEY_HTML_ROBOTS_NOFOLLOW, obeyHtmlRobotsNofollow);
put(INDEX_TEXT, indexText);
put(INDEX_MEDIA, indexMedia);
put(STORE_HTCACHE, storeHTCache);
@ -534,6 +537,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString()));
}
/**
 * Reads the {@code OBEY_HTML_ROBOTS_NOFOLLOW} entry of this profile.
 *
 * @return {@code true} only if the stored value equals
 *         {@code Boolean.TRUE.toString()}; {@code false} if the entry is
 *         missing or holds any other value
 */
public boolean obeyHtmlRobotsNofollow() {
    final String stored = get(OBEY_HTML_ROBOTS_NOFOLLOW);
    // a missing entry counts as "do not obey nofollow"
    return stored != null && stored.equals(Boolean.TRUE.toString());
}
public boolean indexText() {
final String r = get(INDEX_TEXT);
if (r == null) return true;

@ -356,7 +356,7 @@ public class CrawlQueues {
}
}
} else {
CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toString());
CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toNormalform(false));
}
} else {
if (CrawlQueues.log.isFine()) CrawlQueues.log.fine(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
@ -627,7 +627,7 @@ public class CrawlQueues {
while ((request = CrawlQueues.this.workerQueue.poll(10, TimeUnit.SECONDS)) != POISON_REQUEST) {
if (request == null) break; // we run this only for a specific time and then let the process die to clear up resources
request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
this.setName("CrawlQueues.Loader(" + request.url() + ")");
this.setName("CrawlQueues.Loader(" + request.url().toNormalform(false) + ")");
CrawlProfile profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
try {
// checking robots.txt for http(s) resources

@ -149,7 +149,7 @@ public final class HTTPLoader {
// restart crawling with new url
this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl);
this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));
this.sb.webStructure.generateCitationReference(url, redirectionUrl);

@ -225,7 +225,7 @@ public class Request extends WorkflowJob
new byte[][] {
this.url.hash(),
this.initiator,
UTF8.getBytes(this.url.toString()),
UTF8.getBytes(this.url.toNormalform(false)),
this.refhash,
namebytes,
appdatestr,

@ -98,7 +98,7 @@ public class SitemapImporter extends Thread {
this.sb.crawlStacker.enqueueEntry(new Request(
ASCII.getBytes(this.sb.peers.mySeed().hash),
url,
null, // this.siteMapURL.toString(),
null, // this.siteMapURL.toNormalform(false),
entry.url(),
entry.lastmod(new Date()),
this.crawlingProfile.handle(),

@ -149,7 +149,7 @@ public class BookmarkHelper {
title = url.getNameProperty();
ConcurrentLog.info("BOOKMARKS", "links.get(url)");
if ("".equals(title)) {//cannot be displayed
title = url.toString();
title = url.toNormalform(false);
}
bm = db.new Bookmark(url);
bm.setProperty(Bookmark.BOOKMARK_TITLE, title);

@ -184,7 +184,8 @@ public class YMarkCrawlStart extends HashMap<String,String>{
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
-1,
crawlingQ,
true, true, true, true, true, false,
true, true, true, false,
true, true, false,
CacheStrategy.IFFRESH,
"robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard

@ -818,7 +818,7 @@ dc_rights
final List<String> descriptions = new ArrayList<String>();
final Collection<String> titles = new LinkedHashSet<String>();
final Collection<String> sectionTitles = new LinkedHashSet<String>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
final Set<String> languages = new HashSet<String>();
@ -913,16 +913,22 @@ dc_rights
public final static String CANONICAL_MARKER = "canonical";
public static Map<DigestURL, String> getHyperlinks(final Document[] documents) {
final Map<DigestURL, String> result = new HashMap<DigestURL, String>();
public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
final Map<AnchorURL, String> result = new HashMap<>();
for (final Document d: documents) {
result.putAll(d.getHyperlinks());
if (includeNofollow) {
result.putAll(d.getHyperlinks());
} else {
for (Map.Entry<AnchorURL, String> entry: d.getHyperlinks().entrySet()) {
if (!entry.getKey().attachedNofollow()) result.put(entry.getKey(), entry.getValue());
}
}
final Object parser = d.getParserObject();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) try {result.put(new DigestURL(refresh), "refresh");} catch (final MalformedURLException e) {}
DigestURL canonical = html.getCanonical();
if (refresh != null && refresh.length() > 0) try {result.put(new AnchorURL(refresh), "refresh");} catch (final MalformedURLException e) {}
AnchorURL canonical = html.getCanonical();
if (canonical != null) {
result.put(canonical, CANONICAL_MARKER);
}

@ -183,7 +183,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private double lon, lat;
private DigestURL canonical, publisher;
private AnchorURL canonical, publisher;
private final int maxLinks;
private int breadcrumbs;
@ -771,7 +771,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return this.script;
}
public DigestURL getCanonical() {
/**
 * Returns the current value of this scraper's {@code canonical} field.
 *
 * @return the stored canonical URL; callers should be prepared for
 *         {@code null} (NOTE(review): presumably unset when the page
 *         declared no canonical link; confirm against where the field is assigned)
 */
public AnchorURL getCanonical() {
return this.canonical;
}

@ -133,7 +133,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry
final int ohc = (h).hashCode();
if (thc < ohc) return -1;
if (thc > ohc) return 1;
return this.imageurl.toString().compareTo((h).imageurl.toString());
return this.imageurl.toNormalform(true).compareTo((h).imageurl.toNormalform(true));
}
@Override

@ -55,7 +55,7 @@ public class RDFaParser extends AbstractParser implements Parser {
// TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
if (url.toString().contains(".yacy") || url.toString().contains("experiments")) {
if (url.toNormalform(true).contains(".yacy") || url.toNormalform(true).contains("experiments")) {
// if (true == false) {
Document rdfaDoc = parseRDFa(url, mimeType, charset, source);
Document[] retDocs = new Document[htmlDocs.length + 1];

@ -116,7 +116,7 @@ public class sitemapParser extends AbstractParser implements Parser {
final HTTPClient client = new HTTPClient(agent);
client.setHeader(requestHeader.entrySet());
try {
client.GET(sitemapURL.toString(), false);
client.GET(sitemapURL.toNormalform(false), false);
if (client.getStatusCode() != 200) {
throw new IOException("Unable to download the sitemap file " + sitemapURL +
"\nServer returned status: " + client.getHttpResponse().getStatusLine());

@ -179,7 +179,7 @@ public class vcfParser extends AbstractParser implements Parser {
} else if (key.toUpperCase().startsWith("URL")) {
try {
final AnchorURL newURL = new AnchorURL(value);
newURL.setNameProperty(newURL.toString());
newURL.setNameProperty(newURL.toNormalform(false));
anchors.add(newURL);
//parsedData.put(key,value);
} catch (final MalformedURLException ex) {/* ignore this */}

@ -396,7 +396,7 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<DigestURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
if (response == null) throw new IOException("response == null");
final ResponseHeader responseHeader = response.getResponseHeader();
@ -413,7 +413,7 @@ public final class LoaderDispatcher {
throw new IOException("parser error: " + e.getMessage());
}
return Document.getHyperlinks(documents);
return Document.getHyperlinks(documents, true);
}
public synchronized static void cleanupAccessTimeTable(final long timeout) {

@ -2584,26 +2584,27 @@ public final class Switchboard extends serverSwitch {
for (Document d: documents) d.setDepth(response.depth());
// get the hyperlinks
final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
if (response.profile().indexMedia()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
}
// add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
if (response.profile().directDocByURL()) {
for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
}
hl.putAll(Document.getApplinks(documents));
hl.putAll(Document.getVideolinks(documents));
hl.putAll(Document.getAudiolinks(documents));
for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
}
// insert those hyperlinks to the crawler
MultiProtocolURL nextUrl;
for ( final Map.Entry<DigestURL, String> nextEntry : hl.entrySet() ) {
for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
// check for interruption
checkInterruption();
@ -2880,7 +2881,7 @@ public final class Switchboard extends serverSwitch {
public final void addAllToIndex(
final DigestURL url,
final Map<DigestURL, String> links,
final Map<AnchorURL, String> links,
final SearchEvent searchEvent,
final String heuristicName,
final Map<String, Pattern> collections,
@ -2893,15 +2894,15 @@ public final class Switchboard extends serverSwitch {
}
// check if some of the links match with the query
final Map<DigestURL, String> matcher = searchEvent.query.separateMatches(links);
final Map<AnchorURL, String> matcher = searchEvent.query.separateMatches(links);
// take the matcher and load them all
for (final Map.Entry<DigestURL, String> entry : matcher.entrySet()) {
for (final Map.Entry<AnchorURL, String> entry : matcher.entrySet()) {
urls.add(new DigestURL(entry.getKey(), (byte[]) null));
}
// then take the non-matching links and load them as well
for (final Map.Entry<DigestURL, String> entry : links.entrySet()) {
for (final Map.Entry<AnchorURL, String> entry : links.entrySet()) {
urls.add(new DigestURL(entry.getKey(), (byte[]) null));
}
addToIndex(urls, searchEvent, heuristicName, collections, doublecheck);
@ -3479,12 +3480,12 @@ public final class Switchboard extends serverSwitch {
return;
}
final Map<DigestURL, String> links;
final Map<AnchorURL, String> links;
searchEvent.oneFeederStarted();
try {
links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
if ( links != null ) {
final Iterator<DigestURL> i = links.keySet().iterator();
final Iterator<AnchorURL> i = links.keySet().iterator();
while ( i.hasNext() ) {
if ( !i.next().getHost().endsWith(host) ) {
i.remove();
@ -3518,13 +3519,13 @@ public final class Switchboard extends serverSwitch {
return;
}
final Map<DigestURL, String> links;
final Map<AnchorURL, String> links;
DigestURL url;
try {
links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
if (links != null) {
if (links.size() < 1000) { // limit to 1000 to skip large index pages
final Iterator<DigestURL> i = links.keySet().iterator();
final Iterator<AnchorURL> i = links.keySet().iterator();
final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
Collection<DigestURL> urls = new ArrayList<DigestURL>();
while (i.hasNext()) {
@ -3590,11 +3591,11 @@ public final class Switchboard extends serverSwitch {
//System.out.println("BLEKKO: " + UTF8.String(resource));
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
if ( rss != null ) {
final Map<DigestURL, String> links = new TreeMap<DigestURL, String>();
DigestURL uri;
final Map<AnchorURL, String> links = new TreeMap<>();
AnchorURL uri;
for ( final RSSMessage message : rss.getFeed() ) {
try {
uri = new DigestURL(message.getLink());
uri = new AnchorURL(message.getLink());
links.put(uri, message.getTitle());
} catch (final MalformedURLException e ) {
}
@ -3720,7 +3721,7 @@ public final class Switchboard extends serverSwitch {
final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout);
client.setHeader(reqHeader.entrySet());
client.HEADResponse(url.toString(), false);
client.HEADResponse(url.toNormalform(false), false);
int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
ResponseHeader header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
if (checkAge) {

@ -151,7 +151,7 @@ public class DocumentIndex extends Segment {
try {
documents = TextParser.parseSource(url, null, null, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
} catch (final Exception e ) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
}
//Document document = Document.mergeDocuments(url, null, documents);
final SolrInputDocument[] rows = new SolrInputDocument[documents.length];

@ -592,7 +592,7 @@ public class Segment {
int outlinksSame = document.inboundLinks().size();
int outlinksOther = document.outboundLinks().size();
final int urlLength = urlNormalform.length();
final int urlComps = MultiProtocolURL.urlComps(url.toString()).length;
final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
// create a word prototype which is re-used for all entries
if ((this.termIndex != null && storeToRWI) || searchEvent != null) {

@ -35,10 +35,11 @@ import java.util.Set;
import java.util.SortedSet;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.geo.GeoLocation;
@ -60,6 +61,7 @@ import net.yacy.search.index.Segment;
import net.yacy.search.ranking.RankingProfile;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrQuery.SortClause;
import org.apache.solr.common.params.CommonParams;
@ -522,11 +524,11 @@ public final class QueryParams {
return this.queryGoal;
}
public final Map<DigestURL, String> separateMatches(final Map<DigestURL, String> links) {
final Map<DigestURL, String> matcher = new HashMap<DigestURL, String>();
final Iterator <Map.Entry<DigestURL, String>> i = links.entrySet().iterator();
Map.Entry<DigestURL, String> entry;
DigestURL url;
public final Map<AnchorURL, String> separateMatches(final Map<AnchorURL, String> links) {
final Map<AnchorURL, String> matcher = new HashMap<>();
final Iterator <Map.Entry<AnchorURL, String>> i = links.entrySet().iterator();
Map.Entry<AnchorURL, String> entry;
AnchorURL url;
String anchorText;
while (i.hasNext()) {
entry = i.next();

@ -1531,7 +1531,7 @@ public final class SearchEvent {
}
/**
 * Returns the normal form of this entry's image URL.
 * <p>
 * The URL is rendered with {@code toNormalform(false)} instead of
 * {@code toString()}: {@code AnchorURL} has a {@code toString()} that differs
 * from the one of the underlying {@code DigestURL}, so {@code toString()} is
 * not a reliable way to obtain the plain URL string.
 *
 * @return the image URL in normal form
 */
@Override
public String toString() {
    return this.imageUrl.toNormalform(false);
}
}

@ -1199,10 +1199,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
proccount.incrementAndGet();
allcount.incrementAndGet();
// Every 1000 processed documents: publish a progress message (throughput and
// estimated remaining minutes) in postprocessingActivity and write it to the log.
if (proccount.get() % 1000 == 0) {
    // "; " separates the host name from the counters; without it the host
    // name ran directly into the word "postprocessed".
    // 1000L forces long arithmetic so the product cannot overflow if the
    // counter is int-valued.
    postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + "; postprocessed " + proccount + " from " + count + " documents; " +
        (proccount.get() * 1000L / (System.currentTimeMillis() - start)) + " docs/second; " +
        ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining";
    ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
}
}
} catch (InterruptedException e) {
ConcurrentLog.warn("CollectionConfiguration", e.getMessage(), e);
@ -1301,10 +1303,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
collectionConnector.add(sid);
proccount++; allcount.incrementAndGet();
if (proccount % 100 == 0) ConcurrentLog.info(
"CollectionConfiguration", "collection - postprocessed " + proccount + " from " + count + " documents; " +
if (proccount % 100 == 0) {
postprocessingActivity = "postprocessed " + proccount + " from " + count + " collection documents; " +
(proccount * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining");
((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining";
ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
}
} catch (final Throwable e1) {
ConcurrentLog.logException(e1);
failids.add(i);

@ -202,7 +202,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
while (i.hasNext()) {
ientry = i.next();
url = ientry.url();
final String u = url.toString();
final String u = url.toNormalform(false);
if (isUrlBlacklisted(BlacklistType.SEARCH, url)) continue;
if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;
if (ientry.height() > 0 && ientry.height() < 32) continue;
@ -262,7 +262,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaS
if (isBlacklisted) {
Switchboard.getSwitchboard().crawlQueues.errorURL.push(url, 996, null, FailCategory.FINAL_LOAD_CONTEXT, "url in blacklist", -1);
ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toString() + "'. URL is in blacklist.");
ConcurrentLog.fine("snippet fetch", "MEDIA-SNIPPET Rejecting URL '" + url.toNormalform(false) + "'. URL is in blacklist.");
}
return isBlacklisted;

@ -29,7 +29,6 @@ package net.yacy.search.snippet;
import java.io.IOException;
import java.util.Comparator;
import java.util.Date;
import java.util.List;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;

@ -585,7 +585,7 @@ public final class HTTPDProxyHandler {
}
} else {
// no caching
if (log.isFine()) log.fine(reqID +" "+ url.toString() + " not cached." +
if (log.isFine()) log.fine(reqID +" "+ url.toNormalform(false) + " not cached." +
" StoreError=" + ((storeError==null)?"None":storeError) +
" StoreHTCache=" + storeHTCache +
" SupportError=" + supportError);
@ -663,7 +663,7 @@ public final class HTTPDProxyHandler {
if (requestHeader.containsKey(RequestHeader.IF_MODIFIED_SINCE)) {
// conditional request: freshness of cache for that condition was already
// checked within shallUseCache(). Now send only a 304 response
log.info("CACHE HIT/304 " + url.toString());
log.info("CACHE HIT/304 " + url.toNormalform(false));
conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_REFRESH_HIT");
// setting the content length header to 0
@ -674,7 +674,7 @@ public final class HTTPDProxyHandler {
//respondHeader(respond, "304 OK", cachedResponseHeader); // respond with 'not modified'
} else {
// unconditional request: send content of cache
log.info("CACHE HIT/203 " + url.toString());
log.info("CACHE HIT/203 " + url.toNormalform(false));
conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_CODE,"TCP_HIT");
// setting the content header to the proper length

@ -34,7 +34,7 @@ public class TextSnippetTest {
doc = new SolrDocument();
DigestURL url = new DigestURL("http://localhost/page.html");
doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
doc.addField(CollectionSchema.sku.name(),url.toString());
doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
// for testcases add other fields
// fields involved in snippet extraction:
// url, title, keywords, author, text_t

Loading…
Cancel
Save