From 2de159719b4612dde3e86c96bf66ccd52093b6e6 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Fri, 18 Jul 2014 12:43:01 +0200
Subject: [PATCH] added an option to set 'obey nofollow' for links with
 rel="nofollow" attribute in the <a> tag for each crawl. This introduces a
 lot of changes because it extends the usage of the AnchorURL object type,
 which now also has a different toString method than the underlying
 DigestURL.toString. It is therefore not advised to use .toString on URLs
 at all; just use toNormalform(false) instead.
---
 defaults/yacy.init                             |  1 +
 htroot/BlacklistTest_p.java                    |  4 +-
 htroot/CrawlStartExpert.html                   |  7 ++--
 htroot/CrawlStartExpert.java                   | 13 +++----
 htroot/Crawler_p.java                          |  6 ++-
 htroot/QuickCrawlLink_p.java                   |  4 +-
 htroot/ViewFile.java                           |  2 +-
 htroot/api/getpageinfo.java                    |  6 +--
 htroot/api/getpageinfo_p.java                  |  2 +-
 .../net/yacy/cora/document/id/AnchorURL.java   | 12 ++++++
 .../cora/document/id/MultiProtocolURL.java     | 39 ++++++++++---------
 .../responsewriter/HTMLResponseWriter.java     |  2 +-
 source/net/yacy/crawler/CrawlStacker.java      |  6 +--
 source/net/yacy/crawler/CrawlSwitchboard.java  | 18 ++++-----
 source/net/yacy/crawler/data/Cache.java        |  6 +--
 .../net/yacy/crawler/data/CrawlProfile.java    | 11 +++++-
 source/net/yacy/crawler/data/CrawlQueues.java  |  4 +-
 .../yacy/crawler/retrieval/HTTPLoader.java     |  2 +-
 .../net/yacy/crawler/retrieval/Request.java    |  2 +-
 .../crawler/retrieval/SitemapImporter.java     |  2 +-
 source/net/yacy/data/BookmarkHelper.java       |  2 +-
 .../net/yacy/data/ymark/YMarkCrawlStart.java   |  3 +-
 source/net/yacy/document/Document.java         | 18 ++++++---
 .../document/parser/html/ContentScraper.java   |  4 +-
 .../yacy/document/parser/html/ImageEntry.java  |  2 +-
 .../document/parser/rdfa/impl/RDFaParser.java  |  2 +-
 .../yacy/document/parser/sitemapParser.java    |  2 +-
 .../net/yacy/document/parser/vcfParser.java    |  2 +-
 .../net/yacy/repository/LoaderDispatcher.java  |  4 +-
 source/net/yacy/search/Switchboard.java        | 39 ++++++++++---------
 .../net/yacy/search/index/DocumentIndex.java   |  2 +-
 source/net/yacy/search/index/Segment.java      |  2 +-
 source/net/yacy/search/query/QueryParams.java  | 14 ++++---
 source/net/yacy/search/query/SearchEvent.java  |  2 +-
 .../schema/CollectionConfiguration.java        | 16 +++++---
 .../net/yacy/search/snippet/MediaSnippet.java  |  4 +-
 .../net/yacy/search/snippet/ResultEntry.java   |  1 -
 .../yacy/server/http/HTTPDProxyHandler.java    |  6 +--
 .../yacy/search/snippet/TextSnippetTest.java   |  2 +-
 39 files changed, 158 insertions(+), 118 deletions(-)

diff --git a/defaults/yacy.init b/defaults/yacy.init
index d79b2684c..f79fcfbc7 100644
--- a/defaults/yacy.init
+++ b/defaults/yacy.init
@@ -550,6 +550,7 @@ crawlingFilter=.*
 crawlingQ=true
 followFrames=true
 obeyHtmlRobotsNoindex=true
+obeyHtmlRobotsNofollow=false
 storeHTCache=true
 storeTXCache=true

diff --git a/htroot/BlacklistTest_p.java b/htroot/BlacklistTest_p.java
index 429a7de40..7eb96d272 100644
--- a/htroot/BlacklistTest_p.java
+++ b/htroot/BlacklistTest_p.java
@@ -62,8 +62,8 @@ public class BlacklistTest_p {
             testurl = null;
         }
         if(testurl != null) {
-            prop.putHTML("url",testurl.toString());
-            prop.putHTML("testlist_url",testurl.toString());
+            prop.putHTML("url",testurl.toNormalform(false));
+            prop.putHTML("testlist_url",testurl.toNormalform(false));
             boolean isblocked = false;

             if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, testurl)) {
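The BlacklistTest_p change above is the recurring pattern of this patch: anywhere a URL is rendered as text, toString() is replaced by toNormalform(false). A minimal sketch of why, assuming the patched YaCy classes on the classpath (the URL and printed output are illustrative):

    import net.yacy.cora.document.id.AnchorURL;

    public class ToStringVsNormalform {
        public static void main(final String[] args) throws Exception {
            final AnchorURL link = new AnchorURL("http://example.com/page.html");

            // toNormalform(false) yields the plain, normalized URL string --
            // safe for logs, map keys and stored records
            System.out.println(link.toNormalform(false));

            // after this patch, AnchorURL.toString() renders a whole <a> tag
            // (see the AnchorURL.java hunk below), so it must not be used
            // where a bare URL string is expected
            System.out.println(link);
        }
    }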
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index f38e18b78..227e978ad 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -298,9 +298,10 @@
             is accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops.
             Following frames is NOT done by Gxxg1e, but we do by default to have a richer content.
             'nofollow' in robots metadata can be overridden; this does not affect obeying of the robots.txt which is never ignored.
-        Accept URLs with query-part ('?'):
-        Obey html-robots-noindex:
+        Accept URLs with query-part ('?'):
+        Obey html-robots-noindex:
+        Obey html-robots-nofollow:
         Load Filter on URLs
         info
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java
index 587d1b810..b3f2a6640 100644
--- a/htroot/CrawlStartExpert.java
+++ b/htroot/CrawlStartExpert.java
@@ -192,16 +192,15 @@ public class CrawlStartExpert {
         }

         // Accept URLs with query-part?
-        // Obey html-robots-noindex?
+        // Obey html-robots-noindex, nofollow?
         if (post == null) {
-            prop.put("crawlingQChecked",
-                    env.getConfigBool("crawlingQ", true) ? 1 : 0);
-            prop.put("obeyHtmlRobotsNoindexChecked",
-                    env.getConfigBool("obeyHtmlRobotsNoindex", true) ? 1 : 0);
+            prop.put("crawlingQChecked", env.getConfigBool("crawlingQ", true) ? 1 : 0);
+            prop.put("obeyHtmlRobotsNoindexChecked", env.getConfigBool("obeyHtmlRobotsNoindex", true) ? 1 : 0);
+            prop.put("obeyHtmlRobotsNofollowChecked", env.getConfigBool("obeyHtmlRobotsNofollow", true) ? 1 : 0);
         } else {
             prop.put("crawlingQChecked", post.getBoolean("crawlingQ") ? 1 : 0);
-            prop.put("obeyHtmlRobotsNoindexChecked",
-                    post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
+            prop.put("obeyHtmlRobotsNoindexChecked", post.getBoolean("obeyHtmlRobotsNoindex") ? 1 : 0);
+            prop.put("obeyHtmlRobotsNofollowChecked", post.getBoolean("obeyHtmlRobotsNofollow") ? 1 : 0);
         }

         // Load Filter on URLs (range)
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 7e1ac3828..c83c4aba0 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -310,6 +310,9 @@ public class Crawler_p {

         boolean obeyHtmlRobotsNoindex = "on".equals(post.get("obeyHtmlRobotsNoindex", "false"));
         env.setConfig("obeyHtmlRobotsNoindex", obeyHtmlRobotsNoindex);
+
+        boolean obeyHtmlRobotsNofollow = "on".equals(post.get("obeyHtmlRobotsNofollow", "false"));
+        env.setConfig("obeyHtmlRobotsNofollow", obeyHtmlRobotsNofollow);

         final boolean indexText = "on".equals(post.get("indexText", "false"));
         env.setConfig("indexText", indexText);
@@ -444,7 +447,8 @@ public class Crawler_p {
                             directDocByURL,
                             crawlingIfOlder,
                             crawlingDomMaxPages,
-                            crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+                            crawlingQ, followFrames,
+                            obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                             indexText,
                             indexMedia,
                             storeHTCache,
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index d79613767..46fef42fc 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -101,6 +101,7 @@ public class QuickCrawlLink_p {
         final boolean crawlingQ = post.get("crawlingQ", "").equals("on");
         final boolean followFrames = post.get("followFrames", "").equals("on");
         final boolean obeyHtmlRobotsNoindex = post.get("obeyHtmlRobotsNoindex", "").equals("on");
+        final boolean obeyHtmlRobotsNofollow = post.get("obeyHtmlRobotsNofollow", "").equals("on");
         final boolean indexText = post.get("indexText", "off").equals("on");
         final boolean indexMedia = post.get("indexMedia", "off").equals("on");
         final boolean storeHTCache = post.get("storeHTCache", "").equals("on");
@@ -147,7 +148,8 @@ public class QuickCrawlLink_p {
                 true,
                 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
                 -1, // domMaxPages, if negative: no count restriction
-                crawlingQ, followFrames, obeyHtmlRobotsNoindex,
+                crawlingQ, followFrames,
+                obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 indexText, indexMedia,
                 storeHTCache, remoteIndexing,
                 CacheStrategy.IFFRESH,
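All three servlets read the new flag the same way; the idiom rests on HTML checkbox semantics (a checked box posts the value "on", an unchecked box posts nothing, so the fallback default decides). A condensed restatement with a plain map standing in for YaCy's serverObjects:

    import java.util.HashMap;
    import java.util.Map;

    public class CheckboxSemanticsDemo {
        public static void main(final String[] args) {
            final Map<String, String> post = new HashMap<>(); // stand-in for serverObjects
            // an unchecked box sends no parameter at all; a checked box sends "on"
            post.put("obeyHtmlRobotsNofollow", "on");

            // same pattern as Crawler_p: a missing key falls back to "false"
            final boolean obeyHtmlRobotsNofollow =
                    "on".equals(post.getOrDefault("obeyHtmlRobotsNofollow", "false"));
            System.out.println(obeyHtmlRobotsNofollow); // true
        }
    }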
prop.put("viewMode_identifier", document.dc_identifier()); - prop.put("viewMode_source", url.toString()); + prop.put("viewMode_source", url.toNormalform(false)); prop.put("viewMode_lat", document.lat()); prop.put("viewMode_lon", document.lon()); prop.put("viewMode_parsedText", markup(wordArray, content).replaceAll("\n", "
").replaceAll("\t", "    ")); diff --git a/htroot/api/getpageinfo.java b/htroot/api/getpageinfo.java index 3c1ec6ac5..bd5c9e7e7 100644 --- a/htroot/api/getpageinfo.java +++ b/htroot/api/getpageinfo.java @@ -168,10 +168,8 @@ public class getpageinfo { } if (actions.indexOf("oai",0) >= 0) { try { - final DigestURL theURL = new DigestURL(url - + "?verb=Identify"); - - final String oairesult = checkOAI(theURL.toString()); + final DigestURL theURL = new DigestURL(url + "?verb=Identify"); + final String oairesult = checkOAI(theURL.toNormalform(false)); prop.put("oai", oairesult == "" ? 0 : 1); diff --git a/htroot/api/getpageinfo_p.java b/htroot/api/getpageinfo_p.java index f280f87a0..1b7418ff3 100644 --- a/htroot/api/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -173,7 +173,7 @@ public class getpageinfo_p { final DigestURL theURL = new DigestURL(url + "?verb=Identify"); - final String oairesult = checkOAI(theURL.toString()); + final String oairesult = checkOAI(theURL.toNormalform(false)); prop.put("oai", oairesult == "" ? 0 : 1); diff --git a/source/net/yacy/cora/document/id/AnchorURL.java b/source/net/yacy/cora/document/id/AnchorURL.java index 21fb4dd3d..fc05853b7 100644 --- a/source/net/yacy/cora/document/id/AnchorURL.java +++ b/source/net/yacy/cora/document/id/AnchorURL.java @@ -127,4 +127,16 @@ public class AnchorURL extends DigestURL { return tagopts; } + public boolean attachedNofollow() { + return this.relProperty.indexOf("nofollow") >= 0; + } + + @Override + public String toString() { + return "
0 ? (" name=\"" + this.nameProperty + "\"") : "") + + (this.relProperty.length() > 0 ? (" rel=\"" + this.relProperty + "\"") : "") + + ">" + this.textProperty + ""; + } + } diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index ed3f45596..f4fe6126b 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -857,6 +857,7 @@ public class MultiProtocolURL implements Serializable, Comparablere-crawl url\n"); writer.write("

" + title + "

\n"); diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java index e0e6a30b8..c81a0dd07 100644 --- a/source/net/yacy/crawler/CrawlStacker.java +++ b/source/net/yacy/crawler/CrawlStacker.java @@ -395,7 +395,7 @@ public final class CrawlStacker { return null; // no evidence that we know that url } final boolean recrawl = profile.recrawlIfOlder() > oldDate.longValue(); - final String urlstring = url.toString(); + final String urlstring = url.toNormalform(false); if (recrawl) { if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("RE-CRAWL of URL '" + urlstring + "': this url was crawled " + @@ -409,7 +409,7 @@ public final class CrawlStacker { if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) { final AtomicInteger dp = profile.getCount(url.getHost()); if (dp != null && dp.get() >= maxAllowedPagesPerDomain) { - if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toString() + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); + if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toNormalform(false) + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); return "crawl stack domain counter exceeded (test by profile)"; } @@ -435,7 +435,7 @@ public final class CrawlStacker { // check if the protocol is supported final String urlProtocol = url.getProtocol(); - final String urlstring = url.toString(); + final String urlstring = url.toNormalform(true); if (!Switchboard.getSwitchboard().loader.isSupportedProtocol(urlProtocol)) { CrawlStacker.log.severe("Unsupported protocol in URL '" + urlstring + "'."); return "unsupported protocol"; diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index bfcb399e1..69bf338ea 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -288,7 +288,7 @@ public final class CrawlSwitchboard { true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, - false, true, true, + false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true), sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true), true, @@ -317,7 +317,7 @@ public final class CrawlSwitchboard { false, -1, -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, false, @@ -346,7 +346,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, false, false, true, @@ -375,7 +375,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, true, true, true, @@ -405,7 +405,7 @@ public final class CrawlSwitchboard { false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE), -1, - true, true, true, + true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow, false, false, true, @@ -434,7 +434,7 @@ public final class CrawlSwitchboard { false, 
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index bfcb399e1..69bf338ea 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -288,7 +288,7 @@ public final class CrawlSwitchboard {
                 true,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
                 -1,
-                false, true, true,
+                false, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_TEXT, true),
                 sb.getConfigBool(SwitchboardConstants.PROXY_INDEXING_LOCAL_MEDIA, true),
                 true,
@@ -317,7 +317,7 @@
                 false,
                 -1,
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 true,
                 false,
@@ -346,7 +346,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 false,
                 true,
@@ -375,7 +375,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 true,
                 true,
@@ -405,7 +405,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 false,
                 true,
@@ -434,7 +434,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 false,
                 true,
@@ -463,7 +463,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
                 -1,
-                true, true, true,
+                true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 false,
                 true,
                 true,
@@ -492,7 +492,7 @@
                 false,
                 CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
                 -1,
-                true, true, false,
+                true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 false,
                 false,
@@ -524,7 +524,7 @@
                 false,
                 System.currentTimeMillis(),
                 -1,
-                true, true, false,
+                true, true, false, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
                 true,
                 true,
                 false,
diff --git a/source/net/yacy/crawler/data/Cache.java b/source/net/yacy/crawler/data/Cache.java
index 9973f08a0..683b3b17d 100644
--- a/source/net/yacy/crawler/data/Cache.java
+++ b/source/net/yacy/crawler/data/Cache.java
@@ -201,9 +201,9 @@ public final class Cache {
     public static void store(final DigestURL url, final ResponseHeader responseHeader, final byte[] file) throws IOException {
         if (maxCacheSize == 0) return;
-        if (responseHeader == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: responseHeader == null");
-        if (file == null) throw new IOException("Cache.store of url " + url.toString() + " not possible: file == null");
-        log.info("storing content of url " + url.toString() + ", " + file.length + " bytes");
+        if (responseHeader == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: responseHeader == null");
+        if (file == null) throw new IOException("Cache.store of url " + url.toNormalform(false) + " not possible: file == null");
+        log.info("storing content of url " + url.toNormalform(false) + ", " + file.length + " bytes");

         // store the file
         try {
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 8af014fbe..98c979758 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -69,6 +69,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
     public static final String CRAWLING_Q = "crawlingQ";
     public static final String FOLLOW_FRAMES = "followFrames";
     public static final String OBEY_HTML_ROBOTS_NOINDEX = "obeyHtmlRobotsNoindex";
+    public static final String OBEY_HTML_ROBOTS_NOFOLLOW = "obeyHtmlRobotsNofollow";
     public static final String INDEX_TEXT = "indexText";
     public static final String INDEX_MEDIA = "indexMedia";
     public static final String STORE_HTCACHE = "storeHTCache";
@@ -135,7 +136,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
                  final boolean directDocByURL,
                  final long recrawlIfOlder /*date*/,
                  final int domMaxPages,
-                 final boolean crawlingQ, final boolean followFrames, final boolean obeyHtmlRobotsNoindex,
+                 final boolean crawlingQ, final boolean followFrames,
+                 final boolean obeyHtmlRobotsNoindex, final boolean obeyHtmlRobotsNofollow,
                  final boolean indexText,
                  final boolean indexMedia,
                  final boolean storeHTCache,
@@ -170,6 +172,7 @@
         put(CRAWLING_Q, crawlingQ); // crawling of urls with '?'
         put(FOLLOW_FRAMES, followFrames); // load pages contained in frames or ifames
         put(OBEY_HTML_ROBOTS_NOINDEX, obeyHtmlRobotsNoindex); // if false, then a meta robots tag containing 'noindex' is ignored
+        put(OBEY_HTML_ROBOTS_NOFOLLOW, obeyHtmlRobotsNofollow);
         put(INDEX_TEXT, indexText);
         put(INDEX_MEDIA, indexMedia);
         put(STORE_HTCACHE, storeHTCache);
@@ -534,6 +537,12 @@
         return (r.equals(Boolean.TRUE.toString()));
     }

+    public boolean obeyHtmlRobotsNofollow() {
+        final String r = get(OBEY_HTML_ROBOTS_NOFOLLOW);
+        if (r == null) return false;
+        return (r.equals(Boolean.TRUE.toString()));
+    }
+
     public boolean indexText() {
         final String r = get(INDEX_TEXT);
         if (r == null) return true;
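Because CrawlProfile extends ConcurrentHashMap<String, String>, the new flag is stored as the string "true"/"false" and parsed back by the getter, with a missing entry defaulting to false; the round trip in isolation (plain map instead of the real profile):

    import java.util.concurrent.ConcurrentHashMap;

    public class ProfileFlagDemo {
        static final String OBEY_HTML_ROBOTS_NOFOLLOW = "obeyHtmlRobotsNofollow";

        public static void main(final String[] args) {
            final ConcurrentHashMap<String, String> profile = new ConcurrentHashMap<>();
            profile.put(OBEY_HTML_ROBOTS_NOFOLLOW, Boolean.toString(true));

            // the getter added in this patch: a missing key defaults to false
            final String r = profile.get(OBEY_HTML_ROBOTS_NOFOLLOW);
            final boolean obey = r != null && r.equals(Boolean.TRUE.toString());
            System.out.println(obey); // true
        }
    }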
diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java
index 55c938502..803959c91 100644
--- a/source/net/yacy/crawler/data/CrawlQueues.java
+++ b/source/net/yacy/crawler/data/CrawlQueues.java
@@ -356,7 +356,7 @@ public class CrawlQueues {
                     }
                 }
             } else {
-                CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toString());
+                CrawlQueues.log.severe("Unsupported protocol in URL '" + url.toNormalform(false));
             }
         } else {
             if (CrawlQueues.log.isFine()) CrawlQueues.log.fine(stats + ": LOST PROFILE HANDLE '" + urlEntry.profileHandle() + "' for URL " + urlEntry.url());
@@ -627,7 +627,7 @@ public class CrawlQueues {
                 while ((request = CrawlQueues.this.workerQueue.poll(10, TimeUnit.SECONDS)) != POISON_REQUEST) {
                     if (request == null) break; // we run this only for a specific time and then let the process die to clear up resources
                     request.setStatus("worker-initialized", WorkflowJob.STATUS_INITIATED);
-                    this.setName("CrawlQueues.Loader(" + request.url() + ")");
+                    this.setName("CrawlQueues.Loader(" + request.url().toNormalform(false) + ")");
                     CrawlProfile profile = CrawlQueues.this.sb.crawler.get(UTF8.getBytes(request.profileHandle()));
                     try {
                         // checking robots.txt for http(s) resources
diff --git a/source/net/yacy/crawler/retrieval/HTTPLoader.java b/source/net/yacy/crawler/retrieval/HTTPLoader.java
index a00b563af..9dbdcfda2 100644
--- a/source/net/yacy/crawler/retrieval/HTTPLoader.java
+++ b/source/net/yacy/crawler/retrieval/HTTPLoader.java
@@ -149,7 +149,7 @@ public final class HTTPLoader {

                 // restart crawling with new url
                 this.log.info("CRAWLER Redirection detected ('" + client.getHttpResponse().getStatusLine() + "') for URL " + requestURLString);
-                this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl);
+                this.log.info("CRAWLER ..Redirecting request to: " + redirectionUrl.toNormalform(false));

                 this.sb.webStructure.generateCitationReference(url, redirectionUrl);
diff --git a/source/net/yacy/crawler/retrieval/Request.java b/source/net/yacy/crawler/retrieval/Request.java
index bcb5c6b24..1d027e5bc 100644
--- a/source/net/yacy/crawler/retrieval/Request.java
+++ b/source/net/yacy/crawler/retrieval/Request.java
@@ -225,7 +225,7 @@ public class Request extends WorkflowJob
             new byte[][] {
                 this.url.hash(),
                 this.initiator,
-                UTF8.getBytes(this.url.toString()),
+                UTF8.getBytes(this.url.toNormalform(false)),
                 this.refhash,
                 namebytes,
                 appdatestr,
diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java
index a2530a391..b3594b109 100644
--- a/source/net/yacy/crawler/retrieval/SitemapImporter.java
+++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java
@@ -98,7 +98,7 @@ public class SitemapImporter extends Thread {
             this.sb.crawlStacker.enqueueEntry(new Request(
                     ASCII.getBytes(this.sb.peers.mySeed().hash),
                     url,
-                    null, // this.siteMapURL.toString(),
+                    null, // this.siteMapURL.toNormalform(false),
                     entry.url(),
                     entry.lastmod(new Date()),
                     this.crawlingProfile.handle(),
diff --git a/source/net/yacy/data/BookmarkHelper.java b/source/net/yacy/data/BookmarkHelper.java
index abc7e2e51..c93019db1 100644
--- a/source/net/yacy/data/BookmarkHelper.java
+++ b/source/net/yacy/data/BookmarkHelper.java
@@ -149,7 +149,7 @@ public class BookmarkHelper {
             title = url.getNameProperty();
             ConcurrentLog.info("BOOKMARKS", "links.get(url)");
             if ("".equals(title)) {//cannot be displayed
-                title = url.toString();
+                title = url.toNormalform(false);
             }
             bm = db.new Bookmark(url);
             bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java
index bbda8b5b7..7db41a887 100644
--- a/source/net/yacy/data/ymark/YMarkCrawlStart.java
+++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java
@@ -184,7 +184,8 @@ public class YMarkCrawlStart extends HashMap<String, String>{
             CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
             -1,
             crawlingQ,
-            true, true, true, true, true, false,
+            true, true, true, false,
+            true, true, false,
             CacheStrategy.IFFRESH,
             "robot_" + CrawlSwitchboard.CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA,
             ClientIdentification.yacyIntranetCrawlerAgentName); // TODO: make this a default profile in CrawlSwitchboard
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index fe3a1b0c2..7fb72b1a0 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -818,7 +818,7 @@ dc_rights
         final List<String> descriptions = new ArrayList<String>();
         final Collection<String> titles = new LinkedHashSet<String>();
         final Collection<String> sectionTitles = new LinkedHashSet<String>();
-        final List<DigestURL> anchors = new ArrayList<DigestURL>();
+        final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
         final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>();
         final LinkedHashMap<DigestURL, ImageEntry> images = new LinkedHashMap<DigestURL, ImageEntry>();
         final Set<String> languages = new HashSet<String>();
@@ -913,16 +913,22 @@ dc_rights

     public final static String CANONICAL_MARKER = "canonical";

-    public static Map<DigestURL, String> getHyperlinks(final Document[] documents) {
-        final Map<DigestURL, String> result = new HashMap<DigestURL, String>();
+    public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
+        final Map<AnchorURL, String> result = new HashMap<>();
         for (final Document d: documents) {
-            result.putAll(d.getHyperlinks());
+            if (includeNofollow) {
+                result.putAll(d.getHyperlinks());
+            } else {
+                for (Map.Entry<AnchorURL, String> entry: d.getHyperlinks().entrySet()) {
+                    if (!entry.getKey().attachedNofollow()) result.put(entry.getKey(), entry.getValue());
+                }
+            }
             final Object parser = d.getParserObject();
             if (parser instanceof ContentScraper) {
                 final ContentScraper html = (ContentScraper) parser;
                 String refresh = html.getRefreshPath();
-                if (refresh != null && refresh.length() > 0) try {result.put(new DigestURL(refresh), "refresh");} catch (final MalformedURLException e) {}
-                DigestURL canonical = html.getCanonical();
+                if (refresh != null && refresh.length() > 0) try {result.put(new AnchorURL(refresh), "refresh");} catch (final MalformedURLException e) {}
+                AnchorURL canonical = html.getCanonical();
                 if (canonical != null) {
                     result.put(canonical, CANONICAL_MARKER);
                 }
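The getHyperlinks() signature change is where the crawl option actually takes effect: callers pass includeNofollow, and Switchboard (further down) passes the negated profile flag. A caller sketch, assuming the patched API (the documents array would come from a real parser run):

    import java.util.Map;

    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.document.Document;

    public class HyperlinkFilterDemo {
        static void printFollowableLinks(final Document[] documents, final boolean obeyNofollow) {
            // includeNofollow == false drops every link whose rel contains "nofollow"
            final Map<AnchorURL, String> links = Document.getHyperlinks(documents, !obeyNofollow);
            for (final Map.Entry<AnchorURL, String> entry : links.entrySet()) {
                System.out.println(entry.getKey().toNormalform(false) + " -> " + entry.getValue());
            }
        }
    }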
diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java
index a41a9cc4d..e1d447616 100644
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@@ -183,7 +183,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
     private final CharBuffer content;
     private final EventListenerList htmlFilterEventListeners;
     private double lon, lat;
-    private DigestURL canonical, publisher;
+    private AnchorURL canonical, publisher;
     private final int maxLinks;
     private int breadcrumbs;
@@ -771,7 +771,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
         return this.script;
     }

-    public DigestURL getCanonical() {
+    public AnchorURL getCanonical() {
         return this.canonical;
     }
diff --git a/source/net/yacy/document/parser/html/ImageEntry.java b/source/net/yacy/document/parser/html/ImageEntry.java
index 60003f23c..f1d6061d7 100644
--- a/source/net/yacy/document/parser/html/ImageEntry.java
+++ b/source/net/yacy/document/parser/html/ImageEntry.java
@@ -133,7 +133,7 @@ public class ImageEntry implements Comparable<ImageEntry>, Comparator<ImageEntry> {
         if (thc > ohc) return 1;
-        return this.imageurl.toString().compareTo((h).imageurl.toString());
+        return this.imageurl.toNormalform(true).compareTo((h).imageurl.toNormalform(true));
     }

     @Override
diff --git a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
index ceeaff2f7..e4cadf152 100644
--- a/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
+++ b/source/net/yacy/document/parser/rdfa/impl/RDFaParser.java
@@ -55,7 +55,7 @@ public class RDFaParser extends AbstractParser implements Parser {

         // TODO: current hardcoded restriction: apply rdfa parser only on selected sources.

-        if (url.toString().contains(".yacy") || url.toString().contains("experiments")) {
+        if (url.toNormalform(true).contains(".yacy") || url.toNormalform(true).contains("experiments")) {
             // if (true == false) {
             Document rdfaDoc = parseRDFa(url, mimeType, charset, source);
             Document[] retDocs = new Document[htmlDocs.length + 1];
diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java
index a2616a6a6..4e14899e1 100644
--- a/source/net/yacy/document/parser/sitemapParser.java
+++ b/source/net/yacy/document/parser/sitemapParser.java
@@ -116,7 +116,7 @@ public class sitemapParser extends AbstractParser implements Parser {
         final HTTPClient client = new HTTPClient(agent);
         client.setHeader(requestHeader.entrySet());
         try {
-            client.GET(sitemapURL.toString(), false);
+            client.GET(sitemapURL.toNormalform(false), false);
             if (client.getStatusCode() != 200) {
                 throw new IOException("Unable to download the sitemap file " + sitemapURL +
                         "\nServer returned status: " + client.getHttpResponse().getStatusLine());
diff --git a/source/net/yacy/document/parser/vcfParser.java b/source/net/yacy/document/parser/vcfParser.java
index 1c78b213f..a60269b1d 100644
--- a/source/net/yacy/document/parser/vcfParser.java
+++ b/source/net/yacy/document/parser/vcfParser.java
@@ -179,7 +179,7 @@ public class vcfParser extends AbstractParser implements Parser {
                     } else if (key.toUpperCase().startsWith("URL")) {
                         try {
                             final AnchorURL newURL = new AnchorURL(value);
-                            newURL.setNameProperty(newURL.toString());
+                            newURL.setNameProperty(newURL.toNormalform(false));
                             anchors.add(newURL);
                             //parsedData.put(key,value);
                         } catch (final MalformedURLException ex) {/* ignore this */}
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index c25cde6d6..004907dcc 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -396,7 +396,7 @@ public final class LoaderDispatcher {
      * @return a map from URLs to the anchor texts of the urls
      * @throws IOException
      */
-    public final Map<DigestURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
+    public final Map<AnchorURL, String> loadLinks(final AnchorURL url, final CacheStrategy cacheStrategy, BlacklistType blacklistType, final ClientIdentification.Agent agent) throws IOException {
         final Response response = load(request(url, true, false), cacheStrategy, Integer.MAX_VALUE, blacklistType, agent);
         if (response == null) throw new IOException("response == null");
         final ResponseHeader responseHeader = response.getResponseHeader();
@@ -413,7 +413,7 @@ public final class LoaderDispatcher {
             throw new IOException("parser error: " + e.getMessage());
         }

-        return Document.getHyperlinks(documents);
+        return Document.getHyperlinks(documents, true);
     }

     public synchronized static void cleanupAccessTimeTable(final long timeout) {
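loadLinks() deliberately passes includeNofollow = true, so the search heuristics built on it keep seeing every link; only the crawler path filters. A call-site sketch mirroring the Switchboard usage below (the import paths are my reading of the YaCy tree and the URL is made up — treat both as assumptions):

    import java.util.Map;

    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.cora.federate.yacy.CacheStrategy;
    import net.yacy.cora.protocol.ClientIdentification;
    import net.yacy.repository.Blacklist.BlacklistType;
    import net.yacy.repository.LoaderDispatcher;

    public class LoadLinksDemo {
        static void printOutlinks(final LoaderDispatcher loader) throws Exception {
            final AnchorURL start = new AnchorURL("http://example.com/");
            // nofollow links are included here; callers filter later if they care
            final Map<AnchorURL, String> links = loader.loadLinks(
                    start, CacheStrategy.NOCACHE, BlacklistType.SEARCH,
                    ClientIdentification.yacyIntranetCrawlerAgent);
            for (final Map.Entry<AnchorURL, String> e : links.entrySet()) {
                System.out.println(e.getKey().toNormalform(false) + " | " + e.getValue());
            }
        }
    }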
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 7d55deaaf..ea48acef2 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2584,26 +2584,27 @@ public final class Switchboard extends serverSwitch {
         for (Document d: documents) d.setDepth(response.depth());

         // get the hyperlinks
-        final Map<DigestURL, String> hl = Document.getHyperlinks(documents);
+        final Map<AnchorURL, String> hl = Document.getHyperlinks(documents, !response.profile().obeyHtmlRobotsNofollow());
+
         if (response.profile().indexMedia()) {
             for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
-                if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(entry.getKey(), entry.getValue());
+                if (TextParser.supportsExtension(entry.getKey()) == null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
             }
         }

         // add all media links also to the crawl stack. They will be re-sorted to the NOLOAD queue and indexed afterwards as pure links
         if (response.profile().directDocByURL()) {
             for (Map.Entry<DigestURL, String> entry: Document.getImagelinks(documents).entrySet()) {
-                if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(entry.getKey(), entry.getValue());
+                if (TextParser.supportsExtension(entry.getKey()) != null) hl.put(new AnchorURL(entry.getKey()), entry.getValue());
             }
-            hl.putAll(Document.getApplinks(documents));
-            hl.putAll(Document.getVideolinks(documents));
-            hl.putAll(Document.getAudiolinks(documents));
+            for (Map.Entry<DigestURL, String> d: Document.getApplinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
+            for (Map.Entry<DigestURL, String> d: Document.getVideolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
+            for (Map.Entry<DigestURL, String> d: Document.getAudiolinks(documents).entrySet()) hl.put(new AnchorURL(d.getKey()), d.getValue());
         }

         // insert those hyperlinks to the crawler
         MultiProtocolURL nextUrl;
-        for ( final Map.Entry<DigestURL, String> nextEntry : hl.entrySet() ) {
+        for ( final Map.Entry<AnchorURL, String> nextEntry : hl.entrySet() ) {
             // check for interruption
             checkInterruption();
@@ -2880,7 +2881,7 @@ public final class Switchboard extends serverSwitch {

     public final void addAllToIndex(
         final DigestURL url,
-        final Map<DigestURL, String> links,
+        final Map<AnchorURL, String> links,
         final SearchEvent searchEvent,
         final String heuristicName,
         final Map<String, Pattern> collections,
@@ -2893,15 +2894,15 @@ public final class Switchboard extends serverSwitch {
         }

         // check if some of the links match with the query
-        final Map<DigestURL, String> matcher = searchEvent.query.separateMatches(links);
+        final Map<AnchorURL, String> matcher = searchEvent.query.separateMatches(links);

         // take the matcher and load them all
-        for (final Map.Entry<DigestURL, String> entry : matcher.entrySet()) {
+        for (final Map.Entry<AnchorURL, String> entry : matcher.entrySet()) {
             urls.add(new DigestURL(entry.getKey(), (byte[]) null));
         }

         // take then the no-matcher and load them also
-        for (final Map.Entry<DigestURL, String> entry : links.entrySet()) {
+        for (final Map.Entry<AnchorURL, String> entry : links.entrySet()) {
             urls.add(new DigestURL(entry.getKey(), (byte[]) null));
         }
         addToIndex(urls, searchEvent, heuristicName, collections, doublecheck);
@@ -3479,12 +3480,12 @@ public final class Switchboard extends serverSwitch {
             return;
         }

-        final Map<DigestURL, String> links;
+        final Map<AnchorURL, String> links;
         searchEvent.oneFeederStarted();
         try {
             links = Switchboard.this.loader.loadLinks(url, CacheStrategy.NOCACHE, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
             if ( links != null ) {
-                final Iterator<DigestURL> i = links.keySet().iterator();
+                final Iterator<AnchorURL> i = links.keySet().iterator();
                 while ( i.hasNext() ) {
                     if ( !i.next().getHost().endsWith(host) ) {
                         i.remove();
@@ -3518,13 +3519,13 @@ public final class Switchboard extends serverSwitch {
             return;
         }

-        final Map<DigestURL, String> links;
+        final Map<AnchorURL, String> links;
         DigestURL url;
         try {
             links = Switchboard.this.loader.loadLinks(startUrl, CacheStrategy.IFFRESH, BlacklistType.SEARCH, ClientIdentification.yacyIntranetCrawlerAgent);
             if (links != null) {
                 if (links.size() < 1000) { // limit to 1000 to skip large index pages
-                    final Iterator<DigestURL> i = links.keySet().iterator();
+                    final Iterator<AnchorURL> i = links.keySet().iterator();
                     final boolean globalcrawljob = Switchboard.this.getConfigBool(SwitchboardConstants.HEURISTIC_SEARCHRESULTS_CRAWLGLOBAL,false);
                     Collection<DigestURL> urls = new ArrayList<DigestURL>();
                     while (i.hasNext()) {
@@ -3590,11 +3591,11 @@ public final class Switchboard extends serverSwitch {
                     //System.out.println("BLEKKO: " + UTF8.String(resource));
                     rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
                     if ( rss != null ) {
-                        final Map<DigestURL, String> links = new TreeMap<DigestURL, String>();
-                        DigestURL uri;
+                        final Map<AnchorURL, String> links = new TreeMap<>();
+                        AnchorURL uri;
                         for ( final RSSMessage message : rss.getFeed() ) {
                             try {
-                                uri = new DigestURL(message.getLink());
+                                uri = new AnchorURL(message.getLink());
                                 links.put(uri, message.getTitle());
                             } catch (final MalformedURLException e ) {
                             }
@@ -3720,7 +3721,7 @@ public final class Switchboard extends serverSwitch {
                 final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent, timeout);
                 client.setHeader(reqHeader.entrySet());
-                client.HEADResponse(url.toString(), false);
+                client.HEADResponse(url.toNormalform(false), false);
                 int statusCode = client.getHttpResponse().getStatusLine().getStatusCode();
                 ResponseHeader header = new ResponseHeader(statusCode, client.getHttpResponse().getAllHeaders());
                 if (checkAge) {
diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java
index 03161e768..77eb7120e 100644
--- a/source/net/yacy/search/index/DocumentIndex.java
+++ b/source/net/yacy/search/index/DocumentIndex.java
@@ -151,7 +151,7 @@ public class DocumentIndex extends Segment {
         try {
             documents = TextParser.parseSource(url, null, null, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null));
         } catch (final Exception e ) {
-            throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
+            throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage());
         }
         //Document document = Document.mergeDocuments(url, null, documents);
         final SolrInputDocument[] rows = new SolrInputDocument[documents.length];
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index 0b54dcee8..dff407505 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -592,7 +592,7 @@ public class Segment {
         int outlinksSame = document.inboundLinks().size();
         int outlinksOther = document.outboundLinks().size();
         final int urlLength = urlNormalform.length();
-        final int urlComps = MultiProtocolURL.urlComps(url.toString()).length;
+        final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;

         // create a word prototype which is re-used for all entries
         if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
diff --git a/source/net/yacy/search/query/QueryParams.java b/source/net/yacy/search/query/QueryParams.java
index c5dd35a5e..60599232b 100644
--- a/source/net/yacy/search/query/QueryParams.java
+++ b/source/net/yacy/search/query/QueryParams.java
@@ -35,10 +35,11 @@ import java.util.Set;
 import java.util.SortedSet;
 import java.util.regex.Pattern;
 import java.util.regex.PatternSyntaxException;
+
 import net.yacy.cora.document.analysis.Classification;
 import net.yacy.cora.document.analysis.Classification.ContentDomain;
 import net.yacy.cora.document.encoding.ASCII;
-import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.federate.solr.Ranking;
 import net.yacy.cora.federate.yacy.CacheStrategy;
 import net.yacy.cora.geo.GeoLocation;
@@ -60,6 +61,7 @@ import net.yacy.search.index.Segment;
 import net.yacy.search.ranking.RankingProfile;
 import net.yacy.search.schema.CollectionConfiguration;
 import net.yacy.search.schema.CollectionSchema;
+
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrQuery.SortClause;
 import org.apache.solr.common.params.CommonParams;
@@ -522,11 +524,11 @@ public final class QueryParams {
         return this.queryGoal;
     }

-    public final Map<DigestURL, String> separateMatches(final Map<DigestURL, String> links) {
-        final Map<DigestURL, String> matcher = new HashMap<DigestURL, String>();
-        final Iterator<Map.Entry<DigestURL, String>> i = links.entrySet().iterator();
-        Map.Entry<DigestURL, String> entry;
-        DigestURL url;
+    public final Map<AnchorURL, String> separateMatches(final Map<AnchorURL, String> links) {
+        final Map<AnchorURL, String> matcher = new HashMap<>();
+        final Iterator<Map.Entry<AnchorURL, String>> i = links.entrySet().iterator();
+        Map.Entry<AnchorURL, String> entry;
+        AnchorURL url;
         String anchorText;
         while (i.hasNext()) {
             entry = i.next();
diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java
index 0be3a2e7a..2349849b6 100644
--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@@ -1531,7 +1531,7 @@ public final class SearchEvent {
         }

         @Override
         public String toString() {
-            return this.imageUrl.toString();
+            return this.imageUrl.toNormalform(false);
         }
     }
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index e694cc3d9..5c070785d 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -1199,10 +1199,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
                     }
                     proccount.incrementAndGet();
                     allcount.incrementAndGet();
-                    if (proccount.get() % 1000 == 0) ConcurrentLog.info(
-                            "CollectionConfiguration", "webgraph - postprocessed " + proccount + " from " + count + " documents; " +
+                    if (proccount.get() % 1000 == 0) {
+                        postprocessingActivity = "writing cr values to webgraph for host " + hostfinal + "postprocessed " + proccount + " from " + count + " documents; " +
                             (proccount.get() * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
-                            ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining for host " + hostfinal);
+                            ((System.currentTimeMillis() - start) * (count - proccount.get()) / proccount.get() / 60000) + " minutes remaining";
+                        ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                    }
                 }
             } catch (InterruptedException e) {
                 ConcurrentLog.warn("CollectionConfiguration", e.getMessage(), e);
@@ -1301,10 +1303,12 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable {
                 collectionConnector.add(sid);
                 proccount++;
                 allcount.incrementAndGet();
-                if (proccount % 100 == 0) ConcurrentLog.info(
-                        "CollectionConfiguration", "collection - postprocessed " + proccount + " from " + count + " documents; " +
+                if (proccount % 100 == 0) {
+                    postprocessingActivity = "postprocessed " + proccount + " from " + count + " collection documents; " +
                         (proccount * 1000 / (System.currentTimeMillis() - start)) + " docs/second; " +
-                        ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining");
+                        ((System.currentTimeMillis() - start) * (count - proccount) / proccount / 60000) + " minutes remaining";
+                    ConcurrentLog.info("CollectionConfiguration", postprocessingActivity);
+                }
             } catch (final Throwable e1) {
                 ConcurrentLog.logException(e1);
                 failids.add(i);
diff --git a/source/net/yacy/search/snippet/MediaSnippet.java b/source/net/yacy/search/snippet/MediaSnippet.java
index b2d2fc2c2..c88890007 100644
--- a/source/net/yacy/search/snippet/MediaSnippet.java
+++ b/source/net/yacy/search/snippet/MediaSnippet.java
@@ -202,7 +202,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
-            final String u = ientry.url().toString();
+            final String u = ientry.url().toNormalform(false);
             if (u.indexOf(".ico",0) >= 0 || u.indexOf("favicon",0) >= 0) continue;
             if (ientry.height() > 0 && ientry.height() < 32) continue;
@@ -262,7 +262,7 @@ public class MediaSnippet implements Comparable<MediaSnippet>, Comparator<MediaSnippet> {
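Taken together, the pieces visible in this patch compose as follows: the per-crawl profile flag gates which hyperlinks ever reach the crawl stack. A summary sketch (not code from the patch; it only combines the two methods the patch adds):

    import java.util.Map;

    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.crawler.data.CrawlProfile;
    import net.yacy.document.Document;

    public class NofollowPolicyDemo {
        // if the profile says to obey rel="nofollow", links carrying it are
        // filtered out before the crawler ever sees them
        static Map<AnchorURL, String> followableLinks(final Document[] docs, final CrawlProfile profile) {
            return Document.getHyperlinks(docs, !profile.obeyHtmlRobotsNofollow());
        }
    }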