diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 0b880ac3f..58036b48e 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -1,48 +1,49 @@
#{crawlProfiles}#
-
- #[handle]#
- #[name]#
- #[collections]#
- #[agentName]#
- #[userAgent]#
- #[depth]#
- #(directDocByURL)#false::true#(/directDocByURL)#
- #[recrawlIfOlder]#
- #[domMaxPages]#
- #(crawlingQ)#false::true#(/crawlingQ)#
- #(followFrames)#false::true#(/followFrames)#
- #(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#
- #(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#
- #(indexText)#false::true#(/indexText)#
- #(indexMedia)#false::true#(/indexMedia)#
- #(storeHTCache)#false::true#(/storeHTCache)#
- #(remoteIndexing)#false::true#(/remoteIndexing)#
- #[cacheStrategy]#
- #(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#
- #[crawlerURLMustMatch]#
- #[crawlerURLMustNotMatch]#
- #[crawlerOriginURLMustMatch]#
- #[crawlerOriginURLMustNotMatch]#
- #[crawlerIPMustMatch]#
- #[crawlerIPMustNotMatch]#
- #[crawlerCountryMustMatch]#
- #[crawlerNoLimitURLMustMatch]#
- #[indexURLMustMatch]#
- #[indexURLMustNotMatch]#
- #[indexContentMustMatch]#
- #[indexContentMustNotMatch]#
- #[indexMediaTypeMustMatch]#
- #[indexMediaTypeMustNotMatch]#
- #[indexSolrQueryMustMatch]#
- #[indexSolrQueryMustNotMatch]#
- #(status)#terminated::active::system#(/status)#
-
- #{crawlingDomFilterContent}#
- - #[item]#
- #{/crawlingDomFilterContent}#
-
-
+
+ #[handle]#
+ #[name]#
+ #[collections]#
+ #[agentName]#
+ #[userAgent]#
+ #[depth]#
+ #(directDocByURL)#false::true#(/directDocByURL)#
+ #[recrawlIfOlder]#
+ #[domMaxPages]#
+ #(crawlingQ)#false::true#(/crawlingQ)#
+ #(followFrames)#false::true#(/followFrames)#
+ #(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#
+ #(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#
+ #(indexText)#false::true#(/indexText)#
+ #(indexMedia)#false::true#(/indexMedia)#
+ #(storeHTCache)#false::true#(/storeHTCache)#
+ #(remoteIndexing)#false::true#(/remoteIndexing)#
+ #[cacheStrategy]#
+ #(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#
+ #[crawlerURLMustMatch]#
+ #[crawlerURLMustNotMatch]#
+ #[crawlerOriginURLMustMatch]#
+ #[crawlerOriginURLMustNotMatch]#
+ #[crawlerIPMustMatch]#
+ #[crawlerIPMustNotMatch]#
+ #[crawlerCountryMustMatch]#
+ #[crawlerNoLimitURLMustMatch]#
+ #[indexURLMustMatch]#
+ #[indexURLMustNotMatch]#
+ #[indexContentMustMatch]#
+ #[indexContentMustNotMatch]#
+ #[indexMediaTypeMustMatch]#
+ #[indexMediaTypeMustNotMatch]#
+ #[indexSolrQueryMustMatch]#
+ #[indexSolrQueryMustNotMatch]#
+ #(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#
+ #(status)#terminated::active::system#(/status)#
+
+ #{crawlingDomFilterContent}#
+ - #[item]#
+ #{/crawlingDomFilterContent}#
+
+
#{/crawlProfiles}#
diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index c3e60c1b8..9917cd7e1 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -412,6 +412,9 @@
Filter on Content of Document
(all visible text, including camel-case-tokenized url and title)
@@ -470,7 +473,7 @@
must-not-match |
-
+
|
#(/embeddedSolrConnected)#
diff --git a/source/net/yacy/crawler/CrawlStacker.java b/source/net/yacy/crawler/CrawlStacker.java
index 16ffea78b..1b4874a96 100644
--- a/source/net/yacy/crawler/CrawlStacker.java
+++ b/source/net/yacy/crawler/CrawlStacker.java
@@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask{
// check if ip is local ip address
final String urlRejectReason = this.urlInAcceptedDomain(url);
if (urlRejectReason != null) {
- if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")");
+ if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index aca7b95cd..3c17c75cf 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
null,
@@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
@@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0,
false,
null,
diff --git a/source/net/yacy/crawler/RecrawlBusyThread.java b/source/net/yacy/crawler/RecrawlBusyThread.java
index af32db0cf..11b3d3a98 100644
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
+ true, //noindexWhenCanonicalUnequalURL
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 19f36b613..64af0c489 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
+ NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.BOOLEAN, "No Indexing for Documents with Canonical != URL"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
final String indexContentMustMatch, final String indexContentMustNotMatch,
+ final boolean noindexWhenCanonicalUnequalURL,
final int depth,
final boolean directDocByURL,
final Date recrawlIfOlder /*date*/,
@@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
+ put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
}
/**
@@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap implements M
return (r.equals(Boolean.TRUE.toString()));
}
+ public boolean noindexWhenCanonicalUnequalURL() {
+ final String r = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
+ if (r == null) return true;
+ return (r.equals(Boolean.TRUE.toString()));
+ }
+
public boolean storeHTCache() {
final String r = get(CrawlAttribute.STORE_HTCACHE.key);
if (r == null) return false;
@@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
+ prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));
diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java
index acf7c6fd0..567685510 100644
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@@ -175,10 +175,10 @@ public class Response {
int p = mime.indexOf('/');
if (p < 0) return new String[]{mime};
if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)};
- if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
- if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
- if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
- return new String[]{mime};
+ if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
+ if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
+ if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
+ return new String[]{mime};
}
public static final int QUEUE_STATE_FRESH = 0;
@@ -235,16 +235,16 @@ public class Response {
* @return the original request that produced this response
*/
public Request getRequest() {
- return request;
- }
+ return request;
+ }
public ResponseHeader getResponseHeader() {
return this.responseHeader;
}
public RequestHeader getRequestHeader() {
- return this.requestHeader;
- }
+ return this.requestHeader;
+ }
public boolean fromCache() {
return this.fromCache;
@@ -260,11 +260,11 @@ public class Response {
return this.request.name();
}
- /**
- * @return the requested URL that produced this response. When redirection(s)
- * occurred, this is not the initial URL, but the last redirection
- * target.
- */
+ /**
+ * @return the requested URL that produced this response. When redirection(s)
+ * occurred, this is not the initial URL, but the last redirection
+ * target.
+ */
public DigestURL url() {
return this.request.url();
}
@@ -745,11 +745,11 @@ public class Response {
// -ranges in request
// we checked that in shallStoreCache
- /*
- * Eventually check if a parser supports the media yype. Depending on the crawl
- * profile, the indexingDocumentProcessor can eventually index only URL metadata
- * using the generic parser for unsupported media types
- */
+ /*
+ * Eventually check if a parser supports the media type. Depending on the crawl
+ * profile, the indexingDocumentProcessor can eventually index only URL metadata
+ * using the generic parser for unsupported media types
+ */
if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
final String mimeType = this.responseHeader.getContentType();
final String parserError = TextParser.supportsMime(mimeType);
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index ba3365d5c..34676836a 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -91,12 +91,12 @@ public class Document {
/** links to icons that belongs to the document (mapped by absolute URL) */
private Map icons;
- /**
- * URLs of linked data item types/classes referenced by the document (for example in
- * HTML with standard annotations such as RDFa, microdata, microformats or
- * JSON-LD)
- */
- private Set linkedDataTypes;
+ /**
+ * URLs of linked data item types/classes referenced by the document (for example in
+ * HTML with standard annotations such as RDFa, microdata, microformats or
+ * JSON-LD)
+ */
+ private Set linkedDataTypes;
private boolean resorted;
private final Set languages;
private boolean indexingDenied;
@@ -131,13 +131,13 @@ public class Document {
this.parserObject = parserObject;
this.keywords = new LinkedHashSet();
if (keywords != null) {
- Collections.addAll(this.keywords, keywords);
+ Collections.addAll(this.keywords, keywords);
}
this.titles = (titles == null) ? new ArrayList(1) : titles;
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = new LinkedList() ;
if (sections != null) {
- Collections.addAll(this.sections, sections);
+ Collections.addAll(this.sections, sections);
}
this.descriptions = (abstrcts == null) ? new ArrayList() : abstrcts;
if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
@@ -216,13 +216,21 @@ public class Document {
}
this.scraperObject = scraper;
}
+
+ public AnchorURL getCanonical() {
+ final Object scraper = this.getScraperObject();
+ if (!(scraper instanceof ContentScraper)) return null;
+ final ContentScraper html = (ContentScraper) scraper;
+ AnchorURL canonical = html.getCanonical();
+ return canonical;
+ }
public Set getContentLanguages() {
return this.languages;
}
public String getFileName() {
- return this.source.getFileName();
+ return this.source.getFileName();
}
public Map> getGenericFacets() {
@@ -233,15 +241,15 @@ public class Document {
* @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public boolean isPartiallyParsed() {
- return this.partiallyParsed;
- }
+ return this.partiallyParsed;
+ }
/**
* @param partiallyParsed set to true to indicates this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public void setPartiallyParsed(final boolean partiallyParsed) {
- this.partiallyParsed = partiallyParsed;
- }
+ this.partiallyParsed = partiallyParsed;
+ }
/**
* compute a set of languages that this document contains
@@ -637,13 +645,13 @@ dc_rights
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
- /*
- * Should we also include icons ? with
- * this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
- * problematic as allReflinks will modify icons set set, removing those whose URL is
- * starting with "/www" but it is not desired for icons such as
- * www.wikipedia.org/static/favicon/wikipedia.ico
- */
+ /*
+ * Should we also include icons ? with
+ * this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
+ * problematic as allReflinks will modify the icons set, removing those whose URL is
+ * starting with "/www" but it is not desired for icons such as
+ * www.wikipedia.org/static/favicon/wikipedia.ico
+ */
this.hyperlinks.putAll(allReflinks(this.images.values()));
this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
@@ -804,16 +812,16 @@ dc_rights
}
InputStream textStream = doc.getTextStream();
try {
- FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
+ FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
} finally {
- try {
- if(textStream != null) {
- /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
- textStream.close();
- }
- } catch(IOException e) {
- ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
- }
+ try {
+ if(textStream != null) {
+ /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
+ textStream.close();
+ }
+ } catch(IOException e) {
+ ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
+ }
}
this.anchors.addAll(doc.getAnchors());
@@ -826,41 +834,41 @@ dc_rights
* @return links to icons that belongs to the document (mapped by absolute URL)
*/
public Map getIcons() {
- return icons;
- }
+ return icons;
+ }
/**
* Set links to icons that belongs to the document (mapped by absolute URL)
* @param icons
*/
public void setIcons(final Map icons) {
- /* Better to ensure now icons property will not be null */
- if(icons != null) {
- this.icons = icons;
- } else {
- this.icons = new HashMap<>();
- }
- }
+ /* Better to ensure now icons property will not be null */
+ if(icons != null) {
+ this.icons = icons;
+ } else {
+ this.icons = new HashMap<>();
+ }
+ }
- /**
- * @return URLs of linked data item types/classes referenced by the document (for example in
- * HTML with standard annotations such as RDFa, microdata, microformats or
- * JSON-LD)
- */
+ /**
+ * @return URLs of linked data item types/classes referenced by the document (for example in
+ * HTML with standard annotations such as RDFa, microdata, microformats or
+ * JSON-LD)
+ */
public Set getLinkedDataTypes() {
- return this.linkedDataTypes;
- }
+ return this.linkedDataTypes;
+ }
- /**
- * @return URLs of linked data item types/classes referenced by the document
- */
+ /**
+ * @return URLs of linked data item types/classes referenced by the document
+ */
public void setLinkedDataTypes(final Set linkedDataTypes) {
- if(linkedDataTypes != null) {
- /* Ensure non null property */
- this.linkedDataTypes = linkedDataTypes;
- } else {
- this.linkedDataTypes.clear();
- }
+ if(linkedDataTypes != null) {
+ /* Ensure non null property */
+ this.linkedDataTypes = linkedDataTypes;
+ } else {
+ this.linkedDataTypes.clear();
+ }
}
@@ -1034,14 +1042,14 @@ dc_rights
} catch (final IOException e) {
ConcurrentLog.logException(e);
} finally {
- try {
- if(textStream != null) {
- /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
- textStream.close();
- }
- } catch (IOException e) {
- ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
- }
+ try {
+ if(textStream != null) {
+ /* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
+ textStream.close();
+ }
+ } catch (IOException e) {
+ ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
+ }
}
}
anchors.addAll(doc.getAnchors());
@@ -1098,7 +1106,7 @@ dc_rights
public final static String IFRAME_MARKER = "iframe";
public final static String FRAME_MARKER = "frame";
public final static String EMBED_MARKER = "embed";
-
+
public static Map getHyperlinks(final Document[] documents, boolean includeNofollow) {
final Map result = new HashMap<>();
for (final Document d: documents) {
diff --git a/source/net/yacy/htroot/CrawlStartExpert.java b/source/net/yacy/htroot/CrawlStartExpert.java
index 9c8d55149..56b17d767 100644
--- a/source/net/yacy/htroot/CrawlStartExpert.java
+++ b/source/net/yacy/htroot/CrawlStartExpert.java
@@ -369,6 +369,13 @@ public class CrawlStartExpert {
}
}
+ // Check Canonical?
+ if (post == null) {
+ prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
+ } else {
+ prop.put("noindexWhenCanonicalUnequalURLChecked",
+ post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
+ }
// ---------- Clean-Up before Crawl Start
// delete if older settings: number value
diff --git a/source/net/yacy/htroot/Crawler_p.java b/source/net/yacy/htroot/Crawler_p.java
index 98ca68dc1..d6b19e883 100644
--- a/source/net/yacy/htroot/Crawler_p.java
+++ b/source/net/yacy/htroot/Crawler_p.java
@@ -316,6 +316,7 @@ public class Crawler_p {
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+ final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
@@ -614,6 +615,7 @@ public class Crawler_p {
indexUrlMustNotMatch,
indexContentMustMatch,
indexContentMustNotMatch,
+ noindexWhenCanonicalUnequalURL,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
diff --git a/source/net/yacy/htroot/QuickCrawlLink_p.java b/source/net/yacy/htroot/QuickCrawlLink_p.java
index 4230b69ea..6c2f432f4 100644
--- a/source/net/yacy/htroot/QuickCrawlLink_p.java
+++ b/source/net/yacy/htroot/QuickCrawlLink_p.java
@@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
+ false, //noindexWhenCanonicalUnequalURL -- NOTE(review): only call site passing false; confirm quick-crawl should bypass the canonical check
CrawlingDepth,
true,
CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 34ebfcc94..50902296e 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
}
- if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
- (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
- if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
+
+ // check mustmatch pattern
+ Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
+ if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
+ String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
+ if (this.log.isInfo()) this.log.info(info);
+ // create a new errorURL DB entry
+ this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
+ return new IndexingQueueEntry(in.queueEntry, in.documents, null);
+ }
+
+ // check mustnotmatch
+ Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
+ if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
+ String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl.pattern();
+ if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
- this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
+ this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check which files may take part in the indexing process
final List doclist = new ArrayList<>();
docloop: for (final Document document : in.documents) {
+
+ // check canonical
+ if (profile.noindexWhenCanonicalUnequalURL()) {
+ AnchorURL canonical = document.getCanonical();
+ DigestURL source = document.dc_source();
+ if (canonical != null && source != null) {
+ String canonical_norm = canonical.toNormalform(true);
+ String source_norm = source.toNormalform(true);
+ if (!canonical_norm.equals(source_norm)) {
+ String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " + canonical_norm + "; source = " + source_norm;
+ if (this.log.isInfo()) this.log.info(info);
+ // create a new errorURL DB entry
+ this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
+ continue docloop;
+ }
+ }
+ }
+
+ // check indexing denied flags
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop;
}
- if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
- (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
- if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
+
+ // check content pattern must-match
+ Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
+ if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
+ String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
+ if (this.log.isInfo()) this.log.info(info);
+ // create a new errorURL DB entry
+ this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
+ continue docloop;
+ }
+
+ // check content pattern must-not-match
+ Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
+ if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
+ String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
+ if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
- this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
+ this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}