added canonical filter

attention: this is on by default!
(it should do the right thing)
pull/554/head
Michael Peter Christen 2 years ago
parent 5a52b01c09
commit 9fcd8f1bda

@ -1,48 +1,49 @@
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<crawlProfiles>
#{crawlProfiles}#
<crawlProfile>
<handle>#[handle]#</handle>
<name>#[name]#</name>
<collections>#[collections]#</collections>
<agentName>#[agentName]#</agentName>
<userAgent>#[userAgent]#</userAgent>
<depth>#[depth]#</depth>
<directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
<recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
<domMaxPages>#[domMaxPages]#</domMaxPages>
<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
<indexText>#(indexText)#false::true#(/indexText)#</indexText>
<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#
<item>#[item]#</item>
#{/crawlingDomFilterContent}#
</crawlingDomFilterContent>
</crawlProfile>
<crawlProfile>
<handle>#[handle]#</handle>
<name>#[name]#</name>
<collections>#[collections]#</collections>
<agentName>#[agentName]#</agentName>
<userAgent>#[userAgent]#</userAgent>
<depth>#[depth]#</depth>
<directDocByURL>#(directDocByURL)#false::true#(/directDocByURL)#</directDocByURL>
<recrawlIfOlder>#[recrawlIfOlder]#</recrawlIfOlder>
<domMaxPages>#[domMaxPages]#</domMaxPages>
<crawlingQ>#(crawlingQ)#false::true#(/crawlingQ)#</crawlingQ>
<followFrames>#(followFrames)#false::true#(/followFrames)#</followFrames>
<obeyHtmlRobotsNoindex>#(obeyHtmlRobotsNoindex)#false::true#(/obeyHtmlRobotsNoindex)#</obeyHtmlRobotsNoindex>
<obeyHtmlRobotsNofollow>#(obeyHtmlRobotsNofollow)#false::true#(/obeyHtmlRobotsNofollow)#</obeyHtmlRobotsNofollow>
<indexText>#(indexText)#false::true#(/indexText)#</indexText>
<indexMedia>#(indexMedia)#false::true#(/indexMedia)#</indexMedia>
<storeHTCache>#(storeHTCache)#false::true#(/storeHTCache)#</storeHTCache>
<remoteIndexing>#(remoteIndexing)#false::true#(/remoteIndexing)#</remoteIndexing>
<cacheStrategy>#[cacheStrategy]#</cacheStrategy>
<crawlerAlwaysCheckMediaType>#(crawlerAlwaysCheckMediaType)#false::true#(/crawlerAlwaysCheckMediaType)#</crawlerAlwaysCheckMediaType>
<crawlerURLMustMatch>#[crawlerURLMustMatch]#</crawlerURLMustMatch>
<crawlerURLMustNotMatch>#[crawlerURLMustNotMatch]#</crawlerURLMustNotMatch>
<crawlerOriginURLMustMatch>#[crawlerOriginURLMustMatch]#</crawlerOriginURLMustMatch>
<crawlerOriginURLMustNotMatch>#[crawlerOriginURLMustNotMatch]#</crawlerOriginURLMustNotMatch>
<crawlerIPMustMatch>#[crawlerIPMustMatch]#</crawlerIPMustMatch>
<crawlerIPMustNotMatch>#[crawlerIPMustNotMatch]#</crawlerIPMustNotMatch>
<crawlerCountryMustMatch>#[crawlerCountryMustMatch]#</crawlerCountryMustMatch>
<crawlerNoLimitURLMustMatch>#[crawlerNoLimitURLMustMatch]#</crawlerNoLimitURLMustMatch>
<indexURLMustMatch>#[indexURLMustMatch]#</indexURLMustMatch>
<indexURLMustNotMatch>#[indexURLMustNotMatch]#</indexURLMustNotMatch>
<indexContentMustMatch>#[indexContentMustMatch]#</indexContentMustMatch>
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<noindexWhenCanonicalUnequalURL>#(noindexWhenCanonicalUnequalURL)#false::true#(/noindexWhenCanonicalUnequalURL)#</noindexWhenCanonicalUnequalURL>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#
<item>#[item]#</item>
#{/crawlingDomFilterContent}#
</crawlingDomFilterContent>
</crawlProfile>
#{/crawlProfiles}#
</crawlProfiles>

@ -412,6 +412,9 @@
<table style="border-width: 0px">
<tr><td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td><td><input name="indexmustmatch" id="indexmustmatch" type="text" size="55" maxlength="100000" value="#[indexmustmatch]#" onblur="if (this.value=='') this.value='.*';"/> (must not be empty)</td></tr>
<tr><td><img src="env/grafics/minus.gif" alt=""> must-not-match</td><td><input name="indexmustnotmatch" id="indexmustnotmatch" type="text" size="55" maxlength="100000" value="#[indexmustnotmatch]#" /></td></tr>
<tr>
<td colspan="2"><input type="checkbox" name="noindexWhenCanonicalUnequalURL" id="noindexWhenCanonicalUnequalURL" #(noindexWhenCanonicalUnequalURLChecked)#::checked="checked"#(/noindexWhenCanonicalUnequalURLChecked)#/> No Indexing when Canonical present and Canonical != URL</td>
</tr>
</table>
</dd>
<dt>Filter on Content of Document<br/>(all visible text, including camel-case-tokenized url and title)</dt>
@ -470,7 +473,7 @@
<tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" disabled="disabled"/>
</td>
</tr>
#(/embeddedSolrConnected)#

@ -488,7 +488,7 @@ public final class CrawlStacker implements WorkflowTask<Request>{
// check if ip is local ip address
final String urlRejectReason = this.urlInAcceptedDomain(url);
if (urlRejectReason != null) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("denied_(" + urlRejectReason + ")");
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL not in accepted Domain (" + urlRejectReason + ")");
return "denied_(" + urlRejectReason + ")";
}

@ -294,6 +294,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DEEP_DEPTH, "3")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -328,6 +329,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_SHALLOW_DEPTH, "1")),
true,
CrawlProfile.getRecrawlDate(Integer.parseInt(sb.getConfig(SwitchboardConstants.AUTOCRAWL_DAYS, "1"))*1440),
@ -362,6 +364,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
Integer.parseInt(sb.getConfig(SwitchboardConstants.PROXY_PREFETCH_DEPTH, "0")),
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@ -395,6 +398,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
null,
@ -428,6 +432,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@ -461,6 +466,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@ -502,6 +508,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_GREEDY_LEARNING_TEXT_RECRAWL_CYCLE),
@ -535,6 +542,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@ -568,6 +576,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@ -601,6 +610,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
@ -637,6 +647,7 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0,
false,
null,

@ -352,6 +352,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
CrawlProfile.MATCH_NEVER_STRING, // indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, // indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, // indexContentMustNotMatch
true, //noindexWhenCanonicalUnequalURL
0, false, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_RECRAWL_JOB_RECRAWL_CYCLE), -1,
true, true, true, false, // crawlingQ, followFrames, obeyHtmlRobotsNoindex, obeyHtmlRobotsNofollow,
true, true, true, false, -1, false, true, CrawlProfile.MATCH_NEVER_STRING, CacheStrategy.IFFRESH,

@ -115,6 +115,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
NOINDEX_WHEN_CANONICAL_UNEQUAL_URL("noindexWhenCanonicalUnequalURL", false, CrawlAttribute.STRING, "No Indexing for Documents with Canonical != URL"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@ -223,6 +224,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
final String indexContentMustMatch, final String indexContentMustNotMatch,
final boolean noindexWhenCanonicalUnequalURL,
final int depth,
final boolean directDocByURL,
final Date recrawlIfOlder /*date*/,
@ -300,6 +302,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
put(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key, noindexWhenCanonicalUnequalURL);
}
/**
@ -851,6 +854,12 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return (r.equals(Boolean.TRUE.toString()));
}
/**
 * Tells whether documents that declare a canonical link different from their
 * own URL shall be excluded from indexing.
 *
 * @return the configured flag value; defaults to true when the attribute is
 *         not present in this profile
 */
public boolean noindexWhenCanonicalUnequalURL() {
    final String value = get(CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL.key);
    return value == null || Boolean.TRUE.toString().equals(value);
}
public boolean storeHTCache() {
final String r = get(CrawlAttribute.STORE_HTCACHE.key);
if (r == null) return false;
@ -997,6 +1006,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexURLMustNotMatch", this.get(CrawlAttribute.INDEXING_URL_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
prop.put(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.NOINDEX_WHEN_CANONICAL_UNEQUAL_URL, noindexWhenCanonicalUnequalURL() ? 1 : 0);
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));

@ -175,10 +175,10 @@ public class Response {
int p = mime.indexOf('/');
if (p < 0) return new String[]{mime};
if (doctype == DT_TEXT) return new String[]{"text" + mime.substring(p)};
if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
return new String[]{mime};
if (doctype == DT_IMAGE) return new String[]{"image" + mime.substring(p)};
if (doctype == DT_AUDIO) return new String[]{"audio" + mime.substring(p)};
if (doctype == DT_MOVIE) return new String[]{"video" + mime.substring(p)};
return new String[]{mime};
}
public static final int QUEUE_STATE_FRESH = 0;
@ -235,16 +235,16 @@ public class Response {
* @return the original request that produced this response
*/
public Request getRequest() {
return request;
}
return request;
}
/**
 * @return the response header stored for this response
 */
public ResponseHeader getResponseHeader() {
return this.responseHeader;
}
public RequestHeader getRequestHeader() {
return this.requestHeader;
}
return this.requestHeader;
}
public boolean fromCache() {
return this.fromCache;
@ -260,11 +260,11 @@ public class Response {
return this.request.name();
}
/**
* @return the requested URL that produced this response. When redirection(s)
* occurred, this is not the initial URL, but the last redirection
* target.
*/
/**
* @return the requested URL that produced this response. When redirection(s)
* occurred, this is not the initial URL, but the last redirection
* target.
*/
public DigestURL url() {
return this.request.url();
}
@ -745,11 +745,11 @@ public class Response {
// -ranges in request
// we checked that in shallStoreCache
/*
* Eventually check if a parser supports the media type. Depending on the crawl
* profile, the indexingDocumentProcessor can eventually index only URL metadata
* using the generic parser for unsupported media types
*/
/*
* Eventually check if a parser supports the media type. Depending on the crawl
* profile, the indexingDocumentProcessor can eventually index only URL metadata
* using the generic parser for unsupported media types
*/
if (this.responseHeader != null && !profile().isIndexNonParseableUrls()) {
final String mimeType = this.responseHeader.getContentType();
final String parserError = TextParser.supportsMime(mimeType);

@ -91,12 +91,12 @@ public class Document {
/** links to icons that belongs to the document (mapped by absolute URL) */
private Map<DigestURL, IconEntry> icons;
/**
* URLs of linked data item types/classes referenced by the document (for example in
* HTML with standard annotations such as RDFa, microdata, microformats or
* JSON-LD)
*/
private Set<DigestURL> linkedDataTypes;
/**
* URLs of linked data item types/classes referenced by the document (for example in
* HTML with standard annotations such as RDFa, microdata, microformats or
* JSON-LD)
*/
private Set<DigestURL> linkedDataTypes;
private boolean resorted;
private final Set<String> languages;
private boolean indexingDenied;
@ -131,13 +131,13 @@ public class Document {
this.parserObject = parserObject;
this.keywords = new LinkedHashSet<String>();
if (keywords != null) {
Collections.addAll(this.keywords, keywords);
Collections.addAll(this.keywords, keywords);
}
this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
this.sections = new LinkedList<String>() ;
if (sections != null) {
Collections.addAll(this.sections, sections);
Collections.addAll(this.sections, sections);
}
this.descriptions = (abstrcts == null) ? new ArrayList<String>() : abstrcts;
if (lat >= -90.0d && lat <= 90.0d && lon >= -180.0d && lon <= 180.0d) {
@ -216,13 +216,21 @@ public class Document {
}
this.scraperObject = scraper;
}
/**
 * Retrieve the canonical URL declared by this document, when an HTML
 * content scraper is attached to it.
 *
 * @return the canonical link reported by the underlying ContentScraper, or
 *         null when this document has no ContentScraper attached
 */
public AnchorURL getCanonical() {
    final Object scraperObj = this.getScraperObject();
    if (scraperObj instanceof ContentScraper) {
        return ((ContentScraper) scraperObj).getCanonical();
    }
    return null;
}
public Set<String> getContentLanguages() {
return this.languages;
}
public String getFileName() {
return this.source.getFileName();
return this.source.getFileName();
}
public Map<String, Set<String>> getGenericFacets() {
@ -233,15 +241,15 @@ public class Document {
* @return true when this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public boolean isPartiallyParsed() {
return this.partiallyParsed;
}
return this.partiallyParsed;
}
/**
* @param partiallyParsed set to true to indicates this document is the result of a partially parsed resource, for example due to resource content size exceeding a given limit
*/
public void setPartiallyParsed(final boolean partiallyParsed) {
this.partiallyParsed = partiallyParsed;
}
this.partiallyParsed = partiallyParsed;
}
/**
* compute a set of languages that this document contains
@ -637,13 +645,13 @@ dc_rights
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
/*
* Should we also include icons ? with
* this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
* problematic as allReflinks will modify the icons set, removing those whose URL is
* starting with "/www" but it is not desired for icons such as
* www.wikipedia.org/static/favicon/wikipedia.ico
*/
/*
* Should we also include icons ? with
* this.hyperlinks.putAll(allReflinks(this.icons.keySet())); It is
* problematic as allReflinks will modify the icons set, removing those whose URL is
* starting with "/www" but it is not desired for icons such as
* www.wikipedia.org/static/favicon/wikipedia.ico
*/
this.hyperlinks.putAll(allReflinks(this.images.values()));
this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
@ -804,16 +812,16 @@ dc_rights
}
InputStream textStream = doc.getTextStream();
try {
FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
FileUtils.copy(textStream, (ByteArrayOutputStream) this.text);
} finally {
try {
if(textStream != null) {
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream.close();
}
} catch(IOException e) {
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
}
try {
if(textStream != null) {
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream.close();
}
} catch(IOException e) {
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
}
}
this.anchors.addAll(doc.getAnchors());
@ -826,41 +834,41 @@ dc_rights
* @return links to icons that belongs to the document (mapped by absolute URL)
*/
public Map<DigestURL, IconEntry> getIcons() {
return icons;
}
return icons;
}
/**
* Set links to icons that belongs to the document (mapped by absolute URL)
* @param icons
*/
public void setIcons(final Map<DigestURL, IconEntry> icons) {
/* Better to ensure now icons property will not be null */
if(icons != null) {
this.icons = icons;
} else {
this.icons = new HashMap<>();
}
}
/* Better to ensure now icons property will not be null */
if(icons != null) {
this.icons = icons;
} else {
this.icons = new HashMap<>();
}
}
/**
* @return URLs of linked data item types/classes referenced by the document (for example in
* HTML with standard annotations such as RDFa, microdata, microformats or
* JSON-LD)
*/
/**
* @return URLs of linked data item types/classes referenced by the document (for example in
* HTML with standard annotations such as RDFa, microdata, microformats or
* JSON-LD)
*/
public Set<DigestURL> getLinkedDataTypes() {
return this.linkedDataTypes;
}
return this.linkedDataTypes;
}
/**
* @return URLs of linked data item types/classes referenced by the document
*/
/**
* @return URLs of linked data item types/classes referenced by the document
*/
public void setLinkedDataTypes(final Set<DigestURL> linkedDataTypes) {
if(linkedDataTypes != null) {
/* Ensure non null property */
this.linkedDataTypes = linkedDataTypes;
} else {
this.linkedDataTypes.clear();
}
if(linkedDataTypes != null) {
/* Ensure non null property */
this.linkedDataTypes = linkedDataTypes;
} else {
this.linkedDataTypes.clear();
}
}
@ -1034,14 +1042,14 @@ dc_rights
} catch (final IOException e) {
ConcurrentLog.logException(e);
} finally {
try {
if(textStream != null) {
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream.close();
}
} catch (IOException e) {
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
}
try {
if(textStream != null) {
/* textStream can be a FileInputStream : we must close it to ensure releasing system resource */
textStream.close();
}
} catch (IOException e) {
ConcurrentLog.warn("DOCUMENT", "Could not close text input stream");
}
}
}
anchors.addAll(doc.getAnchors());
@ -1098,7 +1106,7 @@ dc_rights
public final static String IFRAME_MARKER = "iframe";
public final static String FRAME_MARKER = "frame";
public final static String EMBED_MARKER = "embed";
public static Map<AnchorURL, String> getHyperlinks(final Document[] documents, boolean includeNofollow) {
final Map<AnchorURL, String> result = new HashMap<>();
for (final Document d: documents) {

@ -369,6 +369,13 @@ public class CrawlStartExpert {
}
}
// Check Canonical?
if (post == null) {
prop.put("noindexWhenCanonicalUnequalURLChecked", 1);
} else {
prop.put("noindexWhenCanonicalUnequalURLChecked",
post.getBoolean("noindexWhenCanonicalUnequalURL") ? 1 : 0);
}
// ---------- Clean-Up before Crawl Start
// delete if older settings: number value

@ -316,6 +316,7 @@ public class Crawler_p {
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean noindexWhenCanonicalUnequalURL = "on".equals(post.get("noindexWhenCanonicalUnequalURL", "off"));
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
@ -614,6 +615,7 @@ public class Crawler_p {
indexUrlMustNotMatch,
indexContentMustMatch,
indexContentMustNotMatch,
noindexWhenCanonicalUnequalURL,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,

@ -150,6 +150,7 @@ public class QuickCrawlLink_p {
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
false,
CrawlingDepth,
true,
CrawlProfile.getRecrawlDate(60 * 24 * 30), // recrawlIfOlder (minutes); here: one month

@ -3152,28 +3152,73 @@ public final class Switchboard extends serverSwitch {
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
}
if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
(profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
// check mustmatch pattern
Pattern mustmatchurl = profile.indexUrlMustMatchPattern();
if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check mustnotmatch
Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern();
if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern(), -1);
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check which files may take part in the indexing process
final List<Document> doclist = new ArrayList<>();
docloop: for (final Document document : in.documents) {
// check canonical
if (profile.noindexWhenCanonicalUnequalURL()) {
AnchorURL canonical = document.getCanonical();
DigestURL source = document.dc_source();
if (canonical != null && source != null) {
String canonical_norm = canonical.toNormalform(true);
String source_norm = source.toNormalform(true);
if (!canonical_norm.equals(source_norm)) {
String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
}
}
// check indexing denied flags
if (document.indexingDenied() && profile.obeyHtmlRobotsNoindex() && !this.isIntranetMode()) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule", -1);
continue docloop;
}
if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
(profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
if (this.log.isInfo()) this.log.info("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
// check content pattern must-match
Pattern mustmatchcontent = profile.indexContentMustMatchPattern();
if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ;
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}
// check content pattern must-not-match
Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern();
if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) {
String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern();
if (this.log.isInfo()) this.log.info(info);
// create a new errorURL DB entry
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern(), -1);
this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1);
continue docloop;
}

Loading…
Cancel
Save