diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java
index 3272863b3..2c40befb5 100644
--- a/htroot/CrawlProfileEditor_p.java
+++ b/htroot/CrawlProfileEditor_p.java
@@ -64,17 +64,19 @@ public class CrawlProfileEditor_p {
private static final List labels = new ArrayList();
static {
- labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
- labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing Must-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing Must-Not-Match Filter", false, eentry.STRING));
- labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing URL Must-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing URL Must-Not-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTMATCH, "Indexing Content Must-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTNOTMATCH, "Indexing Content Must-Not-Match Filter", false, eentry.STRING));
+ labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING));
labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER));
labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. Pages", false, eentry.INTEGER));
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 81c106a42..cc82de4c4 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -36,119 +36,159 @@
#%env/templates/footer.template%#
diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java
index 6a4425cf0..3f9cfd0df 100644
--- a/htroot/CrawlStartExpert_p.java
+++ b/htroot/CrawlStartExpert_p.java
@@ -49,6 +49,8 @@ public class CrawlStartExpert_p {
prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+ prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
+ prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING));
prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING));
prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", ""));
diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java
index 291346528..0d8679610 100644
--- a/htroot/Crawler_p.java
+++ b/htroot/Crawler_p.java
@@ -212,6 +212,8 @@ public class Crawler_p {
String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING);
final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING);
final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
+ final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING);
+ final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING);
final boolean crawlOrder = post.get("crawlOrder", "off").equals("on");
env.setConfig("crawlOrder", crawlOrder);
@@ -352,6 +354,8 @@ public class Crawler_p {
crawlerNoDepthLimitMatch,
indexUrlMustMatch,
indexUrlMustNotMatch,
+ indexContentMustMatch,
+ indexContentMustNotMatch,
newcrawlingdepth,
directDocByURL,
crawlingIfOlder,
diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java
index 18fa543dc..11e81c9df 100644
--- a/htroot/QuickCrawlLink_p.java
+++ b/htroot/QuickCrawlLink_p.java
@@ -135,10 +135,12 @@ public class QuickCrawlLink_p {
crawlingMustNotMatch, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
CrawlingDepth,
true,
60 * 24 * 30, // recrawlIfOlder (minutes); here: one month
diff --git a/htroot/env/base.css b/htroot/env/base.css
index 308ea4d9a..c9f6d0be6 100644
--- a/htroot/env/base.css
+++ b/htroot/env/base.css
@@ -97,7 +97,7 @@ td {
fieldset {
margin:10px 5px;
- padding:10px;
+ padding:2px 10px 2px 10px;
}
legend {
@@ -1009,7 +1009,7 @@ div#info:hover span {
padding: 3px;
color: #000000;
background: #DDDDDD;
- text-align: center;
+ text-align: left;
border: 1px dashed black;
z-index: 100;
}
\ No newline at end of file
diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java
index 04834dbb7..aea158e1c 100644
--- a/source/net/yacy/crawler/CrawlSwitchboard.java
+++ b/source/net/yacy/crawler/CrawlSwitchboard.java
@@ -239,10 +239,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/,
true,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
@@ -265,10 +267,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
-1,
@@ -291,10 +295,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE),
@@ -317,10 +323,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE),
@@ -344,10 +352,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE),
@@ -370,10 +380,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE),
@@ -396,10 +408,12 @@ public final class CrawlSwitchboard {
CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch
CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch
- "", //crawlerCountryMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch
CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch
CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch
CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch
+ CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch
+ CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch
0,
false,
CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE),
diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java
index 70efcf115..82da4cbfd 100644
--- a/source/net/yacy/crawler/data/CrawlProfile.java
+++ b/source/net/yacy/crawler/data/CrawlProfile.java
@@ -76,11 +76,14 @@ public class CrawlProfile extends ConcurrentHashMap implements M
public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch";
public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch";
public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch";
+ public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch";
+ public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch";
private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null;
private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null;
private Pattern crawlernodepthlimitmatch = null;
private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null;
+ private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null;
private final Map doms;
@@ -96,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M
* @param crawlerNoDepthLimitMatch if matches, no depth limit is applied to the crawler
* @param indexUrlMustMatch URLs which do not match this regex will be ignored for indexing
* @param indexUrlMustNotMatch URLs which match this regex will be ignored for indexing
+ * @param indexContentMustMatch documents whose content does not match this regex will be ignored for indexing
+ * @param indexContentMustNotMatch documents whose content matches this regex will be ignored for indexing
* @param depth height of the tree which will be created by the crawler
* @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document
* @param recrawlIfOlder documents which have been indexed in the past will
@@ -118,6 +123,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
final String crawlerIpMustMatch, final String crawlerIpMustNotMatch,
final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch,
final String indexUrlMustMatch, final String indexUrlMustNotMatch,
+ final String indexContentMustMatch, final String indexContentMustNotMatch,
final int depth,
final boolean directDocByURL,
final long recrawlIfOlder /*date*/,
@@ -146,6 +152,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M
put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch);
put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch);
put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch);
+ put(INDEXING_CONTENT_MUSTMATCH, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch);
+ put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch);
put(DEPTH, depth);
put(DIRECT_DOC_BY_URL, directDocByURL);
put(RECRAWL_IF_OLDER, recrawlIfOlder);
@@ -277,7 +285,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
if (this.crawlerurlmustmatch == null) {
final String r = get(CRAWLER_URL_MUSTMATCH);
try {
- this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
+ this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawlerurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawlerurlmustmatch;
@@ -291,7 +299,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
if (this.crawlerurlmustnotmatch == null) {
final String r = get(CRAWLER_URL_MUSTNOTMATCH);
try {
- this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
+ this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawlerurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawlerurlmustnotmatch;
@@ -305,7 +313,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
if (this.crawleripmustmatch == null) {
final String r = get(CRAWLER_IP_MUSTMATCH);
try {
- this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
+ this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawleripmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawleripmustmatch;
@@ -319,7 +327,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
if (this.crawleripmustnotmatch == null) {
final String r = get(CRAWLER_IP_MUSTNOTMATCH);
try {
- this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
+ this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawleripmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawleripmustnotmatch;
@@ -346,7 +354,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
if (this.crawlernodepthlimitmatch == null) {
final String r = get(CRAWLER_URL_NODEPTHLIMITMATCH);
try {
- this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
+ this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.crawlernodepthlimitmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.crawlernodepthlimitmatch;
@@ -360,7 +368,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M
if (this.indexurlmustmatch == null) {
final String r = get(INDEXING_URL_MUSTMATCH);
try {
- this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r);
+ this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.indexurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.indexurlmustmatch;
@@ -374,12 +382,40 @@ public class CrawlProfile extends ConcurrentHashMap implements M
if (this.indexurlmustnotmatch == null) {
final String r = get(INDEXING_URL_MUSTNOTMATCH);
try {
- this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r);
+ this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
} catch (PatternSyntaxException e) { this.indexurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
}
return this.indexurlmustnotmatch;
}
+ /**
+ * Gets the regex which must be matched by a document's content in order for the document to be indexed.
+ * @return regex which must be matched
+ */
+ public Pattern indexContentMustMatchPattern() {
+ if (this.indexcontentmustmatch == null) {
+ final String r = get(INDEXING_CONTENT_MUSTMATCH);
+ try {
+ this.indexcontentmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
+ } catch (PatternSyntaxException e) { this.indexcontentmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
+ }
+ return this.indexcontentmustmatch;
+ }
+
+ /**
+ * Gets the regex which must not be matched by a document's content in order for the document to be indexed.
+ * @return regex which must not be matched
+ */
+ public Pattern indexContentMustNotMatchPattern() {
+ if (this.indexcontentmustnotmatch == null) {
+ final String r = get(INDEXING_CONTENT_MUSTNOTMATCH);
+ try {
+ this.indexcontentmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE);
+ } catch (PatternSyntaxException e) { this.indexcontentmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; }
+ }
+ return this.indexcontentmustnotmatch;
+ }
+
/**
* Gets depth of crawl job (or height of the tree which will be
* created by the crawler).
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java
index 8b925f726..f81c5ac66 100644
--- a/source/net/yacy/data/ymark/YMarkCrawlStart.java
+++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java
@@ -172,10 +172,12 @@ public class YMarkCrawlStart extends HashMap{
urlMustNotMatch,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
- "",
- CrawlProfile.MATCH_NEVER_STRING,
- CrawlProfile.MATCH_ALL_STRING,
+ CrawlProfile.MATCH_NEVER_STRING,
CrawlProfile.MATCH_NEVER_STRING,
+ CrawlProfile.MATCH_ALL_STRING,
+ CrawlProfile.MATCH_NEVER_STRING,
+ CrawlProfile.MATCH_ALL_STRING,
+ CrawlProfile.MATCH_NEVER_STRING,
depth,
medialink,
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE),
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index 70fea62b6..030b97296 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -343,20 +343,19 @@ dc_rights
public String getTextString() {
try {
- if (this.text == null) return "";
- if (this.text instanceof String) {
- return (String) this.text;
+ if (this.text == null) {
+ this.text = "";
} else if (this.text instanceof InputStream) {
- return UTF8.String(FileUtils.read((InputStream) this.text));
+ this.text = UTF8.String(FileUtils.read((InputStream) this.text));
} else if (this.text instanceof File) {
- return UTF8.String(FileUtils.read((File) this.text));
+ this.text = UTF8.String(FileUtils.read((File) this.text));
} else if (this.text instanceof byte[]) {
- return UTF8.String((byte[]) this.text);
+ this.text = UTF8.String((byte[]) this.text);
} else if (this.text instanceof ByteArrayOutputStream) {
- return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
+ this.text = UTF8.String(((ByteArrayOutputStream) this.text).toByteArray());
}
- assert false : this.text.getClass().toString();
- return null;
+ assert this.text instanceof String : this.text.getClass().toString();
+ return (String) this.text;
} catch (final Exception e) {
Log.logException(e);
}
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index 21c0703c6..f425325ea 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2555,17 +2555,24 @@ public final class Switchboard extends serverSwitch {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile");
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
- if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() ||
- profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) {
+ if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) ||
+ (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) {
if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
+ addURLtoErrorDB(
+ in.queueEntry.url(),
+ in.queueEntry.referrerHash(),
+ in.queueEntry.initiator(),
+ in.queueEntry.name(),
+ FailCategory.FINAL_PROCESS_CONTEXT,
+ "indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern());
return new IndexingQueueEntry(in.queueEntry, in.documents, null);
}
// check which files may take part in the indexing process
final List doclist = new ArrayList();
- for ( final Document document : in.documents ) {
- if ( document.indexingDenied() ) {
- if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
+ docloop: for (final Document document : in.documents) {
+ if (document.indexingDenied()) {
+ if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule");
addURLtoErrorDB(
in.queueEntry.url(),
in.queueEntry.referrerHash(),
@@ -2573,7 +2580,19 @@ public final class Switchboard extends serverSwitch {
in.queueEntry.name(),
FailCategory.FINAL_PROCESS_CONTEXT,
"denied by document-attached noindexing rule");
- continue;
+ continue docloop;
+ }
+ if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) ||
+ (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) {
+ if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
+ addURLtoErrorDB(
+ in.queueEntry.url(),
+ in.queueEntry.referrerHash(),
+ in.queueEntry.initiator(),
+ in.queueEntry.name(),
+ FailCategory.FINAL_PROCESS_CONTEXT,
+ "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern());
+ continue docloop;
}
doclist.add(document);
}