diff --git a/htroot/CrawlProfileEditor_p.java b/htroot/CrawlProfileEditor_p.java index 3272863b3..2c40befb5 100644 --- a/htroot/CrawlProfileEditor_p.java +++ b/htroot/CrawlProfileEditor_p.java @@ -64,17 +64,19 @@ public class CrawlProfileEditor_p { private static final List labels = new ArrayList(); static { - labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); - labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing Must-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing Must-Not-Match Filter", false, eentry.STRING)); - labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.NAME, "Name", true, eentry.STRING)); + labels.add(new eentry(CrawlProfile.COLLECTIONS, "Collections (comma-separated list)", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTMATCH, "URL Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_URL_MUSTNOTMATCH, "URL Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTMATCH, "IP Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_IP_MUSTNOTMATCH, "IP Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_COUNTRY_MUSTMATCH, "Country Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CRAWLER_URL_NODEPTHLIMITMATCH, "URL No-Depth-Limit Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTMATCH, "Indexing URL Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.INDEXING_URL_MUSTNOTMATCH, "Indexing URL Must-Not-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTMATCH, "Indexing Content Must-Match Filter", false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.INDEXING_CONTENT_MUSTNOTMATCH, "Indexing Content Must-Not-Match Filter",false, eentry.STRING)); + labels.add(new eentry(CrawlProfile.CACHE_STRAGEGY, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)", false, eentry.STRING)); labels.add(new eentry(CrawlProfile.DEPTH, "Crawl Depth", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.RECRAWL_IF_OLDER, "Recrawl If Older", false, eentry.INTEGER)); labels.add(new eentry(CrawlProfile.DOM_MAX_PAGES, "Domain Max. 
Pages", false, eentry.INTEGER)); diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index 81c106a42..cc82de4c4 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -36,119 +36,159 @@

The old single Attribute / Value / Description table (Starting Point, Crawling Depth, the crawl and indexing filters, deletion and double-check rules, cache, local indexing, remote crawling and collection options) is removed. The new markup groups the form into fieldsets:

Start Point
    "A Crawl Job consists of one or more start points, crawl limitations and document freshness rules."
    One Start URL or a list of URLs (must start with http:// https:// ftp:// smb:// file://)
        info: "Define the start URL(s) here. You can submit more than one URL; please put each URL on its own line. Each of these URLs is the root for a crawl start; existing start URLs are always re-loaded. Other already visited URLs are sorted out as 'double' if they are not allowed by the re-crawl option."
    From Link-List of URL
    From Sitemap
    From File (enter a path within your local file system)

Crawler Filter
    "These are limitations on the crawl stacker. The filters will be applied before a web page is loaded."
    Crawling Depth
        info: "This defines how often the crawler will follow links (of links ..) embedded in websites. 0 means that only the page you enter under 'Starting Point' will be added to the index. 2-4 is good for normal indexing. Values over 8 are not useful, since a depth-8 crawl will index approximately 25,600,000,000 pages, maybe this is the whole WWW."
        checkbox: also all linked non-parsable documents
        Unlimited crawl depth for URLs matching with: (regular expression)
    Maximum Pages per Domain
        info: "You can limit the maximum number of pages that are fetched and indexed from a single domain with this option. You can combine this limitation with the 'Auto-Dom-Filter', so that the limit is applied to all the domains within the given depth. Domains outside the given depth are then sorted out anyway."
    Accept URLs with a query part ('?')
        info: "A question mark is usually a hint for a dynamic page. URLs pointing to dynamic content should usually not be crawled. However, there are sometimes web pages with static content that are accessed with URLs containing question marks. If you are unsure, do not check this to avoid crawl loops."
    Load Filter on URLs
        info: "The filter is a regular expression. Example: to allow only URLs that contain the word 'science', set the must-match filter to '.*science.*'. You can also use an automatic domain-restriction to fully crawl a single domain."
        (see the regex sketch after this section)
        must-match: Restrict to start domain(s) / Restrict to sub-path(s) / Use filter (regular expression)
        must-not-match: (regular expression)
    Load Filter on IPs
        must-match: (regular expression)
        must-not-match: (regular expression)
    Country code restriction
        info: "Crawls can be restricted to specific countries. This uses the country code that can be computed from the IP of the server that hosts the page. The filter is not a regular expression but a list of country codes, separated by comma."
        no country code restriction / Use filter (comma-separated country codes)
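The Load Filter example above ('.*science.*') relies on the filters being applied as full-match regular expressions (matcher(url).matches()), and with this change they are compiled case-insensitively. A minimal sketch using plain java.util.regex with made-up URLs:

    import java.util.regex.Pattern;

    public class UrlFilterDemo {
        public static void main(String[] args) {
            // Example must-match filter from the help text; compiled case-insensitively,
            // as the CrawlProfile patterns now are.
            Pattern mustMatch = Pattern.compile(".*science.*", Pattern.CASE_INSENSITIVE);

            // matches() requires the whole URL to match, hence the leading/trailing ".*"
            System.out.println(mustMatch.matcher("http://example.org/Science/index.html").matches()); // true
            System.out.println(mustMatch.matcher("http://example.org/about.html").matches());         // false

            // without the ".*" anchors the same word never full-matches a complete URL
            System.out.println(Pattern.compile("science").matcher("http://example.org/science/").matches()); // false
        }
    }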

Document Filter
    "These are limitations on the index feeder. The filters will be applied after a web page was loaded."
    Filter on URLs
        info: "The filter is a regular expression that must not match the URL in order for the content of that URL to be indexed."
        must-match: (regular expression)
        must-not-match: (regular expression)
    Filter on Content of Document (all visible text, including camel-case-tokenized url and title)
        must-match: (regular expression)
        must-not-match: (regular expression)
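The new content filter is matched against the document's visible text rather than its URL. A small sketch of the must-match / must-not-match gate with assumed example patterns (the real check is done in Switchboard against the profile's compiled patterns; see the Switchboard.java hunk below):

    import java.util.regex.Pattern;

    public class ContentFilterDemo {
        // Mirrors the must-match / must-not-match semantics of the new content filter:
        // a document is indexed only if its text full-matches the first pattern and
        // does not full-match the second one.
        static boolean acceptForIndexing(String text, Pattern mustMatch, Pattern mustNotMatch) {
            return mustMatch.matcher(text).matches() && !mustNotMatch.matcher(text).matches();
        }

        public static void main(String[] args) {
            Pattern mustMatch    = Pattern.compile(".*crawler.*", Pattern.CASE_INSENSITIVE); // assumed filter
            Pattern mustNotMatch = Pattern.compile(".*draft.*",   Pattern.CASE_INSENSITIVE); // assumed filter

            System.out.println(acceptForIndexing("YaCy crawler documentation page", mustMatch, mustNotMatch)); // true
            System.out.println(acceptForIndexing("Crawler notes, still a DRAFT",    mustMatch, mustNotMatch)); // false
        }
    }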
Document Deletion
    No Deletion
        info: "After a crawl was done in the past, documents may become stale and eventually they are also deleted on the target host. To remove old files from the search index it is not sufficient to just consider them for re-load; it may be necessary to delete them because they simply do not exist any more. Use this in combination with re-crawl, while this time should be longer."
        Do not delete any document before the crawl is started.
    Delete sub-path
        For each host in the start url list, delete all documents (in the given subpath) from that host.
    Delete only old
        Treat documents that are loaded [time] ago as stale and delete them before the crawl is started.

Document Double-Check
    No Doubles
        info: "A web crawl performs a double-check on all links found in the internet against the internal database. If the same URL is found again, then the URL is treated as double when you check the 'no doubles' option. A URL may be loaded again when it has reached a specific age; to use that, check the 're-load' option."
        Never load any page that is already known. Only the start-url may be loaded again.
    Re-load
        Treat documents that are loaded [time] ago as stale and load them again. If they are younger, they are ignored.

Document Cache
    Store to Web Cache
        info: "This option is used by default for proxy prefetch, but is not needed for explicit crawling."
    Caching policy: no cache / if fresh / if exist / cache only
        info: "The caching policy states when to use the cache during crawling: no cache: never use the cache, all content from the fresh internet source; if fresh: use the cache if the cache exists and is fresh using the proxy-fresh rules; if exist: use the cache if it exists and do not check freshness, otherwise use the online source; cache only: never go online, use all content from the cache; if no cache exists, treat the content as unavailable."
        (a sketch of this decision follows after this section)

Index Attributes
    Do Local Indexing
        info: "This enables indexing of the webpages the crawler will download. This should be switched on by default, unless you want to crawl only to fill the Document Cache without indexing."
    Do Remote Indexing
        info: "If checked, the crawler will contact other peers and use them as remote indexers for your crawl. If you need your crawling results locally, you should switch this off. Only senior and principal peers can initiate or receive remote crawls. A YaCyNews message will be created to inform all peers about a global crawl, so they can omit starting a crawl with the same start point."
    Collection tagging
        info: "A crawl result can be tagged with names which are candidates for a collection request. These tags can be selected with the GSA interface using the 'site' operator. To use this option, the 'collection_sxt' field must be switched on in the Solr Schema."
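The four caching policies listed in the help text correspond to the NOCACHE, IFFRESH, IFEXIST and CACHEONLY values shown in CrawlProfileEditor_p.java above. A hedged sketch of the decision they describe, with isFresh standing in for the proxy-fresh rules:

    enum CacheStrategy { NOCACHE, IFFRESH, IFEXIST, CACHEONLY }

    final class CachePolicyDemo {
        // Returns true if the cached copy should be used, false if the content must be
        // fetched from the live source (simplified; CACHEONLY treats a missing cache
        // entry as "content unavailable" rather than going online).
        static boolean useCache(CacheStrategy strategy, boolean cacheExists, boolean isFresh) {
            switch (strategy) {
                case NOCACHE:   return false;
                case IFFRESH:   return cacheExists && isFresh;
                case IFEXIST:   return cacheExists;
                case CACHEONLY: return true;
                default:        return false;
            }
        }
    }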
#%env/templates/footer.template%# diff --git a/htroot/CrawlStartExpert_p.java b/htroot/CrawlStartExpert_p.java index 6a4425cf0..3f9cfd0df 100644 --- a/htroot/CrawlStartExpert_p.java +++ b/htroot/CrawlStartExpert_p.java @@ -49,6 +49,8 @@ public class CrawlStartExpert_p { prop.put("mustnotmatch", CrawlProfile.MATCH_NEVER_STRING); prop.put("indexmustmatch", CrawlProfile.MATCH_ALL_STRING); prop.put("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + prop.put("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING); + prop.put("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); prop.put("ipMustmatch", sb.getConfig("crawlingIPMustMatch", CrawlProfile.MATCH_ALL_STRING)); prop.put("ipMustnotmatch", sb.getConfig("crawlingIPMustNotMatch", CrawlProfile.MATCH_NEVER_STRING)); prop.put("countryMustMatch", sb.getConfig("crawlingCountryMustMatch", "")); diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 291346528..0d8679610 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -212,6 +212,8 @@ public class Crawler_p { String crawlerNoDepthLimitMatch = post.get("crawlingDepthExtension", CrawlProfile.MATCH_NEVER_STRING); final String indexUrlMustMatch = post.get("indexmustmatch", CrawlProfile.MATCH_ALL_STRING); final String indexUrlMustNotMatch = post.get("indexmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); + final String indexContentMustMatch = post.get("indexcontentmustmatch", CrawlProfile.MATCH_ALL_STRING); + final String indexContentMustNotMatch = post.get("indexcontentmustnotmatch", CrawlProfile.MATCH_NEVER_STRING); final boolean crawlOrder = post.get("crawlOrder", "off").equals("on"); env.setConfig("crawlOrder", crawlOrder); @@ -352,6 +354,8 @@ public class Crawler_p { crawlerNoDepthLimitMatch, indexUrlMustMatch, indexUrlMustNotMatch, + indexContentMustMatch, + indexContentMustNotMatch, newcrawlingdepth, directDocByURL, crawlingIfOlder, diff --git a/htroot/QuickCrawlLink_p.java b/htroot/QuickCrawlLink_p.java index 18fa543dc..11e81c9df 100644 --- a/htroot/QuickCrawlLink_p.java +++ b/htroot/QuickCrawlLink_p.java @@ -135,10 +135,12 @@ public class QuickCrawlLink_p { crawlingMustNotMatch, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch CrawlingDepth, true, 60 * 24 * 30, // recrawlIfOlder (minutes); here: one month diff --git a/htroot/env/base.css b/htroot/env/base.css index 308ea4d9a..c9f6d0be6 100644 --- a/htroot/env/base.css +++ b/htroot/env/base.css @@ -97,7 +97,7 @@ td { fieldset { margin:10px 5px; - padding:10px; + padding:2px 10px 2px 10px; } legend { @@ -1009,7 +1009,7 @@ div#info:hover span { padding: 3px; color: #000000; background: #DDDDDD; - text-align: center; + text-align: left; border: 1px dashed black; z-index: 100; } \ No newline at end of file diff --git a/source/net/yacy/crawler/CrawlSwitchboard.java b/source/net/yacy/crawler/CrawlSwitchboard.java index 04834dbb7..aea158e1c 100644 --- a/source/net/yacy/crawler/CrawlSwitchboard.java +++ b/source/net/yacy/crawler/CrawlSwitchboard.java @@ -239,10 +239,12 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, 
//crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0 /*Integer.parseInt(getConfig(PROXY_PREFETCH_DEPTH, "0"))*/, true, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), @@ -265,10 +267,12 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, -1, @@ -291,10 +295,12 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_TEXT_RECRAWL_CYCLE), @@ -317,10 +323,12 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT_RECRAWL_CYCLE), @@ -344,10 +352,12 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA_RECRAWL_CYCLE), @@ -370,10 +380,12 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, 
//crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA_RECRAWL_CYCLE), @@ -396,10 +408,12 @@ public final class CrawlSwitchboard { CrawlProfile.MATCH_NEVER_STRING, //crawlerUrlMustNotMatch CrawlProfile.MATCH_ALL_STRING, //crawlerIpMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerIpMustNotMatch - "", //crawlerCountryMustMatch + CrawlProfile.MATCH_NEVER_STRING, //crawlerCountryMustMatch CrawlProfile.MATCH_NEVER_STRING, //crawlerNoDepthLimitMatch CrawlProfile.MATCH_ALL_STRING, //indexUrlMustMatch CrawlProfile.MATCH_NEVER_STRING, //indexUrlMustNotMatch + CrawlProfile.MATCH_ALL_STRING, //indexContentMustMatch + CrawlProfile.MATCH_NEVER_STRING, //indexContentMustNotMatch 0, false, CrawlProfile.getRecrawlDate(CRAWL_PROFILE_SURROGATE_RECRAWL_CYCLE), diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index 70efcf115..82da4cbfd 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -76,11 +76,14 @@ public class CrawlProfile extends ConcurrentHashMap implements M public static final String CRAWLER_URL_NODEPTHLIMITMATCH = "crawlerNoLimitURLMustMatch"; public static final String INDEXING_URL_MUSTMATCH = "indexURLMustMatch"; public static final String INDEXING_URL_MUSTNOTMATCH = "indexURLMustNotMatch"; + public static final String INDEXING_CONTENT_MUSTMATCH = "indexContentMustMatch"; + public static final String INDEXING_CONTENT_MUSTNOTMATCH = "indexContentMustNotMatch"; private Pattern crawlerurlmustmatch = null, crawlerurlmustnotmatch = null; private Pattern crawleripmustmatch = null, crawleripmustnotmatch = null; private Pattern crawlernodepthlimitmatch = null; private Pattern indexurlmustmatch = null, indexurlmustnotmatch = null; + private Pattern indexcontentmustmatch = null, indexcontentmustnotmatch = null; private final Map doms; @@ -96,6 +99,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M * @param crawlerNoDepthLimitMatch if matches, no depth limit is applied to the crawler * @param indexUrlMustMatch URLs which do not match this regex will be ignored for indexing * @param indexUrlMustNotMatch URLs which match this regex will be ignored for indexing + * @param indexContentMustMatch content which do not match this regex will be ignored for indexing + * @param indexContentMustNotMatch content which match this regex will be ignored for indexing * @param depth height of the tree which will be created by the crawler * @param directDocByURL if true, then linked documents that cannot be parsed are indexed as document * @param recrawlIfOlder documents which have been indexed in the past will @@ -118,6 +123,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M final String crawlerIpMustMatch, final String crawlerIpMustNotMatch, final String crawlerCountryMustMatch, final String crawlerNoDepthLimitMatch, final String indexUrlMustMatch, final String indexUrlMustNotMatch, + final String indexContentMustMatch, final String indexContentMustNotMatch, final int depth, final boolean directDocByURL, final long recrawlIfOlder /*date*/, @@ -146,6 
+152,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M put(CRAWLER_URL_NODEPTHLIMITMATCH, (crawlerNoDepthLimitMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : crawlerNoDepthLimitMatch); put(INDEXING_URL_MUSTMATCH, (indexUrlMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustMatch); put(INDEXING_URL_MUSTNOTMATCH, (indexUrlMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexUrlMustNotMatch); + put(INDEXING_CONTENT_MUSTMATCH, (indexContentMustMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustMatch); + put(INDEXING_CONTENT_MUSTNOTMATCH, (indexContentMustNotMatch == null) ? CrawlProfile.MATCH_NEVER_STRING : indexContentMustNotMatch); put(DEPTH, depth); put(DIRECT_DOC_BY_URL, directDocByURL); put(RECRAWL_IF_OLDER, recrawlIfOlder); @@ -277,7 +285,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (this.crawlerurlmustmatch == null) { final String r = get(CRAWLER_URL_MUSTMATCH); try { - this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r); + this.crawlerurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { this.crawlerurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.crawlerurlmustmatch; @@ -291,7 +299,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (this.crawlerurlmustnotmatch == null) { final String r = get(CRAWLER_URL_MUSTNOTMATCH); try { - this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r); + this.crawlerurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { this.crawlerurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.crawlerurlmustnotmatch; @@ -305,7 +313,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (this.crawleripmustmatch == null) { final String r = get(CRAWLER_IP_MUSTMATCH); try { - this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r); + this.crawleripmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { this.crawleripmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.crawleripmustmatch; @@ -319,7 +327,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (this.crawleripmustnotmatch == null) { final String r = get(CRAWLER_IP_MUSTNOTMATCH); try { - this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r); + this.crawleripmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? 
CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { this.crawleripmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.crawleripmustnotmatch; @@ -346,7 +354,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (this.crawlernodepthlimitmatch == null) { final String r = get(CRAWLER_URL_NODEPTHLIMITMATCH); try { - this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r); + this.crawlernodepthlimitmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { this.crawlernodepthlimitmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.crawlernodepthlimitmatch; @@ -360,7 +368,7 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (this.indexurlmustmatch == null) { final String r = get(INDEXING_URL_MUSTMATCH); try { - this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r); + this.indexurlmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { this.indexurlmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.indexurlmustmatch; @@ -374,12 +382,40 @@ public class CrawlProfile extends ConcurrentHashMap implements M if (this.indexurlmustnotmatch == null) { final String r = get(INDEXING_URL_MUSTNOTMATCH); try { - this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r); + this.indexurlmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); } catch (PatternSyntaxException e) { this.indexurlmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; } } return this.indexurlmustnotmatch; } + /** + * Gets the regex which must be matched by URLs in order to be indexed. + * @return regex which must be matched + */ + public Pattern indexContentMustMatchPattern() { + if (this.indexcontentmustmatch == null) { + final String r = get(INDEXING_CONTENT_MUSTMATCH); + try { + this.indexcontentmustmatch = (r == null || r.equals(CrawlProfile.MATCH_ALL_STRING)) ? CrawlProfile.MATCH_ALL_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); + } catch (PatternSyntaxException e) { this.indexcontentmustmatch = CrawlProfile.MATCH_NEVER_PATTERN; } + } + return this.indexcontentmustmatch; + } + + /** + * Gets the regex which must not be matched by URLs in order to be indexed. + * @return regex which must not be matched + */ + public Pattern indexContentMustNotMatchPattern() { + if (this.indexcontentmustnotmatch == null) { + final String r = get(INDEXING_CONTENT_MUSTNOTMATCH); + try { + this.indexcontentmustnotmatch = (r == null || r.equals(CrawlProfile.MATCH_NEVER_STRING)) ? CrawlProfile.MATCH_NEVER_PATTERN : Pattern.compile(r, Pattern.CASE_INSENSITIVE); + } catch (PatternSyntaxException e) { this.indexcontentmustnotmatch = CrawlProfile.MATCH_NEVER_PATTERN; } + } + return this.indexcontentmustnotmatch; + } + /** * Gets depth of crawl job (or height of the tree which will be * created by the crawler). 
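The two new getters, indexContentMustMatchPattern() and indexContentMustNotMatchPattern(), follow the same scheme as the existing URL getters: compile the stored filter string lazily on first use, short-circuit the match-all/match-never defaults, compile everything else with Pattern.CASE_INSENSITIVE, and fall back to the never-matching pattern if the regex is malformed. A stand-alone sketch of that behaviour (the constant values below are placeholders, not YaCy's actual MATCH_* strings):

    import java.util.regex.Pattern;
    import java.util.regex.PatternSyntaxException;

    final class LazyContentFilter {
        static final String  MATCH_ALL_STRING    = ".*";                  // placeholder for CrawlProfile.MATCH_ALL_STRING
        static final Pattern MATCH_ALL_PATTERN   = Pattern.compile(MATCH_ALL_STRING);
        static final Pattern MATCH_NEVER_PATTERN = Pattern.compile("(?!x)x"); // placeholder never-matching pattern

        private final String stored;      // the filter string as stored in the profile map
        private Pattern compiled = null;  // compiled once on first use

        LazyContentFilter(String stored) { this.stored = stored; }

        Pattern pattern() {
            if (this.compiled == null) {
                try {
                    // the match-all default is short-circuited; everything else is compiled case-insensitively
                    this.compiled = (this.stored == null || this.stored.equals(MATCH_ALL_STRING))
                            ? MATCH_ALL_PATTERN
                            : Pattern.compile(this.stored, Pattern.CASE_INSENSITIVE);
                } catch (PatternSyntaxException e) {
                    // a broken filter never matches, so nothing is indexed by accident
                    this.compiled = MATCH_NEVER_PATTERN;
                }
            }
            return this.compiled;
        }
    }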
diff --git a/source/net/yacy/data/ymark/YMarkCrawlStart.java b/source/net/yacy/data/ymark/YMarkCrawlStart.java index 8b925f726..f81c5ac66 100644 --- a/source/net/yacy/data/ymark/YMarkCrawlStart.java +++ b/source/net/yacy/data/ymark/YMarkCrawlStart.java @@ -172,10 +172,12 @@ public class YMarkCrawlStart extends HashMap{ urlMustNotMatch, CrawlProfile.MATCH_ALL_STRING, CrawlProfile.MATCH_NEVER_STRING, - "", - CrawlProfile.MATCH_NEVER_STRING, - CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, CrawlProfile.MATCH_NEVER_STRING, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, + CrawlProfile.MATCH_ALL_STRING, + CrawlProfile.MATCH_NEVER_STRING, depth, medialink, CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index 70fea62b6..030b97296 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -343,20 +343,19 @@ dc_rights public String getTextString() { try { - if (this.text == null) return ""; - if (this.text instanceof String) { - return (String) this.text; + if (this.text == null) { + this.text = ""; } else if (this.text instanceof InputStream) { - return UTF8.String(FileUtils.read((InputStream) this.text)); + this.text = UTF8.String(FileUtils.read((InputStream) this.text)); } else if (this.text instanceof File) { - return UTF8.String(FileUtils.read((File) this.text)); + this.text = UTF8.String(FileUtils.read((File) this.text)); } else if (this.text instanceof byte[]) { - return UTF8.String((byte[]) this.text); + this.text = UTF8.String((byte[]) this.text); } else if (this.text instanceof ByteArrayOutputStream) { - return UTF8.String(((ByteArrayOutputStream) this.text).toByteArray()); + this.text = UTF8.String(((ByteArrayOutputStream) this.text).toByteArray()); } - assert false : this.text.getClass().toString(); - return null; + assert this.text instanceof String : this.text.getClass().toString(); + return (String) this.text; } catch (final Exception e) { Log.logException(e); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 21c0703c6..f425325ea 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2555,17 +2555,24 @@ public final class Switchboard extends serverSwitch { if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing of this media type not wanted by crawl profile"); return new IndexingQueueEntry(in.queueEntry, in.documents, null); } - if (!profile.indexUrlMustMatchPattern().matcher(urls).matches() || - profile.indexUrlMustNotMatchPattern().matcher(urls).matches() ) { + if (!(profile.indexUrlMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexUrlMustMatchPattern().matcher(urls).matches()) || + (profile.indexUrlMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexUrlMustNotMatchPattern().matcher(urls).matches())) { if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); + addURLtoErrorDB( + in.queueEntry.url(), + in.queueEntry.referrerHash(), + in.queueEntry.initiator(), + in.queueEntry.name(), + FailCategory.FINAL_PROCESS_CONTEXT, + "indexing prevented by regular expression on url; 
indexUrlMustMatchPattern = " + profile.indexUrlMustMatchPattern().pattern() + ", indexUrlMustNotMatchPattern = " + profile.indexUrlMustNotMatchPattern().pattern()); return new IndexingQueueEntry(in.queueEntry, in.documents, null); } // check which files may take part in the indexing process final List doclist = new ArrayList(); - for ( final Document document : in.documents ) { - if ( document.indexingDenied() ) { - if ( this.log.isInfo() ) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule"); + docloop: for (final Document document : in.documents) { + if (document.indexingDenied()) { + if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': denied by document-attached noindexing rule"); addURLtoErrorDB( in.queueEntry.url(), in.queueEntry.referrerHash(), @@ -2573,7 +2580,19 @@ public final class Switchboard extends serverSwitch { in.queueEntry.name(), FailCategory.FINAL_PROCESS_CONTEXT, "denied by document-attached noindexing rule"); - continue; + continue docloop; + } + if (!(profile.indexContentMustMatchPattern() == CrawlProfile.MATCH_ALL_PATTERN || profile.indexContentMustMatchPattern().matcher(document.getTextString()).matches()) || + (profile.indexContentMustNotMatchPattern() != CrawlProfile.MATCH_NEVER_PATTERN && profile.indexContentMustNotMatchPattern().matcher(document.getTextString()).matches())) { + if (this.log.isInfo()) this.log.logInfo("Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); + addURLtoErrorDB( + in.queueEntry.url(), + in.queueEntry.referrerHash(), + in.queueEntry.initiator(), + in.queueEntry.name(), + FailCategory.FINAL_PROCESS_CONTEXT, + "indexing prevented by regular expression on content; indexContentMustMatchPattern = " + profile.indexContentMustMatchPattern().pattern() + ", indexContentMustNotMatchPattern = " + profile.indexContentMustNotMatchPattern().pattern()); + continue docloop; } doclist.add(document); }
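The new content check in Switchboard may call document.getTextString() several times for the same document (twice in the condition, once for the log message, once for the error-DB entry), which is what the Document.getTextString() change addresses: the first call converts the internal representation into a String and keeps it, so later calls are cheap. A simplified sketch of that convert-once idea (class and field names here are illustrative, and only two of the original cases are shown):

    import java.io.ByteArrayInputStream;
    import java.io.IOException;
    import java.io.InputStream;
    import java.nio.charset.StandardCharsets;

    final class CachedText {
        private Object text; // may start out as an InputStream, byte[] or String

        CachedText(Object text) { this.text = text; }

        String getTextString() throws IOException {
            if (this.text == null) {
                this.text = "";
            } else if (this.text instanceof InputStream) {
                this.text = new String(((InputStream) this.text).readAllBytes(), StandardCharsets.UTF_8);
            } else if (this.text instanceof byte[]) {
                this.text = new String((byte[]) this.text, StandardCharsets.UTF_8);
            }
            return (String) this.text; // subsequent calls return the cached String directly
        }

        public static void main(String[] args) throws IOException {
            CachedText t = new CachedText(new ByteArrayInputStream("visible text".getBytes(StandardCharsets.UTF_8)));
            System.out.println(t.getTextString()); // converts once
            System.out.println(t.getTextString()); // served from the cached String
        }
    }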