From cced94298ab946125bc29e58583431ac4dd6a426 Mon Sep 17 00:00:00 2001
From: luccioman
Date: Tue, 19 Jun 2018 10:12:20 +0200
Subject: [PATCH] Added a new crawler document filter type using Solr syntax

This makes it possible to set up much more advanced document crawl
filters, by filtering on one or more indexed document fields before
insertion into the index.
---
 htroot/CrawlProfileEditor_p.xml               |   2 +
 htroot/CrawlStartExpert.html                  |  37 +++-
 htroot/CrawlStartExpert.java                  |  30 ++-
 htroot/Crawler_p.html                         |  16 +-
 htroot/Crawler_p.java                         | 166 ++++++++++------
 .../net/yacy/crawler/data/CrawlProfile.java   |  20 ++
 source/net/yacy/search/Switchboard.java       |  93 ++++++++-
 source/net/yacy/search/index/Segment.java     |  36 +++-
 .../search/index/SingleDocumentMatcher.java   | 119 ++++++++++++
 .../index/SingleDocumentMatcherTest.java      | 177 ++++++++++++++++++
 10 files changed, 629 insertions(+), 67 deletions(-)
 create mode 100644 source/net/yacy/search/index/SingleDocumentMatcher.java
 create mode 100644 test/java/net/yacy/search/index/SingleDocumentMatcherTest.java

diff --git a/htroot/CrawlProfileEditor_p.xml b/htroot/CrawlProfileEditor_p.xml
index 8f1be2591..01271eb45 100644
--- a/htroot/CrawlProfileEditor_p.xml
+++ b/htroot/CrawlProfileEditor_p.xml
@@ -32,6 +32,8 @@
 #[indexContentMustNotMatch]#
 #[indexMediaTypeMustMatch]#
 #[indexMediaTypeMustNotMatch]#
+ #[indexSolrQueryMustMatch]#
+ #[indexSolrQueryMustNotMatch]#
 #(status)#terminated::active::system#(/status)#
 #{crawlingDomFilterContent}#

diff --git a/htroot/CrawlStartExpert.html b/htroot/CrawlStartExpert.html
index 56ab1bba7..66be62a9f 100644
--- a/htroot/CrawlStartExpert.html
+++ b/htroot/CrawlStartExpert.html
@@ -171,6 +171,8 @@
 disableIf('indexmustnotmatch', defaultMatchNone);
 disableIf('indexcontentmustnotmatch', defaultMatchNone);
 disableIf('indexMediaTypeMustNotMatch', defaultMatchNone);
+ disableIf('indexSolrQueryMustMatch', "#[solrQueryMatchAllStr]#");
+ disableIf('indexSolrQueryMustNotMatch', "#[solrEmptyQueryStr]#");

 // remove if MATCH_ALL_STRING
 disableIf('mustmatch', defaultMatchAll);
@@ -369,7 +371,7 @@
Filter on Document Media Type (aka MIME type)
- Clean up search events cache info
+ Media Type filter info
The filter is a regular expression that must match the document Media Type (also known as MIME Type) to allow the URL to be indexed.
@@ -388,6 +390,39 @@
+
Solr query filter on any active indexed field(s)
+
+
+ Solr query filter info

Each parsed document is checked against the given Solr query before being added to the index.
The query must conform to the standard Solr query syntax.
+ + #(embeddedSolrConnected)# + + + + :: + + + + + + + + + #(/embeddedSolrConnected)# +
+

The embedded local Solr index must be connected to use this kind of filter.

+

You can configure this with the Index Sources & targets page.

+
must-match + +
must-not-match + +
+
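To illustrate the two fields above, here are sample filter values. The field names (host_s, last_modified, size_i) belong to YaCy's default collection schema and are also exercised by the unit test at the end of this patch; the concrete values are hypothetical examples only:

    must-match:     host_s:example.com AND last_modified:[NOW-1YEAR TO *]
    must-not-match: size_i:[* TO 100]

With these values, only documents served from example.com and modified within the last year are indexed, and among those, documents with a size_i value up to 100 are still rejected.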
diff --git a/htroot/CrawlStartExpert.java b/htroot/CrawlStartExpert.java index fc5e47af0..f11f403e1 100644 --- a/htroot/CrawlStartExpert.java +++ b/htroot/CrawlStartExpert.java @@ -28,6 +28,9 @@ import java.util.ArrayList; import java.util.Collection; import java.util.List; +import org.apache.solr.core.SolrCore; + +import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -49,9 +52,11 @@ public class CrawlStartExpert { final serverObjects prop = new serverObjects(); final String defaultCollection = "user"; - // javascript values + // javascript constants prop.put("matchAllStr", CrawlProfile.MATCH_ALL_STRING); prop.put("matchNoneStr", CrawlProfile.MATCH_NEVER_STRING); + prop.put("solrQueryMatchAllStr", CrawlProfile.SOLR_MATCH_ALL_QUERY); + prop.put("solrEmptyQueryStr", CrawlProfile.SOLR_EMPTY_QUERY); prop.put("defaultCollection", defaultCollection); // ---------- Start point @@ -317,6 +322,29 @@ public class CrawlStartExpert { } else { prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING); } + + // Filter with a Solr syntax query + /* Check that the embedded local Solr index is connected, as its schema is required to apply the eventual Solr filter query */ + final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance(); + final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null; + final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; + prop.put("embeddedSolrConnected", embeddedSolrConnected); + + if(embeddedSolrConnected) { + if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)) { + prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, + post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim()); + } else { + prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY); + } + + if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key)) { + prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, + post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim()); + } else { + prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY); + } + } // ---------- Clean-Up before Crawl Start diff --git a/htroot/Crawler_p.html b/htroot/Crawler_p.html index 52a126107..c3a7fb6b7 100644 --- a/htroot/Crawler_p.html +++ b/htroot/Crawler_p.html @@ -216,9 +216,23 @@ window.setInterval("setTableSize()", 1000); Crawling of "#[crawlingURL]#" started. Please wait some seconds, it may take some seconds until the first result appears there. - If you crawl any un-wanted pages, you can delete them here.
+ If you crawl any unwanted pages, you can delete them here.
::

No embedded local Solr index is connected. This is required to use a Solr query filter.
You can configure this with the Index Sources & targets page.::

The Solr filter query syntax is not valid: #[solrQuery]#::

Could not parse the Solr filter query: #[solrQuery]# #(/info)#

+ + #(info-queue)#::
#[message]#
#(/info-queue)# diff --git a/htroot/Crawler_p.java b/htroot/Crawler_p.java index 3a2c4f377..707d67bcd 100644 --- a/htroot/Crawler_p.java +++ b/htroot/Crawler_p.java @@ -35,12 +35,17 @@ import java.util.Set; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; +import org.apache.solr.common.SolrException; +import org.apache.solr.core.SolrCore; +import org.apache.solr.search.SyntaxError; + import net.yacy.cora.date.AbstractFormatter; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailCategory; +import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; @@ -70,6 +75,7 @@ import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; +import net.yacy.search.index.SingleDocumentMatcher; import net.yacy.search.query.SearchEventCache; import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; @@ -464,7 +470,12 @@ public class Crawler_p { boolean hasCrawlstartDataOK = !crawlName.isEmpty(); if (hasCrawlstartDataOK) { // check crawlurl was given in sitecrawl - if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false; + if ("url".equals(crawlingMode) && rootURLs.size() == 0) { + prop.put("info", "5"); //Crawling failed + prop.putHTML("info_crawlingURL", "(no url given)"); + prop.putHTML("info_reasonString", "you must submit at least one crawl url"); + hasCrawlstartDataOK = false; + } } String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1"); @@ -533,6 +544,52 @@ public class Crawler_p { sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); } + /* If a solr query filter is defined, verify now its syntax and that the embedded Solr schema is available */ + final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim(); + final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim(); + if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { + + final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance(); + final SolrCore embeddedCore = embeddedSolr != null ? 
embeddedSolr.getDefaultCore() : null; + final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; + prop.put("noEmbeddedSolr", !embeddedSolrConnected); + if (embeddedSolrConnected) { + if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) { + try { + SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore); + } catch(final SyntaxError | SolrException e) { + hasCrawlstartDataOK = false; + prop.put("info", "10"); + prop.put("info_solrQuery", solrQueryMustMatch); + } catch(final RuntimeException e) { + hasCrawlstartDataOK = false; + prop.put("info", "11"); + prop.put("info_solrQuery", solrQueryMustMatch); + } + } + + if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) { + try { + SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore); + } catch(final SyntaxError | SolrException e) { + hasCrawlstartDataOK = false; + prop.put("info", "10"); + prop.put("info_solrQuery", solrQueryMustNotMatch); + } catch(final RuntimeException e) { + hasCrawlstartDataOK = false; + prop.put("info", "11"); + prop.put("info_solrQuery", solrQueryMustNotMatch); + } + } + } else { + hasCrawlstartDataOK = false; + prop.put("info", "9"); + } + + + + } + // prepare a new crawling profile final CrawlProfile profile; byte[] handle; @@ -574,6 +631,9 @@ public class Crawler_p { post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING)); profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post .get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING)); + profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch); + profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch); + handle = ASCII.getBytes(profile.handle()); @@ -587,15 +647,11 @@ public class Crawler_p { profile = null; handle = null; } + // start the crawl - if ("url".equals(crawlingMode)) { - if (rootURLs.size() == 0) { - prop.put("info", "5"); //Crawling failed - prop.putHTML("info_crawlingURL", "(no url given)"); - prop.putHTML("info_reasonString", "you must submit at least one crawl url"); - } else { - + if(hasCrawlstartDataOK) { + if ("url".equals(crawlingMode)) { // stack requests sb.crawler.putActive(handle, profile); final Set successurls = new HashSet(); @@ -639,53 +695,53 @@ public class Crawler_p { prop.putHTML("info_reasonString", fr.toString()); } if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } - } else if ("sitemap".equals(crawlingMode)) { - try { - final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? 
new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway - sb.crawler.putActive(handle, profile); - final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); - importer.start(); - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } catch (final Exception e) { - // mist - prop.put("info", "6");//Error with url - prop.putHTML("info_crawlingStart", sitemapURLStr); - prop.putHTML("info_error", e.getMessage()); - ConcurrentLog.logException(e); - } - } else if ("file".equals(crawlingMode)) { - if (post.containsKey("crawlingFile") && crawlingFile != null) { - try { - if(newcrawlingdepth > 0 && (fullDomain || subPath)) { - /* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */ - if(hyperlinks_from_file != null) { - sb.crawler.putActive(handle, profile); - sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset()); - } - } else { - /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */ - final String crawlingFileContent = post.get("crawlingFile$file", ""); - final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, - new HashSet(), new VocabularyScraper(), profile.timezoneOffset()); - FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile, - sb.crawlStacker, sb.peers.mySeed().hash.getBytes()); - sb.crawler.putActive(handle, profile); - crawlStarterTask.start(); - } - } catch (final PatternSyntaxException e) { - prop.put("info", "4"); // crawlfilter does not match url - prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); - prop.putHTML("info_error", e.getMessage()); - } catch (final Exception e) { - // mist - prop.put("info", "7"); // Error with file - prop.putHTML("info_crawlingStart", crawlingFileName); - prop.putHTML("info_error", e.getMessage()); - ConcurrentLog.logException(e); - } - sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); - } + } else if ("sitemap".equals(crawlingMode)) { + try { + final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? 
new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway + sb.crawler.putActive(handle, profile); + final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile); + importer.start(); + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } catch (final Exception e) { + // mist + prop.put("info", "6");//Error with url + prop.putHTML("info_crawlingStart", sitemapURLStr); + prop.putHTML("info_error", e.getMessage()); + ConcurrentLog.logException(e); + } + } else if ("file".equals(crawlingMode)) { + if (post.containsKey("crawlingFile") && crawlingFile != null) { + try { + if(newcrawlingdepth > 0 && (fullDomain || subPath)) { + /* All links must have already been loaded because they are the part of the newcrawlingMustMatch filter */ + if(hyperlinks_from_file != null) { + sb.crawler.putActive(handle, profile); + sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset()); + } + } else { + /* No restriction on domains or subpath : we scrape now links and asynchronously push them to the crawlStacker */ + final String crawlingFileContent = post.get("crawlingFile$file", ""); + final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000, + new HashSet(), new VocabularyScraper(), profile.timezoneOffset()); + FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile, + sb.crawlStacker, sb.peers.mySeed().hash.getBytes()); + sb.crawler.putActive(handle, profile); + crawlStarterTask.start(); + } + } catch (final PatternSyntaxException e) { + prop.put("info", "4"); // crawlfilter does not match url + prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch); + prop.putHTML("info_error", e.getMessage()); + } catch (final Exception e) { + // mist + prop.put("info", "7"); // Error with file + prop.putHTML("info_crawlingStart", crawlingFileName); + prop.putHTML("info_error", e.getMessage()); + ConcurrentLog.logException(e); + } + sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL); + } + } } } } diff --git a/source/net/yacy/crawler/data/CrawlProfile.java b/source/net/yacy/crawler/data/CrawlProfile.java index a3ee1bdca..a3c406be2 100644 --- a/source/net/yacy/crawler/data/CrawlProfile.java +++ b/source/net/yacy/crawler/data/CrawlProfile.java @@ -41,6 +41,7 @@ import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; @@ -63,9 +64,22 @@ public class CrawlProfile extends ConcurrentHashMap implements M private static final long serialVersionUID = 5527325718810703504L; + /** Regular expression pattern matching everything */ public static final String MATCH_ALL_STRING = ".*"; + + /** Regular expression pattern matching nothing */ public static final String MATCH_NEVER_STRING = ""; + + /** Empty Solr query */ + public static final String SOLR_EMPTY_QUERY = ""; + + /** Match all Solr query */ + public static final String SOLR_MATCH_ALL_QUERY = AbstractSolrConnector.CATCHALL_QUERY; + + /** Regular expression matching everything */ public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING); + + /** Regular expression matching 
nothing */
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);

public static final String CRAWL_PROFILE_PUSH_STUB = "push_";
@@ -92,6 +106,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M
 INDEXING_CONTENT_MUSTNOTMATCH("indexContentMustNotMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Not-Match Filter"),
 INDEXING_MEDIA_TYPE_MUSTMATCH("indexMediaTypeMustMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Match Filter"),
 INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
+ INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
+ INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
 RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
 STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
 CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@@ -261,6 +277,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M
 put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
 put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
 put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
+ put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
+ put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
 }

/**
@@ -857,6 +875,8 @@ public class CrawlProfile extends ConcurrentHashMap implements M
 prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
 prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
 prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
+ prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));
+ prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key));
 //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
 //prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
 //prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ?
"no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index c847b1632..8af9024f5 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -91,7 +91,10 @@ import java.util.zip.ZipInputStream; import javax.servlet.http.HttpServletRequest; import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrException; import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.core.SolrCore; +import org.apache.solr.search.SyntaxError; import com.cybozu.labs.langdetect.DetectorFactory; import com.cybozu.labs.langdetect.LangDetectException; @@ -114,6 +117,7 @@ import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.connector.ShardSelection; import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL; +import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.lod.vocabulary.Tagging; @@ -218,6 +222,7 @@ import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Fulltext; import net.yacy.search.index.Segment; import net.yacy.search.index.Segment.ReferenceReportCache; +import net.yacy.search.index.SingleDocumentMatcher; import net.yacy.search.query.AccessTracker; import net.yacy.search.query.SearchEvent; import net.yacy.search.query.SearchEventCache; @@ -3212,6 +3217,10 @@ public final class Switchboard extends serverSwitch { FailCategory.FINAL_PROCESS_CONTEXT, failReason, -1); continue docloop; } + + /* The eventual Solr/Lucene filter query will be checked just before adding the document to the index, + * when the SolrInputDocument is built, at storeDocumentIndex()*/ + doclist.add(document); } @@ -3327,16 +3336,36 @@ public final class Switchboard extends serverSwitch { // remove stopwords this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url.toNormalform(true)); + + final CollectionConfiguration collectionConfig = this.index.fulltext().getDefaultConfiguration(); + final String language = Segment.votedLanguage(url, url.toNormalform(true), document, condenser); // identification of the language + + final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this.index, collections, queueEntry.getResponseHeader(), + document, condenser, referrerURL, language, profile.isPushCrawlProfile(), + this.index.fulltext().useWebgraph() ? 
this.index.fulltext().getWebgraphConfiguration() : null, sourceName);
+
+ /*
+ * One last possible filtering step before adding to the index: using the
+ * optional profile Solr query filters
+ */
+ final String profileSolrFilterError = checkCrawlProfileSolrFilters(profile, vector);
+ if (profileSolrFilterError != null) {
+ this.crawlQueues.errorURL.push(url, queueEntry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
+ profileSolrFilterError + ", process case=" + processCase + ", profile name = "
+ + profile.collectionName(),
+ -1);
+ return;
+ }

 // STORE WORD INDEX
 SolrInputDocument newEntry = this.index.storeDocument(
 url,
- referrerURL,
- collections,
 profile,
 queueEntry.getResponseHeader(),
 document,
+ vector,
+ language,
 condenser,
 searchEvent,
 sourceName,
@@ -3401,6 +3430,66 @@
 }
 }

+ /**
+ * Check that the given Solr document matches the optional Solr query
+ * filters of the crawl profile.
+ *
+ * @param profile
+ * the crawl profile, which may be null.
+ * @param document
+ * the Solr document to check. Must not be null.
+ * @return an error message, or null when no Solr query filters are
+ * defined or when they match the Solr document.
+ * @throws IllegalArgumentException
+ * when the document is null
+ */
+ private String checkCrawlProfileSolrFilters(final CrawlProfile profile,
+ final CollectionConfiguration.SolrVector document) throws IllegalArgumentException {
+ if (profile != null) {
+ final String indexFilterQuery = profile.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key);
+ final String indexSolrQueryMustNotMatch = profile.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key);
+ if ((indexFilterQuery != null && !indexFilterQuery.isEmpty()
+ && !CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(indexFilterQuery))
+ || (indexSolrQueryMustNotMatch != null
+ && !CrawlProfile.SOLR_EMPTY_QUERY.equals(indexSolrQueryMustNotMatch))) {
+ final EmbeddedInstance embeddedSolr = this.index.fulltext().getEmbeddedInstance();
+ final SolrCore embeddedCore = embeddedSolr != null ?
embeddedSolr.getDefaultCore() : null; + final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null; + + if (!embeddedSolrConnected) { + return "no connected embedded instance for profile Solr query filter"; + } + + if ((indexFilterQuery != null && !indexFilterQuery.isEmpty() + && !CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(indexFilterQuery))) { + try { + if (!SingleDocumentMatcher.matches(document, indexFilterQuery, embeddedCore)) { + return "denied by profile Solr query must-match filter"; + } + } catch (final SyntaxError | SolrException e) { + return "invalid syntax for profile Solr query must-match filter"; + } catch (final RuntimeException e) { + return "could not parse the Solr query must-match filter"; + } + } + + if (indexSolrQueryMustNotMatch != null + && !CrawlProfile.SOLR_EMPTY_QUERY.equals(indexSolrQueryMustNotMatch)) { + try { + if (SingleDocumentMatcher.matches(document, indexSolrQueryMustNotMatch, embeddedCore)) { + return "denied by profile Solr query must-not-match filter"; + } + } catch (final SyntaxError | SolrException e) { + return "invalid syntax for profile Solr query must-not-match filter"; + } catch (final RuntimeException e) { + return "could not parse the Solr query must-not-match filter"; + } + } + } + } + return null; + } + public final void addAllToIndex( final DigestURL url, final Map links, diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 1fa22ff32..1a88924b9 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -489,7 +489,7 @@ public class Segment { } } - private static String votedLanguage( + public static String votedLanguage( final DigestURL url, final String urlNormalform, final Document document, @@ -573,15 +573,41 @@ public class Segment { final String proxy, final String acceptLanguage ) { + final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration(); + final String language = votedLanguage(url, url.toNormalform(true), document, condenser); // identification of the language + + final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, + document, condenser, referrerURL, language, crawlProfile.isPushCrawlProfile(), + this.fulltext().useWebgraph() ? 
this.fulltext.getWebgraphConfiguration() : null, sourceName); + + return storeDocument(url, crawlProfile, responseHeader, document, vector, language, condenser, + searchEvent, sourceName, storeToRWI, proxy, acceptLanguage); + } + + public SolrInputDocument storeDocument( + final DigestURL url, + final CrawlProfile crawlProfile, + final ResponseHeader responseHeader, + final Document document, + final CollectionConfiguration.SolrVector vector, + final String language, + final Condenser condenser, + final SearchEvent searchEvent, + final String sourceName, // contains the crawl profile hash if this comes from a web crawl + final boolean storeToRWI, + final String proxy, + final String acceptLanguage + ) { final long startTime = System.currentTimeMillis(); + final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration(); + final String urlNormalform = url.toNormalform(true); + // CREATE INDEX // load some document metadata final Date loadDate = new Date(); final String id = ASCII.String(url.hash()); final String dc_title = document.dc_title(); - final String urlNormalform = url.toNormalform(true); - final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language // get last modified date of the document to be used for the rwi index // (the lastmodified document propery should be the same in rwi and fulltext (calculated in yacy2solr)) @@ -591,10 +617,6 @@ public class Segment { if (modDate.getTime() > loadDate.getTime()) modDate = loadDate; char docType = Response.docType(document.dc_format()); - // CREATE SOLR DOCUMENT - final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration(); - final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, crawlProfile.isPushCrawlProfile(), this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName); - // ENRICH DOCUMENT WITH RANKING INFORMATION this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); diff --git a/source/net/yacy/search/index/SingleDocumentMatcher.java b/source/net/yacy/search/index/SingleDocumentMatcher.java new file mode 100644 index 000000000..cb98f7c3e --- /dev/null +++ b/source/net/yacy/search/index/SingleDocumentMatcher.java @@ -0,0 +1,119 @@ +// SingleDocumentMatcher.java +// --------------------------- +// Copyright 2018 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+
+package net.yacy.search.index;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.search.Query;
+import org.apache.solr.client.solrj.SolrQuery;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.params.CommonParams;
+import org.apache.solr.core.SolrCore;
+import org.apache.solr.request.SolrQueryRequestBase;
+import org.apache.solr.schema.IndexSchema;
+import org.apache.solr.search.LuceneQParserPlugin;
+import org.apache.solr.search.QParser;
+import org.apache.solr.search.SyntaxError;
+import org.apache.solr.update.DocumentBuilder;
+
+import net.yacy.search.schema.CollectionSchema;
+
+/**
+ * Provide utility functions to check if a single indexable Document matches a
+ * given Solr query.
+ */
+public abstract class SingleDocumentMatcher {
+
+ /**
+ * @param query a Solr query string to parse
+ * @param targetCore an open Solr index core that is the target of the query
+ * @return a Lucene Query instance parsed from the given Solr query string on the provided Solr core.
+ * @throws SyntaxError when the query syntax is not valid
+ * @throws SolrException when a required query element is missing, or when a problem occurred while accessing the target core
+ */
+ public static Query toLuceneQuery(final String query, final SolrCore targetCore) throws SyntaxError, SolrException {
+ if (query == null || targetCore == null) {
+ throw new IllegalArgumentException("All parameters must be non null");
+ }
+
+ final SolrQuery solrQuery = new SolrQuery(query);
+ solrQuery.setParam(CommonParams.DF, CollectionSchema.text_t.getSolrFieldName());
+
+ final SolrQueryRequestBase solrRequest = new SolrQueryRequestBase(targetCore, solrQuery) {
+ };
+
+ final LuceneQParserPlugin luceneParserPlugin = new LuceneQParserPlugin();
+ final QParser solrParser = luceneParserPlugin.createParser(query, null, solrRequest.getParams(), solrRequest);
+ return solrParser.parse();
+ }
+
+ /**
+ * Check a given Solr document against a Solr query, without querying a Solr
+ * index, instead using an in-memory Lucene utility. This allows checking whether a
+ * single document matches some criteria, before adding it to a Solr index.
+ *
+ * @param solrDoc
+ * the Solr document to check
+ * @param query
+ * a standard Solr query string
+ * @param core
+ * the Solr index core holding the Solr schema of the document
+ * @return true when the document matches the given Solr query
+ * @throws SyntaxError
+ * when the query String syntax is not valid
+ * @throws SolrException when a required query element is missing, or when a problem occurred while accessing the target core
+ * @throws IllegalArgumentException
+ * when a parameter is null.
+ * @see The + * Solr Standard Query Parser + */ + public static boolean matches(final SolrInputDocument solrDoc, final String query, final SolrCore core) + throws SyntaxError, IllegalArgumentException { + if (solrDoc == null || query == null || core == null) { + throw new IllegalArgumentException("All parameters must be non null"); + } + final IndexSchema schema = core.getLatestSchema(); + if (schema == null) { + throw new IllegalArgumentException("All parameters must be non null"); + } + + final org.apache.lucene.document.Document luceneDoc = DocumentBuilder.toDocument(solrDoc, schema); + + final Analyzer indexAnalyzer = schema.getIndexAnalyzer(); + + /* + * Using the Lucene RAMDirectory could be an alternative, but it is slower with + * a larger memory footprint + */ + final MemoryIndex index = MemoryIndex.fromDocument(luceneDoc, indexAnalyzer); + + final Query luceneQuery = toLuceneQuery(query, core); + + final float score = index.search(luceneQuery); + + return score > 0.0f; + } + +} diff --git a/test/java/net/yacy/search/index/SingleDocumentMatcherTest.java b/test/java/net/yacy/search/index/SingleDocumentMatcherTest.java new file mode 100644 index 000000000..dc7a30004 --- /dev/null +++ b/test/java/net/yacy/search/index/SingleDocumentMatcherTest.java @@ -0,0 +1,177 @@ +// SingleDocumentMatcherTest.java +// --------------------------- +// Copyright 2018 by luccioman; https://github.com/luccioman +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package net.yacy.search.index; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Calendar; +import java.util.Date; +import java.util.GregorianCalendar; + +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.core.SolrCore; +import org.apache.solr.search.SyntaxError; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import net.yacy.cora.date.ISO8601Formatter; +import net.yacy.cora.document.encoding.ASCII; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.federate.solr.instance.EmbeddedInstance; +import net.yacy.search.schema.CollectionConfiguration; +import net.yacy.search.schema.CollectionSchema; +import net.yacy.search.schema.WebgraphSchema; + +/** + * Unit tests for the {@link SingleDocumentMatcher} class. + */ +public class SingleDocumentMatcherTest { + + /** Embedded Solr test instance */ + private static EmbeddedInstance EMBEDDED_INSTANCE; + + /** The configuration of the main Solr collection */ + private static CollectionConfiguration COLLECTION_CONFIG; + + /** + * Inits the embedded Solr index used for these tests. 
+ */
+ @BeforeClass
+ public static void initSolr() {
+ final File solr_config = new File("defaults/solr");
+ final File storage = new File("test/DATA/INDEX/webportal/SEGMENTS/text/solr/");
+ storage.mkdirs();
+ System.out.println("setup EmbeddedSolrConnector using config dir: " + solr_config.getAbsolutePath());
+ try {
+ SingleDocumentMatcherTest.EMBEDDED_INSTANCE = new EmbeddedInstance(solr_config, storage,
+ CollectionSchema.CORE_NAME, new String[] { CollectionSchema.CORE_NAME, WebgraphSchema.CORE_NAME });
+ } catch (final IOException ex) {
+ Assert.fail("IOException on embedded Solr initialization");
+ }
+
+ final File config = new File("defaults/solr.collection.schema");
+ try {
+ SingleDocumentMatcherTest.COLLECTION_CONFIG = new CollectionConfiguration(config, true);
+ } catch (final IOException e) {
+ Assert.fail("IOException on collection configuration initialization");
+ }
+ }
+
+ /**
+ * Closes the embedded Solr index.
+ */
+ @AfterClass
+ public static void finalizeTesting() {
+ SingleDocumentMatcherTest.EMBEDDED_INSTANCE.close();
+ }
+
+ /**
+ * @throws Exception
+ * when an unexpected exception occurred
+ */
+ @Test
+ public void testMatches() throws Exception {
+ final CollectionConfiguration collectionConfig = SingleDocumentMatcherTest.COLLECTION_CONFIG;
+ final SolrCore solrCore = SingleDocumentMatcherTest.EMBEDDED_INSTANCE.getDefaultCore();
+
+ final SolrInputDocument solrDoc = new SolrInputDocument();
+ final DigestURL docUrl = new DigestURL("http://example.com/");
+ /* Using fields active in the defaults/solr.collection.schema */
+ collectionConfig.add(solrDoc, CollectionSchema.id, ASCII.String(docUrl.hash()));
+ collectionConfig.add(solrDoc, CollectionSchema.sku, docUrl.toNormalform(true));
+ collectionConfig.add(solrDoc, CollectionSchema.http_unique_b, true);
+ collectionConfig.add(solrDoc, CollectionSchema.title, Arrays.asList(new String[] { "Lorem ipsum" }));
+ collectionConfig.add(solrDoc, CollectionSchema.host_s, "example.com");
+ collectionConfig.add(solrDoc, CollectionSchema.last_modified, new Date());
+ collectionConfig.add(solrDoc, CollectionSchema.text_t,
+ "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.");
+ collectionConfig.add(solrDoc, CollectionSchema.size_i, 126);
+
+ /* query on the default field */
+ Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "absent", solrCore));
+ Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "adipisicing", solrCore));
+
+ /* query on a multi valued text field */
+ Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "title:test", solrCore));
+ Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "title:ipsum", solrCore));
+
+ /* query on a string field */
+ Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "host_s:example.org", solrCore));
+ Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "host_s:example.com", solrCore));
+ Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "host_s:example.*", solrCore));
+
+ /* query on a boolean field */
+ Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "http_unique_b:false", solrCore));
+ Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "http_unique_b:true", solrCore));
+
+ final Calendar yesterdayCal = new GregorianCalendar();
+ yesterdayCal.add(Calendar.DAY_OF_MONTH, -1);
+ final String yesterday = ISO8601Formatter.FORMATTER.format(yesterdayCal.getTime());
+
+ final Calendar tomorrowCal = new GregorianCalendar();
+
tomorrowCal.add(Calendar.DAY_OF_MONTH, 1); + final String tomorrow = ISO8601Formatter.FORMATTER.format(tomorrowCal.getTime()); + + /* range query on a date field */ + Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "last_modified:[" + tomorrow + " TO * ]", solrCore)); + Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, + "last_modified:[" + yesterday + " TO " + tomorrow + "]", solrCore)); + Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "last_modified:[" + yesterday + " TO * ]", solrCore)); + Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "last_modified:[ * TO " + tomorrow + "]", solrCore)); + + /* range query on an integer field */ + Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "size_i:[ 0 TO 50 ]", solrCore)); + Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "size_i:[ 0 TO * ]", solrCore)); + Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "size_i:[ * TO 200 ]", solrCore)); + } + + /** + * @throws Exception + * when an unexpected exception occurred + */ + @Test + public void testMatchesSyntaxError() throws Exception { + final CollectionConfiguration collectionConfig = SingleDocumentMatcherTest.COLLECTION_CONFIG; + final SolrCore solrCore = SingleDocumentMatcherTest.EMBEDDED_INSTANCE.getDefaultCore(); + + final SolrInputDocument solrDoc = new SolrInputDocument(); + collectionConfig.add(solrDoc, CollectionSchema.id, ASCII.String(new DigestURL("http://example.com").hash())); + collectionConfig.add(solrDoc, CollectionSchema.title, Arrays.asList(new String[] { "Lorem ipsum" })); + collectionConfig.add(solrDoc, CollectionSchema.host_s, "example.com"); + collectionConfig.add(solrDoc, CollectionSchema.last_modified, new Date()); + collectionConfig.add(solrDoc, CollectionSchema.text_t, + "Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."); + collectionConfig.add(solrDoc, CollectionSchema.size_i, 126); + + try { + SingleDocumentMatcher.matches(solrDoc, ":", solrCore); + Assert.fail("Should have raised a syntax error"); + } catch (final SyntaxError e) { + return; + } + } + +}
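Taken together, the pieces above cooperate as follows: CrawlProfile stores the two query strings, Crawler_p validates their syntax at crawl start via SingleDocumentMatcher.toLuceneQuery(), and Switchboard evaluates each freshly built SolrInputDocument against them just before indexing. Below is a minimal standalone sketch of that last step, mirroring Switchboard.checkCrawlProfileSolrFilters(); the class name and return strings are illustrative only, everything else is defined in this patch or in existing YaCy code:

// Sketch only (not part of the patch): pre-check a parsed document against
// the Solr filter queries of a crawl profile before it enters the index.
package net.yacy.search.index;

import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SyntaxError;

import net.yacy.crawler.data.CrawlProfile;

public class SolrFilterCheckSketch {

    /**
     * @return null when the document may be indexed, else a short reject reason
     */
    public static String checkFilters(final CrawlProfile profile, final SolrInputDocument doc,
            final SolrCore embeddedCore) {
        final String mustMatch = profile.get(CrawlProfile.CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key);
        final String mustNotMatch = profile.get(CrawlProfile.CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key);
        try {
            /* must-match: reject when the query does not select the document */
            if (mustMatch != null && !mustMatch.isEmpty() && !CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(mustMatch)
                    && !SingleDocumentMatcher.matches(doc, mustMatch, embeddedCore)) {
                return "denied by must-match filter";
            }
            /* must-not-match: reject when the query does select the document */
            if (mustNotMatch != null && !CrawlProfile.SOLR_EMPTY_QUERY.equals(mustNotMatch)
                    && SingleDocumentMatcher.matches(doc, mustNotMatch, embeddedCore)) {
                return "denied by must-not-match filter";
            }
        } catch (final SyntaxError | SolrException e) {
            return "invalid Solr filter query";
        }
        return null; /* document may be indexed */
    }
}

Because SingleDocumentMatcher.matches() evaluates the query against an in-memory Lucene MemoryIndex built from the single document, this final check needs no round trip to the Solr index, which keeps the per-document cost of the filter low.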