Added a new crawler document filter type using Solr syntax

This makes it possible to set up much more advanced document crawl filters,
by filtering on one or more indexed document fields before insertion into
the index.
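
A minimal illustration (field names are taken from the unit test added below; the host and size values are placeholders): a must-match filter such as

    host_s:example.com AND size_i:[0 TO 10000]

would index only documents whose host and size fields match, while a must-not-match filter such as

    title:spam

would reject every document whose title matches it. The default must-match value is the Solr catch-all query (*:*) and the default must-not-match value is the empty query, so nothing is filtered out until a filter is explicitly set.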
pull/186/head
luccioman 7 years ago
parent 2c155ece77
commit cced94298a

@@ -32,6 +32,8 @@
<indexContentMustNotMatch>#[indexContentMustNotMatch]#</indexContentMustNotMatch>
<indexMediaTypeMustMatch>#[indexMediaTypeMustMatch]#</indexMediaTypeMustMatch>
<indexMediaTypeMustNotMatch>#[indexMediaTypeMustNotMatch]#</indexMediaTypeMustNotMatch>
<indexSolrQueryMustMatch>#[indexSolrQueryMustMatch]#</indexSolrQueryMustMatch>
<indexSolrQueryMustNotMatch>#[indexSolrQueryMustNotMatch]#</indexSolrQueryMustNotMatch>
<status>#(status)#terminated::active::system#(/status)#</status>
<crawlingDomFilterContent>
#{crawlingDomFilterContent}#

@@ -171,6 +171,8 @@
disableIf('indexmustnotmatch', defaultMatchNone);
disableIf('indexcontentmustnotmatch', defaultMatchNone);
disableIf('indexMediaTypeMustNotMatch', defaultMatchNone);
disableIf('indexSolrQueryMustMatch', "#[solrQueryMatchAllStr]#");
disableIf('indexSolrQueryMustNotMatch', "#[solrEmptyQueryStr]#");
// remove if MATCH_ALL_STRING
disableIf('mustmatch', defaultMatchAll);
@@ -369,7 +371,7 @@
<dt>Filter on Document Media Type (aka MIME type)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Clean up search events cache info"/>
<img src="env/grafics/i16.gif" width="16" height="16" alt="Media Type filter info"/>
<span style="right:0px;" id="mediaTypeMustMatchInfo">
The filter is a <b><a href="https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a></b>
that <b>must match</b> with the document Media Type (also known as MIME Type) to allow the URL to be indexed.
@@ -388,6 +390,39 @@
</tr>
</table>
</dd>
<dt>Solr query filter on any active <a href="IndexSchema_p.html" target="_blank">indexed</a> field(s)</dt>
<dd>
<div class="info" style="float:right">
<img src="env/grafics/i16.gif" width="16" height="16" alt="Solr query filter info"/>
<span style="right:0px;" id="indexSolrQueryInfo">
Each parsed document is checked against the given Solr query before being added to the index.
The query must conform to the <a href="https://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html#the-standard-query-parser" target="_blank">standard</a> Solr query syntax.
</span>
</div>
<table style="border-width: 0px" role="presentation">
#(embeddedSolrConnected)#
<tr>
<td>
<div class="info"><p>The embedded local Solr index must be connected to use this kind of filter.</p>
<p>You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</p></div>
</td>
</tr>
::
<tr>
<td style="width:110px"><img src="env/grafics/plus.gif" alt=""> must-match</td>
<td>
<input name="indexSolrQueryMustMatch" id="indexSolrQueryMustMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
<tr>
<td style="width:110px"><img src="env/grafics/minus.gif" alt=""> must-not-match</td>
<td>
<input name="indexSolrQueryMustNotMatch" id="indexSolrQueryMustNotMatch" type="text" size="55" maxlength="100000" value="#[indexSolrQueryMustNotMatch]#" aria-describedby="indexSolrQueryInfo" />
</td>
</tr>
#(/embeddedSolrConnected)#
</table>
</dd>
</dl>
</fieldset>
<fieldset>

@@ -28,6 +28,9 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import org.apache.solr.core.SolrCore;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@@ -49,9 +52,11 @@ public class CrawlStartExpert {
final serverObjects prop = new serverObjects();
final String defaultCollection = "user";
// javascript values
// javascript constants
prop.put("matchAllStr", CrawlProfile.MATCH_ALL_STRING);
prop.put("matchNoneStr", CrawlProfile.MATCH_NEVER_STRING);
prop.put("solrQueryMatchAllStr", CrawlProfile.SOLR_MATCH_ALL_QUERY);
prop.put("solrEmptyQueryStr", CrawlProfile.SOLR_EMPTY_QUERY);
prop.put("defaultCollection", defaultCollection);
// ---------- Start point
@@ -317,6 +322,29 @@ public class CrawlStartExpert {
} else {
prop.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
}
// Filter with a Solr syntax query
/* Check that the embedded local Solr index is connected, as its schema is required to apply any Solr filter query */
final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
prop.put("embeddedSolrConnected", embeddedSolrConnected);
if(embeddedSolrConnected) {
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
}
if (post != null && post.containsKey(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key)) {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key,
post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim());
} else {
prop.put("embeddedSolrConnected_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
}
}
// ---------- Clean-Up before Crawl Start

@@ -216,9 +216,23 @@ window.setInterval("setTableSize()", 1000);
<!-- 8 -->
Crawling of "#[crawlingURL]#" started. <strong>Please wait some seconds,
it may take some seconds until the first result appears there.</strong>
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateQueues_p.html?stack=LOCAL">here</a>.<br />
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateQueues_p.html?stack=LOCAL">here</a>.<br />::
<!-- 9 -->
No embedded local Solr index is connected. This is required to use a Solr query filter.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.::
<!-- 10 -->
The Solr filter query syntax is not valid: <code>#[solrQuery]#</code>::
<!-- 11 -->
Could not parse the Solr filter query: <code>#[solrQuery]#</code>
#(/info)#
</p>
<!-- #(noEmbeddedSolr)#::<div class="alert alert-error">No embedded local Solr index is connected. This is required to use the Solr filter query.
You can configure this with the <a href="IndexFederated_p.html">Index Sources &amp; targets</a> page.</div>
#(/noEmbeddedSolr)#
#(solrQuerySyntaxtError)#::<div class="alert alert-error">The Solr filter query syntax is not valid: #[solrQuery]#</div>
#(/solrQuerySyntaxtError)#-->
<!-- crawl queues -->
#(info-queue)#::<div class="alert alert-warning">#[message]#</div>#(/info-queue)#

@@ -35,12 +35,17 @@ import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.solr.common.SolrException;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SyntaxError;
import net.yacy.cora.date.AbstractFormatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
@@ -70,6 +75,7 @@ import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
import net.yacy.search.index.SingleDocumentMatcher;
import net.yacy.search.query.SearchEventCache;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
@@ -464,7 +470,12 @@ public class Crawler_p {
boolean hasCrawlstartDataOK = !crawlName.isEmpty();
if (hasCrawlstartDataOK) {
// check crawlurl was given in sitecrawl
if ("url".equals(crawlingMode) && rootURLs.size() == 0) hasCrawlstartDataOK = false;
if ("url".equals(crawlingMode) && rootURLs.size() == 0) {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", "(no url given)");
prop.putHTML("info_reasonString", "you must submit at least one crawl url");
hasCrawlstartDataOK = false;
}
}
String snapshotsMaxDepthString = post.get("snapshotsMaxDepth", "-1");
@@ -533,6 +544,52 @@
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
/* If a Solr query filter is defined, verify its syntax now and check that the embedded Solr schema is available */
final String solrQueryMustMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY).trim();
final String solrQueryMustNotMatch = post.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY).trim();
if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch)) || !CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
final EmbeddedInstance embeddedSolr = sb.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
prop.put("noEmbeddedSolr", !embeddedSolrConnected);
if (embeddedSolrConnected) {
if(!(solrQueryMustMatch.isEmpty() || CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(solrQueryMustMatch))) {
try {
SingleDocumentMatcher.toLuceneQuery(solrQueryMustMatch, embeddedCore);
} catch(final SyntaxError | SolrException e) {
hasCrawlstartDataOK = false;
prop.put("info", "10");
prop.put("info_solrQuery", solrQueryMustMatch);
} catch(final RuntimeException e) {
hasCrawlstartDataOK = false;
prop.put("info", "11");
prop.put("info_solrQuery", solrQueryMustMatch);
}
}
if(!CrawlProfile.SOLR_EMPTY_QUERY.equals(solrQueryMustNotMatch)) {
try {
SingleDocumentMatcher.toLuceneQuery(solrQueryMustNotMatch, embeddedCore);
} catch(final SyntaxError | SolrException e) {
hasCrawlstartDataOK = false;
prop.put("info", "10");
prop.put("info_solrQuery", solrQueryMustNotMatch);
} catch(final RuntimeException e) {
hasCrawlstartDataOK = false;
prop.put("info", "11");
prop.put("info_solrQuery", solrQueryMustNotMatch);
}
}
} else {
hasCrawlstartDataOK = false;
prop.put("info", "9");
}
}
// prepare a new crawling profile
final CrawlProfile profile;
byte[] handle;
@@ -574,6 +631,9 @@
post.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING));
profile.put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, post
.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING));
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, solrQueryMustMatch);
profile.put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, solrQueryMustNotMatch);
handle = ASCII.getBytes(profile.handle());
@@ -587,15 +647,11 @@
profile = null;
handle = null;
}
// start the crawl
if ("url".equals(crawlingMode)) {
if (rootURLs.size() == 0) {
prop.put("info", "5"); //Crawling failed
prop.putHTML("info_crawlingURL", "(no url given)");
prop.putHTML("info_reasonString", "you must submit at least one crawl url");
} else {
if(hasCrawlstartDataOK) {
if ("url".equals(crawlingMode)) {
// stack requests
sb.crawler.putActive(handle, profile);
final Set<DigestURL> successurls = new HashSet<DigestURL>();
@@ -639,53 +695,53 @@
prop.putHTML("info_reasonString", fr.toString());
}
if (successurls.size() > 0) sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if ("sitemap".equals(crawlingMode)) {
try {
final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway
sb.crawler.putActive(handle, profile);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
importer.start();
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} catch (final Exception e) {
// something went wrong
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
} else if ("file".equals(crawlingMode)) {
if (post.containsKey("crawlingFile") && crawlingFile != null) {
try {
if(newcrawlingdepth > 0 && (fullDomain || subPath)) {
/* All links must have already been loaded because they are part of the newcrawlingMustMatch filter */
if(hyperlinks_from_file != null) {
sb.crawler.putActive(handle, profile);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
}
} else {
/* No restriction on domains or subpath: we now scrape links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
crawlStarterTask.start();
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// something went wrong
prop.put("info", "7"); // Error with file
prop.putHTML("info_crawlingStart", crawlingFileName);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
} else if ("sitemap".equals(crawlingMode)) {
try {
final DigestURL sitemapURL = sitemapURLStr.indexOf("//") > 0 ? new DigestURL(sitemapURLStr) : new DigestURL(rootURLs.iterator().next(), sitemapURLStr); // fix for relative paths which should not exist but are used anyway
sb.crawler.putActive(handle, profile);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, profile);
importer.start();
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
} catch (final Exception e) {
// something went wrong
prop.put("info", "6");//Error with url
prop.putHTML("info_crawlingStart", sitemapURLStr);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
} else if ("file".equals(crawlingMode)) {
if (post.containsKey("crawlingFile") && crawlingFile != null) {
try {
if(newcrawlingdepth > 0 && (fullDomain || subPath)) {
/* All links must have already been loaded because they are part of the newcrawlingMustMatch filter */
if(hyperlinks_from_file != null) {
sb.crawler.putActive(handle, profile);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks_from_file, profile.timezoneOffset());
}
} else {
/* No restriction on domains or subpath: we now scrape links and asynchronously push them to the crawlStacker */
final String crawlingFileContent = post.get("crawlingFile$file", "");
final ContentScraper scraper = new ContentScraper(new DigestURL(crawlingFile), 10000000,
new HashSet<String>(), new VocabularyScraper(), profile.timezoneOffset());
FileCrawlStarterTask crawlStarterTask = new FileCrawlStarterTask(crawlingFile, crawlingFileContent, scraper, profile,
sb.crawlStacker, sb.peers.mySeed().hash.getBytes());
sb.crawler.putActive(handle, profile);
crawlStarterTask.start();
}
} catch (final PatternSyntaxException e) {
prop.put("info", "4"); // crawlfilter does not match url
prop.putHTML("info_newcrawlingfilter", newcrawlingMustMatch);
prop.putHTML("info_error", e.getMessage());
} catch (final Exception e) {
// something went wrong
prop.put("info", "7"); // Error with file
prop.putHTML("info_crawlingStart", crawlingFileName);
prop.putHTML("info_error", e.getMessage());
ConcurrentLog.logException(e);
}
sb.continueCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
}
}
}
}
}

@@ -41,6 +41,7 @@ import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
@@ -63,9 +64,22 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
private static final long serialVersionUID = 5527325718810703504L;
/** Regular expression pattern matching everything */
public static final String MATCH_ALL_STRING = ".*";
/** Regular expression pattern matching nothing */
public static final String MATCH_NEVER_STRING = "";
/** Empty Solr query */
public static final String SOLR_EMPTY_QUERY = "";
/** Match all Solr query */
public static final String SOLR_MATCH_ALL_QUERY = AbstractSolrConnector.CATCHALL_QUERY;
/** Regular expression matching everything */
public static final Pattern MATCH_ALL_PATTERN = Pattern.compile(MATCH_ALL_STRING);
/** Regular expression matching nothing */
public static final Pattern MATCH_NEVER_PATTERN = Pattern.compile(MATCH_NEVER_STRING);
public static final String CRAWL_PROFILE_PUSH_STUB = "push_";
@@ -92,6 +106,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
INDEXING_CONTENT_MUSTNOTMATCH("indexContentMustNotMatch", false, CrawlAttribute.STRING, "Indexing Content Must-Not-Match Filter"),
INDEXING_MEDIA_TYPE_MUSTMATCH("indexMediaTypeMustMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Match Filter"),
INDEXING_MEDIA_TYPE_MUSTNOTMATCH("indexMediaTypeMustNotMatch", false, CrawlAttribute.STRING, "Indexing Media Type (MIME) Must-Not-Match Filter"),
INDEXING_SOLR_QUERY_MUSTMATCH("indexSolrQueryMustMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Match Filter"),
INDEXING_SOLR_QUERY_MUSTNOTMATCH("indexSolrQueryMustNotMatch", false, CrawlAttribute.STRING, "Indexing Solr Query Must-Not-Match Filter"),
RECRAWL_IF_OLDER ("recrawlIfOlder", false, CrawlAttribute.INTEGER, "Recrawl If Older"),
STORE_HTCACHE ("storeHTCache", false, CrawlAttribute.BOOLEAN, "Store in HTCache"),
CACHE_STRAGEGY ("cacheStrategy", false, CrawlAttribute.STRING, "Cache Strategy (NOCACHE,IFFRESH,IFEXIST,CACHEONLY)"),
@@ -261,6 +277,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(CrawlAttribute.TIMEZONEOFFSET.key, timezoneOffset);
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, CrawlProfile.MATCH_ALL_STRING);
put(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, CrawlProfile.MATCH_NEVER_STRING);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, CrawlProfile.SOLR_MATCH_ALL_QUERY);
put(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, CrawlProfile.SOLR_EMPTY_QUERY);
}
/**
@@ -857,6 +875,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_indexContentMustNotMatch", this.get(CrawlAttribute.INDEXING_CONTENT_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_MEDIA_TYPE_MUSTNOTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key));
prop.putXML(CRAWL_PROFILE_PREFIX + count + "_" + CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key, this.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key));
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustmatch", this.urlMustMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustMatch
//prop.putXML(CRAWL_PROFILE_PREFIX + count + "_mustnotmatch", this.urlMustNotMatchPattern().toString()); // TODO: remove, replace with crawlerURLMustNotMatch
//prop.put(CRAWL_PROFILE_PREFIX + count + "_crawlingIfOlder", (this.recrawlIfOlder() == 0L) ? "no re-crawl" : DateFormat.getDateTimeInstance().format(this.recrawlIfOlder())); // TODO: remove, replace with recrawlIfOlder

@@ -91,7 +91,10 @@ import java.util.zip.ZipInputStream;
import javax.servlet.http.HttpServletRequest;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SyntaxError;
import com.cybozu.labs.langdetect.DetectorFactory;
import com.cybozu.labs.langdetect.LangDetectException;
@@ -114,6 +117,7 @@ import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.connector.ShardSelection;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.lod.vocabulary.Tagging;
@@ -218,6 +222,7 @@ import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.index.SingleDocumentMatcher;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.query.SearchEventCache;
@@ -3212,6 +3217,10 @@ public final class Switchboard extends serverSwitch {
FailCategory.FINAL_PROCESS_CONTEXT, failReason, -1);
continue docloop;
}
/* Any Solr/Lucene filter query will be checked just before adding the document to the index,
* when the SolrInputDocument is built, in storeDocumentIndex() */
doclist.add(document);
}
@@ -3327,16 +3336,36 @@
// remove stopwords
this.log.info("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + url.toNormalform(true));
final CollectionConfiguration collectionConfig = this.index.fulltext().getDefaultConfiguration();
final String language = Segment.votedLanguage(url, url.toNormalform(true), document, condenser); // identification of the language
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this.index, collections, queueEntry.getResponseHeader(),
document, condenser, referrerURL, language, profile.isPushCrawlProfile(),
this.index.fulltext().useWebgraph() ? this.index.fulltext().getWebgraphConfiguration() : null, sourceName);
/*
* One last possible filtering step before adding to the index: using the
* optional profile Solr query filters
*/
final String profileSolrFilterError = checkCrawlProfileSolrFilters(profile, vector);
if (profileSolrFilterError != null) {
this.crawlQueues.errorURL.push(url, queueEntry.depth(), profile, FailCategory.FINAL_LOAD_CONTEXT,
profileSolrFilterError + ", process case=" + processCase + ", profile name = "
+ profile.collectionName(),
-1);
return;
}
// STORE WORD INDEX
SolrInputDocument newEntry =
this.index.storeDocument(
url,
referrerURL,
collections,
profile,
queueEntry.getResponseHeader(),
document,
vector,
language,
condenser,
searchEvent,
sourceName,
@@ -3401,6 +3430,66 @@
}
}
/**
* Check that the given Solr document matches the optional Solr query
* filters of the crawl profile.
*
* @param profile
* the crawl profile (may be null).
* @param document
* the Solr document to check. Must not be null.
* @return an error message, or null when no Solr query filters are
* defined or when they match the Solr document.
* @throws IllegalArgumentException
* when the document is null
*/
private String checkCrawlProfileSolrFilters(final CrawlProfile profile,
final CollectionConfiguration.SolrVector document) throws IllegalArgumentException {
if (profile != null) {
final String indexFilterQuery = profile.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTMATCH.key);
final String indexSolrQueryMustNotMatch = profile.get(CrawlAttribute.INDEXING_SOLR_QUERY_MUSTNOTMATCH.key);
if ((indexFilterQuery != null && !indexFilterQuery.isEmpty()
&& !CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(indexFilterQuery))
|| (indexSolrQueryMustNotMatch != null
&& !CrawlProfile.SOLR_EMPTY_QUERY.equals(indexSolrQueryMustNotMatch))) {
final EmbeddedInstance embeddedSolr = this.index.fulltext().getEmbeddedInstance();
final SolrCore embeddedCore = embeddedSolr != null ? embeddedSolr.getDefaultCore() : null;
final boolean embeddedSolrConnected = embeddedSolr != null && embeddedCore != null;
if (!embeddedSolrConnected) {
return "no connected embedded instance for profile Solr query filter";
}
if ((indexFilterQuery != null && !indexFilterQuery.isEmpty()
&& !CrawlProfile.SOLR_MATCH_ALL_QUERY.equals(indexFilterQuery))) {
try {
if (!SingleDocumentMatcher.matches(document, indexFilterQuery, embeddedCore)) {
return "denied by profile Solr query must-match filter";
}
} catch (final SyntaxError | SolrException e) {
return "invalid syntax for profile Solr query must-match filter";
} catch (final RuntimeException e) {
return "could not parse the Solr query must-match filter";
}
}
if (indexSolrQueryMustNotMatch != null
&& !CrawlProfile.SOLR_EMPTY_QUERY.equals(indexSolrQueryMustNotMatch)) {
try {
if (SingleDocumentMatcher.matches(document, indexSolrQueryMustNotMatch, embeddedCore)) {
return "denied by profile Solr query must-not-match filter";
}
} catch (final SyntaxError | SolrException e) {
return "invalid syntax for profile Solr query must-not-match filter";
} catch (final RuntimeException e) {
return "could not parse the Solr query must-not-match filter";
}
}
}
}
return null;
}
public final void addAllToIndex(
final DigestURL url,
final Map<AnchorURL, String> links,

@@ -489,7 +489,7 @@ public class Segment {
}
}
private static String votedLanguage(
public static String votedLanguage(
final DigestURL url,
final String urlNormalform,
final Document document,
@@ -573,15 +573,41 @@
final String proxy,
final String acceptLanguage
) {
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
final String language = votedLanguage(url, url.toNormalform(true), document, condenser); // identification of the language
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader,
document, condenser, referrerURL, language, crawlProfile.isPushCrawlProfile(),
this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
return storeDocument(url, crawlProfile, responseHeader, document, vector, language, condenser,
searchEvent, sourceName, storeToRWI, proxy, acceptLanguage);
}
public SolrInputDocument storeDocument(
final DigestURL url,
final CrawlProfile crawlProfile,
final ResponseHeader responseHeader,
final Document document,
final CollectionConfiguration.SolrVector vector,
final String language,
final Condenser condenser,
final SearchEvent searchEvent,
final String sourceName, // contains the crawl profile hash if this comes from a web crawl
final boolean storeToRWI,
final String proxy,
final String acceptLanguage
) {
final long startTime = System.currentTimeMillis();
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
final String urlNormalform = url.toNormalform(true);
// CREATE INDEX
// load some document metadata
final Date loadDate = new Date();
final String id = ASCII.String(url.hash());
final String dc_title = document.dc_title();
final String urlNormalform = url.toNormalform(true);
final String language = votedLanguage(url, urlNormalform, document, condenser); // identification of the language
// get last modified date of the document to be used for the rwi index
// (the lastmodified document property should be the same in rwi and fulltext (calculated in yacy2solr))
@@ -591,10 +617,6 @@
if (modDate.getTime() > loadDate.getTime()) modDate = loadDate;
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, crawlProfile.isPushCrawlProfile(), this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);

@@ -0,0 +1,119 @@
// SingleDocumentMatcher.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.index;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.Query;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.core.SolrCore;
import org.apache.solr.request.SolrQueryRequestBase;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.LuceneQParserPlugin;
import org.apache.solr.search.QParser;
import org.apache.solr.search.SyntaxError;
import org.apache.solr.update.DocumentBuilder;
import net.yacy.search.schema.CollectionSchema;
/**
* Provide utility functions to check if a single indexable Document matches a
* given Solr query.
*/
public abstract class SingleDocumentMatcher {
/**
* @param query a Solr query string to parse
* @param targetCore an open Solr index core that is the target of the query
* @return a Lucene Query instance parsed from the given Solr query string on the provided Solr core.
* @throws SyntaxError when the query syntax is not valid
* @throws SolrException when a required query element is missing, or when a problem occurred while accessing the target core
*/
public static Query toLuceneQuery(final String query, final SolrCore targetCore) throws SyntaxError, SolrException {
if (query == null || targetCore == null) {
throw new IllegalArgumentException("All parameters must be non null");
}
final SolrQuery solrQuery = new SolrQuery(query);
solrQuery.setParam(CommonParams.DF, CollectionSchema.text_t.getSolrFieldName());
final SolrQueryRequestBase solrRequest = new SolrQueryRequestBase(targetCore, solrQuery) {
};
final LuceneQParserPlugin luceneParserPlugin = new LuceneQParserPlugin();
final QParser solrParser = luceneParserPlugin.createParser(query, null, solrRequest.getParams(), solrRequest);
return solrParser.parse();
}
/**
* Check a given Solr document against a Solr query without requesting a Solr
* index, using instead an in-memory Lucene utility. This allows checking whether
* a single document matches some criteria, before adding it to a Solr index.
*
* @param solrDoc
* the Solr document to check
* @param query
* a standard Solr query string
* @param core
* the Solr index core holding the Solr schema of the document
* @return true when the document matches the given Solr query
* @throws SyntaxError
* when the query String syntax is not valid
* @throws SolrException when a required query element is missing, or when a problem occurred while accessing the target core
* @throws IllegalArgumentException
* when a parameter is null.
* @see <a href=
* "http://lucene.apache.org/solr/guide/6_6/the-standard-query-parser.html">The
* Solr Standard Query Parser</a>
*/
public static boolean matches(final SolrInputDocument solrDoc, final String query, final SolrCore core)
throws SyntaxError, IllegalArgumentException {
if (solrDoc == null || query == null || core == null) {
throw new IllegalArgumentException("All parameters must be non null");
}
final IndexSchema schema = core.getLatestSchema();
if (schema == null) {
throw new IllegalArgumentException("The core latest schema must not be null");
}
final org.apache.lucene.document.Document luceneDoc = DocumentBuilder.toDocument(solrDoc, schema);
final Analyzer indexAnalyzer = schema.getIndexAnalyzer();
/*
* Using the Lucene RAMDirectory could be an alternative, but it is slower and
* has a larger memory footprint
*/
final MemoryIndex index = MemoryIndex.fromDocument(luceneDoc, indexAnalyzer);
final Query luceneQuery = toLuceneQuery(query, core);
final float score = index.search(luceneQuery);
return score > 0.0f;
}
}
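
For orientation only, a hedged usage sketch of the utility above (the core retrieval mirrors the Crawler_p changes in this commit; the document field and the query string are placeholder values):

// Sketch, not part of the commit: check one document against a filter query.
// Assumes the embedded Solr instance is connected (getEmbeddedInstance() may return null).
final SolrCore core = sb.index.fulltext().getEmbeddedInstance().getDefaultCore();
final SolrInputDocument doc = new SolrInputDocument();
doc.setField("host_s", "example.com"); // placeholder field and value
try {
    if (SingleDocumentMatcher.matches(doc, "host_s:example.com", core)) {
        // the document satisfies the must-match filter and may be indexed
    }
} catch (final SyntaxError e) {
    // the query string is not valid standard Solr syntax
}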

@@ -0,0 +1,177 @@
// SingleDocumentMatcherTest.java
// ---------------------------
// Copyright 2018 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.search.index;
import java.io.File;
import java.io.IOException;
import java.util.Arrays;
import java.util.Calendar;
import java.util.Date;
import java.util.GregorianCalendar;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SyntaxError;
import org.junit.AfterClass;
import org.junit.Assert;
import org.junit.BeforeClass;
import org.junit.Test;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.search.schema.CollectionConfiguration;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphSchema;
/**
* Unit tests for the {@link SingleDocumentMatcher} class.
*/
public class SingleDocumentMatcherTest {
/** Embedded Solr test instance */
private static EmbeddedInstance EMBEDDED_INSTANCE;
/** The configuration of the main Solr collection */
private static CollectionConfiguration COLLECTION_CONFIG;
/**
* Initializes the embedded Solr index used for these tests.
*/
@BeforeClass
public static void initSolr() {
final File solr_config = new File("defaults/solr");
final File storage = new File("test/DATA/INDEX/webportal/SEGMENTS/text/solr/");
storage.mkdirs();
System.out.println("setup EmeddedSolrConnector using config dir: " + solr_config.getAbsolutePath());
try {
SingleDocumentMatcherTest.EMBEDDED_INSTANCE = new EmbeddedInstance(solr_config, storage,
CollectionSchema.CORE_NAME, new String[] { CollectionSchema.CORE_NAME, WebgraphSchema.CORE_NAME });
} catch (final IOException ex) {
Assert.fail("IOException on embedded Solr initialization");
}
final File config = new File("defaults/solr.collection.schema");
try {
SingleDocumentMatcherTest.COLLECTION_CONFIG = new CollectionConfiguration(config, true);
} catch (final IOException e) {
Assert.fail("IOException on collection configuration initialization");
}
}
/**
* Closes the embedded Solr index.
*/
@AfterClass
public static void finalizeTesting() {
SingleDocumentMatcherTest.EMBEDDED_INSTANCE.close();
}
/**
* @throws Exception
* when an unexpected exception occurs
*/
@Test
public void testMatches() throws Exception {
final CollectionConfiguration collectionConfig = SingleDocumentMatcherTest.COLLECTION_CONFIG;
final SolrCore solrCore = SingleDocumentMatcherTest.EMBEDDED_INSTANCE.getDefaultCore();
final SolrInputDocument solrDoc = new SolrInputDocument();
final DigestURL docUrl = new DigestURL("http://example.com/");
/* Using fields active in the defaults/solr.collection.schema */
collectionConfig.add(solrDoc, CollectionSchema.id, ASCII.String(docUrl.hash()));
collectionConfig.add(solrDoc, CollectionSchema.sku, docUrl.toNormalform(true));
collectionConfig.add(solrDoc, CollectionSchema.http_unique_b, true);
collectionConfig.add(solrDoc, CollectionSchema.title, Arrays.asList(new String[] { "Lorem ipsum" }));
collectionConfig.add(solrDoc, CollectionSchema.host_s, "example.com");
collectionConfig.add(solrDoc, CollectionSchema.last_modified, new Date());
collectionConfig.add(solrDoc, CollectionSchema.text_t,
"Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.");
collectionConfig.add(solrDoc, CollectionSchema.size_i, 126);
/* query on the default field */
Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "absent", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "adipisicing", solrCore));
/* query on a multi valued text field */
Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "title:test", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "title:ipsum", solrCore));
/* query on a string field */
Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "host_s:example.org", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "host_s:example.com", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "host_s:example.*", solrCore));
/* query on a boolean field */
Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "http_unique_b:false", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "http_unique_b:true", solrCore));
final Calendar yesterdayCal = new GregorianCalendar();
yesterdayCal.add(Calendar.DAY_OF_MONTH, -1);
final String yesterday = ISO8601Formatter.FORMATTER.format(yesterdayCal.getTime());
final Calendar tomorrowCal = new GregorianCalendar();
tomorrowCal.add(Calendar.DAY_OF_MONTH, 1);
final String tomorrow = ISO8601Formatter.FORMATTER.format(tomorrowCal.getTime());
/* range query on a date field */
Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "last_modified:[" + tomorrow + " TO * ]", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc,
"last_modified:[" + yesterday + " TO " + tomorrow + "]", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "last_modified:[" + yesterday + " TO * ]", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "last_modified:[ * TO " + tomorrow + "]", solrCore));
/* range query on an integer field */
Assert.assertFalse(SingleDocumentMatcher.matches(solrDoc, "size_i:[ 0 TO 50 ]", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "size_i:[ 0 TO * ]", solrCore));
Assert.assertTrue(SingleDocumentMatcher.matches(solrDoc, "size_i:[ * TO 200 ]", solrCore));
}
/**
* @throws Exception
* when an unexpected exception occurs
*/
@Test
public void testMatchesSyntaxError() throws Exception {
final CollectionConfiguration collectionConfig = SingleDocumentMatcherTest.COLLECTION_CONFIG;
final SolrCore solrCore = SingleDocumentMatcherTest.EMBEDDED_INSTANCE.getDefaultCore();
final SolrInputDocument solrDoc = new SolrInputDocument();
collectionConfig.add(solrDoc, CollectionSchema.id, ASCII.String(new DigestURL("http://example.com").hash()));
collectionConfig.add(solrDoc, CollectionSchema.title, Arrays.asList(new String[] { "Lorem ipsum" }));
collectionConfig.add(solrDoc, CollectionSchema.host_s, "example.com");
collectionConfig.add(solrDoc, CollectionSchema.last_modified, new Date());
collectionConfig.add(solrDoc, CollectionSchema.text_t,
"Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.");
collectionConfig.add(solrDoc, CollectionSchema.size_i, 126);
try {
SingleDocumentMatcher.matches(solrDoc, ":", solrCore);
Assert.fail("Should have raised a syntax error");
} catch (final SyntaxError e) {
return;
}
}
}