Added a collection attribute to crawls and searches:

- a Solr field collection_sxt can be used to store a set of crawl tags
- when this field is activated, a crawl tag can be assigned when crawls are started
- the content of the collection field can be comma-separated; all of the given tags are assigned to the documents that are indexed as a result of such a crawl start
- a search result can be drilled down to a specific collection; this is currently only available in the Solr interface and in the GSA interface using the 'site' option
- this adds a mandatory field for GSA queries (the Google API demands that field at all times)
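For illustration (the collection name 'intranet' is made up; the real tag is whatever was entered at crawl start), the drill-down works like this: a GSA-style request such as

    /gsa/search?q=www&site=intranet

is rewritten internally into the Solr query

    www AND collection_sxt:intranet

and the same restriction can be expressed directly against the embedded Solr interface with a standard filter query such as fq=collection_sxt:intranet. The 'site' value is concatenated into the query string verbatim (see the searchresult.java hunk below), so collection names are best kept to plain alphanumeric tokens.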
pull/1/head
Michael Peter Christen 13 years ago
parent 174530a9e0
commit b2b516cc3e

@ -140,6 +140,9 @@ h6_txt
### optional values, not part of standard YaCy handling (but useful for external applications)
## tags that are attached to crawls/index generation to separate the search result into user-defined subsets
#collection_sxt
## tags of css entries, normalized with absolute URL
#css_tag_txt

@ -564,6 +564,13 @@ crawlingIPMustNotMatch=
# the default country codes are all codes for countries in Europe
crawlingCountryMustMatch=AD,AL,AT,BA,BE,BG,BY,CH,CY,CZ,DE,DK,EE,ES,FI,FO,FR,GG,GI,GR,HR,HU,IE,IM,IS,IT,JE,LI,LT,LU,LV,MC,MD,MK,MT,NL,NO,PL,PT,RO,RU,SE,SI,SJ,SK,SM,TR,UA,UK,VA,YU
# collections for index data separation
# These collections can be used to produce search tenants.
# The collection is used as the 'site' parameter in the GSA interface.
# Collections are assigned at crawl time and defined in the crawl start.
# The YaCySchema field collection_sxt must be switched on to use this feature.
collection=user
# performance-settings
# delay-times for permanent loops (milliseconds)
# the idlesleep is the pause that a process sleeps if the last call to the

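As an illustration of the comma-separated form mentioned in the commit message (tag names are made up), the collection default in yacy.conf above can carry several tags at once; every document indexed by a crawl that falls back to this default then receives all of them in collection_sxt:

    collection=user,intranet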
@ -296,6 +296,15 @@
check this box.
</td>
</tr>
<tr valign="top" class="TableCellLight">
<td><label for="collection">Add Crawl result to collection(s)</label>:</td>
<td>
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
</td>
<td>
A crawl result can be tagged with names which are candidates for a collection request. These tags can be selected with the <a href="/gsa/search?q=www&site=#[collection]#">GSA interface</a> using the 'site' operator. To use this option, the 'collection_sxt' field must be switched on in the <a href="/IndexFederated_p.html">Solr Schema</a>.
</td>
</tr>
<!--
<tr valign="top" class="TableCellDark">
<td>Exclude <em>dynamic</em> Stop-Words</td>
@ -314,8 +323,8 @@
</td>
</tr>
-->
<tr valign="top" class="TableCellLight">
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" /></td>
<tr valign="top" class="TableCellSummary">
<td colspan="5"><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/></td>
</tr>
</table>
</form>
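A note on the #(collectionEnabled)#...::...#(/collectionEnabled)# conditional inside the collection input above, assuming YaCy's usual template convention where the alternative before '::' is emitted for property value 0 and the one after it for value 1: the servlet sets collectionEnabled=0 when collection_sxt is not switched on in the Solr schema, so the field is then rendered disabled and stays editable otherwise:

    <input name="collection" id="collection" type="text" ... disabled="disabled" />   (collectionEnabled = 0, schema field off)
    <input name="collection" id="collection" type="text" ... />                       (collectionEnabled = 1, schema field on)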

@ -27,6 +27,7 @@
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.YaCySchema;
import de.anomic.crawler.CrawlProfile;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@ -79,6 +80,10 @@ public class CrawlStartExpert_p {
prop.put("xdstopwChecked", env.getConfigBool("xdstopw", true) ? "1" : "0");
prop.put("xpstopwChecked", env.getConfigBool("xpstopw", true) ? "1" : "0");
boolean collectionEnabled = sb.index.fulltext().getSolrScheme().isEmpty() || sb.index.fulltext().getSolrScheme().contains(YaCySchema.collection_sxt);
prop.put("collectionEnabled", collectionEnabled ? 1 : 0);
prop.put("collection", collectionEnabled ? sb.getConfig("collection", "user") : "");
// return rewrite properties
return prop;
}

@ -104,10 +104,10 @@
<input type="hidden" name="xsstopw" id="xsstopw" value="on" />
<input type="hidden" name="xdstopw" id="xdstopw" value="off" />
<input type="hidden" name="xpstopw" id="xpstopw" value="off" />
<input type="hidden" name="collection" id="collection" value="" />
</dd>
<!-- <dt>&nbsp;</dt><dd>&nbsp;</dd><dt>&nbsp;</dt><dd>&nbsp;</dd> -->
<dt><label>Start</label></dt>
<dd><input type="submit" name="crawlingstart" value="Start New Crawl" />
<dd><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/>
</dd>
</dl>

@ -53,7 +53,6 @@ import net.yacy.peers.NewsPool;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.Segment;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.SitemapImporter;
@ -95,9 +94,6 @@ public class Crawler_p {
prop.put("list-remote", 0);
prop.put("forwardToCrawlStart", "0");
// get segment
Segment indexSegment = sb.index;
prop.put("info", "0");
if (post != null && post.containsKey("continue")) {
@ -198,6 +194,9 @@ public class Crawler_p {
final boolean directDocByURL = "on".equals(post.get("directDocByURL", "on")); // catch also all linked media documents without loading them
env.setConfig("crawlingDirectDocByURL", directDocByURL);
final String collection = post.get("collection", sb.getConfig("collection", "user"));
env.setConfig("collection", collection);
// recrawl
final String recrawl = post.get("recrawl", "nodoubles"); // nodoubles, reload, scheduler
boolean crawlingIfOlderCheck = "on".equals(post.get("crawlingIfOlderCheck", "off"));
@ -284,7 +283,8 @@ public class Crawler_p {
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
cachePolicy,
collection);
sb.crawler.putActive(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final DigestURI url = crawlingStartURL;
@ -319,7 +319,7 @@ public class Crawler_p {
// first delete old entry, if exists
final DigestURI url = new DigestURI(crawlingStart);
final byte[] urlhash = url.hash();
indexSegment.fulltext().remove(urlhash);
sb.index.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
@ -349,7 +349,8 @@ public class Crawler_p {
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
cachePolicy,
collection);
sb.crawler.putActive(pe.handle().getBytes(), pe);
final String reasonString = sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),
@ -496,7 +497,8 @@ public class Crawler_p {
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
cachePolicy,
collection);
sb.crawler.putActive(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
sb.crawlStacker.enqueueEntriesAsynchronous(sb.peers.mySeed().hash.getBytes(), profile.handle(), hyperlinks);
@ -537,7 +539,8 @@ public class Crawler_p {
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
cachePolicy,
collection);
sb.crawler.putActive(pe.handle().getBytes(), pe);
final SitemapImporter importer = new SitemapImporter(sb, sitemapURL, pe);
importer.start();
@ -581,7 +584,8 @@ public class Crawler_p {
xsstopw,
xdstopw,
xpstopw,
cachePolicy);
cachePolicy,
collection);
sb.crawler.putActive(profile.handle().getBytes(), profile);
sb.pauseCrawlJob(SwitchboardConstants.CRAWLJOB_LOCAL_CRAWL);
final Iterator<Map.Entry<MultiProtocolURI, Properties>> linkiterator = hyperlinks.entrySet().iterator();
@ -592,7 +596,7 @@ public class Crawler_p {
nexturl = new DigestURI(e.getKey());
// remove the url from the database to be prepared to crawl them again
final byte[] urlhash = nexturl.hash();
indexSegment.fulltext().remove(urlhash);
sb.index.fulltext().remove(urlhash);
sb.crawlQueues.noticeURL.removeByURLHash(urlhash);
sb.crawlQueues.errorURL.remove(urlhash);
sb.crawlStacker.enqueueEntry(new Request(
@ -631,7 +635,6 @@ public class Crawler_p {
prop.put("crawlingSpeedMinChecked", (LCppm <= 10) ? "1" : "0");
prop.put("customPPMdefault", Integer.toString(LCppm));
// generate crawl profile table
int count = 0;
boolean dark = true;
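A short worked trace of the Crawler_p.java changes above (made-up input): if the crawl-start form posts collection=wiki,user then the profile stores "wiki,user" (blanks stripped), every document of that crawl receives the collection_sxt values wiki and user, and because env.setConfig("collection", collection) writes the value back, "wiki,user" is also the prefilled default offered at the next crawl start.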

@ -105,6 +105,7 @@ public class QuickCrawlLink_p {
final boolean xsstopw = post.get("xsstopw", "").equals("on");
final boolean xdstopw = post.get("xdstopw", "").equals("on");
final boolean xpstopw = post.get("xpstopw", "").equals("on");
final String collection = post.get("collection", "user");
prop.put("mode_url", (crawlingStart == null) ? "unknown" : crawlingStart);
prop.putHTML("mode_title", (title == null) ? "unknown" : title);
@ -151,7 +152,8 @@ public class QuickCrawlLink_p {
xsstopw,
xdstopw,
xpstopw,
CacheStrategy.IFFRESH);
CacheStrategy.IFFRESH,
collection);
sb.crawler.putActive(pe.handle().getBytes(), pe);
} catch (final Exception e) {
// mist

@ -268,7 +268,8 @@ public class import_ymark {
"", depth, medialink,
CrawlProfile.getRecrawlDate(CrawlSwitchboard.CRAWL_PROFILE_PROXY_RECRAWL_CYCLE), -1, crawlingQ,
true, true, true, false, true, true, true,
CacheStrategy.IFFRESH);
CacheStrategy.IFFRESH,
"robot_import");
sb.crawler.putActive(pe.handle().getBytes(), pe);
return sb.crawlStacker.stackCrawl(new Request(
sb.peers.mySeed().hash.getBytes(),

@ -121,6 +121,11 @@ public class searchresult {
String access = post.remove("access");
String entqr = post.remove("entqr");
if (site != null && site.length() > 0) {
q = q + " AND " + YaCySchema.collection_sxt.name() + ":" + site;
post.put(CommonParams.Q, q);
}
// get the embedded connector
EmbeddedSolrConnector connector = (EmbeddedSolrConnector) sb.index.fulltext().getLocalSolr();
if (connector == null) return null;

@ -75,6 +75,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
public static final String FILTER_IP_MUSTMATCH = "crawlingIPMustMatch";
public static final String FILTER_IP_MUSTNOTMATCH = "crawlingIPMustNotMatch";
public static final String FILTER_COUNTRY_MUSTMATCH = "crawlingCountryMustMatch";
public static final String COLLECTIONS = "collections";
private Pattern urlmustmatch = null, urlmustnotmatch = null, ipmustmatch = null, ipmustnotmatch = null;
@ -120,6 +121,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
* @param xdstopw true if dynamic stop words shall be ignored
* @param xpstopw true if parent stop words shall be ignored
* @param cacheStrategy determines if and how cache is used loading content
* @param collections a comma-separated list of tags which are attached to index entries
*/
public CrawlProfile(
final String name,
@ -141,7 +143,8 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
final boolean xsstopw,
final boolean xdstopw,
final boolean xpstopw,
final CacheStrategy cacheStrategy) {
final CacheStrategy cacheStrategy,
final String collections) {
super(40);
if (name == null || name.isEmpty()) {
throw new NullPointerException("name must not be null or empty");
@ -172,6 +175,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
put(XDSTOPW, xdstopw); // exclude dynamic stop-word
put(XPSTOPW, xpstopw); // exclude parent stop-words
put(CACHE_STRAGEGY, cacheStrategy.toString());
put(COLLECTIONS, collections.trim().replaceAll(" ", ""));
}
/**
@ -184,7 +188,6 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
this.doms = new ConcurrentHashMap<String, DomProfile>();
}
public void domInc(final String domain, final String referrer, final int depth) {
final DomProfile dp = this.doms.get(domain);
if (dp == null) {
@ -259,6 +262,16 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
return r;
}
/**
* get the collections for this crawl
* @return a list of collection names
*/
public String[] collections() {
final String r = get(COLLECTIONS);
if (r == null) return new String[0];
return r.split(",");
}
/**
* Gets the name of the CrawlProfile.
* @return name of the profile
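A small caveat on the accessor above (plain java.lang.String semantics, nothing YaCy-specific): the constructor stores the tags with blanks stripped, and collections() returns an empty array only when the COLLECTIONS key is missing entirely; an empty stored value still yields a one-element array containing the empty string, because that is how String.split behaves:

    "user, wiki,intranet".trim().replaceAll(" ", "")   // stored as "user,wiki,intranet"
    "user,wiki,intranet".split(",")                    // {"user", "wiki", "intranet"}
    "".split(",")                                      // {""}  (length 1, not 0)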

@ -291,7 +291,7 @@ public final class CrawlSwitchboard
// generate new default entry for proxy crawling
this.defaultProxyProfile =
new CrawlProfile(
"proxy",
CRAWL_PROFILE_PROXY,
null,
CrawlProfile.MATCH_ALL_STRING,
CrawlProfile.MATCH_NEVER_STRING,
@ -310,7 +310,8 @@ public final class CrawlSwitchboard
true,
true,
true,
CacheStrategy.IFFRESH);
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_PROXY);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultProxyProfile.handle()),
this.defaultProxyProfile);
@ -338,7 +339,8 @@ public final class CrawlSwitchboard
true,
true,
false,
CacheStrategy.IFFRESH);
CacheStrategy.IFFRESH,
"robot_" + CRAWL_PROFILE_REMOTE);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultRemoteProfile.handle()),
this.defaultRemoteProfile);
@ -366,7 +368,8 @@ public final class CrawlSwitchboard
true,
true,
false,
CacheStrategy.IFEXIST);
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_TEXT);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetLocalProfile.handle()),
this.defaultTextSnippetLocalProfile);
@ -394,7 +397,8 @@ public final class CrawlSwitchboard
true,
true,
false,
CacheStrategy.IFEXIST);
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_TEXT);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultTextSnippetGlobalProfile.handle()),
this.defaultTextSnippetGlobalProfile);
@ -423,7 +427,8 @@ public final class CrawlSwitchboard
true,
true,
false,
CacheStrategy.IFEXIST);
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_LOCAL_MEDIA);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetLocalProfile.handle()),
this.defaultMediaSnippetLocalProfile);
@ -451,7 +456,8 @@ public final class CrawlSwitchboard
true,
true,
false,
CacheStrategy.IFEXIST);
CacheStrategy.IFEXIST,
"robot_" + CRAWL_PROFILE_SNIPPET_GLOBAL_MEDIA);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultMediaSnippetGlobalProfile.handle()),
this.defaultMediaSnippetGlobalProfile);
@ -479,7 +485,8 @@ public final class CrawlSwitchboard
true,
true,
false,
CacheStrategy.NOCACHE);
CacheStrategy.NOCACHE,
"robot_" + CRAWL_PROFILE_SURROGATE);
this.profilesActiveCrawls.put(
UTF8.getBytes(this.defaultSurrogateProfile.handle()),
this.defaultSurrogateProfile);
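A side effect of the default-profile changes above: documents fetched by YaCy's internal crawls (proxy, remote, snippet and media fetching, surrogates, and import_ymark's "robot_import") are now tagged with a 'robot_' prefixed collection. Assuming collection_sxt is switched on, they can therefore be told apart from user-started crawls in Solr, for example excluded with a negated prefix query:

    -collection_sxt:robot_*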

@ -162,6 +162,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
// write header
writer.write(XML_START);
String query = request.getParams().get("q");
String site = (String) context.get("site");
OpensearchResponseWriter.solitaireTag(writer, "TM", Long.toString(System.currentTimeMillis() - start));
OpensearchResponseWriter.solitaireTag(writer, "Q", query);
paramTag(writer, "sort", (String) context.get("sort"));
@ -170,7 +171,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
paramTag(writer, "oe", "UTF-8");
paramTag(writer, "client", (String) context.get("client"));
paramTag(writer, "q", request.getParams().get("q"));
paramTag(writer, "site", (String) context.get("site"));
paramTag(writer, "site", site);
paramTag(writer, "start", Integer.toString(resHead.offset));
paramTag(writer, "num", Integer.toString(resHead.rows));
paramTag(writer, "ip", (String) context.get("ip"));

@ -2560,6 +2560,7 @@ public final class Switchboard extends serverSwitch
queueEntry.lastModified(),
new Date(),
queueEntry.size(),
queueEntry.profile(),
queueEntry.getResponseHeader(),
document,
condenser,

@ -173,6 +173,7 @@ public class DocumentIndex extends Segment {
new Date(),
url.length(),
null,
null,
document,
condenser,
null,

@ -68,6 +68,7 @@ import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.Switchboard;
import net.yacy.search.query.RWIProcess;
import net.yacy.search.query.SearchEvent;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
import de.anomic.crawler.retrieval.Response;
@ -345,6 +346,7 @@ public class Segment {
Date modDate,
final Date loadDate,
final long sourcesize,
final CrawlProfile profile,
final ResponseHeader responseHeader,
final Document document,
final Condenser condenser,
@ -394,7 +396,7 @@ public class Segment {
// we do not store the data in metadatadb any more if a solr is connected
if (this.fulltext.connectedSolr()) {
try {
this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, responseHeader, document, metadata));
this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, metadata));
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}

@ -59,6 +59,7 @@ import org.apache.solr.client.solrj.util.ClientUtils;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.Response;
public class SolrConfiguration extends ConfigurationSet implements Serializable {
@ -105,7 +106,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
this.lazy = lazy;
}
private boolean contains(YaCySchema field) {
public boolean contains(YaCySchema field) {
return this.contains(field.name());
}
@ -332,7 +333,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public SolrInputDocument yacy2solr(final String id, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) {
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) {
// we use the SolrCell design as index scheme
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
@ -345,6 +346,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
final InetAddress address = digestURI.getInetAddress();
if (address != null) add(doc, YaCySchema.ip_s, address.getHostAddress());
}
if ((allAttr || contains(YaCySchema.collection_sxt)) && profile != null) add(doc, YaCySchema.collection_sxt, profile.collections());
if (allAttr || contains(YaCySchema.url_protocol_s)) add(doc, YaCySchema.url_protocol_s, digestURI.getProtocol());
Map<String, String> searchpart = digestURI.getSearchpartMap();
if (searchpart == null) {

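For reference, a minimal sketch in plain SolrJ (hypothetical tag values; presumably what YaCy's add() helper boils down to for a String[] argument) of how the multi-valued collection_sxt field ends up in the Solr document once the profile's tags are split:

    import org.apache.solr.common.SolrInputDocument;

    final SolrInputDocument doc = new SolrInputDocument();
    for (final String tag : new String[] {"user", "wiki"}) {  // stand-in for profile.collections()
        doc.addField("collection_sxt", tag);                  // each call adds one value to the multi-valued field
    }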
@ -80,7 +80,8 @@ public enum YaCySchema implements Schema {
h5_txt(SolrType.text_general, true, true, true, "h5 header"),
h6_txt(SolrType.text_general, true, true, true, "h6 header"),
// optional values
// optional values, not part of standard YaCy handling (but useful for external applications)
collection_sxt(SolrType.string, true, true, true, "tags that are attached to crawls/index generation to separate the search result into user-defined subsets"),
csscount_i(SolrType.integer, true, true, false, "number of entries in css_tag_txt and css_url_txt"),
css_tag_txt(SolrType.text_general, true, true, true, "full css tag with normalized url"),
css_url_txt(SolrType.text_general, true, true, true, "normalized urls within a css tag"),
