added a feature to set a collection for a crawl result based on a
regular expression on the url: the collection attribute for a crawl start
may now be either a token or a list of tokens, separated by ',', where a
token is either a string or a pair <string,pattern> in which the string is
separated from the pattern by a ':' and the string is assigned to the
document as collection only if the pattern matches the url.
pull/1/head
Michael Peter Christen 12 years ago
parent 3c5abedabf
commit a88a62f7aa
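For example (values chosen here for illustration, not part of the commit): a crawl started with the collection attribute user puts every document into the collection 'user', while user,wiki:.*wikipedia\.org.* additionally puts those documents whose url matches the pattern into 'wiki'.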

@@ -42,6 +42,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.RSSLoader;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
@@ -62,7 +63,7 @@ public class Load_RSS_p {
final Switchboard sb = (Switchboard)env;
final String collection = post == null ? "user" : CommonPattern.SPACE.matcher(post.get("collection", "user").trim()).replaceAll("");
final String[] collections = collection.length() == 0 ? new String[0] : collection.split(",");
Map<String, Pattern> collections = CrawlProfile.collectionParser(collection);
boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt);
prop.put("showload_collectionEnabled", collectionEnabled ? 1 : 0);
prop.put("showload_collection", collection);

@@ -26,7 +26,9 @@
package net.yacy.crawler.data;
import java.text.DateFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -43,6 +45,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.server.serverObjects;
public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
@@ -259,15 +262,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
//if (r == null) return null;
return r;
}
private Map<String, Pattern> cmap = null;
/**
* get the collections for this crawl
* @return a map of collection names to url match patterns
*/
public String[] collections() {
public Map<String, Pattern> collections() {
if (cmap != null) return cmap;
final String r = get(COLLECTIONS);
if (r == null) return new String[0];
return r.split(",");
this.cmap = collectionParser(r);
return this.cmap;
}
public static Map<String, Pattern> collectionParser(String collectionString) {
if (collectionString == null || collectionString.length() == 0) return new HashMap<String, Pattern>();
String[] cs = collectionString.split(",");
final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
for (String c: cs) {
int p = c.indexOf(':');
if (p < 0) cm.put(c, QueryParams.catchall_pattern); else cm.put(c.substring(0, p), Pattern.compile(c.substring(p + 1)));
}
return cm;
}
/**

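A minimal sketch of how the new collectionParser behaves, assuming the YaCy sources above are on the classpath; the class name CollectionParserDemo, the collection string and the url are hypothetical, and QueryParams.catchall_pattern is assumed to match any url:

import java.util.Map;
import java.util.regex.Pattern;

import net.yacy.crawler.data.CrawlProfile;

public class CollectionParserDemo {
    public static void main(String[] args) {
        // "user" is a plain token: it is mapped to the catch-all pattern
        // and therefore assigned to every document of the crawl;
        // "wiki:<regex>" is assigned only where the url matches the regex.
        Map<String, Pattern> collections =
                CrawlProfile.collectionParser("user,wiki:.*wikipedia\\.org.*");

        String url = "http://en.wikipedia.org/wiki/YaCy";
        for (Map.Entry<String, Pattern> e : collections.entrySet()) {
            if (e.getValue().matcher(url).matches()) {
                System.out.println(url + " -> collection " + e.getKey());
            }
        }
        // for this url both "user" and "wiki" match
    }
}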
@@ -31,6 +31,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSFeed;
@@ -58,10 +59,10 @@ public class RSSLoader extends Thread {
private final DigestURI urlf;
private final Switchboard sb;
private final String[] collections;
private final Map<String, Pattern> collections;
private final ClientIdentification.Agent agent;
public RSSLoader(final Switchboard sb, final DigestURI urlf, final String[] collections, final ClientIdentification.Agent agent) {
public RSSLoader(final Switchboard sb, final DigestURI urlf, final Map<String, Pattern> collections, final ClientIdentification.Agent agent) {
this.sb = sb;
this.urlf = urlf;
this.collections = collections;
@@ -93,7 +94,7 @@ public class RSSLoader extends Thread {
recordAPI(this.sb, null, this.urlf, feed, 7, "seldays");
}
public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, String[] collections) {
public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, Map<String, Pattern> collections) {
int loadCount = 0;
List<DigestURI> list = new ArrayList<DigestURI>();
Map<String, DigestURI> urlmap = new HashMap<String, DigestURI>();

@@ -2685,7 +2685,7 @@ public final class Switchboard extends serverSwitch {
private void storeDocumentIndex(
final Response queueEntry,
final String[] collections,
final Map<String, Pattern> collections,
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
@@ -2808,7 +2808,7 @@ public final class Switchboard extends serverSwitch {
final Map<DigestURI, String> links,
final SearchEvent searchEvent,
final String heuristicName,
final String[] collections) {
final Map<String, Pattern> collections) {
List<DigestURI> urls = new ArrayList<DigestURI>();
// add the landing page to the index. should not load that again since it should be in the cache
@@ -2978,7 +2978,7 @@ public final class Switchboard extends serverSwitch {
* @throws IOException
* @throws Parser.Failure
*/
public void addToIndex(final Collection<DigestURI> urls, final SearchEvent searchEvent, final String heuristicName, final String[] collections) {
public void addToIndex(final Collection<DigestURI> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections) {
Map<String, DigestURI> urlmap = new HashMap<String, DigestURI>();
for (DigestURI url: urls) urlmap.put(ASCII.String(url.hash()), url);
if (searchEvent != null) {
@@ -3421,7 +3421,7 @@ public final class Switchboard extends serverSwitch {
}
// add all pages to the index
addAllToIndex(url, links, searchEvent, "site", new String[]{"site"});
addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site"));
}
} catch (final Throwable e ) {
ConcurrentLog.logException(e);
@@ -3535,7 +3535,7 @@ public final class Switchboard extends serverSwitch {
+ feedName
+ "' rss feed");
// add all pages to the index
addAllToIndex(null, links, searchEvent, feedName, new String[]{"rss"});
addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"));
}
} catch (final Throwable e ) {
//Log.logException(e);

@@ -36,6 +36,7 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
@@ -575,7 +576,7 @@ public class Segment {
public SolrInputDocument storeDocument(
final DigestURI url,
final DigestURI referrerURL,
final String[] collections,
final Map<String, Pattern> collections,
final ResponseHeader responseHeader,
final Document document,
final Condenser condenser,

@@ -42,6 +42,7 @@ import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@@ -195,7 +196,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
return sd;
}
public void addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURI digestURI, final char doctype) {
/**
* add uri attributes to solr document
* @param doc
* @param allAttr
* @param digestURI
* @param doctype
* @return the normalized url
*/
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURI digestURI, final char doctype) {
add(doc, CollectionSchema.id, ASCII.String(digestURI.hash()));
String us = digestURI.toNormalform(true);
add(doc, CollectionSchema.sku, us);
@@ -236,6 +245,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
if (allAttr || contains(CollectionSchema.url_parameter_key_sxt)) add(doc, CollectionSchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
if (allAttr || contains(CollectionSchema.url_parameter_value_sxt)) add(doc, CollectionSchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
}
return us;
}
public SolrInputDocument metadata2solr(final URIMetadataRow md) {
@@ -346,7 +356,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
}
public SolrVector yacy2solr(
final String id, final String[] collections, final ResponseHeader responseHeader,
final String id, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURI referrerURL, final String language,
final IndexCell<CitationReference> citations,
final WebgraphConfiguration webgraph) {
@@ -354,7 +364,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
SolrVector doc = new SolrVector();
final DigestURI digestURI = document.dc_source();
boolean allAttr = this.isEmpty();
addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
@@ -378,7 +388,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.length > 0) add(doc, CollectionSchema.collection_sxt, collections);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
add(doc, CollectionSchema.collection_sxt, cs);
}
List<String> titles = document.titles();
if (allAttr || contains(CollectionSchema.title)) {
@@ -1166,19 +1182,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
* @param httpstatus
* @throws IOException
*/
public SolrInputDocument err(final DigestURI digestURI, final String[] collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
public SolrInputDocument err(final DigestURI digestURI, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
boolean allAttr = this.isEmpty();
assert allAttr || contains(CollectionSchema.failreason_s);
final SolrInputDocument doc = new SolrInputDocument();
addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, new Date());
// fail reason and status
if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, failReason);
if (allAttr || contains(CollectionSchema.failtype_s)) add(doc, CollectionSchema.failtype_s, failType.name());
if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, httpstatus);
if (allAttr || contains(CollectionSchema.collection_sxt)) add(doc, CollectionSchema.collection_sxt, collections);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
add(doc, CollectionSchema.collection_sxt, cs);
}
return doc;
}
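Design note: collection_sxt is now computed per document in both the regular and the error path; a configured collection name is attached only when its pattern matches the normalized url returned by addURIAttributes, and plain tokens always match because collectionParser maps them to the catch-all pattern.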

@@ -37,6 +37,7 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@@ -114,7 +115,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serializable
public void addEdges(
final Subgraph subgraph,
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
final DigestURI source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
final boolean inbound, final Set<DigestURI> links,
final IndexCell<CitationReference> citations) {
@@ -146,11 +147,17 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serializable
add(edge, WebgraphSchema.load_date_dt, loadDate);
}
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
add(edge, WebgraphSchema.collection_sxt, collections);
final String source_url_string = source.toNormalform(false);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
}
add(edge, WebgraphSchema.collection_sxt, cs);
}
// add the source attributes
add(edge, WebgraphSchema.source_id_s, source_id);
final String source_url_string = source.toNormalform(false);
int pr_source = source_url_string.indexOf("://",0);
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
