added a feature to set a collection for a crawl result based on a
regular expression on the url: the collection attribute for a crawl start
may now be either a token or a list of tokens, separated by ',', where a
token is either a string or a pair <string,pattern> in which the string is
separated from the pattern by a ':' and the string is assigned to the
document as collection only if the pattern matches the url.
pull/1/head
Michael Peter Christen 12 years ago
parent 3c5abedabf
commit a88a62f7aa
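For example (values chosen here for illustration, not part of the commit): a crawl started with the collection attribute user puts every document into the collection 'user', while user,wiki:.*wikipedia\.org.* additionally puts those documents whose url matches the pattern into 'wiki'.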

@@ -42,6 +42,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.HarvestProcess;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.RSSLoader;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
@@ -62,7 +63,7 @@ public class Load_RSS_p {
final Switchboard sb = (Switchboard)env;
final String collection = post == null ? "user" : CommonPattern.SPACE.matcher(post.get("collection", "user").trim()).replaceAll("");
final String[] collections = collection.length() == 0 ? new String[0] : collection.split(",");
Map<String, Pattern> collections = CrawlProfile.collectionParser(collection);
boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt);
prop.put("showload_collectionEnabled", collectionEnabled ? 1 : 0);
prop.put("showload_collection", collection);

@@ -26,7 +26,9 @@
package net.yacy.crawler.data;
import java.text.DateFormat;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
@@ -43,6 +45,7 @@ import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlSwitchboard;
import net.yacy.kelondro.data.word.Word;
import net.yacy.search.query.QueryParams;
import net.yacy.server.serverObjects;
public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String> {
@@ -259,15 +262,29 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements Map<String, String>
//if (r == null) return null;
return r;
}
private Map<String, Pattern> cmap = null;
/**
* get the collections for this crawl
* @return a map of collection names to url match patterns
*/
public String[] collections() {
public Map<String, Pattern> collections() {
if (cmap != null) return cmap;
final String r = get(COLLECTIONS);
if (r == null) return new String[0];
return r.split(",");
this.cmap = collectionParser(r);
return this.cmap;
}
public static Map<String, Pattern> collectionParser(String collectionString) {
if (collectionString == null || collectionString.length() == 0) return new HashMap<String, Pattern>();
String[] cs = collectionString.split(",");
final Map<String, Pattern> cm = new LinkedHashMap<String, Pattern>();
for (String c: cs) {
int p = c.indexOf(':');
if (p < 0) cm.put(c, QueryParams.catchall_pattern); else cm.put(c.substring(0, p), Pattern.compile(c.substring(p + 1)));
}
return cm;
}
/**

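A minimal sketch of how the new collectionParser behaves, assuming the YaCy sources above are on the classpath; the class name CollectionParserDemo, the collection string and the url are hypothetical, and QueryParams.catchall_pattern is assumed to match any url:

import java.util.Map;
import java.util.regex.Pattern;

import net.yacy.crawler.data.CrawlProfile;

public class CollectionParserDemo {
    public static void main(String[] args) {
        // "user" is a plain token: it is mapped to the catch-all pattern
        // and therefore assigned to every document of the crawl;
        // "wiki:<regex>" is assigned only where the url matches the regex.
        Map<String, Pattern> collections =
                CrawlProfile.collectionParser("user,wiki:.*wikipedia\\.org.*");

        String url = "http://en.wikipedia.org/wiki/YaCy";
        for (Map.Entry<String, Pattern> e : collections.entrySet()) {
            if (e.getValue().matcher(url).matches()) {
                System.out.println(url + " -> collection " + e.getKey());
            }
        }
        // for this url both "user" and "wiki" match
    }
}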
@@ -31,6 +31,7 @@ import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.RSSFeed;
@@ -58,10 +59,10 @@ public class RSSLoader extends Thread {
private final DigestURI urlf;
private final Switchboard sb;
private final String[] collections;
private final Map<String, Pattern> collections;
private final ClientIdentification.Agent agent;
public RSSLoader(final Switchboard sb, final DigestURI urlf, final String[] collections, final ClientIdentification.Agent agent) {
public RSSLoader(final Switchboard sb, final DigestURI urlf, final Map<String, Pattern> collections, final ClientIdentification.Agent agent) {
this.sb = sb;
this.urlf = urlf;
this.collections = collections;
@@ -93,7 +94,7 @@ public class RSSLoader extends Thread {
recordAPI(this.sb, null, this.urlf, feed, 7, "seldays");
}
public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, String[] collections) {
public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, Map<String, Pattern> collections) {
int loadCount = 0;
List<DigestURI> list = new ArrayList<DigestURI>();
Map<String, DigestURI> urlmap = new HashMap<String, DigestURI>();

@@ -2685,7 +2685,7 @@ public final class Switchboard extends serverSwitch {
private void storeDocumentIndex(
final Response queueEntry,
final String[] collections,
final Map<String, Pattern> collections,
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
@@ -2808,7 +2808,7 @@ public final class Switchboard extends serverSwitch {
final Map<DigestURI, String> links,
final SearchEvent searchEvent,
final String heuristicName,
final String[] collections) {
final Map<String, Pattern> collections) {
List<DigestURI> urls = new ArrayList<DigestURI>();
// add the landing page to the index. should not load that again since it should be in the cache
@@ -2978,7 +2978,7 @@ public final class Switchboard extends serverSwitch {
* @throws IOException
* @throws Parser.Failure
*/
public void addToIndex(final Collection<DigestURI> urls, final SearchEvent searchEvent, final String heuristicName, final String[] collections) {
public void addToIndex(final Collection<DigestURI> urls, final SearchEvent searchEvent, final String heuristicName, final Map<String, Pattern> collections) {
Map<String, DigestURI> urlmap = new HashMap<String, DigestURI>();
for (DigestURI url: urls) urlmap.put(ASCII.String(url.hash()), url);
if (searchEvent != null) {
@@ -3421,7 +3421,7 @@ public final class Switchboard extends serverSwitch {
}
// add all pages to the index
addAllToIndex(url, links, searchEvent, "site", new String[]{"site"});
addAllToIndex(url, links, searchEvent, "site", CrawlProfile.collectionParser("site"));
}
} catch (final Throwable e ) {
ConcurrentLog.logException(e);
@@ -3535,7 +3535,7 @@ public final class Switchboard extends serverSwitch {
+ feedName
+ "' rss feed");
// add all pages to the index
addAllToIndex(null, links, searchEvent, feedName, new String[]{"rss"});
addAllToIndex(null, links, searchEvent, feedName, CrawlProfile.collectionParser("rss"));
}
} catch (final Throwable e ) {
//Log.logException(e);

@@ -36,6 +36,7 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
@@ -575,7 +576,7 @@ public class Segment {
public SolrInputDocument storeDocument(
final DigestURI url,
final DigestURI referrerURL,
final String[] collections,
final Map<String, Pattern> collections,
final ResponseHeader responseHeader,
final Document document,
final Condenser condenser,

@@ -42,6 +42,7 @@ import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
@@ -195,7 +196,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
return sd;
}
public void addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURI digestURI, final char doctype) {
/**
* add uri attributes to solr document
* @param doc
* @param allAttr
* @param digestURI
* @param doctype
* @return the normalized url
*/
public String addURIAttributes(final SolrInputDocument doc, final boolean allAttr, final DigestURI digestURI, final char doctype) {
add(doc, CollectionSchema.id, ASCII.String(digestURI.hash()));
String us = digestURI.toNormalform(true);
add(doc, CollectionSchema.sku, us);
@@ -236,6 +245,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
if (allAttr || contains(CollectionSchema.url_parameter_key_sxt)) add(doc, CollectionSchema.url_parameter_key_sxt, searchpart.keySet().toArray(new String[searchpart.size()]));
if (allAttr || contains(CollectionSchema.url_parameter_value_sxt)) add(doc, CollectionSchema.url_parameter_value_sxt, searchpart.values().toArray(new String[searchpart.size()]));
}
return us;
}
public SolrInputDocument metadata2solr(final URIMetadataRow md) {
@@ -346,7 +356,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
}
public SolrVector yacy2solr(
final String id, final String[] collections, final ResponseHeader responseHeader,
final String id, final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURI referrerURL, final String language,
final IndexCell<CitationReference> citations,
final WebgraphConfiguration webgraph) {
@@ -354,7 +364,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
SolrVector doc = new SolrVector();
final DigestURI digestURI = document.dc_source();
boolean allAttr = this.isEmpty();
addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
@@ -378,7 +388,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
processTypes.add(ProcessType.CITATION); // postprocessing needed
}
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.length > 0) add(doc, CollectionSchema.collection_sxt, collections);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
add(doc, CollectionSchema.collection_sxt, cs);
}
List<String> titles = document.titles();
if (allAttr || contains(CollectionSchema.title)) {
@@ -1166,19 +1182,25 @@ public class CollectionConfiguration extends SchemaConfiguration implements Serializable
* @param httpstatus
* @throws IOException
*/
public SolrInputDocument err(final DigestURI digestURI, final String[] collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
public SolrInputDocument err(final DigestURI digestURI, final Map<String, Pattern> collections, final String failReason, final FailType failType, final int httpstatus) throws IOException {
boolean allAttr = this.isEmpty();
assert allAttr || contains(CollectionSchema.failreason_s);
final SolrInputDocument doc = new SolrInputDocument();
addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
String url = addURIAttributes(doc, allAttr, digestURI, Response.docType(digestURI));
if (allAttr || contains(CollectionSchema.load_date_dt)) add(doc, CollectionSchema.load_date_dt, new Date());
// fail reason and status
if (allAttr || contains(CollectionSchema.failreason_s)) add(doc, CollectionSchema.failreason_s, failReason);
if (allAttr || contains(CollectionSchema.failtype_s)) add(doc, CollectionSchema.failtype_s, failType.name());
if (allAttr || contains(CollectionSchema.httpstatus_i)) add(doc, CollectionSchema.httpstatus_i, httpstatus);
if (allAttr || contains(CollectionSchema.collection_sxt)) add(doc, CollectionSchema.collection_sxt, collections);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(url).matches()) cs.add(e.getKey());
}
add(doc, CollectionSchema.collection_sxt, cs);
}
return doc;
}
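Design note: collection_sxt is now computed per document in both the regular and the error path; a configured collection name is attached only when its pattern matches the normalized url returned by addURIAttributes, and plain tokens always match because collectionParser maps them to the catch-all pattern.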

@@ -37,6 +37,7 @@ import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.regex.Pattern;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@@ -114,7 +115,7 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serializable
public void addEdges(
final Subgraph subgraph,
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
final DigestURI source, final ResponseHeader responseHeader, Map<String, Pattern> collections, int clickdepth_source,
final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
final boolean inbound, final Set<DigestURI> links,
final IndexCell<CitationReference> citations) {
@@ -146,11 +147,17 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serializable
add(edge, WebgraphSchema.load_date_dt, loadDate);
}
if (allAttr || contains(WebgraphSchema.last_modified)) add(edge, WebgraphSchema.last_modified, responseHeader == null ? new Date() : responseHeader.lastModified());
add(edge, WebgraphSchema.collection_sxt, collections);
final String source_url_string = source.toNormalform(false);
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.size() > 0) {
List<String> cs = new ArrayList<String>();
for (Map.Entry<String, Pattern> e: collections.entrySet()) {
if (e.getValue().matcher(source_url_string).matches()) cs.add(e.getKey());
}
add(edge, WebgraphSchema.collection_sxt, cs);
}
// add the source attributes
add(edge, WebgraphSchema.source_id_s, source_id);
final String source_url_string = source.toNormalform(false);
int pr_source = source_url_string.indexOf("://",0);
if (allAttr || contains(WebgraphSchema.source_protocol_s)) add(edge, WebgraphSchema.source_protocol_s, source_url_string.substring(0, pr_source));
if (allAttr || contains(WebgraphSchema.source_urlstub_s)) add(edge, WebgraphSchema.source_urlstub_s, source_url_string.substring(pr_source + 3));
