From c0910001659ebc46398941dec9e2abad8291608e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 24 Apr 2013 01:14:35 +0200 Subject: [PATCH] added collection attribute also to the rss feed reader --- htroot/CrawlStartSite_p.html | 30 +++++++++-------- htroot/Load_RSS_p.html | 32 ++++++++++--------- htroot/Load_RSS_p.java | 13 ++++++-- source/net/yacy/cora/document/RSSMessage.java | 7 ---- .../net/yacy/crawler/retrieval/RSSLoader.java | 10 +++--- source/net/yacy/search/Switchboard.java | 20 +++++++----- source/net/yacy/search/index/Segment.java | 5 ++- .../net/yacy/search/query/QueryModifier.java | 3 +- .../schema/CollectionConfiguration.java | 7 ++-- 9 files changed, 69 insertions(+), 58 deletions(-) diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index 6c273a8b4..357e58ef6 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -77,21 +77,25 @@
- allow query-strings (urls with a '?' in the path) - - - - - - - - - - - + allow query-strings (urls with a '?' in the path) +
+
+
+
-
+
+ + + + + + + + + + +
diff --git a/htroot/Load_RSS_p.html b/htroot/Load_RSS_p.html index 0733da280..f52114c43 100644 --- a/htroot/Load_RSS_p.html +++ b/htroot/Load_RSS_p.html @@ -34,28 +34,30 @@
Preview
Indexing
-
#(showload)#Available after successful loading of rss feed in preview:: - +
#(showload)#Available after successful loading of rss feed in preview::
once
load this feed once now
scheduled
repeat the feed loading every
- - automatically. + + automatically.
+
collection
+
+ #(/showload)#
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index bddc8a072..ff500c60e 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -36,6 +36,7 @@ import net.yacy.cora.document.RSSReader; import net.yacy.cora.document.UTF8; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.SpaceExceededException; import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.retrieval.RSSLoader; @@ -48,6 +49,7 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.logging.Log; import net.yacy.repository.Blacklist.BlacklistType; import net.yacy.search.Switchboard; +import net.yacy.search.schema.CollectionSchema; import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; @@ -58,6 +60,11 @@ public class Load_RSS_p { final serverObjects prop = new serverObjects(); final Switchboard sb = (Switchboard)env; + final String collection = post == null ? "user" : CommonPattern.SPACE.matcher(post.get("collection", "user").trim()).replaceAll(""); + final String[] collections = collection.length() == 0 ? new String[0] : collection.split(","); + boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt); + prop.put("showload_collectionEnabled", collectionEnabled ? 1 : 0); + prop.put("showload_collection", collection); prop.put("showload", 0); prop.put("showitems", 0); prop.put("shownewfeeds", 0); @@ -167,7 +174,7 @@ public class Load_RSS_p { continue; } // load feeds concurrently to get better responsibility in web interface - new RSSLoader(sb, url).start(); + new RSSLoader(sb, url, collections).start(); } } } @@ -274,7 +281,7 @@ public class Load_RSS_p { final DigestURI messageurl = new DigestURI(message.getLink()); if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop; if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop; - sb.addToIndex(messageurl, null, null); + sb.addToIndex(messageurl, null, null, collections); RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); } catch (final IOException e) { Log.logException(e); @@ -287,7 +294,7 @@ public class Load_RSS_p { if (rss != null && post.containsKey("indexAllItemContent")) { record_api = true; final RSSFeed feed = rss.getFeed(); - RSSLoader.indexAllRssFeed(sb, url, feed); + RSSLoader.indexAllRssFeed(sb, url, feed, collections); } if (record_api && rss != null && rss.getFeed() != null && rss.getFeed().getChannel() != null) { diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java index 9dd22d036..4f4ed6d94 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -25,27 +25,20 @@ package net.yacy.cora.document; import java.text.ParseException; -import java.util.ArrayList; import java.util.Arrays; import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.HashSet; -import java.util.List; import java.util.Map; -import java.util.Properties; import java.util.Set; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.date.ISO8601Formatter; -import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.lod.vocabulary.DublinCore; import net.yacy.cora.lod.vocabulary.Geo; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.util.CommonPattern; -import net.yacy.document.Document; -import net.yacy.document.parser.html.ImageEntry; -import net.yacy.kelondro.data.meta.DigestURI; public class RSSMessage implements Hit, Comparable, Comparator { diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index 265ccced9..fb80849b4 100644 --- a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -54,10 +54,12 @@ public class RSSLoader extends Thread { DigestURI urlf; Switchboard sb; + String[] collections; - public RSSLoader(final Switchboard sb, final DigestURI urlf) { + public RSSLoader(final Switchboard sb, final DigestURI urlf, final String[] collections) { this.sb = sb; this.urlf = urlf; + this.collections = collections; } @Override @@ -79,20 +81,20 @@ public class RSSLoader extends Thread { return; } final RSSFeed feed = rss.getFeed(); - indexAllRssFeed(this.sb, this.urlf, feed); + indexAllRssFeed(this.sb, this.urlf, feed, this.collections); // add the feed also to the scheduler recordAPI(this.sb, null, this.urlf, feed, 7, "seldays"); } - public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed) { + public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, String[] collections) { int loadCount = 0; loop: for (final RSSMessage message: feed) { try { final DigestURI messageurl = new DigestURI(message.getLink()); if (indexTriggered.containsKey(messageurl.hash())) continue loop; if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop; - sb.addToIndex(messageurl, null, null); + sb.addToIndex(messageurl, null, null, collections); indexTriggered.insertIfAbsent(messageurl.hash(), new Date()); loadCount++; } catch (final IOException e) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 3affb3e82..21c0703c6 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2620,6 +2620,7 @@ public final class Switchboard extends serverSwitch { for ( int i = 0; i < in.documents.length; i++ ) { storeDocumentIndex( in.queueEntry, + in.queueEntry.profile().collections(), in.documents[i], in.condenser[i], null, @@ -2631,6 +2632,7 @@ public final class Switchboard extends serverSwitch { private void storeDocumentIndex( final Response queueEntry, + final String[] collections, final Document document, final Condenser condenser, final SearchEvent searchEvent, @@ -2679,7 +2681,7 @@ public final class Switchboard extends serverSwitch { this.index.storeDocument( url, referrerURL, - queueEntry.profile(), + collections, queueEntry.getResponseHeader(), document, condenser, @@ -2749,12 +2751,13 @@ public final class Switchboard extends serverSwitch { final DigestURI url, final Map links, final SearchEvent searchEvent, - final String heuristicName) { + final String heuristicName, + final String[] collections) { // add the landing page to the index. should not load that again since it should be in the cache if ( url != null ) { try { - addToIndex(url, searchEvent, heuristicName); + addToIndex(url, searchEvent, heuristicName, collections); } catch ( final IOException e ) { } catch ( final Parser.Failure e ) { } @@ -2767,7 +2770,7 @@ public final class Switchboard extends serverSwitch { // take the matcher and load them all for ( final Map.Entry entry : matcher.entrySet() ) { try { - addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); + addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName, collections); } catch ( final IOException e ) { } catch ( final Parser.Failure e ) { } @@ -2776,7 +2779,7 @@ public final class Switchboard extends serverSwitch { // take then the no-matcher and load them also for ( final Map.Entry entry : links.entrySet() ) { try { - addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName); + addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName, collections); } catch ( final IOException e ) { } catch ( final Parser.Failure e ) { } @@ -2909,7 +2912,7 @@ public final class Switchboard extends serverSwitch { * @throws IOException * @throws Parser.Failure */ - public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName) + public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName, final String[] collections) throws IOException, Parser.Failure { if (searchEvent != null) { @@ -2956,6 +2959,7 @@ public final class Switchboard extends serverSwitch { Switchboard.this.webStructure.generateCitationReference(url, document); storeDocumentIndex( response, + collections, document, condenser, searchEvent, @@ -3341,7 +3345,7 @@ public final class Switchboard extends serverSwitch { } // add all pages to the index - addAllToIndex(url, links, searchEvent, "site"); + addAllToIndex(url, links, searchEvent, "site", new String[]{"site"}); } } catch ( final Throwable e ) { Log.logException(e); @@ -3454,7 +3458,7 @@ public final class Switchboard extends serverSwitch { + feedName + "' rss feed"); // add all pages to the index - addAllToIndex(null, links, searchEvent, feedName); + addAllToIndex(null, links, searchEvent, feedName, new String[]{"rss"}); } } catch ( final Throwable e ) { //Log.logException(e); diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index 6a97d6ae9..4f3ce9485 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -51,7 +51,6 @@ import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.LookAheadIterator; import net.yacy.cora.util.SpaceExceededException; -import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.data.CrawlQueues; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; @@ -449,7 +448,7 @@ public class Segment { public SolrInputDocument storeDocument( final DigestURI url, final DigestURI referrerURL, - final CrawlProfile profile, + final String[] collections, final ResponseHeader responseHeader, final Document document, final Condenser condenser, @@ -489,7 +488,7 @@ public class Segment { char docType = Response.docType(document.dc_format()); // CREATE SOLR DOCUMENT - final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); + final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration()); // FIND OUT IF THIS IS A DOUBLE DOCUMENT String hostid = url.hosthash(); diff --git a/source/net/yacy/search/query/QueryModifier.java b/source/net/yacy/search/query/QueryModifier.java index df542238e..3734d9892 100644 --- a/source/net/yacy/search/query/QueryModifier.java +++ b/source/net/yacy/search/query/QueryModifier.java @@ -202,10 +202,11 @@ public class QueryModifier { } StringBuilder filterQuery = new StringBuilder(20); if (sites.size() > 1) { - filterQuery.append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(0)); + filterQuery.append('(').append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(0)); for (int i = 1; i < sites.size(); i++) { filterQuery.append(" OR ").append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(i)); } + filterQuery.append(')'); } else if (sites.size() == 1) { filterQuery.append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(0)); } diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index 30c83a084..0357bbe56 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -56,7 +56,6 @@ import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.CommonPattern; -import net.yacy.crawler.data.CrawlProfile; import net.yacy.crawler.retrieval.Response; import net.yacy.document.Condenser; import net.yacy.document.Document; @@ -329,7 +328,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } public SolrVector yacy2solr( - final String id, final CrawlProfile profile, final ResponseHeader responseHeader, + final String id, final String[] collections, final ResponseHeader responseHeader, final Document document, Condenser condenser, DigestURI referrerURL, String language, IndexCell citations, WebgraphConfiguration webgraph) { @@ -362,7 +361,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri final InetAddress address = digestURI.getInetAddress(); if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress()); } - if (allAttr || contains(CollectionSchema.collection_sxt) && profile != null) add(doc, CollectionSchema.collection_sxt, profile.collections()); + if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.length > 0) add(doc, CollectionSchema.collection_sxt, collections); if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol()); Map searchpart = digestURI.getSearchpartMap(); if (searchpart == null) { @@ -756,7 +755,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); // list all links - WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations); + WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, collections, clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations); doc.webgraphDocuments.addAll(subgraph.edges); if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]);