Added the collection attribute to the RSS feed reader as well

pull/1/head
Michael Peter Christen 12 years ago
parent 43ca359e24
commit c091000165

@ -77,21 +77,25 @@
</dd>
<dt><label>Dynamic URLs</label></dt>
<dd>
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
<input type="hidden" name="directDocByURL" id="directDocByURL" value="off" />
<input type="hidden" name="recrawl" id="recrawl" value="reload" />
<input type="hidden" name="reloadIfOlderNumber" id="reloadIfOlderNumber" value="3" />
<input type="hidden" name="reloadIfOlderUnit" id="reloadIfOlderUnit" value="day" />
<input type="hidden" name="deleteold" id="deleteold" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="indexText" id="indexText" value="on" />
<input type="hidden" name="indexMedia" id="indexMedia" value="on" />
<input type="hidden" name="intention" id="intention" value="" />
<input type="hidden" name="collection" id="collection" value="" />
<input type="checkbox" name="crawlingQ" id="crawlingQ" #(crawlingQChecked)#::checked="checked"#(/crawlingQChecked)# /> allow <a href="http://en.wikipedia.org/wiki/Query_string">query-strings</a> (urls with a '?' in the path)
</dd>
<dt><label>Collection</label></dt>
<dd>
<input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# />
</dd>
<dt><label>Start</label></dt>
<dd><input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/>
<dd>
<input type="hidden" name="directDocByURL" id="directDocByURL" value="off" />
<input type="hidden" name="recrawl" id="recrawl" value="reload" />
<input type="hidden" name="reloadIfOlderNumber" id="reloadIfOlderNumber" value="3" />
<input type="hidden" name="reloadIfOlderUnit" id="reloadIfOlderUnit" value="day" />
<input type="hidden" name="deleteold" id="deleteold" value="on" />
<input type="hidden" name="storeHTCache" id="storeHTCache" value="on" />
<input type="hidden" name="cachePolicy" id="cachePolicy" value="iffresh" />
<input type="hidden" name="indexText" id="indexText" value="on" />
<input type="hidden" name="indexMedia" id="indexMedia" value="on" />
<input type="hidden" name="intention" id="intention" value="" />
<input type="submit" name="crawlingstart" value="Start New Crawl" class="submitready"/>
</dd>
</dl>

@ -34,28 +34,30 @@
<dt>Preview</dt>
<dd><input type="submit" name="showrss" value="Show RSS Items" /></dd>
<dt>Indexing</dt>
<dd>#(showload)#Available after successful loading of rss feed in preview::
<input type="submit" name="indexAllItemContent" value="Add All Items to Index (full content of url)" />
<dd>#(showload)#<input type="hidden" name="collection" id="collection" value="#[collection]#" />Available after successful loading of rss feed in preview::
<dl>
<dt>once<input type="radio" name="repeat" value="off" checked="checked"/></dt>
<dd>load this feed once now</dd>
<dt>scheduled<input type="radio" name="repeat" value="on"/></dt>
<dd>repeat the feed loading every<br/>
<select name="repeat_time">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="repeat_unit">
<option value="selminutes">minutes</option>
<option value="selhours">hours</option>
<option value="seldays" selected="selected">days</option>
</select> automatically.
<select name="repeat_time">
<option value="1">1</option><option value="2">2</option><option value="3">3</option>
<option value="4">4</option><option value="5">5</option><option value="6">6</option>
<option value="7" selected="selected">7</option>
<option value="8">8</option><option value="9">9</option><option value="10">10</option>
<option value="12">12</option><option value="14">14</option><option value="21">21</option>
<option value="28">28</option><option value="30">30</option>
</select>
<select name="repeat_unit">
<option value="selminutes">minutes</option>
<option value="selhours">hours</option>
<option value="seldays" selected="selected">days</option>
</select> automatically.
</dd>
<dt>collection</dt>
<dd><input name="collection" id="collection" type="text" size="60" maxlength="100" value="#[collection]#" #(collectionEnabled)#disabled="disabled"::#(/collectionEnabled)# /></dd>
</dl>
<input type="submit" name="indexAllItemContent" value="Add All Items to Index (full content of url)" />
#(/showload)#</dd>
</dl>

@ -36,6 +36,7 @@ import net.yacy.cora.document.RSSReader;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.RSSLoader;
@ -48,6 +49,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.repository.Blacklist.BlacklistType;
import net.yacy.search.Switchboard;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -58,6 +60,11 @@ public class Load_RSS_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard)env;
final String collection = post == null ? "user" : CommonPattern.SPACE.matcher(post.get("collection", "user").trim()).replaceAll("");
final String[] collections = collection.length() == 0 ? new String[0] : collection.split(",");
boolean collectionEnabled = sb.index.fulltext().getDefaultConfiguration().isEmpty() || sb.index.fulltext().getDefaultConfiguration().contains(CollectionSchema.collection_sxt);
prop.put("showload_collectionEnabled", collectionEnabled ? 1 : 0);
prop.put("showload_collection", collection);
prop.put("showload", 0);
prop.put("showitems", 0);
prop.put("shownewfeeds", 0);
@ -167,7 +174,7 @@ public class Load_RSS_p {
continue;
}
// load feeds concurrently to improve the responsiveness of the web interface
new RSSLoader(sb, url).start();
new RSSLoader(sb, url, collections).start();
}
}
}
@ -274,7 +281,7 @@ public class Load_RSS_p {
final DigestURI messageurl = new DigestURI(message.getLink());
if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop;
sb.addToIndex(messageurl, null, null);
sb.addToIndex(messageurl, null, null, collections);
RSSLoader.indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
} catch (final IOException e) {
Log.logException(e);
@ -287,7 +294,7 @@ public class Load_RSS_p {
if (rss != null && post.containsKey("indexAllItemContent")) {
record_api = true;
final RSSFeed feed = rss.getFeed();
RSSLoader.indexAllRssFeed(sb, url, feed);
RSSLoader.indexAllRssFeed(sb, url, feed, collections);
}
if (record_api && rss != null && rss.getFeed() != null && rss.getFeed().getChannel() != null) {

@ -25,27 +25,20 @@
package net.yacy.cora.document;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.lod.vocabulary.DublinCore;
import net.yacy.cora.lod.vocabulary.Geo;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.Document;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.data.meta.DigestURI;
public class RSSMessage implements Hit, Comparable<RSSMessage>, Comparator<RSSMessage> {

@ -54,10 +54,12 @@ public class RSSLoader extends Thread {
DigestURI urlf;
Switchboard sb;
String[] collections;
public RSSLoader(final Switchboard sb, final DigestURI urlf) {
public RSSLoader(final Switchboard sb, final DigestURI urlf, final String[] collections) {
this.sb = sb;
this.urlf = urlf;
this.collections = collections;
}
@Override
@ -79,20 +81,20 @@ public class RSSLoader extends Thread {
return;
}
final RSSFeed feed = rss.getFeed();
indexAllRssFeed(this.sb, this.urlf, feed);
indexAllRssFeed(this.sb, this.urlf, feed, this.collections);
// add the feed also to the scheduler
recordAPI(this.sb, null, this.urlf, feed, 7, "seldays");
}
public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed) {
public static void indexAllRssFeed(final Switchboard sb, final DigestURI url, final RSSFeed feed, String[] collections) {
int loadCount = 0;
loop: for (final RSSMessage message: feed) {
try {
final DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(ASCII.String(messageurl.hash())) != null) continue loop;
sb.addToIndex(messageurl, null, null);
sb.addToIndex(messageurl, null, null, collections);
indexTriggered.insertIfAbsent(messageurl.hash(), new Date());
loadCount++;
} catch (final IOException e) {

@ -2620,6 +2620,7 @@ public final class Switchboard extends serverSwitch {
for ( int i = 0; i < in.documents.length; i++ ) {
storeDocumentIndex(
in.queueEntry,
in.queueEntry.profile().collections(),
in.documents[i],
in.condenser[i],
null,
@ -2631,6 +2632,7 @@ public final class Switchboard extends serverSwitch {
private void storeDocumentIndex(
final Response queueEntry,
final String[] collections,
final Document document,
final Condenser condenser,
final SearchEvent searchEvent,
@ -2679,7 +2681,7 @@ public final class Switchboard extends serverSwitch {
this.index.storeDocument(
url,
referrerURL,
queueEntry.profile(),
collections,
queueEntry.getResponseHeader(),
document,
condenser,
@ -2749,12 +2751,13 @@ public final class Switchboard extends serverSwitch {
final DigestURI url,
final Map<DigestURI, String> links,
final SearchEvent searchEvent,
final String heuristicName) {
final String heuristicName,
final String[] collections) {
// add the landing page to the index; it should not be loaded again since it should already be in the cache
if ( url != null ) {
try {
addToIndex(url, searchEvent, heuristicName);
addToIndex(url, searchEvent, heuristicName, collections);
} catch ( final IOException e ) {
} catch ( final Parser.Failure e ) {
}
@ -2767,7 +2770,7 @@ public final class Switchboard extends serverSwitch {
// take the matcher and load them all
for ( final Map.Entry<DigestURI, String> entry : matcher.entrySet() ) {
try {
addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName, collections);
} catch ( final IOException e ) {
} catch ( final Parser.Failure e ) {
}
@ -2776,7 +2779,7 @@ public final class Switchboard extends serverSwitch {
// take then the no-matcher and load them also
for ( final Map.Entry<DigestURI, String> entry : links.entrySet() ) {
try {
addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName);
addToIndex(new DigestURI(entry.getKey(), (byte[]) null), searchEvent, heuristicName, collections);
} catch ( final IOException e ) {
} catch ( final Parser.Failure e ) {
}
@ -2909,7 +2912,7 @@ public final class Switchboard extends serverSwitch {
* @throws IOException
* @throws Parser.Failure
*/
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName)
public void addToIndex(final DigestURI url, final SearchEvent searchEvent, final String heuristicName, final String[] collections)
throws IOException,
Parser.Failure {
if (searchEvent != null) {
@ -2956,6 +2959,7 @@ public final class Switchboard extends serverSwitch {
Switchboard.this.webStructure.generateCitationReference(url, document);
storeDocumentIndex(
response,
collections,
document,
condenser,
searchEvent,
@ -3341,7 +3345,7 @@ public final class Switchboard extends serverSwitch {
}
// add all pages to the index
addAllToIndex(url, links, searchEvent, "site");
addAllToIndex(url, links, searchEvent, "site", new String[]{"site"});
}
} catch ( final Throwable e ) {
Log.logException(e);
@ -3454,7 +3458,7 @@ public final class Switchboard extends serverSwitch {
+ feedName
+ "' rss feed");
// add all pages to the index
addAllToIndex(null, links, searchEvent, feedName);
addAllToIndex(null, links, searchEvent, feedName, new String[]{"rss"});
}
} catch ( final Throwable e ) {
//Log.logException(e);

@ -51,7 +51,6 @@ import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.LookAheadIterator;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.data.CrawlQueues;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
@ -449,7 +448,7 @@ public class Segment {
public SolrInputDocument storeDocument(
final DigestURI url,
final DigestURI referrerURL,
final CrawlProfile profile,
final String[] collections,
final ResponseHeader responseHeader,
final Document document,
final Condenser condenser,
@ -489,7 +488,7 @@ public class Segment {
char docType = Response.docType(document.dc_format());
// CREATE SOLR DOCUMENT
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, profile, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
final CollectionConfiguration.SolrVector vector = this.fulltext.getDefaultConfiguration().yacy2solr(id, collections, responseHeader, document, condenser, referrerURL, language, urlCitationIndex, this.fulltext.getWebgraphConfiguration());
// FIND OUT IF THIS IS A DOUBLE DOCUMENT
String hostid = url.hosthash();

@ -202,10 +202,11 @@ public class QueryModifier {
}
StringBuilder filterQuery = new StringBuilder(20);
if (sites.size() > 1) {
filterQuery.append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(0));
filterQuery.append('(').append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(0));
for (int i = 1; i < sites.size(); i++) {
filterQuery.append(" OR ").append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(i));
}
filterQuery.append(')');
} else if (sites.size() == 1) {
filterQuery.append(CollectionSchema.collection_sxt.getSolrFieldName()).append(':').append(sites.get(0));
}

@ -56,7 +56,6 @@ import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.CommonPattern;
import net.yacy.crawler.data.CrawlProfile;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Condenser;
import net.yacy.document.Document;
@ -329,7 +328,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
public SolrVector yacy2solr(
final String id, final CrawlProfile profile, final ResponseHeader responseHeader,
final String id, final String[] collections, final ResponseHeader responseHeader,
final Document document, Condenser condenser, DigestURI referrerURL, String language,
IndexCell<CitationReference> citations,
WebgraphConfiguration webgraph) {
@ -362,7 +361,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final InetAddress address = digestURI.getInetAddress();
if (address != null) add(doc, CollectionSchema.ip_s, address.getHostAddress());
}
if (allAttr || contains(CollectionSchema.collection_sxt) && profile != null) add(doc, CollectionSchema.collection_sxt, profile.collections());
if (allAttr || contains(CollectionSchema.collection_sxt) && collections != null && collections.length > 0) add(doc, CollectionSchema.collection_sxt, collections);
if (allAttr || contains(CollectionSchema.url_protocol_s)) add(doc, CollectionSchema.url_protocol_s, digestURI.getProtocol());
Map<String, String> searchpart = digestURI.getSearchpartMap();
if (searchpart == null) {
@ -756,7 +755,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
// list all links
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, profile.collections(), clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations);
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, collections, clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations);
doc.webgraphDocuments.addAll(subgraph.edges);
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]);

Loading…
Cancel
Save