- added nice colors to feed indexing state messages

- added a 'remove all' button for the new and scheduled rss feed lists
- made adding of new rss feeds concurrent so the interface is more responsive (see the sketch below)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7078 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter 15 years ago
parent 23ba107834
commit 104318d58a
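
The responsiveness gain comes from moving the fetch-and-index work off the request thread: the servlet now starts one worker thread per selected feed (the new RSSLoader class added below) and returns immediately instead of blocking on network I/O. A minimal sketch of that pattern; FeedWorkerSketch, FeedWorker, the println body, and the example URL are illustrative stand-ins, not part of the commit:

public class FeedWorkerSketch {
    static class FeedWorker extends Thread {
        private final String feedUrl;
        FeedWorker(final String feedUrl) { this.feedUrl = feedUrl; }
        @Override public void run() {
            // fetch, parse and index the feed here; in the commit this is the
            // loader + RSSReader.parse calls followed by indexAllRssFeed(...)
            System.out.println("loading " + this.feedUrl);
        }
    }
    public static void main(final String[] args) {
        // the request handler starts one worker per selected feed and returns at once
        new FeedWorker("http://example.net/feed.rss").start();
    }
}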

@@ -5,17 +5,12 @@
#%env/templates/metas.template%#
<script type="text/javascript">
<!--
function setall(name, check){
function setall(name) {
var selectForm = document.forms.namedItem(name);
var count = selectForm.elements["num"].value;
if (check) for(i = 0; i < count; i++) {
if (selectForm.elements["item_" + i].checked) {
check = false;
break;
}
}
for(i = 0; i < count; i++){
selectForm.elements["item_" + i].checked = check;
for (i = 0; i < count; i++) {
if (selectForm.elements["item_" + i] == null) continue;
selectForm.elements["item_" + i].checked = !selectForm.elements["item_" + i].checked;
}
}
-->
@@ -73,7 +68,7 @@
<legend><label for="table">List of Scheduled RSS Feed Load Targets</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name)" /></td>
<td>Title</td>
<td>URL/Referrer</td>
<td>Recording</td>
@@ -99,7 +94,8 @@
</table>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="removeSelectedFeedScheduler" value="Remove Selected Feeds from Scheduler" /></dt>
<input type="submit" name="removeSelectedFeedsScheduler" value="Remove Selected Feeds from Scheduler" /></dt>
<input type="submit" name="removeAllFeedsScheduler" value="Remove All Feeds from Scheduler" /></dt>
</p>
</fieldset></form>
#(/showscheduledfeeds)#
@@ -109,7 +105,7 @@
<legend><label for="table">Available RSS Feed List</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name)" /></td>
<td>Title</td>
<td>URL/Referrer</td>
<td>Recording</td>
@@ -125,7 +121,8 @@
</table>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="removeSelectedFeedNewList" value="Remove Selected Feeds from Feed List" /></dt>
<input type="submit" name="removeSelectedFeedsNewList" value="Remove Selected Feeds from Feed List" /></dt>
<input type="submit" name="removeAllFeedsNewList" value="Remove All Feeds from Feed List" /></dt>
<input type="submit" name="addSelectedFeedScheduler" value="Add Selected Feeds to Scheduler" /></dt>
</p>
</fieldset></form>
@@ -145,7 +142,7 @@
</dl>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name)" /></td>
<td>State</td>
<td>Title</td>
<td>URL</td>
@@ -156,8 +153,8 @@
</tr>
#{item}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" /></td>
<td>#(state)#new::enqueued::indexed#(/state)#</td>
<td align="left">#(state)#<input type="checkbox" name="item_#[count]#" value="mark_#[guid]#" />::&nbsp;::&nbsp;#(/state)#</td>
#(state)#<td>new</td>::<td class="info">enqueued</td>::<td class="commit">indexed</td>#(/state)#
<td><a href="#[link]#">#[title]#</a></td>
<td><a href="#[link]#">#[link]#</a></td>
<td>#[author]#</td>

@@ -21,8 +21,10 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.DateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.document.Hit;
@@ -30,17 +32,15 @@ import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ComparableARC;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.RSSLoader;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.WorkTables;
import de.anomic.search.Segments;
@@ -50,8 +50,6 @@ import de.anomic.server.serverSwitch;
public class Load_RSS_p {
private static final ARC<byte[], Date> indexTriggered = new ComparableARC<byte[], Date>(1000, Base64Order.enhancedCoder);
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
final serverObjects prop = new serverObjects();
@@ -63,7 +61,7 @@ public class Load_RSS_p {
prop.put("showscheduledfeeds", 0);
prop.put("url", "");
if (post != null && post.containsKey("removeSelectedFeedNewList")) {
if (post != null && post.containsKey("removeSelectedFeedsNewList")) {
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
sb.tables.delete("rss", entry.getValue().substring(5).getBytes());
@@ -73,7 +71,32 @@
}
}
if (post != null && post.containsKey("removeSelectedFeedScheduler")) {
if (post != null && post.containsKey("removeAllFeedsNewList")) try {
Iterator<Row> plainIterator = sb.tables.iterator("rss");
Row row;
String messageurl;
List<byte[]> d = new ArrayList<byte[]>();
while (plainIterator.hasNext()) {
row = plainIterator.next();
if (row == null) continue;
messageurl = row.get("url", "");
if (messageurl.length() == 0) continue;
byte[] api_pk = row.get("api_pk");
Row r = api_pk == null ? null : sb.tables.select("api", api_pk);
if (r == null || !r.get("comment", "").matches(".*\\Q" + messageurl + "\\E.*")) {
d.add(row.getPK());
}
}
for (byte[] pk: d) {
sb.tables.delete("rss", pk);
}
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
if (post != null && post.containsKey("removeSelectedFeedsScheduler")) {
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
byte[] pk = entry.getValue().substring(5).getBytes();
@@ -90,11 +113,39 @@ public class Load_RSS_p {
}
}
if (post != null && post.containsKey("removeAllFeedsScheduler")) try {
Iterator<Row> plainIterator = sb.tables.iterator("rss");
Row row;
String messageurl;
List<byte[]> d = new ArrayList<byte[]>();
while (plainIterator.hasNext()) {
row = plainIterator.next();
if (row == null) continue;
messageurl = row.get("url", "");
if (messageurl.length() == 0) continue;
byte[] api_pk = row.get("api_pk");
Row r = api_pk == null ? null : sb.tables.select("api", api_pk);
if (r != null && r.get("comment", "").matches(".*\\Q" + messageurl + "\\E.*")) {
d.add(row.getPK());
}
}
for (byte[] pk: d) {
Row rssRow = sb.tables.select("rss", pk);
byte[] schedulerPK = rssRow.get("api_pk", (byte[]) null);
if (schedulerPK != null) sb.tables.delete("api", schedulerPK);
rssRow.remove("api_pk");
sb.tables.insert("rss", pk, rssRow);
}
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
if (post != null && post.containsKey("addSelectedFeedScheduler")) {
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) {
Row row;
RSSReader rss = null;
try {
byte [] pk = entry.getValue().substring(5).getBytes();
row = sb.tables.select("rss", pk);
@@ -112,28 +163,19 @@
Log.logWarning("Load_RSS", "malformed url '" + row.get("url", "") + "': " + e.getMessage());
continue;
}
try {
Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (IOException e) {
Log.logWarning("Load_RSS", "rss loading for url '" + url.toNormalform(true, false) + "' failed: " + e.getMessage());
continue;
}
if (rss == null) {
Log.logWarning("Load_RSS", "no rss for url " + url.toNormalform(true, false));
} else {
RSSFeed feed = rss.getFeed();
indexAllRssFeed(sb, url, feed);
// add the feed also to the scheduler
recordAPI(sb, url, rss.getFeed(), 1, "seldays");
}
// load feeds concurrently to get better responsiveness in the web interface
new RSSLoader(sb, url).start();
}
}
}
if (post == null || (post != null && (post.containsKey("addSelectedFeedScheduler") || post.containsKey("removeSelectedFeedNewList") || post.containsKey("removeSelectedFeedScheduler")))) {
if (post == null || (post != null && (
post.containsKey("addSelectedFeedScheduler") ||
post.containsKey("removeSelectedFeedsNewList") ||
post.containsKey("removeAllFeedsNewList") ||
post.containsKey("removeSelectedFeedsScheduler") ||
post.containsKey("removeAllFeedsScheduler")
))) {
try {
// get list of primary keys from the api table with scheduled feed loading requests
Tables.Row row;
@@ -177,6 +219,7 @@ public class Load_RSS_p {
prop.put("shownewfeeds_list_" + newc + "_recording", DateFormat.getDateTimeInstance().format(row.get("recording_date", new Date())));
newc++;
}
if (apic > 1000 || newc > 1000) break;
}
prop.put("showscheduledfeeds_list" , apic);
prop.put("showscheduledfeeds_num", apic);
@@ -226,10 +269,10 @@
if (entry.getValue().startsWith("mark_")) try {
RSSMessage message = feed.getMessage(entry.getValue().substring(5));
DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash())) continue loop;
if (RSSLoader.indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.put(messageurl.hash(), new Date());
RSSLoader.indexTriggered.put(messageurl.hash(), new Date());
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
@@ -241,12 +284,12 @@
if (rss != null && post.containsKey("indexAllItemContent")) {
record_api = true;
RSSFeed feed = rss.getFeed();
indexAllRssFeed(sb, url, feed);
RSSLoader.indexAllRssFeed(sb, url, feed);
}
if (record_api && rss != null && rss.getFeed() != null && rss.getFeed().getChannel() != null) {
// record API action
recordAPI(sb, url, rss.getFeed(), repeat_time, repeat_unit);
RSSLoader.recordAPI(sb, url, rss.getFeed(), repeat_time, repeat_unit);
}
// show items from rss
@@ -272,9 +315,9 @@
author = item.getAuthor();
if (author == null) author = item.getCopyright();
pubDate = item.getPubDate();
prop.put("showitems_item_" + i + "_count", i);
prop.put("showitems_item_" + i + "_state", sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null ? 2 : indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.putHTML("showitems_item_" + i + "_guid", item.getGuid());
prop.put("showitems_item_" + i + "_state", sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null ? 2 : RSSLoader.indexTriggered.containsKey(messageurl.hash()) ? 1 : 0);
prop.put("showitems_item_" + i + "_state_count", i);
prop.putHTML("showitems_item_" + i + "_state_guid", item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_link", messageurl.toNormalform(false, false));
@@ -299,71 +342,4 @@
return prop;
}
private static void indexAllRssFeed(Switchboard sb, DigestURI url, RSSFeed feed) {
int loadCount = 0;
loop: for (RSSMessage message: feed) {
try {
DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.put(messageurl.hash(), new Date());
loadCount++;
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
}
// update info for loading
try {
Tables.Data rssRow = sb.tables.select("rss", url.hash());
if (rssRow == null) rssRow = new Tables.Data();
Date lastLoadDate = rssRow.get("last_load_date", new Date(0));
long deltaTime = Math.min(System.currentTimeMillis() - lastLoadDate.getTime(), 1000 * 60 * 60 * 24);
int allLoadCount = rssRow.get("all_load_count", 0);
int lastAvg = rssRow.get("avg_upd_per_day", 0);
long thisAvg = 1000 * 60 * 60 * 24 / deltaTime * loadCount;
long nextAvg = lastAvg == 0 ? thisAvg : (thisAvg + lastAvg * 2) / 3;
rssRow.put("url", url.toNormalform(true, false).getBytes());
rssRow.put("title", feed.getChannel().getTitle());
rssRow.put("last_load_date", new Date());
rssRow.put("last_load_count", loadCount);
rssRow.put("all_load_count", allLoadCount + loadCount);
rssRow.put("avg_upd_per_day", nextAvg);
sb.tables.update("rss", url.hash(), rssRow);
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
}
private static void recordAPI(Switchboard sb, DigestURI url, RSSFeed feed, int repeat_time, String repeat_unit) {
// record API action
byte[] pk = null;
serverObjects post = new serverObjects();
post.put("url", url.toNormalform(true, false));
post.put("indexAllItemContent", "");
if (repeat_time > 0) {
// store as scheduled api call
pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false));
}
// store pk of api table into rss table to show that the entry has been recorded
assert pk != null;
Tables.Data rssRow = new Tables.Data();
rssRow.put("url", url.toNormalform(true, false).getBytes());
rssRow.put("title", feed.getChannel().getTitle());
rssRow.put("api_pk", pk);
try {
sb.tables.update("rss", url.hash(), rssRow);
} catch (IOException e) {
Log.logException(e);
}
}
}

@@ -129,7 +129,7 @@ tt, *.tt {
.info {
font-weight:bold;
color:yellow;
color:olive;
}
.commit {
@@ -142,7 +142,7 @@ tt, *.tt {
}
.hidden {
display:hidden;
display:hidden;
}
/* .snippetLoaded {

@@ -0,0 +1,147 @@
/**
* RSSLoader
* Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt a. M., Germany
* First released 27.8.2010 at http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package de.anomic.crawler;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
import net.yacy.cora.document.RSSFeed;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.RSSReader;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ComparableARC;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
import de.anomic.crawler.retrieval.Response;
import de.anomic.data.WorkTables;
import de.anomic.search.Segments;
import de.anomic.search.Switchboard;
import de.anomic.server.serverObjects;
public class RSSLoader extends Thread {
public static final ARC<byte[], Date> indexTriggered = new ComparableARC<byte[], Date>(1000, Base64Order.enhancedCoder);
DigestURI urlf;
Switchboard sb;
public RSSLoader(Switchboard sb, DigestURI urlf) {
this.sb = sb;
this.urlf = urlf;
}
public void run() {
RSSReader rss = null;
try {
Response response = sb.loader.load(sb.loader.request(urlf, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (MalformedURLException e) {
Log.logWarning("Load_RSS", "rss loading for url '" + this.getName().substring(9) + "' failed: " + e.getMessage());
return;
} catch (IOException e) {
Log.logWarning("Load_RSS", "rss loading for url '" + urlf.toNormalform(true, false) + "' failed: " + e.getMessage());
return;
}
if (rss == null) {
Log.logWarning("Load_RSS", "no rss for url " + urlf.toNormalform(true, false));
return;
}
RSSFeed feed = rss.getFeed();
indexAllRssFeed(sb, urlf, feed);
// add the feed also to the scheduler
recordAPI(sb, urlf, feed, 7, "seldays");
}
public static void indexAllRssFeed(Switchboard sb, DigestURI url, RSSFeed feed) {
int loadCount = 0;
loop: for (RSSMessage message: feed) {
try {
DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash())) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.put(messageurl.hash(), new Date());
loadCount++;
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
}
// update info for loading
try {
Tables.Data rssRow = sb.tables.select("rss", url.hash());
if (rssRow == null) rssRow = new Tables.Data();
Date lastLoadDate = rssRow.get("last_load_date", new Date(0));
long deltaTime = Math.min(System.currentTimeMillis() - lastLoadDate.getTime(), 1000 * 60 * 60 * 24);
int allLoadCount = rssRow.get("all_load_count", 0);
int lastAvg = rssRow.get("avg_upd_per_day", 0);
long thisAvg = 1000 * 60 * 60 * 24 / deltaTime * loadCount;
long nextAvg = lastAvg == 0 ? thisAvg : (thisAvg + lastAvg * 2) / 3;
rssRow.put("url", url.toNormalform(true, false).getBytes());
rssRow.put("title", feed.getChannel().getTitle());
rssRow.put("last_load_date", new Date());
rssRow.put("last_load_count", loadCount);
rssRow.put("all_load_count", allLoadCount + loadCount);
rssRow.put("avg_upd_per_day", nextAvg);
sb.tables.update("rss", url.hash(), rssRow);
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
}
public static void recordAPI(Switchboard sb, DigestURI url, RSSFeed feed, int repeat_time, String repeat_unit) {
// record API action
byte[] pk = null;
serverObjects post = new serverObjects();
post.put("url", url.toNormalform(true, false));
post.put("indexAllItemContent", "");
if (repeat_time > 0) {
// store as scheduled api call
pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false));
}
// store pk of api table into rss table to show that the entry has been recorded
assert pk != null;
Tables.Data rssRow = new Tables.Data();
rssRow.put("url", url.toNormalform(true, false).getBytes());
rssRow.put("title", feed.getChannel().getTitle());
rssRow.put("api_pk", pk);
try {
sb.tables.update("rss", url.hash(), rssRow);
} catch (IOException e) {
Log.logException(e);
}
}
}