Added two more tables to the RSS reader interface:

- freshly recorded RSS feeds (not yet loaded and not in the scheduler)
- RSS feeds already in the scheduler
The first list has a button that places the selected RSS feeds into the scheduler.
The second list has a button that deletes the selected RSS feeds from the scheduler.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7074 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent 0010cd9db1
commit 42414a6ae3

@ -68,6 +68,69 @@
</fieldset>
</form>
#(showscheduledfeeds)#::
<form name="scheduledfeeds"><fieldset>
<legend><label for="table">List of Scheduled RSS Feed Load Targets</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td>Title</td>
<td>URL/Referrer</td>
<td>Recording</td>
<td>Last Load</td>
<td>Next Load</td>
<td>Last Count</td>
<td>All Count</td>
<td>Avg. Update/Day</td>
</tr>
#{list}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td><a href="/Load_RSS_p.html?showrss=&url=#[rss]#">#[title]#</a></td>
<td><a href="/Load_RSS_p.html?showrss=&url=#[rss]#">#[rss]#</a><br/><a href="#[referrer]#">#[referrer]#</a></td>
<td>#[recording]#</td>
<td>#[lastload]#</td>
<td>#[nextload]#</td>
<td>#[lastcount]#</td>
<td>#[allcount]#</td>
<td>#[updperday]#</td>
</tr>
#{/list}#
</table>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="removeSelectedFeedScheduler" value="Remove Selected Feeds from Scheduler" /></dt>
</p>
</fieldset></form>
#(/showscheduledfeeds)#
#(shownewfeeds)#::
<form name="newfeeds"><fieldset>
<legend><label for="table">Available RSS Feed List</label></legend>
<table class="sortable" border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td><input type="checkbox" name="allswitch" onclick="setall(this.form.name, this.value)" /></td>
<td>Title</td>
<td>URL/Referrer</td>
<td>Recording</td>
</tr>
#{list}#
<tr class="TableCellLight">
<td align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
<td><a href="/Load_RSS_p.html?showrss=&url=#[rss]#">#[title]#</a></td>
<td><a href="/Load_RSS_p.html?showrss=&url=#[rss]#">#[rss]#</a><br/><a href="#[referrer]#">#[referrer]#</a></td>
<td>#[recording]#</td>
</tr>
#{/list}#
</table>
<p>
<input type="hidden" name="num" value="#[num]#" />
<input type="submit" name="removeSelectedFeedNewList" value="Remove Selected Feeds from Feed List" /></dt>
<input type="submit" name="addSelectedFeedScheduler" value="Add Selected Feeds to Scheduler" /></dt>
</p>
</fieldset></form>
#(/shownewfeeds)#
#(showitems)#::
<form name="rssfeed"><fieldset>
<legend><label for="table">RSS Feed of #[rss]#</label></legend>

@ -22,6 +22,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.text.DateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import net.yacy.cora.document.Hit;
@ -32,7 +33,10 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ComparableARC;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.blob.Tables.Row;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.Base64Order;
@ -53,11 +57,111 @@ public class Load_RSS_p {
final serverObjects prop = new serverObjects();
final Switchboard sb = (Switchboard)env;
prop.put("showitems", 0);
prop.put("showload", 0);
prop.put("showitems", 0);
prop.put("shownewfeeds", 0);
prop.put("showscheduledfeeds", 0);
prop.put("url", "");
if (post != null && (post.containsKey("removeSelectedFeedNewList") || post.containsKey("removeSelectedFeedScheduler"))) {
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
sb.tables.delete("rss", entry.getValue().substring(5).getBytes());
} catch (IOException e) {
Log.logException(e);
}
}
}
if (post != null && post.containsKey("addSelectedFeedScheduler")) {
for (Map.Entry<String, String> entry: post.entrySet()) {
if (entry.getValue().startsWith("mark_")) try {
Row row = sb.tables.select("rss", entry.getValue().substring(5).getBytes());
RSSReader rss = null;
DigestURI url = new DigestURI(row.get("url", ""));
try {
Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (IOException e) {
Log.logException(e);
}
if (rss == null) {
Log.logWarning("Load_RSS", "no rss for url" + url.toNormalform(true, false));
} else {
RSSFeed feed = rss.getFeed();
indexAllRssFeed(sb, url, feed);
// add the feed also to the scheduler
recordAPI(sb, url, rss.getFeed(), 1, "seldays");
}
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
}
}
if (post == null) return prop;
if (post == null || (post != null && (post.containsKey("addSelectedFeedScheduler") || post.containsKey("removeSelectedFeedNewList") || post.containsKey("removeSelectedFeedScheduler")))) {
try {
// get list of primary keys from the api table with scheduled feed loading requests
Tables.Row row;
String messageurl;
// check feeds
int newc = 0, apic = 0;
Iterator<Row> plainIterator = sb.tables.iterator("rss");
while (plainIterator.hasNext()) {
row = plainIterator.next();
if (row == null) continue;
messageurl = row.get("url", "");
if (messageurl.length() == 0) continue;
// get referrer
DigestURI referrer = sb.getURL(Segments.Process.LOCALCRAWLING, row.get("referrer", "").getBytes());
// check if feed is registered in scheduler
byte[] api_pk = row.get("api_pk");
Row r = api_pk == null ? null : sb.tables.select("api", api_pk);
if (r != null && r.get("comment", "").matches(".*\\Q" + messageurl + "\\E.*")) {
// this is a recorded entry
Date date_next_exec = r.containsKey(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC) ? row.get(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, new Date()) : null;
prop.put("showscheduledfeeds_list_" + apic + "_pk", new String(row.getPK()));
prop.put("showscheduledfeeds_list_" + apic + "_count", apic);
prop.put("showscheduledfeeds_list_" + apic + "_rss", messageurl);
prop.put("showscheduledfeeds_list_" + apic + "_title", row.get("title", ""));
prop.put("showscheduledfeeds_list_" + apic + "_referrer", referrer == null ? "" : referrer.toNormalform(true, false));
prop.put("showscheduledfeeds_list_" + apic + "_recording", DateFormat.getDateTimeInstance().format(row.get("recording_date", new Date())));
prop.put("showscheduledfeeds_list_" + apic + "_lastload", DateFormat.getDateTimeInstance().format(row.get("last_load_date", new Date())));
prop.put("showscheduledfeeds_list_" + apic + "_nextload", date_next_exec == null ? "" : DateFormat.getDateTimeInstance().format(date_next_exec));
prop.put("showscheduledfeeds_list_" + apic + "_lastcount", row.get("last_load_count", 0));
prop.put("showscheduledfeeds_list_" + apic + "_allcount", row.get("all_load_count", 0));
prop.put("showscheduledfeeds_list_" + apic + "_updperday", row.get("avg_upd_per_day", 0));
apic++;
} else {
// this is a new entry
prop.put("shownewfeeds_list_" + newc + "_pk", new String(row.getPK()));
prop.put("shownewfeeds_list_" + newc + "_count", newc);
prop.put("shownewfeeds_list_" + newc + "_rss", messageurl);
prop.put("shownewfeeds_list_" + newc + "_title", row.get("title", ""));
prop.put("shownewfeeds_list_" + newc + "_referrer", referrer == null ? "" : referrer.toNormalform(true, false));
prop.put("shownewfeeds_list_" + newc + "_recording", DateFormat.getDateTimeInstance().format(row.get("recording_date", new Date())));
newc++;
}
}
prop.put("showscheduledfeeds_list" , apic);
prop.put("showscheduledfeeds_num", apic);
prop.put("showscheduledfeeds", apic > 0 ? apic : 0);
prop.put("shownewfeeds_list" , newc);
prop.put("shownewfeeds_num", newc);
prop.put("shownewfeeds", newc > 0 ? 1 : 0);
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
return prop;
}
prop.put("url", post.get("url", ""));
@ -78,8 +182,8 @@ public class Load_RSS_p {
RSSReader rss = null;
if (url != null) try {
prop.put("url", url.toNormalform(true, false));
Response entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] resource = entry == null ? null : entry.getContent();
Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
byte[] resource = response == null ? null : response.getContent();
rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource);
} catch (IOException e) {
Log.logException(e);
@ -103,33 +207,16 @@ public class Load_RSS_p {
}
}
}
if (rss != null && post.containsKey("indexAllItemContent")) {
record_api = true;
RSSFeed feed = rss.getFeed();
loop: for (RSSMessage message: feed) {
try {
DigestURI messageurl = new DigestURI(message.getLink());
if (indexTriggered.containsKey(messageurl.hash()) && post.containsKey("indexSelectedItemContent")) continue loop;
if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
sb.addToIndex(messageurl, null, null);
indexTriggered.put(messageurl.hash(), new Date());
} catch (IOException e) {
Log.logException(e);
} catch (Failure e) {
Log.logException(e);
}
}
indexAllRssFeed(sb, url, feed);
}
if (record_api) {
if (record_api && rss != null && rss.getFeed() != null && rss.getFeed().getChannel() != null) {
// record API action
if (repeat_time > 0) {
// store as scheduled api call
sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3));
} else {
// store just a protocol
sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false));
}
recordAPI(sb, url, rss.getFeed(), repeat_time, repeat_unit);
}
// show items from rss
@ -137,16 +224,16 @@ public class Load_RSS_p {
prop.put("showitems", 1);
RSSFeed feed = rss.getFeed();
RSSMessage channel = feed.getChannel();
prop.putHTML("showitems_title", channel.getTitle());
String author = channel.getAuthor();
if (author == null || author.length() == 0) author = channel.getCopyright();
Date pubDate = channel.getPubDate();
prop.putHTML("showitems_title", channel == null ? "" : channel.getTitle());
String author = channel == null ? "" : channel.getAuthor();
if (author == null || author.length() == 0) author = channel == null ? "" : channel.getCopyright();
Date pubDate = channel == null ? null : channel.getPubDate();
prop.putHTML("showitems_author", author == null ? "" : author);
prop.putHTML("showitems_description", channel.getDescription());
prop.putHTML("showitems_language", channel.getLanguage());
prop.putHTML("showitems_description", channel == null ? "" : channel.getDescription());
prop.putHTML("showitems_language", channel == null ? "" : channel.getLanguage());
prop.putHTML("showitems_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
prop.putHTML("showitems_ttl", channel.getTTL());
prop.putHTML("showitems_docs", channel.getDocs());
prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL());
prop.putHTML("showitems_docs", channel == null ? "" : channel.getDocs());
int i = 0;
for (final Hit item: feed) {
@ -182,4 +269,71 @@ public class Load_RSS_p {
return prop;
}
/**
 * Index all messages of the given feed that have not been loaded yet and
 * update the load statistics of the feed in the "rss" work table.
 * @param sb the switchboard (gives access to the index and the work tables)
 * @param url the url of the rss feed itself (primary key of the rss table)
 * @param feed the parsed feed whose messages shall be indexed
 */
private static void indexAllRssFeed(Switchboard sb, DigestURI url, RSSFeed feed) {
    int loadCount = 0;
    loop: for (RSSMessage message: feed) {
        try {
            DigestURI messageurl = new DigestURI(message.getLink());
            // skip messages that were already triggered for indexing or are already indexed
            if (indexTriggered.containsKey(messageurl.hash())) continue loop;
            if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop;
            sb.addToIndex(messageurl, null, null);
            indexTriggered.put(messageurl.hash(), new Date());
            loadCount++;
        } catch (IOException e) {
            Log.logException(e);
        } catch (Failure e) {
            Log.logException(e);
        }
    }
    // update info for loading
    try {
        Tables.Data rssRow = sb.tables.select("rss", url.hash());
        if (rssRow == null) rssRow = new Tables.Data();
        Date lastLoadDate = rssRow.get("last_load_date", new Date(0));
        // clamp to [1 ms .. 1 day]; the lower bound prevents a division by zero
        // when the feed was loaded twice within the same millisecond
        long deltaTime = Math.max(1, Math.min(System.currentTimeMillis() - lastLoadDate.getTime(), 1000 * 60 * 60 * 24));
        int allLoadCount = rssRow.get("all_load_count", 0);
        int lastAvg = rssRow.get("avg_upd_per_day", 0);
        // extrapolate this load to a full day, then smooth with the old average (2:1 weight)
        long thisAvg = 1000 * 60 * 60 * 24 / deltaTime * loadCount;
        long nextAvg = lastAvg == 0 ? thisAvg : (thisAvg + lastAvg * 2) / 3;
        rssRow.put("url", url.toNormalform(true, false).getBytes());
        // the channel may be null if the feed could not be parsed completely
        RSSMessage channel = feed.getChannel();
        rssRow.put("title", channel == null ? "" : channel.getTitle());
        rssRow.put("last_load_date", new Date());
        rssRow.put("last_load_count", loadCount);
        rssRow.put("all_load_count", allLoadCount + loadCount);
        rssRow.put("avg_upd_per_day", nextAvg);
        sb.tables.update("rss", url.hash(), rssRow);
    } catch (IOException e) {
        Log.logException(e);
    } catch (RowSpaceExceededException e) {
        Log.logException(e);
    }
}
/**
 * Record the load of the given feed as an API call so that the process
 * scheduler can re-execute it, and store the pk of the api table entry into
 * the rss table to mark the feed as scheduled.
 * @param sb the switchboard (gives access to the work tables)
 * @param url the url of the rss feed
 * @param feed the parsed feed (used only for the channel title)
 * @param repeat_time the time until the next scheduled execution; &lt;= 0 records a protocol entry only
 * @param repeat_unit the scheduling unit, prefixed with "sel" (e.g. "seldays")
 */
private static void recordAPI(Switchboard sb, DigestURI url, RSSFeed feed, int repeat_time, String repeat_unit) {
    // record API action
    serverObjects post = new serverObjects();
    post.put("url", url.toNormalform(true, false));
    post.put("indexAllItemContent", "");
    byte[] pk;
    if (repeat_time > 0) {
        // store as scheduled api call
        pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3));
    } else {
        // store just a protocol
        pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false));
    }
    if (pk == null) {
        // the api table could not be written (i/o problem); do not store a dangling reference
        Log.logWarning("Load_RSS", "could not record API call for feed " + url.toNormalform(true, false));
        return;
    }
    // store pk of api table into rss table to show that the entry has been recorded
    Tables.Data rssRow = new Tables.Data();
    rssRow.put("url", url.toNormalform(true, false).getBytes());
    // the channel may be null if the feed could not be parsed completely
    RSSMessage channel = feed.getChannel();
    rssRow.put("title", channel == null ? "" : channel.getTitle());
    rssRow.put("api_pk", pk);
    try {
        sb.tables.update("rss", url.hash(), rssRow);
    } catch (IOException e) {
        Log.logException(e);
    }
}
}

@ -73,10 +73,12 @@ public class WorkTables extends Tables {
* @param servletName the name of the servlet
* @param type name of the servlet category
* @param comment visual description of the process
* @return the pk of the new entry in the api table
*/
public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) {
public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) {
// remove the apicall attributes from the post object
String pk = post.remove(TABLE_API_COL_APICALL_PK);
String pks = post.remove(TABLE_API_COL_APICALL_PK);
byte[] pk = pks == null ? null : pks.getBytes();
// generate the apicall url - without the apicall attributes
final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" + post.toString();
@ -84,7 +86,7 @@ public class WorkTables extends Tables {
// read old entry from the apicall table (if exists)
Row row = null;
try {
row = (pk == null) ? null : super.select(TABLE_API_NAME, pk.getBytes());
row = (pk == null) ? null : super.select(TABLE_API_NAME, pk);
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
@ -105,7 +107,7 @@ public class WorkTables extends Tables {
// insert APICALL attributes
data.put(TABLE_API_COL_APICALL_COUNT, "1");
super.insert(TABLE_API_NAME, data);
pk = super.insert(TABLE_API_NAME, data);
} else {
// modify and update existing entry
@ -117,6 +119,7 @@ public class WorkTables extends Tables {
// insert APICALL attributes
row.put(TABLE_API_COL_APICALL_COUNT, row.get(TABLE_API_COL_APICALL_COUNT, 1) + 1);
super.update(TABLE_API_NAME, row);
assert pk != null;
}
} catch (IOException e) {
Log.logException(e);
@ -124,6 +127,7 @@ public class WorkTables extends Tables {
Log.logException(e);
}
Log.logInfo("APICALL", apiurl);
return pk;
}
/**
@ -135,12 +139,12 @@ public class WorkTables extends Tables {
* @param comment visual description of the process
* @param time the time until next scheduled execution of this api call
* @param unit the time unit for the scheduled call
* @return the pk of the new entry in the api table
*/
public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment, int time, String unit) {
public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment, int time, String unit) {
if (post.containsKey(TABLE_API_COL_APICALL_PK)) {
// this api call has already been stored somewhere.
recordAPICall(post, servletName, type, comment);
return;
return recordAPICall(post, servletName, type, comment);
}
if (time < 0 || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) {
time = 0; unit = "";
@ -150,7 +154,7 @@ public class WorkTables extends Tables {
// generate the apicall url - without the apicall attributes
final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" + post.toString();
byte[] pk = null;
// insert entry
try {
// create and insert new entry
@ -167,13 +171,14 @@ public class WorkTables extends Tables {
data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, Integer.toString(time).getBytes());
data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes());
calculateAPIScheduler(data, false); // set next execution time
super.insert(TABLE_API_NAME, data);
pk = super.insert(TABLE_API_NAME, data);
} catch (IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
Log.logInfo("APICALL", apiurl);
return pk;
}
/**

@ -1910,9 +1910,6 @@ public final class Switchboard extends serverSwitch {
rssRow.put("url", rssEntry.getKey().toNormalform(true, false).getBytes());
rssRow.put("title", rssEntry.getValue().getBytes());
rssRow.put("recording_date", new Date());
//rssRow.put("last_load_date", "".getBytes());
//rssRow.put("last_load_count", "".getBytes());
//rssRow.put("avg_upd_per_day", "".getBytes());
try {
this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow);
} catch (IOException e) {

@ -136,6 +136,7 @@ public class RSSReader extends DefaultHandler {
public void endElement(final String uri, final String name, final String tag) {
if (tag == null) return;
if ("channel".equals(tag)) {
if (parsingChannel) theChannel.setChannel(item);
parsingChannel = false;
} else if ("item".equals(tag)) {
theChannel.addMessage(item);

@ -34,6 +34,7 @@ import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Pattern;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
@ -249,6 +250,53 @@ public class BEncodedHeap implements Map<byte[], Map<String, byte[]>>, Iterable<
return map.get(key);
}
/**
* select all rows from a table where a given matcher matches with elements in a given row
* this method makes a full-table scan of the whole table
* @param columnName the name of the column where the matcher shall match
* @param columnMatcher the matcher for the elements of the column
* @return a set of primary keys where the matcher matched
*/
/**
 * Select all rows of this table whose value in the given column matches the
 * given pattern. This performs a full-table scan.
 * @param columnName the name of the column where the matcher shall match
 * @param columnMatcher the matcher for the elements of the column
 * @return a set of primary keys of all matching rows
 */
public Set<byte[]> select(String columnName, Pattern columnMatcher) {
    final Set<byte[]> matches = new TreeSet<byte[]>(this.table.ordering);
    for (final Iterator<Map.Entry<byte[], Map<String, byte[]>>> i = iterator(); i.hasNext();) {
        final Map.Entry<byte[], Map<String, byte[]>> entry = i.next();
        final byte[] cell = entry.getValue().get(columnName);
        if (cell == null) continue;
        if (columnMatcher.matcher(new String(cell)).matches()) matches.add(entry.getKey());
    }
    return matches;
}
/**
* select one row from a table where a given matcher matches with elements in a given row
* this method stops the full-table scan as soon as a first matcher was found
* @param columnName the name of the column where the matcher shall match
* @param columnMatcher the matcher for the elements of the column
* @return the row where the matcher matched the given column
*/
/**
 * Select the first row of this table whose value in the given column matches
 * the given pattern; the full-table scan stops at the first match.
 * @param columnName the name of the column where the matcher shall match
 * @param columnMatcher the matcher for the elements of the column
 * @return the first matching row, or null if no row matches
 */
public Map.Entry<byte[], Map<String, byte[]>> selectOne(String columnName, Pattern columnMatcher) {
    for (final Iterator<Map.Entry<byte[], Map<String, byte[]>>> i = iterator(); i.hasNext();) {
        final Map.Entry<byte[], Map<String, byte[]>> entry = i.next();
        final byte[] cell = entry.getValue().get(columnName);
        if (cell == null) continue;
        if (columnMatcher.matcher(new String(cell)).matches()) return entry;
    }
    return null;
}
/**
* insert a map into the table
* this method shall be used in exchange of the get method if the
@ -269,6 +317,28 @@ public class BEncodedHeap implements Map<byte[], Map<String, byte[]>>, Iterable<
this.table.insert(pk, b);
this.columnames.add(key);
}
/**
 * Merge the given columns into the row with the given primary key; if no such
 * row exists yet, the map is inserted as a new row. Columns of an existing row
 * that are not contained in the given map are kept unchanged.
 * @param pk the primary key of the row
 * @param map the columns to set
 * @throws RowSpaceExceededException
 * @throws IOException
 */
public void update(byte[] pk, Map<String, byte[]> map) throws RowSpaceExceededException, IOException {
    final Map<String, byte[]> stored = this.get(pk);
    if (stored == null) {
        insert(pk, map);
        return;
    }
    stored.putAll(map);
    insert(pk, stored);
}
/**
 * Set a single column of the row with the given primary key; if no such row
 * exists yet, a new row containing only this column is inserted.
 * @param pk the primary key of the row
 * @param key the name of the column to set
 * @param value the new value of the column
 * @throws RowSpaceExceededException
 * @throws IOException
 */
public void update(byte[] pk, String key, byte[] value) throws RowSpaceExceededException, IOException {
    Map<String, byte[]> row = this.get(pk);
    if (row == null) row = new HashMap<String, byte[]>();
    row.put(key, value);
    insert(pk, row);
}
/**
* insert a map into the table

@ -183,7 +183,7 @@ public class Tables {
public void update(final String table, byte[] pk, Map<String, byte[]> map) throws IOException {
BEncodedHeap heap = getHeap(table);
try {
heap.insert(pk, map);
heap.update(pk, map);
} catch (RowSpaceExceededException e) {
throw new IOException(e.getMessage());
}
@ -192,7 +192,7 @@ public class Tables {
public void update(final String table, Row row) throws IOException {
BEncodedHeap heap = getHeap(table);
try {
heap.insert(row.pk, row);
heap.update(row.pk, row);
} catch (RowSpaceExceededException e) {
throw new IOException(e.getMessage());
}

Loading…
Cancel
Save