From 42414a6ae300afd7d04529c58719c085790655bb Mon Sep 17 00:00:00 2001 From: orbiter Date: Thu, 26 Aug 2010 16:01:45 +0000 Subject: [PATCH] added two more tables in rss reader interface: - fresh recorded rss feeds (not yet loaded or in scheduler) - rss feeds in scheduler The first list has a button that can be used to place rss feeds into the scheduler The second list has a button to delete rss feeds from the scheduler git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7074 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/Load_RSS_p.html | 63 +++++ htroot/Load_RSS_p.java | 220 +++++++++++++++--- source/de/anomic/data/WorkTables.java | 23 +- source/de/anomic/search/Switchboard.java | 3 - source/net/yacy/cora/document/RSSReader.java | 1 + .../net/yacy/kelondro/blob/BEncodedHeap.java | 70 ++++++ source/net/yacy/kelondro/blob/Tables.java | 4 +- 7 files changed, 337 insertions(+), 47 deletions(-) diff --git a/htroot/Load_RSS_p.html b/htroot/Load_RSS_p.html index 9a7b95937..c26a6b554 100644 --- a/htroot/Load_RSS_p.html +++ b/htroot/Load_RSS_p.html @@ -68,6 +68,69 @@ + #(showscheduledfeeds)#:: +
+ + + + + + + + + + + + + + #{list}# + + + + + + + + + + + + #{/list}# +
TitleURL/ReferrerRecordingLast LoadNext LoadLast CountAll CountAvg. Update/Day
#[title]##[rss]#
#[referrer]#
#[recording]##[lastload]##[nextload]##[lastcount]##[allcount]##[updperday]#
+

+ + +

+
+ #(/showscheduledfeeds)# + + #(shownewfeeds)#:: +
+ + + + + + + + + #{list}# + + + + + + + #{/list}# +
TitleURL/ReferrerRecording
#[title]##[rss]#
#[referrer]#
#[recording]#
+

+ + + +

+
+ #(/shownewfeeds)# + #(showitems)#::
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 57b7d2950..0384b2239 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.text.DateFormat; import java.util.Date; +import java.util.Iterator; import java.util.Map; import net.yacy.cora.document.Hit; @@ -32,7 +33,10 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ComparableARC; import net.yacy.document.Parser.Failure; +import net.yacy.kelondro.blob.Tables; +import net.yacy.kelondro.blob.Tables.Row; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.order.Base64Order; @@ -53,11 +57,111 @@ public class Load_RSS_p { final serverObjects prop = new serverObjects(); final Switchboard sb = (Switchboard)env; - prop.put("showitems", 0); prop.put("showload", 0); + prop.put("showitems", 0); + prop.put("shownewfeeds", 0); + prop.put("showscheduledfeeds", 0); prop.put("url", ""); + + if (post != null && (post.containsKey("removeSelectedFeedNewList") || post.containsKey("removeSelectedFeedScheduler"))) { + for (Map.Entry entry: post.entrySet()) { + if (entry.getValue().startsWith("mark_")) try { + sb.tables.delete("rss", entry.getValue().substring(5).getBytes()); + } catch (IOException e) { + Log.logException(e); + } + } + } + + if (post != null && post.containsKey("addSelectedFeedScheduler")) { + for (Map.Entry entry: post.entrySet()) { + if (entry.getValue().startsWith("mark_")) try { + Row row = sb.tables.select("rss", entry.getValue().substring(5).getBytes()); + RSSReader rss = null; + DigestURI url = new DigestURI(row.get("url", "")); + try { + Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); + byte[] resource = response == null ? null : response.getContent(); + rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); + } catch (IOException e) { + Log.logException(e); + } + if (rss == null) { + Log.logWarning("Load_RSS", "no rss for url" + url.toNormalform(true, false)); + } else { + RSSFeed feed = rss.getFeed(); + indexAllRssFeed(sb, url, feed); + + // add the feed also to the scheduler + recordAPI(sb, url, rss.getFeed(), 1, "seldays"); + } + } catch (IOException e) { + Log.logException(e); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } + } + } - if (post == null) return prop; + if (post == null || (post != null && (post.containsKey("addSelectedFeedScheduler") || post.containsKey("removeSelectedFeedNewList") || post.containsKey("removeSelectedFeedScheduler")))) { + try { + // get list of primary keys from the api table with scheduled feed loading requests + Tables.Row row; + String messageurl; + + // check feeds + int newc = 0, apic = 0; + Iterator plainIterator = sb.tables.iterator("rss"); + while (plainIterator.hasNext()) { + row = plainIterator.next(); + if (row == null) continue; + messageurl = row.get("url", ""); + if (messageurl.length() == 0) continue; + // get referrer + DigestURI referrer = sb.getURL(Segments.Process.LOCALCRAWLING, row.get("referrer", "").getBytes()); + // check if feed is registered in scheduler + byte[] api_pk = row.get("api_pk"); + Row r = api_pk == null ? null : sb.tables.select("api", api_pk); + if (r != null && r.get("comment", "").matches(".*\\Q" + messageurl + "\\E.*")) { + // this is a recorded entry + Date date_next_exec = r.containsKey(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC) ? row.get(WorkTables.TABLE_API_COL_DATE_NEXT_EXEC, new Date()) : null; + prop.put("showscheduledfeeds_list_" + apic + "_pk", new String(row.getPK())); + prop.put("showscheduledfeeds_list_" + apic + "_count", apic); + prop.put("showscheduledfeeds_list_" + apic + "_rss", messageurl); + prop.put("showscheduledfeeds_list_" + apic + "_title", row.get("title", "")); + prop.put("showscheduledfeeds_list_" + apic + "_referrer", referrer == null ? "" : referrer.toNormalform(true, false)); + prop.put("showscheduledfeeds_list_" + apic + "_recording", DateFormat.getDateTimeInstance().format(row.get("recording_date", new Date()))); + prop.put("showscheduledfeeds_list_" + apic + "_lastload", DateFormat.getDateTimeInstance().format(row.get("last_load_date", new Date()))); + prop.put("showscheduledfeeds_list_" + apic + "_nextload", date_next_exec == null ? "" : DateFormat.getDateTimeInstance().format(date_next_exec)); + prop.put("showscheduledfeeds_list_" + apic + "_lastcount", row.get("last_load_count", 0)); + prop.put("showscheduledfeeds_list_" + apic + "_allcount", row.get("all_load_count", 0)); + prop.put("showscheduledfeeds_list_" + apic + "_updperday", row.get("avg_upd_per_day", 0)); + apic++; + } else { + // this is a new entry + prop.put("shownewfeeds_list_" + newc + "_pk", new String(row.getPK())); + prop.put("shownewfeeds_list_" + newc + "_count", newc); + prop.put("shownewfeeds_list_" + newc + "_rss", messageurl); + prop.put("shownewfeeds_list_" + newc + "_title", row.get("title", "")); + prop.put("shownewfeeds_list_" + newc + "_referrer", referrer == null ? "" : referrer.toNormalform(true, false)); + prop.put("shownewfeeds_list_" + newc + "_recording", DateFormat.getDateTimeInstance().format(row.get("recording_date", new Date()))); + newc++; + } + } + prop.put("showscheduledfeeds_list" , apic); + prop.put("showscheduledfeeds_num", apic); + prop.put("showscheduledfeeds", apic > 0 ? apic : 0); + prop.put("shownewfeeds_list" , newc); + prop.put("shownewfeeds_num", newc); + prop.put("shownewfeeds", newc > 0 ? 1 : 0); + } catch (IOException e) { + Log.logException(e); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } + + return prop; + } prop.put("url", post.get("url", "")); @@ -78,8 +182,8 @@ public class Load_RSS_p { RSSReader rss = null; if (url != null) try { prop.put("url", url.toNormalform(true, false)); - Response entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); - byte[] resource = entry == null ? null : entry.getContent(); + Response response = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); + byte[] resource = response == null ? null : response.getContent(); rss = resource == null ? null : RSSReader.parse(RSSFeed.DEFAULT_MAXSIZE, resource); } catch (IOException e) { Log.logException(e); @@ -103,33 +207,16 @@ public class Load_RSS_p { } } } + if (rss != null && post.containsKey("indexAllItemContent")) { record_api = true; RSSFeed feed = rss.getFeed(); - loop: for (RSSMessage message: feed) { - try { - DigestURI messageurl = new DigestURI(message.getLink()); - if (indexTriggered.containsKey(messageurl.hash()) && post.containsKey("indexSelectedItemContent")) continue loop; - if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop; - sb.addToIndex(messageurl, null, null); - indexTriggered.put(messageurl.hash(), new Date()); - } catch (IOException e) { - Log.logException(e); - } catch (Failure e) { - Log.logException(e); - } - } + indexAllRssFeed(sb, url, feed); } - if (record_api) { + if (record_api && rss != null && rss.getFeed() != null && rss.getFeed().getChannel() != null) { // record API action - if (repeat_time > 0) { - // store as scheduled api call - sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3)); - } else { - // store just a protocol - sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false)); - } + recordAPI(sb, url, rss.getFeed(), repeat_time, repeat_unit); } // show items from rss @@ -137,16 +224,16 @@ public class Load_RSS_p { prop.put("showitems", 1); RSSFeed feed = rss.getFeed(); RSSMessage channel = feed.getChannel(); - prop.putHTML("showitems_title", channel.getTitle()); - String author = channel.getAuthor(); - if (author == null || author.length() == 0) author = channel.getCopyright(); - Date pubDate = channel.getPubDate(); + prop.putHTML("showitems_title", channel == null ? "" : channel.getTitle()); + String author = channel == null ? "" : channel.getAuthor(); + if (author == null || author.length() == 0) author = channel == null ? "" : channel.getCopyright(); + Date pubDate = channel == null ? null : channel.getPubDate(); prop.putHTML("showitems_author", author == null ? "" : author); - prop.putHTML("showitems_description", channel.getDescription()); - prop.putHTML("showitems_language", channel.getLanguage()); + prop.putHTML("showitems_description", channel == null ? "" : channel.getDescription()); + prop.putHTML("showitems_language", channel == null ? "" : channel.getLanguage()); prop.putHTML("showitems_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); - prop.putHTML("showitems_ttl", channel.getTTL()); - prop.putHTML("showitems_docs", channel.getDocs()); + prop.putHTML("showitems_ttl", channel == null ? "" : channel.getTTL()); + prop.putHTML("showitems_docs", channel == null ? "" : channel.getDocs()); int i = 0; for (final Hit item: feed) { @@ -182,4 +269,71 @@ public class Load_RSS_p { return prop; } + private static void indexAllRssFeed(Switchboard sb, DigestURI url, RSSFeed feed) { + int loadCount = 0; + loop: for (RSSMessage message: feed) { + try { + DigestURI messageurl = new DigestURI(message.getLink()); + if (indexTriggered.containsKey(messageurl.hash())) continue loop; + if (sb.urlExists(Segments.Process.LOCALCRAWLING, messageurl.hash()) != null) continue loop; + sb.addToIndex(messageurl, null, null); + indexTriggered.put(messageurl.hash(), new Date()); + loadCount++; + } catch (IOException e) { + Log.logException(e); + } catch (Failure e) { + Log.logException(e); + } + } + // update info for loading + + try { + Tables.Data rssRow = sb.tables.select("rss", url.hash()); + if (rssRow == null) rssRow = new Tables.Data(); + Date lastLoadDate = rssRow.get("last_load_date", new Date(0)); + long deltaTime = Math.min(System.currentTimeMillis() - lastLoadDate.getTime(), 1000 * 60 * 60 * 24); + int allLoadCount = rssRow.get("all_load_count", 0); + int lastAvg = rssRow.get("avg_upd_per_day", 0); + long thisAvg = 1000 * 60 * 60 * 24 / deltaTime * loadCount; + long nextAvg = lastAvg == 0 ? thisAvg : (thisAvg + lastAvg * 2) / 3; + rssRow.put("url", url.toNormalform(true, false).getBytes()); + rssRow.put("title", feed.getChannel().getTitle()); + rssRow.put("last_load_date", new Date()); + rssRow.put("last_load_count", loadCount); + rssRow.put("all_load_count", allLoadCount + loadCount); + rssRow.put("avg_upd_per_day", nextAvg); + sb.tables.update("rss", url.hash(), rssRow); + } catch (IOException e) { + Log.logException(e); + } catch (RowSpaceExceededException e) { + Log.logException(e); + } + } + + + private static void recordAPI(Switchboard sb, DigestURI url, RSSFeed feed, int repeat_time, String repeat_unit) { + // record API action + byte[] pk = null; + serverObjects post = new serverObjects(); + post.put("url", url.toNormalform(true, false)); + post.put("indexAllItemContent", ""); + if (repeat_time > 0) { + // store as scheduled api call + pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false), repeat_time, repeat_unit.substring(3)); + } else { + // store just a protocol + pk = sb.tables.recordAPICall(post, "Load_RSS_p.html", WorkTables.TABLE_API_TYPE_CRAWLER, "import feed " + url.toNormalform(true, false)); + } + // store pk of api table into rss table to show that the entry has been recorded + assert pk != null; + Tables.Data rssRow = new Tables.Data(); + rssRow.put("url", url.toNormalform(true, false).getBytes()); + rssRow.put("title", feed.getChannel().getTitle()); + rssRow.put("api_pk", pk); + try { + sb.tables.update("rss", url.hash(), rssRow); + } catch (IOException e) { + Log.logException(e); + } + } } diff --git a/source/de/anomic/data/WorkTables.java b/source/de/anomic/data/WorkTables.java index 95033dd96..05ff5f5fd 100644 --- a/source/de/anomic/data/WorkTables.java +++ b/source/de/anomic/data/WorkTables.java @@ -73,10 +73,12 @@ public class WorkTables extends Tables { * @param servletName the name of the servlet * @param type name of the servlet category * @param comment visual description of the process + * @return the pk of the new entry in the api table */ - public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) { + public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment) { // remove the apicall attributes from the post object - String pk = post.remove(TABLE_API_COL_APICALL_PK); + String pks = post.remove(TABLE_API_COL_APICALL_PK); + byte[] pk = pks == null ? null : pks.getBytes(); // generate the apicall url - without the apicall attributes final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" + post.toString(); @@ -84,7 +86,7 @@ public class WorkTables extends Tables { // read old entry from the apicall table (if exists) Row row = null; try { - row = (pk == null) ? null : super.select(TABLE_API_NAME, pk.getBytes()); + row = (pk == null) ? null : super.select(TABLE_API_NAME, pk); } catch (IOException e) { Log.logException(e); } catch (RowSpaceExceededException e) { @@ -105,7 +107,7 @@ public class WorkTables extends Tables { // insert APICALL attributes data.put(TABLE_API_COL_APICALL_COUNT, "1"); - super.insert(TABLE_API_NAME, data); + pk = super.insert(TABLE_API_NAME, data); } else { // modify and update existing entry @@ -117,6 +119,7 @@ public class WorkTables extends Tables { // insert APICALL attributes row.put(TABLE_API_COL_APICALL_COUNT, row.get(TABLE_API_COL_APICALL_COUNT, 1) + 1); super.update(TABLE_API_NAME, row); + assert pk != null; } } catch (IOException e) { Log.logException(e); @@ -124,6 +127,7 @@ public class WorkTables extends Tables { Log.logException(e); } Log.logInfo("APICALL", apiurl); + return pk; } /** @@ -135,12 +139,12 @@ public class WorkTables extends Tables { * @param comment visual description of the process * @param time the time until next scheduled execution of this api call * @param unit the time unit for the scheduled call + * @return the pk of the new entry in the api table */ - public void recordAPICall(final serverObjects post, final String servletName, final String type, final String comment, int time, String unit) { + public byte[] recordAPICall(final serverObjects post, final String servletName, final String type, final String comment, int time, String unit) { if (post.containsKey(TABLE_API_COL_APICALL_PK)) { // this api call has already been stored somewhere. - recordAPICall(post, servletName, type, comment); - return; + return recordAPICall(post, servletName, type, comment); } if (time < 0 || unit == null || unit.length() == 0 || "minutes,hours,days".indexOf(unit) < 0) { time = 0; unit = ""; @@ -150,7 +154,7 @@ public class WorkTables extends Tables { // generate the apicall url - without the apicall attributes final String apiurl = /*"http://localhost:" + getConfig("port", "8080") +*/ "/" + servletName + "?" + post.toString(); - + byte[] pk = null; // insert entry try { // create and insert new entry @@ -167,13 +171,14 @@ public class WorkTables extends Tables { data.put(TABLE_API_COL_APICALL_SCHEDULE_TIME, Integer.toString(time).getBytes()); data.put(TABLE_API_COL_APICALL_SCHEDULE_UNIT, unit.getBytes()); calculateAPIScheduler(data, false); // set next execution time - super.insert(TABLE_API_NAME, data); + pk = super.insert(TABLE_API_NAME, data); } catch (IOException e) { Log.logException(e); } catch (RowSpaceExceededException e) { Log.logException(e); } Log.logInfo("APICALL", apiurl); + return pk; } /** diff --git a/source/de/anomic/search/Switchboard.java b/source/de/anomic/search/Switchboard.java index b9824b267..65918fa25 100644 --- a/source/de/anomic/search/Switchboard.java +++ b/source/de/anomic/search/Switchboard.java @@ -1910,9 +1910,6 @@ public final class Switchboard extends serverSwitch { rssRow.put("url", rssEntry.getKey().toNormalform(true, false).getBytes()); rssRow.put("title", rssEntry.getValue().getBytes()); rssRow.put("recording_date", new Date()); - //rssRow.put("last_load_date", "".getBytes()); - //rssRow.put("last_load_count", "".getBytes()); - //rssRow.put("avg_upd_per_day", "".getBytes()); try { this.tables.update("rss", new DigestURI(rssEntry.getKey()).hash(), rssRow); } catch (IOException e) { diff --git a/source/net/yacy/cora/document/RSSReader.java b/source/net/yacy/cora/document/RSSReader.java index 8236d8075..24541770f 100644 --- a/source/net/yacy/cora/document/RSSReader.java +++ b/source/net/yacy/cora/document/RSSReader.java @@ -136,6 +136,7 @@ public class RSSReader extends DefaultHandler { public void endElement(final String uri, final String name, final String tag) { if (tag == null) return; if ("channel".equals(tag)) { + if (parsingChannel) theChannel.setChannel(item); parsingChannel = false; } else if ("item".equals(tag)) { theChannel.addMessage(item); diff --git a/source/net/yacy/kelondro/blob/BEncodedHeap.java b/source/net/yacy/kelondro/blob/BEncodedHeap.java index 10872b2ef..e6a8aa544 100644 --- a/source/net/yacy/kelondro/blob/BEncodedHeap.java +++ b/source/net/yacy/kelondro/blob/BEncodedHeap.java @@ -34,6 +34,7 @@ import java.util.LinkedHashSet; import java.util.Map; import java.util.Set; import java.util.TreeSet; +import java.util.regex.Pattern; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; @@ -249,6 +250,53 @@ public class BEncodedHeap implements Map>, Iterable< return map.get(key); } + /** + * select all rows from a table where a given matcher matches with elements in a given row + * this method makes a full-table scan of the whole table + * @param columnName the name of the column where the matcher shall match + * @param columnMatcher the matcher for the elements of the column + * @return a set of primary keys where the matcher matched + */ + public Set select(String columnName, Pattern columnMatcher) { + Iterator>> i = iterator(); + Map.Entry> row; + Map prop; + byte[] val; + Set pks = new TreeSet(this.table.ordering); + while (i.hasNext()) { + row = i.next(); + prop = row.getValue(); + val = prop.get(columnName); + if (val != null) { + if (columnMatcher.matcher(new String(val)).matches()) pks.add(row.getKey()); + } + } + return pks; + } + + /** + * select one row from a table where a given matcher matches with elements in a given row + * this method stops the full-table scan as soon as a first matcher was found + * @param columnName the name of the column where the matcher shall match + * @param columnMatcher the matcher for the elements of the column + * @return the row where the matcher matched the given column + */ + public Map.Entry> selectOne(String columnName, Pattern columnMatcher) { + Iterator>> i = iterator(); + Map.Entry> row; + Map prop; + byte[] val; + while (i.hasNext()) { + row = i.next(); + prop = row.getValue(); + val = prop.get(columnName); + if (val != null) { + if (columnMatcher.matcher(new String(val)).matches()) return row; + } + } + return null; + } + /** * insert a map into the table * this method shall be used in exchange of the get method if the @@ -269,6 +317,28 @@ public class BEncodedHeap implements Map>, Iterable< this.table.insert(pk, b); this.columnames.add(key); } + + public void update(byte[] pk, Map map) throws RowSpaceExceededException, IOException { + Map entry = this.get(pk); + if (entry == null) { + insert(pk, map); + } else { + entry.putAll(map); + insert(pk, entry); + } + } + + public void update(byte[] pk, String key, byte[] value) throws RowSpaceExceededException, IOException { + Map entry = this.get(pk); + if (entry == null) { + entry = new HashMap(); + entry.put(key, value); + insert(pk, entry); + } else { + entry.put(key, value); + insert(pk, entry); + } + } /** * insert a map into the table diff --git a/source/net/yacy/kelondro/blob/Tables.java b/source/net/yacy/kelondro/blob/Tables.java index fd0971542..ef168f83e 100644 --- a/source/net/yacy/kelondro/blob/Tables.java +++ b/source/net/yacy/kelondro/blob/Tables.java @@ -183,7 +183,7 @@ public class Tables { public void update(final String table, byte[] pk, Map map) throws IOException { BEncodedHeap heap = getHeap(table); try { - heap.insert(pk, map); + heap.update(pk, map); } catch (RowSpaceExceededException e) { throw new IOException(e.getMessage()); } @@ -192,7 +192,7 @@ public class Tables { public void update(final String table, Row row) throws IOException { BEncodedHeap heap = getHeap(table); try { - heap.insert(row.pk, row); + heap.update(row.pk, row); } catch (RowSpaceExceededException e) { throw new IOException(e.getMessage()); }