diff --git a/htroot/FeedReader_p.html b/htroot/FeedReader_p.html deleted file mode 100644 index f31542047..000000000 --- a/htroot/FeedReader_p.html +++ /dev/null @@ -1,39 +0,0 @@ - - - - YaCy '#[clientname]#': Feed Reader - #%env/templates/metas.template%# - - -#%env/templates/header.template%# -#(page)# -please select your feed with ?url=Feedurl&max=5&offset=1 (to be implemented in html ;)) -:: -
-
Title
-
#[title]#
- #(hasAuthor)#::
Author
-
#[author]#
#(/hasAuthor)# -
Description
-
#[description]#
-
- -
-#{items}# -
#[title]#
-
#[description]#
-#{/items}# -
-:: - -Error: -#(error)# -You need to install libx -:: -Problem with url -#(/error)# -test -#(/page)# -#%env/templates/footer.template%# - - diff --git a/htroot/FeedReader_p.java b/htroot/FeedReader_p.java deleted file mode 100644 index b1c26dbf5..000000000 --- a/htroot/FeedReader_p.java +++ /dev/null @@ -1,91 +0,0 @@ -//FeedReader_p.java -//------------ -// part of YACY -// -// (C) 2007 Alexander Schier -// -//$LastChangedDate$ -//$LastChangedRevision$ -//$LastChangedBy$ -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -import java.io.IOException; -import java.net.MalformedURLException; - -import net.yacy.cora.document.Hit; -import net.yacy.cora.document.RSSFeed; -import net.yacy.cora.document.RSSReader; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.logging.Log; -import net.yacy.kelondro.util.DateFormatter; - -import de.anomic.http.server.RequestHeader; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; -import de.anomic.server.servletProperties; - -// test url: -// http://localhost:8080/FeedReader_p.html?url=http://www.tagesthemen.de/xml/rss2 - -public class FeedReader_p { - - public static servletProperties respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - final servletProperties prop = new 
servletProperties(); - - prop.put("page", "0"); - if (post != null) { - DigestURI url; - try { - url = new DigestURI(post.get("url"), null); - } catch (final MalformedURLException e) { - prop.put("page", "2"); - return prop; - } - - // int maxitems=Integer.parseInt(post.get("max", "0")); - // int offset=Integer.parseInt(post.get("offset", "0")); //offset to the first displayed item - try { - final RSSFeed feed = new RSSReader(url.toString()).getFeed(); - - prop.putHTML("page_title", feed.getChannel().getTitle()); - if (feed.getChannel().getAuthor() == null) { - prop.put("page_hasAuthor", "0"); - } else { - prop.put("page_hasAuthor", "1"); - prop.putHTML("page_hasAuthor_author", feed.getChannel().getAuthor()); - } - prop.putHTML("page_description", feed.getChannel().getDescription()); - - int i = 0; - for (final Hit item: feed) { - prop.putHTML("page_items_" + i + "_author", item.getAuthor()); - prop.putHTML("page_items_" + i + "_title", item.getTitle()); - prop.putHTML("page_items_" + i + "_link", item.getLink()); - prop.putHTML("page_items_" + i + "_description", item.getDescription()); - prop.putHTML("page_items_" + i + "_date", DateFormatter.formatShortSecond(item.getPubDate())); - i++; - } - prop.put("page_items", feed.size()); - prop.put("page", "1"); - } catch (IOException e) { - Log.logException(e); - } - } - - // return rewrite properties - return prop; - } -} diff --git a/htroot/ConfigWikiSearch.html b/htroot/Load_MediawikiWiki.html similarity index 100% rename from htroot/ConfigWikiSearch.html rename to htroot/Load_MediawikiWiki.html diff --git a/htroot/ConfigWikiSearch.java b/htroot/Load_MediawikiWiki.java similarity index 98% rename from htroot/ConfigWikiSearch.java rename to htroot/Load_MediawikiWiki.java index ece8113ec..db1714f19 100644 --- a/htroot/ConfigWikiSearch.java +++ b/htroot/Load_MediawikiWiki.java @@ -30,7 +30,7 @@ import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; 
-public class ConfigWikiSearch { +public class Load_MediawikiWiki { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements diff --git a/htroot/ConfigPHPBB3Search.html b/htroot/Load_PHPBB3.html similarity index 100% rename from htroot/ConfigPHPBB3Search.html rename to htroot/Load_PHPBB3.html diff --git a/htroot/ConfigPHPBB3Search.java b/htroot/Load_PHPBB3.java similarity index 98% rename from htroot/ConfigPHPBB3Search.java rename to htroot/Load_PHPBB3.java index 5663e1322..f93431068 100644 --- a/htroot/ConfigPHPBB3Search.java +++ b/htroot/Load_PHPBB3.java @@ -30,7 +30,7 @@ import de.anomic.search.SwitchboardConstants; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; -public class ConfigPHPBB3Search { +public class Load_PHPBB3 { public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { // return variable that accumulates replacements diff --git a/htroot/Load_RSS_p.html b/htroot/Load_RSS_p.html new file mode 100644 index 000000000..4776b39a3 --- /dev/null +++ b/htroot/Load_RSS_p.html @@ -0,0 +1,90 @@ + + + + YaCy '#[clientname]#': Loading of RSS Feeds + #%env/templates/metas.template%# + + + + + #%env/templates/header.template%# + #%env/templates/submenuIndexCreate.template%# +

Loading of RSS Feeds

+

+ RSS feeds can be loaded into the YaCy search index. + This does not load the RSS file itself into the index, but all the messages inside the RSS feed as individual documents. +

+ +
+
+
+
URL of the RSS feed
+
+
Simulation Mode
+
+
Indexing Mode
+
#(showload)#Available after successful loading of the RSS feed in simulation mode:: + Not yet implemented. THIS INTERFACE IS A STUB - DEVELOPMENT IS ONGOING + #(/showload)#
+ +
+
+
+ + #(showitems)#:: +
+ +
+
Title
#[title]#
+
Author
#[author]#
+
Description
#[description]#
+
Language
#[language]#
+
Date
#[date]#
+
Time-to-live
#[ttl]#
+
Docs
#[docs]#
+
+ + + + + + + + + + + #{item}# + + + + + + + + + + #{/item}# +
TitleURLAuthorLanguageDateDescription
#[title]##[link]##[author]##[language]##[date]##[description]#
+ +
+ #(/showitems)# + + #%env/templates/footer.template%# + + diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java new file mode 100644 index 000000000..dc8693cf1 --- /dev/null +++ b/htroot/Load_RSS_p.java @@ -0,0 +1,114 @@ +/** + * Load_RSS_p + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 20.08.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. 
+ */ + +import java.io.IOException; +import java.net.MalformedURLException; +import java.text.DateFormat; + +import net.yacy.cora.document.Hit; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSMessage; +import net.yacy.cora.document.RSSReader; +import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; + +import de.anomic.crawler.CrawlProfile; +import de.anomic.crawler.retrieval.Response; +import de.anomic.http.server.RequestHeader; +import de.anomic.search.Switchboard; +import de.anomic.server.serverObjects; +import de.anomic.server.serverSwitch; + +public class Load_RSS_p { + + public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { + + final serverObjects prop = new serverObjects(); + final Switchboard sb = (Switchboard)env; + + prop.put("showitems", 0); + prop.put("showload", 0); + prop.put("url", ""); + + if (post == null) return prop; + + prop.put("url", post.get("url", "")); + + DigestURI url = null; + try { + url = post.containsKey("url") ? new DigestURI(post.get("url", ""), null) : null; + } catch (MalformedURLException e) { + Log.logException(e); + } + + // if we have an url then try to load the rss + RSSReader rss = null; + if (url != null) try { + prop.put("url", url.toNormalform(true, false)); + Response entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); + byte[] resource = entry == null ? null : entry.getContent(); + rss = resource == null ? null : RSSReader.parse(resource); + } catch (IOException e) { + Log.logException(e); + } + + if (rss != null) { + prop.put("showitems", 1); + RSSFeed feed = rss.getFeed(); + RSSMessage channel = feed.getChannel(); + prop.putHTML("showitems_title", channel.getTitle()); + String author = channel.getAuthor(); + if (author == null || author.length() == 0) author = channel.getCopyright(); + prop.putHTML("showitems_author", author == null ? 
"" : author); + prop.putHTML("showitems_description", channel.getDescription()); + prop.putHTML("showitems_language", channel.getLanguage()); + prop.putHTML("showitems_date", DateFormat.getDateTimeInstance().format(channel.getPubDate())); + prop.putHTML("showitems_ttl", channel.getTTL()); + prop.putHTML("showitems_docs", channel.getDocs()); + + int i = 0; + for (final Hit item: feed) { + try { + url = new DigestURI(item.getLink(), null); + author = item.getAuthor(); + if (author == null) author = item.getCopyright(); + prop.put("showitems_item_" + i + "_count", i); + prop.putHTML("showitems_item_" + i + "_hash", new String(url.hash())); + prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); + prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); + prop.putHTML("showitems_item_" + i + "_link", url.toNormalform(false, false)); + prop.putHTML("showitems_item_" + i + "_description", item.getDescription()); + prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); + prop.putHTML("showitems_item_" + i + "_date", DateFormat.getDateTimeInstance().format(item.getPubDate())); + i++; + } catch (MalformedURLException e) { + Log.logException(e); + continue; + } + } + prop.put("showitems_item", i); + prop.put("showitems_num", i); + if (i > 0) prop.put("showload", 1); + } + + return prop; + } + +} diff --git a/htroot/RSSLoader_p.java b/htroot/RSSLoader_p.java deleted file mode 100644 index a4df0b070..000000000 --- a/htroot/RSSLoader_p.java +++ /dev/null @@ -1,97 +0,0 @@ -//ViewFile.java -//----------------------- -//part of YaCy -//(C) by Michael Peter Christen; mc@yacy.net -//first published on http://www.anomic.de -//Frankfurt, Germany, 2004 - -//last major change: 12.07.2004 - -//This program is free software; you can redistribute it and/or modify -//it under the terms of the GNU General Public License as published by -//the Free Software Foundation; either version 2 of the License, or -//(at your option) any later version. 
- -//This program is distributed in the hope that it will be useful, -//but WITHOUT ANY WARRANTY; without even the implied warranty of -//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -//GNU General Public License for more details. - -//You should have received a copy of the GNU General Public License -//along with this program; if not, write to the Free Software -//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -//you must compile this file with -//javac -classpath .:../Classes Status.java -//if the shell's current path is HTROOT - -import java.io.IOException; -import java.net.MalformedURLException; - -import net.yacy.cora.document.RSSReader; -import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.logging.Log; - -import de.anomic.crawler.CrawlProfile; -import de.anomic.crawler.retrieval.Response; -import de.anomic.http.server.RequestHeader; -import de.anomic.search.Switchboard; -import de.anomic.server.serverObjects; -import de.anomic.server.serverSwitch; - -public class RSSLoader_p { - - public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - - final serverObjects prop = new serverObjects(); - final Switchboard sb = (Switchboard)env; - - if (post == null) { - return prop; - } - - DigestURI url = null; - - final String urlString = post.get("url", ""); - if (urlString.length() > 0) try { - url = new DigestURI(urlString, null); - } catch (final MalformedURLException e) { - return prop; - } - - - // if the resource body was not cached we try to load it from web - Response entry = null; - try { - entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE); - } catch (final Exception e) { - return prop; - } - if (entry == null) return prop; - - byte[] resource = entry.getContent(); - - if (resource == null) { - return prop; - } - - // now parse the content as rss - RSSReader rss; - try { - rss = 
RSSReader.parse(resource); - } catch (IOException e) { - Log.logException(e); - return prop; - } - - // get the links out of the rss - //Map map = doc.getAnchors(); - - // put the urls into crawler using the proxy profile - - - - return prop; - } - -} diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template index 2039e717c..0ce56a761 100644 --- a/htroot/env/templates/submenuIndexCreate.template +++ b/htroot/env/templates/submenuIndexCreate.template @@ -2,8 +2,9 @@

Index Creation

\ No newline at end of file diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java index 341da7500..b79c83942 100644 --- a/source/net/yacy/cora/document/RSSMessage.java +++ b/source/net/yacy/cora/document/RSSMessage.java @@ -39,7 +39,7 @@ public class RSSMessage implements Hit { title("title"), link("link"), description("description"), - pubDate("pubDate"), + pubDate("pubDate,lastBuildDate"), copyright("copyright,dc:publisher,publisher"), author("author,dc:creator,creator"), subject("subject,dc:subject"), @@ -47,6 +47,7 @@ public class RSSMessage implements Hit { referrer("referrer,referer"), language("language"), guid("guid"), + ttl("ttl"), docs("docs"); private Set keys; @@ -159,6 +160,10 @@ public class RSSMessage implements Hit { return Token.guid.valueFrom(this.map); } + public String getTTL() { + return Token.ttl.valueFrom(this.map); + } + public String getDocs() { return Token.docs.valueFrom(this.map); } diff --git a/source/net/yacy/cora/document/RSSReader.java b/source/net/yacy/cora/document/RSSReader.java index 0de7aa246..83b07af81 100644 --- a/source/net/yacy/cora/document/RSSReader.java +++ b/source/net/yacy/cora/document/RSSReader.java @@ -120,6 +120,11 @@ public class RSSReader extends DefaultHandler { item = new RSSMessage(); parsingChannel = true; } else if ("item".equals(tag)) { + if (parsingChannel) { + // the channel ends with the first item not with the channel close tag + theChannel.setChannel(item); + parsingChannel = false; + } item = new RSSMessage(); parsingItem = true; } else if ("image".equals(tag)) { @@ -132,7 +137,6 @@ public class RSSReader extends DefaultHandler { if (tag == null) return; if ("channel".equals(tag)) { parsingChannel = false; - theChannel.setChannel(item); } else if ("item".equals(tag)) { theChannel.addMessage(item); parsingItem = false; diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 33c8d5291..5450a4367 100644 --- 
a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -45,6 +45,7 @@ import net.yacy.document.parser.ooxmlParser; import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.pptParser; import net.yacy.document.parser.psParser; +import net.yacy.document.parser.rssParser; import net.yacy.document.parser.rtfParser; import net.yacy.document.parser.sevenzipParser; import net.yacy.document.parser.swfParser; @@ -81,6 +82,7 @@ public final class TextParser { initParser(new pdfParser()); initParser(new pptParser()); initParser(new psParser()); + initParser(new rssParser()); initParser(new rtfParser()); initParser(new sevenzipParser()); initParser(new swfParser()); diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java new file mode 100644 index 000000000..8328ba308 --- /dev/null +++ b/source/net/yacy/document/parser/rssParser.java @@ -0,0 +1,102 @@ +/** + * rssParser.java + * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany + * First released 20.08.2010 at http://yacy.net + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see <http://www.gnu.org/licenses/>. 
+ */ + + +package net.yacy.document.parser; + +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import net.yacy.cora.document.MultiProtocolURI; +import net.yacy.cora.document.RSSFeed; +import net.yacy.cora.document.RSSReader; +import net.yacy.cora.document.Hit; +import net.yacy.document.AbstractParser; +import net.yacy.document.Document; +import net.yacy.document.Parser; +import net.yacy.document.TextParser; +import net.yacy.document.parser.html.ImageEntry; + +public class rssParser extends AbstractParser implements Parser { + + public rssParser() { + super("RSS Parser"); + SUPPORTED_EXTENSIONS.add("rss"); + SUPPORTED_EXTENSIONS.add("xml"); + SUPPORTED_MIME_TYPES.add("XML"); + SUPPORTED_MIME_TYPES.add("text/rss"); + SUPPORTED_MIME_TYPES.add("application/rss+xml"); + SUPPORTED_MIME_TYPES.add("application/atom+xml"); + } + + public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException { + RSSReader rssReader; + try { + rssReader = new RSSReader(source); + } catch (IOException e) { + throw new Parser.Failure("Load error:" + e.getMessage(), url); + } + + RSSFeed feed = rssReader.getFeed(); + //RSSMessage channel = feed.getChannel(); + List docs = new ArrayList(); + MultiProtocolURI uri; + Set languages; + Map anchors; + Document doc; + for (Hit item: feed) try { + uri = new MultiProtocolURI(item.getLink()); + languages = new HashSet(); + languages.add(item.getLanguage()); + anchors = new HashMap(); + anchors.put(uri, item.getTitle()); + doc = new Document( + uri, + TextParser.mimeOf(url), + charset, + languages, + item.getSubject(), + item.getTitle(), + item.getAuthor(), + item.getCopyright(), + new String[0], + item.getDescription(), + null, + anchors, + new HashMap(), + false); + 
docs.add(doc); + } catch (MalformedURLException e) { + continue; + } + + Document[] da = new Document[docs.size()]; + docs.toArray(da); + return da; + } + +}