diff --git a/htroot/FeedReader_p.html b/htroot/FeedReader_p.html
deleted file mode 100644
index f31542047..000000000
--- a/htroot/FeedReader_p.html
+++ /dev/null
@@ -1,39 +0,0 @@
-
-
-
- YaCy '#[clientname]#': Feed Reader
- #%env/templates/metas.template%#
-
-
-#%env/templates/header.template%#
-#(page)#
-please select your feed with ?url=Feedurl&max=5&offset=1 (to be implemented in html ;))
-::
-
- - Title
- - #[title]#
- #(hasAuthor)#::- Author
- - #[author]#
#(/hasAuthor)#
- - Description
- - #[description]#
-
-
-
-#{items}#
- - #[title]#
- - #[description]#
-#{/items}#
-
-::
-
-Error:
-#(error)#
-You need to install libx
-::
-Problem with url
-#(/error)#
-test
-#(/page)#
-#%env/templates/footer.template%#
-
-
diff --git a/htroot/FeedReader_p.java b/htroot/FeedReader_p.java
deleted file mode 100644
index b1c26dbf5..000000000
--- a/htroot/FeedReader_p.java
+++ /dev/null
@@ -1,91 +0,0 @@
-//FeedReader_p.java
-//------------
-// part of YACY
-//
-// (C) 2007 Alexander Schier
-//
-//$LastChangedDate$
-//$LastChangedRevision$
-//$LastChangedBy$
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-
-import net.yacy.cora.document.Hit;
-import net.yacy.cora.document.RSSFeed;
-import net.yacy.cora.document.RSSReader;
-import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.logging.Log;
-import net.yacy.kelondro.util.DateFormatter;
-
-import de.anomic.http.server.RequestHeader;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-import de.anomic.server.servletProperties;
-
-// test url:
-// http://localhost:8080/FeedReader_p.html?url=http://www.tagesthemen.de/xml/rss2
-
-public class FeedReader_p {
-
- public static servletProperties respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
- final servletProperties prop = new servletProperties();
-
- prop.put("page", "0");
- if (post != null) {
- DigestURI url;
- try {
- url = new DigestURI(post.get("url"), null);
- } catch (final MalformedURLException e) {
- prop.put("page", "2");
- return prop;
- }
-
- // int maxitems=Integer.parseInt(post.get("max", "0"));
- // int offset=Integer.parseInt(post.get("offset", "0")); //offset to the first displayed item
- try {
- final RSSFeed feed = new RSSReader(url.toString()).getFeed();
-
- prop.putHTML("page_title", feed.getChannel().getTitle());
- if (feed.getChannel().getAuthor() == null) {
- prop.put("page_hasAuthor", "0");
- } else {
- prop.put("page_hasAuthor", "1");
- prop.putHTML("page_hasAuthor_author", feed.getChannel().getAuthor());
- }
- prop.putHTML("page_description", feed.getChannel().getDescription());
-
- int i = 0;
- for (final Hit item: feed) {
- prop.putHTML("page_items_" + i + "_author", item.getAuthor());
- prop.putHTML("page_items_" + i + "_title", item.getTitle());
- prop.putHTML("page_items_" + i + "_link", item.getLink());
- prop.putHTML("page_items_" + i + "_description", item.getDescription());
- prop.putHTML("page_items_" + i + "_date", DateFormatter.formatShortSecond(item.getPubDate()));
- i++;
- }
- prop.put("page_items", feed.size());
- prop.put("page", "1");
- } catch (IOException e) {
- Log.logException(e);
- }
- }
-
- // return rewrite properties
- return prop;
- }
-}
diff --git a/htroot/ConfigWikiSearch.html b/htroot/Load_MediawikiWiki.html
similarity index 100%
rename from htroot/ConfigWikiSearch.html
rename to htroot/Load_MediawikiWiki.html
diff --git a/htroot/ConfigWikiSearch.java b/htroot/Load_MediawikiWiki.java
similarity index 98%
rename from htroot/ConfigWikiSearch.java
rename to htroot/Load_MediawikiWiki.java
index ece8113ec..db1714f19 100644
--- a/htroot/ConfigWikiSearch.java
+++ b/htroot/Load_MediawikiWiki.java
@@ -30,7 +30,7 @@ import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
-public class ConfigWikiSearch {
+public class Load_MediawikiWiki {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
diff --git a/htroot/ConfigPHPBB3Search.html b/htroot/Load_PHPBB3.html
similarity index 100%
rename from htroot/ConfigPHPBB3Search.html
rename to htroot/Load_PHPBB3.html
diff --git a/htroot/ConfigPHPBB3Search.java b/htroot/Load_PHPBB3.java
similarity index 98%
rename from htroot/ConfigPHPBB3Search.java
rename to htroot/Load_PHPBB3.java
index 5663e1322..f93431068 100644
--- a/htroot/ConfigPHPBB3Search.java
+++ b/htroot/Load_PHPBB3.java
@@ -30,7 +30,7 @@ import de.anomic.search.SwitchboardConstants;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
-public class ConfigPHPBB3Search {
+public class Load_PHPBB3 {
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
// return variable that accumulates replacements
diff --git a/htroot/Load_RSS_p.html b/htroot/Load_RSS_p.html
new file mode 100644
index 000000000..4776b39a3
--- /dev/null
+++ b/htroot/Load_RSS_p.html
@@ -0,0 +1,90 @@
+
+
+
+ YaCy '#[clientname]#': Loading of RSS Feeds
+ #%env/templates/metas.template%#
+
+
+
+
+ #%env/templates/header.template%#
+ #%env/templates/submenuIndexCreate.template%#
+ Loading of RSS Feeds
+
+ RSS feeds can be loaded into the YaCy search index.
+ This does not put the RSS file itself into the index; instead, every message inside the feed is indexed as an individual document.
+
+
+
+
+ #(showitems)#::
+
+ #(/showitems)#
+
+ #%env/templates/footer.template%#
+
+
diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java
new file mode 100644
index 000000000..dc8693cf1
--- /dev/null
+++ b/htroot/Load_RSS_p.java
@@ -0,0 +1,114 @@
+/**
+ * Load_RSS_p
+ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
+ * First released 20.08.2010 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.text.DateFormat;
+
+import net.yacy.cora.document.Hit;
+import net.yacy.cora.document.RSSFeed;
+import net.yacy.cora.document.RSSMessage;
+import net.yacy.cora.document.RSSReader;
+import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
+
+import de.anomic.crawler.CrawlProfile;
+import de.anomic.crawler.retrieval.Response;
+import de.anomic.http.server.RequestHeader;
+import de.anomic.search.Switchboard;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+
+public class Load_RSS_p {
+    /**
+     * Servlet behind Load_RSS_p.html: loads the feed named by the "url" argument and lists its messages.
+     * @param header the request header (not used here)
+     * @param post the request arguments, may be null
+     * @param env the server environment, must be the Switchboard
+     * @return the template values; "showitems" is 1 only if a feed with channel data could be loaded
+     */
+    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
+        final serverObjects prop = new serverObjects();
+        final Switchboard sb = (Switchboard)env;
+        // defaults: nothing to show and nothing to load
+        prop.put("showitems", 0);
+        prop.put("showload", 0);
+        prop.put("url", "");
+        if (post == null) return prop;
+        // the url is user input that is echoed into the page, so it is written HTML-escaped
+        prop.putHTML("url", post.get("url", ""));
+        DigestURI url = null;
+        try {
+            url = post.containsKey("url") ? new DigestURI(post.get("url", ""), null) : null;
+        } catch (MalformedURLException e) {
+            Log.logException(e);
+        }
+        // if we have an url then try to load the rss
+        RSSReader rss = null;
+        if (url != null) try {
+            prop.putHTML("url", url.toNormalform(true, false));
+            Response entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
+            byte[] resource = entry == null ? null : entry.getContent();
+            rss = resource == null ? null : RSSReader.parse(resource);
+        } catch (IOException e) {
+            Log.logException(e);
+        }
+        // RSSReader assigns the channel when it meets the first item, so a feed without items may have none
+        final RSSFeed feed = rss == null ? null : rss.getFeed();
+        final RSSMessage channel = feed == null ? null : feed.getChannel();
+        if (channel == null) return prop;
+        // NOTE(review): getPubDate() is assumed to return null for a missing date, which format() rejects - confirm
+        final DateFormat dateFormat = DateFormat.getDateTimeInstance();
+        prop.put("showitems", 1);
+        prop.putHTML("showitems_title", channel.getTitle());
+        String author = channel.getAuthor();
+        if (author == null || author.length() == 0) author = channel.getCopyright();
+        prop.putHTML("showitems_author", author == null ? "" : author);
+        prop.putHTML("showitems_description", channel.getDescription());
+        prop.putHTML("showitems_language", channel.getLanguage());
+        prop.putHTML("showitems_date", channel.getPubDate() == null ? "" : dateFormat.format(channel.getPubDate()));
+        prop.putHTML("showitems_ttl", channel.getTTL());
+        prop.putHTML("showitems_docs", channel.getDocs());
+
+        int i = 0;
+        for (final Hit item: feed) {
+            try {
+                url = new DigestURI(item.getLink(), null);
+                author = item.getAuthor();
+                if (author == null || author.length() == 0) author = item.getCopyright();
+                prop.put("showitems_item_" + i + "_count", i);
+                prop.putHTML("showitems_item_" + i + "_hash", new String(url.hash()));
+                prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
+                prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
+                prop.putHTML("showitems_item_" + i + "_link", url.toNormalform(false, false));
+                prop.putHTML("showitems_item_" + i + "_description", item.getDescription());
+                prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
+                prop.putHTML("showitems_item_" + i + "_date", item.getPubDate() == null ? "" : dateFormat.format(item.getPubDate()));
+                i++;
+            } catch (MalformedURLException e) {
+                Log.logException(e); // the message is left out of the list; i is not advanced for it
+            }
+        }
+        prop.put("showitems_item", i);
+        prop.put("showitems_num", i);
+        if (i > 0) prop.put("showload", 1);
+        return prop;
+    }
+}
diff --git a/htroot/RSSLoader_p.java b/htroot/RSSLoader_p.java
deleted file mode 100644
index a4df0b070..000000000
--- a/htroot/RSSLoader_p.java
+++ /dev/null
@@ -1,97 +0,0 @@
-//ViewFile.java
-//-----------------------
-//part of YaCy
-//(C) by Michael Peter Christen; mc@yacy.net
-//first published on http://www.anomic.de
-//Frankfurt, Germany, 2004
-
-//last major change: 12.07.2004
-
-//This program is free software; you can redistribute it and/or modify
-//it under the terms of the GNU General Public License as published by
-//the Free Software Foundation; either version 2 of the License, or
-//(at your option) any later version.
-
-//This program is distributed in the hope that it will be useful,
-//but WITHOUT ANY WARRANTY; without even the implied warranty of
-//MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-//GNU General Public License for more details.
-
-//You should have received a copy of the GNU General Public License
-//along with this program; if not, write to the Free Software
-//Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-//you must compile this file with
-//javac -classpath .:../Classes Status.java
-//if the shell's current path is HTROOT
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-
-import net.yacy.cora.document.RSSReader;
-import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.logging.Log;
-
-import de.anomic.crawler.CrawlProfile;
-import de.anomic.crawler.retrieval.Response;
-import de.anomic.http.server.RequestHeader;
-import de.anomic.search.Switchboard;
-import de.anomic.server.serverObjects;
-import de.anomic.server.serverSwitch;
-
-public class RSSLoader_p {
-
- public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
-
- final serverObjects prop = new serverObjects();
- final Switchboard sb = (Switchboard)env;
-
- if (post == null) {
- return prop;
- }
-
- DigestURI url = null;
-
- final String urlString = post.get("url", "");
- if (urlString.length() > 0) try {
- url = new DigestURI(urlString, null);
- } catch (final MalformedURLException e) {
- return prop;
- }
-
-
- // if the resource body was not cached we try to load it from web
- Response entry = null;
- try {
- entry = sb.loader.load(sb.loader.request(url, true, false), CrawlProfile.CacheStrategy.NOCACHE, Long.MAX_VALUE);
- } catch (final Exception e) {
- return prop;
- }
- if (entry == null) return prop;
-
- byte[] resource = entry.getContent();
-
- if (resource == null) {
- return prop;
- }
-
- // now parse the content as rss
- RSSReader rss;
- try {
- rss = RSSReader.parse(resource);
- } catch (IOException e) {
- Log.logException(e);
- return prop;
- }
-
- // get the links out of the rss
- //Map map = doc.getAnchors();
-
- // put the urls into crawler using the proxy profile
-
-
-
- return prop;
- }
-
-}
diff --git a/htroot/env/templates/submenuIndexCreate.template b/htroot/env/templates/submenuIndexCreate.template
index 2039e717c..0ce56a761 100644
--- a/htroot/env/templates/submenuIndexCreate.template
+++ b/htroot/env/templates/submenuIndexCreate.template
@@ -2,8 +2,9 @@
Index Creation
\ No newline at end of file
diff --git a/source/net/yacy/cora/document/RSSMessage.java b/source/net/yacy/cora/document/RSSMessage.java
index 341da7500..b79c83942 100644
--- a/source/net/yacy/cora/document/RSSMessage.java
+++ b/source/net/yacy/cora/document/RSSMessage.java
@@ -39,7 +39,7 @@ public class RSSMessage implements Hit {
title("title"),
link("link"),
description("description"),
- pubDate("pubDate"),
+ pubDate("pubDate,lastBuildDate"),
copyright("copyright,dc:publisher,publisher"),
author("author,dc:creator,creator"),
subject("subject,dc:subject"),
@@ -47,6 +47,7 @@ public class RSSMessage implements Hit {
referrer("referrer,referer"),
language("language"),
guid("guid"),
+ ttl("ttl"),
docs("docs");
private Set keys;
@@ -159,6 +160,10 @@ public class RSSMessage implements Hit {
return Token.guid.valueFrom(this.map);
}
+ public String getTTL() {
+ return Token.ttl.valueFrom(this.map);
+ }
+
public String getDocs() {
return Token.docs.valueFrom(this.map);
}
diff --git a/source/net/yacy/cora/document/RSSReader.java b/source/net/yacy/cora/document/RSSReader.java
index 0de7aa246..83b07af81 100644
--- a/source/net/yacy/cora/document/RSSReader.java
+++ b/source/net/yacy/cora/document/RSSReader.java
@@ -120,6 +120,11 @@ public class RSSReader extends DefaultHandler {
item = new RSSMessage();
parsingChannel = true;
} else if ("item".equals(tag)) {
+ if (parsingChannel) {
+ // the channel ends with the first item not with the channel close tag
+ theChannel.setChannel(item);
+ parsingChannel = false;
+ }
item = new RSSMessage();
parsingItem = true;
} else if ("image".equals(tag)) {
@@ -132,7 +137,6 @@ public class RSSReader extends DefaultHandler {
if (tag == null) return;
if ("channel".equals(tag)) {
parsingChannel = false;
- theChannel.setChannel(item);
} else if ("item".equals(tag)) {
theChannel.addMessage(item);
parsingItem = false;
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index 33c8d5291..5450a4367 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -45,6 +45,7 @@ import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
import net.yacy.document.parser.pptParser;
import net.yacy.document.parser.psParser;
+import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
import net.yacy.document.parser.swfParser;
@@ -81,6 +82,7 @@ public final class TextParser {
initParser(new pdfParser());
initParser(new pptParser());
initParser(new psParser());
+ initParser(new rssParser());
initParser(new rtfParser());
initParser(new sevenzipParser());
initParser(new swfParser());
diff --git a/source/net/yacy/document/parser/rssParser.java b/source/net/yacy/document/parser/rssParser.java
new file mode 100644
index 000000000..8328ba308
--- /dev/null
+++ b/source/net/yacy/document/parser/rssParser.java
@@ -0,0 +1,102 @@
+/**
+ * rssParser.java
+ * Copyright 2010 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
+ * First released 20.08.2010 at http://yacy.net
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program in the file lgpl21.txt
+ * If not, see <http://www.gnu.org/licenses/>.
+ */
+
+
+package net.yacy.document.parser;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.document.RSSFeed;
+import net.yacy.cora.document.RSSReader;
+import net.yacy.cora.document.Hit;
+import net.yacy.document.AbstractParser;
+import net.yacy.document.Document;
+import net.yacy.document.Parser;
+import net.yacy.document.TextParser;
+import net.yacy.document.parser.html.ImageEntry;
+
+/** Parser for RSS feeds: creates one document for every feed message that has a valid link. */
+public class rssParser extends AbstractParser implements Parser {
+    public rssParser() {
+        super("RSS Parser");
+        SUPPORTED_EXTENSIONS.add("rss");
+        SUPPORTED_EXTENSIONS.add("xml");
+        SUPPORTED_MIME_TYPES.add("XML");
+        SUPPORTED_MIME_TYPES.add("text/rss");
+        SUPPORTED_MIME_TYPES.add("application/rss+xml");
+        SUPPORTED_MIME_TYPES.add("application/atom+xml");
+    }
+
+    /**
+     * Reads a feed from the stream and turns each of its messages into a document of its own.
+     * @param url the location of the feed; its mime type is given to every document and it is named in the failure
+     * @param mimeType the mime type of the feed (not used here)
+     * @param charset the charset name that is passed on to every created document
+     * @param source the stream with the feed content
+     * @return the created documents in iteration order of the feed; empty if no message has a valid link
+     * @throws Failure if the feed cannot be read from the stream
+     */
+    public Document[] parse(MultiProtocolURI url, String mimeType, String charset, InputStream source) throws Failure, InterruptedException {
+        RSSReader rssReader;
+        try {
+            rssReader = new RSSReader(source);
+        } catch (IOException e) {
+            throw new Parser.Failure("Load error:" + e.getMessage(), url);
+        }
+
+        RSSFeed feed = rssReader.getFeed();
+        List<Document> docs = new ArrayList<Document>();
+        for (Hit item: feed) try {
+            final MultiProtocolURI uri = new MultiProtocolURI(item.getLink());
+            final Set<String> languages = new HashSet<String>();
+            languages.add(item.getLanguage());
+            final Map<MultiProtocolURI, String> anchors = new HashMap<MultiProtocolURI, String>();
+            anchors.put(uri, item.getTitle());
+            docs.add(new Document(
+                    uri,
+                    TextParser.mimeOf(url),
+                    charset,
+                    languages,
+                    item.getSubject(),
+                    item.getTitle(),
+                    item.getAuthor(),
+                    item.getCopyright(),
+                    new String[0],
+                    item.getDescription(),
+                    null,
+                    anchors,
+                    new HashMap<MultiProtocolURI, ImageEntry>(),
+                    false));
+        } catch (MalformedURLException e) {
+            // a message without a valid link cannot become a document, so it is skipped
+            continue;
+        }
+        return docs.toArray(new Document[docs.size()]);
+    }
+}