- removed old rss parser - removed unused rss parser libraries - added new rss reader - added previously removed FeedReader_p.java and adapted it to new rss parser - adapted parser interface for rss indexing to new rss parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3970 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
26ddf797eb
commit
9da0e53fe8
@ -0,0 +1,79 @@
|
||||
//FeedReader_p.java
|
||||
//------------
|
||||
// part of YACY
|
||||
//
|
||||
// (C) 2007 Alexander Schier
|
||||
//
|
||||
// last change: $LastChangedDate: $ by $LastChangedBy: $
|
||||
// $LastChangedRevision: $
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
|
||||
import de.anomic.http.httpHeader;
|
||||
import de.anomic.net.URL;
|
||||
import de.anomic.server.serverObjects;
|
||||
import de.anomic.server.serverSwitch;
|
||||
import de.anomic.server.servletProperties;
|
||||
import de.anomic.xml.rssReader;
|
||||
|
||||
// test url:
|
||||
// http://localhost:8080/FeedReader_p.html?url=http://www.tagesthemen.de/xml/rss2
|
||||
|
||||
public class FeedReader_p {
|
||||
|
||||
public static servletProperties respond(httpHeader header, serverObjects post, serverSwitch env) {
|
||||
servletProperties prop = new servletProperties();
|
||||
|
||||
prop.put("page", 0);
|
||||
if (post != null) {
|
||||
URL url;
|
||||
try {
|
||||
url = new URL((String) post.get("url"));
|
||||
} catch (MalformedURLException e) {
|
||||
prop.put("page", 2);
|
||||
return prop;
|
||||
}
|
||||
|
||||
// int maxitems=Integer.parseInt(post.get("max", "0"));
|
||||
// int offset=Integer.parseInt(post.get("offset", "0")); //offset to the first displayed item
|
||||
rssReader parser = new rssReader(url.toString());
|
||||
|
||||
prop.put("page_title", parser.getChannel().getTitle());
|
||||
if (parser.getChannel().getAuthor() == null) {
|
||||
prop.put("page_hasAuthor", 0);
|
||||
} else {
|
||||
prop.put("page_hasAuthor", 1);
|
||||
prop.put("page_hasAuthor_author", parser.getChannel().getAuthor());
|
||||
}
|
||||
prop.put("page_description", parser.getChannel().getDescription());
|
||||
|
||||
for (int i = 0; i < parser.items(); i++) {
|
||||
rssReader.Item item = parser.getItem(i);
|
||||
prop.put("page_items_" + i + "_author", item.getAuthor());
|
||||
prop.put("page_items_" + i + "_title", item.getTitle());
|
||||
prop.put("page_items_" + i + "_link", item.getLink());
|
||||
prop.putASIS("page_items_" + i + "_description", item.getDescription());
|
||||
prop.put("page_items_" + i + "_date", item.getPubDate());
|
||||
}
|
||||
prop.put("page_items", parser.items());
|
||||
prop.put("page", 1);
|
||||
}
|
||||
|
||||
// return rewrite properties
|
||||
return prop;
|
||||
}
|
||||
}
|
@ -1,126 +0,0 @@
|
||||
//rssReader.java
|
||||
//------------
|
||||
// part of YACY
|
||||
//
|
||||
// (C) 2007 Alexander Schier
|
||||
//
|
||||
// last change: $LastChangedDate: $ by $LastChangedBy: $
|
||||
// $LastChangedRevision: $
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.net.URL;
|
||||
import java.util.Collection;
|
||||
import java.util.Comparator;
|
||||
import java.util.Date;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
|
||||
import de.nava.informa.core.ChannelIF;
|
||||
import de.nava.informa.core.ParseException;
|
||||
import de.nava.informa.impl.basic.ChannelBuilder;
|
||||
import de.nava.informa.parsers.FeedParser;
|
||||
|
||||
import de.anomic.yacy.yacyCore;
|
||||
|
||||
public class rssReader {
|
||||
URL url;
|
||||
ChannelIF channel;
|
||||
TreeSet feedItems;
|
||||
public rssReader(String url) throws MalformedURLException{
|
||||
this.url=new URL(url);
|
||||
String yAddress=yacyCore.seedDB.resolveYacyAddress(this.url.getHost());
|
||||
if(yAddress != null){
|
||||
this.url=new URL(this.url.getProtocol()+"://"+yAddress+"/"+this.url.getPath());
|
||||
}
|
||||
ChannelBuilder builder=new ChannelBuilder();
|
||||
try {
|
||||
channel=FeedParser.parse(builder, this.url);
|
||||
Collection oldfeedItems=channel.getItems();
|
||||
feedItems=new TreeSet(new ItemComparator());
|
||||
Iterator it=oldfeedItems.iterator();
|
||||
int count=0;
|
||||
while(it.hasNext()){
|
||||
de.nava.informa.impl.basic.Item item=(de.nava.informa.impl.basic.Item) it.next();
|
||||
Item newItem=new Item(count++, item.getLink(), item.getTitle(), item.getDescription(), item.getDate(), item.getCreator());
|
||||
feedItems.add(newItem);
|
||||
}
|
||||
}
|
||||
catch (IOException e) {}
|
||||
catch (ParseException e) {}
|
||||
}
|
||||
public String getCreator(){
|
||||
return (channel!=null)? channel.getCreator(): null;
|
||||
}
|
||||
public String getTitle(){
|
||||
return (channel!=null)? channel.getTitle(): null;
|
||||
}
|
||||
public String getDescription(){
|
||||
return (channel!=null)? channel.getDescription(): null;
|
||||
}
|
||||
public Collection getFeedItems(){
|
||||
return feedItems;
|
||||
}
|
||||
|
||||
public class Item{
|
||||
String creator, title, description;
|
||||
Date date;
|
||||
URL link;
|
||||
int num;
|
||||
public Item(int num, URL link, String title, String description, Date date, String creator){
|
||||
this.link=link;
|
||||
this.title=title;
|
||||
this.description=description;
|
||||
this.date=date;
|
||||
this.creator=creator;
|
||||
this.num=num;
|
||||
}
|
||||
public URL getLink(){
|
||||
return link;
|
||||
}
|
||||
public String getTitle(){
|
||||
return (title!=null)? title: "";
|
||||
}
|
||||
public String getDescription(){
|
||||
return (description!=null)? description: "";
|
||||
}
|
||||
public Date getDate(){
|
||||
return (date!=null)? date: new Date();
|
||||
}
|
||||
public String getCreator(){
|
||||
return (creator!=null)? creator: "";
|
||||
}
|
||||
public int getNum(){
|
||||
return num;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
public class ItemComparator implements Comparator {
|
||||
public int compare(Object o1, Object o2){
|
||||
int num1=((Item)o1).getNum();
|
||||
int num2=((Item)o2).getNum();
|
||||
return num2-num1;
|
||||
}
|
||||
public boolean equals(Object o1, Object o2){
|
||||
return compare(o1, o2)==0;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,213 @@
|
||||
package de.anomic.xml;
|
||||
|
||||
import java.io.InputStream;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
|
||||
import javax.xml.parsers.SAXParser;
|
||||
import javax.xml.parsers.SAXParserFactory;
|
||||
|
||||
import org.xml.sax.Attributes;
|
||||
import org.xml.sax.SAXException;
|
||||
import org.xml.sax.helpers.DefaultHandler;
|
||||
|
||||
/**
 * A small SAX based RSS reader.
 *
 * The document is parsed in the constructor. Afterwards the channel data is
 * available through {@link #getChannel()} and the items through
 * {@link #getItem(int)} (document order) or {@link #getItem(String)} (by guid).
 * Parse errors are printed and otherwise ignored, so after a failed parse the
 * channel may be null and the item list empty or incomplete.
 */
public class rssReader extends DefaultHandler {

    // statics for item generation and automatic categorization
    // NOTE(review): guidcount is incremented without synchronization
    private static int guidcount = 0;

    // names of the child elements whose text is stored for a channel or item
    private static final String[] tagsDef = new String[]{
        "author",
        "copyright",
        "category",
        "title",
        "link",
        "language",
        "description",
        "creator",
        "pubDate",
        "guid",
        "docs"
    };

    private static final HashSet tags = new HashSet();
    static {
        for (int i = 0; i < tagsDef.length; i++) {
            tags.add(tagsDef[i]);
        }
    }

    // class variables
    private Item channel, item;
    private StringBuffer buffer;
    private boolean parsingChannel, parsingImage, parsingItem;
    private String imageURL;
    private ArrayList itemsGUID; // a list of GUIDs, so the items can be retrieved by a specific order
    private HashMap items; // a guid:Item map

    /**
     * Parses the document at the given location.
     *
     * @param path a URI understood by SAXParser.parse(String, DefaultHandler)
     */
    public rssReader(String path) {
        init();
        parse(path);
    }

    /**
     * Parses the document read from the given stream.
     *
     * @param stream the stream delivering the feed XML
     */
    public rssReader(InputStream stream) {
        init();
        parse(stream);
    }

    // resets the parser state
    private void init() {
        itemsGUID = new ArrayList();
        items = new HashMap();
        buffer = new StringBuffer();
        item = null;
        channel = null;
        parsingChannel = false;
        parsingImage = false;
        parsingItem = false;
    }

    // creates a parser from the default factory
    // NOTE(review): the factory is used with its default settings and feeds
    // are untrusted input; consider disabling DTDs / external entities
    private static SAXParser newParser() throws Exception {
        return SAXParserFactory.newInstance().newSAXParser();
    }

    private void parse(String path) {
        try {
            newParser().parse(path, this);
        } catch (Exception e) {
            // best effort: keep whatever was read before the failure
            e.printStackTrace();
        }
    }

    private void parse(InputStream stream) {
        try {
            newParser().parse(stream, this);
        } catch (Exception e) {
            // best effort: keep whatever was read before the failure
            e.printStackTrace();
        }
    }

    /**
     * Tracks entry into the channel, item and image elements.
     */
    public void startElement(String uri, String name, String tag, Attributes atts) throws SAXException {
        // text collected so far belongs to the enclosing element; drop it so
        // it cannot leak into the value of the element that starts here
        buffer.setLength(0);

        if ("channel".equals(tag)) {
            channel = new Item();
            parsingChannel = true;
        } else if ("item".equals(tag)) {
            item = new Item();
            parsingItem = true;
        } else if ("image".equals(tag)) {
            parsingImage = true;
        }
    }

    /**
     * Stores the collected text of a finished element in the image url, the
     * current item or the channel, and registers a finished item.
     */
    public void endElement(String uri, String name, String tag) {
        if (tag == null) return;

        if ("channel".equals(tag)) {
            parsingChannel = false;
        } else if ("item".equals(tag)) {
            String guid = item.getGuid();
            // a repeated guid replaces the stored item but keeps a single
            // position in the order list, so items() and getItem(int) agree
            if (!items.containsKey(guid)) {
                itemsGUID.add(guid);
            }
            items.put(guid, item);
            parsingItem = false;
        } else if ("image".equals(tag)) {
            parsingImage = false;
        } else if (parsingImage && parsingChannel) {
            // inside the channel image only the url is of interest
            String value = takeText();
            if ("url".equals(tag)) imageURL = value;
        } else if (parsingItem) {
            String value = takeText();
            if (tags.contains(tag)) item.setValue(tag, value);
        } else if (parsingChannel) {
            String value = takeText();
            if (tags.contains(tag)) channel.setValue(tag, value);
        }
    }

    // returns the trimmed buffer content and empties the buffer
    private String takeText() {
        String value = buffer.toString().trim();
        buffer.setLength(0);
        return value;
    }

    /**
     * Collects character data while inside a channel or an item.
     */
    public void characters(char ch[], int start, int length) {
        if (parsingItem || parsingChannel) {
            buffer.append(ch, start, length);
        }
    }

    /**
     * @return the channel data, or null if no channel element was read
     */
    public Item getChannel() {
        return channel;
    }

    /**
     * Retrieves an item by its order number.
     *
     * @param i position in document order, from 0 to items() - 1
     * @return the item at that position
     */
    public Item getItem(int i) {
        return getItem((String) itemsGUID.get(i));
    }

    /**
     * Retrieves an item by its guid.
     *
     * @param guid the guid of the item
     * @return the item, or null if there is no item with that guid
     */
    public Item getItem(String guid) {
        return (Item) items.get(guid);
    }

    /**
     * @return the number of items that were read
     */
    public int items() {
        return items.size();
    }

    /**
     * @return the url of the channel image, or null if none was read
     */
    public String getImage() {
        return this.imageURL;
    }

    /**
     * The values read for a channel or a single item. Every getter returns
     * null if the corresponding element was not present.
     */
    public static class Item {

        private HashMap map;

        /**
         * Creates an empty item with a generated guid (hex timestamp, a
         * colon and a running counter); a guid element read from the feed
         * replaces it.
         */
        public Item() {
            this.map = new HashMap();
            this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++);
        }

        /**
         * Stores a value under the given element name.
         */
        public void setValue(String name, String value) {
            map.put(name, value);
        }

        public String getAuthor() {
            return (String) map.get("author");
        }

        public String getCopyright() {
            return (String) map.get("copyright");
        }

        public String getCategory() {
            return (String) map.get("category");
        }

        public String getTitle() {
            return (String) map.get("title");
        }

        public String getLink() {
            return (String) map.get("link");
        }

        public String getLanguage() {
            return (String) map.get("language");
        }

        public String getDescription() {
            return (String) map.get("description");
        }

        public String getCreator() {
            return (String) map.get("creator");
        }

        public String getPubDate() {
            return (String) map.get("pubDate");
        }

        public String getGuid() {
            return (String) map.get("guid");
        }

        public String getDocs() {
            return (String) map.get("docs");
        }
    }
}
|
Loading…
Reference in new issue