repaired rss feed reader

- removed old rss parser - removed unused rss parser libraries - added new rss reader - added previously removed FeedReader_p.java and adapted it to new rss parser - adapted parser interface for rss indexing to new rss parser git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3970 6c8d7289-2bf4-0310-a012-ef5d649a1542
18 years ago · 9da0e53fe8
parent 26ddf797eb
commit 9da0e53fe8
7 changed files with 308 additions and 185 deletions
--- a/.classpath
+++ b/.classpath
@ -16,7 +16,6 @@
 	<classpathentry kind="lib" path="libx/commons-discovery.jar"/>
 	<classpathentry kind="lib" path="libx/commons-jxpath-1.1.jar"/>
 	<classpathentry kind="lib" path="libx/commons-logging.jar"/>
-	<classpathentry kind="lib" path="libx/informa-0.6.0.jar"/>
 	<classpathentry kind="lib" path="libx/jakarta-oro-2.0.7.jar"/>
 	<classpathentry kind="lib" path="libx/jaxrpc.jar"/>
 	<classpathentry kind="lib" path="libx/jdom.jar"/>
--- a/htroot/FeedReader_p.java
+++ b/htroot/FeedReader_p.java
@ -0,0 +1,79 @@
+//FeedReader_p.java
+//------------
+// part of YACY
+//
+// (C) 2007 Alexander Schier
+//
+// last change: $LastChangedDate:  $ by $LastChangedBy: $
+// $LastChangedRevision: $
+//
+// This program is free software; you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation; either version 2 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+import java.net.MalformedURLException;
+
+import de.anomic.http.httpHeader;
+import de.anomic.net.URL;
+import de.anomic.server.serverObjects;
+import de.anomic.server.serverSwitch;
+import de.anomic.server.servletProperties;
+import de.anomic.xml.rssReader;
+
+// test url:
+// http://localhost:8080/FeedReader_p.html?url=http://www.tagesthemen.de/xml/rss2
+
+/**
+ * Servlet class behind FeedReader_p.html: reads the RSS feed whose address is given in the
+ * "url" request parameter and copies the channel and item data into template properties.
+ */
+public class FeedReader_p {
+
+    /**
+     * Builds the template properties for one request.
+     *
+     * The "page" property is set to 0 when nothing was posted, to 1 after the feed data
+     * has been filled in, and to 2 when the url is missing or malformed or when the
+     * reader delivered no channel.
+     *
+     * @param header the request header (not used here)
+     * @param post   the request parameters, may be null
+     * @param env    the server environment (not used here)
+     * @return the properties for the template
+     */
+    public static servletProperties respond(httpHeader header, serverObjects post, serverSwitch env) {
+        servletProperties prop = new servletProperties();
+
+        prop.put("page", 0);
+        if (post == null) {
+            return prop;
+        }
+
+        // a request without a url parameter is handled like a malformed url
+        String urlParam = (String) post.get("url");
+        if (urlParam == null) {
+            prop.put("page", 2);
+            return prop;
+        }
+
+        URL url;
+        try {
+            url = new URL(urlParam);
+        } catch (MalformedURLException e) {
+            prop.put("page", 2);
+            return prop;
+        }
+
+        // int maxitems=Integer.parseInt(post.get("max", "0"));
+        // int offset=Integer.parseInt(post.get("offset", "0")); //offset to the first displayed item
+        rssReader parser = new rssReader(url.toString());
+
+        // rssReader swallows parse errors and only creates the channel when it sees a
+        // <channel> element, so the channel is null for unreachable or non-RSS documents
+        rssReader.Item channel = parser.getChannel();
+        if (channel == null) {
+            prop.put("page", 2);
+            return prop;
+        }
+
+        prop.put("page_title", channel.getTitle());
+        String channelAuthor = channel.getAuthor();
+        if (channelAuthor == null) {
+            prop.put("page_hasAuthor", 0);
+        } else {
+            prop.put("page_hasAuthor", 1);
+            prop.put("page_hasAuthor_author", channelAuthor);
+        }
+        prop.put("page_description", channel.getDescription());
+
+        int count = parser.items();
+        for (int i = 0; i < count; i++) {
+            rssReader.Item item = parser.getItem(i);
+            String prefix = "page_items_" + i;
+            prop.put(prefix + "_author", item.getAuthor());
+            prop.put(prefix + "_title", item.getTitle());
+            prop.put(prefix + "_link", item.getLink());
+            // the description is stored with putASIS instead of put, unlike the other fields
+            prop.putASIS(prefix + "_description", item.getDescription());
+            prop.put(prefix + "_date", item.getPubDate());
+        }
+        prop.put("page_items", count);
+        prop.put("page", 1);
+
+        // return rewrite properties
+        return prop;
+    }
+}
--- a/source/de/anomic/data/rssReader.java
+++ b/source/de/anomic/data/rssReader.java
@ -1,126 +0,0 @@
-//rssReader.java
-//------------
-// part of YACY
-//
-// (C) 2007 Alexander Schier
-//
-// last change: $LastChangedDate:  $ by $LastChangedBy: $
-// $LastChangedRevision: $
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-
-package de.anomic.data;
-
-import java.io.IOException;
-import java.net.MalformedURLException;
-import java.net.URL;
-import java.util.Collection;
-import java.util.Comparator;
-import java.util.Date;
-import java.util.Iterator;
-import java.util.TreeSet;
-
-import de.nava.informa.core.ChannelIF;
-import de.nava.informa.core.ParseException;
-import de.nava.informa.impl.basic.ChannelBuilder;
-import de.nava.informa.parsers.FeedParser;
-
-import de.anomic.yacy.yacyCore;
-
-public class rssReader {
-	URL url;
-	ChannelIF channel;
-	TreeSet feedItems;
-	public rssReader(String url) throws MalformedURLException{
-		this.url=new URL(url);
-		String yAddress=yacyCore.seedDB.resolveYacyAddress(this.url.getHost());
-		if(yAddress != null){
-			this.url=new URL(this.url.getProtocol()+"://"+yAddress+"/"+this.url.getPath());
-		}
-		ChannelBuilder builder=new ChannelBuilder();
-		try {
-			channel=FeedParser.parse(builder, this.url);
-			Collection oldfeedItems=channel.getItems();
-			feedItems=new TreeSet(new ItemComparator());
-			Iterator it=oldfeedItems.iterator();
-			int count=0;
-			while(it.hasNext()){
-				de.nava.informa.impl.basic.Item item=(de.nava.informa.impl.basic.Item) it.next();
-				Item newItem=new Item(count++, item.getLink(), item.getTitle(), item.getDescription(), item.getDate(), item.getCreator());
-				feedItems.add(newItem);
-			}
-		}
-		catch (IOException e) {} 
-		catch (ParseException e) {}
-	}
-	public String getCreator(){
-		return (channel!=null)? channel.getCreator(): null;
-	}
-	public String getTitle(){
-		return (channel!=null)? channel.getTitle(): null;
-	}
-	public String getDescription(){
-		return (channel!=null)? channel.getDescription(): null;
-	}
-	public Collection getFeedItems(){
-		return feedItems;
-	}
-    
-    public class Item{
-        String creator, title, description;
-        Date date;
-        URL link;
-        int num;
-        public Item(int num, URL link, String title, String description, Date date, String creator){
-            this.link=link;
-            this.title=title;
-            this.description=description;
-            this.date=date;
-            this.creator=creator;
-            this.num=num;
-        }
-        public URL getLink(){
-            return link;
-        }
-        public String getTitle(){
-            return (title!=null)? title: "";
-        }
-        public String getDescription(){
-            return (description!=null)? description: "";
-        }
-        public Date getDate(){
-            return (date!=null)? date: new Date();
-        }
-        public String getCreator(){
-            return (creator!=null)? creator: "";
-        }
-        public int getNum(){
-            return num;
-        }
-        
-    }
-    
-    public class ItemComparator implements Comparator {
-        public int compare(Object o1, Object o2){
-            int num1=((Item)o1).getNum();
-            int num2=((Item)o2).getNum();
-            return num2-num1;
-        }
-        public boolean equals(Object o1, Object o2){
-            return compare(o1, o2)==0;
-        }
-    }
-	
-}
--- a/source/de/anomic/plasma/parser/mimeType/rssDetector.java
+++ b/source/de/anomic/plasma/parser/mimeType/rssDetector.java
@ -48,7 +48,6 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
-import java.lang.reflect.Method;
 import java.util.Map;

 import net.sf.jmimemagic.MagicDetector;
@ -93,26 +92,7 @@ public class rssDetector implements MagicDetector {
    }
    
    private String[] detect(InputStream input) {
-        try {
-
-            // getting the format detector class
-            Class formatDetector = Class.forName("de.nava.informa.utils.FormatDetector");
-            
-            // getting the proper method
-            Method getFormat = formatDetector.getMethod("getFormat", new Class[]{InputStream.class});
-            
-            // invoke the method
-            Object format = getFormat.invoke(null, new Object[] {input});
-            
-            if (format == null) return null;
-            else if (format.toString().startsWith("RSS ")) return new String[]{"application/rss+xml"};
-            else if (format.toString().startsWith("Atom ")) return new String[]{"application/atom+xml"};
-            else return null;
-        } catch (Exception e) {
-            return null;
-        } catch (Error e) {
-            return null;
-        }        
+        return new String[]{"application/rss+xml"};
    }

 }
--- a/source/de/anomic/plasma/parser/rss/rssParser.java
+++ b/source/de/anomic/plasma/parser/rss/rssParser.java
@ -46,10 +46,8 @@ package de.anomic.plasma.parser.rss;
 import java.io.ByteArrayInputStream;
 import java.io.InputStream;
 import java.io.Writer;
-import java.util.Collection;
 import java.util.HashMap;
 import java.util.Hashtable;
-import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.Map;
 import java.util.TreeSet;
@ -66,11 +64,8 @@ import de.anomic.plasma.parser.ParserException;
 import de.anomic.server.serverByteBuffer;
 import de.anomic.server.serverCharBuffer;
 import de.anomic.server.serverFileUtils;
-import de.nava.informa.core.ChannelIF;
-import de.nava.informa.core.ImageIF;
-import de.nava.informa.impl.basic.ChannelBuilder;
-import de.nava.informa.impl.basic.Item;
-import de.nava.informa.parsers.FeedParser;
+import de.anomic.xml.rssReader;
+import de.anomic.xml.rssReader.Item;

 public class rssParser extends AbstractParser implements Parser {

@ -79,7 +74,7 @@ public class rssParser extends AbstractParser implements Parser {
     * @see #getSupportedMimeTypes()
     */  
    public static final Hashtable SUPPORTED_MIME_TYPES = new Hashtable();    
-    static { 
+    static {
        SUPPORTED_MIME_TYPES.put("text/rss","xml,rss,rdf"); 
        SUPPORTED_MIME_TYPES.put("application/rdf+xml","xml,rss,rdf");
        SUPPORTED_MIME_TYPES.put("application/rss+xml","xml,rss,rdf");
@ -90,11 +85,7 @@ public class rssParser extends AbstractParser implements Parser {
     * a list of library names that are needed by this parser
     * @see Parser#getLibxDependences()
     */
-    private static final String[] LIBX_DEPENDENCIES = new String[] {
-        "informa-0.6.0.jar",
-        "commons-logging.jar",
-        "jdom.jar"
-    };       
+    private static final String[] LIBX_DEPENDENCIES = new String[] {};       
    
 	public rssParser() {
 		super(LIBX_DEPENDENCIES);
@ -110,44 +101,32 @@ public class rssParser extends AbstractParser implements Parser {
            serverByteBuffer text = new serverByteBuffer();
            serverCharBuffer authors = new serverCharBuffer();
            
-            
-	        // creating a channel-builder
-	        ChannelBuilder builder = new ChannelBuilder();   
-            
-            // parsing the rss/atom feed
-	        ChannelIF channel = FeedParser.parse(builder, source);
+            rssReader reader = new rssReader(source);
            
            // getting the rss feed title and description
-            String feedTitle = channel.getTitle();
+            String feedTitle = reader.getChannel().getTitle();

            // getting feed creator
-			String feedCreator = channel.getCreator();
+			String feedCreator = reader.getChannel().getAuthor();
 			if (feedCreator != null && feedCreator.length() > 0) authors.append(",").append(feedCreator);            
            
            // getting the feed description
-            String feedDescription = channel.getDescription();
-            
-            // getting the channel site url
-            //URL	channelSiteURL = channel.getSite();
+            String feedDescription = reader.getChannel().getDescription();
            
-            ImageIF channelImage = channel.getImage();
-            if (channelImage != null) {
-                images.add(new htmlFilterImageEntry(new URL(channelImage.getLocation().toExternalForm()), channelImage.getTitle(), -1, -1));
+            if (reader.getImage() != null) {
+                images.add(new htmlFilterImageEntry(new URL(reader.getImage()), feedTitle, -1, -1));
            }            
            
            // loop through the feed items
-            Collection feedItemCollection = channel.getItems();
-            if (!feedItemCollection.isEmpty()) {
-				Iterator feedItemIterator = feedItemCollection.iterator();
-                while (feedItemIterator.hasNext()) {
+            for (int i = 0; i < reader.items(); i++) {
                    // check for interruption
                    checkInterruption();
                    
                    // getting the next item
-					Item item = (Item)feedItemIterator.next();	
+					Item item = reader.getItem(i);	
                    
        			String itemTitle = item.getTitle();
-        			URL    itemURL   = new URL(item.getLink().toExternalForm());
+        			URL    itemURL   = new URL(item.getLink());
        			String itemDescr = item.getDescription();
        			String itemCreator = item.getCreator();
        			if (itemCreator != null && itemCreator.length() > 0) authors.append(",").append(itemCreator);
@ -158,7 +137,7 @@ public class rssParser extends AbstractParser implements Parser {
                	if ((text.length() != 0) && (text.byteAt(text.length() - 1) != 32)) text.append((byte) 32);
                	text.append(new serverCharBuffer(htmlFilterAbstractScraper.stripAll(new serverCharBuffer(itemDescr.toCharArray()))).trim().toString()).append(' ');
                    
-                    String itemContent = item.getElementValue("content");
+                    String itemContent = item.getDescription();
                    if ((itemContent != null) && (itemContent.length() > 0)) {
                        
                        htmlFilterContentScraper scraper = new htmlFilterContentScraper(itemURL);
@ -187,7 +166,6 @@ public class rssParser extends AbstractParser implements Parser {
                        }
                        
                    }
-                }
            }
            
            plasmaParserDocument theDoc = new plasmaParserDocument(
--- a/source/de/anomic/soap/services/SearchService.java
+++ b/source/de/anomic/soap/services/SearchService.java
@ -186,7 +186,7 @@ public class SearchService extends AbstractService
    
    
    /**
-    * @param url the url
+    * @param link the url
    * @param viewMode one of (VIEW_MODE_AS_PLAIN_TEXT = 1,
    * VIEW_MODE_AS_PARSED_TEXT = 2,
    * VIEW_MODE_AS_PARSED_SENTENCES = 3) [Source: ViewFile.java]
--- a/source/de/anomic/xml/rssReader.java
+++ b/source/de/anomic/xml/rssReader.java
@ -0,0 +1,213 @@
+package de.anomic.xml;
+
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Small SAX based reader for RSS documents.
+ *
+ * The document is parsed in the constructor. Parsing is best effort: any exception is
+ * printed and swallowed, so after a failed parse {@link #getChannel()} may return null
+ * and {@link #items()} may return 0 or only the items read before the failure.
+ */
+public class rssReader extends DefaultHandler {
+
+    // statics for item generation and automatic categorization
+    private static int guidcount = 0; // running number appended to generated guids
+
+    // element names whose text content is stored in an Item
+    private static final String[] tagsDef = new String[]{
+        "author",
+        "copyright",
+        "category",
+        "title",
+        "link",
+        "language",
+        "description",
+        "creator",
+        "pubDate",
+        "guid",
+        "docs"
+        };
+
+    private static final HashSet tags = new HashSet();
+    static {
+        for (int i = 0; i < tagsDef.length; i++) {
+            tags.add(tagsDef[i]);
+        }
+    }
+
+    // class variables
+    private Item channel, item;
+    private StringBuffer buffer;
+    private boolean parsingChannel, parsingImage, parsingItem;
+    private String imageURL;
+    private ArrayList itemList; // all items in document order
+    private HashMap items; // a guid:Item map
+
+    /**
+     * Reads the feed from the given location.
+     *
+     * @param path the uri handed to the SAX parser
+     */
+    public rssReader(String path) {
+        init();
+        parse(path);
+    }
+
+    /**
+     * Reads the feed from the given stream.
+     *
+     * @param stream the stream handed to the SAX parser
+     */
+    public rssReader(InputStream stream) {
+        init();
+        parse(stream);
+    }
+
+    /** Resets all parser state. */
+    private void init() {
+        itemList = new ArrayList();
+        items = new HashMap();
+        buffer = new StringBuffer();
+        item = null;
+        channel = null;
+        parsingChannel = false;
+        parsingImage = false;
+        parsingItem = false;
+    }
+
+    /**
+     * Creates the SAX parser used by both parse methods.
+     *
+     * Feeds are fetched from arbitrary hosts, therefore resolving of external entities is
+     * switched off where the parser implementation allows it.
+     */
+    private static SAXParser newParser() throws javax.xml.parsers.ParserConfigurationException, SAXException {
+        SAXParserFactory factory = SAXParserFactory.newInstance();
+        disableFeature(factory, "http://xml.org/sax/features/external-general-entities");
+        disableFeature(factory, "http://xml.org/sax/features/external-parameter-entities");
+        return factory.newSAXParser();
+    }
+
+    /** Switches a parser feature off; a parser that does not know the feature keeps its default. */
+    private static void disableFeature(SAXParserFactory factory, String feature) {
+        try {
+            factory.setFeature(feature, false);
+        } catch (Exception e) {
+            // the hardening is optional, parsing must still work without it
+        }
+    }
+
+    private void parse(String path) {
+        try {
+            newParser().parse(path, this);
+        } catch (Exception e) {
+            // best effort: keep whatever was read before the failure
+            e.printStackTrace();
+        }
+    }
+
+    private void parse(InputStream stream) {
+        try {
+            newParser().parse(stream, this);
+        } catch (Exception e) {
+            // best effort: keep whatever was read before the failure
+            e.printStackTrace();
+        }
+    }
+
+    /**
+     * Maps an element name as written in the document to the key used in {@link #tags}.
+     *
+     * The parser is created without namespace support, so prefixed elements arrive with
+     * their prefix. The Dublin Core creator element is the one the "creator" key is meant
+     * for. Other prefixed names are left untouched so that for example an atom:link
+     * element cannot overwrite the plain link value.
+     */
+    private static String normalizeTag(String tag) {
+        return "dc:creator".equals(tag) ? "creator" : tag;
+    }
+
+    /** Returns the trimmed text collected so far and empties the buffer. */
+    private String takeText() {
+        String value = buffer.toString().trim();
+        buffer.setLength(0);
+        return value;
+    }
+
+    /** Tracks whether the parser is inside a channel, an item or an image element. */
+    public void startElement(String uri, String name, String tag, Attributes atts) throws SAXException {
+        if ("channel".equals(tag)) {
+            channel = new Item();
+            parsingChannel = true;
+        } else if ("item".equals(tag)) {
+            item = new Item();
+            parsingItem = true;
+        } else if ("image".equals(tag)) {
+            parsingImage = true;
+        }
+    }
+
+    /**
+     * Stores the text of a finished element in the current item or in the channel and
+     * registers a finished item.
+     */
+    public void endElement(String uri, String name, String tag) {
+        if (tag == null) return;
+        if ("channel".equals(tag)) {
+            parsingChannel = false;
+        } else if ("item".equals(tag)) {
+            // the list keeps every item, even when a feed repeats a guid;
+            // the map then points to the last item with that guid
+            itemList.add(item);
+            items.put(item.getGuid(), item);
+            parsingItem = false;
+        } else if ("image".equals(tag)) {
+            parsingImage = false;
+        } else if ((parsingImage) && (parsingChannel)) {
+            String value = takeText();
+            if ("url".equals(tag)) imageURL = value;
+        } else if (parsingItem)  {
+            // items are checked before the channel because RSS 2.0 nests items in the channel
+            String value = takeText();
+            String key = normalizeTag(tag);
+            if (tags.contains(key)) item.setValue(key, value);
+        } else if (parsingChannel) {
+            String value = takeText();
+            String key = normalizeTag(tag);
+            if (tags.contains(key)) channel.setValue(key, value);
+        }
+    }
+
+    /** Collects element text while inside an item or a channel. */
+    public void characters(char ch[], int start, int length) {
+        if (parsingItem || parsingChannel) {
+            buffer.append(ch, start, length);
+        }
+    }
+
+    /**
+     * @return the channel data, or null if no channel element was read
+     */
+    public Item getChannel() {
+        return channel;
+    }
+
+    /**
+     * Retrieves an item by its order number.
+     *
+     * @param i position in document order, from 0 to items() - 1
+     * @return the item at that position
+     */
+    public Item getItem(int i) {
+        return (Item) itemList.get(i);
+    }
+
+    /**
+     * Retrieves an item by guid.
+     *
+     * @param guid the guid of the item
+     * @return the item, or null if no item has this guid
+     */
+    public Item getItem(String guid) {
+        return (Item) items.get(guid);
+    }
+
+    /**
+     * @return the number of items that can be retrieved with {@link #getItem(int)}
+     */
+    public int items() {
+        return itemList.size();
+    }
+
+    /**
+     * @return the text of the url element of the channel image, or null if none was read
+     */
+    public String getImage() {
+        return this.imageURL;
+    }
+
+    /**
+     * A set of named text values; used for single items and for the channel.
+     * Every getter returns null when the element was not present.
+     */
+    public static class Item {
+
+        private HashMap map;
+
+        /** Creates an empty item with a generated guid; a guid element in the feed replaces it. */
+        public Item() {
+            this.map = new HashMap();
+            this.map.put("guid", Long.toHexString(System.currentTimeMillis()) + ":" + guidcount++);
+        }
+
+        public void setValue(String name, String value) {
+            map.put(name, value);
+        }
+
+        public String getAuthor() {
+            return (String) map.get("author");
+        }
+
+        public String getCopyright() {
+            return (String) map.get("copyright");
+        }
+
+        public String getCategory() {
+            return (String) map.get("category");
+        }
+
+        public String getTitle() {
+            return (String) map.get("title");
+        }
+
+        public String getLink() {
+            return (String) map.get("link");
+        }
+
+        public String getLanguage() {
+            return (String) map.get("language");
+        }
+
+        public String getDescription() {
+            return (String) map.get("description");
+        }
+
+        public String getCreator() {
+            return (String) map.get("creator");
+        }
+
+        public String getPubDate() {
+            return (String) map.get("pubDate");
+        }
+
+        public String getGuid() {
+            return (String) map.get("guid");
+        }
+
+        public String getDocs() {
+            return (String) map.get("docs");
+        }
+    }
+}