From 63a0255166ab8d7e27eb072449c936f1c07a7c2d Mon Sep 17 00:00:00 2001 From: orbiter Date: Tue, 26 May 2009 07:44:22 +0000 Subject: [PATCH] - refactoring: added new content package, which will contain connector classes for different types of data sources to import texts into the YaCy index - refactoring: migrated data objects for the new connector classes - added a DAO interface class to specify an abstract interface for database retrieval connector methods git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5977 6c8d7289-2bf4-0310-a012-ef5d649a1542 --- htroot/FeedReader_p.java | 2 +- htroot/api/feed.java | 2 +- htroot/rct_p.java | 2 +- htroot/yacy/search.java | 2 +- htroot/yacy/transferRWI.java | 2 +- htroot/yacy/transferURL.java | 2 +- htroot/yacysearch.java | 2 +- .../Surrogate.java => content/DCEntry.java} | 16 +++- .../anomic/{xml => content}/RSSMessage.java | 2 +- source/de/anomic/content/dao/Dao.java | 82 +++++++++++++++++++ .../file}/SurrogateReader.java | 18 ++-- source/de/anomic/crawler/CrawlQueues.java | 2 +- .../anomic/plasma/parser/rss/rssParser.java | 2 +- .../de/anomic/plasma/plasmaSwitchboard.java | 6 +- source/de/anomic/plasma/plasmaWordIndex.java | 2 +- source/de/anomic/xml/RSSFeed.java | 2 + source/de/anomic/xml/RSSReader.java | 1 + source/de/anomic/yacy/yacyCore.java | 2 +- source/de/anomic/yacy/yacyPeerActions.java | 2 +- 19 files changed, 122 insertions(+), 29 deletions(-) rename source/de/anomic/{crawler/Surrogate.java => content/DCEntry.java} (92%) rename source/de/anomic/{xml => content}/RSSMessage.java (99%) create mode 100644 source/de/anomic/content/dao/Dao.java rename source/de/anomic/{xml => content/file}/SurrogateReader.java (91%) diff --git a/htroot/FeedReader_p.java b/htroot/FeedReader_p.java index b59217bc9..45cc1defa 100644 --- a/htroot/FeedReader_p.java +++ b/htroot/FeedReader_p.java @@ -24,12 +24,12 @@ import java.io.IOException; import java.net.MalformedURLException; +import de.anomic.content.RSSMessage; import 
de.anomic.http.httpRequestHeader; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.server.servletProperties; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.xml.RSSReader; import de.anomic.yacy.yacyURL; diff --git a/htroot/api/feed.java b/htroot/api/feed.java index 134f43376..c1bfdefb0 100755 --- a/htroot/api/feed.java +++ b/htroot/api/feed.java @@ -2,12 +2,12 @@ import java.util.Date; +import de.anomic.content.RSSMessage; import de.anomic.http.httpRequestHeader; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; public class feed { diff --git a/htroot/rct_p.java b/htroot/rct_p.java index 02b2f2de4..185e9f6f3 100644 --- a/htroot/rct_p.java +++ b/htroot/rct_p.java @@ -30,6 +30,7 @@ import java.text.ParseException; import java.util.Date; import java.util.Iterator; +import de.anomic.content.RSSMessage; import de.anomic.crawler.CrawlEntry; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.util.DateFormatter; @@ -37,7 +38,6 @@ import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacyURL; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 1daf76879..376cd7150 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -34,6 +34,7 @@ import java.util.Map; import java.util.Set; import java.util.TreeSet; +import de.anomic.content.RSSMessage; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.order.Base64Order; import de.anomic.kelondro.order.Bitfield; @@ -54,7 +55,6 @@ import de.anomic.server.serverSwitch; import de.anomic.tools.crypt; import de.anomic.tools.iso639; import 
de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacySeed; diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java index 40deb0bbf..153645661 100644 --- a/htroot/yacy/transferRWI.java +++ b/htroot/yacy/transferRWI.java @@ -32,6 +32,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; +import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.text.referencePrototype.WordReferenceRow; @@ -43,7 +44,6 @@ import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacySeed; diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index b0a50f438..0d5747abe 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -29,6 +29,7 @@ import java.io.IOException; import java.text.ParseException; +import de.anomic.content.RSSMessage; import de.anomic.data.Blacklist; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; @@ -38,7 +39,6 @@ import de.anomic.server.serverCore; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNetwork; import de.anomic.yacy.yacySeed; diff --git a/htroot/yacysearch.java b/htroot/yacysearch.java index ae0bdb214..537d6c430 100644 --- a/htroot/yacysearch.java +++ b/htroot/yacysearch.java @@ -32,6 +32,7 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.TreeSet; +import de.anomic.content.RSSMessage; import de.anomic.http.httpRequestHeader; import de.anomic.kelondro.order.Bitfield; 
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow; @@ -57,7 +58,6 @@ import de.anomic.server.serverSwitch; import de.anomic.tools.iso639; import de.anomic.tools.Formatter; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyNewsPool; import de.anomic.yacy.yacyNewsRecord; import de.anomic.yacy.yacyURL; diff --git a/source/de/anomic/crawler/Surrogate.java b/source/de/anomic/content/DCEntry.java similarity index 92% rename from source/de/anomic/crawler/Surrogate.java rename to source/de/anomic/content/DCEntry.java index 173083217..09df00279 100644 --- a/source/de/anomic/crawler/Surrogate.java +++ b/source/de/anomic/content/DCEntry.java @@ -1,4 +1,4 @@ -// Surrogate.java +// DCEntry.java // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 15.04.2009 on http://yacy.net // @@ -23,7 +23,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.crawler; +package de.anomic.content; import java.net.MalformedURLException; import java.text.ParseException; @@ -35,10 +35,11 @@ import de.anomic.kelondro.util.DateFormatter; import de.anomic.plasma.plasmaParserDocument; import de.anomic.yacy.yacyURL; -public class Surrogate extends HashMap { +public class DCEntry extends HashMap { + private static final long serialVersionUID = -2050291583515701559L; - public Surrogate() { + public DCEntry() { super(); } @@ -73,6 +74,7 @@ public class Surrogate extends HashMap { return new Date(); } } + public yacyURL url() { String u = this.get("url"); if (u == null) u = this.get("dc:Identifier"); @@ -84,11 +86,13 @@ public class Surrogate extends HashMap { return null; } } + public String language() { String l = this.get("language"); if (l == null) l = this.get("dc:Language"); if (l == null) return "en"; else return l; } + public String title() { String t = this.get("title"); if (t == null) t = this.get("dc:Title"); @@ -96,6 +100,7 @@ public class Surrogate 
extends HashMap { if (t == null) return ""; return t; } + public String body() { String t = this.get("body"); if (t == null) t = this.get("dc:Description"); @@ -103,6 +108,7 @@ public class Surrogate extends HashMap { if (t == null) return ""; return t; } + public String[] categories() { String t = this.get("categories"); if (t == null) this.get("dc:Subject"); @@ -110,6 +116,7 @@ public class Surrogate extends HashMap { if (t == null) return new String[]{}; return t.split(";"); } + private String stripCDATA(String s) { if (s == null) return null; s = s.trim(); @@ -117,6 +124,7 @@ public class Surrogate extends HashMap { if (s.endsWith("]]")) s = s.substring(0, s.length() - 2); return s; } + public plasmaParserDocument document() { HashSet languages = new HashSet(); languages.add(language()); diff --git a/source/de/anomic/xml/RSSMessage.java b/source/de/anomic/content/RSSMessage.java similarity index 99% rename from source/de/anomic/xml/RSSMessage.java rename to source/de/anomic/content/RSSMessage.java index 430ceb52d..9b45dcd1b 100644 --- a/source/de/anomic/xml/RSSMessage.java +++ b/source/de/anomic/content/RSSMessage.java @@ -25,7 +25,7 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.xml; +package de.anomic.content; import java.util.Date; import java.util.HashMap; diff --git a/source/de/anomic/content/dao/Dao.java b/source/de/anomic/content/dao/Dao.java new file mode 100644 index 000000000..143678fdc --- /dev/null +++ b/source/de/anomic/content/dao/Dao.java @@ -0,0 +1,82 @@ +// Dao.java +// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany +// first published 25.05.2009 on http://yacy.net +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +package de.anomic.content.dao; + +import java.util.ArrayList; +import java.util.Date; + +import de.anomic.content.DCEntry; + +/* + * A Database Access Object (DAO) provides a normalized view of database content as Java objects + */ +public interface Dao { + + // item-oriented retrieval + + /** + * get the maximum number of items in the database + */ + public int maxItems(); + + /** + * retrieve a single item from the database + * @param item the number of the item to retrieve + * @return the entry for the requested item + */ + public DCEntry get(int item); + + /** + * retrieve a list of entries in the database; + * the object denoted with until is not contained in the list + * @param from the item number where the list starts + * @param until the item number where the list ends; this item is not contained in the list + * @return the list of entries + */ + public ArrayList get(int from, int until); + + + // date-oriented retrieval + + /** + * return the date of the first entry + */ + public Date firstEntry(); + + /** + * return the date of the latest entry + * @return the date of the latest entry + */ + public Date latestEntry(); + + /** + * get a list of entries in the database; + * the returned list contains all entries up to the most recent + * @param from the date from which entries are retrieved + * 
@return + */ + public ArrayList get(Date from); + +} diff --git a/source/de/anomic/xml/SurrogateReader.java b/source/de/anomic/content/file/SurrogateReader.java similarity index 91% rename from source/de/anomic/xml/SurrogateReader.java rename to source/de/anomic/content/file/SurrogateReader.java index 002229629..54d84e3e1 100644 --- a/source/de/anomic/xml/SurrogateReader.java +++ b/source/de/anomic/content/file/SurrogateReader.java @@ -22,7 +22,7 @@ // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -package de.anomic.xml; +package de.anomic.content.file; import java.io.BufferedInputStream; import java.io.File; @@ -40,18 +40,18 @@ import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import de.anomic.crawler.Surrogate; +import de.anomic.content.DCEntry; public class SurrogateReader extends DefaultHandler implements Runnable { - public static final Surrogate poison = new Surrogate(); + public static final DCEntry poison = new DCEntry(); // class variables private final StringBuilder buffer; private boolean parsingValue; - private Surrogate surrogate; + private DCEntry surrogate; private String elementName; - private BlockingQueue surrogates; + private BlockingQueue surrogates; private SAXParser saxParser; private InputStream stream; @@ -60,7 +60,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { this.parsingValue = false; this.surrogate = null; this.elementName = null; - this.surrogates = new ArrayBlockingQueue(queueSize); + this.surrogates = new ArrayBlockingQueue(queueSize); this.stream = stream; final SAXParserFactory factory = SAXParserFactory.newInstance(); try { @@ -97,7 +97,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException { if 
("record".equals(tag) || "document".equals(tag)) { - this.surrogate = new Surrogate(); + this.surrogate = new DCEntry(); } else if ("element".equals(tag)) { this.elementName = atts.getValue("name"); } else if ("value".equals(tag)) { @@ -151,7 +151,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { } } - public Surrogate take() { + public DCEntry take() { try { return this.surrogates.take(); } catch (InterruptedException e) { @@ -168,7 +168,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath()); t.start(); - Surrogate s; + DCEntry s; System.out.println("1"); while ((s = sr.take()) != SurrogateReader.poison) { System.out.println("Title: " + s.title()); diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java index dbf57977a..66053e208 100644 --- a/source/de/anomic/crawler/CrawlQueues.java +++ b/source/de/anomic/crawler/CrawlQueues.java @@ -36,6 +36,7 @@ import java.util.Iterator; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import de.anomic.content.RSSMessage; import de.anomic.http.httpClient; import de.anomic.kelondro.table.FlexWidthArray; import de.anomic.kelondro.util.DateFormatter; @@ -47,7 +48,6 @@ import de.anomic.plasma.plasmaSwitchboardConstants; import de.anomic.plasma.parser.Document; import de.anomic.server.serverProcessorJob; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacySeed; import de.anomic.yacy.yacyURL; diff --git a/source/de/anomic/plasma/parser/rss/rssParser.java b/source/de/anomic/plasma/parser/rss/rssParser.java index 58dc41ac7..0a9581fcf 100644 --- a/source/de/anomic/plasma/parser/rss/rssParser.java +++ b/source/de/anomic/plasma/parser/rss/rssParser.java @@ -34,6 +34,7 @@ import java.util.Hashtable; import java.util.LinkedList; import java.util.Map; +import 
de.anomic.content.RSSMessage; import de.anomic.htmlFilter.htmlFilterAbstractScraper; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterImageEntry; @@ -46,7 +47,6 @@ import de.anomic.plasma.parser.Parser; import de.anomic.plasma.parser.ParserException; import de.anomic.server.serverCharBuffer; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.xml.RSSReader; import de.anomic.yacy.yacyURL; diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 095bbfc9a..2c9b6ca99 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -111,6 +111,8 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; +import de.anomic.content.DCEntry; +import de.anomic.content.file.SurrogateReader; import de.anomic.crawler.CrawlEntry; import de.anomic.crawler.CrawlProfile; import de.anomic.crawler.CrawlQueues; @@ -124,7 +126,6 @@ import de.anomic.crawler.ResourceObserver; import de.anomic.crawler.ResultImages; import de.anomic.crawler.ResultURLs; import de.anomic.crawler.RobotsTxt; -import de.anomic.crawler.Surrogate; import de.anomic.crawler.ZURL; import de.anomic.crawler.CrawlProfile.entry; import de.anomic.crawler.IndexingStack.QueueEntry; @@ -174,7 +175,6 @@ import de.anomic.server.serverSystem; import de.anomic.server.serverThread; import de.anomic.tools.crypt; import de.anomic.tools.CryptoLib; -import de.anomic.xml.SurrogateReader; import de.anomic.yacy.yacyClient; import de.anomic.yacy.yacyCore; import de.anomic.yacy.yacyNewsPool; @@ -1220,7 +1220,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch { // static channel names of feeds diff --git a/source/de/anomic/xml/RSSReader.java b/source/de/anomic/xml/RSSReader.java index 87edaffe1..799ed555a 100644 --- a/source/de/anomic/xml/RSSReader.java +++ b/source/de/anomic/xml/RSSReader.java @@ -38,6 +38,7 @@ 
import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; +import de.anomic.content.RSSMessage; import de.anomic.kelondro.util.ByteBuffer; import de.anomic.kelondro.util.Log; diff --git a/source/de/anomic/yacy/yacyCore.java b/source/de/anomic/yacy/yacyCore.java index 5fcb10d82..b1bcfe5d9 100644 --- a/source/de/anomic/yacy/yacyCore.java +++ b/source/de/anomic/yacy/yacyCore.java @@ -51,13 +51,13 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import de.anomic.content.RSSMessage; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.server.serverCore; import de.anomic.server.serverSemaphore; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; import de.anomic.yacy.dht.PeerSelection; public class yacyCore { diff --git a/source/de/anomic/yacy/yacyPeerActions.java b/source/de/anomic/yacy/yacyPeerActions.java index 40f78a8b3..7dd0bf70e 100644 --- a/source/de/anomic/yacy/yacyPeerActions.java +++ b/source/de/anomic/yacy/yacyPeerActions.java @@ -27,11 +27,11 @@ package de.anomic.yacy; import java.io.IOException; import java.util.HashMap; +import de.anomic.content.RSSMessage; import de.anomic.kelondro.util.DateFormatter; import de.anomic.kelondro.util.Log; import de.anomic.server.serverCodings; import de.anomic.xml.RSSFeed; -import de.anomic.xml.RSSMessage; public class yacyPeerActions {