- refactoring: added new content package, which will contain connector classes for different types of data sources to import texts into the YaCy index

- refactoring: migrated data objects for the new connector classes
- added a DAO interface class to specify an abstract interface for database retrieval connector methods

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5977 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent f246928c20
commit 63a0255166

@ -24,12 +24,12 @@
import java.io.IOException;
import java.net.MalformedURLException;
import de.anomic.content.RSSMessage;
import de.anomic.http.httpRequestHeader;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.server.servletProperties;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.xml.RSSReader;
import de.anomic.yacy.yacyURL;

@ -2,12 +2,12 @@
import java.util.Date;
import de.anomic.content.RSSMessage;
import de.anomic.http.httpRequestHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
public class feed {

@ -30,6 +30,7 @@ import java.text.ParseException;
import java.util.Date;
import java.util.Iterator;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.CrawlEntry;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.util.DateFormatter;
@ -37,7 +38,6 @@ import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

@ -34,6 +34,7 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.content.RSSMessage;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Base64Order;
import de.anomic.kelondro.order.Bitfield;
@ -54,7 +55,6 @@ import de.anomic.server.serverSwitch;
import de.anomic.tools.crypt;
import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;

@ -32,6 +32,7 @@ import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.referencePrototype.WordReferenceRow;
@ -43,7 +44,6 @@ import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;

@ -29,6 +29,7 @@
import java.io.IOException;
import java.text.ParseException;
import de.anomic.content.RSSMessage;
import de.anomic.data.Blacklist;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
@ -38,7 +39,6 @@ import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNetwork;
import de.anomic.yacy.yacySeed;

@ -32,6 +32,7 @@ import java.util.ArrayList;
import java.util.HashMap;
import java.util.TreeSet;
import de.anomic.content.RSSMessage;
import de.anomic.http.httpRequestHeader;
import de.anomic.kelondro.order.Bitfield;
import de.anomic.kelondro.text.metadataPrototype.URLMetadataRow;
@ -57,7 +58,6 @@ import de.anomic.server.serverSwitch;
import de.anomic.tools.iso639;
import de.anomic.tools.Formatter;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyNewsPool;
import de.anomic.yacy.yacyNewsRecord;
import de.anomic.yacy.yacyURL;

@ -1,4 +1,4 @@
// Surrogate.java
// DCEntry.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 15.04.2009 on http://yacy.net
//
@ -23,7 +23,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.crawler;
package de.anomic.content;
import java.net.MalformedURLException;
import java.text.ParseException;
@ -35,10 +35,11 @@ import de.anomic.kelondro.util.DateFormatter;
import de.anomic.plasma.plasmaParserDocument;
import de.anomic.yacy.yacyURL;
public class Surrogate extends HashMap<String, String> {
public class DCEntry extends HashMap<String, String> {
private static final long serialVersionUID = -2050291583515701559L;
public Surrogate() {
public DCEntry() {
super();
}
@ -73,6 +74,7 @@ public class Surrogate extends HashMap<String, String> {
return new Date();
}
}
public yacyURL url() {
String u = this.get("url");
if (u == null) u = this.get("dc:Identifier");
@ -84,11 +86,13 @@ public class Surrogate extends HashMap<String, String> {
return null;
}
}
/**
 * Looks up the language entry of this map.
 * The key "language" is tried first, then "dc:Language".
 *
 * @return the first non-null value found under these keys, or "en" if both lookups yield null
 */
public String language() {
    String lang = this.get("language");
    if (lang == null) {
        lang = this.get("dc:Language");
    }
    return (lang == null) ? "en" : lang;
}
public String title() {
String t = this.get("title");
if (t == null) t = this.get("dc:Title");
@ -96,6 +100,7 @@ public class Surrogate extends HashMap<String, String> {
if (t == null) return "";
return t;
}
public String body() {
String t = this.get("body");
if (t == null) t = this.get("dc:Description");
@ -103,6 +108,7 @@ public class Surrogate extends HashMap<String, String> {
if (t == null) return "";
return t;
}
public String[] categories() {
String t = this.get("categories");
if (t == null) this.get("dc:Subject");
@ -110,6 +116,7 @@ public class Surrogate extends HashMap<String, String> {
if (t == null) return new String[]{};
return t.split(";");
}
private String stripCDATA(String s) {
if (s == null) return null;
s = s.trim();
@ -117,6 +124,7 @@ public class Surrogate extends HashMap<String, String> {
if (s.endsWith("]]")) s = s.substring(0, s.length() - 2);
return s;
}
public plasmaParserDocument document() {
HashSet<String> languages = new HashSet<String>();
languages.add(language());

@ -25,7 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.xml;
package de.anomic.content;
import java.util.Date;
import java.util.HashMap;

@ -0,0 +1,82 @@
// Dao.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 25.05.2009 on http://yacy.net
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.content.dao;
import java.util.ArrayList;
import java.util.Date;
import de.anomic.content.DCEntry;
/**
 * Database Access Objects are used to get a normalized view on database objects
 * by means of java objects.
 * Retrieval is offered in two flavours: by item position and by entry date.
 */
public interface Dao {

    // item-oriented retrieval

    /**
     * Get the maximum number of items in the database.
     *
     * @return the maximum number of items
     */
    int maxItems();

    /**
     * Retrieve a single item from the database.
     *
     * @param item selects the item to retrieve (presumably its position/number in the database - to be confirmed by implementations)
     * @return the selected item as a DCEntry
     */
    DCEntry get(int item);

    /**
     * Retrieve a list of entries in the database.
     * The object denoted with until is not contained in the list.
     *
     * @param from start of the requested range (presumably inclusive - to be confirmed by implementations)
     * @param until end of the requested range; the entry denoted by this value is excluded
     * @return the list of entries in the requested range
     */
    ArrayList<DCEntry> get(int from, int until);

    // date-oriented retrieval

    /**
     * Return the date of the first entry.
     *
     * @return the date of the first entry
     */
    Date firstEntry();

    /**
     * Return the date of the latest entry.
     *
     * @return the date of the latest entry
     */
    Date latestEntry();

    /**
     * Get a list of entries in the database.
     * The returned list contains all entries up to the most recent.
     *
     * @param from the date where the returned list starts
     * @return the list of entries from the given date up to the most recent entry
     */
    ArrayList<DCEntry> get(Date from);
}

@ -22,7 +22,7 @@
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.xml;
package de.anomic.content.file;
import java.io.BufferedInputStream;
import java.io.File;
@ -40,18 +40,18 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.Surrogate;
import de.anomic.content.DCEntry;
public class SurrogateReader extends DefaultHandler implements Runnable {
public static final Surrogate poison = new Surrogate();
public static final DCEntry poison = new DCEntry();
// class variables
private final StringBuilder buffer;
private boolean parsingValue;
private Surrogate surrogate;
private DCEntry surrogate;
private String elementName;
private BlockingQueue<Surrogate> surrogates;
private BlockingQueue<DCEntry> surrogates;
private SAXParser saxParser;
private InputStream stream;
@ -60,7 +60,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
this.parsingValue = false;
this.surrogate = null;
this.elementName = null;
this.surrogates = new ArrayBlockingQueue<Surrogate>(queueSize);
this.surrogates = new ArrayBlockingQueue<DCEntry>(queueSize);
this.stream = stream;
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
@ -97,7 +97,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
public void startElement(final String uri, final String name, final String tag, final Attributes atts) throws SAXException {
if ("record".equals(tag) || "document".equals(tag)) {
this.surrogate = new Surrogate();
this.surrogate = new DCEntry();
} else if ("element".equals(tag)) {
this.elementName = atts.getValue("name");
} else if ("value".equals(tag)) {
@ -151,7 +151,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
}
public Surrogate take() {
public DCEntry take() {
try {
return this.surrogates.take();
} catch (InterruptedException e) {
@ -168,7 +168,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath());
t.start();
Surrogate s;
DCEntry s;
System.out.println("1");
while ((s = sr.take()) != SurrogateReader.poison) {
System.out.println("Title: " + s.title());

@ -36,6 +36,7 @@ import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import de.anomic.content.RSSMessage;
import de.anomic.http.httpClient;
import de.anomic.kelondro.table.FlexWidthArray;
import de.anomic.kelondro.util.DateFormatter;
@ -47,7 +48,6 @@ import de.anomic.plasma.plasmaSwitchboardConstants;
import de.anomic.plasma.parser.Document;
import de.anomic.server.serverProcessorJob;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacySeed;
import de.anomic.yacy.yacyURL;

@ -34,6 +34,7 @@ import java.util.Hashtable;
import java.util.LinkedList;
import java.util.Map;
import de.anomic.content.RSSMessage;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
@ -46,7 +47,6 @@ import de.anomic.plasma.parser.Parser;
import de.anomic.plasma.parser.ParserException;
import de.anomic.server.serverCharBuffer;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.xml.RSSReader;
import de.anomic.yacy.yacyURL;

@ -111,6 +111,8 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
import de.anomic.content.DCEntry;
import de.anomic.content.file.SurrogateReader;
import de.anomic.crawler.CrawlEntry;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.CrawlQueues;
@ -124,7 +126,6 @@ import de.anomic.crawler.ResourceObserver;
import de.anomic.crawler.ResultImages;
import de.anomic.crawler.ResultURLs;
import de.anomic.crawler.RobotsTxt;
import de.anomic.crawler.Surrogate;
import de.anomic.crawler.ZURL;
import de.anomic.crawler.CrawlProfile.entry;
import de.anomic.crawler.IndexingStack.QueueEntry;
@ -174,7 +175,6 @@ import de.anomic.server.serverSystem;
import de.anomic.server.serverThread;
import de.anomic.tools.crypt;
import de.anomic.tools.CryptoLib;
import de.anomic.xml.SurrogateReader;
import de.anomic.yacy.yacyClient;
import de.anomic.yacy.yacyCore;
import de.anomic.yacy.yacyNewsPool;
@ -1220,7 +1220,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)), 3);
Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath());
readerThread.start();
Surrogate surrogate;
DCEntry surrogate;
QueueEntry queueentry;
while ((surrogate = reader.take()) != SurrogateReader.poison) {
// check if url is in accepted domain

@ -36,6 +36,7 @@ import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import de.anomic.content.RSSMessage;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.IndexingStack;
import de.anomic.data.Blacklist;
@ -60,7 +61,6 @@ import de.anomic.plasma.parser.Word;
import de.anomic.plasma.parser.Condenser;
import de.anomic.tools.iso639;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;

@ -31,6 +31,8 @@ import java.util.Iterator;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentLinkedQueue;
import de.anomic.content.RSSMessage;
public class RSSFeed implements Iterable<RSSMessage> {
// static channel names of feeds

@ -38,6 +38,7 @@ import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import de.anomic.content.RSSMessage;
import de.anomic.kelondro.util.ByteBuffer;
import de.anomic.kelondro.util.Log;

@ -51,13 +51,13 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import de.anomic.content.RSSMessage;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCore;
import de.anomic.server.serverSemaphore;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
import de.anomic.yacy.dht.PeerSelection;
public class yacyCore {

@ -27,11 +27,11 @@ package de.anomic.yacy;
import java.io.IOException;
import java.util.HashMap;
import de.anomic.content.RSSMessage;
import de.anomic.kelondro.util.DateFormatter;
import de.anomic.kelondro.util.Log;
import de.anomic.server.serverCodings;
import de.anomic.xml.RSSFeed;
import de.anomic.xml.RSSMessage;
public class yacyPeerActions {

Loading…
Cancel
Save