Fixed some problems with the surrogate reader. This is now ready for testing.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5817 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 3a1364ed5c
commit 7dfe7e7cc6

@ -99,7 +99,7 @@ public class Surrogate extends HashMap<String, String> {
}
public String body() {
String t = this.get("body");
if (t == null) this.get("dc:description");
if (t == null) t = this.get("dc:description");
t = stripCDATA(t);
if (t == null) return "";
return t;

@ -1193,13 +1193,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (outfile.exists()) return false;
boolean moved = false;
try {
SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)));
SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)), 3);
Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath());
readerThread.start();
Surrogate surrogate;
QueueEntry queueentry;
while (reader.hasNext()) {
surrogate = reader.next();
while ((surrogate = reader.take()) != SurrogateReader.poison) {
plasmaParserDocument document = surrogate.document();
queueentry = this.webIndex.queuePreStack.newEntry(surrogate.url(), null, null, false, null, 0, this.webIndex.defaultSurrogateProfile.handle(), null);
/*

@ -29,7 +29,6 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
@ -43,8 +42,10 @@ import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.Surrogate;
public class SurrogateReader extends DefaultHandler implements Runnable, Iterator<Surrogate> {
public class SurrogateReader extends DefaultHandler implements Runnable {
public static final Surrogate poison = new Surrogate();
// class variables
private final StringBuilder buffer;
private boolean parsingValue;
@ -53,14 +54,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
private BlockingQueue<Surrogate> surrogates;
private SAXParser saxParser;
private InputStream stream;
private boolean isrunning;
public SurrogateReader(final InputStream stream) throws IOException {
public SurrogateReader(final InputStream stream, int queueSize) throws IOException {
this.buffer = new StringBuilder();
this.parsingValue = false;
this.surrogate = null;
this.elementName = null;
this.surrogates = new ArrayBlockingQueue<Surrogate>(3);
this.surrogates = new ArrayBlockingQueue<Surrogate>(queueSize);
this.stream = stream;
final SAXParserFactory factory = SAXParserFactory.newInstance();
try {
@ -72,11 +72,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
e.printStackTrace();
throw new IOException(e.getMessage());
}
this.isrunning = false;
}
public void run() {
this.isrunning = true;
try {
this.saxParser.parse(this.stream, this);
} catch (SAXException e) {
@ -84,7 +82,16 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
} catch (IOException e) {
e.printStackTrace();
} finally {
this.isrunning = false;
try {
this.surrogates.put(poison);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
try {
this.stream.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
@ -144,11 +151,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
}
}
public boolean hasNext() {
return this.isrunning || this.surrogates.size() > 0;
}
public Surrogate next() {
public Surrogate take() {
try {
return this.surrogates.take();
} catch (InterruptedException e) {
@ -165,14 +168,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
File f = new File(args[0]);
SurrogateReader sr;
try {
sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f)));
sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f)), 1);
Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath());
t.start();
Surrogate s;
System.out.println("1");
while (sr.hasNext()) {
s = sr.next();
while ((s = sr.take()) != SurrogateReader.poison) {
System.out.println("Title: " + s.title());
System.out.println("Date: " + s.date());
System.out.println("URL: " + s.url());

Loading…
Cancel
Save