Fixed some problems with the surrogate reader. This is now ready for testing.

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@5817 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 16 years ago
parent 3a1364ed5c
commit 7dfe7e7cc6

@ -99,7 +99,7 @@ public class Surrogate extends HashMap<String, String> {
} }
public String body() { public String body() {
String t = this.get("body"); String t = this.get("body");
if (t == null) this.get("dc:description"); if (t == null) t = this.get("dc:description");
t = stripCDATA(t); t = stripCDATA(t);
if (t == null) return ""; if (t == null) return "";
return t; return t;

@ -1193,13 +1193,12 @@ public final class plasmaSwitchboard extends serverAbstractSwitch<IndexingStack.
if (outfile.exists()) return false; if (outfile.exists()) return false;
boolean moved = false; boolean moved = false;
try { try {
SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile))); SurrogateReader reader = new SurrogateReader(new BufferedInputStream(new FileInputStream(surrogateFile)), 3);
Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath()); Thread readerThread = new Thread(reader, "Surrogate-Reader " + surrogateFile.getAbsolutePath());
readerThread.start(); readerThread.start();
Surrogate surrogate; Surrogate surrogate;
QueueEntry queueentry; QueueEntry queueentry;
while (reader.hasNext()) { while ((surrogate = reader.take()) != SurrogateReader.poison) {
surrogate = reader.next();
plasmaParserDocument document = surrogate.document(); plasmaParserDocument document = surrogate.document();
queueentry = this.webIndex.queuePreStack.newEntry(surrogate.url(), null, null, false, null, 0, this.webIndex.defaultSurrogateProfile.handle(), null); queueentry = this.webIndex.queuePreStack.newEntry(surrogate.url(), null, null, false, null, 0, this.webIndex.defaultSurrogateProfile.handle(), null);
/* /*

@ -29,7 +29,6 @@ import java.io.File;
import java.io.FileInputStream; import java.io.FileInputStream;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.Iterator;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
@ -43,7 +42,9 @@ import org.xml.sax.helpers.DefaultHandler;
import de.anomic.crawler.Surrogate; import de.anomic.crawler.Surrogate;
public class SurrogateReader extends DefaultHandler implements Runnable, Iterator<Surrogate> { public class SurrogateReader extends DefaultHandler implements Runnable {
public static final Surrogate poison = new Surrogate();
// class variables // class variables
private final StringBuilder buffer; private final StringBuilder buffer;
@ -53,14 +54,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
private BlockingQueue<Surrogate> surrogates; private BlockingQueue<Surrogate> surrogates;
private SAXParser saxParser; private SAXParser saxParser;
private InputStream stream; private InputStream stream;
private boolean isrunning;
public SurrogateReader(final InputStream stream) throws IOException { public SurrogateReader(final InputStream stream, int queueSize) throws IOException {
this.buffer = new StringBuilder(); this.buffer = new StringBuilder();
this.parsingValue = false; this.parsingValue = false;
this.surrogate = null; this.surrogate = null;
this.elementName = null; this.elementName = null;
this.surrogates = new ArrayBlockingQueue<Surrogate>(3); this.surrogates = new ArrayBlockingQueue<Surrogate>(queueSize);
this.stream = stream; this.stream = stream;
final SAXParserFactory factory = SAXParserFactory.newInstance(); final SAXParserFactory factory = SAXParserFactory.newInstance();
try { try {
@ -72,11 +72,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
e.printStackTrace(); e.printStackTrace();
throw new IOException(e.getMessage()); throw new IOException(e.getMessage());
} }
this.isrunning = false;
} }
public void run() { public void run() {
this.isrunning = true;
try { try {
this.saxParser.parse(this.stream, this); this.saxParser.parse(this.stream, this);
} catch (SAXException e) { } catch (SAXException e) {
@ -84,7 +82,16 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
} catch (IOException e) { } catch (IOException e) {
e.printStackTrace(); e.printStackTrace();
} finally { } finally {
this.isrunning = false; try {
this.surrogates.put(poison);
} catch (InterruptedException e1) {
e1.printStackTrace();
}
try {
this.stream.close();
} catch (IOException e) {
e.printStackTrace();
}
} }
} }
@ -144,11 +151,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
} }
} }
public boolean hasNext() { public Surrogate take() {
return this.isrunning || this.surrogates.size() > 0;
}
public Surrogate next() {
try { try {
return this.surrogates.take(); return this.surrogates.take();
} catch (InterruptedException e) { } catch (InterruptedException e) {
@ -165,14 +168,13 @@ public class SurrogateReader extends DefaultHandler implements Runnable, Iterato
File f = new File(args[0]); File f = new File(args[0]);
SurrogateReader sr; SurrogateReader sr;
try { try {
sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f))); sr = new SurrogateReader(new BufferedInputStream(new FileInputStream(f)), 1);
Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath()); Thread t = new Thread(sr, "Surrogate-Reader " + f.getAbsolutePath());
t.start(); t.start();
Surrogate s; Surrogate s;
System.out.println("1"); System.out.println("1");
while (sr.hasNext()) { while ((s = sr.take()) != SurrogateReader.poison) {
s = sr.next();
System.out.println("Title: " + s.title()); System.out.println("Title: " + s.title());
System.out.println("Date: " + s.date()); System.out.println("Date: " + s.date());
System.out.println("URL: " + s.url()); System.out.println("URL: " + s.url());

Loading…
Cancel
Save