Fixed isSolrDump function: the bytes read from the PushbackInputStream were not pushed back (unread) when

returning false (for example with a WikiMedia dump).
pull/32/head
luc 9 years ago
parent 135a123a77
commit 27d11f8671

@ -40,12 +40,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@ -56,6 +50,11 @@ import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
public class SurrogateReader extends DefaultHandler implements Runnable {
@ -83,6 +82,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
private final CrawlStacker crawlStacker;
private final CollectionConfiguration configuration;
private final int concurrency;
private String charsetName = "UTF-8";
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
private static SAXParser getParser() throws SAXException {
@ -112,9 +112,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
this.elementName = null;
this.surrogates = new ArrayBlockingQueue<>(queueSize);
Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
Reader reader = new BufferedReader(new InputStreamReader(stream, this.charsetName));
this.inputSource = new InputSource(reader);
this.inputSource.setEncoding("UTF-8");
this.inputSource.setEncoding(this.charsetName);
this.inputStream = stream;
try {
@ -130,7 +130,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
// test the syntax of the stream by reading parts of the beginning
try {
if (isSolrDump()) {
BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, "UTF-8"));
BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, this.charsetName));
String line;
while ((line = br.readLine()) != null) {
if (!line.startsWith("<doc>")) continue;
@ -180,26 +180,33 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
}
}
/**
* @return true when inputStream is likely to contain a rich and full-text Solr xml data dump (see IndexExport_p.html)
*/
private boolean isSolrDump() {
try {
boolean res = false;
byte[] b = new byte[100];
this.inputStream.read(b);
int nbRead = -1;
try {
String s = UTF8.String(b);
nbRead = this.inputStream.read(b);
if(nbRead > 0) {
String s = new String(b, 0, nbRead, this.charsetName);
if ((s.contains("<response>") && s.contains("<result>")) || s.startsWith("<doc>")) {
this.inputStream.unread(b);
return true;
res = true;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
this.inputStream.unread(b);
return false;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
return false;
} finally {
if (nbRead > 0) {
try {
this.inputStream.unread(b, 0, nbRead);
} catch (IOException e2) {
ConcurrentLog.logException(e2);
}
}
}
return false;
return res;
}
@Override

Loading…
Cancel
Save