Fixed isSolrDump function: the bytes read from the PushbackInputStream were not pushed back (unread) when

returning false (for example with a Wikimedia dump).
pull/32/head
luc 9 years ago
parent 135a123a77
commit 27d11f8671

@ -40,12 +40,6 @@ import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory; import javax.xml.parsers.SAXParserFactory;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
import org.apache.solr.client.solrj.impl.XMLResponseParser; import org.apache.solr.client.solrj.impl.XMLResponseParser;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
@ -56,6 +50,11 @@ import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException; import org.xml.sax.SAXParseException;
import org.xml.sax.helpers.DefaultHandler; import org.xml.sax.helpers.DefaultHandler;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.CrawlStacker;
import net.yacy.search.schema.CollectionConfiguration;
public class SurrogateReader extends DefaultHandler implements Runnable { public class SurrogateReader extends DefaultHandler implements Runnable {
@ -83,6 +82,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
private final CrawlStacker crawlStacker; private final CrawlStacker crawlStacker;
private final CollectionConfiguration configuration; private final CollectionConfiguration configuration;
private final int concurrency; private final int concurrency;
private String charsetName = "UTF-8";
private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>(); private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
private static SAXParser getParser() throws SAXException { private static SAXParser getParser() throws SAXException {
@ -112,9 +112,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
this.elementName = null; this.elementName = null;
this.surrogates = new ArrayBlockingQueue<>(queueSize); this.surrogates = new ArrayBlockingQueue<>(queueSize);
Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); Reader reader = new BufferedReader(new InputStreamReader(stream, this.charsetName));
this.inputSource = new InputSource(reader); this.inputSource = new InputSource(reader);
this.inputSource.setEncoding("UTF-8"); this.inputSource.setEncoding(this.charsetName);
this.inputStream = stream; this.inputStream = stream;
try { try {
@ -130,7 +130,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
// test the syntax of the stream by reading parts of the beginning // test the syntax of the stream by reading parts of the beginning
try { try {
if (isSolrDump()) { if (isSolrDump()) {
BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, "UTF-8")); BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, this.charsetName));
String line; String line;
while ((line = br.readLine()) != null) { while ((line = br.readLine()) != null) {
if (!line.startsWith("<doc>")) continue; if (!line.startsWith("<doc>")) continue;
@ -180,26 +180,33 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
} }
} }
/**
* @return true when inputStream is likely to contain a rich and full-text Solr xml data dump (see IndexExport_p.html)
*/
private boolean isSolrDump() { private boolean isSolrDump() {
try { boolean res = false;
byte[] b = new byte[100]; byte[] b = new byte[100];
this.inputStream.read(b); int nbRead = -1;
try { try {
String s = UTF8.String(b); nbRead = this.inputStream.read(b);
if(nbRead > 0) {
String s = new String(b, 0, nbRead, this.charsetName);
if ((s.contains("<response>") && s.contains("<result>")) || s.startsWith("<doc>")) { if ((s.contains("<response>") && s.contains("<result>")) || s.startsWith("<doc>")) {
this.inputStream.unread(b); res = true;
return true;
} }
} catch (IOException e) {
ConcurrentLog.logException(e);
this.inputStream.unread(b);
return false;
} }
} catch (IOException e) { } catch (IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
return false; } finally {
if (nbRead > 0) {
try {
this.inputStream.unread(b, 0, nbRead);
} catch (IOException e2) {
ConcurrentLog.logException(e2);
}
}
} }
return false; return res;
} }
@Override @Override

Loading…
Cancel
Save