Fixed isSolrDump function: the PushbackInputStream was not unread when
returning false (for example with a Wikimedia dump).
9 years ago · 27d11f8671
parent 135a123a77
commit 27d11f8671
1 changed files with 37 additions and 30 deletions
--- a/source/net/yacy/document/content/SurrogateReader.java
+++ b/source/net/yacy/document/content/SurrogateReader.java
@ -40,12 +40,6 @@ import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.CrawlStacker;
 import net.yacy.search.schema.CollectionConfiguration;
 import org.apache.solr.client.solrj.impl.XMLResponseParser;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrInputDocument;
@ -56,6 +50,11 @@ import org.xml.sax.SAXException;
 import org.xml.sax.SAXParseException;
 import org.xml.sax.helpers.DefaultHandler;
 import net.yacy.cora.document.id.DigestURL;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.crawler.CrawlStacker;
 import net.yacy.search.schema.CollectionConfiguration;
 public class SurrogateReader extends DefaultHandler implements Runnable {
@ -83,6 +82,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
    private final CrawlStacker crawlStacker;
    private final CollectionConfiguration configuration;
    private final int concurrency;
    private String charsetName = "UTF-8";
    private static final ThreadLocal<SAXParser> tlSax = new ThreadLocal<SAXParser>();
    private static SAXParser getParser() throws SAXException {
@ -112,9 +112,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
        this.elementName = null;
        this.surrogates = new ArrayBlockingQueue<>(queueSize);
-        Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+        Reader reader = new BufferedReader(new InputStreamReader(stream, this.charsetName));
        this.inputSource = new InputSource(reader);
-        this.inputSource.setEncoding("UTF-8");
+        this.inputSource.setEncoding(this.charsetName);
        this.inputStream = stream;
        try {
@ -130,7 +130,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
        // test the syntax of the stream by reading parts of the beginning
        try {
            if (isSolrDump()) {
-                BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, "UTF-8"));
+                BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, this.charsetName));
                String line;
                while ((line = br.readLine()) != null) {
                    if (!line.startsWith("<doc>")) continue;
@ -180,26 +180,33 @@ public class SurrogateReader extends DefaultHandler implements Runnable {
        }
    }
    /**
     * @return true when inputStream is likely to contain a rich and full-text Solr xml data dump (see IndexExport_p.html)
     */
 	private boolean isSolrDump() {
-        try {
+		boolean res = false;
 		byte[] b = new byte[100];
-            this.inputStream.read(b);
+		int nbRead = -1;
 		try {
-                String s = UTF8.String(b);
+			nbRead = this.inputStream.read(b);
 			if(nbRead > 0) {
 				String s = new String(b, 0, nbRead, this.charsetName);
 				if ((s.contains("<response>") && s.contains("<result>")) || s.startsWith("<doc>")) {
-                    this.inputStream.unread(b);
+					res = true;
                    return true;
 				}
            } catch (IOException e) {
                ConcurrentLog.logException(e);
                this.inputStream.unread(b);
                return false;
 			}
 		} catch (IOException e) {
 			ConcurrentLog.logException(e);
-            return false;
+		} finally {
 			if (nbRead > 0) {
 				try {
 					this.inputStream.unread(b, 0, nbRead);
 				} catch (IOException e2) {
 					ConcurrentLog.logException(e2);
 				}
 			}
 		}
-        return false;
+		return res;
 	}
    @Override