diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 983fe8e72..5d80a2f7b 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -40,12 +40,6 @@ import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.DigestURL; -import net.yacy.cora.util.ConcurrentLog; -import net.yacy.crawler.CrawlStacker; -import net.yacy.search.schema.CollectionConfiguration; - import org.apache.solr.client.solrj.impl.XMLResponseParser; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -56,6 +50,11 @@ import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; import org.xml.sax.helpers.DefaultHandler; +import net.yacy.cora.document.id.DigestURL; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.crawler.CrawlStacker; +import net.yacy.search.schema.CollectionConfiguration; + public class SurrogateReader extends DefaultHandler implements Runnable { @@ -83,6 +82,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { private final CrawlStacker crawlStacker; private final CollectionConfiguration configuration; private final int concurrency; + private String charsetName = "UTF-8"; private static final ThreadLocal tlSax = new ThreadLocal(); private static SAXParser getParser() throws SAXException { @@ -112,9 +112,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable { this.elementName = null; this.surrogates = new ArrayBlockingQueue<>(queueSize); - Reader reader = new BufferedReader(new InputStreamReader(stream, "UTF-8")); + Reader reader = new BufferedReader(new InputStreamReader(stream, this.charsetName)); this.inputSource = new InputSource(reader); - this.inputSource.setEncoding("UTF-8"); + this.inputSource.setEncoding(this.charsetName); this.inputStream = stream; try { @@ -130,7 +130,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { // test the syntax of the stream by reading parts of the beginning try { if (isSolrDump()) { - BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, "UTF-8")); + BufferedReader br = new BufferedReader(new InputStreamReader(this.inputStream, this.charsetName)); String line; while ((line = br.readLine()) != null) { if (!line.startsWith("")) continue; @@ -180,27 +180,34 @@ public class SurrogateReader extends DefaultHandler implements Runnable { } } - private boolean isSolrDump() { - try { - byte[] b = new byte[100]; - this.inputStream.read(b); - try { - String s = UTF8.String(b); - if ((s.contains("") && s.contains("")) || s.startsWith("")) { - this.inputStream.unread(b); - return true; - } - } catch (IOException e) { - ConcurrentLog.logException(e); - this.inputStream.unread(b); - return false; - } - } catch (IOException e) { - ConcurrentLog.logException(e); - return false; - } - return false; - } + /** + * @return true when inputStream is likely to contain a rich and full-text Solr xml data dump (see IndexExport_p.html) + */ + private boolean isSolrDump() { + boolean res = false; + byte[] b = new byte[100]; + int nbRead = -1; + try { + nbRead = this.inputStream.read(b); + if(nbRead > 0) { + String s = new String(b, 0, nbRead, this.charsetName); + if ((s.contains("") && s.contains("")) || s.startsWith("")) { + res = true; + } + } + } catch (IOException e) { + ConcurrentLog.logException(e); + } finally { + if (nbRead > 0) { + try { + this.inputStream.unread(b, 0, nbRead); + } catch (IOException e2) { + ConcurrentLog.logException(e2); + } + } + } + return res; + } @Override public void startElement(final String uri, final String name, String tag, final Attributes atts) throws SAXException {