From 79fdf14b0abe85e74ae35cae44e0406b38f26559 Mon Sep 17 00:00:00 2001 From: luccioman Date: Tue, 2 May 2017 09:32:04 +0200 Subject: [PATCH] Fixed regression introduced by commit 9ad4d16 On MediaWiki dump imports, the SurrogateReader was trying to unread more bytes than its pushback buffer could hold (up to 1024 bytes were read for format detection, but the buffer capacity was only 200 bytes), then failing with the following exception: "java.io.IOException: Push back buffer is full". --- source/net/yacy/document/content/SurrogateReader.java | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/document/content/SurrogateReader.java b/source/net/yacy/document/content/SurrogateReader.java index 962e23f80..ff0524d2f 100644 --- a/source/net/yacy/document/content/SurrogateReader.java +++ b/source/net/yacy/document/content/SurrogateReader.java @@ -71,6 +71,9 @@ public class SurrogateReader extends DefaultHandler implements Runnable { public final static String SURROGATES_MAIN_ELEMENT_CLOSE = ""; public final static SolrInputDocument POISON_DOCUMENT = new SolrInputDocument(); + + /** Maximum number of bytes that can be unread on the underlying input stream */ + private static final int PUSHBACK_SIZE = 1024; // class variables private final StringBuilder buffer; @@ -100,7 +103,7 @@ public class SurrogateReader extends DefaultHandler implements Runnable { } public SurrogateReader(final InputStream stream, int queueSize, CrawlStacker crawlStacker, CollectionConfiguration configuration, int concurrency) throws IOException { - this(new PushbackInputStream(stream, 200), queueSize, crawlStacker, configuration, concurrency); + this(new PushbackInputStream(stream, PUSHBACK_SIZE), queueSize, crawlStacker, configuration, concurrency); } public SurrogateReader(final PushbackInputStream stream, int queueSize, CrawlStacker crawlStacker, CollectionConfiguration configuration, int concurrency) throws IOException { @@ -181,14 +184,14 @@ public class SurrogateReader extends DefaultHandler implements Runnable { /** * Check for format string in responseHeader 
"yacy.index.export.solr.xml" - * (introduced v1.92/9188 2017-04-30) or guess format by existing "" - * and "" or "" tag in the first 1024 characters. + * (introduced v1.92/9188 2017-04-30) or guess format by existing "" + * and "" or "" tag in the first {@value #PUSHBACK_SIZE} characters. * * @return true when inputStream is likely to contain a rich and full-text Solr xml data dump (see IndexExport_p.html) */ private boolean isSolrDump() { boolean res = false; - byte[] b = new byte[1024]; + byte[] b = new byte[PUSHBACK_SIZE]; int nbRead = -1; try { nbRead = this.inputStream.read(b);