Fixed a problem in the WARC importer: do not fail if single WARC entries are faulty
pull/402/head
Michael Peter Christen 4 years ago
parent 3078b74e1d
commit d3526c52af

@@ -24,7 +24,6 @@ package net.yacy.document.importer;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
@@ -121,6 +120,8 @@ public class WarcImporter extends Thread implements Importer {
InputStream istream = wrec.getPayloadContent();
hl = http.getHeader(HeaderFramework.TRANSFER_ENCODING);
content = null;
try {
if (hl != null && hl.value.contains("chunked")) {
// because chunked stream.read doesn't read source fully, make sure all chunks are read
istream = new ChunkedInputStream(istream);
@@ -134,10 +135,8 @@ public class WarcImporter extends Thread implements Importer {
content = new byte[(int) http.getPayloadLength()];
istream.read(content, 0, content.length);
}
istream.close();
RequestHeader requestHeader = new RequestHeader();
ResponseHeader responseHeader = new ResponseHeader(http.statusCode);
for (HeaderLine hx : http.getHeaderList()) { // include all original response headers for parser
responseHeader.put(hx.name, hx.value);
@@ -163,7 +162,13 @@ public class WarcImporter extends Thread implements Importer {
);
String error = Switchboard.getSwitchboard().toIndexer(response);
if (error != null) ConcurrentLog.info("WarcImporter", "error: " + error);
if (error != null) ConcurrentLog.info("WarcImporter", "error parsing: " + error);
} catch (IOException e) {
ConcurrentLog.info("WarcImporter", "error reading: " + e.getMessage());
} finally {
try {istream.close();} catch (IOException e) {}
}
recordCnt++;
}
}

Loading…
Cancel
Save