fixed warc importer

The importer tried to import gzipped files as plain WARC.
It now checks the file extension and automatically decompresses
gzip input on the fly.
pull/402/head
Michael Peter Christen 4 years ago
parent 39f87f7f28
commit d359d521a1

@ -17,12 +17,10 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.WarcImporter;
import net.yacy.server.serverObjects;
@ -59,7 +57,7 @@ public class IndexImportWarc_p {
WarcImporter wi = new WarcImporter(sourcefile);
wi.start();
prop.put("import_thread", "started");
} catch (FileNotFoundException ex) {
} catch (IOException ex) {
prop.put("import_thread", "Error: file not found [" + filename + "]");
}
prop.put("import", 1);
@ -73,7 +71,7 @@ public class IndexImportWarc_p {
if (urlstr != null && urlstr.length() > 0) {
try {
MultiProtocolURL url = new MultiProtocolURL(urlstr);
WarcImporter wi = new WarcImporter(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), urlstr);
WarcImporter wi = new WarcImporter(url);
wi.start();
prop.put("import_thread", "started");
} catch (MalformedURLException ex) {

@ -27,8 +27,12 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
@ -60,7 +64,7 @@ public class WarcImporter extends Thread implements Importer {
static public WarcImporter job; // static object to assure only one importer is running (if started from a servlet, this object is used to store the thread)
private final InputStream source; // current input warc archive
private InputStream source; // current input warc archive
private String name; // file name of input source
private int recordCnt; // number of responses indexed (for statistic)
@ -69,30 +73,21 @@ public class WarcImporter extends Thread implements Importer {
private long consumed; // bytes consumed from input source (for statistic)
private boolean abort = false; // flag to signal stop of import
public WarcImporter(InputStream f) {
public WarcImporter(MultiProtocolURL url) throws IOException {
super("WarcImporter - from InputStream");
source = f;
recordCnt = 0;
sourceSize = -1;
}
/**
* Init the WarcImporter with an input stream and an informational filename or
* URL, used as info for calls to the importer method source(), which returns
* the urlinfo. Otherwise this method is equivalent to WarcImporter(InputStream).
* @param f the input stream to read the warc archive from
* @param urlinfo a info like the url or the filename
*/
public WarcImporter (InputStream f, String urlinfo) {
this(f);
name = urlinfo;
this.recordCnt = 0;
this.sourceSize = -1;
this.name = url.toNormalform(true);
this.source = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
if (this.name.endsWith(".gz")) this.source = new GZIPInputStream(this.source);
}
public WarcImporter(File f) throws FileNotFoundException{
public WarcImporter(File f) throws IOException {
super("WarcImporter - from file " + f.getName());
name = f.getName();
sourceSize = f.length();
source = new FileInputStream(f);
this.name = f.getName();
this.sourceSize = f.length();
this.source = new FileInputStream(f);
if (this.name.endsWith(".gz")) this.source = new GZIPInputStream(this.source);
}
/**
@ -167,7 +162,8 @@ public class WarcImporter extends Thread implements Importer {
content
);
Switchboard.getSwitchboard().toIndexer(response);
String error = Switchboard.getSwitchboard().toIndexer(response);
if (error != null) ConcurrentLog.info("WarcImporter", "error: " + error);
recordCnt++;
}
}

Loading…
Cancel
Save