fixed warc importer

The importer tried to import gzipped files as plain WARC.
It now checks the file extension and automatically decompresses
files ending in ".gz" on the fly.
pull/402/head
Michael Peter Christen 4 years ago
parent 39f87f7f28
commit d359d521a1

@ -17,12 +17,10 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import java.io.File; import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.WarcImporter; import net.yacy.document.importer.WarcImporter;
import net.yacy.server.serverObjects; import net.yacy.server.serverObjects;
@ -59,7 +57,7 @@ public class IndexImportWarc_p {
WarcImporter wi = new WarcImporter(sourcefile); WarcImporter wi = new WarcImporter(sourcefile);
wi.start(); wi.start();
prop.put("import_thread", "started"); prop.put("import_thread", "started");
} catch (FileNotFoundException ex) { } catch (IOException ex) {
prop.put("import_thread", "Error: file not found [" + filename + "]"); prop.put("import_thread", "Error: file not found [" + filename + "]");
} }
prop.put("import", 1); prop.put("import", 1);
@ -73,7 +71,7 @@ public class IndexImportWarc_p {
if (urlstr != null && urlstr.length() > 0) { if (urlstr != null && urlstr.length() > 0) {
try { try {
MultiProtocolURL url = new MultiProtocolURL(urlstr); MultiProtocolURL url = new MultiProtocolURL(urlstr);
WarcImporter wi = new WarcImporter(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), urlstr); WarcImporter wi = new WarcImporter(url);
wi.start(); wi.start();
prop.put("import_thread", "started"); prop.put("import_thread", "started");
} catch (MalformedURLException ex) { } catch (MalformedURLException ex) {

@ -27,8 +27,12 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException; import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.protocol.ResponseHeader;
@ -60,7 +64,7 @@ public class WarcImporter extends Thread implements Importer {
static public WarcImporter job; // static object to assure only one importer is running (if started from a servlet, this object is used to store the thread) static public WarcImporter job; // static object to assure only one importer is running (if started from a servlet, this object is used to store the thread)
private final InputStream source; // current input warc archive private InputStream source; // current input warc archive
private String name; // file name of input source private String name; // file name of input source
private int recordCnt; // number of responses indexed (for statistic) private int recordCnt; // number of responses indexed (for statistic)
@ -69,30 +73,21 @@ public class WarcImporter extends Thread implements Importer {
private long consumed; // bytes consumed from input source (for statistic) private long consumed; // bytes consumed from input source (for statistic)
private boolean abort = false; // flag to signal stop of import private boolean abort = false; // flag to signal stop of import
public WarcImporter(InputStream f) { public WarcImporter(MultiProtocolURL url) throws IOException {
super("WarcImporter - from InputStream"); super("WarcImporter - from InputStream");
source = f; this.recordCnt = 0;
recordCnt = 0; this.sourceSize = -1;
sourceSize = -1; this.name = url.toNormalform(true);
} this.source = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent);
if (this.name.endsWith(".gz")) this.source = new GZIPInputStream(this.source);
/**
* Init the WarcImporter with input stream with a informational filename or
* url als info for calls to the importer methode source() which returns
* the urlinfo. Otherwise this methode is equivalent to WarchImporter(inputstream)
* @param f the input stream to read the warc archive from
* @param urlinfo a info like the url or the filename
*/
public WarcImporter (InputStream f, String urlinfo) {
this(f);
name = urlinfo;
} }
public WarcImporter(File f) throws FileNotFoundException{ public WarcImporter(File f) throws IOException {
super("WarcImporter - from file " + f.getName()); super("WarcImporter - from file " + f.getName());
name = f.getName(); this.name = f.getName();
sourceSize = f.length(); this.sourceSize = f.length();
source = new FileInputStream(f); this.source = new FileInputStream(f);
if (this.name.endsWith(".gz")) this.source = new GZIPInputStream(this.source);
} }
/** /**
@ -167,7 +162,8 @@ public class WarcImporter extends Thread implements Importer {
content content
); );
Switchboard.getSwitchboard().toIndexer(response); String error = Switchboard.getSwitchboard().toIndexer(response);
if (error != null) ConcurrentLog.info("WarcImporter", "error: " + error);
recordCnt++; recordCnt++;
} }
} }

Loading…
Cancel
Save