From bec34d35468bc952e71028a1630fef5026bf448a Mon Sep 17 00:00:00 2001
From: reger
Date: Sun, 16 Apr 2017 04:25:29 +0200
Subject: [PATCH] Add url input field as source for WarcImporter allowing to
import warc from url without prior download.
---
htroot/IndexImportWarc_p.html | 17 ++++---
htroot/IndexImportWarc_p.java | 51 ++++++++++++++-----
.../yacy/document/importer/WarcImporter.java | 12 +++++
3 files changed, 59 insertions(+), 21 deletions(-)
diff --git a/htroot/IndexImportWarc_p.html b/htroot/IndexImportWarc_p.html
index d6003bc9e..0d490eb9e 100644
--- a/htroot/IndexImportWarc_p.html
+++ b/htroot/IndexImportWarc_p.html
@@ -22,13 +22,16 @@
You can download warc archives for example here
Internet Archive.
-
+
+
+
+
+ - or
+
+
+
+
+
diff --git a/htroot/IndexImportWarc_p.java b/htroot/IndexImportWarc_p.java
index f503fe98b..6a3127952 100644
--- a/htroot/IndexImportWarc_p.java
+++ b/htroot/IndexImportWarc_p.java
@@ -18,6 +18,10 @@
import java.io.File;
import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import net.yacy.cora.document.id.MultiProtocolURL;
+import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.WarcImporter;
@@ -45,23 +49,42 @@ public class IndexImportWarc_p {
} else {
prop.put("import", 0);
if (post != null) {
- if (post.containsKey("file")) {
- String file = post.get("file");
- final File sourcefile = new File(file);
- if (sourcefile.exists()) {
- try {
- WarcImporter wi = new WarcImporter(sourcefile);
- wi.start();
- prop.put("import_thread", "started");
- } catch (FileNotFoundException ex) {
- prop.put("import_thread", "Error: file not found [" + file + "]");
+ if (post.containsKey("file") || post.containsKey("url")) {
+ String filename = post.get("file");
+ if (filename != null && filename.length() > 0) {
+ final File sourcefile = new File(filename);
+ if (sourcefile.exists()) {
+ try {
+ WarcImporter wi = new WarcImporter(sourcefile);
+ wi.start();
+ prop.put("import_thread", "started");
+ } catch (FileNotFoundException ex) {
+ prop.put("import_thread", "Error: file not found [" + filename + "]");
+ }
+ prop.put("import", 1);
+ prop.put("import_warcfile", filename);
+ } else {
+ prop.put("import_warcfile", "");
+ prop.put("import_thread", "Error: file not found [" + filename + "]");
}
- prop.put("import_warcfile", file);
} else {
- prop.put("import_warcfile", "");
- prop.put("import_thread", "Error: file not found [" + file + "]");
+ String urlstr = post.get("url");
+ if (urlstr != null && urlstr.length() > 0) {
+ try {
+ MultiProtocolURL url = new MultiProtocolURL(urlstr);
+ WarcImporter wi = new WarcImporter(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), urlstr);
+ wi.start();
+ prop.put("import_thread", "started");
+ } catch (MalformedURLException ex) {
+ prop.put("import_thread", ex.getMessage());
+ } catch (IOException ex) {
+ prop.put("import_thread", ex.getMessage());
+ }
+ prop.put("import", 1);
+ prop.put("import_warcfile", urlstr);
+ }
}
- prop.put("import", 1);
+
prop.put("import_count", 0);
prop.put("import_speed", 0);
prop.put("import_runningHours", 0);
diff --git a/source/net/yacy/document/importer/WarcImporter.java b/source/net/yacy/document/importer/WarcImporter.java
index e921765ce..5ad4582b3 100644
--- a/source/net/yacy/document/importer/WarcImporter.java
+++ b/source/net/yacy/document/importer/WarcImporter.java
@@ -73,6 +73,18 @@ public class WarcImporter extends Thread implements Importer {
sourceSize = -1;
}
+ /**
+ * Init the WarcImporter with an input stream plus an informational filename or
+ * url, kept as the info returned by calls to the importer method source().
+ * Otherwise this constructor is equivalent to WarcImporter(InputStream).
+ * @param f the input stream to read the warc archive from
+ * @param urlinfo an info string like the url or the filename
+ */
+ public WarcImporter (InputStream f, String urlinfo) {
+ this(f);
+ name = urlinfo;
+ }
+
public WarcImporter(File f) throws FileNotFoundException{
name = f.getName();
sourceSize = f.length();