From bec34d35468bc952e71028a1630fef5026bf448a Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 16 Apr 2017 04:25:29 +0200 Subject: [PATCH] Add url input field as source for WarcImporter allowing to import warc from url without prior download. --- htroot/IndexImportWarc_p.html | 17 ++++--- htroot/IndexImportWarc_p.java | 51 ++++++++++++++----- .../yacy/document/importer/WarcImporter.java | 12 +++++ 3 files changed, 59 insertions(+), 21 deletions(-) diff --git a/htroot/IndexImportWarc_p.html b/htroot/IndexImportWarc_p.html index d6003bc9e..0d490eb9e 100644 --- a/htroot/IndexImportWarc_p.html +++ b/htroot/IndexImportWarc_p.html @@ -22,13 +22,16 @@ You can download warc archives for example here Internet Archive.

-
- - -
- -
-
+
+
+
+
+
or
+
+
+
+
+
diff --git a/htroot/IndexImportWarc_p.java b/htroot/IndexImportWarc_p.java index f503fe98b..6a3127952 100644 --- a/htroot/IndexImportWarc_p.java +++ b/htroot/IndexImportWarc_p.java @@ -18,6 +18,10 @@ import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.MalformedURLException; +import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.importer.WarcImporter; @@ -45,23 +49,42 @@ public class IndexImportWarc_p { } else { prop.put("import", 0); if (post != null) { - if (post.containsKey("file")) { - String file = post.get("file"); - final File sourcefile = new File(file); - if (sourcefile.exists()) { - try { - WarcImporter wi = new WarcImporter(sourcefile); - wi.start(); - prop.put("import_thread", "started"); - } catch (FileNotFoundException ex) { - prop.put("import_thread", "Error: file not found [" + file + "]"); + if (post.containsKey("file") || post.containsKey("url")) { + String filename = post.get("file"); + if (filename != null && filename.length() > 0) { + final File sourcefile = new File(filename); + if (sourcefile.exists()) { + try { + WarcImporter wi = new WarcImporter(sourcefile); + wi.start(); + prop.put("import_thread", "started"); + } catch (FileNotFoundException ex) { + prop.put("import_thread", "Error: file not found [" + filename + "]"); + } + prop.put("import", 1); + prop.put("import_warcfile", filename); + } else { + prop.put("import_warcfile", ""); + prop.put("import_thread", "Error: file not found [" + filename + "]"); } - prop.put("import_warcfile", file); } else { - prop.put("import_warcfile", ""); - prop.put("import_thread", "Error: file not found [" + file + "]"); + String urlstr = post.get("url"); + if (urlstr != null && urlstr.length() > 0) { + try { + MultiProtocolURL url = new MultiProtocolURL(urlstr); + WarcImporter wi = new 
WarcImporter(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), urlstr); + wi.start(); + prop.put("import_thread", "started"); + } catch (MalformedURLException ex) { + prop.put("import_thread", ex.getMessage()); + } catch (IOException ex) { + prop.put("import_thread", ex.getMessage()); + } + prop.put("import", 1); + prop.put("import_warcfile", urlstr); + } } - prop.put("import", 1); + prop.put("import_count", 0); prop.put("import_speed", 0); prop.put("import_runningHours", 0); diff --git a/source/net/yacy/document/importer/WarcImporter.java b/source/net/yacy/document/importer/WarcImporter.java index e921765ce..5ad4582b3 100644 --- a/source/net/yacy/document/importer/WarcImporter.java +++ b/source/net/yacy/document/importer/WarcImporter.java @@ -73,6 +73,18 @@ public class WarcImporter extends Thread implements Importer { sourceSize = -1; } + /** + * Init the WarcImporter with an input stream and an informational filename or + * url as info for calls to the importer method source() which returns + * the urlinfo. Otherwise this method is equivalent to WarcImporter(inputstream) + * @param f the input stream to read the warc archive from + * @param urlinfo an info string such as the url or the filename + */ + public WarcImporter (InputStream f, String urlinfo) { + this(f); + name = urlinfo; + } + public WarcImporter(File f) throws FileNotFoundException{ name = f.getName(); sourceSize = f.length();