Add url input field as source for WarcImporter

This allows importing a WARC archive directly from a URL, without downloading it first.
pull/122/head
reger 8 years ago
parent d3df8a46c4
commit bec34d3546

@ -22,13 +22,16 @@
You can download warc archives for example here You can download warc archives for example here
<a href="https://archive.org/search.php?query=subject%3A%22warcarchives%22&and[]=subject%3A%22warcarchives%22" target="_blank">Internet Archive</a>. <a href="https://archive.org/search.php?query=subject%3A%22warcarchives%22&and[]=subject%3A%22warcarchives%22" target="_blank">Internet Archive</a>.
</p> </p>
<div class="input-group"> <dl>
<span style="display: inline-block"> <dt class="TableCellDark"><label for="file">File:</label></dt>
<input name="file" type="file" value="" size="75" /></span> <dd><input name="file" id="file" type="file" value="" size="75" /></dd>
<div class="btn-group"> <dt></dt>
<input name="submit" class="btn btn-primary" type="submit" value="Import Warc File" /> <dd>or</dd>
</div> <dt class="TableCellDark"><label for="url">Url:</label></dt>
</div> <dd><input name="url" id="url" value="" size="75"/></dd>
<dt></dt>
<dd><input name="submit" class="btn btn-primary" type="submit" value="Import Warc File" /></dd>
</dl>
</fieldset> </fieldset>
</form> </form>

@ -18,6 +18,10 @@
import java.io.File; import java.io.File;
import java.io.FileNotFoundException; import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.MalformedURLException;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.importer.WarcImporter; import net.yacy.document.importer.WarcImporter;
@ -45,23 +49,42 @@ public class IndexImportWarc_p {
} else { } else {
prop.put("import", 0); prop.put("import", 0);
if (post != null) { if (post != null) {
if (post.containsKey("file")) { if (post.containsKey("file") || post.containsKey("url")) {
String file = post.get("file"); String filename = post.get("file");
final File sourcefile = new File(file); if (filename != null && filename.length() > 0) {
if (sourcefile.exists()) { final File sourcefile = new File(filename);
try { if (sourcefile.exists()) {
WarcImporter wi = new WarcImporter(sourcefile); try {
wi.start(); WarcImporter wi = new WarcImporter(sourcefile);
prop.put("import_thread", "started"); wi.start();
} catch (FileNotFoundException ex) { prop.put("import_thread", "started");
prop.put("import_thread", "Error: file not found [" + file + "]"); } catch (FileNotFoundException ex) {
prop.put("import_thread", "Error: file not found [" + filename + "]");
}
prop.put("import", 1);
prop.put("import_warcfile", filename);
} else {
prop.put("import_warcfile", "");
prop.put("import_thread", "Error: file not found [" + filename + "]");
} }
prop.put("import_warcfile", file);
} else { } else {
prop.put("import_warcfile", ""); String urlstr = post.get("url");
prop.put("import_thread", "Error: file not found [" + file + "]"); if (urlstr != null && urlstr.length() > 0) {
try {
MultiProtocolURL url = new MultiProtocolURL(urlstr);
WarcImporter wi = new WarcImporter(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), urlstr);
wi.start();
prop.put("import_thread", "started");
} catch (MalformedURLException ex) {
prop.put("import_thread", ex.getMessage());
} catch (IOException ex) {
prop.put("import_thread", ex.getMessage());
}
prop.put("import", 1);
prop.put("import_warcfile", urlstr);
}
} }
prop.put("import", 1);
prop.put("import_count", 0); prop.put("import_count", 0);
prop.put("import_speed", 0); prop.put("import_speed", 0);
prop.put("import_runningHours", 0); prop.put("import_runningHours", 0);

@ -73,6 +73,18 @@ public class WarcImporter extends Thread implements Importer {
sourceSize = -1; sourceSize = -1;
} }
/**
 * Creates a WarcImporter that reads from an input stream and carries an
 * informational label (for example the originating URL or a file name).
 * Construction is delegated to {@code WarcImporter(InputStream)}; the only
 * difference is that the importer's name is afterwards set to the label.
 *
 * @param f the input stream to read the WARC archive from
 * @param urlinfo descriptive label such as the URL or file name; stored as
 *        this importer's name (intended as the value reported by
 *        {@code source()})
 */
public WarcImporter(InputStream f, String urlinfo) {
    this(f);
    // replace whatever name the stream-only constructor assigned
    this.name = urlinfo;
}
public WarcImporter(File f) throws FileNotFoundException{ public WarcImporter(File f) throws FileNotFoundException{
name = f.getName(); name = f.getName();
sourceSize = f.length(); sourceSize = f.length();

Loading…
Cancel
Save