From 309adb814e100098c593ccda59fc428c3de0bb37 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 25 Oct 2022 00:51:53 +0200 Subject: [PATCH] fixed jsonlist import from searchlab.eu using a direct URL --- .../env/templates/submenuIndexImport.template | 1 + .../document/importer/JsonListImporter.java | 5 +++-- .../net/yacy/htroot/IndexImportJsonList_p.java | 17 ++++++++++++++--- source/net/yacy/search/Switchboard.java | 2 +- 4 files changed, 19 insertions(+), 6 deletions(-) diff --git a/htroot/env/templates/submenuIndexImport.template b/htroot/env/templates/submenuIndexImport.template index d85b313c2..39a1029a3 100644 --- a/htroot/env/templates/submenuIndexImport.template +++ b/htroot/env/templates/submenuIndexImport.template @@ -14,6 +14,7 @@
  • RSS Feed Importer
  • OAI-PMH Importer
  • Warc Importer
  • +
  • JsonList Importer
  • diff --git a/source/net/yacy/document/importer/JsonListImporter.java b/source/net/yacy/document/importer/JsonListImporter.java index ac23db760..06082d701 100644 --- a/source/net/yacy/document/importer/JsonListImporter.java +++ b/source/net/yacy/document/importer/JsonListImporter.java @@ -73,7 +73,7 @@ public class JsonListImporter extends Thread implements Importer { private boolean abort; private final boolean deletewhendone; - public JsonListImporter(final File inputFile, final boolean deletewhendone) throws IOException { + public JsonListImporter(final File inputFile, final boolean gz, final boolean deletewhendone) throws IOException { super("JsonListImporter - from file " + inputFile.getName()); this.lineCount = 0; this.consumed = 0; @@ -83,7 +83,7 @@ public class JsonListImporter extends Thread implements Importer { this.abort = false; this.deletewhendone = deletewhendone; this.source = new FileInputStream(inputFile); - if (this.name.endsWith(".gz")) this.source = new GZIPInputStream(this.source); + if (this.name.endsWith(".gz") || gz) this.source = new GZIPInputStream(this.source); } @Override @@ -97,6 +97,7 @@ public class JsonListImporter extends Thread implements Importer { public void processSurrogateJson() throws IOException { this.startTime = System.currentTimeMillis(); + job = this; // start indexer threads which mostly care about tokenization and facet + synonym enrichment final int concurrency = Runtime.getRuntime().availableProcessors(); diff --git a/source/net/yacy/htroot/IndexImportJsonList_p.java b/source/net/yacy/htroot/IndexImportJsonList_p.java index 785ba0248..fce17f270 100644 --- a/source/net/yacy/htroot/IndexImportJsonList_p.java +++ b/source/net/yacy/htroot/IndexImportJsonList_p.java @@ -58,7 +58,7 @@ public class IndexImportJsonList_p { final File sourcefile = new File(filename); if (sourcefile.exists()) { try { - final JsonListImporter wi = new JsonListImporter(sourcefile, false); + final JsonListImporter wi = new 
JsonListImporter(sourcefile, false, false); wi.start(); prop.put("import_thread", "started"); } catch (final IOException ex) { @@ -72,14 +72,25 @@ public class IndexImportJsonList_p { } } else { final String urlstr = post.get("url"); +/* + final HTTPClient client = new HTTPClient(ClientIdentification.yacyInternetCrawlerAgent); + final byte[] b = client.GETbytes(urlstr, null, null, true); + final File tempfile = File.createTempFile("jsonlistimporter", ""); + final FileOutputStream fos = new FileOutputStream(tempfile); + fos.write(b); + fos.close(); + client.close(); + */ if (urlstr != null && urlstr.length() > 0) { try { final URL url = new URL(urlstr); - final File tempfile = File.createTempFile("jsonlistimporter", ""); + final String tempfilename = "jsonlistimporter"; + final boolean gz = urlstr.endsWith(".gz"); + final File tempfile = File.createTempFile(tempfilename, ""); final FileOutputStream fos = new FileOutputStream(tempfile); fos.getChannel().transferFrom(Channels.newChannel(url.openStream()), 0, Long.MAX_VALUE); fos.close(); - final JsonListImporter wi = new JsonListImporter(tempfile, true); + final JsonListImporter wi = new JsonListImporter(tempfile, gz, true); wi.start(); prop.put("import_thread", "started"); } catch (final IOException ex) { diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 75a2411e0..a7a971ce5 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2206,7 +2206,7 @@ public final class Switchboard extends serverSwitch { // see https://github.com/yacy/yacy_grid_parser/blob/master/README.md this.log.info("processing json surrogate " + infile); try { - final JsonListImporter importer = new JsonListImporter(infile, false); + final JsonListImporter importer = new JsonListImporter(infile, false, false); importer.run(); } catch (final IOException e) { this.log.warn(e);