added concurrency to enhance indexing speed during json surrogate import

pull/408/head
Michael Peter Christen 4 years ago
parent f8cbaeef93
commit 8f876a8c72

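The diff below replaces the single-threaded JSON surrogate import with a producer/consumer design: one reader thread parses the flatjson input while one indexer thread per CPU core takes SolrInputDocuments from a bounded queue, enriches them with vocabularies and synonyms, and writes them to the index; poison documents shut the workers down. As orientation before reading the diff, here is a minimal, self-contained sketch of that pattern. It is an illustration under stated assumptions, not YaCy code: the Doc class, the POISON sentinel and the println stand-in for enrichment/indexing are hypothetical placeholders for SolrInputDocument, SurrogateReader.POISON_DOCUMENT and index.putDocument(...) as they appear in the diff.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class SurrogateIndexSketch {

    // hypothetical stand-in for SolrInputDocument
    static final class Doc {
        final String id;
        Doc(final String id) { this.id = id; }
    }

    // sentinel telling the workers to stop, analogous to SurrogateReader.POISON_DOCUMENT
    static final Doc POISON = new Doc(null);

    public static void main(final String[] args) throws InterruptedException {
        final int concurrency = Runtime.getRuntime().availableProcessors();
        // bounded queue: the parser blocks when the indexers fall behind
        final BlockingQueue<Doc> queue = new ArrayBlockingQueue<>(concurrency * 2);

        // start one indexer thread per core
        final List<Thread> indexers = new ArrayList<>();
        for (int t = 0; t < concurrency; t++) {
            final Thread worker = new Thread(() -> {
                try {
                    Doc doc;
                    while ((doc = queue.take()) != POISON) {
                        // placeholder for tokenization, facet/synonym enrichment and index.putDocument(doc)
                        System.out.println(Thread.currentThread().getName() + " indexed " + doc.id);
                    }
                } catch (final InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }, "indexer-" + t);
            worker.start();
            indexers.add(worker);
        }

        // producer: in the commit this is the line-by-line flatjson parser
        for (int i = 0; i < 100; i++) queue.put(new Doc("doc-" + i));

        // one poison pill per worker so every thread terminates, then wait for them
        for (int t = 0; t < concurrency; t++) queue.put(POISON);
        for (final Thread worker : indexers) worker.join(10000);
    }
}

The bounded queue sized at concurrency * 2 matches the diff and applies back-pressure: the reader blocks on put() when the indexers fall behind, so memory use stays flat regardless of the size of the surrogate file.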
@@ -78,6 +78,8 @@ import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
@@ -97,7 +99,6 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SyntaxError;
import org.eclipse.jetty.http.DateParser;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@@ -129,6 +130,7 @@ import net.yacy.cora.federate.solr.connector.ShardSelection;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
@@ -2168,7 +2170,7 @@ public final class Switchboard extends serverSwitch {
baos.write(buffer, 0, size);
}
baos.flush();
processSurrogate(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
processSurrogateXML(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
baos.close();
if (shallTerminate()) break;
}
@@ -2196,11 +2198,96 @@ public final class Switchboard extends serverSwitch {
}
return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".flatjson")) {
return processSurrogateJson(infile, outfile);
}
InputStream is = null;
try {
is = new BufferedInputStream(new FileInputStream(infile));
if (s.endsWith(".gz")) is = new GZIPInputStream(is, 65535);
processSurrogateXML(is, infile.getName());
} catch (final IOException e ) {
ConcurrentLog.logException(e);
} finally {
if (!shallTerminate()) {
moved = infile.renameTo(outfile);
if ( moved ) {
// check if this file is already compressed, if not, compress now
if ( !outfile.getName().endsWith(".gz") ) {
final String gzname = outfile.getName() + ".gz";
final File gzfile = new File(outfile.getParentFile(), gzname);
try (
/* Resources automatically closed by this try-with-resources statement */
final FileOutputStream fileOutStream = new FileOutputStream(gzfile);
final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}});
final FileInputStream fileInStream = new FileInputStream(outfile);
final BufferedInputStream bis = new BufferedInputStream(fileInStream);
) {
FileUtils.copy(bis, os);
if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile);
}
} catch (final FileNotFoundException e ) {
ConcurrentLog.logException(e);
} catch (final IOException e ) {
/* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */
ConcurrentLog.logException(e);
}
}
log.info("processed surrogate " + infile);
}
}
if (is != null) try {is.close();} catch (IOException e) {
log.warn("Could not close input stream on file " + infile);
}
}
return moved;
}
private boolean processSurrogateJson(File infile, File outfile) {
// parse a file that can be generated with yacy_grid_parser
// see https://github.com/yacy/yacy_grid_parser/blob/master/README.md
log.info("processing json surrogate " + infile);
long starttime = System.currentTimeMillis();
boolean moved = false;
FileInputStream fis = null;
BufferedReader br = null;
// start indexer threads which mostly care about tokenization and facet + synonym enrichment
final int concurrency = Runtime.getRuntime().availableProcessors();
final BlockingQueue<SolrInputDocument> sidQueue = new ArrayBlockingQueue<>(concurrency * 2);
final Thread[] indexer = new Thread[concurrency];
for (int t = 0; t < indexer.length; t++) {
indexer[t] = new Thread("Switchboard.processSurrogateJson-" + t) {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
SolrInputDocument sid;
try {
while ((sid = sidQueue.take()) != SurrogateReader.POISON_DOCUMENT ) {
// enrich the surrogate
final String id = (String) sid.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) sid.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
DigestURL rootURL;
if (text != null && text.length() > 0 && id != null ) try {
if (SynonymLibrary.size() > 0 || !LibraryProvider.autotagging.isEmpty()) {
rootURL = new DigestURL((String) sid.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(rootURL, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(sid, tokenizer.synonyms(), facets);
}
Switchboard.this.index.putDocument(sid);
} catch (MalformedURLException e) {}
}
} catch (InterruptedException e) {
}
}
};
indexer[t].start();
}
try {
fis = new FileInputStream(infile);
InputStream is = new BufferedInputStream(fis);
@@ -2283,23 +2370,25 @@ public final class Switchboard extends serverSwitch {
}
}
// enrich the surrogate
final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
final DigestURL rootURL = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
if (text != null && text.length() > 0 && id != null ) {
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(rootURL, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
try {
sidQueue.put(surrogate);
} catch (InterruptedException e) {
e.printStackTrace();
}
Switchboard.this.index.putDocument(surrogate);
}
br.close();
br = null;
fis = null;
// finish indexing threads by giving them poison
for (int t = 0; t < indexer.length; t++) {
try {sidQueue.put(SurrogateReader.POISON_DOCUMENT);} catch (InterruptedException e) {}
}
// wait until indexer threads are finished
for (int t = 0; t < indexer.length; t++) {
try {indexer[t].join(10000);} catch (InterruptedException e) {}
}
moved = infile.renameTo(outfile);
} catch (IOException | JSONException ex) {
log.warn("IO Error processing flatjson file " + infile);
@@ -2321,52 +2410,13 @@ public final class Switchboard extends serverSwitch {
}
}
}
return moved;
}
InputStream is = null;
try {
is = new BufferedInputStream(new FileInputStream(infile));
if (s.endsWith(".gz")) is = new GZIPInputStream(is, 65535);
processSurrogate(is, infile.getName());
} catch (final IOException e ) {
ConcurrentLog.logException(e);
} finally {
if (!shallTerminate()) {
moved = infile.renameTo(outfile);
if ( moved ) {
// check if this file is already compressed, if not, compress now
if ( !outfile.getName().endsWith(".gz") ) {
final String gzname = outfile.getName() + ".gz";
final File gzfile = new File(outfile.getParentFile(), gzname);
try (
/* Resources automatically closed by this try-with-resources statement */
final FileOutputStream fileOutStream = new FileOutputStream(gzfile);
final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}});
final FileInputStream fileInStream = new FileInputStream(outfile);
final BufferedInputStream bis = new BufferedInputStream(fileInStream);
) {
FileUtils.copy(bis, os);
if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile);
}
} catch (final FileNotFoundException e ) {
ConcurrentLog.logException(e);
} catch (final IOException e ) {
/* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */
ConcurrentLog.logException(e);
}
}
log.info("processed surrogate " + infile);
}
}
if (is != null) try {is.close();} catch (IOException e) {
log.warn("Could not close input stream on file " + infile);
}
}
log.info("finished processing json surrogate: " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds");
return moved;
}
public void processSurrogate(final InputStream is, final String name) throws IOException {
private void processSurrogateXML(final InputStream is, final String name) throws IOException {
final int concurrency = Runtime.getRuntime().availableProcessors();
// start reader thread
@@ -2379,7 +2429,7 @@ public final class Switchboard extends serverSwitch {
assert this.crawlStacker != null;
Thread[] indexer = new Thread[concurrency];
for (int t = 0; t < concurrency; t++) {
indexer[t] = new Thread("Switchboard.processSurrogate-" + t) {
indexer[t] = new Thread("Switchboard.processSurrogateXML-" + t) {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
