added concurrency to enhance indexing speed during json surrogate import

pull/408/head
Michael Peter Christen 4 years ago
parent f8cbaeef93
commit 8f876a8c72

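The diff below replaces the single-threaded JSON surrogate import with a producer/consumer design: one reader thread parses the flatjson input while one indexer thread per CPU core takes SolrInputDocuments from a bounded queue, enriches them with vocabularies and synonyms, and writes them to the index; poison documents shut the workers down. As orientation before reading the diff, here is a minimal, self-contained sketch of that pattern. It is an illustration under stated assumptions, not YaCy code: the Doc class, the POISON sentinel and the println stand-in for enrichment/indexing are hypothetical placeholders for SolrInputDocument, SurrogateReader.POISON_DOCUMENT and index.putDocument(...) as they appear in the diff.

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;

public class SurrogateIndexSketch {

    // hypothetical stand-in for SolrInputDocument
    static final class Doc {
        final String id;
        Doc(final String id) { this.id = id; }
    }

    // sentinel telling the workers to stop, analogous to SurrogateReader.POISON_DOCUMENT
    static final Doc POISON = new Doc(null);

    public static void main(final String[] args) throws InterruptedException {
        final int concurrency = Runtime.getRuntime().availableProcessors();
        // bounded queue: the parser blocks when the indexers fall behind
        final BlockingQueue<Doc> queue = new ArrayBlockingQueue<>(concurrency * 2);

        // start one indexer thread per core
        final List<Thread> indexers = new ArrayList<>();
        for (int t = 0; t < concurrency; t++) {
            final Thread worker = new Thread(() -> {
                try {
                    Doc doc;
                    while ((doc = queue.take()) != POISON) {
                        // placeholder for tokenization, facet/synonym enrichment and index.putDocument(doc)
                        System.out.println(Thread.currentThread().getName() + " indexed " + doc.id);
                    }
                } catch (final InterruptedException e) {
                    Thread.currentThread().interrupt();
                }
            }, "indexer-" + t);
            worker.start();
            indexers.add(worker);
        }

        // producer: in the commit this is the line-by-line flatjson parser
        for (int i = 0; i < 100; i++) queue.put(new Doc("doc-" + i));

        // one poison pill per worker so every thread terminates, then wait for them
        for (int t = 0; t < concurrency; t++) queue.put(POISON);
        for (final Thread worker : indexers) worker.join(10000);
    }
}

The bounded queue sized at concurrency * 2 matches the diff and applies back-pressure: the reader blocks on put() when the indexers fall behind, so memory use stays flat regardless of the size of the surrogate file.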
@@ -78,6 +78,8 @@ import java.util.Set;
import java.util.SortedSet;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
@@ -97,7 +99,6 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.core.SolrCore;
import org.apache.solr.search.SyntaxError;
import org.eclipse.jetty.http.DateParser;
import org.json.JSONArray;
import org.json.JSONException;
import org.json.JSONObject;
@@ -129,6 +130,7 @@ import net.yacy.cora.federate.solr.connector.ShardSelection;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
@@ -2168,7 +2170,7 @@ public final class Switchboard extends serverSwitch {
baos.write(buffer, 0, size);
}
baos.flush();
processSurrogate(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
processSurrogateXML(new ByteArrayInputStream(baos.toByteArray()), entry.getName());
baos.close();
if (shallTerminate()) break;
}
@@ -2196,11 +2198,96 @@ public final class Switchboard extends serverSwitch {
}
return moved;
} else if (s.endsWith(".jsonlist") || s.endsWith(".flatjson")) {
return processSurrogateJson(infile, outfile);
}
InputStream is = null;
try {
is = new BufferedInputStream(new FileInputStream(infile));
if (s.endsWith(".gz")) is = new GZIPInputStream(is, 65535);
processSurrogateXML(is, infile.getName());
} catch (final IOException e ) {
ConcurrentLog.logException(e);
} finally {
if (!shallTerminate()) {
moved = infile.renameTo(outfile);
if ( moved ) {
// check if this file is already compressed, if not, compress now
if ( !outfile.getName().endsWith(".gz") ) {
final String gzname = outfile.getName() + ".gz";
final File gzfile = new File(outfile.getParentFile(), gzname);
try (
/* Resources automatically closed by this try-with-resources statement */
final FileOutputStream fileOutStream = new FileOutputStream(gzfile);
final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}});
final FileInputStream fileInStream = new FileInputStream(outfile);
final BufferedInputStream bis = new BufferedInputStream(fileInStream);
) {
FileUtils.copy(bis, os);
if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile);
}
} catch (final FileNotFoundException e ) {
ConcurrentLog.logException(e);
} catch (final IOException e ) {
/* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */
ConcurrentLog.logException(e);
}
}
log.info("processed surrogate " + infile);
}
}
if (is != null) try {is.close();} catch (IOException e) {
log.warn("Could not close input stream on file " + infile);
}
}
return moved;
}
private boolean processSurrogateJson(File infile, File outfile) {
// parse a file that can be generated with yacy_grid_parser
// see https://github.com/yacy/yacy_grid_parser/blob/master/README.md
log.info("processing json surrogate " + infile);
long starttime = System.currentTimeMillis();
boolean moved = false;
FileInputStream fis = null;
BufferedReader br = null;
// start indexer threads which mostly care about tokenization and facet + synonym enrichment
final int concurrency = Runtime.getRuntime().availableProcessors();
final BlockingQueue<SolrInputDocument> sidQueue = new ArrayBlockingQueue<>(concurrency * 2);
final Thread[] indexer = new Thread[concurrency];
for (int t = 0; t < indexer.length; t++) {
indexer[t] = new Thread("Switchboard.processSurrogateJson-" + t) {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
SolrInputDocument sid;
try {
while ((sid = sidQueue.take()) != SurrogateReader.POISON_DOCUMENT ) {
// enrich the surrogate
final String id = (String) sid.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) sid.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
DigestURL rootURL;
if (text != null && text.length() > 0 && id != null ) try {
if (SynonymLibrary.size() > 0 || !LibraryProvider.autotagging.isEmpty()) {
rootURL = new DigestURL((String) sid.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(rootURL, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(sid, tokenizer.synonyms(), facets);
}
Switchboard.this.index.putDocument(sid);
} catch (MalformedURLException e) {}
}
} catch (InterruptedException e) {
}
}
};
indexer[t].start();
}
try {
fis = new FileInputStream(infile);
InputStream is = new BufferedInputStream(fis);
@@ -2283,23 +2370,25 @@ public final class Switchboard extends serverSwitch {
}
}
// enrich the surrogate
final String id = (String) surrogate.getFieldValue(CollectionSchema.id.getSolrFieldName());
final String text = (String) surrogate.getFieldValue(CollectionSchema.text_t.getSolrFieldName());
final DigestURL rootURL = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName()), ASCII.getBytes(id));
if (text != null && text.length() > 0 && id != null ) {
// run the tokenizer on the text to get vocabularies and synonyms
final Tokenizer tokenizer = new Tokenizer(rootURL, text, LibraryProvider.dymLib, true, scraper);
final Map<String, Set<String>> facets = Document.computeGenericFacets(tokenizer.tags());
// overwrite the given vocabularies and synonyms with new computed ones
Switchboard.this.index.fulltext().getDefaultConfiguration().enrich(surrogate, tokenizer.synonyms(), facets);
try {
sidQueue.put(surrogate);
} catch (InterruptedException e) {
e.printStackTrace();
}
Switchboard.this.index.putDocument(surrogate);
}
br.close();
br = null;
fis = null;
// finish indexing threads by giving them poison
for (int t = 0; t < indexer.length; t++) {
try {sidQueue.put(SurrogateReader.POISON_DOCUMENT);} catch (InterruptedException e) {}
}
// wait until indexer threads are finished
for (int t = 0; t < indexer.length; t++) {
try {indexer[t].join(10000);} catch (InterruptedException e) {}
}
moved = infile.renameTo(outfile);
} catch (IOException | JSONException ex) {
log.warn("IO Error processing flatjson file " + infile);
@@ -2321,52 +2410,13 @@ public final class Switchboard extends serverSwitch {
}
}
}
return moved;
}
InputStream is = null;
try {
is = new BufferedInputStream(new FileInputStream(infile));
if (s.endsWith(".gz")) is = new GZIPInputStream(is, 65535);
processSurrogate(is, infile.getName());
} catch (final IOException e ) {
ConcurrentLog.logException(e);
} finally {
if (!shallTerminate()) {
moved = infile.renameTo(outfile);
if ( moved ) {
// check if this file is already compressed, if not, compress now
if ( !outfile.getName().endsWith(".gz") ) {
final String gzname = outfile.getName() + ".gz";
final File gzfile = new File(outfile.getParentFile(), gzname);
try (
/* Resources automatically closed by this try-with-resources statement */
final FileOutputStream fileOutStream = new FileOutputStream(gzfile);
final OutputStream os = new BufferedOutputStream(new GZIPOutputStream(fileOutStream, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}});
final FileInputStream fileInStream = new FileInputStream(outfile);
final BufferedInputStream bis = new BufferedInputStream(fileInStream);
) {
FileUtils.copy(bis, os);
if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile);
}
} catch (final FileNotFoundException e ) {
ConcurrentLog.logException(e);
} catch (final IOException e ) {
/* Catch but log any IO exception that can occur on copy, automatic closing or streams creation */
ConcurrentLog.logException(e);
}
}
log.info("processed surrogate " + infile);
}
}
if (is != null) try {is.close();} catch (IOException e) {
log.warn("Could not close input stream on file " + infile);
}
}
log.info("finished processing json surrogate: " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds");
return moved;
}
public void processSurrogate(final InputStream is, final String name) throws IOException {
private void processSurrogateXML(final InputStream is, final String name) throws IOException {
final int concurrency = Runtime.getRuntime().availableProcessors();
// start reader thread
@@ -2379,7 +2429,7 @@ public final class Switchboard extends serverSwitch {
assert this.crawlStacker != null;
Thread[] indexer = new Thread[concurrency];
for (int t = 0; t < concurrency; t++) {
indexer[t] = new Thread("Switchboard.processSurrogate-" + t) {
indexer[t] = new Thread("Switchboard.processSurrogateXML-" + t) {
@Override
public void run() {
VocabularyScraper scraper = new VocabularyScraper();
