fixed several problems with postprocessing:

- unique-postprocessing was destroying results from other
postprocessings; removed the cross-updates because they were not necessary
- unique-postprocessing did not restrict its lookup to documents with the same protocol
- inefficient concurrent update cache was redesigned completely
- increased the limits for concurrent blocking queues to prevent early
time-outs
pull/1/head
Michael Peter Christen 11 years ago
parent 640b684bb6
commit 8ad41a882c

@ -72,7 +72,7 @@ public class Vocabulary_p {
Segment segment = sb.index; Segment segment = sb.index;
String t; String t;
if (!discoverNot) { if (!discoverNot) {
Iterator<DigestURL> ui = segment.urlSelector(discoveruri, 600000L, 100000); Iterator<DigestURL> ui = segment.urlSelector(discoveruri, Long.MAX_VALUE, 100000);
while (ui.hasNext()) { while (ui.hasNext()) {
DigestURL u = ui.next(); DigestURL u = ui.next();
String u0 = u.toNormalform(true); String u0 = u.toNormalform(true);

@ -145,6 +145,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
boolean changed = false; boolean changed = false;
// FIND OUT IF THIS IS A DOUBLE DOCUMENT // FIND OUT IF THIS IS A DOUBLE DOCUMENT
String hostid = url.hosthash(); String hostid = url.hosthash();
String protocol = url.getProtocol();
for (CollectionSchema[] checkfields: new CollectionSchema[][]{ for (CollectionSchema[] checkfields: new CollectionSchema[][]{
{CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i}, {CollectionSchema.exact_signature_l, CollectionSchema.exact_signature_unique_b, CollectionSchema.exact_signature_copycount_i},
{CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) { {CollectionSchema.fuzzy_signature_l, CollectionSchema.fuzzy_signature_unique_b, CollectionSchema.fuzzy_signature_copycount_i}}) {
@ -155,7 +156,7 @@ public class SchemaConfiguration extends Configuration implements Serializable {
// lookup the document with the same signature // lookup the document with the same signature
long signature = ((Long) sid.getField(checkfield.getSolrFieldName()).getValue()).longValue(); long signature = ((Long) sid.getField(checkfield.getSolrFieldName()).getValue()).longValue();
try { try {
long count = segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\""); long count = segment.fulltext().getDefaultConnector().getCountByQuery(CollectionSchema.url_protocol_s.getSolrFieldName() + ":\"" + protocol + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + checkfield.getSolrFieldName() + ":\"" + Long.toString(signature) + "\"");
if (count > 1) { if (count > 1) {
String urlhash = ASCII.String(url.hash()); String urlhash = ASCII.String(url.hash());
if (uniqueURLs.contains(urlhash)) { if (uniqueURLs.contains(urlhash)) {
@ -172,7 +173,6 @@ public class SchemaConfiguration extends Configuration implements Serializable {
} catch (final IOException e) {} } catch (final IOException e) {}
} }
} }
// CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on) // CHECK IF TITLE AND DESCRIPTION IS UNIQUE (this is by default not switched on)
if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s)) { if (segment.fulltext().getDefaultConfiguration().contains(CollectionSchema.host_id_s)) {
uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{ uniquecheck: for (CollectionSchema[] checkfields: new CollectionSchema[][]{
@ -191,15 +191,10 @@ public class SchemaConfiguration extends Configuration implements Serializable {
continue uniquecheck; continue uniquecheck;
} }
try { try {
final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", null, 0, 1); final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.url_protocol_s.getSolrFieldName() + ":\"" + protocol + "\" AND " + CollectionSchema.host_id_s.getSolrFieldName() + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", null, 0, 1);
if (docs != null && !docs.isEmpty()) { if (docs != null && !docs.isEmpty()) {
SolrDocument doc = docs.get(0);
// switch unique attribute in new document // switch unique attribute in new document
sid.setField(uniquefield.getSolrFieldName(), false); sid.setField(uniquefield.getSolrFieldName(), false);
// switch attribute in existing document
SolrInputDocument sidContext = segment.fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
sidContext.setField(uniquefield.getSolrFieldName(), false);
segment.putDocument(sidContext);
changed = true; changed = true;
} else { } else {
sid.setField(uniquefield.getSolrFieldName(), true); sid.setField(uniquefield.getSolrFieldName(), true);

@ -167,7 +167,10 @@ public abstract class AbstractSolrConnector implements SolrConnector {
try {queue.put(d);} catch (final InterruptedException e) {break;} try {queue.put(d);} catch (final InterruptedException e) {break;}
count++; count++;
} }
if (sdl.size() < pagesize) break; if (sdl.size() < pagesize) {
//System.out.println("sdl.size() = " + sdl.size() + ", pagesize = " + pagesize);
break;
}
o += sdl.size(); o += sdl.size();
} catch (final SolrException e) { } catch (final SolrException e) {
break; break;
@ -175,7 +178,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
break; break;
} }
} }
for (int i = 0; i < concurrency; i++) { for (int i = 0; i < Math.max(1, concurrency); i++) {
try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {} try {queue.put(AbstractSolrConnector.POISON_DOCUMENT);} catch (final InterruptedException e1) {}
} }
} }

@ -21,11 +21,10 @@
package net.yacy.cora.federate.solr.connector; package net.yacy.cora.federate.solr.connector;
import java.io.IOException; import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection; import java.util.Collection;
import java.util.Iterator; import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.Map; import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import net.yacy.cora.sorting.ReversibleScoreMap; import net.yacy.cora.sorting.ReversibleScoreMap;
@ -54,75 +53,40 @@ import org.apache.solr.common.params.ModifiableSolrParams;
*/ */
public class ConcurrentUpdateSolrConnector implements SolrConnector { public class ConcurrentUpdateSolrConnector implements SolrConnector {
SolrConnector connector; private final static long AUTOCOMMIT = 3000; // milliseconds
private final static Object POISON_PROCESS = new Object();
private class ProcessHandler extends Thread { private class CommitHandler extends Thread {
@Override @Override
public void run() { public void run() {
try { try {
Object process; while (ConcurrentUpdateSolrConnector.this.commitProcessRunning) {
Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(); commitDocBuffer();
while ((process = ConcurrentUpdateSolrConnector.this.processQueue.take()) != POISON_PROCESS) { try {Thread.sleep(AUTOCOMMIT);} catch (final InterruptedException e) {
ConcurrentLog.logException(e);
if (process instanceof String) {
// delete document
if (docs.size() > 0) addSynchronized(docs);
String id = (String) process;
try {
ConcurrentUpdateSolrConnector.this.connector.deleteById(id);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
}
if (process instanceof SolrInputDocument) {
SolrInputDocument doc = (SolrInputDocument) process;
docs.add(doc);
}
if (docs.size() > 0 &&
(ConcurrentUpdateSolrConnector.this.processQueue.size() == 0 ||
docs.size() >= ConcurrentUpdateSolrConnector.this.processQueue.size() + ConcurrentUpdateSolrConnector.this.processQueue.remainingCapacity())) {
addSynchronized(docs);
} }
} }
} catch (final InterruptedException e) { } finally {
ConcurrentLog.logException(e); commitDocBuffer();
} }
} }
/**
 * Writes the collected documents to the wrapped connector and empties the
 * given collection afterwards.
 *
 * If the first attempt runs out of memory, the caches are cleared and the
 * write is retried once. I/O failures are logged, not propagated; the
 * collection is cleared in every case, so documents of a failed write are
 * not retried later.
 *
 * @param docs the documents to write; must not be empty, is cleared on return
 */
private void addSynchronized(final Collection<SolrInputDocument> docs) {
    assert docs.size() > 0;
    try {
        ConcurrentUpdateSolrConnector.this.connector.add(docs);
    } catch (final OutOfMemoryError e) {
        // clear and try again...
        clearCaches();
        try {
            ConcurrentUpdateSolrConnector.this.connector.add(docs);
        } catch (final IOException ee) {
            // log the failure of the retry itself, not the earlier OutOfMemoryError
            ConcurrentLog.logException(ee);
        }
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    }
    docs.clear();
}
} }
private SolrConnector connector;
private ARC<String, Metadata> metadataCache; private ARC<String, Metadata> metadataCache;
private ARH<String> missCache; private final ARH<String> missCache;
private BlockingQueue<Object> processQueue; private final LinkedHashMap<String, SolrInputDocument> docBuffer;
private ProcessHandler processHandler; private CommitHandler processHandler;
private final int updateCapacity;
private boolean commitProcessRunning;
public ConcurrentUpdateSolrConnector(final SolrConnector connector, final int updateCapacity, final int idCacheCapacity, final int concurrency) { public ConcurrentUpdateSolrConnector(final SolrConnector connector, final int updateCapacity, final int idCacheCapacity, final int concurrency) {
this.connector = connector; this.connector = connector;
this.metadataCache = new ConcurrentARC<String, Metadata>(idCacheCapacity, concurrency); this.updateCapacity = updateCapacity;
this.missCache = new ConcurrentARH<String>(idCacheCapacity, concurrency); this.metadataCache = new ConcurrentARC<>(idCacheCapacity, concurrency);
this.processQueue = new ArrayBlockingQueue<Object>(updateCapacity); this.missCache = new ConcurrentARH<>(idCacheCapacity, concurrency);
this.docBuffer = new LinkedHashMap<>();
this.processHandler = null; this.processHandler = null;
this.commitProcessRunning = true;
ensureAliveProcessHandler(); ensureAliveProcessHandler();
} }
@ -136,9 +100,34 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
return o instanceof ConcurrentUpdateSolrConnector && this.connector.equals(((ConcurrentUpdateSolrConnector) o).connector); return o instanceof ConcurrentUpdateSolrConnector && this.connector.equals(((ConcurrentUpdateSolrConnector) o).connector);
} }
/**
 * Flushes all buffered documents to the wrapped connector, copies their
 * metadata into the metadata cache and empties the buffer.
 *
 * The whole operation runs while holding the lock on the document buffer.
 * If the write runs out of memory, the caches are cleared and the write is
 * retried once. I/O failures are logged, not propagated; the buffer is
 * cleared in every case, so documents of a failed write are not retried.
 */
private void commitDocBuffer() {
    synchronized (this.docBuffer) {
        if (!this.docBuffer.isEmpty()) {
            try {
                this.connector.add(this.docBuffer.values());
            } catch (final OutOfMemoryError e) {
                // clear and try again...
                clearCaches();
                try {
                    this.connector.add(this.docBuffer.values());
                } catch (final IOException ee) {
                    // log the failure of the retry itself, not the earlier OutOfMemoryError
                    ConcurrentLog.logException(ee);
                }
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
        // move documents to metadata cache
        for (final Map.Entry<String, SolrInputDocument> entry : this.docBuffer.entrySet()) {
            updateCache(entry.getKey(), AbstractSolrConnector.getMetadata(entry.getValue()));
        }
        this.docBuffer.clear();
    }
}
@Override @Override
public int bufferSize() { public int bufferSize() {
return this.processQueue.size(); return this.updateCapacity;
} }
@Override @Override
@ -148,57 +137,6 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
this.missCache.clear(); this.missCache.clear();
} }
/**
 * Debugging hook that the lookup methods call whenever a request is answered
 * without asking the wrapped connector. The body is intentionally empty;
 * re-enable the log statement below to trace such cache hits.
 */
private static void cacheSuccessSign() {
//ConcurrentLog.info("ConcurrentUpdate", "**** cache hit");
}
/**
 * Scans the process queue in order and reports whether the last queued
 * operation for the given id is a delete.
 *
 * A later add for the same id cancels an earlier delete, so the scan must
 * always run to the end of the queue. If an add was seen but the final
 * state is "deleted", the obsolete add is removed from the queue.
 *
 * @param id the document id to look for
 * @return true if the document is scheduled for deletion
 */
private boolean containsDeleteInProcessQueue(final String id) {
    boolean deletePending = false;
    boolean addSeen = false;
    for (final Object entry : this.processQueue) {
        if (entry == null) {
            break;
        }
        // an entry is either a delete request (String) or an add request
        // (SolrInputDocument), never both; later entries override earlier ones
        if (checkDelete(entry, id)) {
            deletePending = true;
        } else if (checkAdd(entry, id)) {
            deletePending = false;
            addSeen = true;
        }
    }
    if (deletePending && addSeen) {
        removeFromProcessQueue(id); // clean up put+remove
    }
    return deletePending;
}
/**
 * Scans the process queue in order and returns the document that is
 * currently scheduled to be written for the given id.
 *
 * A later delete for the same id cancels an earlier add, so the scan must
 * always run to the end of the queue. If an add was seen but the final
 * state is "no document", the obsolete add is removed from the queue.
 *
 * @param id the document id to look for
 * @return the queued document, or null if none is pending for this id
 */
private SolrInputDocument getFromProcessQueue(final String id) {
    SolrInputDocument pending = null;
    boolean addSeen = false;
    for (final Object entry : this.processQueue) {
        if (entry == null) {
            break;
        }
        // later entries override earlier ones, so never stop at the first match
        if (checkDelete(entry, id)) {
            pending = null;
        } else if (checkAdd(entry, id)) {
            pending = (SolrInputDocument) entry;
            addSeen = true;
        }
    }
    if (addSeen && pending == null) {
        removeFromProcessQueue(id); // clean up put+remove
    }
    return pending;
}
/**
 * Removes the first queued add request for the given id from the process
 * queue; does nothing if there is none.
 *
 * @param id the document id whose add request shall be dropped
 */
private void removeFromProcessQueue(final String id) {
    for (final Iterator<Object> it = this.processQueue.iterator(); it.hasNext();) {
        if (checkAdd(it.next(), id)) {
            it.remove();
            return;
        }
    }
}
/**
 * Tests whether a queue entry is a delete request for the given id.
 * Delete requests are stored in the queue as plain id strings.
 *
 * @param o  the queue entry
 * @param id the document id to compare with
 * @return true if the entry is a String equal to the id
 */
private boolean checkDelete(final Object o, final String id) {
    // instanceof is false for null, so the cast target is never null here
    return o instanceof String && ((String) o).equals(id);
}
/**
 * Tests whether a queue entry is an add request for the given id.
 * Add requests are stored in the queue as SolrInputDocument objects.
 *
 * @param o  the queue entry
 * @param id the document id to compare with
 * @return true if the entry is a document whose id field equals the id
 */
private boolean checkAdd(final Object o, final String id) {
    if (o instanceof SolrInputDocument) {
        final String idField = CollectionSchema.id.getSolrFieldName();
        final String docID = (String) ((SolrInputDocument) o).getFieldValue(idField);
        return docID != null && docID.equals(id);
    }
    return false;
}
private void updateCache(final String id, final Metadata md) { private void updateCache(final String id, final Metadata md) {
if (id == null) return; if (id == null) return;
if (MemoryControl.shortStatus()) { if (MemoryControl.shortStatus()) {
@ -211,7 +149,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
public void ensureAliveProcessHandler() { public void ensureAliveProcessHandler() {
if (this.processHandler == null || !this.processHandler.isAlive()) { if (this.processHandler == null || !this.processHandler.isAlive()) {
this.processHandler = new ProcessHandler(); this.processHandler = new CommitHandler();
this.processHandler.setName(this.getClass().getName() + "_ProcessHandler"); this.processHandler.setName(this.getClass().getName() + "_ProcessHandler");
this.processHandler.start(); this.processHandler.start();
} }
@ -224,22 +162,19 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override @Override
public long getSize() { public long getSize() {
return this.connector.getSize() + this.processQueue.size(); return this.connector.getSize() + this.docBuffer.size();
} }
@Override @Override
public void commit(boolean softCommit) { public void commit(boolean softCommit) {
long timeout = System.currentTimeMillis() + 1000;
ensureAliveProcessHandler(); ensureAliveProcessHandler();
while (this.processQueue.size() > 0) { commitDocBuffer();
try {Thread.sleep(10);} catch (final InterruptedException e) {}
if (System.currentTimeMillis() > timeout) break;
}
this.connector.commit(softCommit); this.connector.commit(softCommit);
} }
@Override @Override
public void optimize(int maxSegments) { public void optimize(int maxSegments) {
commitDocBuffer();
this.connector.optimize(maxSegments); this.connector.optimize(maxSegments);
} }
@ -256,7 +191,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override @Override
public void close() { public void close() {
ensureAliveProcessHandler(); ensureAliveProcessHandler();
try {this.processQueue.put(POISON_PROCESS);} catch (final InterruptedException e) {} this.commitProcessRunning = false;
try {this.processHandler.join();} catch (final InterruptedException e) {} try {this.processHandler.join();} catch (final InterruptedException e) {}
this.connector.close(); this.connector.close();
this.metadataCache.clear(); this.metadataCache.clear();
@ -266,21 +201,20 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override @Override
public void clear() throws IOException { public void clear() throws IOException {
this.processQueue.clear(); this.docBuffer.clear();
this.connector.clear(); this.connector.clear();
this.metadataCache.clear(); this.metadataCache.clear();
this.missCache.clear();
} }
@Override @Override
public synchronized void deleteById(String id) throws IOException { public synchronized void deleteById(String id) throws IOException {
this.metadataCache.remove(id); this.metadataCache.remove(id);
this.missCache.add(id); this.missCache.add(id);
ensureAliveProcessHandler(); synchronized (this.docBuffer) {
if (this.processHandler.isAlive()) { this.docBuffer.remove(id);
try {this.processQueue.put(id);} catch (final InterruptedException e) {}
} else {
this.connector.deleteById(id);
} }
this.connector.deleteById(id);
} }
@Override @Override
@ -289,33 +223,40 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
this.metadataCache.remove(id); this.metadataCache.remove(id);
this.missCache.add(id); this.missCache.add(id);
} }
ensureAliveProcessHandler(); synchronized (this.docBuffer) {
if (this.processHandler.isAlive()) { for (String id: ids) {
for (String id: ids) try {this.processQueue.put(id);} catch (final InterruptedException e) {} this.docBuffer.remove(id);
} else { }
this.connector.deleteByIds(ids);
} }
this.connector.deleteByIds(ids);
} }
@Override @Override
public void deleteByQuery(final String querystring) throws IOException { public void deleteByQuery(final String querystring) throws IOException {
commitDocBuffer();
try { try {
ConcurrentUpdateSolrConnector.this.connector.deleteByQuery(querystring); this.connector.deleteByQuery(querystring);
ConcurrentUpdateSolrConnector.this.metadataCache.clear(); this.metadataCache.clear();
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.severe("ConcurrentUpdateSolrConnector", e.getMessage(), e); ConcurrentLog.severe("ConcurrentUpdateSolrConnector", e.getMessage(), e);
} }
ConcurrentUpdateSolrConnector.this.connector.commit(true);
} }
@Override @Override
public Metadata getMetadata(String id) throws IOException { public Metadata getMetadata(String id) throws IOException {
if (this.missCache.contains(id)) {cacheSuccessSign(); return null;} if (this.missCache.contains(id)) return null;
Metadata md = this.metadataCache.get(id); Metadata md = this.metadataCache.get(id);
if (md != null) {cacheSuccessSign(); return md;} if (md != null) {
if (containsDeleteInProcessQueue(id)) {cacheSuccessSign(); return null;} //System.out.println("*** metadata cache hit; metadataCache.size() = " + metadataCache.size());
SolrInputDocument doc = getFromProcessQueue(id); //Thread.dumpStack();
if (doc != null) {cacheSuccessSign(); return AbstractSolrConnector.getMetadata(doc);} return md;
}
SolrInputDocument doc = this.docBuffer.get(id);
if (doc != null) {
//System.out.println("*** docBuffer cache hit; docBuffer.size() = " + docBuffer.size());
//Thread.dumpStack();
return AbstractSolrConnector.getMetadata(doc);
}
md = this.connector.getMetadata(id); md = this.connector.getMetadata(id);
if (md == null) {this.missCache.add(id); return null;} if (md == null) {this.missCache.add(id); return null;}
updateCache(id, md); updateCache(id, md);
@ -325,26 +266,34 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override @Override
public void add(SolrInputDocument solrdoc) throws IOException, SolrException { public void add(SolrInputDocument solrdoc) throws IOException, SolrException {
String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName()); String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName());
updateCache(id, AbstractSolrConnector.getMetadata(solrdoc)); this.metadataCache.remove(id); // remove the id from the metadata cache because it will be overwritten by the update process anyway
ensureAliveProcessHandler(); ensureAliveProcessHandler();
if (this.processHandler.isAlive()) { if (this.processHandler.isAlive()) {
try {this.processQueue.put(solrdoc);} catch (final InterruptedException e) {} synchronized (this.docBuffer) {this.docBuffer.put(id, solrdoc);}
} else { } else {
this.connector.add(solrdoc); this.connector.add(solrdoc);
updateCache(id, AbstractSolrConnector.getMetadata(solrdoc));
}
if (MemoryControl.shortStatus() || this.docBuffer.size() > this.updateCapacity) {
commitDocBuffer();
} }
} }
@Override @Override
public void add(Collection<SolrInputDocument> solrdocs) throws IOException, SolrException { public void add(Collection<SolrInputDocument> solrdocs) throws IOException, SolrException {
for (SolrInputDocument doc: solrdocs) {
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
updateCache(id, AbstractSolrConnector.getMetadata(doc));
}
ensureAliveProcessHandler(); ensureAliveProcessHandler();
if (this.processHandler.isAlive()) { synchronized (this.docBuffer) {
for (SolrInputDocument doc: solrdocs) try {this.processQueue.put(doc);} catch (final InterruptedException e) {} for (SolrInputDocument solrdoc: solrdocs) {
} else { String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName());
this.connector.add(solrdocs); if (this.processHandler.isAlive()) {
this.docBuffer.put(id, solrdoc);
} else {
this.connector.add(solrdoc);
}
}
}
if (MemoryControl.shortStatus() || this.docBuffer.size() > this.updateCapacity) {
commitDocBuffer();
} }
} }
@ -352,26 +301,36 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
public SolrDocument getDocumentById(final String id, String... fields) throws IOException { public SolrDocument getDocumentById(final String id, String... fields) throws IOException {
assert id.length() == Word.commonHashLength : "wrong id: " + id; assert id.length() == Word.commonHashLength : "wrong id: " + id;
if (this.missCache.contains(id)) return null; if (this.missCache.contains(id)) return null;
if (containsDeleteInProcessQueue(id)) return null; SolrInputDocument idoc = this.docBuffer.get(id);
SolrInputDocument idoc = getFromProcessQueue(id); if (idoc != null) {
if (idoc != null) {cacheSuccessSign(); return ClientUtils.toSolrDocument(idoc);} //System.out.println("*** docBuffer cache hit; docBuffer.size() = " + docBuffer.size());
SolrDocument doc = this.connector.getDocumentById(id, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields)); //Thread.dumpStack();
if (doc == null) { return ClientUtils.toSolrDocument(idoc);
}
SolrDocument solrdoc = this.connector.getDocumentById(id, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
if (solrdoc == null) {
this.missCache.add(id); this.missCache.add(id);
this.metadataCache.remove(id);
} else { } else {
updateCache(id, AbstractSolrConnector.getMetadata(doc)); updateCache(id, AbstractSolrConnector.getMetadata(solrdoc));
} }
return doc; return solrdoc;
} }
@Override @Override
public QueryResponse getResponseByParams(ModifiableSolrParams query) throws IOException, SolrException { public QueryResponse getResponseByParams(ModifiableSolrParams query) throws IOException, SolrException {
commitDocBuffer();
return this.connector.getResponseByParams(query); return this.connector.getResponseByParams(query);
} }
@Override @Override
public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException { public SolrDocumentList getDocumentListByParams(ModifiableSolrParams params) throws IOException, SolrException {
commitDocBuffer();
SolrDocumentList sdl = this.connector.getDocumentListByParams(params); SolrDocumentList sdl = this.connector.getDocumentListByParams(params);
for (SolrDocument doc: sdl) {
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
updateCache(id, AbstractSolrConnector.getMetadata(doc));
}
return sdl; return sdl;
} }
@ -383,6 +342,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override @Override
public SolrDocumentList getDocumentListByQuery(String querystring, String sort, int offset, int count, String... fields) throws IOException, SolrException { public SolrDocumentList getDocumentListByQuery(String querystring, String sort, int offset, int count, String... fields) throws IOException, SolrException {
commitDocBuffer();
if (offset == 0 && count == 1 && querystring.startsWith("id:") && if (offset == 0 && count == 1 && querystring.startsWith("id:") &&
((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') || ((querystring.length() == 17 && querystring.charAt(3) == '"' && querystring.charAt(16) == '"') ||
querystring.length() == 15)) { querystring.length() == 15)) {
@ -393,34 +353,30 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
} }
SolrDocumentList sdl = this.connector.getDocumentListByQuery(querystring, sort, offset, count, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields)); SolrDocumentList sdl = this.connector.getDocumentListByQuery(querystring, sort, offset, count, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
/*
Iterator<SolrDocument> i = sdl.iterator();
while (i.hasNext()) {
SolrDocument doc = i.next();
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
if (doc != null) updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
}
*/
return sdl; return sdl;
} }
@Override @Override
public long getCountByQuery(String querystring) throws IOException { public long getCountByQuery(String querystring) throws IOException {
commitDocBuffer();
return this.connector.getCountByQuery(querystring); return this.connector.getCountByQuery(querystring);
} }
@Override @Override
public Map<String, ReversibleScoreMap<String>> getFacets(String query, int maxresults, String... fields) throws IOException { public Map<String, ReversibleScoreMap<String>> getFacets(String query, int maxresults, String... fields) throws IOException {
commitDocBuffer();
return this.connector.getFacets(query, maxresults, fields); return this.connector.getFacets(query, maxresults, fields);
} }
@Override @Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency, String... fields) { public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency, String... fields) {
commitDocBuffer();
return this.connector.concurrentDocumentsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency, fields); return this.connector.concurrentDocumentsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency, fields);
} }
@Override @Override
public BlockingQueue<String> concurrentIDsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency) { public BlockingQueue<String> concurrentIDsByQuery(String querystring, String sort, int offset, int maxcount, long maxtime, int buffersize, final int concurrency) {
commitDocBuffer();
return this.connector.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency); return this.connector.concurrentIDsByQuery(querystring, sort, offset, maxcount, maxtime, buffersize, concurrency);
} }

@ -37,16 +37,16 @@ public class InstanceMirror {
private EmbeddedInstance embeddedSolrInstance; private EmbeddedInstance embeddedSolrInstance;
private ShardInstance remoteSolrInstance; private ShardInstance remoteSolrInstance;
private Map<String, ConcurrentUpdateSolrConnector> mirrorConnectorCache; private Map<String, SolrConnector> mirrorConnectorCache;
private Map<String, EmbeddedSolrConnector> embeddedConnectorCache; private Map<String, EmbeddedSolrConnector> embeddedConnectorCache;
private Map<String, RemoteSolrConnector> remoteConnectorCache; private Map<String, RemoteSolrConnector> remoteConnectorCache;
public InstanceMirror() { public InstanceMirror() {
this.embeddedSolrInstance = null; this.embeddedSolrInstance = null;
this.remoteSolrInstance = null; this.remoteSolrInstance = null;
this.mirrorConnectorCache = new ConcurrentHashMap<String, ConcurrentUpdateSolrConnector>(); this.mirrorConnectorCache = new ConcurrentHashMap<>();
this.embeddedConnectorCache = new ConcurrentHashMap<String, EmbeddedSolrConnector>(); this.embeddedConnectorCache = new ConcurrentHashMap<>();
this.remoteConnectorCache = new ConcurrentHashMap<String, RemoteSolrConnector>(); this.remoteConnectorCache = new ConcurrentHashMap<>();
} }
public boolean isConnectedEmbedded() { public boolean isConnectedEmbedded() {
@ -161,11 +161,12 @@ public class InstanceMirror {
} }
public SolrConnector getGenericMirrorConnector(String corename) { public SolrConnector getGenericMirrorConnector(String corename) {
ConcurrentUpdateSolrConnector msc = this.mirrorConnectorCache.get(corename); SolrConnector msc = this.mirrorConnectorCache.get(corename);
if (msc != null) return msc; if (msc != null) return msc;
EmbeddedSolrConnector esc = getEmbeddedConnector(corename); EmbeddedSolrConnector esc = getEmbeddedConnector(corename);
RemoteSolrConnector rsc = getRemoteConnector(corename); RemoteSolrConnector rsc = getRemoteConnector(corename);
msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), 100000, Runtime.getRuntime().availableProcessors()); msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), 10000, Runtime.getRuntime().availableProcessors());
//msc = new MirrorSolrConnector(esc, rsc);
this.mirrorConnectorCache.put(corename, msc); this.mirrorConnectorCache.put(corename, msc);
return msc; return msc;
} }

@ -256,6 +256,6 @@ public class RemoteInstance implements SolrInstance {
} }
public static int queueSizeByMemory() { public static int queueSizeByMemory() {
return (int) Math.min(500, Math.max(1, MemoryControl.maxMemory() / 1024 / 1024 / 12)); return (int) Math.min(30, Math.max(1, MemoryControl.maxMemory() / 1024 / 1024 / 12));
} }
} }

@ -241,7 +241,7 @@ public final class Fulltext {
public long collectionSize() { public long collectionSize() {
long t = System.currentTimeMillis(); long t = System.currentTimeMillis();
if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue; if (t - this.collectionSizeLastAccess < 1000) return this.collectionSizeLastValue;
SolrConnector sc = this.solrInstances.getDefaultMirrorConnector(); SolrConnector sc = getDefaultConnector();
if (sc == null) return 0; if (sc == null) return 0;
long size = sc.getSize(); long size = sc.getSize();
this.collectionSizeLastAccess = t; this.collectionSizeLastAccess = t;
@ -429,7 +429,7 @@ public final class Fulltext {
final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" + final String collectionQuery = CollectionSchema.host_s.getSolrFieldName() + ":\"" + host + "\"" +
((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : ""); ((freshdate != null && freshdate.before(new Date())) ? (" AND " + CollectionSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]") : "");
final AtomicInteger count = new AtomicInteger(0); final AtomicInteger count = new AtomicInteger(0);
final BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(collectionQuery, null, 0, 1000000, 600000, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName()); final BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(collectionQuery, null, 0, 1000000, Long.MAX_VALUE, 100, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName());
try { try {
Set<String> deleteIDs = new HashSet<String>(); Set<String> deleteIDs = new HashSet<String>();
SolrDocument doc; SolrDocument doc;
@ -665,7 +665,7 @@ public final class Fulltext {
this.count++; this.count++;
} }
} else { } else {
BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, 10 * 60 * 60 * 1000, 100, 1, BlockingQueue<SolrDocument> docs = Fulltext.this.getDefaultConnector().concurrentDocumentsByQuery(this.query + " AND " + CollectionSchema.httpstatus_i.getSolrFieldName() + ":200", null, 0, 100000000, Long.MAX_VALUE, 100, 1,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(), CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.title.getSolrFieldName(),
CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName()); CollectionSchema.author.getSolrFieldName(), CollectionSchema.description_txt.getSolrFieldName(), CollectionSchema.size_i.getSolrFieldName(), CollectionSchema.last_modified.getSolrFieldName());
SolrDocument doc; SolrDocument doc;

@ -268,7 +268,7 @@ public class Segment {
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) { if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
// read the references from the webgraph // read the references from the webgraph
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector(); SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, 1000, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName()); BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, 10000, 100, 1, WebgraphSchema.source_id_s.getSolrFieldName());
SolrDocument doc; SolrDocument doc;
try { try {
while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) { while ((doc = docs.take()) != AbstractSolrConnector.POISON_DOCUMENT) {

@ -996,7 +996,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
collection.contains(CollectionSchema.cr_host_chance_d) && collection.contains(CollectionSchema.cr_host_chance_d) &&
collection.contains(CollectionSchema.cr_host_norm_i)))) try { collection.contains(CollectionSchema.cr_host_norm_i)))) try {
int concurrency = Math.min(hostscore.size(), Runtime.getRuntime().availableProcessors()); int concurrency = Math.min(hostscore.size(), Runtime.getRuntime().availableProcessors());
ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts, concrrency = " + concurrency); ConcurrentLog.info("CollectionConfiguration", "collecting " + hostscore.size() + " hosts, concurrency = " + concurrency);
int countcheck = 0; int countcheck = 0;
for (String host: hostscore.keyList(true)) { for (String host: hostscore.keyList(true)) {
// Patch the citation index for links with canonical tags. // Patch the citation index for links with canonical tags.
@ -1005,7 +1005,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links // To do so, we first must collect all canonical links, find all references to them, get the anchor list of the documents and patch the citation reference of these links
String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM; String patchquery = CollectionSchema.host_s.getSolrFieldName() + ":" + host + " AND " + CollectionSchema.canonical_s.getSolrFieldName() + AbstractSolrConnector.CATCHALL_DTERM;
long patchquerycount = collectionConnector.getCountByQuery(patchquery); long patchquerycount = collectionConnector.getCountByQuery(patchquery);
BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, 86400000, 200, 1, BlockingQueue<SolrDocument> documents_with_canonical_tag = collectionConnector.concurrentDocumentsByQuery(patchquery, CollectionSchema.url_chars_i.getSolrFieldName() + " asc", 0, 100000000, Long.MAX_VALUE, 200, 1,
CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName()); CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.canonical_s.getSolrFieldName());
SolrDocument doc_B; SolrDocument doc_B;
int patchquerycountcheck = 0; int patchquerycountcheck = 0;
@ -1087,7 +1087,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
final long count = segment.fulltext().getWebgraphConnector().getCountByQuery(query); final long count = segment.fulltext().getWebgraphConnector().getCountByQuery(query);
int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4)); int concurrency = Math.min((int) count, Math.max(1, Runtime.getRuntime().availableProcessors() / 4));
ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency); ConcurrentLog.info("CollectionConfiguration", "collecting " + count + " documents from the webgraph, concurrency = " + concurrency);
final BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 100000000, 86400000, 200, concurrency); final BlockingQueue<SolrDocument> docs = segment.fulltext().getWebgraphConnector().concurrentDocumentsByQuery(query, WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 100000000, Long.MAX_VALUE, 200, concurrency);
final AtomicInteger proccount = new AtomicInteger(0); final AtomicInteger proccount = new AtomicInteger(0);
Thread[] t = new Thread[concurrency]; Thread[] t = new Thread[concurrency];
for (final AtomicInteger i = new AtomicInteger(0); i.get() < t.length; i.incrementAndGet()) { for (final AtomicInteger i = new AtomicInteger(0); i.get() < t.length; i.incrementAndGet()) {
@ -1127,7 +1127,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
try { try {
sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName()); sid.removeField(WebgraphSchema.process_sxt.getSolrFieldName());
sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName()); sid.removeField(WebgraphSchema.harvestkey_s.getSolrFieldName());
segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName())); //segment.fulltext().getWebgraphConnector().deleteById((String) sid.getFieldValue(WebgraphSchema.id.getSolrFieldName()));
segment.fulltext().getWebgraphConnector().add(sid); segment.fulltext().getWebgraphConnector().add(sid);
} catch (SolrException e) { } catch (SolrException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
@ -1173,9 +1173,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery( BlockingQueue<SolrDocument> docs = collectionConnector.concurrentDocumentsByQuery(
query, query,
CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false CollectionSchema.host_subdomain_s.getSolrFieldName() + " asc," + // sort on subdomain to get hosts without subdomain first; that gives an opportunity to set www_unique_b flag to false
CollectionSchema.url_protocol_s.getSolrFieldName() + " asc," + // sort on protocol to get http before htts; that gives an opportunity to set http_unique_b flag to false CollectionSchema.url_protocol_s.getSolrFieldName() + " asc," + // sort on protocol to get http before https; that gives an opportunity to set http_unique_b flag to false
CollectionSchema.url_chars_i.getSolrFieldName() + " asc", CollectionSchema.url_chars_i.getSolrFieldName() + " asc",
0, 100000000, 86400000, 200, 1); 0, 100000000, Long.MAX_VALUE, 200, 1);
int countcheck = 0; int countcheck = 0;
Collection<String> failids = new ArrayList<String>(); Collection<String> failids = new ArrayList<String>();
SolrDocument doc; SolrDocument doc;
@ -1232,7 +1232,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName()); sid.removeField(CollectionSchema.harvestkey_s.getSolrFieldName());
// send back to index // send back to index
collectionConnector.deleteById(i); //collectionConnector.deleteById(i);
collectionConnector.add(sid); collectionConnector.add(sid);
proccount++; allcount.incrementAndGet(); proccount++; allcount.incrementAndGet();
@ -1260,6 +1260,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
} catch (IOException e3) { } catch (IOException e3) {
ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3); ConcurrentLog.warn("CollectionConfiguration", e3.getMessage(), e3);
} }
collectionConnector.commit(true); // make changes available directly to prevent that the process repeats again
return allcount.get(); return allcount.get();
} }

@ -57,7 +57,7 @@ public class HyperlinkGraph implements Iterable<HyperlinkEdge> {
this.hostname = null; this.hostname = null;
} }
public void fill(final SolrConnector solrConnector, String hostname, final DigestURL stopURL, final int maxtime, final int maxnodes) { public void fill(final SolrConnector solrConnector, String hostname, final DigestURL stopURL, final long maxtime, final int maxnodes) {
this.hostname = hostname; this.hostname = hostname;
if (hostname.startsWith("www.")) hostname = hostname.substring(4); if (hostname.startsWith("www.")) hostname = hostname.substring(4);
StringBuilder q = new StringBuilder(); StringBuilder q = new StringBuilder();

Loading…
Cancel
Save