replaced check for load time method

Instead of loading the Solr document, a separate index that holds only the
last load time was created. This avoids Solr having to fetch from its own
index while that index is being written. Excessive re-loading of documents
during indexing has been shown to produce deadlocks, so this should now be
prevented.
pull/436/head
Michael Peter Christen 3 years ago
parent 1ead7b85b5
commit 163ba26d90

@ -82,11 +82,11 @@ public class IndexControlURLs_p {
List<File> dumpFiles = segment.fulltext().dumpFiles(); List<File> dumpFiles = segment.fulltext().dumpFiles();
prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath()); prop.put("dumprestore_dumpfile", dumpFiles.size() == 0 ? "" : dumpFiles.get(dumpFiles.size() - 1).getAbsolutePath());
prop.put("dumprestore_optimizemax", 10); prop.put("dumprestore_optimizemax", 10);
prop.put("dumprestore_rebootSolrEnabled", prop.put("dumprestore_rebootSolrEnabled",
sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT) SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)
&& !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, && !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED,
SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT)); SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT));
prop.put("cleanup", ucount == 0 ? 0 : 1); prop.put("cleanup", ucount == 0 ? 0 : 1);
prop.put("cleanupsolr", segment.fulltext().connectedRemoteSolr() ? 1 : 0); prop.put("cleanupsolr", segment.fulltext().connectedRemoteSolr() ? 1 : 0);
prop.put("cleanuprwi", segment.termIndex() != null && !segment.termIndex().isEmpty() ? 1 : 0); prop.put("cleanuprwi", segment.termIndex() != null && !segment.termIndex().isEmpty() ? 1 : 0);
@ -119,14 +119,20 @@ public class IndexControlURLs_p {
// delete everything // delete everything
if ( post.containsKey("deletecomplete") ) { if ( post.containsKey("deletecomplete") ) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
if ( post.get("deleteIndex", "").equals("on") ) { if ( post.get("deleteIndex", "").equals("on") ) {
try {segment.fulltext().clearLocalSolr();} catch (final IOException e) {} try {
segment.fulltext().clearLocalSolr();
segment.loadTimeIndex().clear();
} catch (final IOException e) {}
} }
if ( post.get("deleteRemoteSolr", "").equals("on")) { if ( post.get("deleteRemoteSolr", "").equals("on")) {
try {segment.fulltext().clearRemoteSolr();} catch (final IOException e) {} try {
segment.fulltext().clearRemoteSolr();
segment.loadTimeIndex().clear();
} catch (final IOException e) {}
} }
if ( post.get("deleteRWI", "").equals("on")) { if ( post.get("deleteRWI", "").equals("on")) {
if (segment.termIndex() != null) try {segment.termIndex().clear();} catch (final IOException e) {} if (segment.termIndex() != null) try {segment.termIndex().clear();} catch (final IOException e) {}
@ -135,7 +141,10 @@ public class IndexControlURLs_p {
if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {} if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {}
} }
if ( post.get("deleteFirstSeen", "").equals("on")) { if ( post.get("deleteFirstSeen", "").equals("on")) {
try {segment.firstSeen().clear();} catch (final IOException e) {} try {
segment.firstSeenIndex().clear();
segment.loadTimeIndex().clear();
} catch (final IOException e) {}
} }
if ( post.get("deleteCrawlQueues", "").equals("on") ) { if ( post.get("deleteCrawlQueues", "").equals("on") ) {
sb.crawlQueues.clear(); sb.crawlQueues.clear();
@ -152,8 +161,8 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("urlhashdeleteall")) { if (post.containsKey("urlhashdeleteall")) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName)); ClientIdentification.Agent agent = ClientIdentification.getAgent(post.get("agentName", ClientIdentification.yacyInternetCrawlerAgentName));
int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST); int i = segment.removeAllUrlReferences(urlhash.getBytes(), sb.loader, agent, CacheStrategy.IFEXIST);
@ -161,8 +170,8 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("urlhashdelete")) { if (post.containsKey("urlhashdelete")) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
DigestURL url; DigestURL url;
try { try {
@ -181,8 +190,8 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("urldelete")) { if (post.containsKey("urldelete")) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
try { try {
urlhash = ASCII.String((new DigestURL(urlstring)).hash()); urlhash = ASCII.String((new DigestURL(urlstring)).hash());
@ -229,31 +238,31 @@ public class IndexControlURLs_p {
} }
if (post.containsKey("optimizesolr")) { if (post.containsKey("optimizesolr")) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
final int size = post.getInt("optimizemax", 10); final int size = post.getInt("optimizemax", 10);
segment.fulltext().optimize(size); segment.fulltext().optimize(size);
sb.tables.recordAPICall(post, "IndexControlURLs_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr optimize " + size); sb.tables.recordAPICall(post, "IndexControlURLs_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr optimize " + size);
} }
if (post.containsKey("rebootsolr")) { if (post.containsKey("rebootsolr")) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
if (sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT, if (sb.getConfigBool(SwitchboardConstants.CORE_SERVICE_FULLTEXT,
SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT) SwitchboardConstants.CORE_SERVICE_FULLTEXT_DEFAULT)
&& !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED, && !sb.getConfigBool(SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED,
SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT)) { SwitchboardConstants.FEDERATED_SERVICE_SOLR_INDEXING_ENABLED_DEFAULT)) {
/* This operation is designed only for an embdded local Solr with no mirroring to an external remote Solr server */ /* This operation is designed only for an embdded local Solr with no mirroring to an external remote Solr server */
segment.fulltext().rebootEmbeddedLocalSolr(); segment.fulltext().rebootEmbeddedLocalSolr();
sb.tables.recordAPICall(post, "IndexControlURLs_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr reboot"); sb.tables.recordAPICall(post, "IndexControlURLs_p.html", WorkTables.TABLE_API_TYPE_STEERING, "solr reboot");
} }
} }
if (post.containsKey("deletedomain")) { if (post.containsKey("deletedomain")) {
/* Check the transaction is valid */ /* Check the transaction is valid */
TransactionManager.checkPostTransaction(header, post); TransactionManager.checkPostTransaction(header, post);
final String domain = post.get("domain"); final String domain = post.get("domain");
Set<String> hostnames = new HashSet<String>(); Set<String> hostnames = new HashSet<String>();

@ -96,6 +96,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
protected final static int pagesize_docs = 100; protected final static int pagesize_docs = 100;
protected final static int pagesize_ids = 1000; protected final static int pagesize_ids = 1000;
@Deprecated
protected static LoadTimeURL getLoadTimeURL(final Object doc) { protected static LoadTimeURL getLoadTimeURL(final Object doc) {
if (doc == null) return null; if (doc == null) return null;
Object d = null; Object d = null;
@ -478,6 +479,7 @@ public abstract class AbstractSolrConnector implements SolrConnector {
* @return metadata if any entry in solr exists, null otherwise * @return metadata if any entry in solr exists, null otherwise
* @throws IOException * @throws IOException
*/ */
@Deprecated
@Override @Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException { public LoadTimeURL getLoadTimeURL(String id) throws IOException {
// construct raw query // construct raw query

@ -425,10 +425,12 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
/** /**
* check if a given document, identified by url hash as document id exists * check if a given document, identified by url hash as document id exists
* @deprecated use Segment.getLoadTimeURL instead
* @param id the url hash and document id * @param id the url hash and document id
* @return the load date if any entry in solr exists, null otherwise * @return the load date if any entry in solr exists, null otherwise
* @throws IOException * @throws IOException
*/ */
@Deprecated
@Override @Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException { public LoadTimeURL getLoadTimeURL(String id) throws IOException {
int responseCount = 0; int responseCount = 0;

@ -27,9 +27,6 @@ import java.util.Map;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.AtomicLong;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.kelondro.data.word.Word;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
@ -38,6 +35,9 @@ import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.ModifiableSolrParams;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.kelondro.data.word.Word;
public class MirrorSolrConnector extends AbstractSolrConnector implements SolrConnector { public class MirrorSolrConnector extends AbstractSolrConnector implements SolrConnector {
// the twin solrs // the twin solrs
@ -188,7 +188,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException { public SolrDocument getDocumentById(final String key, final String ... fields) throws IOException {
assert key.length() == Word.commonHashLength : "wrong id: " + key; assert key.length() == Word.commonHashLength : "wrong id: " + key;
SolrDocument doc; SolrDocument doc;
if ((solr0 != null && ((doc = solr0.getDocumentById(key, fields)) != null)) || (solr1 != null && ((doc = solr1.getDocumentById(key, fields)) != null))) { if ((this.solr0 != null && ((doc = this.solr0.getDocumentById(key, fields)) != null)) || (this.solr1 != null && ((doc = this.solr1.getDocumentById(key, fields)) != null))) {
return doc; return doc;
} }
return null; return null;
@ -418,6 +418,7 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
return result; return result;
} }
@Deprecated
@Override @Override
public LoadTimeURL getLoadTimeURL(String id) throws IOException { public LoadTimeURL getLoadTimeURL(String id) throws IOException {
if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTimeURL(id); if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTimeURL(id);

@ -26,8 +26,6 @@ import java.util.LinkedHashMap;
import java.util.List; import java.util.List;
import java.util.concurrent.BlockingQueue; import java.util.concurrent.BlockingQueue;
import net.yacy.cora.sorting.ReversibleScoreMap;
import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.SolrDocumentList;
@ -35,6 +33,8 @@ import org.apache.solr.common.SolrException;
import org.apache.solr.common.SolrInputDocument; import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.params.ModifiableSolrParams; import org.apache.solr.common.params.ModifiableSolrParams;
import net.yacy.cora.sorting.ReversibleScoreMap;
public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ { public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ {
public static class LoadTimeURL { public static class LoadTimeURL {
@ -123,6 +123,7 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
* @return the load time metadata (url and load data) if any entry in solr exists, null otherwise * @return the load time metadata (url and load data) if any entry in solr exists, null otherwise
* @throws IOException * @throws IOException
*/ */
@Deprecated
public LoadTimeURL getLoadTimeURL(final String id) throws IOException; public LoadTimeURL getLoadTimeURL(final String id) throws IOException;
/** /**

@ -425,23 +425,15 @@ public final class CrawlStacker implements WorkflowTask<Request>{
if (dbocc != null) { if (dbocc != null) {
return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": " + dbocc.name(); return CRAWL_REJECT_REASON_DOUBLE_IN_PREFIX + ": " + dbocc.name();
} }
final String urlhash = ASCII.String(url.hash()); String urls = url.toNormalform(false);
LoadTimeURL oldEntry = null; LoadTimeURL oldEntry = this.indexSegment.getLoadTimeURL(urls, url.hash());
try {
oldEntry = this.indexSegment.fulltext().getDefaultConnector().getLoadTimeURL(urlhash);
} catch (final IOException e) {
// if an exception here occurs then there is the danger that urls which had been in the crawler are overwritten a second time
// to prevent that, we reject urls in these events
ConcurrentLog.logException(e);
return "exception during double-test: " + e.getMessage();
}
// deny urls that exceed allowed number of occurrences // deny urls that exceed allowed number of occurrences
final int maxAllowedPagesPerDomain = profile.domMaxPages(); final int maxAllowedPagesPerDomain = profile.domMaxPages();
if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) { if (maxAllowedPagesPerDomain < Integer.MAX_VALUE && maxAllowedPagesPerDomain > 0) {
final AtomicInteger dp = profile.getCount(url.getHost()); final AtomicInteger dp = profile.getCount(url.getHost());
if (dp != null && dp.get() >= maxAllowedPagesPerDomain) { if (dp != null && dp.get() >= maxAllowedPagesPerDomain) {
if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + url.toNormalform(false) + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed."); if (CrawlStacker.log.isFine()) CrawlStacker.log.fine("URL '" + urls + "' appeared too often in crawl stack, a maximum of " + maxAllowedPagesPerDomain + " is allowed.");
return "crawl stack domain counter exceeded (test by profile)"; return "crawl stack domain counter exceeded (test by profile)";
} }

File diff suppressed because it is too large Load Diff

@ -319,8 +319,8 @@ public final class Fulltext {
private long lastCommit = 0; private long lastCommit = 0;
public void commit(boolean softCommit) { public void commit(boolean softCommit) {
long t = System.currentTimeMillis(); long t = System.currentTimeMillis();
if (lastCommit + 10000 > t) return; if (this.lastCommit + 10000 > t) return;
lastCommit = t; this.lastCommit = t;
getDefaultConnector().commit(softCommit); getDefaultConnector().commit(softCommit);
if (this.writeWebgraph) getWebgraphConnector().commit(softCommit); if (this.writeWebgraph) getWebgraphConnector().commit(softCommit);
} }
@ -587,6 +587,7 @@ public final class Fulltext {
* @param urlHash * @param urlHash
* @return the time in milliseconds since epoch for the load time or -1 if the document does not exist * @return the time in milliseconds since epoch for the load time or -1 if the document does not exist
*/ */
@Deprecated
private long getLoadTime(final String urlHash) throws IOException { private long getLoadTime(final String urlHash) throws IOException {
if (urlHash == null) return -1l; if (urlHash == null) return -1l;
SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash); SolrConnector.LoadTimeURL md = this.getDefaultConnector().getLoadTimeURL(urlHash);
@ -854,7 +855,7 @@ public final class Fulltext {
try (/* Resources automatically closed by this try-with-resources statement */ try (/* Resources automatically closed by this try-with-resources statement */
final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f); final OutputStream os = new FileOutputStream(this.format == ExportFormat.solr ? new File(this.f.getAbsolutePath() + ".gz") : this.f);
final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{def.setLevel(Deflater.BEST_COMPRESSION);}} : os; final OutputStream wrappedStream = ((this.format == ExportFormat.solr)) ? new GZIPOutputStream(os, 65536){{this.def.setLevel(Deflater.BEST_COMPRESSION);}} : os;
final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream)); final PrintWriter pw = new PrintWriter(new BufferedOutputStream(wrappedStream));
) { ) {
if (this.format == ExportFormat.html) { if (this.format == ExportFormat.html) {

@ -51,6 +51,7 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector; import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.ByteOrder; import net.yacy.cora.order.ByteOrder;
@ -110,8 +111,9 @@ public class Segment {
public static final long targetFileSize = 64 * 1024 * 1024; // 256 MB public static final long targetFileSize = 64 * 1024 * 1024; // 256 MB
public static final int writeBufferSize = 4 * 1024 * 1024; public static final int writeBufferSize = 4 * 1024 * 1024;
public static final String termIndexName = "text.index"; public static final String termIndexName = "text.index";
public static final String citationIndexName = "citation.index"; public static final String citationIndexName = "citation.index";
public static final String firstseenIndexName = "firstseen.index"; public static final String firstseenIndexName = "firstseen.index";
public static final String loadtimeIndexName = "loadtime.index";
// the reference factory // the reference factory
public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory(); public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
@ -122,9 +124,10 @@ public class Segment {
private final File segmentPath; private final File segmentPath;
protected final Fulltext fulltext; protected final Fulltext fulltext;
protected IndexCell<WordReference> termIndex; protected IndexCell<WordReference> termIndex;
protected IndexCell<CitationReference> urlCitationIndex; private IndexCell<CitationReference> urlCitationIndex;
protected IndexTable firstSeenIndex; private IndexTable firstSeenIndex;
protected IODispatcher merger = null; // shared iodispatcher for kelondro indexes private IndexTable loadTimeIndex;
private IODispatcher merger = null; // shared iodispatcher for kelondro indexes
/** /**
* create a new Segment * create a new Segment
@ -143,6 +146,7 @@ public class Segment {
this.termIndex = null; this.termIndex = null;
this.urlCitationIndex = null; this.urlCitationIndex = null;
this.firstSeenIndex = new IndexTable(new File(segmentPath, firstseenIndexName), 12, 8, false, false); this.firstSeenIndex = new IndexTable(new File(segmentPath, firstseenIndexName), 12, 8, false, false);
this.loadTimeIndex = new IndexTable(new File(segmentPath, loadtimeIndexName), 12, 8, false, false);
} }
public boolean connectedRWI() { public boolean connectedRWI() {
@ -166,7 +170,7 @@ public class Segment {
targetFileSize, targetFileSize,
maxFileSize, maxFileSize,
writeBufferSize, writeBufferSize,
merger); this.merger);
} }
public void disconnectRWI() { public void disconnectRWI() {
@ -196,7 +200,7 @@ public class Segment {
targetFileSize, targetFileSize,
maxFileSize, maxFileSize,
writeBufferSize, writeBufferSize,
merger); this.merger);
} }
public void disconnectCitation() { public void disconnectCitation() {
@ -225,10 +229,14 @@ public class Segment {
return this.urlCitationIndex; return this.urlCitationIndex;
} }
public IndexTable firstSeen() { public IndexTable firstSeenIndex() {
return this.firstSeenIndex; return this.firstSeenIndex;
} }
/**
 * Give access to the table that stores load times keyed by url hash.
 *
 * @return the load-time index of this segment
 */
public IndexTable loadTimeIndex() {
    return this.loadTimeIndex;
}
public ReferenceReportCache getReferenceReportCache() { public ReferenceReportCache getReferenceReportCache() {
return new ReferenceReportCache(); return new ReferenceReportCache();
} }
@ -239,12 +247,12 @@ public class Segment {
this.cache = new ConcurrentHashMap<String, ReferenceReport>(); this.cache = new ConcurrentHashMap<String, ReferenceReport>();
} }
public ReferenceReport getReferenceReport(final String id, final boolean acceptSelfReference) throws IOException { public ReferenceReport getReferenceReport(final String id, final boolean acceptSelfReference) throws IOException {
ReferenceReport rr = cache.get(id); ReferenceReport rr = this.cache.get(id);
if (MemoryControl.shortStatus()) cache.clear(); if (MemoryControl.shortStatus()) this.cache.clear();
if (rr != null) return rr; if (rr != null) return rr;
try { try {
rr = new ReferenceReport(ASCII.getBytes(id), acceptSelfReference); rr = new ReferenceReport(ASCII.getBytes(id), acceptSelfReference);
cache.put(id, rr); this.cache.put(id, rr);
return rr; return rr;
} catch (final SpaceExceededException e) { } catch (final SpaceExceededException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
@ -278,19 +286,19 @@ public class Segment {
CitationReference ref = ri.next(); CitationReference ref = ri.next();
byte[] hh = ref.hosthash(); // host hash byte[] hh = ref.hosthash(); // host hash
if (ByteBuffer.equals(hh, 0, id, 6, 6)) { if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
internalIDs.put(ref.urlhash()); this.internalIDs.put(ref.urlhash());
internal++; this.internal++;
} else { } else {
externalHosts.put(hh); this.externalHosts.put(hh);
externalIDs.put(ref.urlhash()); this.externalIDs.put(ref.urlhash());
external++; this.external++;
} }
} }
} catch (SpaceExceededException e) { } catch (SpaceExceededException e) {
// the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now // the Citation Index got too large, we ignore the problem and hope that a second solr index is attached which will take over now
if (Segment.this.fulltext.useWebgraph()) internalIDs.clear(); if (Segment.this.fulltext.useWebgraph()) this.internalIDs.clear();
} }
if ((internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) { if ((this.internalIDs.size() == 0 || !connectedCitation()) && Segment.this.fulltext.useWebgraph()) {
// reqd the references from the webgraph // reqd the references from the webgraph
SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector(); SolrConnector webgraph = Segment.this.fulltext.getWebgraphConnector();
BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!cache=false raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, Long.MAX_VALUE, 100, 1, false, WebgraphSchema.source_id_s.getSolrFieldName()); BlockingQueue<SolrDocument> docs = webgraph.concurrentDocumentsByQuery("{!cache=false raw f=" + WebgraphSchema.target_id_s.getSolrFieldName() + "}" + ASCII.String(id), WebgraphSchema.source_chars_i.getSolrFieldName() + " asc", 0, 10000000, Long.MAX_VALUE, 100, 1, false, WebgraphSchema.source_id_s.getSolrFieldName());
@ -305,13 +313,13 @@ public class Segment {
System.arraycopy(refidh, 6, hh, 0, 6); System.arraycopy(refidh, 6, hh, 0, 6);
if (ByteBuffer.equals(hh, 0, id, 6, 6)) { if (ByteBuffer.equals(hh, 0, id, 6, 6)) {
if (acceptSelfReference || !Arrays.equals(refidh, id)) { if (acceptSelfReference || !Arrays.equals(refidh, id)) {
internalIDs.put(refidh); this.internalIDs.put(refidh);
internal++; this.internal++;
} }
} else { } else {
externalHosts.put(hh); this.externalHosts.put(hh);
externalIDs.put(refidh); this.externalIDs.put(refidh);
external++; this.external++;
} }
} }
} catch (final InterruptedException e) { } catch (final InterruptedException e) {
@ -398,6 +406,35 @@ public class Segment {
} }
} }
/**
 * Store the load time of a document in the load-time index.
 * The call is ignored when the hash is null or the time is not positive.
 * An IOException raised by the index is logged and not passed on to the caller.
 *
 * @param urlhash the url hash that identifies the document
 * @param time the load time to store; the indexing code passes System.currentTimeMillis()
 */
public void setLoadTime(final byte[] urlhash, long time) {
    final boolean storable = urlhash != null && time > 0;
    if (storable) {
        try {
            // an entry that already exists for this hash is ALWAYS overwritten
            this.loadTimeIndex.put(urlhash, time);
        } catch (IOException e) {
            ConcurrentLog.logException(e);
        }
    }
}
/**
 * Look up the stored load time of a document in the load-time index.
 * An IOException raised by the index is logged and reported to the caller as -1.
 *
 * @param urlhash the url hash that identifies the document
 * @return the value held by the load-time index for the hash, or -1 if the hash
 *         is null or the lookup failed (NOTE(review): the index presumably also
 *         yields a negative value for unknown hashes - confirm in IndexTable)
 */
public long getLoadTime(final byte[] urlhash) {
    long loadTime = -1;
    if (urlhash != null) {
        try {
            loadTime = this.loadTimeIndex.get(urlhash);
        } catch (IOException e) {
            // loadTime keeps its initial value -1 because the assignment did not happen
            ConcurrentLog.logException(e);
        }
    }
    return loadTime;
}
/**
 * Build the load-time metadata of a document from the load-time index.
 *
 * @param url the url string to place into the result
 * @param urlhash the url hash that identifies the document
 * @return a LoadTimeURL made of the given url and the stored load time,
 *         or null if getLoadTime reports a negative value for the hash
 */
public LoadTimeURL getLoadTimeURL(String url, byte[] urlhash) {
    final long loadTime = getLoadTime(urlhash);
    return loadTime < 0 ? null : new LoadTimeURL(url, loadTime);
}
/**
 * Build the load-time metadata of a document whose url hash is given as a string.
 * Delegates to {@link #getLoadTimeURL(String, byte[])} after converting the id to bytes.
 *
 * @param url the url string to place into the result
 * @param id the url hash (document id) as a string; may be null
 * @return the LoadTimeURL for the document, or null if the id is null or the
 *         byte[] variant finds no load time for it
 */
public LoadTimeURL getLoadTimeURL(String url, String id) {
    // a null id is treated like a null hash in the byte[] variant: no entry
    if (id == null) return null;
    // encode with a fixed charset; String.getBytes() without an argument uses the
    // platform default charset, which could yield different index keys per platform
    return getLoadTimeURL(url, id.getBytes(java.nio.charset.StandardCharsets.US_ASCII));
}
/** /**
* check if a given document, identified by url hash as document id exists * check if a given document, identified by url hash as document id exists
* @param id the url hash and document id * @param id the url hash and document id
@ -483,6 +520,7 @@ public class Segment {
if (this.fulltext != null) this.fulltext.close(); if (this.fulltext != null) this.fulltext.close();
if (this.urlCitationIndex != null) this.urlCitationIndex.close(); if (this.urlCitationIndex != null) this.urlCitationIndex.close();
if (this.firstSeenIndex != null) this.firstSeenIndex.close(); if (this.firstSeenIndex != null) this.firstSeenIndex.close();
if (this.loadTimeIndex != null) this.loadTimeIndex.close();
if (this.merger != null) { if (this.merger != null) {
this.merger.terminate(); this.merger.terminate();
this.merger = null; this.merger = null;
@ -661,7 +699,9 @@ public class Segment {
} }
// REMEMBER FIRST SEEN // REMEMBER FIRST SEEN
setFirstSeenTime(url.hash(), Math.min(document.getLastModified().getTime(), System.currentTimeMillis())); // should exist already in the index at this time, but just to make sure long now = System.currentTimeMillis();
setFirstSeenTime(url.hash(), Math.min(document.getLastModified().getTime(), now)); // should exist already in the index at this time, but just to make sure
setLoadTime(url.hash(), now); // always overwrites index entry
// write the edges to the citation reference index // write the edges to the citation reference index
if (this.connectedCitation()) try { if (this.connectedCitation()) try {
@ -676,7 +716,7 @@ public class Segment {
String referrerhash = id; String referrerhash = id;
String anchorhash = ASCII.String(new DigestURL(targetURL).hash()); String anchorhash = ASCII.String(new DigestURL(targetURL).hash());
if (referrerhash != null && anchorhash != null) { if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime())); this.urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
} }
} catch (Throwable e) { } catch (Throwable e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
@ -692,7 +732,7 @@ public class Segment {
String referrerhash = id; String referrerhash = id;
String anchorhash = ASCII.String(new DigestURL(targetURL).hash()); String anchorhash = ASCII.String(new DigestURL(targetURL).hash());
if (referrerhash != null && anchorhash != null) { if (referrerhash != null && anchorhash != null) {
urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime())); this.urlCitationIndex.add(ASCII.getBytes(anchorhash), new CitationReference(ASCII.getBytes(referrerhash), loadDate.getTime()));
} }
} catch (Throwable e) { } catch (Throwable e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);

Loading…
Cancel
Save