prevent overwrite of crawled or received full documents by (newer) metadata

To protect rich index data (the full resource) from being overwritten by metadata gathered during remote search,
the newly introduced "firstSeen" index is used to differentiate between a full-resource document and metadata,
as a "firstSeen" entry is only added when a full-resource document is stored (during crawl or remote search).
pull/1/head
reger 10 years ago
parent 7cf28c4f94
commit 796770e070

@ -746,7 +746,11 @@ public final class Protocol {
if (event.addResultsToLocalIndex) { if (event.addResultsToLocalIndex) {
for (URIMetadataNode entry : storeDocs) { for (URIMetadataNode entry : storeDocs) {
try { try {
// firstSeen is set on access (crawl/index) to the full resource;
// if a firstSeen entry already exists, prevent metadata from overwriting this rich data (this can be the case if the crawl data has an older loaddate than the metadata)
if (!event.query.getSegment().firstSeen().has(entry.hash())) { // TODO: cleanup firstSeen on document deletion from index
event.query.getSegment().fulltext().putMetadata(entry); event.query.getSegment().fulltext().putMetadata(entry);
}
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
@ -1111,14 +1115,14 @@ public final class Protocol {
// passed all checks, store url // passed all checks, store url
if (!localsearch) { if (!localsearch) {
event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
// put the remote documents to the local index. We must convert the solr document to a solr input document: // put the remote documents to the local index. We must convert the solr document to a solr input document:
if (event.addResultsToLocalIndex) { if (event.addResultsToLocalIndex) {
final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
// the input document stays untouched because it contains top-level cloned objects // the input document stays untouched because it contains top-level cloned objects
if (event.addResultsToLocalIndex) docs.add(sid); docs.add(sid);
// will be stored to index, and is a full solr document, can be added to firstseen
event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
} }
// after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document // after this conversion we can remove the largest and not used field text_t and synonyms_sxt from the document

Loading…
Cancel
Save