From 9bf0d7ecb9925f149cb2a81106e85814c2b137d0 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 24 Mar 2015 12:32:39 +0100 Subject: [PATCH] added a new collection type 'dht' to all documents from the peer-to-peer interface to distinguish rich and poor document data. This also reverts some changes from commit 796770e070daf38289b594f4cbdc65b9ce0ca2b1 because the firstSeen database is the wrong method to distinguish these types of data --- htroot/yacy/crawlReceipt.java | 2 +- htroot/yacy/transferURL.java | 2 +- .../kelondro/data/meta/URIMetadataNode.java | 9 ++++++--- .../kelondro/logging/ConsoleOutErrHandler.java | 4 ++-- source/net/yacy/peers/Protocol.java | 10 ++++------ source/net/yacy/search/index/Fulltext.java | 18 +++++++++++++++++- 6 files changed, 31 insertions(+), 14 deletions(-) diff --git a/htroot/yacy/crawlReceipt.java b/htroot/yacy/crawlReceipt.java index e73ed3ae9..cdf5fc0a2 100644 --- a/htroot/yacy/crawlReceipt.java +++ b/htroot/yacy/crawlReceipt.java @@ -114,7 +114,7 @@ public final class crawlReceipt { } // generating a new loaded URL entry - final URIMetadataNode entry = URIMetadataNode.importEntry(propStr); + final URIMetadataNode entry = URIMetadataNode.importEntry(propStr, "dht"); if (entry == null) { if (log.isWarn()) log.warn("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); prop.put("delay", "3600"); diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java index a032c411d..4b042376f 100644 --- a/htroot/yacy/transferURL.java +++ b/htroot/yacy/transferURL.java @@ -103,7 +103,7 @@ public final class transferURL { } // parse new lurl-entry - lEntry = URIMetadataNode.importEntry(urls); + lEntry = URIMetadataNode.importEntry(urls, "dht"); if (lEntry == null) { if (Network.log.isWarn()) Network.log.warn("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); blocked++; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index 8f603fe6e..17f2c772d 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -77,7 +77,7 @@ public class URIMetadataNode extends SolrDocument { protected String snippet = null; protected WordReferenceVars word = null; // this is only used if the url is transported via remote search requests - public URIMetadataNode(final Properties prop) { + public URIMetadataNode(final Properties prop, String collection) { // generates an plasmaLURLEntry using the properties from the argument // the property names must correspond to the one from toString //System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); @@ -139,6 +139,9 @@ public class URIMetadataNode extends SolrDocument { this.appc = Integer.parseInt(prop.getProperty("lapp", "0")); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); this.score = Float.parseFloat(prop.getProperty("score", "0.0")); + List cs = new ArrayList(); + cs.add(collection); + this.setField(CollectionSchema.collection_sxt.name(), cs); this.word = null; if (prop.containsKey("wi")) { this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))), false); @@ -497,13 +500,13 @@ public class URIMetadataNode extends SolrDocument { return getStringList(CollectionSchema.description_txt); } - public static URIMetadataNode importEntry(final String propStr) { + public static URIMetadataNode importEntry(final String propStr, String collection) { if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) { ConcurrentLog.severe("URIMetadataNode", "importEntry: propStr is not proper: " + propStr); return null; } try { - return new URIMetadataNode(MapTools.s2p(propStr.substring(1, propStr.length() - 1))); + return new URIMetadataNode(MapTools.s2p(propStr.substring(1, propStr.length() - 1)), collection); } catch (final kelondroException e) { // wrong format ConcurrentLog.severe("URIMetadataNode", e.getMessage()); diff --git a/source/net/yacy/kelondro/logging/ConsoleOutErrHandler.java b/source/net/yacy/kelondro/logging/ConsoleOutErrHandler.java index 9965686d1..0d11cd577 100644 --- a/source/net/yacy/kelondro/logging/ConsoleOutErrHandler.java +++ b/source/net/yacy/kelondro/logging/ConsoleOutErrHandler.java @@ -171,7 +171,7 @@ public final class ConsoleOutErrHandler extends Handler { } @Override - public void setFormatter(final Formatter newFormatter) throws SecurityException { + public synchronized void setFormatter(final Formatter newFormatter) throws SecurityException { super.setFormatter(newFormatter); if (newFormatter == null) return; try { @@ -183,7 +183,7 @@ public final class ConsoleOutErrHandler extends Handler { } @Override - public final void setFilter(final Filter newFilter) throws SecurityException { + public final synchronized void setFilter(final Filter newFilter) throws SecurityException { super.setFilter(newFilter); if (newFilter == null) return; try { diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index b5230b996..77654639e 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -746,11 +746,8 @@ public final class Protocol { if (event.addResultsToLocalIndex) { for (URIMetadataNode entry : storeDocs) { try { - // firstSseen is set on access (crawl/index) to full resource, - // on existing firstSeen prevent that metadata overwrite this rich data (this can be the case if crawldata has older loaddate as metadata) - if (!event.query.getSegment().firstSeen().has(entry.hash())) { // TODO: cleanup firstSeen on document deletion from index - event.query.getSegment().fulltext().putMetadata(entry); - } + event.query.getSegment().setFirstSeenTime(entry.hash(), Math.min(entry.moddate().getTime(), System.currentTimeMillis())); + event.query.getSegment().fulltext().putMetadata(entry); // it will be checked inside the putMetadata that poor metadata does not overwrite rich metadata } catch (final IOException e) { ConcurrentLog.logException(e); } @@ -920,7 +917,7 @@ public final class Protocol { if ( resultLine == null ) { continue; } - final URIMetadataNode urlEntry = URIMetadataNode.importEntry(resultLine); + final URIMetadataNode urlEntry = URIMetadataNode.importEntry(resultLine, "dht"); if ( urlEntry == null ) { continue; } @@ -1115,6 +1112,7 @@ public final class Protocol { // passed all checks, store url if (!localsearch) { + // put the remote documents to the local index. We must convert the solr document to a solr input document: if (event.addResultsToLocalIndex) { final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); diff --git a/source/net/yacy/search/index/Fulltext.java b/source/net/yacy/search/index/Fulltext.java index 7bb610c61..9f9d8fb38 100644 --- a/source/net/yacy/search/index/Fulltext.java +++ b/source/net/yacy/search/index/Fulltext.java @@ -341,8 +341,24 @@ public final class Fulltext { try { // because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten long date = this.getLoadTime(id); - if (date < entry.loaddate().getTime()) { + if (date == -1) { + // document does not exist putDocument(getDefaultConfiguration().metadata2solr(entry)); + } else { + // check if document contains rich data + if (date < entry.loaddate().getTime()) { + SolrDocument doc = this.getDefaultConnector().getDocumentById(id, CollectionSchema.collection_sxt.getSolrFieldName()); + if (doc == null || !doc.containsKey(CollectionSchema.collection_sxt.getSolrFieldName())) { + putDocument(getDefaultConfiguration().metadata2solr(entry)); + } else { + Collection collections = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName()); + for (Object s: collections) { + if (!"dht".equals(s)) return; + } + // passed all checks, overwrite document + putDocument(getDefaultConfiguration().metadata2solr(entry)); + } + } } } catch (final SolrException e) { throw new IOException(e.getMessage(), e);