Added a new collection type 'dht' to all documents received from the peer-to-peer interface, to distinguish rich and poor document data.
This also reverts some changes from commit 796770e070, because the firstSeen database is the wrong method to distinguish these types of data.
pull/1/head
Michael Peter Christen 10 years ago
parent 7fcf0d0b71
commit 9bf0d7ecb9

@ -114,7 +114,7 @@ public final class crawlReceipt {
} }
// generating a new loaded URL entry // generating a new loaded URL entry
final URIMetadataNode entry = URIMetadataNode.importEntry(propStr); final URIMetadataNode entry = URIMetadataNode.importEntry(propStr, "dht");
if (entry == null) { if (entry == null) {
if (log.isWarn()) log.warn("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr); if (log.isWarn()) log.warn("crawlReceipt: RECEIVED wrong RECEIPT (entry null) from peer " + iam + "\n\tURL properties: "+ propStr);
prop.put("delay", "3600"); prop.put("delay", "3600");

@ -103,7 +103,7 @@ public final class transferURL {
} }
// parse new lurl-entry // parse new lurl-entry
lEntry = URIMetadataNode.importEntry(urls); lEntry = URIMetadataNode.importEntry(urls, "dht");
if (lEntry == null) { if (lEntry == null) {
if (Network.log.isWarn()) Network.log.warn("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls); if (Network.log.isWarn()) Network.log.warn("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
blocked++; blocked++;

@ -77,7 +77,7 @@ public class URIMetadataNode extends SolrDocument {
protected String snippet = null; protected String snippet = null;
protected WordReferenceVars word = null; // this is only used if the url is transported via remote search requests protected WordReferenceVars word = null; // this is only used if the url is transported via remote search requests
public URIMetadataNode(final Properties prop) { public URIMetadataNode(final Properties prop, String collection) {
// generates an plasmaLURLEntry using the properties from the argument // generates an plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString // the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
@ -139,6 +139,9 @@ public class URIMetadataNode extends SolrDocument {
this.appc = Integer.parseInt(prop.getProperty("lapp", "0")); this.appc = Integer.parseInt(prop.getProperty("lapp", "0"));
this.snippet = crypt.simpleDecode(prop.getProperty("snippet", "")); this.snippet = crypt.simpleDecode(prop.getProperty("snippet", ""));
this.score = Float.parseFloat(prop.getProperty("score", "0.0")); this.score = Float.parseFloat(prop.getProperty("score", "0.0"));
List<String> cs = new ArrayList<String>();
cs.add(collection);
this.setField(CollectionSchema.collection_sxt.name(), cs);
this.word = null; this.word = null;
if (prop.containsKey("wi")) { if (prop.containsKey("wi")) {
this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))), false); this.word = new WordReferenceVars(new WordReferenceRow(Base64Order.enhancedCoder.decodeString(prop.getProperty("wi", ""))), false);
@ -497,13 +500,13 @@ public class URIMetadataNode extends SolrDocument {
return getStringList(CollectionSchema.description_txt); return getStringList(CollectionSchema.description_txt);
} }
public static URIMetadataNode importEntry(final String propStr) { public static URIMetadataNode importEntry(final String propStr, String collection) {
if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) { if (propStr == null || propStr.isEmpty() || propStr.charAt(0) != '{' || !propStr.endsWith("}")) {
ConcurrentLog.severe("URIMetadataNode", "importEntry: propStr is not proper: " + propStr); ConcurrentLog.severe("URIMetadataNode", "importEntry: propStr is not proper: " + propStr);
return null; return null;
} }
try { try {
return new URIMetadataNode(MapTools.s2p(propStr.substring(1, propStr.length() - 1))); return new URIMetadataNode(MapTools.s2p(propStr.substring(1, propStr.length() - 1)), collection);
} catch (final kelondroException e) { } catch (final kelondroException e) {
// wrong format // wrong format
ConcurrentLog.severe("URIMetadataNode", e.getMessage()); ConcurrentLog.severe("URIMetadataNode", e.getMessage());

@ -171,7 +171,7 @@ public final class ConsoleOutErrHandler extends Handler {
} }
@Override @Override
public void setFormatter(final Formatter newFormatter) throws SecurityException { public synchronized void setFormatter(final Formatter newFormatter) throws SecurityException {
super.setFormatter(newFormatter); super.setFormatter(newFormatter);
if (newFormatter == null) return; if (newFormatter == null) return;
try { try {
@ -183,7 +183,7 @@ public final class ConsoleOutErrHandler extends Handler {
} }
@Override @Override
public final void setFilter(final Filter newFilter) throws SecurityException { public final synchronized void setFilter(final Filter newFilter) throws SecurityException {
super.setFilter(newFilter); super.setFilter(newFilter);
if (newFilter == null) return; if (newFilter == null) return;
try { try {

@ -746,11 +746,8 @@ public final class Protocol {
if (event.addResultsToLocalIndex) { if (event.addResultsToLocalIndex) {
for (URIMetadataNode entry : storeDocs) { for (URIMetadataNode entry : storeDocs) {
try { try {
// firstSseen is set on access (crawl/index) to full resource, event.query.getSegment().setFirstSeenTime(entry.hash(), Math.min(entry.moddate().getTime(), System.currentTimeMillis()));
// on existing firstSeen prevent that metadata overwrite this rich data (this can be the case if crawldata has older loaddate as metadata) event.query.getSegment().fulltext().putMetadata(entry); // it will be checked inside the putMetadata that poor metadata does not overwrite rich metadata
if (!event.query.getSegment().firstSeen().has(entry.hash())) { // TODO: cleanup firstSeen on document deletion from index
event.query.getSegment().fulltext().putMetadata(entry);
}
} catch (final IOException e) { } catch (final IOException e) {
ConcurrentLog.logException(e); ConcurrentLog.logException(e);
} }
@ -920,7 +917,7 @@ public final class Protocol {
if ( resultLine == null ) { if ( resultLine == null ) {
continue; continue;
} }
final URIMetadataNode urlEntry = URIMetadataNode.importEntry(resultLine); final URIMetadataNode urlEntry = URIMetadataNode.importEntry(resultLine, "dht");
if ( urlEntry == null ) { if ( urlEntry == null ) {
continue; continue;
} }
@ -1115,6 +1112,7 @@ public final class Protocol {
// passed all checks, store url // passed all checks, store url
if (!localsearch) { if (!localsearch) {
// put the remote documents to the local index. We must convert the solr document to a solr input document: // put the remote documents to the local index. We must convert the solr document to a solr input document:
if (event.addResultsToLocalIndex) { if (event.addResultsToLocalIndex) {
final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); final SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);

@ -341,8 +341,24 @@ public final class Fulltext {
try { try {
// because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten // because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten
long date = this.getLoadTime(id); long date = this.getLoadTime(id);
if (date < entry.loaddate().getTime()) { if (date == -1) {
// document does not exist
putDocument(getDefaultConfiguration().metadata2solr(entry)); putDocument(getDefaultConfiguration().metadata2solr(entry));
} else {
// check if document contains rich data
if (date < entry.loaddate().getTime()) {
SolrDocument doc = this.getDefaultConnector().getDocumentById(id, CollectionSchema.collection_sxt.getSolrFieldName());
if (doc == null || !doc.containsKey(CollectionSchema.collection_sxt.getSolrFieldName())) {
putDocument(getDefaultConfiguration().metadata2solr(entry));
} else {
Collection<Object> collections = doc.getFieldValues(CollectionSchema.collection_sxt.getSolrFieldName());
for (Object s: collections) {
if (!"dht".equals(s)) return;
}
// passed all checks, overwrite document
putDocument(getDefaultConfiguration().metadata2solr(entry));
}
}
} }
} catch (final SolrException e) { } catch (final SolrException e) {
throw new IOException(e.getMessage(), e); throw new IOException(e.getMessage(), e);

Loading…
Cancel
Save