#(mimeTypeAvailable)#::
diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java
index 9a102411c..92ddcb2d9 100644
--- a/htroot/ViewFile.java
+++ b/htroot/ViewFile.java
@@ -31,6 +31,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
+import java.util.Date;
import java.util.Iterator;
import java.util.Map;
@@ -379,6 +380,9 @@ public class ViewFile {
prop.put("error_url", url.toNormalform(true));
prop.put("error_hash", urlHash);
prop.put("error_wordCount", wordCount);
+ // first-seen time of this url; stays empty unless the lookup yields a positive timestamp (Segment.setFirstSeenTime never stores values <= 0)
+ long firstseen = sb.index.getFirstSeenTime(ASCII.getBytes(urlHash));
+ prop.put("error_firstSeen", firstseen <= 0 ? "" : new Date(firstseen).toString());
prop.putHTML("error_desc", (descr.isEmpty()) ? " " : descr);
prop.putNum("error_size", size);
prop.put("error_mimeTypeAvailable", (response.getMimeType() == null) ? "0" : "1");
diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java
index 37237c9a7..df1312b55 100644
--- a/source/net/yacy/peers/Protocol.java
+++ b/source/net/yacy/peers/Protocol.java
@@ -1088,6 +1088,8 @@ public final class Protocol {
// passed all checks, store url
if (!localsearch) {
+ event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
+
// put the remote documents to the local index. We must convert the solr document to a solr input document:
SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index c8542c726..a634ab755 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -507,7 +507,7 @@ public final class Switchboard extends serverSwitch {
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
- this.index = new Segment(this.log, segmentsPath, archivePath, solrCollectionConfigurationWork, solrWebgraphConfigurationWork);
+ try {this.index = new Segment(this.log, segmentsPath, archivePath, solrCollectionConfigurationWork, solrWebgraphConfigurationWork);} catch (IOException e) {ConcurrentLog.logException(e);}
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) try {
this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
} catch (final IOException e) {ConcurrentLog.logException(e);}
diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java
index ac580d7d1..bb6b5020c 100644
--- a/source/net/yacy/search/index/Segment.java
+++ b/source/net/yacy/search/index/Segment.java
@@ -73,6 +73,7 @@ import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
+import net.yacy.kelondro.table.IndexTable;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.MemoryControl;
@@ -103,6 +104,7 @@ public class Segment {
public static final int writeBufferSize = 4 * 1024 * 1024;
public static final String termIndexName = "text.index";
public static final String citationIndexName = "citation.index";
+ public static final String firstseenIndexName = "firstseen.index";
// the reference factory
public static final ReferenceFactory wordReferenceFactory = new WordReferenceFactory();
@@ -114,15 +116,17 @@ public class Segment {
protected final Fulltext fulltext;
protected IndexCell termIndex;
protected IndexCell urlCitationIndex;
+ protected IndexTable firstSeenIndex;
/**
* create a new Segment
* @param log
* @param segmentPath that should be the path ponting to the directory "SEGMENT"
* @param collectionSchema
+ * @throws IOException
*/
public Segment(final ConcurrentLog log, final File segmentPath, final File archivePath,
- final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) {
+ final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) throws IOException {
log.info("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;
@@ -132,6 +136,7 @@ public class Segment {
this.fulltext = new Fulltext(segmentPath, archivePath, collectionConfiguration, webgraphConfiguration);
this.termIndex = null;
this.urlCitationIndex = null;
+ this.firstSeenIndex = new IndexTable(new File(segmentPath, firstseenIndexName), 12, 8, false, false);
}
public boolean connectedRWI() {
@@ -202,6 +207,10 @@ public class Segment {
return this.urlCitationIndex;
}
+ public IndexTable firstSeen() { // accessor for the first-seen table created in the Segment constructor
+ return this.firstSeenIndex; // hands out the field itself, not a copy
+ }
+
public ReferenceReportCache getReferenceReportCache() {
return new ReferenceReportCache();
}
@@ -350,6 +359,26 @@ public class Segment {
return 0;
}
}
+
+ public void setFirstSeenTime(final byte[] urlhash, long time) { // stores a first-seen time once per url hash; I/O errors are logged, not thrown
+     final boolean usable = urlhash != null && time > 0; // missing hashes and non-positive times are ignored
+     if (usable) try {
+         // an existing entry is never replaced; NOTE(review): has+put is not atomic here - confirm callers cannot race
+         if (!this.firstSeenIndex.has(urlhash)) this.firstSeenIndex.put(urlhash, time);
+     } catch (final IOException ioe) {
+         ConcurrentLog.logException(ioe);
+     }
+ }
+
+ public long getFirstSeenTime(final byte[] urlhash) { // -1 is returned for a null hash or when the lookup fails with an IOException
+     long firstSeen = -1;
+     if (urlhash != null) try {
+         firstSeen = this.firstSeenIndex.get(urlhash); // what the table yields for an unknown hash is not visible here
+     } catch (final IOException ioe) {
+         ConcurrentLog.logException(ioe); // logged only; the -1 default is kept
+     }
+     return firstSeen;
+ }
/**
* get the load time of a resource.
@@ -435,6 +464,7 @@ public class Segment {
if (this.termIndex != null) this.termIndex.close();
if (this.fulltext != null) this.fulltext.close();
if (this.urlCitationIndex != null) this.urlCitationIndex.close();
+ if (this.firstSeenIndex != null) this.firstSeenIndex.close();
}
private static String votedLanguage(
@@ -517,7 +547,6 @@ public class Segment {
final long startTime = System.currentTimeMillis();
// CREATE INDEX
-
// load some document metadata
final Date loadDate = new Date();
final String id = ASCII.String(url.hash());
@@ -533,7 +562,7 @@ public class Segment {
// CREATE SOLR DOCUMENT
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
- final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
+ final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
@@ -563,6 +592,8 @@ public class Segment {
}
+ // REMEMBER FIRST SEEN
+ setFirstSeenTime(url.hash(), Math.min(document.getDate().getTime(), System.currentTimeMillis())); // should exist already in the index at this time, but just to make sure
// write the edges to the citation reference index
if (this.connectedCitation()) try {
diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java
index efb244d2b..2f6d9c5e6 100644
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@@ -402,6 +402,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
public SolrVector yacy2solr(
+ final Segment segment,
final Map collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
final WebgraphConfiguration webgraph, final String sourceName) {
@@ -486,6 +487,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
if (lastModified == null) lastModified = new Date();
if (document.getDate().before(lastModified)) lastModified = document.getDate();
+ long firstSeen = segment.getFirstSeenTime(digestURL.hash());
+ if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.keywords)) {