diff --git a/htroot/IndexControlURLs_p.html b/htroot/IndexControlURLs_p.html index 148c5009a..e4eee30f5 100644 --- a/htroot/IndexControlURLs_p.html +++ b/htroot/IndexControlURLs_p.html @@ -98,11 +98,12 @@ function updatepage(str) {
Index Deletion
Delete local search index (embedded Solr and old Metadata)
#(cleanupsolr)#:: Delete remote solr index
#(/cleanupsolr)# #(cleanuprwi)#:: Delete RWI Index (DHT transmission words)
#(/cleanuprwi)# #(cleanupcitation)#:: Delete Citation Index (linking between URLs)
#(/cleanupcitation)# + Delete First-Seen Date Table
Delete HTTP & FTP Cache
Stop Crawler and delete Crawl Queues
Delete robots.txt Cache
diff --git a/htroot/IndexControlURLs_p.java b/htroot/IndexControlURLs_p.java index 345d35bed..40e3f26ab 100644 --- a/htroot/IndexControlURLs_p.java +++ b/htroot/IndexControlURLs_p.java @@ -160,6 +160,9 @@ public class IndexControlURLs_p { if ( post.get("deleteCitation", "").equals("on")) { if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {} } + if ( post.get("deleteFirstSeen", "").equals("on")) { + try {segment.firstSeen().clear();} catch (final IOException e) {} + } if ( post.get("deleteCrawlQueues", "").equals("on") ) { sb.crawlQueues.clear(); sb.crawlStacker.clear(); diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index c1b7e2bbb..04025547f 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -106,6 +106,7 @@ function updatepage(str) {
Hash:
#[hash]# (click this for full metadata)
In Metadata:
#(inurldb)#no::yes#(/inurldb)#
In Cache:
#(incache)#no::yes#(/incache)#
+
First Seen:
#[firstSeen]#
Word Count:
#[wordCount]#
Description:
#[desc]#
Size:
#[size]# Bytes
#(mimeTypeAvailable)#:: diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index 9a102411c..92ddcb2d9 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -31,6 +31,7 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.Arrays; import java.util.Collection; +import java.util.Date; import java.util.Iterator; import java.util.Map; @@ -379,6 +380,9 @@ public class ViewFile { prop.put("error_url", url.toNormalform(true)); prop.put("error_hash", urlHash); prop.put("error_wordCount", wordCount); + prop.put("error_firstSeen", ""); + long firstseen = sb.index.getFirstSeenTime(ASCII.getBytes(urlHash)); + prop.put("error_firstSeen", firstseen < 0 ? "" : new Date(firstseen).toString()); prop.putHTML("error_desc", (descr.isEmpty()) ? " " : descr); prop.putNum("error_size", size); prop.put("error_mimeTypeAvailable", (response.getMimeType() == null) ? "0" : "1"); diff --git a/source/net/yacy/peers/Protocol.java b/source/net/yacy/peers/Protocol.java index 37237c9a7..df1312b55 100644 --- a/source/net/yacy/peers/Protocol.java +++ b/source/net/yacy/peers/Protocol.java @@ -1088,6 +1088,8 @@ public final class Protocol { // passed all checks, store url if (!localsearch) { + event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis())); + // put the remote documents to the local index. We must convert the solr document to a solr input document: SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc); diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index c8542c726..a634ab755 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -507,7 +507,7 @@ public final class Switchboard extends serverSwitch { // initialize index ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0); final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS"); - this.index = new Segment(this.log, segmentsPath, archivePath, solrCollectionConfigurationWork, solrWebgraphConfigurationWork); + try {this.index = new Segment(this.log, segmentsPath, archivePath, solrCollectionConfigurationWork, solrWebgraphConfigurationWork);} catch (IOException e) {ConcurrentLog.logException(e);} if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) try { this.index.connectRWI(wordCacheMaxCount, fileSizeMax); } catch (final IOException e) {ConcurrentLog.logException(e);} diff --git a/source/net/yacy/search/index/Segment.java b/source/net/yacy/search/index/Segment.java index ac580d7d1..bb6b5020c 100644 --- a/source/net/yacy/search/index/Segment.java +++ b/source/net/yacy/search/index/Segment.java @@ -73,6 +73,7 @@ import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.rwi.IndexCell; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; +import net.yacy.kelondro.table.IndexTable; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.ISO639; import net.yacy.kelondro.util.MemoryControl; @@ -103,6 +104,7 @@ public class Segment { public static final int writeBufferSize = 4 * 1024 * 1024; public static final String termIndexName = "text.index"; public static final String citationIndexName = "citation.index"; + public static final String firstseenIndexName = "firstseen.index"; // the reference factory public static final ReferenceFactory wordReferenceFactory = new WordReferenceFactory(); @@ -114,15 +116,17 @@ public class Segment { protected final Fulltext fulltext; protected IndexCell termIndex; protected IndexCell urlCitationIndex; + protected IndexTable firstSeenIndex; /** * create a new Segment * @param log * @param segmentPath that should be the path ponting to the directory "SEGMENT" * @param collectionSchema + * @throws IOException */ public Segment(final ConcurrentLog log, final File segmentPath, final File archivePath, - final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) { + final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) throws IOException { log.info("Initializing Segment '" + segmentPath + "."); this.log = log; this.segmentPath = segmentPath; @@ -132,6 +136,7 @@ public class Segment { this.fulltext = new Fulltext(segmentPath, archivePath, collectionConfiguration, webgraphConfiguration); this.termIndex = null; this.urlCitationIndex = null; + this.firstSeenIndex = new IndexTable(new File(segmentPath, firstseenIndexName), 12, 8, false, false); } public boolean connectedRWI() { @@ -202,6 +207,10 @@ public class Segment { return this.urlCitationIndex; } + public IndexTable firstSeen() { + return this.firstSeenIndex; + } + public ReferenceReportCache getReferenceReportCache() { return new ReferenceReportCache(); } @@ -350,6 +359,26 @@ public class Segment { return 0; } } + + public void setFirstSeenTime(final byte[] urlhash, long time) { + if (urlhash == null || time <= 0) return; + try { + if (this.firstSeenIndex.has(urlhash)) return; // NEVER overwrite, that is the purpose of this index. + this.firstSeenIndex.put(urlhash, time); + } catch (IOException e) { + ConcurrentLog.logException(e); + } + } + + public long getFirstSeenTime(final byte[] urlhash) { + if (urlhash == null) return -1; + try { + return this.firstSeenIndex.get(urlhash); + } catch (IOException e) { + ConcurrentLog.logException(e); + return -1; + } + } /** * get the load time of a resource. @@ -435,6 +464,7 @@ public class Segment { if (this.termIndex != null) this.termIndex.close(); if (this.fulltext != null) this.fulltext.close(); if (this.urlCitationIndex != null) this.urlCitationIndex.close(); + if (this.firstSeenIndex != null) this.firstSeenIndex.close(); } private static String votedLanguage( @@ -517,7 +547,6 @@ public class Segment { final long startTime = System.currentTimeMillis(); // CREATE INDEX - // load some document metadata final Date loadDate = new Date(); final String id = ASCII.String(url.hash()); @@ -533,7 +562,7 @@ public class Segment { // CREATE SOLR DOCUMENT final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration(); - final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName); + final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName); // ENRICH DOCUMENT WITH RANKING INFORMATION this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null); @@ -563,6 +592,8 @@ public class Segment { } + // REMEMBER FIRST SEEN + setFirstSeenTime(url.hash(), Math.min(document.getDate().getTime(), System.currentTimeMillis())); // should exist already in the index at this time, but just to make sure // write the edges to the citation reference index if (this.connectedCitation()) try { diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index efb244d2b..2f6d9c5e6 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -402,6 +402,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri } public SolrVector yacy2solr( + final Segment segment, final Map collections, final ResponseHeader responseHeader, final Document document, final Condenser condenser, final DigestURL referrerURL, final String language, final WebgraphConfiguration webgraph, final String sourceName) { @@ -486,6 +487,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified(); if (lastModified == null) lastModified = new Date(); if (document.getDate().before(lastModified)) lastModified = document.getDate(); + long firstSeen = segment.getFirstSeenTime(digestURL.hash()); + if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier add(doc, CollectionSchema.last_modified, lastModified); } if (allAttr || contains(CollectionSchema.keywords)) {