added new 'firstSeen' database table and necessary data structures which

hold a date for each URL to record when a url was first seen. This is
then used to overwrite the modification date for urls upon recrawl in
case that the first-seen date is before the latest document date. This
behaviour is necessary because content management systems commonly
attach the current date to all documents they serve. Using the
firstSeen database it is possible to approximate a real first document
creation date, provided the crawler visits the same domain
frequently. As a result, search results ordered by date have much
better quality, and YaCy works noticeably better as a search agent
for the latest news.
pull/1/head
Michael Peter Christen 10 years ago
parent 487a733c99
commit 0a879c98e7

@ -98,11 +98,12 @@ function updatepage(str) {
<dl>
<dt class="TableCellDark">Index Deletion</dt>
<dd><input type="checkbox" name="deleteIndex" id="deleteIndex"
onclick="x=document.getElementById('deleteIndex').checked;#(rwi)#::document.getElementById('deleteRWI').checked=x;#(/rwi)#document.getElementById('deleteRobots').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';};document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
onclick="x=document.getElementById('deleteIndex').checked;#(rwi)#::document.getElementById('deleteRWI').checked=x;#(/rwi)#document.getElementById('deleteRobots').checked=x;document.getElementById('deleteRobots').checked=x;document.getElementById('deleteCrawlQueues').checked=x;c='disabled';document.getElementById('deleteSearchFl').checked=x;if(x){c='';};document.getElementById('deletecomplete').disabled=c;document.getElementById('deleteCache').disabled=c;document.getElementById('deleteFirstSeen').disabled=c;document.getElementById('deleteRobots').disabled=c;document.getElementById('deleteCrawlQueues').disabled=c;document.getElementById('deleteSearchFl').disabled=c;"
/> Delete local search index (embedded Solr and old Metadata)<br/>
#(cleanupsolr)#::<input type="checkbox" name="deleteRemoteSolr" id="deleteRemoteSolr" onclick="x=document.getElementById('deleteRemoteSolr').checked;c='disabled';if(x){c='';};document.getElementById('deletecomplete').disabled=c;" /> Delete remote solr index<br/>#(/cleanupsolr)#
#(cleanuprwi)#::<input type="checkbox" name="deleteRWI" id="deleteRWI" onclick="x=document.getElementById('deleteRWI').checked;c='disabled';if(x){c='';};document.getElementById('deletecomplete').disabled=c;" /> Delete RWI Index (DHT transmission words)<br/>#(/cleanuprwi)#
#(cleanupcitation)#::<input type="checkbox" name="deleteCitation" id="deleteCitation" onclick="x=document.getElementById('deleteCitation').checked;c='disabled';if(x){c='';};document.getElementById('deletecomplete').disabled=c;" /> Delete Citation Index (linking between URLs)<br/>#(/cleanupcitation)#
<input type="checkbox" name="deleteFirstSeen" id="deleteFirstSeen" disabled="disabled" /> Delete First-Seen Date Table<br/>
<input type="checkbox" name="deleteCache" id="deleteCache" disabled="disabled" /> Delete HTTP &amp; FTP Cache<br/>
<input type="checkbox" name="deleteCrawlQueues" id="deleteCrawlQueues" disabled="disabled" /> Stop Crawler and delete Crawl Queues<br/>
<input type="checkbox" name="deleteRobots" id="deleteRobots" disabled="disabled" /> Delete robots.txt Cache<br/>

@ -160,6 +160,9 @@ public class IndexControlURLs_p {
if ( post.get("deleteCitation", "").equals("on")) {
if (segment.connectedCitation()) try {segment.urlCitation().clear();} catch (final IOException e) {}
}
if ( post.get("deleteFirstSeen", "").equals("on")) {
try {segment.firstSeen().clear();} catch (final IOException e) {}
}
if ( post.get("deleteCrawlQueues", "").equals("on") ) {
sb.crawlQueues.clear();
sb.crawlStacker.clear();

@ -106,6 +106,7 @@ function updatepage(str) {
<dt>Hash:</dt><dd><a href="solr/select?defType=edismax&start=0&rows=3&core=collection1&wt=html&q=id:%22#[hash]#%22">#[hash]#</a> (click this for full metadata)</dd>
<dt>In Metadata:</dt><dd>#(inurldb)#no::yes#(/inurldb)#</dd>
<dt>In Cache:</dt><dd>#(incache)#no::yes#(/incache)#</dd>
<dt>First Seen:</dt><dd>#[firstSeen]#</dd>
<dt>Word Count:</dt><dd>#[wordCount]#</dd>
<dt>Description:</dt><dd>#[desc]#</dd>
<dt>Size:</dt><dd>#[size]# Bytes</dd>#(mimeTypeAvailable)#::

@ -31,6 +31,7 @@ import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Collection;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
@ -379,6 +380,9 @@ public class ViewFile {
prop.put("error_url", url.toNormalform(true));
prop.put("error_hash", urlHash);
prop.put("error_wordCount", wordCount);
prop.put("error_firstSeen", "");
long firstseen = sb.index.getFirstSeenTime(ASCII.getBytes(urlHash));
prop.put("error_firstSeen", firstseen < 0 ? "" : new Date(firstseen).toString());
prop.putHTML("error_desc", (descr.isEmpty()) ? "&nbsp;" : descr);
prop.putNum("error_size", size);
prop.put("error_mimeTypeAvailable", (response.getMimeType() == null) ? "0" : "1");

@ -1088,6 +1088,8 @@ public final class Protocol {
// passed all checks, store url
if (!localsearch) {
event.query.getSegment().setFirstSeenTime(urlEntry.hash(), Math.min(urlEntry.moddate().getTime(), System.currentTimeMillis()));
// put the remote documents to the local index. We must convert the solr document to a solr input document:
SolrInputDocument sid = event.query.getSegment().fulltext().getDefaultConfiguration().toSolrInputDocument(doc);

@ -507,7 +507,7 @@ public final class Switchboard extends serverSwitch {
// initialize index
ReferenceContainer.maxReferences = getConfigInt("index.maxReferences", 0);
final File segmentsPath = new File(new File(indexPath, networkName), "SEGMENTS");
this.index = new Segment(this.log, segmentsPath, archivePath, solrCollectionConfigurationWork, solrWebgraphConfigurationWork);
try {this.index = new Segment(this.log, segmentsPath, archivePath, solrCollectionConfigurationWork, solrWebgraphConfigurationWork);} catch (IOException e) {ConcurrentLog.logException(e);}
if (this.getConfigBool(SwitchboardConstants.CORE_SERVICE_RWI, true)) try {
this.index.connectRWI(wordCacheMaxCount, fileSizeMax);
} catch (final IOException e) {ConcurrentLog.logException(e);}

@ -73,6 +73,7 @@ import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.rwi.IndexCell;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.table.IndexTable;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.ISO639;
import net.yacy.kelondro.util.MemoryControl;
@ -103,6 +104,7 @@ public class Segment {
public static final int writeBufferSize = 4 * 1024 * 1024;
public static final String termIndexName = "text.index";
public static final String citationIndexName = "citation.index";
public static final String firstseenIndexName = "firstseen.index";
// the reference factory
public static final ReferenceFactory<WordReference> wordReferenceFactory = new WordReferenceFactory();
@ -114,15 +116,17 @@ public class Segment {
protected final Fulltext fulltext;
protected IndexCell<WordReference> termIndex;
protected IndexCell<CitationReference> urlCitationIndex;
protected IndexTable firstSeenIndex;
/**
* create a new Segment
* @param log
* @param segmentPath that should be the path ponting to the directory "SEGMENT"
* @param collectionSchema
* @throws IOException
*/
public Segment(final ConcurrentLog log, final File segmentPath, final File archivePath,
final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) {
final CollectionConfiguration collectionConfiguration, final WebgraphConfiguration webgraphConfiguration) throws IOException {
log.info("Initializing Segment '" + segmentPath + ".");
this.log = log;
this.segmentPath = segmentPath;
@ -132,6 +136,7 @@ public class Segment {
this.fulltext = new Fulltext(segmentPath, archivePath, collectionConfiguration, webgraphConfiguration);
this.termIndex = null;
this.urlCitationIndex = null;
this.firstSeenIndex = new IndexTable(new File(segmentPath, firstseenIndexName), 12, 8, false, false);
}
public boolean connectedRWI() {
@ -202,6 +207,10 @@ public class Segment {
return this.urlCitationIndex;
}
/**
 * Accessor for the first-seen index table, which maps a url hash to the
 * time (epoch milliseconds) when that url was first observed by the crawler.
 * The table is created in the Segment constructor and is therefore never null.
 * @return the first-seen IndexTable of this segment
 */
public IndexTable firstSeen() {
return this.firstSeenIndex;
}
public ReferenceReportCache getReferenceReportCache() {
return new ReferenceReportCache();
}
@ -350,6 +359,26 @@ public class Segment {
return 0;
}
}
/**
 * Record the time when a url was first seen. An entry that already exists
 * is never overwritten: keeping the earliest date is the whole purpose of
 * this table. Invalid input (null hash or non-positive time) is ignored.
 * IO errors are logged and swallowed because this table is best-effort.
 * @param urlhash the hash of the url
 * @param time the first-seen time in epoch milliseconds; must be &gt; 0
 */
public void setFirstSeenTime(final byte[] urlhash, final long time) {
    if (urlhash == null || time <= 0) return;
    try {
        // NEVER overwrite an existing entry; the stored value must stay
        // the earliest observation date
        if (!this.firstSeenIndex.has(urlhash)) {
            this.firstSeenIndex.put(urlhash, time);
        }
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    }
}
/**
 * Look up the time when a url was first seen by the crawler.
 * @param urlhash the hash of the url; may be null
 * @return the recorded first-seen time in epoch milliseconds, or -1 if the
 *         hash is null, unknown to the table, or the lookup failed with an
 *         IO error (which is logged)
 */
public long getFirstSeenTime(final byte[] urlhash) {
    if (urlhash != null) {
        try {
            return this.firstSeenIndex.get(urlhash);
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }
    }
    return -1;
}
/**
* get the load time of a resource.
@ -435,6 +464,7 @@ public class Segment {
if (this.termIndex != null) this.termIndex.close();
if (this.fulltext != null) this.fulltext.close();
if (this.urlCitationIndex != null) this.urlCitationIndex.close();
if (this.firstSeenIndex != null) this.firstSeenIndex.close();
}
private static String votedLanguage(
@ -517,7 +547,6 @@ public class Segment {
final long startTime = System.currentTimeMillis();
// CREATE INDEX
// load some document metadata
final Date loadDate = new Date();
final String id = ASCII.String(url.hash());
@ -533,7 +562,7 @@ public class Segment {
// CREATE SOLR DOCUMENT
final CollectionConfiguration collectionConfig = this.fulltext.getDefaultConfiguration();
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
final CollectionConfiguration.SolrVector vector = collectionConfig.yacy2solr(this, collections, responseHeader, document, condenser, referrerURL, language, this.fulltext().useWebgraph() ? this.fulltext.getWebgraphConfiguration() : null, sourceName);
// ENRICH DOCUMENT WITH RANKING INFORMATION
this.fulltext.getDefaultConfiguration().postprocessing_references(this.getReferenceReportCache(), vector, url, null);
@ -563,6 +592,8 @@ public class Segment {
}
// REMEMBER FIRST SEEN
setFirstSeenTime(url.hash(), Math.min(document.getDate().getTime(), System.currentTimeMillis())); // should exist already in the index at this time, but just to make sure
// write the edges to the citation reference index
if (this.connectedCitation()) try {

@ -402,6 +402,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
public SolrVector yacy2solr(
final Segment segment,
final Map<String, Pattern> collections, final ResponseHeader responseHeader,
final Document document, final Condenser condenser, final DigestURL referrerURL, final String language,
final WebgraphConfiguration webgraph, final String sourceName) {
@ -486,6 +487,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
if (lastModified == null) lastModified = new Date();
if (document.getDate().before(lastModified)) lastModified = document.getDate();
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.keywords)) {

Loading…
Cancel
Save