refactoring date -> lastModified

pull/1/head
Michael Peter Christen 10 years ago
parent ab6cc3c88c
commit 6a1865f507

@@ -94,7 +94,7 @@ public class Document {
private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date date;
private final Date lastModified;
private int crawldepth;
public Document(final DigestURL location, final String mimeType, final String charset,
@@ -110,7 +110,7 @@ public class Document {
final LinkedHashMap<DigestURL, String> rss,
final LinkedHashMap<DigestURL, ImageEntry> images,
final boolean indexingDenied,
final Date date) {
final Date lastModified) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
@@ -146,7 +146,7 @@ public class Document {
this.indexingDenied = indexingDenied;
this.text = text == null ? "" : text;
this.generic_facets = new HashMap<String, Set<String>>();
this.date = date == null ? new Date() : date;
this.lastModified = lastModified == null ? new Date() : lastModified;
this.crawldepth = 999; // unknown yet
}
@@ -476,8 +476,8 @@ dc_rights
return this.emaillinks;
}
public Date getDate() {
return this.date;
public Date getLastModified() {
return this.lastModified;
}
public double lon() {
@@ -868,7 +868,7 @@ dc_rights
rss.putAll(doc.getRSS());
images.putAll(doc.getImages());
if (doc.lon() != 0.0 && doc.lat() != 0.0) { lon = doc.lon(); lat = doc.lat(); }
if (doc.date.before(date)) date = doc.date;
if (doc.lastModified.before(date)) date = doc.lastModified;
if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
if (doc.dc_language() != null) languages.add(doc.dc_language());

@@ -547,7 +547,8 @@ public class Segment {
final SearchEvent searchEvent,
final String sourceName, // contains the crawl profile hash if this comes from a web crawl
final boolean storeToRWI,
final String proxy
final String proxy,
final String acceptLanguage
) {
final long startTime = System.currentTimeMillis();
@@ -579,7 +580,7 @@ public class Segment {
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
// STORE IMAGE AND METADATA
Transactions.store(vector, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent());
Transactions.store(vector, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, crawlProfile.getAgent(), acceptLanguage);
}
}
@@ -609,7 +610,7 @@ public class Segment {
}
// REMEMBER FIRST SEEN
setFirstSeenTime(url.hash(), Math.min(document.getDate().getTime(), System.currentTimeMillis())); // should exist already in the index at this time, but just to make sure
setFirstSeenTime(url.hash(), Math.min(document.getLastModified().getTime(), System.currentTimeMillis())); // should exist already in the index at this time, but just to make sure
// write the edges to the citation reference index
if (this.connectedCitation()) try {

@@ -490,7 +490,7 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.last_modified)) {
Date lastModified = responseHeader == null ? new Date() : responseHeader.lastModified();
if (lastModified == null) lastModified = new Date();
if (document.getDate().before(lastModified)) lastModified = document.getDate();
if (document.getLastModified().before(lastModified)) lastModified = document.getLastModified();
long firstSeen = segment.getFirstSeenTime(digestURL.hash());
if (firstSeen > 0 && firstSeen < lastModified.getTime()) lastModified = new Date(firstSeen); // patch the date if we have seen the document earlier
add(doc, CollectionSchema.last_modified, lastModified);

Loading…
Cancel
Save