*) Don't overwrite new entries with older ones

see: http://www.yacy-forum.de/viewtopic.php?t=2015



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1874 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
hermens 19 years ago
parent be88687d8c
commit ad119f06af

@ -499,6 +499,37 @@ public final class plasmaCrawlLURL extends plasmaURL {
}
private void store() {
// Check if there is a more recent Entry already in the DB
Entry oldEntry;
try {
if (exists(urlHash)) {
oldEntry = new Entry (urlHash, null);
} else {
oldEntry = null;
}
} catch (Exception e) {
oldEntry = null;
}
if ((oldEntry != null) && (isOlder(oldEntry))) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
this.descr = oldEntry.descr;
this.moddate = oldEntry.moddate;
this.loaddate = oldEntry.loaddate;
this.referrerHash = oldEntry.referrerHash;
this.copyCount = oldEntry.copyCount;
this.flags = oldEntry.flags;
this.quality = oldEntry.quality;
this.language = oldEntry.language;
this.doctype = oldEntry.doctype;
this.size = oldEntry.size;
this.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
return;
}
// stores the values from the object variables into the database
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
@ -597,6 +628,18 @@ public final class plasmaCrawlLURL extends plasmaURL {
return word;
}
/**
 * Tells whether this entry should be considered older than {@code other}.
 *
 * The comparison is made in three steps: first by modification date, then
 * (only if the modification dates are equal) by load date, and finally
 * (only if the load dates are equal as well) by quality, where a lower
 * quality counts as "older".
 *
 * @param other the entry to compare against; may be {@code null}
 * @return {@code true} if this entry ranks before {@code other} in the
 *         order described above; {@code false} otherwise, including when
 *         {@code other} is {@code null}
 */
public boolean isOlder (Entry other) {
    if (other == null) {
        return false;
    }
    // primary criterion: modification date
    if (moddate.before(other.moddate())) {
        return true;
    }
    if (!moddate.equals(other.moddate())) {
        return false;
    }
    // modification dates are equal: fall back to the load date
    if (loaddate.before(other.loaddate())) {
        return true;
    }
    // load dates equal too: the lower quality loses
    return loaddate.equals(other.loaddate()) && (quality < other.quality());
}
private StringBuffer corePropList() {
// generate a parseable string; this is a simple property-list
final StringBuffer corePropStr = new StringBuffer(300);

@ -149,6 +149,10 @@ public final class plasmaWordIndexEntity {
/**
 * Puts {@code entry} into the index under its URL hash, unless the entry
 * already stored for that hash is more recent.
 *
 * @param entry the entry to add; {@code null} is ignored
 * @return {@code true} only if the entry was written and the index held no
 *         previous value for its URL hash; {@code false} if {@code entry}
 *         is {@code null}, if it is older than the stored entry, or if it
 *         replaced an existing value
 * @throws IOException declared by this method; presumably raised by the
 *         underlying index access (not visible here)
 */
public boolean addEntry(plasmaWordIndexEntry entry) throws IOException {
    if (entry == null) {
        return false;
    }
    final plasmaWordIndexEntry existing = getEntry(entry.getUrlHash());
    if (existing != null && entry.isOlder(existing)) {
        // a more recent entry is already in this entity; keep it
        return false;
    }
    final byte[] key = entry.getUrlHash().getBytes();
    final byte[] value = entry.toEncodedForm().getBytes();
    return theIndex.put(key, value) == null;
}

@ -414,6 +414,24 @@ public final class plasmaWordIndexEntry implements Cloneable {
/**
 * @return the value of this entry's {@code doctype} field
 */
public char getType() {
    return this.doctype;
}
/**
 * @return {@code true} if this entry's {@code localflag} equals {@code LT_LOCAL}
 */
public boolean isLocal() {
    return LT_LOCAL == localflag;
}
/**
 * Tells whether this entry should be considered newer than {@code other}.
 *
 * An entry is newer if its last-modified value is greater, or, when the
 * last-modified values are equal, if its quality is greater.
 *
 * @param other the entry to compare against; may be {@code null}
 * @return {@code true} if {@code other} is {@code null} or this entry is
 *         newer by the rule above; {@code false} otherwise
 */
public boolean isNewer(plasmaWordIndexEntry other) {
    if (other == null) {
        // anything beats a missing entry
        return true;
    }
    if (this.lastModified > other.lastModified) {
        return true;
    }
    // equal modification stamps: quality is the tie-breaker
    return (this.lastModified == other.getLastModified()) && (this.quality > other.quality);
}
/**
 * Tells whether this entry should be considered older than {@code other}.
 *
 * An entry is older if its last-modified value is smaller, or, when the
 * last-modified values are equal, if its quality is smaller.
 *
 * @param other the entry to compare against; may be {@code null}
 * @return {@code true} if this entry is older by the rule above;
 *         {@code false} otherwise, including when {@code other} is {@code null}
 */
public boolean isOlder(plasmaWordIndexEntry other) {
    if (other == null) {
        // nothing to be older than
        return false;
    }
    if (this.lastModified < other.getLastModified()) {
        return true;
    }
    // equal modification stamps: quality is the tie-breaker
    return (this.lastModified == other.getLastModified()) && (this.quality < other.quality);
}
/**
 * Scales the domain-length estimation of this entry's URL hash by the
 * factor 255/30. The multiplication is done before the division.
 *
 * @return {@code 255 * plasmaURL.domLengthEstimation(urlHash) / 30}
 */
public int domlengthNormalized() {
    return (255 * plasmaURL.domLengthEstimation(urlHash)) / 30;
}

@ -132,6 +132,10 @@ public final class plasmaWordIndexEntryContainer {
/**
 * Stores {@code entry} in the container under its URL hash, unless the
 * container already holds a more recent entry for that hash.
 *
 * @param entry the entry to add
 * @return {@code true} if the entry was stored and no entry existed for its
 *         URL hash before; {@code false} if one already existed (whether it
 *         was kept because it is more recent, or replaced)
 */
private boolean addi(plasmaWordIndexEntry entry) {
    final plasmaWordIndexEntry existing = (plasmaWordIndexEntry) container.get(entry.getUrlHash());
    final boolean existingIsMoreRecent = (existing != null) && entry.isOlder(existing);
    if (existingIsMoreRecent) {
        // keep the more recent entry that is already in this container
        return false;
    }
    return container.put(entry.getUrlHash(), entry) == null;
}

Loading…
Cancel
Save