*) Don't overwrite new entries with older ones

see: http://www.yacy-forum.de/viewtopic.php?t=2015



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1874 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
hermens 19 years ago
parent be88687d8c
commit ad119f06af

@ -384,7 +384,7 @@ public final class plasmaCrawlLURL extends plasmaURL {
public class Entry { public class Entry {
private URL url; private URL url;
private String descr; private String descr;
private Date moddate; private Date moddate;
@ -465,171 +465,214 @@ public final class plasmaCrawlLURL extends plasmaURL {
} }
} }
public Entry(Properties prop, boolean setGlobal) { public Entry(Properties prop, boolean setGlobal) {
// generates a plasmaLURLEntry using the properties from the argument // generates a plasmaLURLEntry using the properties from the argument
// the property names must correspond to the one from toString // the property names must correspond to the one from toString
//System.out.println("DEBUG-ENTRY: prop=" + prop.toString()); //System.out.println("DEBUG-ENTRY: prop=" + prop.toString());
this.urlHash = prop.getProperty("hash", dummyHash); this.urlHash = prop.getProperty("hash", dummyHash);
try { try {
//byte[][] entry = urlHashCache.get(urlHash.getBytes()); //byte[][] entry = urlHashCache.get(urlHash.getBytes());
//if (entry == null) { //if (entry == null) {
this.referrerHash = prop.getProperty("referrer", dummyHash); this.referrerHash = prop.getProperty("referrer", dummyHash);
this.moddate = shortDayFormatter.parse(prop.getProperty("mod", "20000101")); this.moddate = shortDayFormatter.parse(prop.getProperty("mod", "20000101"));
//System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod")); //System.out.println("DEBUG: moddate = " + moddate + ", prop=" + prop.getProperty("mod"));
this.loaddate = shortDayFormatter.parse(prop.getProperty("load", "20000101")); this.loaddate = shortDayFormatter.parse(prop.getProperty("load", "20000101"));
this.copyCount = Integer.parseInt(prop.getProperty("cc", "0")); this.copyCount = Integer.parseInt(prop.getProperty("cc", "0"));
this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " "); this.flags = ((prop.getProperty("local", "true").equals("true")) ? "L " : " ");
if (setGlobal) this.flags = "G "; if (setGlobal) this.flags = "G ";
this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null)); this.url = new URL(crypt.simpleDecode(prop.getProperty("url", ""), null));
this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null); this.descr = crypt.simpleDecode(prop.getProperty("descr", ""), null);
if (this.descr == null) this.descr = this.url.toString(); if (this.descr == null) this.descr = this.url.toString();
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", "")); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(prop.getProperty("q", ""));
this.language = prop.getProperty("lang", "uk"); this.language = prop.getProperty("lang", "uk");
this.doctype = prop.getProperty("dt", "t").charAt(0); this.doctype = prop.getProperty("dt", "t").charAt(0);
this.size = Integer.parseInt(prop.getProperty("size", "0")); this.size = Integer.parseInt(prop.getProperty("size", "0"));
this.wordCount = Integer.parseInt(prop.getProperty("wc", "0")); this.wordCount = Integer.parseInt(prop.getProperty("wc", "0"));
this.snippet = prop.getProperty("snippet", ""); this.snippet = prop.getProperty("snippet", "");
if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null); if (snippet.length() == 0) snippet = null; else snippet = crypt.simpleDecode(snippet, null);
this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null; this.word = (prop.containsKey("word")) ? new plasmaWordIndexEntry(kelondroBase64Order.enhancedCoder.decodeString(prop.getProperty("word",""))) : null;
store(); store();
//} //}
} catch (Exception e) { } catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e); serverLog.logSevere("PLASMA", "INTERNAL ERROR in plasmaLURL.entry/2: " + e.toString(), e);
}
} }
}
private void store() { private void store() {
// stores the values from the object variables into the database // Check if there is a more recent Entry already in the DB
final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength); Entry oldEntry;
final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength); try {
if (exists(urlHash)) {
oldEntry = new Entry (urlHash, null);
} else {
oldEntry = null;
}
} catch (Exception e) {
oldEntry = null;
}
if ((oldEntry != null) && (isOlder(oldEntry))) {
// the fetched oldEntry is better, so return its properties instead of the new ones
// this.urlHash = oldEntry.urlHash; // unnecessary, should be the same
// this.url = oldEntry.url; // unnecessary, should be the same
this.descr = oldEntry.descr;
this.moddate = oldEntry.moddate;
this.loaddate = oldEntry.loaddate;
this.referrerHash = oldEntry.referrerHash;
this.copyCount = oldEntry.copyCount;
this.flags = oldEntry.flags;
this.quality = oldEntry.quality;
this.language = oldEntry.language;
this.doctype = oldEntry.doctype;
this.size = oldEntry.size;
this.wordCount = oldEntry.wordCount;
// this.snippet // not read from db
// this.word // not read from db
return;
}
// store the hash in the hash cache // stores the values from the object variables into the database
try { final String moddatestr = kelondroBase64Order.enhancedCoder.encodeLong(moddate.getTime() / 86400000, urlDateLength);
// even if the entry exists, we simply overwrite it final String loaddatestr = kelondroBase64Order.enhancedCoder.encodeLong(loaddate.getTime() / 86400000, urlDateLength);
final byte[][] entry = new byte[][] {
urlHash.getBytes(), // store the hash in the hash cache
url.toString().getBytes(), try {
descr.getBytes(), // null? // even if the entry exists, we simply overwrite it
moddatestr.getBytes(), final byte[][] entry = new byte[][] {
loaddatestr.getBytes(), urlHash.getBytes(),
referrerHash.getBytes(), url.toString().getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(), descr.getBytes(), // null?
flags.getBytes(), moddatestr.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(), loaddatestr.getBytes(),
language.getBytes(), referrerHash.getBytes(),
new byte[] {(byte) doctype}, kelondroBase64Order.enhancedCoder.encodeLong(copyCount, urlCopyCountLength).getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(), flags.getBytes(),
kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(), kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength).getBytes(),
}; language.getBytes(),
urlHashCache.put(entry); new byte[] {(byte) doctype},
} catch (Exception e) { kelondroBase64Order.enhancedCoder.encodeLong(size, urlSizeLength).getBytes(),
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e); kelondroBase64Order.enhancedCoder.encodeLong(wordCount, urlWordCountLength).getBytes(),
};
urlHashCache.put(entry);
} catch (Exception e) {
serverLog.logSevere("PLASMA", "INTERNAL ERROR AT plasmaCrawlLURL:store:" + e.toString(), e);
}
} }
}
public String hash() { public String hash() {
// return a url-hash, based on the md5 algorithm // return a url-hash, based on the md5 algorithm
// the result is a String of 12 bytes within a 72-bit space // the result is a String of 12 bytes within a 72-bit space
// (each byte has a 6-bit range) // (each byte has a 6-bit range)
// that should be enough for all web pages in the world // that should be enough for all web pages in the world
return this.urlHash; return this.urlHash;
} }
public URL url() { public URL url() {
return url; return url;
} }
public String descr() { public String descr() {
return descr; return descr;
} }
public Date moddate() { public Date moddate() {
return moddate; return moddate;
} }
public Date loaddate() { public Date loaddate() {
return loaddate; return loaddate;
} }
public String referrerHash() { public String referrerHash() {
// return the creator's hash // return the creator's hash
return referrerHash; return referrerHash;
} }
public char doctype() { public char doctype() {
return doctype; return doctype;
} }
public int copyCount() { public int copyCount() {
// return number of copies of this object in the global index // return number of copies of this object in the global index
return copyCount; return copyCount;
} }
public boolean local() { public boolean local() {
// returns true if the url was created locally and is needed for own word index // returns true if the url was created locally and is needed for own word index
if (flags == null) return false; if (flags == null) return false;
return flags.charAt(0) == 'L'; return flags.charAt(0) == 'L';
} }
public int quality() { public int quality() {
return quality; return quality;
} }
public String language() { public String language() {
return language; return language;
} }
public int size() { public int size() {
return size; return size;
} }
public int wordCount() { public int wordCount() {
return wordCount; return wordCount;
} }
public String snippet() { public String snippet() {
// the snippet may appear here if the url was transported in a remote search // the snippet may appear here if the url was transported in a remote search
// it will not be saved anywhere, but can only be requested here // it will not be saved anywhere, but can only be requested here
return snippet; return snippet;
} }
public plasmaWordIndexEntry word() { public plasmaWordIndexEntry word() {
return word; return word;
} }
private StringBuffer corePropList() { public boolean isOlder (Entry other) {
// generate a parseable string; this is a simple property-list if (other == null) return false;
final StringBuffer corePropStr = new StringBuffer(300); if (moddate.before(other.moddate())) return true;
try { if (moddate.equals(other.moddate())) {
corePropStr if (loaddate.before(other.loaddate())) return true;
.append("hash=") .append(urlHash) if (loaddate.equals(other.loaddate())) {
.append(",referrer=").append(referrerHash) if (quality < other.quality()) return true;
.append(",mod=") .append(shortDayFormatter.format(moddate)) }
.append(",load=") .append(shortDayFormatter.format(loaddate))
.append(",size=") .append(size)
.append(",wc=") .append(wordCount)
.append(",cc=") .append(copyCount)
.append(",local=") .append(((local()) ? "true" : "false"))
.append(",q=") .append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength))
.append(",dt=") .append(doctype)
.append(",lang=") .append(language)
.append(",url=") .append(crypt.simpleEncode(url.toString()))
.append(",descr=") .append(crypt.simpleEncode(descr));
if (this.word != null) {
// append also word properties
corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm()));
} }
return corePropStr; return false;
}
} catch (Exception e) { private StringBuffer corePropList() {
// generate a parseable string; this is a simple property-list
final StringBuffer corePropStr = new StringBuffer(300);
try {
corePropStr
.append("hash=") .append(urlHash)
.append(",referrer=").append(referrerHash)
.append(",mod=") .append(shortDayFormatter.format(moddate))
.append(",load=") .append(shortDayFormatter.format(loaddate))
.append(",size=") .append(size)
.append(",wc=") .append(wordCount)
.append(",cc=") .append(copyCount)
.append(",local=") .append(((local()) ? "true" : "false"))
.append(",q=") .append(kelondroBase64Order.enhancedCoder.encodeLong(quality, urlQualityLength))
.append(",dt=") .append(doctype)
.append(",lang=") .append(language)
.append(",url=") .append(crypt.simpleEncode(url.toString()))
.append(",descr=") .append(crypt.simpleEncode(descr));
if (this.word != null) {
// append also word properties
corePropStr.append(",word=").append(kelondroBase64Order.enhancedCoder.encodeString(word.toExternalForm()));
}
return corePropStr;
} catch (Exception e) {
// serverLog.logFailure("plasmaLURL.corePropList", e.getMessage()); // serverLog.logFailure("plasmaLURL.corePropList", e.getMessage());
// if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null"); // if (moddate == null) serverLog.logFailure("plasmaLURL.corePropList", "moddate=null");
// if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null"); // if (loaddate == null) serverLog.logFailure("plasmaLURL.corePropList", "loaddate=null");
// e.printStackTrace(); // e.printStackTrace();
return null; return null;
}
} }
}
/* /*
public String toString(int posintext, int posinphrase, int posofphrase) { public String toString(int posintext, int posinphrase, int posofphrase) {
@ -647,48 +690,48 @@ public final class plasmaCrawlLURL extends plasmaURL {
} }
*/ */
public String toString(String snippet) { public String toString(String snippet) {
// add information needed for remote transport // add information needed for remote transport
final StringBuffer core = corePropList(); final StringBuffer core = corePropList();
if (core == null) return null; if (core == null) return null;
core.ensureCapacity(core.length() + snippet.length()*2); core.ensureCapacity(core.length() + snippet.length()*2);
core.insert(0,"{"); core.insert(0,"{");
core.append(",snippet=").append(crypt.simpleEncode(snippet)); core.append(",snippet=").append(crypt.simpleEncode(snippet));
core.append("}"); core.append("}");
return core.toString(); return core.toString();
//return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}"; //return "{" + core + ",snippet=" + crypt.simpleEncode(snippet) + "}";
} }
/** /**
* Returns this object as String.<br> * Returns this object as String.<br>
* This e.g. looks like this: * This e.g. looks like this:
* <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre> * <pre>{hash=jmqfMk7Y3NKw,referrer=------------,mod=20050610,load=20051003,size=51666,wc=1392,cc=0,local=true,q=AEn,dt=h,lang=uk,url=b|aHR0cDovL3d3dy50cmFuc3BhcmVuY3kub3JnL3N1cnZleXMv,descr=b|S25vd2xlZGdlIENlbnRyZTogQ29ycnVwdGlvbiBTdXJ2ZXlzIGFuZCBJbmRpY2Vz}</pre>
*/ */
public String toString() { public String toString() {
final StringBuffer core = corePropList(); final StringBuffer core = corePropList();
if (core == null) return null; if (core == null) return null;
core.insert(0,"{"); core.insert(0,"{");
core.append("}"); core.append("}");
return core.toString(); return core.toString();
//return "{" + core + "}"; //return "{" + core + "}";
} }
public void print() { public void print() {
System.out.println("URL : " + url); System.out.println("URL : " + url);
System.out.println("Description : " + descr); System.out.println("Description : " + descr);
System.out.println("Modified : " + httpc.dateString(moddate)); System.out.println("Modified : " + httpc.dateString(moddate));
System.out.println("Loaded : " + httpc.dateString(loaddate)); System.out.println("Loaded : " + httpc.dateString(loaddate));
System.out.println("Size : " + size + " bytes, " + wordCount + " words"); System.out.println("Size : " + size + " bytes, " + wordCount + " words");
System.out.println("Referrer Hash : " + referrerHash); System.out.println("Referrer Hash : " + referrerHash);
System.out.println("Quality : " + quality); System.out.println("Quality : " + quality);
System.out.println("Language : " + language); System.out.println("Language : " + language);
System.out.println("DocType : " + doctype); System.out.println("DocType : " + doctype);
System.out.println(); System.out.println();
} }
} // class Entry } // class Entry
public class kiter implements Iterator { public class kiter implements Iterator {

@ -149,6 +149,10 @@ public final class plasmaWordIndexEntity {
public boolean addEntry(plasmaWordIndexEntry entry) throws IOException { public boolean addEntry(plasmaWordIndexEntry entry) throws IOException {
if (entry == null) return false; if (entry == null) return false;
plasmaWordIndexEntry oldEntry = getEntry(entry.getUrlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this entity
return false;
}
return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm().getBytes()) == null); return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm().getBytes()) == null);
} }

@ -414,6 +414,24 @@ public final class plasmaWordIndexEntry implements Cloneable {
public char getType() { return doctype; } public char getType() { return doctype; }
public boolean isLocal() { return localflag == LT_LOCAL; } public boolean isLocal() { return localflag == LT_LOCAL; }
/**
 * Decides whether this entry wins over {@code other} when both are compared
 * by modification stamp first and by quality second.
 *
 * @param other the entry to compare against; may be {@code null}
 * @return {@code true} if {@code other} is {@code null}, if this entry's
 *         {@code lastModified} is greater than the other's, or if the stamps
 *         are equal and this entry's {@code quality} is greater;
 *         {@code false} in every other case
 */
public boolean isNewer(plasmaWordIndexEntry other) {
    // nothing to compare against: this entry counts as the newer one
    if (other == null) {
        return true;
    }
    // a strictly larger modification stamp settles the question on its own
    if (this.lastModified > other.lastModified) {
        return true;
    }
    // identical stamps: fall back to quality as the tie-breaker
    return (this.lastModified == other.getLastModified()) && (this.quality > other.quality);
}
/**
 * Decides whether this entry loses against {@code other} when both are
 * compared by modification stamp first and by quality second.
 *
 * @param other the entry to compare against; may be {@code null}
 * @return {@code true} if this entry's {@code lastModified} is smaller than
 *         {@code other.getLastModified()}, or if both are equal and this
 *         entry's {@code quality} is smaller; {@code false} otherwise,
 *         including when {@code other} is {@code null}
 */
public boolean isOlder(plasmaWordIndexEntry other) {
    // without a counterpart this entry is never considered older
    if (other == null) {
        return false;
    }
    // a strictly smaller modification stamp settles the question on its own
    if (this.lastModified < other.getLastModified()) {
        return true;
    }
    // identical stamps: the lower quality is treated as the older entry
    final boolean sameStamp = (this.lastModified == other.getLastModified());
    return sameStamp && (this.quality < other.quality);
}
public int domlengthNormalized() { public int domlengthNormalized() {
return 255 * plasmaURL.domLengthEstimation(this.urlHash) / 30; return 255 * plasmaURL.domLengthEstimation(this.urlHash) / 30;
} }

@ -132,6 +132,10 @@ public final class plasmaWordIndexEntryContainer {
private boolean addi(plasmaWordIndexEntry entry) { private boolean addi(plasmaWordIndexEntry entry) {
// returns true if the new entry was added, false if it already existed // returns true if the new entry was added, false if it already existed
plasmaWordIndexEntry oldEntry = (plasmaWordIndexEntry) container.get(entry.getUrlHash());
if ((oldEntry != null) && (entry.isOlder(oldEntry))) { // A more recent Entry is already in this container
return false;
}
return (container.put(entry.getUrlHash(), entry) == null); return (container.put(entry.getUrlHash(), entry) == null);
} }

Loading…
Cancel
Save